def spiderPlanet(only_if_new=False):
    """ Spider (fetch) an entire planet """
    log = planet.logger

    global index
    index = True

    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
        log.info("Socket timeout set to %d seconds", timeout)
    except:
        # Older Pythons lack socket.setdefaulttimeout; fall back to the
        # timeoutsocket module
        try:
            import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    if int(config.spider_threads()):
        # Start all the worker threads
        for i in range(int(config.spider_threads())):
            threads[i] = Thread(target=httpThread,
                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            # hand HTTP fetches to the worker threads
            fetch_queue.put((uri, feed_info))
        else:
            # local files (or a threadless configuration) go straight to parsing
            parse_queue.put((uri, feed_info, uri))

    # Mark the end of the fetch queue: one (None, None) sentinel per worker
    for thread in threads.keys():
        fetch_queue.put((None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:
                if not hasattr(feed, 'headers') or int(feed.headers.status) < 300:
                    # pass the cached validators through to feedparser so it
                    # can annotate the result (and, when feed is a URI, issue
                    # a conditional GET)
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag', None)
                        try:
                            options['modified'] = time.strptime(
                                feed_info.feed.get('planet_http_last_modified',
                                None))
                        except:
                            pass
                    data = feedparser.parse(feed, **options)
                else:
                    # the fetch failed: synthesize an empty result that still
                    # carries the HTTP status through to writeCache
                    data = feedparser.FeedParserDict({'version': None,
                        'headers': feed.headers, 'entries': [], 'feed': {},
                        'href': feed.url, 'bozo': 0,
                        'status': int(feed.headers.status)})

                # duplicate feed?
                id = data.feed.get('id', None)
                if not id:
                    id = feed_info.feed.get('id', None)

                href = uri
                if 'href' in data:
                    href = data.href

                duplicate = None
                if id and id in feeds_seen:
                    duplicate = id
                elif href and href in feeds_seen:
                    duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warn('Duplicate subscription: %s and %s' %
                        (uri, feeds_seen[duplicate]))
                    if href:
                        feed_info.feed['planet_http_location'] = href

                if id:
                    feeds_seen[id] = uri
                if href:
                    feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)

            except Exception:
                import sys, traceback
                exc_type, exc_value, exc_tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(exc_type, exc_value)
                        + traceback.format_tb(exc_tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        # reap worker threads as they finish (a local name here keeps the
        # global 'index' flag intact)
        for key in threads.keys():
            if not threads[key].isAlive():
                del threads[key]
                if not threads:
                    log.info("Finished threaded part of processing.")
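
# A minimal driver sketch for the function above. The real entry point in
# Planet/Venus is the top-level planet.py script; this sketch assumes
# Venus-style planet.getLogger(level, format) and config.load(file) helpers,
# and the config file name 'config.ini' is a placeholder.
if __name__ == '__main__':
    import planet
    from planet import config
    planet.getLogger('DEBUG', None)   # assumed (level, format) signature
    config.load('config.ini')         # hypothetical config file name
    spiderPlanet(only_if_new=False)   # fetch and cache every subscription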