def main():
    """Command-line entry point: parse ``sys.argv``, load the config, run.

    Usage: planet [options] [CONFIGFILE]

    Recognised options are ``-v``/``--verbose`` (force DEBUG logging),
    ``-o``/``--offline`` (update from the cache only) and ``-h``/``--help``.
    Any other non-option argument is taken as the configuration file; when
    none is given the module-level CONFIG_FILE is used.

    Exits with status 0 after printing the help text, and with status 1 on
    an unknown option, a configuration without a [Planet] section, or when
    none of the configured locales can be set.

    NOTE(review): this file appears to define main() twice with the same
    logic; if both live in one module, the later definition is the one the
    name ends up bound to.
    """
    usage_lines = (
        "Usage: planet [options] [CONFIGFILE]",
        "",
        "Options:",
        " -v, --verbose DEBUG level logging during update",
        " -o, --offline Update the Planet from the cache only",
        " -h, --help Display this help message and exit",
        "",
    )

    config_file = None
    offline = 0
    verbose = 0

    for arg in sys.argv[1:]:
        if arg in ("-h", "--help"):
            sys.stdout.write("\n".join(usage_lines) + "\n")
            sys.exit(0)
        elif arg in ("-v", "--verbose"):
            verbose = 1
        elif arg in ("-o", "--offline"):
            offline = 1
        elif arg.startswith("-"):
            sys.stderr.write("Unknown option: %s\n" % arg)
            sys.exit(1)
        else:
            config_file = arg

    # Fall back to the module default only when no file was named, so the
    # help / bad-option paths above never depend on it.
    if config_file is None:
        config_file = CONFIG_FILE

    # Read the configuration file
    config = ConfigParser()
    config.read(config_file)
    if not config.has_section("Planet"):
        sys.stderr.write("Configuration missing [Planet] section.\n")
        sys.exit(1)

    # Read the [Planet] config section
    planet_name = config_get(config, "Planet", "name", PLANET_NAME)
    planet_link = config_get(config, "Planet", "link", PLANET_LINK)
    planet_feed = config_get(config, "Planet", "feed", PLANET_FEED)
    owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME)
    owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL)
    if verbose:
        log_level = "DEBUG"
    else:
        log_level = config_get(config, "Planet", "log_level", LOG_LEVEL)
    feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT)
    # split() with no argument collapses runs of whitespace (including the
    # newlines of a continued config value) instead of producing empty or
    # multi-line "file names".
    template_files = config_get(config, "Planet", "template_files",
                                TEMPLATE_FILES).split()

    # Default feed to the first feed for which there is a template
    if not planet_feed:
        for template_file in template_files:
            name = os.path.splitext(os.path.basename(template_file))[0]
            if 'atom' in name or 'rss' in name:
                planet_feed = urlparse.urljoin(planet_link, name)
                break

    # Define locale
    if config.has_option("Planet", "locale"):
        # The user can specify more than one locale (separated by ":") as
        # fallbacks; the first one the system accepts wins.
        for user_locale in config.get("Planet", "locale").split(':'):
            try:
                locale.setlocale(locale.LC_ALL, user_locale.strip())
            except locale.Error:
                continue
            break
        else:
            sys.stderr.write("Unsupported locale setting.\n")
            sys.exit(1)

    # Activate logging
    planet.logging.basicConfig()
    planet.logging.getLogger().setLevel(
        planet.logging.getLevelName(log_level))
    log = planet.logging.getLogger("planet.runner")
    # Logger objects lacking warning() get it aliased to warn()
    if not hasattr(log, "warning"):
        log.warning = log.warn

    # timeoutsocket allows feedparser to time out rather than hang forever on
    # ultra-slow servers.  Python 2.3 now has this functionality available in
    # the standard socket library, so under 2.3 you don't need to install
    # anything.  But you probably should anyway, because the socket module is
    # buggy and timeoutsocket is better.
    if feed_timeout:
        try:
            feed_timeout = float(feed_timeout)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            log.warning("Feed timeout set to invalid value '%s', skipping",
                        feed_timeout)
            feed_timeout = None

    if feed_timeout and not offline:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(feed_timeout)
            log.debug("Socket timeout set to %d seconds", feed_timeout)
        except ImportError:
            import socket
            if hasattr(socket, 'setdefaulttimeout'):
                log.debug("timeoutsocket not found, using python function")
                socket.setdefaulttimeout(feed_timeout)
                log.debug("Socket timeout set to %d seconds", feed_timeout)
            else:
                log.error("Unable to set timeout to %d seconds", feed_timeout)

    # run the planet
    my_planet = planet.Planet(config)
    my_planet.run(planet_name, planet_link, template_files, offline)

    my_planet.generate_all_files(template_files, planet_name,
                                 planet_link, planet_feed,
                                 owner_name, owner_email)
def spiderPlanet(only_if_new = False):
    """ Spider (fetch) an entire planet

    Each URI from config.subscriptions() is queued either for the HTTP
    worker threads (httpThread) or, when no workers are running or the URI
    is not an HTTP one, directly for parsing.  Everything arriving on the
    parse queue is run through feedparser and handed to writeCache.

    only_if_new -- when true, subscriptions that already have cached feed
                   information are skipped.

    Side effects visible here: sets the module-level ``index`` to True,
    sets a default socket timeout, and creates the HTTP cache directory if
    it is configured and missing.
    """
    log = planet.getLogger(config.log_level(),config.log_format())

    # NOTE(review): presumably consulted by writeCache -- confirm.  Nothing
    # below may reuse the name ``index`` as a local, because of this
    # ``global`` declaration.
    global index
    index = True

    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
        log.info("Socket timeout set to %d seconds", timeout)
    except Exception:
        # Fall back to the bundled timeoutsocket module
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except Exception:
            log.warning("Timeout set to invalid value '%s', skipping",
                        timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()
    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    worker_count = int(config.spider_threads())
    if worker_count:
        # Start all the worker threads
        for i in range(worker_count):
            threads[i] = Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            # Third element is what gets parsed; here the URI itself
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue: one sentinel per worker
    for _worker in threads.keys():
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:
                if (not hasattr(feed, 'headers')
                        or int(feed.headers.status) < 300):
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag', None)
                        # NOTE(review): ``modified`` is computed but never
                        # used below; possibly meant to be passed on via
                        # ``options`` -- confirm before changing.
                        try:
                            modified = time.strptime(
                                feed_info.feed.get(
                                    'planet_http_last_modified', None))
                        except Exception:
                            pass

                    data = feedparser.parse(feed, **options)
                else:
                    # Status of 300 or above: record the response
                    # without parsing a body
                    data = feedparser.FeedParserDict({
                        'version': None,
                        'headers': feed.headers,
                        'entries': [],
                        'feed': {},
                        'bozo': 0,
                        'status': int(feed.headers.status)})

                writeCache(uri, feed_info, data)

            except Exception:
                # Best effort: log the failure for this feed and carry on
                # with the remaining ones.
                import sys
                import traceback
                exc_type, exc_value, exc_tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (
                        traceback.format_exception_only(exc_type, exc_value)
                        + traceback.format_tb(exc_tb)):
                    log.error(line.rstrip())

        # Reap finished workers on every pass, so an idle wait can never
        # spin on threads that have already exited.  The loop variable is
        # deliberately not called ``index``: that name is global here.
        for worker_id in list(threads.keys()):
            if not threads[worker_id].isAlive():
                del threads[worker_id]
                if not threads:
                    log.info("Finished threaded part of processing.")

        # Nothing to parse right now but the loop will go round again:
        # yield briefly instead of busy-waiting.
        if not parse_queue.qsize() and (threads or fetch_queue.qsize()):
            time.sleep(0.1)
def main():
    """Command-line entry point: parse ``sys.argv``, load the config, run.

    Usage: planet [options] [CONFIGFILE]

    Recognised options are ``-v``/``--verbose`` (force DEBUG logging),
    ``-o``/``--offline`` (update from the cache only) and ``-h``/``--help``.
    Any other non-option argument is taken as the configuration file; when
    none is given the module-level CONFIG_FILE is used.

    Exits with status 0 after printing the help text, and with status 1 on
    an unknown option, a configuration without a [Planet] section, or when
    none of the configured locales can be set.

    NOTE(review): an earlier definition of main() with the same logic
    appears in this file; if both live in one module, this later one is the
    definition the name ends up bound to.
    """
    usage_lines = (
        "Usage: planet [options] [CONFIGFILE]",
        "",
        "Options:",
        " -v, --verbose DEBUG level logging during update",
        " -o, --offline Update the Planet from the cache only",
        " -h, --help Display this help message and exit",
        "",
    )

    config_file = None
    offline = 0
    verbose = 0

    for arg in sys.argv[1:]:
        if arg in ("-h", "--help"):
            sys.stdout.write("\n".join(usage_lines) + "\n")
            sys.exit(0)
        elif arg in ("-v", "--verbose"):
            verbose = 1
        elif arg in ("-o", "--offline"):
            offline = 1
        elif arg.startswith("-"):
            sys.stderr.write("Unknown option: %s\n" % arg)
            sys.exit(1)
        else:
            config_file = arg

    # Fall back to the module default only when no file was named, so the
    # help / bad-option paths above never depend on it.
    if config_file is None:
        config_file = CONFIG_FILE

    # Read the configuration file
    config = ConfigParser()
    config.read(config_file)
    if not config.has_section("Planet"):
        sys.stderr.write("Configuration missing [Planet] section.\n")
        sys.exit(1)

    # Read the [Planet] config section
    planet_name = config_get(config, "Planet", "name", PLANET_NAME)
    planet_link = config_get(config, "Planet", "link", PLANET_LINK)
    planet_feed = config_get(config, "Planet", "feed", PLANET_FEED)
    owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME)
    owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL)
    if verbose:
        log_level = "DEBUG"
    else:
        log_level = config_get(config, "Planet", "log_level", LOG_LEVEL)
    feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT)
    # split() with no argument collapses runs of whitespace (including the
    # newlines of a continued config value) instead of producing empty or
    # multi-line "file names".
    template_files = config_get(config, "Planet", "template_files",
                                TEMPLATE_FILES).split()

    # Default feed to the first feed for which there is a template
    if not planet_feed:
        for template_file in template_files:
            name = os.path.splitext(os.path.basename(template_file))[0]
            if 'atom' in name or 'rss' in name:
                planet_feed = urlparse.urljoin(planet_link, name)
                break

    # Define locale
    if config.has_option("Planet", "locale"):
        # The user can specify more than one locale (separated by ":") as
        # fallbacks; the first one the system accepts wins.
        for user_locale in config.get("Planet", "locale").split(':'):
            try:
                locale.setlocale(locale.LC_ALL, user_locale.strip())
            except locale.Error:
                continue
            break
        else:
            sys.stderr.write("Unsupported locale setting.\n")
            sys.exit(1)

    # Activate logging
    planet.logging.basicConfig()
    planet.logging.getLogger().setLevel(
        planet.logging.getLevelName(log_level))
    log = planet.logging.getLogger("planet.runner")
    # Logger objects lacking warning() get it aliased to warn()
    if not hasattr(log, "warning"):
        log.warning = log.warn

    # timeoutsocket allows feedparser to time out rather than hang forever on
    # ultra-slow servers.  Python 2.3 now has this functionality available in
    # the standard socket library, so under 2.3 you don't need to install
    # anything.  But you probably should anyway, because the socket module is
    # buggy and timeoutsocket is better.
    if feed_timeout:
        try:
            feed_timeout = float(feed_timeout)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            log.warning("Feed timeout set to invalid value '%s', skipping",
                        feed_timeout)
            feed_timeout = None

    if feed_timeout and not offline:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(feed_timeout)
            log.debug("Socket timeout set to %d seconds", feed_timeout)
        except ImportError:
            import socket
            if hasattr(socket, 'setdefaulttimeout'):
                log.debug("timeoutsocket not found, using python function")
                socket.setdefaulttimeout(feed_timeout)
                log.debug("Socket timeout set to %d seconds", feed_timeout)
            else:
                log.error("Unable to set timeout to %d seconds", feed_timeout)

    # run the planet
    my_planet = planet.Planet(config)
    my_planet.run(planet_name, planet_link, template_files, offline)

    my_planet.generate_all_files(template_files, planet_name,
                                 planet_link, planet_feed,
                                 owner_name, owner_email)
def run_planet(config_file=None,
               PLANET_NAME="Unconfigured Planet",
               PLANET_LINK="Unconfigured Planet",
               PLANET_FEED=None,
               OWNER_NAME="Anonymous Coward",
               OWNER_EMAIL="",
               verbose=0,
               offline=0,
               LOG_LEVEL="WARNING",
               FEED_TIMEOUT=20,
               TEMPLATE_FILES="examples/basic/planet.html.tmpl"):
    """Load a planet configuration and run it, without touching sys.argv.

    Performs the same steps as main() after argument parsing, with the
    fallback values supplied as keyword arguments instead of module
    constants.

    config_file    -- path handed to ConfigParser.read(); None reads nothing
    PLANET_NAME, PLANET_LINK, PLANET_FEED, OWNER_NAME, OWNER_EMAIL,
    LOG_LEVEL, FEED_TIMEOUT, TEMPLATE_FILES
                   -- fallbacks passed to config_get() for the matching
                      [Planet] options
    verbose        -- when true, force the log level to DEBUG
    offline        -- passed on to Planet.run(); when true no socket
                      timeout is installed

    Raises AssertionError when the configuration has no [Planet] section.
    Calls sys.exit(1) when a locale is configured but none of the listed
    candidates can be set.
    """
    # Read the configuration file
    config = ConfigParser()
    if config_file is not None:
        config.read(config_file)
    # Raised explicitly rather than via ``assert`` so the check survives
    # ``python -O``; the exception type and message are unchanged.
    if not config.has_section("Planet"):
        raise AssertionError("Configuration missing [Planet] section.")

    # Read the [Planet] config section
    planet_name = config_get(config, "Planet", "name", PLANET_NAME)
    planet_link = config_get(config, "Planet", "link", PLANET_LINK)
    planet_feed = config_get(config, "Planet", "feed", PLANET_FEED)
    owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME)
    owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL)
    if verbose:
        log_level = "DEBUG"
    else:
        log_level = config_get(config, "Planet", "log_level", LOG_LEVEL)
    feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT)
    # split() with no argument collapses runs of whitespace (including the
    # newlines of a continued config value) instead of producing empty or
    # multi-line "file names".
    template_files = config_get(config, "Planet", "template_files",
                                TEMPLATE_FILES).split()

    # Default feed to the first feed for which there is a template
    if not planet_feed:
        for template_file in template_files:
            name = os.path.splitext(os.path.basename(template_file))[0]
            if 'atom' in name or 'rss' in name:
                planet_feed = urlparse.urljoin(planet_link, name)
                break

    # Define locale
    if config.has_option("Planet", "locale"):
        # The user can specify more than one locale (separated by ":") as
        # fallbacks; the first one the system accepts wins.
        for user_locale in config.get("Planet", "locale").split(':'):
            try:
                locale.setlocale(locale.LC_ALL, user_locale.strip())
            except locale.Error:
                continue
            break
        else:
            sys.stderr.write("Unsupported locale setting.\n")
            sys.exit(1)

    # Activate logging
    planet.logging.basicConfig()
    planet.logging.getLogger().setLevel(
        planet.logging.getLevelName(log_level))
    log = planet.logging.getLogger("planet.runner")
    # Logger objects lacking warning() get it aliased to warn()
    if not hasattr(log, "warning"):
        log.warning = log.warn

    # timeoutsocket allows feedparser to time out rather than hang forever on
    # ultra-slow servers.  Python 2.3 now has this functionality available in
    # the standard socket library, so under 2.3 you don't need to install
    # anything.  But you probably should anyway, because the socket module is
    # buggy and timeoutsocket is better.
    if feed_timeout:
        try:
            feed_timeout = float(feed_timeout)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            log.warning("Feed timeout set to invalid value '%s', skipping",
                        feed_timeout)
            feed_timeout = None

    if feed_timeout and not offline:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(feed_timeout)
            log.debug("Socket timeout set to %d seconds", feed_timeout)
        except ImportError:
            import socket
            if hasattr(socket, 'setdefaulttimeout'):
                log.debug("timeoutsocket not found, using python function")
                socket.setdefaulttimeout(feed_timeout)
                log.debug("Socket timeout set to %d seconds", feed_timeout)
            else:
                log.error("Unable to set timeout to %d seconds", feed_timeout)

    # run the planet
    my_planet = planet.Planet(config)
    my_planet.run(planet_name, planet_link, template_files, offline)

    my_planet.generate_all_files(template_files, planet_name,
                                 planet_link, planet_feed,
                                 owner_name, owner_email)