def main():
    """Command-line entry point: parse options and update feeds.

    Feeds can be selected explicitly by id (``-f``, repeatable), by site
    (``-s``, all feeds subscribed to that site), or - by default - every
    active feed. Jobs are handed to a ``Dispatcher`` which fetches them
    (in parallel when the threadpool module is available), after which
    the per-site caches are invalidated.
    """
    parser = optparse.OptionParser(usage='%prog [options]', version=USER_AGENT)
    parser.add_option('--settings',
        help='Python path to settings module. If this isn\'t provided, '
            'the DJANGO_SETTINGS_MODULE environment variable will be used.')
    parser.add_option('-f', '--feed', action='append', type='int',
        help='A feed id to be updated. This option can be given multiple '
            'times to update several feeds at the same time '
            '(-f 1 -f 4 -f 7).')
    parser.add_option('-s', '--site', type='int',
        help='A site id to update.')
    parser.add_option('-v', '--verbose', action='store_true',
        dest='verbose', default=False, help='Verbose output.')
    parser.add_option('-t', '--timeout', type='int', default=10,
        help='Wait timeout in seconds when connecting to feeds.')
    parser.add_option('-w', '--workerthreads', type='int', default=10,
        help='Worker threads that will fetch feeds in parallel.')
    options = parser.parse_args()[0]

    if options.settings:
        os.environ["DJANGO_SETTINGS_MODULE"] = options.settings

    # Imported here so DJANGO_SETTINGS_MODULE is already set.
    from feedjack import models, fjcache

    # Setting socket timeout (default: 10 seconds).
    socket.setdefaulttimeout(options.timeout)

    # Our job dispatcher.
    disp = Dispatcher(options, options.workerthreads)

    prints('* BEGIN: %s' % (unicode(datetime.datetime.now()),))

    if options.feed:
        feeds = models.Feed.objects.filter(id__in=options.feed)
        known_ids = []
        for feed in feeds:
            known_ids.append(feed.id)
            disp.add_job(feed)
        # Warn about requested ids that matched no feed - not fatal.
        for feed in options.feed:
            if feed not in known_ids:
                prints('! Unknown feed id: %d' % (feed,))
    elif options.site:
        try:
            site = models.Site.objects.get(pk=int(options.site))
        except models.Site.DoesNotExist:
            site = None
            prints('! Unknown site id: %d' % (options.site,))
        if site:
            feeds = [sub.feed for sub in site.subscriber_set.all()]
            for feed in feeds:
                disp.add_job(feed)
    else:
        # No explicit selection: update every active feed.
        for feed in models.Feed.objects.filter(is_active=True):
            disp.add_job(feed)

    disp.poll()

    # Removing the cached data in all sites; this will only work with the
    # memcached, db and file backends.
    for site in models.Site.objects.all():
        fjcache.cache_delsite(site.id)

    if threadpool:
        tcom = u'%d threads' % (options.workerthreads,)
    else:
        tcom = u'no threadpool module available, no parallel fetching'
    prints('* END: %s (%s)' % (unicode(datetime.datetime.now()), tcom))
def main():
    """Command-line entry point: parse options and update feeds.

    Variant that picks a dispatcher implementation at runtime:
    ``ThreadPoolDispatcher`` when worker threads were requested (``-w``),
    otherwise the serial ``BaseDispatcher``. Feed selection works as in
    the other options: by feed id, by site, or all active feeds.
    """
    parser = optparse.OptionParser(usage='%prog [options]', version=USER_AGENT)
    parser.add_option('--settings',
        help='Python path to settings module. If this isn\'t provided, '
            'the DJANGO_SETTINGS_MODULE environment variable will be used.')
    parser.add_option('-f', '--feed', action='append', type='int',
        help='A feed id to be updated. This option can be given multiple '
            'times to update several feeds at the same time '
            '(-f 1 -f 4 -f 7).')
    parser.add_option('-s', '--site', type='int',
        help='A site id to update.')
    parser.add_option('-v', '--verbose', action='store_true',
        dest='verbose', default=False, help='Verbose output.')
    parser.add_option('-t', '--timeout', type='int', default=10,
        help='Wait timeout in seconds when connecting to feeds.')
    parser.add_option('-w', '--workerthreads', type='int', default=0,
        help='Worker threads that will fetch feeds in parallel.')
    options = parser.parse_args()[0]

    if options.settings:
        os.environ["DJANGO_SETTINGS_MODULE"] = options.settings

    # Imported here so DJANGO_SETTINGS_MODULE is already set.
    from feedjack import models, fjcache

    # Setting socket timeout (default: 10 seconds).
    socket.setdefaulttimeout(options.timeout)

    # Our job dispatcher: threaded only when explicitly requested.
    if options.workerthreads:
        disp = ThreadPoolDispatcher(options, options.workerthreads)
    else:
        disp = BaseDispatcher(options)

    prints('* BEGIN: %s' % (unicode(datetime.datetime.now()), ))

    if options.feed:
        feeds = models.Feed.objects.filter(id__in=options.feed)
        known_ids = []
        for feed in feeds:
            known_ids.append(feed.id)
            disp.add_job(feed)
        # Warn about requested ids that matched no feed - not fatal.
        for feed in options.feed:
            if feed not in known_ids:
                prints('! Unknown feed id: %d' % (feed, ))
    elif options.site:
        try:
            site = models.Site.objects.get(pk=int(options.site))
        except models.Site.DoesNotExist:
            site = None
            prints('! Unknown site id: %d' % (options.site, ))
        if site:
            feeds = [sub.feed for sub in site.subscriber_set.all()]
            for feed in feeds:
                disp.add_job(feed)
    else:
        # No explicit selection: update every active feed.
        for feed in models.Feed.objects.filter(is_active=True):
            disp.add_job(feed)

    disp.poll()

    # Removing the cached data in all sites; this will only work with the
    # memcached, db and file backends.
    for site in models.Site.objects.all():
        fjcache.cache_delsite(site.id)

    if threadpool:
        tcom = u'%d threads' % (options.workerthreads, )
    else:
        tcom = u'no threadpool module available, no parallel fetching'
    prints('* END: %s (%s)' % (unicode(datetime.datetime.now()), tcom))
def bulk_update(opts):
    """Process the selected feeds, with optional adaptive check intervals.

    Selects feeds from ``opts.feed`` / ``opts.site`` specs (or all active
    feeds when neither is given), runs ``FeedProcessor`` over each, keeps
    per-feed/per-entry stats, and commits the db transaction - either
    periodically (``opts.commit_interval``) or once at the end. Post-commit
    "updated" signals are dispatched for affected feeds and their sites.
    """
    global _exc_feed_id  # updated to be available on uncaught errors

    from feedjack.models import Feed, Site
    from feedjack import fjcache
    import socket
    socket.setdefaulttimeout(opts.timeout)

    affected_feeds = set()  # for post-transaction signals

    # Drop a site's cache whenever its "updated" signal fires.
    Site.signal_updated.connect(
        lambda sender, instance, **kwz: fjcache.cache_delsite(instance.id) )

    def transaction_commit():
        # Commit, then dispatch "updated" signals for every touched feed
        # and every site subscribed to one of them.
        log.debug('Comitting db transaction')
        transaction_signaled_commit()
        for feed in affected_feeds:
            feed.signal_updated_dispatch(sender=FeedProcessor)
        for site in Site.objects.filter(subscriber__feed__in=affected_feeds):
            site.signal_updated_dispatch(sender=FeedProcessor)
        transaction_signaled_commit()  # in case of any immediate changes from signals

    if not opts.feed and not opts.site:  # fetches even unbound feeds
        feeds = Feed.objects.filter(is_active=True)
    else:
        feeds = set()
        if opts.feed:  # no is_active check if specified explicitly
            feeds.update(Feed.objects.get_by_string(spec) for spec in opts.feed)
        if opts.site:
            sites = list(Site.objects.get_by_string(unicode(spec)) for spec in opts.site)
            for site in sites:
                feeds.update(site.active_feeds)

    feeds = list(feeds)
    time_delta_global = time_delta_commit = timezone.now()
    log.info(
        '* BEGIN: {0}, feeds to process: {1}'\
            .format(time_delta_global, len(feeds)) )

    feed_stats, entry_stats = defaultdict(int), defaultdict(int)
    for feed in feeds:
        _exc_feed_id = feed.id  # so uncaught errors can report the feed
        log.info('[{0}] Processing feed: {1}'.format(feed.id, feed.feed_url))

        # Check if feed has to be fetched
        if opts.adaptive_interval:
            check_opts = opts.interval_parameters.copy()
            check_clc = check_opts.pop('consider_last_check') or False
            if feed.last_checked:
                # EWMA of the feed's update interval, cached per (feed, opts).
                check_interval, check_interval_ts =\
                    fjcache.feed_interval_get(feed.id, check_opts)
                if check_interval is None:  # calculate and cache it
                    check_interval = feed.calculate_check_interval(**check_opts)
                    fjcache.feed_interval_set(
                        feed.id, check_opts, check_interval, check_interval_ts )
                # With "consider_last_check", interval to feed.last_checked
                #  is added to average
                time_delta = timedelta( 0,
                    feed.calculate_check_interval(
                        ewma=check_interval, ewma_ts=check_interval_ts,
                        add_partial=feed.last_checked, **check_opts )\
                    if check_clc else check_interval )
                if not check_interval_ts:
                    # Cache miss, legacy case or first post on the feed
                    # Normally, it should be set after any feed update
                    check_interval_ts = feed.last_checked
                # Negative means the next check is still in the future - skip.
                time_delta_chk = (timezone.now() - time_delta) - check_interval_ts
                if time_delta_chk < timedelta(0):
                    log.info(
                        ( '[{0}] Skipping check for feed (url: {1}) due to adaptive interval setting.'
                            ' Minimal time until next check {2} (calculated min interval: {3}).' )\
                        .format(feed.id, feed.feed_url, abs(time_delta_chk), abs(time_delta)) )
                    continue
            else:
                # Never checked before: no interval data yet.
                check_interval, check_interval_ts = 0, None

        # Fetch new/updated stuff from the feed to db
        time_delta = timezone.now()
        if not opts.dry_run:
            ret_feed, ret_entries = FeedProcessor(feed, opts).process()
        else:
            log.debug('[{0}] Not fetching feed, because dry-run flag is set'.format(feed.id))
            ret_feed, ret_entries = FEED_SAME, dict()
        time_delta = timezone.now() - time_delta

        # FEED_SAME or errors don't invalidate cache or generate "updated" signals
        if ret_feed == FEED_OK:
            affected_feeds.add(feed)

        # Update check_interval ewma if feed had updates
        if opts.adaptive_interval and any(it.imap(
                ret_entries.get, [ENTRY_NEW, ENTRY_UPDATED, ENTRY_ERR] )):
            if not check_interval_ts:
                # NOTE(review): presumably FeedProcessor sets last_checked
                #  during .process() above - confirm against the processor.
                assert feed.last_checked
                check_interval_ts = feed.last_checked
            check_interval = feed.calculate_check_interval(
                ewma=check_interval, ewma_ts=check_interval_ts, **check_opts )
            fjcache.feed_interval_set(
                feed.id, check_opts, check_interval, check_interval_ts )

        # Feedback, stats, commit, delay
        log.info('[{0}] Processed {1} in {2}s [{3}] [{4}]{5}'.format(
            feed.id, feed.feed_url, time_delta, feed_keys_dict[ret_feed],
            ' '.join('{0}={1}'.format( label,
                ret_entries.get(key, 0) ) for key,label in entry_keys),
            ' (SLOW FEED!)' if time_delta.seconds > SLOWFEED_WARNING else '' ))
        feed_stats[ret_feed] += 1
        for k,v in ret_entries.iteritems():
            entry_stats[k] += v

        # commit_interval is either a timedelta (commit every so often)
        #  or an int (commit every N processed feeds).
        if opts.commit_interval:
            if isinstance(opts.commit_interval, timedelta):
                ts = timezone.now()
                if ts - time_delta_commit > opts.commit_interval:
                    transaction_commit()
                    time_delta_commit = ts
            elif sum(feed_stats.itervalues()) % opts.commit_interval == 0:
                transaction_commit()

        if opts.delay:
            log.debug('Waiting for {0}s (delay option)'.format(opts.delay))
            sleep(opts.delay)

    _exc_feed_id = None

    time_delta_global = timezone.now() - time_delta_global
    log.info('* END: {0} (delta: {1}s), entries: {2}, feeds: {3}'.format(
        timezone.now(), time_delta_global,
        ' '.join('{0}={1}'.format(label, entry_stats[key]) for key,label in entry_keys),
        ' '.join('{0}={1}'.format(label, feed_stats[key]) for key,label in feed_keys) ))

    transaction_commit()
def bulk_update(optz):
    """Fetch and process the selected feeds once, then drop site caches.

    Feed selection mirrors the command-line options: explicit feed ids
    (``optz.feed``), site ids (``optz.site``, active feeds of those
    sites), or every active feed when neither is given. Unknown ids are
    logged as warnings, not fatal errors.
    """
    import socket
    socket.setdefaulttimeout(optz.timeout)

    from feedjack.models import Feed, Site

    affected_sites = set()  # to drop cache

    if optz.feed:
        # Explicitly requested feeds are fetched with no is_active check.
        feeds = list(Feed.objects.filter(pk__in=optz.feed))
        found_ids = set(it.imap(op.attrgetter('id'), feeds))
        for feed_id in set(optz.feed).difference(found_ids):
            log.warn('Unknown feed id: {0}'.format(feed_id))
        affected_sites.update( Site.objects
            .filter(subscriber__feed__in=feeds)
            .values_list('id', flat=True) )

    if optz.site:
        feeds = Feed.objects.filter(
            is_active=True, subscriber__site__pk__in=optz.site )
        sites = Site.objects.filter(pk__in=optz.site).values_list('id', flat=True)
        for site_id in set(optz.site).difference(sites):
            log.warn('Unknown site id: {0}'.format(site_id))
        affected_sites.update(sites)

    if not optz.feed and not optz.site:
        # No explicit selection - fetches even unbound feeds.
        feeds = Feed.objects.filter(is_active=True)
        affected_sites = Site.objects.all().values_list('id', flat=True)

    feeds = list(feeds)
    ts_global = datetime.now()
    log.info(
        '* BEGIN: {0}, feeds to process: {1}'\
            .format(ts_global, len(feeds)) )

    feed_stats, entry_stats = defaultdict(int), defaultdict(int)
    for feed in feeds:
        ts_feed = datetime.now()
        ret_feed, ret_entries = FeedProcessor(feed, optz).process()
        elapsed = datetime.now() - ts_feed

        log.info('[{0}] Processed {1} in {2}s [{3}] [{4}]{5}'.format(
            feed.id, feed.feed_url, elapsed, feed_keys_dict[ret_feed],
            ' '.join('{0}={1}'.format(label, ret_entries.get(key, 0))
                for key, label in entry_keys),
            ' (SLOW FEED!)' if elapsed.seconds > SLOWFEED_WARNING else '' ))

        feed_stats[ret_feed] += 1
        for key, count in ret_entries.iteritems():
            entry_stats[key] += count

        if optz.delay:
            sleep(optz.delay)

    elapsed_global = datetime.now() - ts_global
    log.info('* END: {0} (delta: {1}s), entries: {2}, feeds: {3}'.format(
        datetime.now(), elapsed_global,
        ' '.join('{0}={1}'.format(label, entry_stats[key]) for key, label in entry_keys),
        ' '.join('{0}={1}'.format(label, feed_stats[key]) for key, label in feed_keys) ))

    # Removing the cached data in all sites,
    # this will only work with the memcached, db and file backends
    # TODO: make it work by "magic" through model signals
    from feedjack import fjcache
    for site_id in affected_sites:
        fjcache.cache_delsite(site_id)

    transaction.commit()