# Imports and module-level names assumed by the snippets below (inferred
# from usage; the original module may differ):
import httplib
import logging
import os
import subprocess
import sys
import time
import traceback
import urllib2
from datetime import datetime, timedelta

from django.conf import settings
from django.db.models import Q
from slackclient import SlackClient

import models
import parsers

logger = logging.getLogger('scraper')
formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
ERROR_FILE_PATH = '/tmp/newsdiffs_logging_errs'  # placeholder; not defined in these snippets


def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    v, boring, diff_info = add_to_git_repo(
        to_store, url_to_filename(article.url), article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(
            v=v,
            boring=boring,
            title=parsed_article.title,
            byline=parsed_article.byline,
            date=t,
            article=article,
        )
        v_row.diff_info = diff_info
        v_row.save()
        if not boring:
            article.last_update = t
            article.save()

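# Two helpers used above are never defined in these snippets. The
# sketches below are assumptions inferred from the call sites, not the
# project's real implementations.

# url_to_filename: the git repo needs a stable, filesystem-safe name per
# URL; a minimal sketch:
def url_to_filename(url):
    return url.split('://', 1)[-1].rstrip('/').replace('/', '_')


# add_to_git_repo: callers rely on the contract "write the content,
# commit if it changed, return (new_blob_or_None, boring, diff_info)".
# The git plumbing and the trivial `boring`/`diff_info` values here are
# simplified stand-ins:
def add_to_git_repo(data, filename, article):
    git_dir = models.GIT_DIR + article.git_dir
    path = os.path.join(git_dir, filename)
    if os.path.exists(path):
        with open(path, 'rb') as f:
            if f.read() == data:
                return None, None, None  # unchanged; nothing to commit
    with open(path, 'wb') as f:
        f.write(data)
    run_git_command(['add', filename], git_dir)
    run_git_command(['commit', '-m', 'Update %s' % article.url], git_dir)
    # Blob hash of the file as just committed.
    v = run_git_command(['rev-parse', 'HEAD:' + filename], git_dir).strip()
    boring = False    # real code flags whitespace-only/trivial changes
    diff_info = None  # real code summarizes the diff here
    return v, boring, diff_info
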
def get_all_article_urls():
    ans = set()
    for parser in parsers.parsers:
        logger.info('Looking up %s', parser.domains)
        urls = parser.feed_urls()
        ans = ans.union(map(canonicalize_url, urls))
    return ans

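# `canonicalize_url` is not shown in these snippets; a plausible sketch
# that drops fragments and utm_* tracking parameters so variant URLs
# collapse to one Article row (assumption: nothing fancier is needed):
import urlparse

def canonicalize_url(url):
    split = urlparse.urlsplit(url)
    query = '&'.join(sorted(
        pair for pair in split.query.split('&')
        if pair and not pair.startswith('utm_')))
    return urlparse.urlunsplit(
        (split.scheme, split.netloc, split.path, query, ''))
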
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()

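# `get_update_delay` is referenced throughout but never defined in these
# snippets. It maps "minutes since the article last changed" to "minutes
# to wait between checks", so update_priority > 1 means "overdue". A
# sketch with illustrative thresholds (the real schedule may differ):
def get_update_delay(minutes_since_update):
    if minutes_since_update < 60 * 3:
        return 15            # recently changed: check every 15 minutes
    elif minutes_since_update < 60 * 24:
        return 60            # changed today: check hourly
    elif minutes_since_update < 60 * 24 * 7:
        return 60 * 4        # changed this week: every 4 hours
    else:
        return 60 * 24       # stale: daily
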
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url, git_dir=todays_git_dir).save()

def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff = add_to_git_repo(
        to_store, url_to_filename(article.url), article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(
            v=v,
            boring=boring,
            title=parsed_article.title,
            byline=parsed_article.byline,
            date=t,
            article=article,
        )
        v_row.diff_info = get_diff_info(diff)
        v_row.diff_details_json = diff
        v_row.update_severity(save=False)
        if not boring:
            article.last_update = t
        v_row.save()
        article.save()

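# `get_diff_info` is not defined here. The `diff_details_json` field
# suggests `diff` is a serialized diff of the old and new text; a
# minimal sketch, assuming unified-diff text summarized as characters
# added/removed:
def get_diff_info(diff):
    lines = diff.splitlines()
    added = sum(len(l) - 1 for l in lines
                if l.startswith('+') and not l.startswith('+++'))
    removed = sum(len(l) - 1 for l in lines
                  if l.startswith('-') and not l.startswith('---'))
    return {'chars_added': added, 'chars_removed': removed}
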
def update_articles():
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url).save()

def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    parsed_article = parser(url)
    if not parsed_article.real_article:
        return
    return parsed_article

def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(
        Q(last_update__gt=threshold) | Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done with gc!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        # Isn't this inherent in update_priority being > 1 above?
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()

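# `run_git_command` is not included in these snippets; a sketch of the
# wrapper the calls above assume: run git in the given working copy and
# surface combined output on failure, which is what populates e.output
# in the CalledProcessError handlers. The default git_dir is an
# assumption for the older call sites that pass no directory:
def run_git_command(command, git_dir='.'):
    return subprocess.check_output(['git'] + command, cwd=git_dir,
                                   stderr=subprocess.STDOUT)
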
def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib2.HTTPError, httplib.HTTPException) as e:
        if isinstance(e, urllib2.HTTPError) and e.msg == 'Gone':
            return
        logger.error('Exception when parsing %s', url)
        logger.error(traceback.format_exc())
        logger.error('Continuing')
        return
    if not parsed_article.real_article:
        return
    return parsed_article

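# The parser objects returned by parsers.get_parser are not shown; the
# call sites above imply at least this interface. Names are inferred
# from usage, so treat this as a sketch of the contract, not the real
# base class:
class ExampleParser(object):
    def __init__(self, url):
        # A real parser fetches and parses `url` here, and may raise
        # urllib2.HTTPError or httplib.HTTPException on failure.
        self.real_article = True  # False for index/landing pages
        self.title = u''
        self.byline = u''
        self.body = u''

    def __unicode__(self):
        # update_article() stores unicode(parsed_article) in git.
        return u'\n'.join([self.title, self.byline, self.body])
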
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff_info = add_to_git_repo(to_store, article.filename(),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(
            v=v,
            boring=boring,
            title=parsed_article.title,
            byline=parsed_article.byline,
            date=t,
            article=article,
        )
        v_row.diff_info = diff_info
        v_row.save()
        if not boring:
            article.last_update = t
            article.save()
        # Notify Slack. Use filter().first() rather than get(), since
        # get() raises DoesNotExist when there is no standalone article,
        # which the `if standalone:` check is clearly meant to handle.
        team = models.SlackBotTeam.objects.get()
        standalone = models.StandaloneArticle.objects.filter(
            url=article.url).first()
        if standalone:
            user = standalone.added_by
            link = '/article-history/%s' % article.url
            link = '%s%s' % (settings.WEBAPP_URL, link)
            bot_text = 'I see changes in %s (%s)' % (article.url, link)
            if not user.startswith('web_frontend'):
                client = SlackClient(team.bot_access_token)
                dm = client.api_call(method='im.open', user=user)
                # Fall back to the user ID if no DM channel came back.
                channel = (dm.get('channel') or {}).get('id')
                channel = channel if channel else user
                client.api_call(method='chat.postMessage',
                                channel=channel, text=bot_text)

def handle(self, *args, **options):
    # Debug output goes to a scratch logfile and to stdout; warnings
    # and errors additionally go to the persistent error log.
    for ch in (logging.FileHandler('/tmp/newsdiffs_logging', mode='w'),
               logging.StreamHandler(stream=sys.stdout)):
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    ch = logging.FileHandler(ERROR_FILE_PATH, mode='a')
    ch.setLevel(logging.WARNING)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    for repo in all_git_repos():
        cleanup_git_repo(repo)

    todays_repo = get_and_make_git_repo()
    update_articles(todays_repo)
    update_versions(todays_repo, options['all'])
    logger.info('Done scraping!')

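# `all_git_repos` and `get_and_make_git_repo` are not defined in these
# snippets. A sketch under the assumption that repositories live under
# models.GIT_DIR and are sharded by date so no single repo grows
# unboundedly (the real naming scheme may differ):
def all_git_repos():
    for name in os.listdir(models.GIT_DIR):
        path = os.path.join(models.GIT_DIR, name)
        if os.path.isdir(os.path.join(path, '.git')):
            yield path


def get_and_make_git_repo():
    name = datetime.now().strftime('%Y-%m')
    path = os.path.join(models.GIT_DIR, name)
    if not os.path.isdir(path):
        os.makedirs(path)
        subprocess.check_call(['git', 'init'], cwd=path)
    return name
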
def update_versions(todays_repo, do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()

def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'])
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()

def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database', len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s', i + 1, len(all_urls), url)
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding!')
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')

def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database', len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s', i + 1, len(all_urls), url)
        # Skip URLs longer than 255 characters; icky hack, but otherwise
        # they're truncated in the DB.
        if len(url) > 255:
            continue
        # Is there an index on this column?
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding Article {0}'.format(url))
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')

def update_articles():
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url).save()

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
    #logger.info('Ending with gc:')
    #run_git_command(['gc'])
    logger.info('Done!')


# Remove stale git lock files once they are at least 5 minutes old.
def cleanup_git_repo(git_dir):
    for name in ['.git/index.lock', '.git/refs/heads/master.lock',
                 '.git/gc.pid.lock']:
        fname = os.path.join(git_dir, name)
        try:
            st = os.stat(fname)
        except OSError:
            # This lock file doesn't exist; check the next one instead
            # of bailing out of the whole loop.
            continue
        age = time.time() - st.st_ctime
        if age > 60 * 5:
            os.remove(fname)


if __name__ == '__main__':
    print >> sys.stderr, "Try `python website/manage.py scraper`."
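# How this typically runs (an assumption; not documented in these
# snippets): the management command is invoked periodically from cron,
# e.g.
#   */5 * * * * cd /path/to/newsdiffs && python website/manage.py scraper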