def notify_admins_of_errors():
    with open(ERROR_FILE_PATH, 'r') as error_file:
        errors = error_file.read().strip()
    if errors:
        logger.error('Error file is non-empty at end of run; '
                     'emailing contents to admins')
        # settings.ADMINS is a list of (name, email) pairs; keep the emails.
        admin_emails = [email for _name, email in settings.ADMINS]
        send_email(admin_emails, 'NewsDiffs scraper errors', errors)
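# A minimal sketch of the logging setup that notify_admins_of_errors()
# assumes: records logged at ERROR level also land in ERROR_FILE_PATH so
# they can be mailed at the end of the run. The path, handler, and format
# below are illustrative assumptions, not the project's actual settings.
import logging

ERROR_FILE_PATH = '/tmp/newsdiffs_errors.log'  # assumed location

logger = logging.getLogger('scraper')
logger.setLevel(logging.DEBUG)
_error_handler = logging.FileHandler(ERROR_FILE_PATH, mode='w')
_error_handler.setLevel(logging.ERROR)
_error_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
logger.addHandler(_error_handler)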
def find_url(txt):
    """Return the first http(s) URL found in txt, or None if there is none."""
    try:
        regres = re.search(r"(?P<url>https?://[^\s]+)", txt)
        if regres:
            return regres.group('url')
    except Exception as e:
        logger.error('Failed to find url in %s' % txt)
        logger.error(e)
        return
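# Quick usage check for find_url(), assuming the version above returns the
# matched URL (which is what main() below expects); the sample text is made
# up for illustration.
sample = 'Read the story at https://example.com/2013/07/some-story.html later today'
print find_url(sample)  # -> https://example.com/2013/07/some-story.html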
def get_hash(version):
    """Return the SHA1 hash of filename in a given version"""
    output = run_git_command(['ls-tree', '-r', version, filename],
                             article.full_git_dir)
    try:
        return output.split()[2]
    except IndexError:
        logger.error('git ls-tree value: [%s]' % output)
        return ''
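# get_hash() relies on the fixed shape of `git ls-tree -r` output, one line
# per entry:
#
#     100644 blob 3b18e512dba79e4c8300dd08aeb37f8e728b8dad	path/to/file
#
# so output.split()[2] is the blob's SHA1. An empty output (file absent in
# that version) makes split()[2] raise IndexError, which the branch above
# turns into ''.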
def send_email(recipients, subject, body):
    contents = 'Subject: %s\n\n%s' % (subject, body)
    msmtp_path = '/usr/bin/msmtp'
    if os.path.exists(msmtp_path):
        p = subprocess.Popen([msmtp_path, '-t'] + recipients,
                             stdin=subprocess.PIPE)
        p.communicate(contents)
        if p.wait():
            logger.error('Bad return code: %s', p.returncode)
    else:
        logger.error('%s does not exist; cannot email errors to admins'
                     % (msmtp_path,))
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
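# update_priority() divides the minutes since an article was last checked by
# get_update_delay(), which is defined elsewhere in the scraper and not
# included in these snippets. The sketch below only illustrates its assumed
# shape -- a polling interval, in minutes, that grows as the article ages;
# the actual thresholds used by the project may differ.
def get_update_delay(minutes_since_update):
    if minutes_since_update < 60 * 3:          # under three hours old
        return 15
    elif minutes_since_update < 60 * 24:       # under a day old
        return 60
    elif minutes_since_update < 60 * 24 * 7:   # under a week old
        return 60 * 24
    else:                                      # older: check about monthly
        return 60 * 24 * 30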
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(
        Q(last_update__gt=threshold) | Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done with gc!')

    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        # isn't this inherent in update_priority being > 1 above?
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(
        Q(last_update__gt=threshold) | Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done!')

    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
def main():
    os.system('sudo mysql -u root -ptoor nyt -e "DROP TABLE IF EXISTS post_full;"')
    os.system('sudo mysql -u root -ptoor nyt -e "CREATE TABLE post_full(id BIGINT(20), url TEXT, article TEXT);"')
    db = MySQLdb.connect(host='localhost', user='******', passwd='toor', db='nyt')
    db.set_character_set('utf8')
    cur = db.cursor()
    cur.execute('SELECT Id,Message FROM post WHERE message IS NOT NULL AND YEAR(created_time)=2013;')
    for row in cur.fetchall():
        post_id = row[0]
        msg = row[1]
        try:
            article = ''
            url = find_url(msg)
            if url:
                article = load_article(url)
                if article:
                    article = article.body
                else:
                    article = ''
            else:
                url = ''
        except Exception as e:
            logger.error('Failed to process post:%d' % post_id)
            logger.error('Failed url:%s' % url)
            logger.error(traceback.format_exc())
            time.sleep(10)
        finally:
            # The original body of this finally block is truncated here;
            # presumably it stores the result in the post_full table created
            # above, e.g.:
            # cur.execute('INSERT INTO post_full VALUES(%s, %s, %s)',
            #             (post_id, url, article))
            pass
def update_versions(todays_repo, do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done!')

    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'])
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done!')

    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib2.HTTPError, httplib.HTTPException) as e:
        if isinstance(e, urllib2.HTTPError) and e.msg == 'Gone':
            return
        logger.error('Exception when parsing %s', url)
        logger.error(traceback.format_exc())
        logger.error('Continuing')
        return
    return parsed_article
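# A sketch of how these helpers fit together (mirroring main() above); the
# message text here is made up, and .body is whatever attribute the
# site-specific parser exposes for the extracted article text:
msg = 'Breaking: https://example.com/2013/some-story.html'
url = find_url(msg)
article = load_article(url) if url else None
body = article.body if article else ''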