Example #1
def notify_admins_of_errors():
    with open(ERROR_FILE_PATH, 'r') as error_file:
        errors = error_file.read().strip()
        if errors:
            logger.error('Error file is non-empty at end of run; emailing contents to admins')
            admin_emails = map(lambda e: e[1], settings.ADMINS)
            send_email(admin_emails, 'NewsDiffs scraper errors', errors)
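A note on the map() call above: in Django, settings.ADMINS is conventionally a list of (full name, email address) tuples, so e[1] picks out just the addresses. A minimal sketch with made-up values:

# Django convention: ADMINS is a list of (name, email) pairs.
ADMINS = [('Alice', 'alice@example.com'), ('Ops', 'ops@example.com')]
admin_emails = map(lambda e: e[1], ADMINS)
# -> ['alice@example.com', 'ops@example.com'] under Python 2; under Python 3
#    map() is lazy, so wrap it in list() before handing it to send_email.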
Example #2
def find_url(txt):
    try:
        regres = re.search(r"(?P<url>https?://[^\s]+)", txt)
    except Exception as e:
        logger.error('Failed to find url in %s' % txt)
        logger.error(e)
        return
    # On success, return the matched URL (callers test the result for
    # truthiness); None if no URL was found.
    return regres.group('url') if regres else None
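The named group makes the match easy to extract; a quick illustrative check of the pattern (the input string is made up):

import re

m = re.search(r"(?P<url>https?://[^\s]+)", 'see https://example.com/a?b=1 for details')
print(m.group('url'))  # -> https://example.com/a?b=1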
Example #3
def get_hash(version):
    """Return the SHA1 hash of filename in a given version"""
    output = run_git_command(['ls-tree', '-r', version, filename],
                             article.full_git_dir)
    try:
        return output.split()[2]
    except IndexError:
        logger.error('git ls-tree value: [%s]', output)
        return ''
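For context on the [2] indexing: git ls-tree prints one line per matching object in the form <mode> <type> <hash><TAB><path>, for example

100644 blob 9daeafb9864cf43055ae93beb0afd6c7d144bfa4	todo.txt

so the third whitespace-separated field is the SHA1, and an empty output (the file is absent in that version) makes the lookup raise the IndexError handled above.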
Example #4
def send_email(recipients, subject, body):
    contents = 'Subject: %s\n\n%s' % (subject, body)

    msmtp_path = '/usr/bin/msmtp'
    if os.path.exists(msmtp_path):
        p = subprocess.Popen([msmtp_path, '-t'] + recipients,
                             stdin=subprocess.PIPE)
        p.communicate(contents)
        if p.wait():
            logger.error('Bad return code: %s', p.returncode)
    else:
        logger.error('%s does not exist; cannot email errors to admins' % (msmtp_path,))
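One wrinkle worth flagging: msmtp's -t flag tells it to read recipients from the To/Cc/Bcc headers, but the message built above carries only a Subject header, so delivery rests on the addresses passed on the command line. A hedged variant that also sets a To header (illustrative, not the project's code):

contents = 'To: %s\nSubject: %s\n\n%s' % (', '.join(recipients), subject, body)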
Example #5
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if (update_priority(a) > 1 or do_all)], key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(), article.minutes_since_check(), update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
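The scheduling heuristic is compact: update_priority is minutes-since-last-check divided by the allowed delay, so an article is due exactly when the ratio exceeds 1, and the descending sort visits the most overdue articles first. A worked example with made-up numbers:

# minutes_since_check = 120, get_update_delay(...) = 60 -> priority 2.0 (overdue)
# minutes_since_check = 30,  get_update_delay(...) = 60 -> priority 0.5 (skipped
#                                                          unless do_all=True)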
Example #6
File: scraper.py  Project: toyg/newsdiffs
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #7
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(
        Q(last_update__gt=threshold) | Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)

    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted(
        [a for a in articles if update_priority(a) > 1 or do_all],
        key=update_priority,
        reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        sys.stderr.write('Error on initial gc!\n')
        sys.stderr.write('Output was """\n')
        sys.stderr.write('%s\n' % e.output)
        sys.stderr.write('"""\n')
        raise

    logger.info('Done with gc!')

    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        # isn't this inherent in update_priority being > 1 above?
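        # (With do_all=False, largely yes: the filter above keeps only
        # articles with update_priority > 1, i.e. minutes_since_check >
        # delay, so this guard never fires; it is live, and bypassed,
        # only when do_all=True.)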
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s',
                             article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #8
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(Q(last_update__gt=threshold) | 
                                                                         Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        sys.stderr.write('Error on initial gc!\n')
        sys.stderr.write('Output was """\n')
        sys.stderr.write('%s\n' % e.output)
        sys.stderr.write('"""\n')
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #9
def main():
    os.system('sudo mysql -u root -ptoor nyt -e "DROP TABLE IF EXISTS post_full;"')
    os.system('sudo mysql -u root -ptoor nyt -e "CREATE TABLE post_full(id BIGINT(20), url TEXT, article TEXT);"')
    db = MySQLdb.connect(host='localhost', user='******', passwd='toor', db='nyt')
    db.set_character_set('utf8')
    cur = db.cursor()

    cur.execute('SELECT Id, Message FROM post WHERE message IS NOT NULL AND YEAR(created_time)=2013;')
    for row in cur.fetchall():
        post_id = row[0]
        msg = row[1]
        url = ''  # initialize so the except handler below can log it safely
        try:
            article = ''
            url = find_url(msg)
            if url:
                article = load_article(url)
                if article:
                    article = article.body
                else:
                    article = ''
            else:
                url = ''
        except Exception as e:
            logger.error('Failed to process post:%d' % post_id)
            logger.error('Failed url:%s' % url)
            logger.error(traceback.format_exc())
            time.sleep(10)
        finally:
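            # The excerpt is truncated here. A plausible body, assuming each
            # row should land in the post_full table created above (the
            # INSERT and commit are illustrative, not from the source):
            cur.execute('INSERT INTO post_full(id, url, article) '
                        'VALUES (%s, %s, %s)', (post_id, url, article))
            db.commit()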
Example #10
def update_versions(todays_repo, do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted(
        [a for a in articles if update_priority(a) > 1 or do_all],
        key=update_priority,
        reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        sys.stderr.write('Error on initial gc!\n')
        sys.stderr.write('Output was """\n')
        sys.stderr.write('%s\n' % e.output)
        sys.stderr.write('"""\n')
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s',
                             article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #11
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'])
    except subprocess.CalledProcessError as e:
        sys.stderr.write('Error on initial gc!\n')
        sys.stderr.write('Output was """\n')
        sys.stderr.write('%s\n' % e.output)
        sys.stderr.write('"""\n')
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #12
def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib2.HTTPError, httplib.HTTPException) as e:
        if isinstance(e, urllib2.HTTPError) and e.msg == 'Gone':
            return
        logger.error('Exception when parsing %s', url)
        logger.error(traceback.format_exc())
        logger.error('Continuing')
        return
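A small robustness note on the guard above: urllib2.HTTPError also carries the numeric status, and matching on e.code is sturdier than matching the server-supplied reason phrase 'Gone'. An illustrative variant:

        if isinstance(e, urllib2.HTTPError) and e.code == 410:  # 410 Gone
            return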