Example #1
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()

    v, boring, diff_info = add_to_git_repo(to_store,
                                           url_to_filename(article.url),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(
            v=v,
            boring=boring,
            title=parsed_article.title,
            byline=parsed_article.byline,
            date=t,
            article=article,
        )
        v_row.diff_info = diff_info
        v_row.save()
        if not boring:
            article.last_update = t
            article.save()
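
Note — Example #1 stores the parsed text under a path derived from the article URL by url_to_filename(), which is not part of this listing. A minimal sketch of such a mapping (an illustrative assumption, not the project's actual helper):

    import re

    def url_to_filename(url):
        # Illustrative only: drop the scheme and replace characters that are
        # awkward in file paths, so each URL maps to a stable repo-relative name.
        name = re.sub(r'^https?://', '', url)
        return re.sub(r'[^\w./-]', '_', name)
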
Example #2
def get_all_article_urls():
    ans = set()
    for parser in parsers.parsers:
        logger.info('Looking up %s' % parser.domains)
        urls = parser.feed_urls()
        ans = ans.union(map(canonicalize_url, urls))
    return ans
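
Note — get_all_article_urls() assumes every entry in parsers.parsers exposes a domains attribute and a feed_urls() method, and that canonicalize_url() normalizes raw URLs before they are deduplicated in the set. A minimal sketch of that contract (everything beyond the names used in the snippet is an illustrative assumption):

    def canonicalize_url(url):
        # Illustrative normalization: drop fragments and trailing slashes so the
        # same article is not stored twice under cosmetically different URLs.
        return url.split('#')[0].rstrip('/')

    class ExampleParser(object):
        # The scraper only relies on these two attributes of each parser.
        domains = ['example.com']

        @staticmethod
        def feed_urls():
            # Would normally fetch an RSS feed or front page; hard-coded here.
            return ['http://example.com/news/story-1/',
                    'http://example.com/news/story-2/']
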
Example #3
File: scraper.py  Project: toyg/newsdiffs
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception, e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
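
Note — in Examples #3 and #4 an article's priority is minutes_since_check() divided by get_update_delay(minutes_since_update()), so an article becomes due once it has waited at least one full delay interval since its last check, and the delay grows as the article goes unchanged. get_update_delay() is not shown in this listing; a hypothetical back-off schedule with that shape:

    def get_update_delay(minutes_since_update):
        # Hypothetical schedule: recheck fresh articles often, stale ones rarely.
        if minutes_since_update < 60 * 3:        # changed within the last 3 hours
            return 15
        elif minutes_since_update < 60 * 24:     # changed within the last day
            return 60
        elif minutes_since_update < 60 * 24 * 7:
            return 60 * 6
        else:
            return 60 * 24                       # at most daily after a week

    # With this schedule, an article last updated 2 hours ago and last checked
    # 20 minutes ago has priority 20 / 15 > 1, so it is due for another check.
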
Example #4
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if (update_priority(a) > 1 or do_all)], key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(), article.minutes_since_check(), update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception, e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #5
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if len(url) > 255:  #Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url, git_dir=todays_git_dir).save()
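
Note — all of these snippets lean on the same models.Article surface: a url capped at 255 characters (hence the length guard above), a git_dir, last_check / last_update timestamps, and the minutes_since_check() / minutes_since_update() helpers used to prioritize rechecks. A hypothetical Django model with just enough of that surface to make the examples read (field types, defaults, and the index are assumptions, not the project's definition):

    from datetime import datetime
    from django.db import models

    class Article(models.Model):
        # Hypothetical shape only; the real model lives in the project's
        # models.py and differs in its details.
        url = models.CharField(max_length=255, db_index=True)
        git_dir = models.CharField(max_length=255, default='')
        initial_date = models.DateTimeField(auto_now_add=True)
        last_check = models.DateTimeField(default=datetime(1970, 1, 1))
        last_update = models.DateTimeField(default=datetime(1970, 1, 1))

        def minutes_since_check(self):
            return (datetime.now() - self.last_check).total_seconds() / 60.0

        def minutes_since_update(self):
            return (datetime.now() - self.last_update).total_seconds() / 60.0
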
Example #6
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff = add_to_git_repo(to_store,
                                           url_to_filename(article.url),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(v=v,
                               boring=boring,
                               title=parsed_article.title,
                               byline=parsed_article.byline,
                               date=t,
                               article=article,
                               )
        v_row.diff_info = get_diff_info(diff)
        v_row.diff_details_json = diff
        v_row.update_severity(save=False)
        if not boring:
            article.last_update = t
        v_row.save()
        article.save()
Example #7
def update_articles():
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if len(url) > 255:  #Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url).save()
Example #8
def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    parsed_article = parser(url)
    if not parsed_article.real_article:
        return
    return parsed_article
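
Note — load_article() treats the object returned by parser(url) as a parsed article with a real_article flag, title and byline attributes, and a text representation that update_article() serializes with unicode(). A minimal sketch of that interface (purely illustrative; the real parser classes are not part of this listing):

    class ParsedArticle(object):
        # Illustrative stand-in for whatever parser(url) returns; the scraper
        # only touches the attributes and conversion shown here.
        def __init__(self, title, byline, body, real_article=True):
            self.title = title
            self.byline = byline
            self.body = body
            # False for soft 404s / index pages, which load_article() skips.
            self.real_article = real_article

        def __unicode__(self):
            # update_article() turns the whole article into one text blob
            # before committing it to the git repository.
            return u'\n'.join([self.title, self.byline, self.body])
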
Example #9
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(
        Q(last_update__gt=threshold) | Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check(
    ) * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted(
        [a for a in articles if update_priority(a) > 1 or do_all],
        key=update_priority,
        reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise

    logger.info('Done with gc!')

    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        # isn't this inherent in update_priority being > 1 above?
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception, e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s',
                             article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
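
Note — run_git_command() is invoked here as run_git_command(['gc'], models.GIT_DIR + todays_repo) but is not included in the listing. A minimal stand-in built on subprocess (an assumption about the real helper, which may add timeouts or locking):

    import subprocess

    def run_git_command(command, git_dir):
        # Runs e.g. ['gc'] as `git gc` inside git_dir and returns its output;
        # raises subprocess.CalledProcessError on a non-zero exit, whose
        # .output attribute the caller above prints before re-raising.
        return subprocess.check_output(['git'] + command, cwd=git_dir,
                                       stderr=subprocess.STDOUT)
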
Example #10
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(Q(last_update__gt=threshold) | 
                                                                         Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception, e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #11
def load_article(url):
    try:
        parser = parsers.get_parser(url)
    except KeyError:
        logger.info('Unable to parse domain, skipping')
        return
    try:
        parsed_article = parser(url)
    except (AttributeError, urllib2.HTTPError, httplib.HTTPException), e:
        if isinstance(e, urllib2.HTTPError) and e.msg == 'Gone':
            return
        logger.error('Exception when parsing %s', url)
        logger.error(traceback.format_exc())
        logger.error('Continuing')
        return
    # Success path, as in the fuller version of this function in Example #8:
    if not parsed_article.real_article:
        return
    return parsed_article
Example #12
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff_info = add_to_git_repo(to_store, article.filename(),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(
            v=v,
            boring=boring,
            title=parsed_article.title,
            byline=parsed_article.byline,
            date=t,
            article=article,
        )
        v_row.diff_info = diff_info
        v_row.save()

        if not boring:
            article.last_update = t
            article.save()

            # Notify slack
            team = models.SlackBotTeam.objects.get()
            # Note: .get() raises DoesNotExist when no row matches, so the
            # falsy check below only has an effect if this lookup is changed
            # to something like .filter(url=article.url).first().
            standalone = models.StandaloneArticle.objects.get(url=article.url)
            if standalone:
                user = standalone.added_by
                link = '/article-history/%s' % article.url
                link = '%s%s' % (settings.WEBAPP_URL, link)
                bot_text = 'Veo cambios en %s (%s)' % (article.url, link)  # Spanish: "I see changes in %s (%s)"

                if not user.startswith('web_frontend'):
                    Client = SlackClient(team.bot_access_token)
                    dm = Client.api_call(method='im.open', user=user)
                    channel = dm.get('channel').get('id')
                    channel = channel if channel else user
                    Client.api_call(method='chat.postMessage',
                                    channel=channel,
                                    text=bot_text)
Example #13
    def handle(self, *args, **options):
        ch = logging.FileHandler('/tmp/newsdiffs_logging', mode='w')
        # Note: ch is immediately rebound below, so this FileHandler is never
        # attached; only the stdout StreamHandler receives DEBUG output.
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

        ch = logging.FileHandler(ERROR_FILE_PATH, mode='a')
        # Same pattern as above: the FileHandler for ERROR_FILE_PATH is
        # overwritten before it is added, so WARNING output also goes to stdout.
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.WARNING)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

        for repo in all_git_repos():
            cleanup_git_repo(repo)

        todays_repo = get_and_make_git_repo()

        update_articles(todays_repo)
        update_versions(todays_repo, options['all'])

        logger.info('Done scraping!')
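
Note — the handle() method above attaches handlers using a module-level formatter that lies outside this excerpt. A definition consistent with how it is used (the format string is an assumption, not the project's exact one):

    import logging

    formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
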
Example #14
def update_versions(todays_repo, do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check(
    ) * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted(
        [a for a in articles if update_priority(a) > 1 or do_all],
        key=update_priority,
        reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception, e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s',
                             article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #15
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()

    v, boring, diff_info = add_to_git_repo(to_store,
                                           url_to_filename(article.url),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(v=v,
                               boring=boring,
                               title=parsed_article.title,
                               byline=parsed_article.byline,
                               date=t,
                               article=article,
                               )
        v_row.diff_info = diff_info
        v_row.save()
        if not boring:
            article.last_update = t
            article.save()
Example #16
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'])
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception, e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #17
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database' % len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s' % (i+1, len(all_urls), url))
        if len(url) > 255:  #Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding!')
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
Example #18
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database' % len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s' % (i + 1, len(all_urls), url))
        if len(url) > 255:  #Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding!')
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
Example #19
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database' % len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s' % (i + 1, len(all_urls), url))
        # Looks like it skips URLs longer than 255?
        if len(url) > 255:  #Icky hack, but otherwise they're truncated in DB.
            continue
        # Is there an index on this column?
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding Article {0}'.format(url))
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
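
Note — answering the inline question above: the existence check runs a COUNT query for every URL. In Django, .exists() expresses the same intent while letting the database stop at the first match, which matters when the url column is indexed. A drop-in alternative for the two lines above (a suggestion, not the project's code):

    # Same intent as the filter(...).count() check, but the database can stop
    # as soon as it finds one matching row:
    if not models.Article.objects.filter(url=url).exists():
        models.Article(url=url, git_dir=todays_git_dir).save()
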
Example #20
def update_articles():
    logger.info('Starting scraper; looking for new URLs')
    for url in get_all_article_urls():
        if not models.Article.objects.filter(url=url).count():
            models.Article(url=url).save()
Example #21
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception, e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s',
                             article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
    #logger.info('Ending with gc:')
    #run_git_command(['gc'])
    logger.info('Done!')


#Remove index.lock if 5 minutes old
def cleanup_git_repo(git_dir):
    for name in [
            '.git/index.lock', '.git/refs/heads/master.lock',
            '.git/gc.pid.lock'
    ]:
        fname = os.path.join(git_dir, name)
        try:
            stat = os.stat(fname)
        except OSError:
            return
        age = time.time() - stat.st_ctime
        if age > 60 * 5:
            os.remove(fname)
Example #22
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception, e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
    #logger.info('Ending with gc:')
    #run_git_command(['gc'])
    logger.info('Done!')

#Remove index.lock if 5 minutes old
def cleanup_git_repo(git_dir):
    for name in ['.git/index.lock', '.git/refs/heads/master.lock', '.git/gc.pid.lock']:
        fname = os.path.join(git_dir, name)
        try:
            stat = os.stat(fname)
        except OSError:
            return
        age = time.time() - stat.st_ctime
        if age > 60*5:
            os.remove(fname)

if __name__ == '__main__':
    print >> sys.stderr, "Try `python website/manage.py scraper`."
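
Note — the closing hint points at the Django management command that wraps the functions above. Based on the handle() method in Example #13, it can also be driven from Python; the command name 'scraper' and the all option are inferred from the snippets, not confirmed:

    from django.core.management import call_command

    # Equivalent to `python website/manage.py scraper`; pass all=True to force
    # a check of every article regardless of its update priority.
    call_command('scraper', all=True)
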