Example #1
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff_info = add_to_git_repo(to_store,
                                           url_to_filename(article.url),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(
            v=v,
            boring=boring,
            title=parsed_article.title,
            byline=parsed_article.byline,
            date=t,
            article=article,
        )
        v_row.diff_info = diff_info
        v_row.save()
        if not boring:
            article.last_update = t
            article.save()
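
Example #1 relies on a url_to_filename() helper that none of these snippets define. A plausible sketch of such a mapper (hypothetical; the real newsdiffs helper may differ) keeps the host and path so the repository mirrors the site's layout:

import urlparse  # Python 2, matching the snippets

def url_to_filename(url):
    # Hypothetical stand-in for the project's real helper.
    # e.g. 'http://www.nytimes.com/2013/01/01/a.html'
    #   -> 'www.nytimes.com/2013/01/01/a.html'
    parsed = urlparse.urlsplit(url)
    return parsed.netloc + parsed.path
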
Example #2
File: scraper.py  Project: toyg/newsdiffs
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
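
The update_priority ratio reads as "minutes since the last check, measured in units of the allowed delay", so any value above 1 means the article is overdue. A toy illustration with a made-up get_update_delay (the project's real schedule is not shown in these snippets):

def get_update_delay(minutes_since_update):
    # Hypothetical schedule: hourly checks for a fresh article, daily afterwards.
    return 60 if minutes_since_update < 60 * 24 else 60 * 24

def update_priority(since_check, since_update):
    return since_check * 1. / get_update_delay(since_update)

print update_priority(120, 30)   # 2.0 -> overdue, will be checked
print update_priority(30, 30)    # 0.5 -> skipped unless do_all

Since any article that survives the filter already has priority above 1 (or do_all is set), the later minutes_since_check() < delay guard appears redundant, which is exactly what the inline comment in Example #5 suspects.
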
Example #3
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if (update_priority(a) > 1 or do_all)], key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(), article.minutes_since_check(), update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #4
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff = add_to_git_repo(to_store,
                                      url_to_filename(article.url),
                                      article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(v=v,
                               boring=boring,
                               title=parsed_article.title,
                               byline=parsed_article.byline,
                               date=t,
                               article=article,
                               )
        v_row.diff_info = get_diff_info(diff)
        v_row.diff_details_json = diff
        v_row.update_severity(save=False)
        if not boring:
            article.last_update = t
        v_row.save()
        article.save()
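
None of the snippets show get_diff_info, and its signature drifts between versions: one argument (the raw diff) here, two (old and new text) in Example #16. A purely illustrative two-argument sketch built on difflib:

import difflib

def get_diff_info(old_text, new_text):
    # Illustrative stand-in only: summarize a change as line counts.
    added = removed = 0
    for line in difflib.ndiff(old_text.splitlines(), new_text.splitlines()):
        if line.startswith('+ '):
            added += 1
        elif line.startswith('- '):
            removed += 1
    return {'lines_added': added, 'lines_removed': removed}
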
Example #5
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(
        Q(last_update__gt=threshold) | Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)

    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted(
        [a for a in articles if update_priority(a) > 1 or do_all],
        key=update_priority,
        reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise

    logger.info('Done with gc!')

    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        # isn't this inherent in update_priority being > 1 above?
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s',
                             article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #6
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database', len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s', i + 1, len(all_urls), url)
        if len(url) > 255:  #Icky hack, but otherwise they're truncated in DB.
            continue
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding!')
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
Example #7
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database', len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s', i + 1, len(all_urls), url)
        # Looks like it skips URLs longer than 255?
        if len(url) > 255:  #Icky hack, but otherwise they're truncated in DB.
            continue
        # Is there an index on this column?
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding Article {0}'.format(url))
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
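
Both questions in the comments above come down to how the url column is declared. A hypothetical model fragment (the real newsdiffs Article model may differ) that would explain the 255-character cutoff and keep the per-URL lookup fast:

from django.db import models

class Article(models.Model):
    # max_length=255 is why longer URLs would be truncated on some backends;
    # db_index=True is what the existence check needs to stay fast.
    url = models.CharField(max_length=255, db_index=True)
    git_dir = models.CharField(max_length=255)

As an aside, models.Article.objects.filter(url=url).exists() would state the intent more directly than .count().
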
Example #8
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(Q(last_update__gt=threshold) | 
                                                                         Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #9
def is_boring(old, new):
    oldu = canonicalize(old.decode('utf8'))
    newu = canonicalize(new.decode('utf8'))

    if oldu.splitlines()[1:] == newu.splitlines()[1:]:
        return True

    for charset in CHARSET_LIST:
        try:
            if oldu.encode(charset) == new:
                logger.debug('Boring!')
                return True
        except UnicodeEncodeError:
            pass
    return False
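
The splitlines()[1:] comparison treats the first line as a volatile date stamp: two versions count as boring when everything after line one matches. A toy example:

old = u'Fetched 2013-01-01\nHeadline\nBody text'
new = u'Fetched 2013-01-02\nHeadline\nBody text'
print old.splitlines()[1:] == new.splitlines()[1:]   # True -> boring
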
Example #10
def update_versions(todays_repo, do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: (x.minutes_since_check() * 1. /
                                 get_update_delay(x.minutes_since_update()))
    articles = sorted(
        [a for a in articles if update_priority(a) > 1 or do_all],
        key=update_priority,
        reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)

        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s',
                             article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #11
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)

    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)

    logger.info('Checking %s of %s articles', len(articles), total_articles)

    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'])
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise

    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)',
                     article.minutes_since_update(),
                     article.minutes_since_check(),
                     update_priority(article), i+1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)

            logger.error(traceback.format_exc())
        article.save()
Example #12
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff_info = add_to_git_repo(to_store, article.filename(),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(
            v=v,
            boring=boring,
            title=parsed_article.title,
            byline=parsed_article.byline,
            date=t,
            article=article,
        )
        v_row.diff_info = diff_info
        v_row.save()

        if not boring:
            article.last_update = t
            article.save()

            # Notify slack
            team = models.SlackBotTeam.objects.get()
            # .get() raises DoesNotExist when no row matches; filter().first()
            # returns None, which is what the truth test below expects.
            standalone = models.StandaloneArticle.objects.filter(
                url=article.url).first()
            if standalone:
                user = standalone.added_by
                link = '/article-history/%s' % article.url
                link = '%s%s' % (settings.WEBAPP_URL, link)
                bot_text = 'Veo cambios en %s (%s)' % (article.url, link)

                if not user.startswith('web_frontend'):
                    client = SlackClient(team.bot_access_token)
                    dm = client.api_call(method='im.open', user=user)
                    # Guard against a missing 'channel' key before reading its id
                    channel = dm.get('channel', {}).get('id') or user
                    client.api_call(method='chat.postMessage',
                                    channel=channel,
                                    text=bot_text)
Example #13
def is_boring(old, new):
    oldu = canonicalize(old.decode('utf8'))
    newu = canonicalize(new.decode('utf8'))

    def extra_canonical(s):
        """Ignore changes in whitespace or the date line"""
        nondate_portion = s.split('\n', 1)[1]
        return nondate_portion.split()

    if extra_canonical(oldu) == extra_canonical(newu):
        return True

    for charset in CHARSET_LIST:
        try:
            if oldu.encode(charset) == new:
                logger.debug('Boring!')
                return True
        except UnicodeEncodeError:
            pass
    return False
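
extra_canonical goes a step further than the splitlines()[1:] check in Example #9: after dropping the date line it splits on all whitespace, so reflowed or re-wrapped text also counts as boring:

old = u'Jan 1\nThe  quick\nbrown fox'
new = u'Jan 2\nThe quick brown\nfox'
# Both reduce to ['The', 'quick', 'brown', 'fox'].
print old.split('\n', 1)[1].split() == new.split('\n', 1)[1].split()   # True
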
Example #14
def run_git_command(command, git_dir, max_timeout=15):
    """Run a git command like ['show', filename] and return the output.

    First, wait up to max_timeout seconds for the index.lock file to go away.
    If the index.lock file remains, raise an IndexLockError.

    Still have a race condition if two programs run this at the same time.
    """
    end_time = time.time() + max_timeout
    delay = 0.1
    lock_file = os.path.join(git_dir, '.git/index.lock')
    while os.path.exists(lock_file):
        if time.time() < end_time - delay:
            time.sleep(delay)
        else:
            raise IndexLockError('Git index.lock file exists for %s seconds' %
                                 max_timeout)

    logger.debug('GIT call: %s' % command)
    output = subprocess.check_output([GIT_PROGRAM] + command,
                                     cwd=git_dir,
                                     stderr=subprocess.STDOUT)
    return output
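
A usage sketch (the repository path and filename are made up):

head = run_git_command(['rev-parse', 'HEAD'], '/var/newsdiffs/articles').strip()
body = run_git_command(['show', 'HEAD:www.example.com/index.html'],
                       '/var/newsdiffs/articles')
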
Example #15
def is_boring(old, new):
    oldu = canonicalize(old.decode('utf8'))
    newu = canonicalize(new.decode('utf8'))

    def extra_canonical(s):
        """Ignore changes in whitespace or the date line"""
        # This is fragile: depending on the text looking a particular way!
        nondate_portion = s.split('\n', 1)[1]
        return nondate_portion.split()

    if extra_canonical(oldu) == extra_canonical(newu):
        return True

    # This seems kind of fragile.  Are we 100% sure that differences between
    # these encodings are unimportant?  Also, how does this relate to non-latin
    # text?
    for charset in CHARSET_LIST:
        try:
            if oldu.encode(charset) == new:
                logger.debug('Boring!')
                return True
        except UnicodeEncodeError:
            pass
    return False
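
On the non-Latin question in the comment: single-byte charsets such as iso-8859-1 simply cannot encode such text, so the UnicodeEncodeError branch skips them and the function falls through to return False. A toy demonstration (assuming iso-8859-1 appears in CHARSET_LIST):

old_unicode = u'Caf\xe9 society'
new_bytes = old_unicode.encode('iso-8859-1')         # page re-served in latin-1
print old_unicode.encode('utf8') == new_bytes        # False: raw bytes differ
print old_unicode.encode('iso-8859-1') == new_bytes  # True -> boring

try:
    u'\u4e2d\u6587'.encode('iso-8859-1')  # Chinese text has no latin-1 form
except UnicodeEncodeError:
    pass  # the loop simply tries the next charset
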
Example #16
def add_to_git_repo(data, filename, article):
    start_time = time.time()

    #Don't use full path because it can exceed the maximum filename length
    #full_path = os.path.join(models.GIT_DIR, filename)
    os.chdir(article.full_git_dir)
    mkdir_p(os.path.dirname(filename))

    boring = False
    diff_info = None

    try:
        previous = run_git_command(['show', 'HEAD:' + filename],
                                   article.full_git_dir)
    except subprocess.CalledProcessError as e:
        if (e.output.endswith("does not exist in 'HEAD'\n")
                or e.output.endswith("exists on disk, but not in 'HEAD'.\n")):
            already_exists = False
        else:
            raise
    else:
        already_exists = True

    with open(filename, 'w') as f:
        f.write(data)

    if already_exists:
        if previous == data:
            logger.debug('Article matches current version in repo')
            return None, None, None

        #Now check how many times this same version has appeared before
        my_hash = run_git_command(['hash-object', filename],
                                  article.full_git_dir).strip()

        commits = [v.v for v in article.versions()]
        if len(commits) > 2:
            logger.debug('Checking for duplicates among %s commits',
                         len(commits))

            def get_hash(version):
                """Return the SHA1 hash of filename in a given version"""
                output = run_git_command(['ls-tree', '-r', version, filename],
                                         article.full_git_dir)
                return output.split()[2]

            hashes = map(get_hash, commits)

            number_equal = sum(1 for h in hashes if h == my_hash)

            logger.debug('Got %s', number_equal)

            if number_equal >= 2:  #Refuse to list a version more than twice
                run_git_command(['checkout', filename], article.full_git_dir)
                return None, None, None

        if is_boring(previous, data):
            boring = True
        else:
            diff_info = get_diff_info(previous, data)

    run_git_command(['add', filename], article.full_git_dir)
    if not already_exists:
        commit_message = 'Adding file %s' % filename
    else:
        commit_message = 'Change to %s' % filename
    logger.debug('Running git commit... %s', time.time() - start_time)
    run_git_command(['commit', filename, '-m', commit_message],
                    article.full_git_dir)
    logger.debug('git revlist... %s', time.time() - start_time)

    # Now figure out what the commit ID was.
    # I would like this to be "git rev-list HEAD -n1 filename"
    # unfortunately, this command is slow: it doesn't abort after the
    # first line is output.  Without filename, it does abort; therefore
    # we do this and hope no intervening commit occurs.
    # (looks like the slowness is fixed in git HEAD)
    v = run_git_command(['rev-list', 'HEAD', '-n1'],
                        article.full_git_dir).strip()
    logger.debug('done %s', time.time() - start_time)
    return v, boring, diff_info
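
The rev-list lookup at the end tolerates a race with any intervening commit. git rev-parse HEAD resolves HEAD to its commit ID directly, without walking history, so an equivalent lookup (same race caveat) would be:

v = run_git_command(['rev-parse', 'HEAD'], article.full_git_dir).strip()
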
Example #17
def add_to_git_repo(data, filename, article):
    start_time = time.time()

    #Don't use full path because it can exceed the maximum filename length
    #full_path = os.path.join(models.GIT_DIR, filename)
    os.chdir(article.full_git_dir)
    mkdir_p(os.path.dirname(filename))

    boring = False
    diff_info = None

    try:
        previous = run_git_command(['show', 'HEAD:'+filename], article.full_git_dir)
    except subprocess.CalledProcessError as e:
        if (e.output.endswith("does not exist in 'HEAD'\n") or
            e.output.endswith("exists on disk, but not in 'HEAD'.\n")):
            already_exists = False
        else:
            raise
    else:
        already_exists = True


    with open(filename, 'w') as f:
        f.write(data)

    if already_exists:
        if previous == data:
            logger.debug('Article matches current version in repo')
            return None, None, None

        #Now check how many times this same version has appeared before
        my_hash = run_git_command(['hash-object', filename],
                                  article.full_git_dir).strip()

        commits = [v.v for v in article.versions()]
        if len(commits) > 2:
            logger.debug('Checking for duplicates among %s commits',
                          len(commits))
            def get_hash(version):
                """Return the SHA1 hash of filename in a given version"""
                output = run_git_command(['ls-tree', '-r', version, filename],
                                         article.full_git_dir)
                return output.split()[2]
            hashes = map(get_hash, commits)

            number_equal = sum(1 for h in hashes if h == my_hash)

            logger.debug('Got %s', number_equal)

            if number_equal >= 2: #Refuse to list a version more than twice
                run_git_command(['checkout', filename], article.full_git_dir)
                return None, None, None

        if is_boring(previous, data):
            boring = True
        else:
            diff_info = get_diff_info(previous, data)

    run_git_command(['add', filename], article.full_git_dir)
    if not already_exists:
        commit_message = 'Adding file %s' % filename
    else:
        commit_message = 'Change to %s' % filename
    logger.debug('Running git commit... %s', time.time()-start_time)
    run_git_command(['commit', filename, '-m', commit_message],
                    article.full_git_dir)
    logger.debug('git revlist... %s', time.time()-start_time)

    # Now figure out what the commit ID was.
    # I would like this to be "git rev-list HEAD -n1 filename"
    # unfortunately, this command is slow: it doesn't abort after the
    # first line is output.  Without filename, it does abort; therefore
    # we do this and hope no intervening commit occurs.
    # (looks like the slowness is fixed in git HEAD)
    v = run_git_command(['rev-list', 'HEAD', '-n1'],
                        article.full_git_dir).strip()
    logger.debug('done %s', time.time()-start_time)
    return v, boring, diff_info