Пример #1
0
def remove_duplicates(by='pmid'):
    """Delete every article after the first that shares a field value.

    :param str by: Article field whose repeated values mark duplicates

    """
    # Tally how often each value of the field appears in the underlying
    # store; only the ``by`` field is requested from the store.
    occurrences = collections.Counter(
        record[by]
        for record in Article._storage[0].store.find({}, {by: 1})
    )

    repeated = [key for key, total in occurrences.items() if total != 1]
    for key in repeated:
        matches = list(Article.find(Q(by, 'eq', key)))
        # The first match is kept; each later match is removed.
        for extra in matches[1:]:
            logger.debug('Deleting duplicate record: {}'.format(key))
            Article.remove_one(extra)
Пример #2
0
def batch_rescrape(processes, query=None, limit=None, **kwargs):
    """Rescrape articles in parallel using a pool of worker processes.

    :param int processes: Number of worker processes in the pool
    :param query: Query handed to ``Article.find``; ``None`` is passed
        through unchanged
    :param limit: Maximum number of articles to process; a falsy value
        applies no limit
    :param kwargs: Extra keyword arguments forwarded to each ``rescrape``
        call
    """
    pool = multiprocessing.Pool(processes=processes)
    try:
        articles = Article.find(query)
        if limit:
            articles = articles.limit(limit)
        # Workers receive only article IDs, not the article objects.
        pool.map(
            functools.partial(rescrape, **kwargs),
            (article._id for article in articles),
        )
    except BaseException:
        # Stop outstanding work right away instead of waiting for it.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Always reap the worker processes so none are left behind.
        pool.join()
Пример #3
0
def batch_rescrape(processes, query=None, limit=None, overwrite=False):
    """Rescrape articles in parallel using a pool of worker processes.

    :param int processes: Number of worker processes in the pool
    :param query: Query handed to ``Article.find``; ``None`` is passed
        through unchanged
    :param limit: Maximum number of articles to process; a falsy value
        applies no limit
    :param overwrite: Flag stored in every ``RescrapeCommand`` sent to the
        workers
    """
    pool = multiprocessing.Pool(processes=processes)
    try:
        articles = Article.find(query)
        if limit:
            articles = articles.limit(limit)
        # Each task is a command built from the article ID and the flag.
        pool.map(
            rescrape,
            (RescrapeCommand(article._id, overwrite) for article in articles),
        )
    except BaseException:
        # Stop outstanding work right away instead of waiting for it.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Always reap the worker processes so none are left behind.
        pool.join()
Пример #4
0
def batch_rescrape(processes, query=None, limit=None, **kwargs):
    """Rescrape articles in parallel using a pool of worker processes.

    :param int processes: Number of worker processes in the pool
    :param query: Query handed to ``Article.find``; ``None`` is passed
        through unchanged
    :param limit: Maximum number of articles to process; a falsy value
        applies no limit
    :param kwargs: Extra keyword arguments forwarded to each ``rescrape``
        call
    """
    pool = multiprocessing.Pool(processes=processes)
    try:
        articles = Article.find(query)
        if limit:
            articles = articles.limit(limit)
        # Workers receive only article IDs, not the article objects.
        pool.map(
            functools.partial(rescrape, **kwargs),
            (article._id for article in articles),
        )
    except BaseException:
        # Stop outstanding work right away instead of waiting for it.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Always reap the worker processes so none are left behind.
        pool.join()
Пример #5
0
def remove_duplicates(by='pmid'):
    """Keep one article per field value and delete the rest.

    :param str by: Article field whose repeated values mark duplicates
    """
    # Count occurrences of each value as read from the underlying store.
    stored = Article._storage[0].store.find({}, {by: 1})
    tally = collections.Counter(document[by] for document in stored)

    for field_value, seen in tally.items():
        if seen != 1:
            found = list(Article.find(Q(by, 'eq', field_value)))
            # Everything after the first matching article is removed.
            for surplus in found[1:]:
                logger.debug(
                    'Deleting duplicate record: {}'.format(field_value)
                )
                Article.remove_one(surplus)
Пример #6
0
def count_verified(threshold=VERIFY_THRESHOLD):
    """Count the number of downloaded and verified documents across all
    articles.

    :param float threshold: Document verification threshold; a document is
        counted as verified only when its score is strictly greater
    :return: Tuple of total and verified dictionaries, each mapping document
        types to counts
    """
    count = defaultdict(int)
    verified = defaultdict(int)

    for article in Article.find():
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
        for type_, field in DOCUMENT_TYPES_TO_FIELDS.items():
            value = getattr(article, field)
            if not value:
                continue
            count[type_] += 1
            score = value.verification_score
            # A ``None`` score is never verified. Python 2 already compared
            # it as smaller than any number; Python 3 would raise instead.
            if score is not None and score > threshold:
                verified[type_] += 1

    return count, verified
Пример #7
0
def count_verified(threshold=VERIFY_THRESHOLD):
    """Count the number of downloaded and verified documents across all
    articles.

    :param float threshold: Document verification threshold; a document is
        counted as verified only when its score is strictly greater
    :return: Tuple of total and verified dictionaries, each mapping document
        types to counts
    """
    count = defaultdict(int)
    verified = defaultdict(int)

    for article in Article.find():
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
        for type_, field in DOCUMENT_TYPES_TO_FIELDS.items():
            value = getattr(article, field)
            if not value:
                continue
            count[type_] += 1
            score = value.verification_score
            # A ``None`` score is never verified. Python 2 already compared
            # it as smaller than any number; Python 3 would raise instead.
            if score is not None and score > threshold:
                verified[type_] += 1

    return count, verified
Пример #8
0
def update_dates(overwrite=False):
    """Refresh and save the date of stored articles.

    :param bool overwrite: When true, process every article; otherwise only
        articles whose ``date`` equals ``None``
    """
    if overwrite:
        selector = None
    else:
        selector = Q('date', 'eq', None)

    for record in Article.find(selector):
        record.update_date()
        record.save()