import collections
import logging
import multiprocessing
import random

# Project-level names (Article, Q, pubtools, mongo, and the configuration
# constants SCRAPE_CLASS, SCRAPE_KWARGS, VERIFY_THRESHOLD, and
# DOCUMENT_TYPES_TO_FIELDS) are assumed to be imported from the surrounding
# package.

logger = logging.getLogger(__name__)


def remove_duplicates(by='pmid'):
    """Remove duplicate articles by field, keeping the first record found.

    :param str by: Article field to identify duplicates
    """
    # Count occurrences of each field value across all stored articles.
    counts = collections.Counter(
        record[by]
        for record in Article._storage[0].store.find({}, {by: 1})
    )
    for value, count in counts.items():
        if count == 1:
            continue
        # Keep the first matching article; delete the rest.
        articles = list(Article.find(Q(by, 'eq', value)))
        for duplicate in articles[1:]:
            logger.debug('Deleting duplicate record: {0}'.format(value))
            Article.remove_one(duplicate)
def add_missing(query, max_count, randomize=False):
    """Search PubMed for articles missing from the database and scrape their
    documents.

    :param str query: PubMed query
    :param int max_count: Maximum number of articles to process
    :param bool randomize: Sample articles to fetch at random rather than in
        storage order
    :return: Added article objects
    """
    pmids = pubtools.search_pmids(query)
    stored_pmids = [
        article['pmid']
        for article in mongo['article'].find({}, {'pmid': 1})
    ]
    missing_pmids = list(set(pmids) - set(stored_pmids))
    logger.warning('Found {0} articles to add.'.format(len(missing_pmids)))
    # Shuffle before truncating so that `randomize` selects a random subset
    # rather than merely reordering the first `max_count` PMIDs.
    if randomize:
        random.shuffle(missing_pmids)
    pmids_to_add = missing_pmids[:max_count]
    records = pubtools.download_pmids(pmids_to_add)
    scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)
    added = []
    for pmid, record in zip(pmids_to_add, records):
        logger.debug('Adding article {0}'.format(pmid))
        article = Article.from_record(record)
        article.scrape(scraper)
        article.tag()
        added.append(article)
    return added
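# Example invocation (hypothetical query; standard PubMed query syntax):
#
#     added = add_missing('"PLOS ONE"[journal]', max_count=100, randomize=True)
#     logger.info('Added {0} articles'.format(len(added)))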
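# The command objects consumed by `rescrape` and `retag` below are not defined
# in this section; minimal namedtuple stand-ins are sketched here on the
# assumption that they only need to carry the per-task arguments and remain
# picklable for multiprocessing. The originals may differ.
RescrapeCommand = collections.namedtuple('RescrapeCommand', ['article_id', 'overwrite'])
RetagCommand = collections.namedtuple('RetagCommand', ['article_id', 'overwrite'])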
def rescrape(command):
    """Re-scrape documents for a single article. Takes a single command
    object so that it can be mapped over a multiprocessing pool.

    :param RescrapeCommand command: Article ID and overwrite flag
    """
    logger.info('Re-scraping article {0}'.format(command.article_id))
    try:
        article = Article.load(command.article_id)
        article.scrape(overwrite=command.overwrite)
    except Exception as error:
        logger.error('Error scraping article {0}'.format(command.article_id))
        logger.exception(error)
def retag(command):
    """Re-tag a single article. Takes a single command object so that it can
    be mapped over a multiprocessing pool.

    :param RetagCommand command: Article ID and overwrite flag
    """
    logger.info('Re-tagging article {0}'.format(command.article_id))
    try:
        article = Article.load(command.article_id)
        article.tag(overwrite=command.overwrite)
    except Exception as error:
        logger.error('Error tagging article {0}'.format(command.article_id))
        logger.exception(error)
def batch_rescrape(processes, query=None, limit=None, overwrite=False):
    """Re-scrape articles in parallel across `processes` worker processes,
    optionally restricted by `query` and capped at `limit` articles.
    """
    pool = multiprocessing.Pool(processes=processes)
    articles = Article.find(query)
    if limit:
        articles = articles.limit(limit)
    # Wrap each article ID in a picklable command object; `pool.map` passes
    # exactly one argument to the worker function.
    results = pool.map(
        rescrape,
        [RescrapeCommand(article._id, overwrite) for article in articles],
    )
    pool.close()
    pool.join()
    return results
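# Example invocation (hypothetical query; assumes the `Q` query syntax used
# elsewhere in this module):
#
#     batch_rescrape(4, query=Q('date', 'eq', None), limit=500, overwrite=True)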
def count_verified(threshold=VERIFY_THRESHOLD):
    """Count downloaded and verified documents across all articles.

    :param float threshold: Document verification threshold
    :return: Tuple of total and verified dictionaries, each mapping document
        types to counts
    """
    count = collections.defaultdict(int)
    verified = collections.defaultdict(int)
    for article in Article.find():
        for type_, field in DOCUMENT_TYPES_TO_FIELDS.items():
            value = getattr(article, field)
            if value:
                count[type_] += 1
                if value.verification_score > threshold:
                    verified[type_] += 1
    return count, verified
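# For reference, `DOCUMENT_TYPES_TO_FIELDS` is assumed to map a document type
# to the `Article` attribute holding that document, and each document is
# assumed to expose a `verification_score`. A hypothetical shape:
#
#     DOCUMENT_TYPES_TO_FIELDS = {
#         'html': 'html_document',
#         'pdf': 'pdf_document',
#     }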
def update_dates(overwrite=False):
    """Recompute publication dates for stored articles.

    :param bool overwrite: Recompute dates for all articles; if false, only
        for articles with no date
    """
    query = None if overwrite else Q('date', 'eq', None)
    for article in Article.find(query):
        article.update_date()
        article.save()
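# A typical maintenance pass might chain these tasks; a sketch (arguments are
# illustrative, not prescriptive):
#
#     add_missing('"open access"[filter]', max_count=200)
#     remove_duplicates(by='pmid')
#     update_dates()
#     totals, verified = count_verified()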