def add_missing(query, max_count, randomize=False):
    """Search PubMed for articles and scrape documents.

    :param str query: PubMed query
    :param int max_count: Maximum number of articles to process
    :param bool randomize: Randomize list of articles to fetch
    :return list: Added article objects
    """
    pmids = pubtools.search_pmids(query)
    stored_pmids = [
        article['pmid']
        for article in mongo['article'].find({}, {'pmid': 1})
    ]
    missing_pmids = set(pmids) - set(stored_pmids)
    # `Logger.warn` is a deprecated alias; `warning` is the supported name.
    logger.warning('Found {0} articles to add.'.format(len(missing_pmids)))
    # NOTE(review): sets are unordered, so even without `randomize` the
    # truncated selection below is arbitrary across runs.
    pmids_to_add = list(missing_pmids)[:max_count]
    if randomize:
        random.shuffle(pmids_to_add)
    records = pubtools.download_pmids(pmids_to_add)
    scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)
    added = []
    for pmid, record in zip(pmids_to_add, records):
        logger.debug('Adding article {}'.format(pmid))
        article = Article.from_record(record)
        article.scrape(scraper)
        article.tag()
        added.append(article)
    return added
def update(cls, query, max_count):
    """Search PubMed for articles and scrape documents.

    :param str query: PubMed query
    :param int max_count: Maximum number of articles to process
    :return list: Added article objects
    """
    pmids = pubtools.search_pmids(query)
    stored_pmids = [
        article['pmid']
        for article in mongo['article'].find({}, {'pmid': 1})
    ]
    # NOTE(review): set difference is unordered, so the truncated selection
    # below is arbitrary across runs.
    pmids_to_add = list(set(pmids) - set(stored_pmids))[:max_count]
    records = pubtools.download_pmids(pmids_to_add)
    scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)
    added = []
    for pmid, record in zip(pmids_to_add, records):
        # Use the module-level `logger` (as `add_missing` does) rather than
        # the root logger via `logging.debug`.
        logger.debug('Adding article {}'.format(pmid))
        article = Article.from_record(record)
        article.scrape(scraper)
        added.append(article)
    return added
def add_missing(query, max_count, randomize=False):
    """Search PubMed for articles and scrape documents.

    :param str query: PubMed query
    :param int max_count: Maximum number of articles to process
    :param bool randomize: Randomize list of articles to fetch
    :return list: Added article objects
    """
    pmids = pubtools.search_pmids(query)
    stored_pmids = [
        article['pmid']
        for article in mongo['article'].find({}, {'pmid': 1})
    ]
    missing_pmids = set(pmids) - set(stored_pmids)
    # `Logger.warn` is a deprecated alias; `warning` is the supported name.
    logger.warning('Found {0} articles to add.'.format(len(missing_pmids)))
    # NOTE(review): sets are unordered, so even without `randomize` the
    # truncated selection below is arbitrary across runs.
    pmids_to_add = list(missing_pmids)[:max_count]
    if randomize:
        random.shuffle(pmids_to_add)
    records = pubtools.download_pmids(pmids_to_add)
    scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)
    added = []
    for pmid, record in zip(pmids_to_add, records):
        logger.debug('Adding article {}'.format(pmid))
        article = Article.from_record(record)
        article.scrape(scraper)
        article.tag()
        added.append(article)
    return added
def test_search_pmids(self):
    """search_pmids should raise EntrezEmailError for this query.

    Presumably the error fires because no Entrez contact email is
    configured in this test setup -- TODO confirm against the fixture.
    (The previous docstring described the success-path test and did not
    match this body, which asserts that the exception is raised.)
    """
    with assert_raises(pubtools.EntrezEmailError):
        pubtools.search_pmids('1[uid]')
def test_search_pmids(self):
    """A PMID lookup via the `[uid]` field should round-trip the PMID."""
    result = pubtools.search_pmids('1[uid]')
    assert_equal(result, ['1'])