def add_missing(query, max_count, randomize=False):
    """Search PubMed for articles and scrape documents.

    :param str query: PubMed query
    :param int max_count: Maximum number of articles to process
    :param bool randomize: Randomize list of articles to fetch
    :return: Added article objects
    """
    pmids = pubtools.search_pmids(query)
    stored_pmids = [
        article['pmid']
        for article in mongo['article'].find({}, {'pmid': 1})
    ]
    missing_pmids = list(set(pmids) - set(stored_pmids))
    logger.warning('Found {0} articles to add.'.format(len(missing_pmids)))
    # Shuffle before truncating so that a random subset of the missing
    # articles is fetched, not just a reordering of the first `max_count`
    if randomize:
        random.shuffle(missing_pmids)
    pmids_to_add = missing_pmids[:max_count]
    records = pubtools.download_pmids(pmids_to_add)
    scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)
    added = []
    for pmid, record in zip(pmids_to_add, records):
        logger.debug('Adding article {}'.format(pmid))
        article = Article.from_record(record)
        article.scrape(scraper)
        article.tag()
        added.append(article)
    return added
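# Hypothetical usage sketch for `add_missing`: the query string and count
# below are illustrative only, and assume a module where `mongo`, `logger`,
# and the SCRAPE_* settings are already configured.
if __name__ == '__main__':
    added = add_missing('fmri[tiab] AND language[tiab]', 50, randomize=True)
    print('Added {0} new articles'.format(len(added)))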
def update(cls, query, max_count):
    """Search PubMed for articles and scrape documents.

    :param str query: PubMed query
    :param int max_count: Maximum number of articles to process
    :return: Added article objects
    :rtype: list
    """
    pmids = pubtools.search_pmids(query)
    stored_pmids = [
        article['pmid']
        for article in mongo['article'].find({}, {'pmid': 1})
    ]
    pmids_to_add = list(set(pmids) - set(stored_pmids))[:max_count]
    records = pubtools.download_pmids(pmids_to_add)
    scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)
    added = []
    for pmid, record in zip(pmids_to_add, records):
        logging.debug('Adding article {}'.format(pmid))
        article = Article.from_record(record)
        article.scrape(scraper)
        added.append(article)
    return added
def test_download_pmids(self):
    """Downloading articles by PMID should return correct MEDLINE data."""
    records = pubtools.download_pmids(['1', '2', '3'], chunk_size=2)
    assert_equal(len(records), 3)
    assert_equal(records[0], DATA_PMID[0])
    assert_equal(records[1], DATA_PMID[1])
    assert_equal(records[2], DATA_PMID[2])
def test_download_pmids(self):
    """Downloading articles by PMID should return correct MEDLINE data."""
    records = pubtools.download_pmids(["1", "2", "3"], chunk_size=2)
    assert_equal(len(records), 3)
    assert_equal(records[0], DATA_PMID_1)
    assert_equal(records[1], DATA_PMID_2)
    assert_equal(records[2], DATA_PMID_3)
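# The tests above pass `chunk_size=2`, implying that `download_pmids`
# batches its Entrez requests. A minimal sketch of that chunking pattern
# using Biopython; the function name and defaults here are illustrative,
# not the actual pubtools implementation.
from Bio import Entrez, Medline

Entrez.email = 'you@example.com'  # NCBI requires a contact email

def download_in_chunks(pmids, chunk_size=200):
    """Fetch MEDLINE records for `pmids`, `chunk_size` IDs per request."""
    records = []
    for start in range(0, len(pmids), chunk_size):
        chunk = pmids[start:start + chunk_size]
        handle = Entrez.efetch(
            db='pubmed',
            id=','.join(chunk),
            rettype='medline',
            retmode='text',
        )
        # Medline.parse yields dict-like records; consume before closing
        records.extend(list(Medline.parse(handle)))
        handle.close()
    return records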
def _resolve_pmid(self, pmid):
    """Follow PMID link, store HTML, and return final URL."""
    # Get DOI from PubMed API
    pub_data = pubtools.download_pmids([pmid])[0]
    doi = pubtools.record_to_doi(pub_data)
    if doi:
        return self._resolve_doi(doi)
    pub_link = pubtools.pmid_to_publisher_link(pmid)
    # Follow publisher link
    if pub_link:
        # Browse to link
        self.browser.open(pub_link)
        # Read documents and save in ScrapeInfo
        self.info.init_html, self.info.init_qhtml = self.browser.get_docs()
        # Return URL
        return self.browser.geturl()
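# `_resolve_doi` is not shown above; conceptually it dereferences the DOI
# through the doi.org resolver to reach the publisher page. A standalone
# sketch with `requests` (not the class's browser-based implementation):
import requests

def resolve_doi(doi):
    """Return the publisher URL that a DOI resolves to."""
    response = requests.get('https://doi.org/{0}'.format(doi))
    response.raise_for_status()
    # requests follows redirects by default; `url` is the final location
    return response.url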
def from_pmid(cls, pmid):
    """Construct an article from the MEDLINE record for `pmid`."""
    records = pubtools.download_pmids([pmid])
    return cls.from_record(records[0])
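# Usage sketch, assuming `from_pmid` is a classmethod on the `Article`
# class used above; the PMID is a placeholder, not a real article ID.
article = Article.from_pmid('12345')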
def test_download_pmids(self):
    """Downloading articles by PMID without a configured Entrez email
    should raise EntrezEmailError.
    """
    with assert_raises(pubtools.EntrezEmailError):
        pubtools.download_pmids(['1', '2', '3'], chunk_size=2)