Example #1
def add_missing(query, max_count, randomize=False):
    """Search PubMed for articles and scrape documents.

    :param str query: PubMed query
    :param int max_count: Maximum number of articles to process
    :param bool randomize: Randomize list of articles to fetch
    :return: Added article objects
    """
    pmids = pubtools.search_pmids(query)
    stored_pmids = [
        article['pmid'] for article in mongo['article'].find({}, {'pmid': 1})
    ]

    missing_pmids = set(pmids) - set(stored_pmids)
    logger.warning('Found {0} articles to add.'.format(len(missing_pmids)))

    pmids_to_add = list(missing_pmids)[:max_count]
    if randomize:
        random.shuffle(pmids_to_add)

    records = pubtools.download_pmids(pmids_to_add)

    scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)

    added = []

    for pmid, record in zip(pmids_to_add, records):
        logger.debug('Adding article {}'.format(pmid))
        article = Article.from_record(record)
        article.scrape(scraper)
        article.tag()
        added.append(article)

    return added
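
A possible invocation (a sketch only: the query string and limit are placeholder values, and the surrounding module is assumed to have configured mongo, logger, SCRAPE_CLASS, and SCRAPE_KWARGS):

# Illustrative call; the query string and max_count are placeholders.
new_articles = add_missing('machine learning[Title]', max_count=25, randomize=True)
print('Added {0} new articles'.format(len(new_articles)))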
Example #2
    def update(cls, query, max_count):
        """Search PubMed for articles and scrape documents.

        :param str query: PubMed query
        :param int max_count: Maximum number of articles to process
        :return: Added article objects
        :rtype: list

        """
        pmids = pubtools.search_pmids(query)
        stored_pmids = [
            article['pmid']
            for article in mongo['article'].find(
                {}, {'pmid': 1}
            )
        ]

        pmids_to_add = set(pmids) - set(stored_pmids)
        pmids_to_add = list(pmids_to_add)[:max_count]

        records = pubtools.download_pmids(pmids_to_add)

        scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)

        added = []

        for pmid, record in zip(pmids_to_add, records):
            logging.debug('Adding article {}'.format(pmid))
            article = Article.from_record(record)
            article.scrape(scraper)
            added.append(article)

        return added
Example #3
    def test_download_pmids(self):
        """ Downloading articles by PMID should return correct MEDLINE data. """

        records = pubtools.download_pmids(['1', '2', '3'], chunk_size=2)

        assert_equal(len(records), 3)

        assert_equal(records[0], DATA_PMID[0])
        assert_equal(records[1], DATA_PMID[1])
        assert_equal(records[2], DATA_PMID[2])
Example #4
    def test_download_pmids(self):
        """ Downloading articles by PMID should return correct MEDLINE data. """

        records = pubtools.download_pmids(["1", "2", "3"], chunk_size=2)

        assert_equal(len(records), 3)

        assert_equal(records[0], DATA_PMID_1)
        assert_equal(records[1], DATA_PMID_2)
        assert_equal(records[2], DATA_PMID_3)
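
Both tests call download_pmids with chunk_size=2 and expect all three records back in their original order. A minimal sketch of that chunking pattern, assuming batched fetching with order-preserving concatenation; fetch_batch is a hypothetical stand-in, not the actual pubtools internals:

def download_in_chunks(pmids, fetch_batch, chunk_size=20):
    # Fetch PMIDs in batches of `chunk_size` and concatenate the results,
    # preserving the order of the input list.
    records = []
    for start in range(0, len(pmids), chunk_size):
        records.extend(fetch_batch(pmids[start:start + chunk_size]))
    return records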
Example #5
    def _resolve_pmid(self, pmid):
        """Follow PMID link, store HTML, and return final URL."""

        # Get DOI from PubMed API
        pub_data = pubtools.download_pmids([pmid])[0]
        doi = pubtools.record_to_doi(pub_data)
        if doi:
            return self._resolve_doi(doi)

        pub_link = pubtools.pmid_to_publisher_link(pmid)

        # Follow publisher link
        if pub_link:

            # Browse to link
            self.browser.open(pub_link)

            # Read documents and save in ScrapeInfo
            self.info.init_html, self.info.init_qhtml = self.browser.get_docs()

            # Return URL
            return self.browser.geturl()
Example #6
    def from_pmid(cls, pmid):
        records = pubtools.download_pmids([pmid])
        return cls.from_record(records[0])
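
A hedged usage sketch, assuming from_pmid is exposed as a classmethod on Article (the cls parameter suggests a @classmethod decorator just above this excerpt); the PMID value is a placeholder:

article = Article.from_pmid('12345678')  # placeholder PMID, illustrative only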
Example #7
    def test_download_pmids(self):
        """ Downloading articles by PMID should return correct MEDLINE data. """

        with assert_raises(pubtools.EntrezEmailError):
            pubtools.download_pmids(['1', '2', '3'], chunk_size=2)
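
This test expects EntrezEmailError when no contact e-mail has been configured for Entrez. A sketch of the guard pattern the test implies, not the actual pubtools code; the ENTREZ_EMAIL attribute name is an assumption:

def download_pmids_checked(pmids, chunk_size=20):
    # Hypothetical wrapper: refuse to call the Entrez API without a contact e-mail.
    if not getattr(pubtools, 'ENTREZ_EMAIL', None):
        raise pubtools.EntrezEmailError('Set a contact e-mail before downloading PMIDs.')
    return pubtools.download_pmids(pmids, chunk_size=chunk_size)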