Example #1
0
    def test_detect(self, name, publisher, start_url):
        """Test that publishers are correctly detected.

        Opens ``start_url`` in the test browser, pulls the initial HTML,
        and asserts that ``pubdet.pubdet`` identifies the expected
        publisher from it.

        :param name: Test name; used by nose_parameterized
        :param publisher: Correct publisher name
        :param start_url: Starting URL

        """
        self.browser.open(start_url)
        # init_qhtml is unused here; get_docs() returns both forms
        init_html, init_qhtml = self.browser.get_docs()
        assert_equal(publisher, pubdet.pubdet(init_html))
Example #2
0
    def scrape(self, doi=None, pmid=None, fetch_pmid=True, fetch_types=None):
        """Download documents for a target article.

        Resolves a publisher landing page from the DOI and/or PMID,
        detects the publisher, then attempts to fetch each requested
        document type, recording a per-type status in ``self.info``.

        :param doi: Article DOI
        :param pmid: Article PubMed ID
        :param fetch_pmid: Look up PMID if not provided
        :param fetch_types: Optional collection of document types to
            fetch; when falsy, every type in ``getter_map`` is attempted
        :return: ScrapeInfo instance
        :raises ScrapeError: If no publisher link can be found

        """
        logger.info('Fetching article with DOI={0}, PMID={1}'.format(
            doi, pmid,
        ))
        # Initialize ScrapeInfo object to store results
        self.info = ScrapeInfo(doi, pmid)

        # Get publisher link
        pub_link = None
        if doi:
            try:
                pub_link = self._resolve_doi(doi)
            except BadDOIError:
                logger.info('Could not resolve DOI {}'.format(doi))
            if not pmid and fetch_pmid:
                logger.info('Looking up PMID by DOI')
                self.info.pmid = pmid_doi.pmid_doi({'doi': doi})['pmid']
        # Fall back to PMID resolution only if the DOI didn't yield a link
        if pmid and not pub_link:
            pub_link = self._resolve_pmid(pmid)

        # Quit if no publisher link found
        if not pub_link:
            raise ScrapeError('No publisher link found')

        # Log publisher link to ScrapeInfo
        self.info.pub_link = pub_link

        # Detect publisher from the landing-page HTML captured during
        # link resolution (init_html is set by the _resolve_* helpers —
        # not visible in this block)
        self.info.publisher = pubdet.pubdet(self.info.init_html)

        # Get documents
        for doc_type in getter_map:

            # Skip documents not to be included
            if fetch_types and doc_type not in fetch_types:
                continue

            # Identify getter
            getter_class = getter_map[doc_type][self.info.publisher]

            # Skip if getter is set to false-y
            if not getter_class:
                continue

            # Construct getter
            getter = getter_class()

            # Browse to publisher link
            if self.browser.geturl() != pub_link:
                try:
                    self.browser.reopen(pub_link)
                except URLError:
                    # Record the timeout and skip this document type.
                    # Without the continue, execution fell through and
                    # the 'Timeout' status was always overwritten below.
                    self.info.status[doc_type] = 'Timeout'
                    continue

            # Get document
            try:
                # Return value of reget() was previously bound to an
                # unused local; success/failure is signaled by exception
                getter.reget(self.info, self.browser)
                self.info.docs[doc_type] = self.info.html
                self.info.status[doc_type] = 'Success'
            except Exception as error:
                # Record the failure per document type rather than
                # aborting the whole scrape
                self.info.status[doc_type] = repr(error)

        # Return ScrapeInfo object
        return self.info