Example #1
def is_a_pdf_page(response, page_publisher):
    if is_pdf_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a PDF {}".format(
                response.request.url)
            )
        return True

    # everything below here needs to look at the content
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(u"response is too big for more checks in is_a_pdf_page")
        return False

    content = response.content_big()

    # PDF files start with the "%PDF" signature
    if re.match(u"%PDF", content):
        return True

    if page_publisher:
        says_free_publisher_patterns = [
            ("Wiley-Blackwell", u'<span class="freeAccess" title="You have free access to this content">'),
            ("Wiley-Blackwell", u'<iframe id="pdfDocument"'),
            ("JSTOR", ur'<li class="download-pdf-button">.*Download PDF.*</li>'),
            ("Institute of Electrical and Electronics Engineers (IEEE)",
             ur'<frame src="http://ieeexplore.ieee.org/.*?pdf.*?</frameset>'),
            ("IOP Publishing", ur'Full Refereed Journal Article')
        ]
        for (publisher, pattern) in says_free_publisher_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
            if is_same_publisher(page_publisher, publisher) and matches:
                return True
    return False
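
The helpers used above (is_pdf_from_header, is_response_too_large, is_same_publisher, content_big, DEBUG_SCRAPING) live elsewhere in the module and are not shown. The content check itself relies only on the PDF magic number: every PDF begins with the ASCII signature %PDF. A minimal, self-contained sketch of that check; the file name and the extra whitespace/hyphen tolerance are illustrative additions, not part of the original code:

import re

def looks_like_pdf(raw_bytes):
    # PDF files begin with the "%PDF-" signature (e.g. "%PDF-1.7");
    # tolerate a little leading junk, which occasionally shows up in the wild.
    return re.match(br"\s*%PDF-", raw_bytes) is not None

with open("paper.pdf", "rb") as f:         # hypothetical local file
    print(looks_like_pdf(f.read(1024)))    # only the first bytes are needed
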
Example #2
def is_a_word_doc(response):
    if not (response.url.endswith('.doc') or response.url.endswith('.docx')):
        return False

    if is_a_word_doc_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a word doc {}".format(
                response.request.url))
        return True

    # everything below here needs to look at the content
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(
                u"response is too big for more checks in is_a_word_doc")
        return False

    content = response.content_big()

    # docx
    if content[-22:].startswith('PK'):
        return True

    return False
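
The trailing-bytes test above works because a .docx file is a ZIP archive: a ZIP without an archive comment ends with a 22-byte end-of-central-directory record whose signature also begins with "PK", which is what content[-22:].startswith('PK') relies on. A stand-alone sketch of the same idea, assuming the raw bytes are already in hand (the function name is illustrative):

def looks_like_docx(url, raw_bytes):
    # Only URLs ending in .doc/.docx are considered, mirroring the example above.
    if not (url.endswith('.doc') or url.endswith('.docx')):
        return False
    # ZIP archives start with the local-file-header signature PK\x03\x04 and,
    # when there is no trailing comment, end with a 22-byte record that also
    # starts with "PK". Legacy binary .doc files are OLE2 containers, not ZIP,
    # so they are not caught by this check, the same limitation as the original.
    return raw_bytes.startswith(b'PK\x03\x04') or raw_bytes[-22:].startswith(b'PK')
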
Example #3
    def scrape_for_fulltext_link(self):
        url = self.url

        dont_scrape_list = [
                u"ncbi.nlm.nih.gov",
                u"europepmc.org",
                u"/europepmc/",
                u"pubmed",
                u"elar.rsvpu.ru",  #these ones based on complaint in email
                u"elib.uraic.ru",
                u"elar.usfeu.ru",
                u"elar.urfu.ru",
                u"elar.uspu.ru"]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(u"not scraping {} because is on our do not scrape list.".format(url))
                return

        try:
            self.r = http_get(url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(u"is not a PDF for {}.  continuing more checks".format(url))

            # now before reading the content, bail if it is too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None
            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if url and u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
                if matches:
                    pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url), "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a PDF
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        # ChunkedEncodingError is a subclass of RequestException, so it must be caught first
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except Exception as e:
            self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(u"found no PDF download link.  end of the line. [{}]".format(url))

        return self
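
DuckLink is defined elsewhere in the project; in these snippets it is only constructed with a link target and an anchor label and read back through .href and .anchor. To run the surrounding logic in isolation, a minimal stand-in consistent with that usage (an assumption, not the project's actual class) could be:

from collections import namedtuple

# Only .href and .anchor are used by the examples on this page.
DuckLink = namedtuple('DuckLink', ['href', 'anchor'])

link = DuckLink(u"http://osf.io/tyhqm/download", "download")
assert link.href.endswith(u"/download") and link.anchor == "download"
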
Example #4
    def scrape_for_fulltext_link(self):
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"europepmc.org",
            u"/europepmc/",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(
                    u"not scraping {} because is on our do not scrape list.".
                    format(url))
                return

        try:
            self.r = http_get(url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"is not a PDF for {}.  continuing more checks".format(
                            url))

            # now before reading the content, bail if it is too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            if page and u"osf-cookie" in unicode(page, "utf-8"):
                pdf_download_link = DuckLink(u"{}/download".format(url),
                                             "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a PDF download link: {} {} [{}]".format(
                            pdf_download_link.href, pdf_download_link.anchor,
                            url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(
                        u"checking to see the PDF link actually gets a PDF [{}]"
                        .format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a PDF
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        # ChunkedEncodingError is a subclass of RequestException, so it must be caught first
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except Exception as e:
            self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
Example #5
    def scrape_for_fulltext_link(self, find_pdf_link=True):
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"europepmc.org",
            u"/europepmc/",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(
                    u"not scraping {} because is on our do not scrape list.".
                    format(url))
                return

        try:
            self.r = http_get(url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)
            resolved_url = self.r.url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"is not a PDF for {}.  continuing more checks".format(
                            url))

            if is_a_word_doc(self.r):
                if DEBUG_SCRAPING:
                    logger.info(
                        u"this is a word doc. success! [{}]".format(url))
                self.scraped_open_metadata_url = url
                return

            # now before reading the content, bail if it is too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                [script.extract() for script in soup('script')]
                page = str(soup)
            except HTMLParseError as e:
                logger.error(
                    u'error parsing html, skipped script removal: {}'.format(
                        e))

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None
            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if url and u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"',
                                     page, re.DOTALL)
                if matches:
                    pdf_download_link = DuckLink(unicode(matches[0], "utf-8"),
                                                 "download")

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            elif page and u"osf-cookie" in unicode(
                    page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url),
                                             "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a PDF download link: {} {} [{}]".format(
                            pdf_download_link.href, pdf_download_link.anchor,
                            url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(
                        u"checking to see the PDF link actually gets a PDF [{}]"
                        .format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a PDF
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)

            if doc_link is not None:
                absolute_doc_url = get_link_target(doc_link.href, resolved_url)
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a possible .doc download link [{}]".format(
                            absolute_doc_url))
                if self.gets_a_word_doc(doc_link, self.r.url):
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"we've decided this is a word doc. [{}]".format(
                                absolute_doc_url))
                    self.scraped_open_metadata_url = url
                    return
                else:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"we've decided this ain't a word doc. [{}]".
                            format(absolute_doc_url))

            bhl_link = find_bhl_view_link(resolved_url, page)
            if bhl_link is not None:
                logger.info('found a BHL document link: {}'.format(
                    get_link_target(bhl_link.href, resolved_url)))
                self.scraped_open_metadata_url = url
                return

            if _trust_repo_license(resolved_url) and self.scraped_license:
                logger.info(u'trusting license {}'.format(
                    self.scraped_license))
                self.scraped_open_metadata_url = self.url

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        # ChunkedEncodingError is a subclass of RequestException, so it must be caught first
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except Exception as e:
            self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
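
This variant also strips <script> tags before searching the page for a license and a download link, presumably so that markup embedded in scripts cannot produce false matches. The BeautifulSoup calls it uses are standard; a stand-alone illustration with invented HTML:

from bs4 import BeautifulSoup

html = u'<html><body><script>var x = 1;</script><p>CC BY 4.0</p></body></html>'
soup = BeautifulSoup(html, 'html.parser')
for script in soup('script'):    # soup('script') is shorthand for soup.find_all('script')
    script.extract()             # extract() detaches the tag from the tree
print(str(soup))                 # -> <html><body><p>CC BY 4.0</p></body></html>
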
Example #6
    def scrape_for_fulltext_link(self):
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(
                    u"not scraping {} because is on our do not scrape list.".
                    format(url))
                return

        try:
            with closing(
                    http_get(url,
                             stream=True,
                             related_pub=self.related_pub,
                             ask_slowly=self.ask_slowly)) as self.r:

                if self.r.status_code != 200:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                    return

                # if our url redirects to a pdf, we're done.
                # = open repo http://hdl.handle.net/2060/20140010374
                if self.is_a_pdf_page():
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"this is a PDF. success! [{}]".format(url))
                    self.scraped_pdf_url = url
                    return

                else:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"is not a PDF for {}.  continuing more checks".
                            format(url))

                # now before reading the content, bail if it is too large
                if is_response_too_large(self.r):
                    logger.info(u"landing page is too large, skipping")
                    return

                # get the HTML tree
                page = self.r.content

                # set the license if we can find one
                scraped_license = find_normalized_license(page)
                if scraped_license:
                    self.scraped_license = scraped_license

                # special exception for citeseer because we want the pdf link where
                # the copy is on the third party repo, not the cached link, if we can get it
                if u"citeseerx.ist.psu.edu/" in url:
                    matches = re.findall(
                        u'<h3>Download Links</h3>.*?href="(.*?)"', page,
                        re.DOTALL)
                    if matches:
                        self.scraped_pdf_url = unicode(matches[0], "utf-8")
                        self.scraped_open_metadata_url = url
                        return

                pdf_download_link = self.find_pdf_link(page)
                if pdf_download_link is not None:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"found a PDF download link: {} {} [{}]".format(
                                pdf_download_link.href,
                                pdf_download_link.anchor, url))

                    pdf_url = get_link_target(pdf_download_link.href,
                                              self.r.url)
                    # if they are linking to a PDF, we need to follow the link to make sure it's legit
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"checking to see the PDF link actually gets a PDF [{}]"
                            .format(url))
                    if self.gets_a_pdf(pdf_download_link, self.r.url):
                        self.scraped_pdf_url = pdf_url
                        self.scraped_open_metadata_url = url
                        return

                # try this later because we would rather get a PDF
                # if they are linking to a .docx or similar, this is open.
                doc_link = find_doc_download_link(page)
                if doc_link is not None:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"found a .doc download link {} [{}]".format(
                                get_link_target(doc_link.href, self.r.url),
                                url))
                    self.scraped_open_metadata_url = url
                    return

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        # ChunkedEncodingError is a subclass of RequestException, so it must be caught first
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
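
Here the streamed response is wrapped in contextlib.closing so the connection is released even when the function returns early. http_get is the project's own wrapper; assuming it behaves like requests.get (an assumption, since the wrapper is not shown), the same pattern with plain requests looks like:

from contextlib import closing

import requests

# Stream the response so the body is not downloaded up front; closing()
# guarantees the connection is released when the block exits.
with closing(requests.get(u"https://example.org/article", stream=True, timeout=10)) as r:
    content_type = r.headers.get("Content-Type", "")
    first_chunk = next(r.iter_content(1024), b"")
# the connection is released here even though only the first kilobyte was read
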
Example #7
    def scrape_for_fulltext_link(self):
        url = self.url
        is_journal = u"/doi/" in url or u"10." in url

        if DEBUG_SCRAPING:
            print u"in scrape_for_fulltext_link, getting URL: {}".format(url)

        if u"ncbi.nlm.nih.gov" in url:
            print u"not scraping {} because is on our do not scrape list.".format(
                url)
            if "ncbi.nlm.nih.gov/pmc/articles/PMC" in url:
                # pmc has fulltext
                self.scraped_open_metadata_url = url
                pmcid_matches = re.findall(".*(PMC\d+).*", url)
                if pmcid_matches:
                    pmcid = pmcid_matches[0]
                    self.scraped_pdf_url = u"https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf".format(
                        pmcid)
            else:
                # is an nlm page but not a pmc page, so is not full text
                return

        try:
            with closing(
                    http_get(url, stream=True, read_timeout=10,
                             doi=self.doi)) as r:

                if is_response_too_large(r):
                    print "landing page is too large, skipping"
                    return

                # if our url redirects to a pdf, we're done.
                # = open repo http://hdl.handle.net/2060/20140010374
                if resp_is_pdf_from_header(r):

                    if DEBUG_SCRAPING:
                        print u"the head says this is a PDF. success! [{}]".format(
                            url)
                    self.scraped_pdf_url = url
                    return

                else:
                    if DEBUG_SCRAPING:
                        print u"head says not a PDF for {}.  continuing more checks".format(
                            url)

                # get the HTML tree
                page = r.content

                # set the license if we can find one
                scraped_license = find_normalized_license(page)
                if scraped_license:
                    self.scraped_license = scraped_license

                pdf_download_link = find_pdf_link(page, url)
                if pdf_download_link is not None:
                    if DEBUG_SCRAPING:
                        print u"found a PDF download link: {} {} [{}]".format(
                            pdf_download_link.href, pdf_download_link.anchor,
                            url)

                    pdf_url = get_link_target(pdf_download_link, r.url)
                    if is_journal:
                        # if they are linking to a PDF, we need to follow the link to make sure it's legit
                        if DEBUG_SCRAPING:
                            print u"this is a journal. checking to see the PDF link actually gets a PDF [{}]".format(
                                url)
                        if gets_a_pdf(pdf_download_link, r.url, self.doi):
                            self.scraped_pdf_url = pdf_url
                            self.scraped_open_metadata_url = url
                            return
                    else:
                        self.scraped_pdf_url = pdf_url
                        self.scraped_open_metadata_url = url
                        return

                # try this later because we would rather get a PDF
                # if they are linking to a .docx or similar, this is open.
                # this only works for repos... a ".doc" in a journal is not the article. example:
                # = closed journal http://doi.org/10.1007/s10822-012-9571-0
                if not is_journal:
                    doc_link = find_doc_download_link(page)
                    if doc_link is not None:
                        if DEBUG_SCRAPING:
                            print u"found a .doc download link {} [{}]".format(
                                get_link_target(doc_link, r.url), url)
                        self.scraped_open_metadata_url = url
                        return

        except requests.exceptions.ConnectionError:
            print u"ERROR: connection error on {} in scrape_for_fulltext_link, skipping.".format(
                url)
            return
        except requests.Timeout:
            print u"ERROR: timeout error on {} in scrape_for_fulltext_link, skipping.".format(
                url)
            return
        except requests.exceptions.InvalidSchema:
            print u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link, skipping.".format(
                url)
            return
        except requests.exceptions.RequestException as e:
            print u"ERROR: RequestException error on {} in scrape_for_fulltext_link, skipping.".format(
                url)
            return

        if DEBUG_SCRAPING:
            print u"found no PDF download link.  end of the line. [{}]".format(
                url)

        return self
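
When the URL is a PubMed Central article page, the version above derives the PDF location directly from the PMC identifier in the URL instead of scraping the page for a link. Concretely (the PMC number below is invented for illustration):

import re

url = u"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1234567/"
pmcid_matches = re.findall(r".*(PMC\d+).*", url)
if pmcid_matches:
    pdf_url = u"https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf".format(pmcid_matches[0])
    # pdf_url -> https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1234567/pdf
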
Example #8
def gets_a_pdf(link, base_url, doi=None):

    if is_purchase_link(link):
        return False

    absolute_url = get_link_target(link, base_url)
    if DEBUG_SCRAPING:
        print u"checking to see if {} is a pdf".format(absolute_url)

    start = time()
    try:
        with closing(
                http_get(absolute_url, stream=True, read_timeout=10,
                         doi=doi)) as r:
            if resp_is_pdf_from_header(r):
                if DEBUG_SCRAPING:
                    print u"http header says this is a PDF. took {}s {}".format(
                        elapsed(start), absolute_url)
                return True

            # everything below here needs to look at the content
            # so bail here if the page is too big
            if is_response_too_large(r):
                if DEBUG_SCRAPING:
                    print u"response is too big for more checks in gets_a_pdf"
                return False

            # some publishers send a pdf back wrapped in an HTML page using frames.
            # this is where we detect that, using each publisher's idiosyncratic templates.
            # we only check based on a whitelist of publishers, because downloading this whole
            # page (r.content) is expensive to do for everyone.
            if 'onlinelibrary.wiley.com' in absolute_url:
                # = closed journal http://doi.org/10.1111/ele.12585
                # = open journal http://doi.org/10.1111/ele.12587 cc-by
                if '<iframe' in r.content:
                    if DEBUG_SCRAPING:
                        print u"this is a Wiley 'enhanced PDF' page. took {}s [{}]".format(
                            elapsed(start), absolute_url)
                    return True

            elif 'ieeexplore' in absolute_url:
                # (this is a good example of one dissem.in misses)
                # = open journal http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6740844
                # = closed journal http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6045214
                if '<frame' in r.content:
                    if DEBUG_SCRAPING:
                        print u"this is a IEEE 'enhanced PDF' page. took {}s [{}]".format(
                            elapsed(start), absolute_url)
                    return True

            elif 'sciencedirect' in absolute_url:
                if u"does not support the use of the crawler software" in r.content:
                    return True

        if DEBUG_SCRAPING:
            print u"we've decided this ain't a PDF. took {}s [{}]".format(
                elapsed(start), absolute_url)
        return False
    except requests.exceptions.ConnectionError:
        print u"ERROR: connection error in gets_a_pdf, skipping."
        return False
    except requests.Timeout:
        print u"ERROR: timeout error in gets_a_pdf, skipping."
        return False
    except requests.exceptions.InvalidSchema:
        print u"ERROR: InvalidSchema error in gets_a_pdf, skipping."
        return False
    except requests.exceptions.RequestException:
        print u"ERROR: RequestException error in gets_a_pdf, skipping."
        return False
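
get_link_target, used throughout these examples to turn an href into an absolute URL, is not shown either. Resolving a possibly relative link against the page URL is typically a one-liner with urljoin; a sketch of a plausible minimal version (the href below is invented):

try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2, matching the examples' vintage

base_url = u"http://ieeexplore.ieee.org/xpl/articleDetails.jsp"
href = u"/ielx7/paper.pdf"
print(urljoin(base_url, href))
# -> http://ieeexplore.ieee.org/ielx7/paper.pdf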