예제 #1
0
def keep_redirecting(r, publisher):
    """Find a redirect that is expressed in the page body rather than in HTTP.

    Three sources are checked in order and the first hit wins:

    1. a tiny payload (declared Content-Length under 500) containing a
       ``<script>location.href='...'</script>`` redirect;
    2. an ``OvidAN = '...'`` value, only when ``publisher`` matches
       "Ovid Technologies (Wolters Kluwer Health)";
    3. a ``<meta ... url="...">`` tag.

    :param r: response object; must provide ``headers``, ``url``,
        ``request.url`` and ``content_small()``.
    :param publisher: publisher name; a falsy value skips the Ovid check.
    :return: the URL to request next, or ``None`` if no redirect was found.
    """
    # don't read r.content unless we have to, because it will cause us to
    # download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if "content-length" in r.headers:
        # manually follow javascript if that's all that's in the payload
        try:
            file_size = int(r.headers["content-length"])
        except (TypeError, ValueError):
            # a malformed header tells us nothing about the size, so skip
            # this check instead of aborting the other redirect checks
            file_size = None

        if file_size is not None and file_size < 500:
            # the pattern has no backslashes, so a plain u"" literal is the
            # same string as the Python-2-only ur"" spelling
            matches = re.findall(u"<script>location.href='(.*)'</script>", r.content_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith(u"/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(u"OvidAN = '(.*?)';", r.content_small(), re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(an_number)
            return redirect_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*?url=["\'](.*?)["\']', re.IGNORECASE)
    redirect_match = redirect_re.findall(r.content_small())
    if redirect_match:
        # the target may be HTML-escaped and relative to the requested URL
        redirect_path = HTMLParser().unescape(redirect_match[0].strip())
        redirect_url = urlparse.urljoin(r.request.url, redirect_path)
        logger.info(u"redirect_match! redirecting to {}".format(redirect_url))
        return redirect_url

    return None
예제 #2
0
    def gets_a_word_doc(self, link, base_url):
        """Return True when following ``link`` yields a Word document.

        Purchase links are rejected without a request. Otherwise the link
        target is fetched (streamed) and the response is handed to
        ``is_a_word_doc``. Any exception is logged and treated as "no".

        :param link: link object; ``link.href`` is resolved against
            ``base_url``.
        :param base_url: URL of the page the link was found on.
        :return: ``True`` for a 200 response that ``is_a_word_doc`` accepts,
            otherwise ``False``.
        """
        if is_purchase_link(link):
            return False

        absolute_url = get_link_target(link.href, base_url)
        if DEBUG_SCRAPING:
            logger.info(
                u"checking to see if {} is a word doc".format(absolute_url))

        try:
            r = http_get(absolute_url,
                         stream=True,
                         publisher=self.publisher,
                         session_id=self.session_id,
                         ask_slowly=self.ask_slowly)

            if r.status_code != 200:
                return False

            if is_a_word_doc(r):
                return True

        except Exception as e:
            # best effort: a failed fetch just means "not a word doc"
            logger.exception(u'error in gets_a_word_doc: {}'.format(e))

        return False
예제 #3
0
def keep_redirecting(r, publisher):
    """Find a redirect that is expressed in the page body rather than in HTTP.

    The checks run in order and the first hit wins:

    1. a tiny payload (declared Content-Length under 500) containing a
       ``<script>location.href='...'</script>`` redirect;
    2. an ``OvidAN = '...'`` value, only when ``publisher`` matches
       "Ovid Technologies (Wolters Kluwer Health)";
    3. a ``var journalURL = "..."`` value on an ovid.com host;
    4. a ``<meta http-equiv="refresh">`` element, unless its target ends
       in ``Error/JavaScript.html`` or ``/?reason=expired``.

    :param r: response object; must provide ``headers``, ``url``,
        ``request.url`` and ``text_small()``.
    :param publisher: publisher name; a falsy value skips the Ovid
        publisher check.
    :return: the URL to request next, or ``None`` if no redirect was found.
    """
    # don't read r.content unless we have to, because it will cause us to
    # download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if "content-length" in r.headers:
        # manually follow javascript if that's all that's in the payload
        try:
            file_size = int(r.headers["content-length"])
        except (TypeError, ValueError):
            # a malformed header tells us nothing about the size, so skip
            # this check instead of aborting the other redirect checks
            file_size = None

        if file_size is not None and file_size < 500:
            matches = re.findall(r"<script>location.href='(.*)'</script>",
                                 r.text_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith("/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(
            publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(r"OvidAN = '(.*?)';", r.text_small(),
                             re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(
                an_number)
            return redirect_url

    # 10.1097/01.xps.0000491010.82675.1c
    hostname = urlparse(r.url).hostname
    # require a domain boundary so e.g. "notovid.com" is not treated as Ovid
    if hostname and (hostname == 'ovid.com'
                     or hostname.endswith('.ovid.com')):
        matches = re.findall(r'var journalURL = "(.*?)";', r.text_small(),
                             re.IGNORECASE)
        if matches:
            journal_url = matches[0]
            logger.info(
                'ovid journal match. redirecting to {}'.format(journal_url))
            return journal_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*http-equiv="?refresh"?[^>]*>',
                             re.IGNORECASE | re.DOTALL)
    redirect_match = redirect_re.findall(r.text_small())
    if redirect_match:
        redirect = redirect_match[0]
        logger.info('found a meta refresh element: {}'.format(redirect))
        url_re = re.compile('url=["\']?([^">\']*)', re.IGNORECASE | re.DOTALL)
        url_match = url_re.findall(redirect)

        if url_match:
            # the target may be HTML-escaped and relative to the requested URL
            redirect_path = html.unescape(url_match[0].strip())
            redirect_url = urljoin(r.request.url, redirect_path)
            # these targets are not followed
            ignored_suffixes = ('Error/JavaScript.html', '/?reason=expired')
            if not redirect_url.endswith(ignored_suffixes):
                logger.info(
                    "redirect_match! redirecting to {}".format(redirect_url))
                return redirect_url

    return None
예제 #4
0
    def gets_a_pdf(self, link, base_url):
        """Return True when following ``link`` yields a PDF.

        Purchase links are rejected without a request. Otherwise the link
        target is fetched (streamed) into ``self.r`` and
        ``self.is_a_pdf_page()`` decides. Failures are appended to
        ``self.error`` and logged; they never propagate.

        :param link: link object; ``link.href`` is resolved against
            ``base_url``.
        :param base_url: URL of the page the link was found on.
        :return: ``True`` for a 200 response that ``is_a_pdf_page`` accepts,
            otherwise ``False``.
        """
        if is_purchase_link(link):
            return False

        absolute_url = get_link_target(link.href, base_url)
        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} is a pdf".format(absolute_url))

        start = time()
        try:
            self.r = http_get(absolute_url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in gets_a_pdf".format(self.r.status_code, absolute_url)
                return False

            if self.is_a_pdf_page():
                return True

        # Specific requests exceptions must come before RequestException,
        # their common base class, or their handlers are never reached.
        # Messages are formatted as unicode: putting UTF-8 bytes into a
        # unicode template forces an implicit ASCII decode that can fail.
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except requests.exceptions.RequestException:
            self.error += u"ERROR: RequestException error in gets_a_pdf"
            logger.info(self.error)
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except Exception:
            self.error += u"ERROR: Exception error in gets_a_pdf"
            logger.exception(self.error)

        if DEBUG_SCRAPING:
            logger.info(u"we've decided this ain't a PDF. took {} seconds [{}]".format(
                elapsed(start), absolute_url))
        return False
예제 #5
0
    def gets_a_pdf(self, link, base_url):
        """Return True when following ``link`` yields a PDF.

        Purchase links are rejected without a request. Otherwise the link
        target is fetched (streamed) into ``self.r`` and
        ``self.is_a_pdf_page()`` decides. Failures are appended to
        ``self.error`` and logged; they never propagate.

        :param link: link object; ``link.href`` is resolved against
            ``base_url``.
        :param base_url: URL of the page the link was found on.
        :return: ``True`` for a 200 response that ``is_a_pdf_page`` accepts,
            otherwise ``False``.
        """
        if is_purchase_link(link):
            return False

        absolute_url = get_link_target(link.href, base_url)
        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} is a pdf".format(absolute_url))

        start = time()
        try:
            self.r = http_get(absolute_url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in gets_a_pdf".format(self.r.status_code, absolute_url)
                return False

            if self.is_a_pdf_page():
                return True

        # Specific requests exceptions must come before RequestException,
        # their common base class, or their handlers are never reached.
        # Messages are formatted as unicode: putting UTF-8 bytes into a
        # unicode template forces an implicit ASCII decode that can fail.
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except requests.exceptions.RequestException:
            self.error += u"ERROR: RequestException error in gets_a_pdf"
            logger.info(self.error)
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message))
            logger.info(self.error)
        except Exception:
            self.error += u"ERROR: Exception error in gets_a_pdf"
            logger.exception(self.error)

        if DEBUG_SCRAPING:
            logger.info(u"we've decided this ain't a PDF. took {} seconds [{}]".format(
                elapsed(start), absolute_url))
        return False
예제 #6
0
def keep_redirecting(r, publisher):
    """Find a redirect that is expressed in the page body rather than in HTTP.

    Three sources are checked in order and the first hit wins:

    1. a tiny payload (declared Content-Length under 500) containing a
       ``<script>location.href='...'</script>`` redirect;
    2. an ``OvidAN = '...'`` value, only when ``publisher`` matches
       "Ovid Technologies (Wolters Kluwer Health)";
    3. a ``<meta http-equiv="refresh">`` element with a quoted ``url=``.

    :param r: response object; must provide ``headers``, ``url``,
        ``request.url`` and ``content_small()``.
    :param publisher: publisher name; a falsy value skips the Ovid check.
    :return: the URL to request next, or ``None`` if no redirect was found.
    """
    # don't read r.content unless we have to, because it will cause us to
    # download the whole thing instead of just the headers

    # 10.5762/kais.2016.17.5.316
    if "content-length" in r.headers:
        # manually follow javascript if that's all that's in the payload
        try:
            file_size = int(r.headers["content-length"])
        except (TypeError, ValueError):
            # a malformed header tells us nothing about the size, so skip
            # this check instead of aborting the other redirect checks
            file_size = None

        if file_size is not None and file_size < 500:
            # the pattern has no backslashes, so a plain u"" literal is the
            # same string as the Python-2-only ur"" spelling
            matches = re.findall(u"<script>location.href='(.*)'</script>",
                                 r.content_small(), re.IGNORECASE)
            if matches:
                redirect_url = matches[0]
                if redirect_url.startswith(u"/"):
                    redirect_url = get_link_target(redirect_url, r.url)
                return redirect_url

    # 10.1097/00003643-201406001-00238
    if publisher and is_same_publisher(
            publisher, "Ovid Technologies (Wolters Kluwer Health)"):
        matches = re.findall(u"OvidAN = '(.*?)';", r.content_small(),
                             re.IGNORECASE)
        if matches:
            an_number = matches[0]
            redirect_url = "http://content.wkhealth.com/linkback/openurl?an={}".format(
                an_number)
            return redirect_url

    # handle meta redirects
    redirect_re = re.compile('<meta[^>]*http-equiv="refresh"[^>]*>',
                             re.IGNORECASE | re.DOTALL)
    redirect_match = redirect_re.findall(r.content_small())
    if redirect_match:
        redirect = redirect_match[0]
        logger.info('found a meta refresh element: {}'.format(redirect))
        url_re = re.compile('url=["\'](.*?)["\']', re.IGNORECASE | re.DOTALL)
        url_match = url_re.findall(redirect)
        if url_match:
            # the target may be HTML-escaped and relative to the requested URL
            redirect_path = HTMLParser().unescape(url_match[0].strip())
            redirect_url = urlparse.urljoin(r.request.url, redirect_path)
            logger.info(
                u"redirect_match! redirecting to {}".format(redirect_url))
            return redirect_url

    return None
예제 #7
0
    def scrape_for_fulltext_link(self):
        """Fetch ``self.url`` and record any open full text found there.

        Steps, stopping at the first that succeeds:

        * skip URLs containing an entry of the do-not-scrape list;
        * if the response itself is a PDF, store it in
          ``self.scraped_pdf_url``;
        * otherwise read the page, record a license if one is found, and
          look for a PDF download link (with special cases for CiteSeerX
          and OSF pages); a link only counts if ``gets_a_pdf`` confirms it;
        * finally, a link to a .doc-style download marks the page itself
          as open metadata.

        Results are written to ``self.scraped_pdf_url``,
        ``self.scraped_open_metadata_url`` and ``self.scraped_license``.
        Errors are appended to ``self.error`` and logged, never raised.

        :return: ``None`` on every early exit (skip, hit, error); ``self``
            when the page was scanned and nothing was found.
        """
        url = self.url

        dont_scrape_list = [
                u"ncbi.nlm.nih.gov",
                u"europepmc.org",
                u"/europepmc/",
                u"pubmed",
                u"elar.rsvpu.ru",  #these ones based on complaint in email
                u"elib.uraic.ru",
                u"elar.usfeu.ru",
                u"elar.urfu.ru",
                u"elar.uspu.ru"]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(u"not scraping {} because is on our do not scrape list.".format(url))
                return

        try:
            self.r = http_get(url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(u"is not a PDF for {}.  continuing more checks".format(url))

            # now before reading the content, bail if too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None
            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if url and u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
                if matches:
                    pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url), "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a pdf
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

        # Specific requests exceptions must come before RequestException,
        # their common base class, or their handlers are never reached.
        # Messages are formatted as unicode: putting UTF-8 bytes into a
        # unicode template forces an implicit ASCII decode that can fail.
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message))
            logger.info(self.error)
            return
        except Exception:
            self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(u"found no PDF download link.  end of the line. [{}]".format(url))

        return self
예제 #8
0
    def scrape_for_fulltext_link(self):
        """Fetch ``self.url`` and decide whether the page says it is open.

        After fetching the landing page this method:

        * stops early for non-200 responses and deleted-DOI redirects;
        * treats a response that is itself a PDF as a free PDF;
        * otherwise strips ``<script>`` tags from the page, records a
          license if one is found, and verifies any PDF link with
          ``gets_a_pdf``;
        * scans the page for site-specific, publisher-specific and license
          wording patterns that signal open access.

        Findings are written to ``self.scraped_pdf_url``,
        ``self.scraped_open_metadata_url``, ``self.scraped_license`` and
        ``self.open_version_source_string``. Errors are appended to
        ``self.error`` and logged, never raised.

        :return: ``None`` for the early exits (non-200, deleted DOI, direct
            PDF); ``True``/``False`` according to ``self.is_open`` after a
            full scan; ``False`` when an exception was handled.
        """
        landing_url = self.url

        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} says it is open".format(landing_url))

        start = time()
        try:
            self.r = http_get(landing_url, stream=True, publisher=self.publisher, session_id=self.session_id, ask_slowly=self.ask_slowly)
            resolved_landing_url = self.r.url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
                return

            # example 10.1007/978-3-642-01445-1
            if u"crossref.org/_deleted-doi/" in self.r.url:
                logger.info(u"this is a deleted doi")
                return

            # if our landing_url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(landing_url))
                self.scraped_pdf_url = landing_url
                self.open_version_source_string = "open (via free pdf)"
                # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(u"landing page is not a PDF for {}.  continuing more checks".format(landing_url))

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                for script in soup('script'):
                    script.extract()
                page = str(soup)
            except HTMLParseError as e:
                logger.error(u'error parsing html, skipped script removal: {}'.format(e))

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via free pdf)"

            # now look and see if it is not just free, but open!
            says_open_url_snippet_patterns = [
                ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
                ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
                ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
            ]

            for (url_snippet, pattern) in says_open_url_snippet_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if url_snippet in resolved_landing_url.lower() and matches:
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            # (publisher, page pattern) pairs; a match only counts when the
            # record's publisher matches too
            says_open_access_patterns = [
                ("Informa UK Limited", u"/accessOA.png"),
                ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
                ("Institute of Electrical and Electronics Engineers (IEEE)", u'"isOpenAccess":true'),
                ("Institute of Electrical and Electronics Engineers (IEEE)", u'"openAccessFlag":"yes"'),
                ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
                ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
            ]
            for (publisher, pattern) in says_open_access_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
                if self.is_same_publisher(publisher) and matches:
                    self.scraped_license = "implied-oa"
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"

            license_patterns = [
                u"(creativecommons.org/licenses/[a-z\\-]+)",
                u"distributed under the terms (.*) which permits",
                u"This is an open access article under the terms (.*) which permits",
                u"This is an open access article published under (.*) which permits",
                u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
            ]

            # later patterns overwrite earlier ones when several match
            for pattern in license_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if matches:
                    self.scraped_license = find_normalized_license(matches[0])
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via page says license)"

            if self.is_open:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                        elapsed(start), landing_url))
                return True
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                        elapsed(start), landing_url))
                return False
        # Specific requests exceptions must come before RequestException,
        # their common base class, or their handlers are never reached.
        # Messages are formatted as unicode: putting UTF-8 bytes into a
        # unicode template forces an implicit ASCII decode that can fail.
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message))
            logger.info(self.error)
            return False
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message))
            logger.info(self.error)
            return False
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message))
            logger.info(self.error)
            return False
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message))
            logger.info(self.error)
            return False
        except requests.exceptions.RequestException:
            self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
            logger.info(self.error)
            return False
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message))
            logger.info(self.error)
            return False
        except Exception:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return False
예제 #9
0
    def scrape_for_fulltext_link(self):
        """Fetch ``self.url`` and record any open full text found there.

        Steps, stopping at the first that succeeds:

        * skip URLs containing an entry of the do-not-scrape list;
        * if the response itself is a PDF, store it in
          ``self.scraped_pdf_url``;
        * otherwise read the page, record a license if one is found, and
          look for a PDF download link (with a special case for OSF
          pages); a link only counts if ``gets_a_pdf`` confirms it;
        * finally, a link to a .doc-style download marks the page itself
          as open metadata.

        Results are written to ``self.scraped_pdf_url``,
        ``self.scraped_open_metadata_url`` and ``self.scraped_license``.
        Errors are appended to ``self.error`` and logged, never raised.

        :return: ``None`` on every early exit (skip, hit, error); ``self``
            when the page was scanned and nothing was found.
        """
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"europepmc.org",
            u"/europepmc/",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(
                    u"not scraping {} because is on our do not scrape list.".
                    format(url))
                return

        try:
            self.r = http_get(url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"is not a PDF for {}.  continuing more checks".format(
                            url))

            # now before reading the content, bail if too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            # errors='replace' keeps a page that is not valid UTF-8 from
            # aborting the whole scrape just for this substring test
            if page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url),
                                             "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a PDF download link: {} {} [{}]".format(
                            pdf_download_link.href, pdf_download_link.anchor,
                            url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(
                        u"checking to see the PDF link actually gets a PDF [{}]"
                        .format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a pdf
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

        # Specific requests exceptions must come before RequestException,
        # their common base class, or their handlers are never reached.
        # Messages are formatted as unicode: putting UTF-8 bytes into a
        # unicode template forces an implicit ASCII decode that can fail.
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except Exception:
            self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
예제 #10
0
    def scrape_for_fulltext_link(self):
        landing_url = self.url

        if DEBUG_SCRAPING:
            logger.info(
                u"checking to see if {} says it is open".format(landing_url))

        start = time()
        try:
            self.r = http_get(landing_url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                        self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
                return

            # example 10.1007/978-3-642-01445-1
            if u"crossref.org/_deleted-doi/" in self.r.url:
                logger.info(u"this is a deleted doi")
                return

            # if our landing_url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(
                        u"this is a PDF. success! [{}]".format(landing_url))
                self.scraped_pdf_url = landing_url
                self.open_version_source_string = "open (via free pdf)"
                # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"landing page is not a PDF for {}.  continuing more checks"
                        .format(landing_url))

            # get the HTML tree
            page = self.r.content_small()

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via free pdf)"

            # now look and see if it is not just free, but open!
            license_patterns = [
                u"(creativecommons.org\/licenses\/[a-z\-]+)",
                u"distributed under the terms (.*) which permits",
                u"This is an open access article under the terms (.*) which permits",
                u"This is an open access article published under (.*) which permits",
                u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
            ]
            for pattern in license_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if matches:
                    self.scraped_license = find_normalized_license(matches[0])
                    self.scraped_open_metadata_url = self.url
                    self.open_version_source_string = "open (via page says license)"

            says_open_url_snippet_patterns = [
                ("projecteuclid.org/",
                 u'<strong>Full-text: Open access</strong>'),
            ]
            for (url_snippet, pattern) in says_open_url_snippet_patterns:
                matches = re.findall(pattern, self.r.content_small(),
                                     re.IGNORECASE)
                if url_snippet in self.r.request.url.lower() and matches:
                    self.scraped_open_metadata_url = self.r.request.url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            says_open_access_patterns = [
                ("Informa UK Limited", u"/accessOA.png"),
                ("Oxford University Press (OUP)",
                 u"<i class='icon-availability_open'"),
                ("Institute of Electrical and Electronics Engineers (IEEE)",
                 ur'"isOpenAccess":true'),
                ("Institute of Electrical and Electronics Engineers (IEEE)",
                 ur'"openAccessFlag":"yes"'),
                ("Informa UK Limited", u"/accessOA.png"),
                ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
                ("Cambridge University Press (CUP)",
                 u'<span class="icon access open-access cursorDefault">'),
            ]
            for (publisher, pattern) in says_open_access_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
                if self.is_same_publisher(publisher) and matches:
                    self.scraped_license = "implied-oa"
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"

            if self.is_open:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"we've decided this is open! took {} seconds [{}]".
                        format(elapsed(start), landing_url))
                return True
            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"we've decided this doesn't say open. took {} seconds [{}]"
                        .format(elapsed(start), landing_url))
                return False
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
            logger.info(self.error)
            return False
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return False
예제 #11
0
    def scrape_for_fulltext_link(self, find_pdf_link=True):
        """Fetch ``self.url`` and look for a PDF, Word document or BHL link.

        The response is stored on ``self.r``.  On success this sets
        ``self.scraped_pdf_url`` and/or ``self.scraped_open_metadata_url``,
        and ``self.scraped_license`` when a license is found on the page.
        Request failures are appended to ``self.error`` and logged instead
        of being raised.

        Args:
            find_pdf_link: accepted for signature compatibility; this
                implementation never consults it and always searches for a
                PDF link.

        Returns:
            ``None`` when the method stops early (do-not-scrape URL, non-200
            status, a document was found, the page is too large, or an
            exception was caught); ``self`` when the checks run to the end
            without an early return.
        """
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"europepmc.org",
            u"/europepmc/",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        if any(url_fragment in url for url_fragment in dont_scrape_list):
            logger.info(
                u"not scraping {} because is on our do not scrape list.".
                format(url))
            return

        try:
            self.r = http_get(url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)
            resolved_url = self.r.url

            if self.r.status_code != 200:
                # 401 means not authorized, so simply "not open": no error recorded
                if self.r.status_code != 401:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return

            if DEBUG_SCRAPING:
                logger.info(
                    u"is not a PDF for {}.  continuing more checks".format(
                        url))

            if is_a_word_doc(self.r):
                if DEBUG_SCRAPING:
                    logger.info(
                        u"this is a word doc. success! [{}]".format(url))
                self.scraped_open_metadata_url = url
                return

            # now, before reading the content, bail if it is too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                for script in soup('script'):
                    script.extract()
                page = str(soup)
            except HTMLParseError as e:
                # keep the unmodified page and carry on
                logger.error(
                    u'error parsing html, skipped script removal: {}'.format(
                        e))

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None
            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if url and u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"',
                                     page, re.DOTALL)
                if matches:
                    pdf_download_link = DuckLink(unicode(matches[0], "utf-8"),
                                                 "download")

            # osf doesn't have their download link in their pages
            # so look at the page contents to see if it is osf-hosted
            # if so, compute the url.  example:  http://osf.io/tyhqm
            elif page and u"osf-cookie" in unicode(
                    page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url),
                                             "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a PDF download link: {} {} [{}]".format(
                            pdf_download_link.href, pdf_download_link.anchor,
                            url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(
                        u"checking to see the PDF link actually gets a PDF [{}]"
                        .format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we would rather get a pdf
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)

            if doc_link is not None:
                absolute_doc_url = get_link_target(doc_link.href, resolved_url)
                if DEBUG_SCRAPING:
                    logger.info(
                        u"found a possible .doc download link [{}]".format(
                            absolute_doc_url))
                if self.gets_a_word_doc(doc_link, self.r.url):
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"we've decided this is a word doc. [{}]".format(
                                absolute_doc_url))
                    self.scraped_open_metadata_url = url
                    return
                else:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"we've decided this ain't a word doc. [{}]".
                            format(absolute_doc_url))

            bhl_link = find_bhl_view_link(resolved_url, page)
            if bhl_link is not None:
                logger.info('found a BHL document link: {}'.format(
                    get_link_target(bhl_link.href, resolved_url)))
                self.scraped_open_metadata_url = url
                return

            # no document link found: fall back on the scraped license when
            # _trust_repo_license() accepts the resolved url
            if _trust_repo_license(resolved_url) and self.scraped_license:
                logger.info(u'trusting license {}'.format(
                    self.scraped_license))
                self.scraped_open_metadata_url = self.url

        # The specific requests exceptions must come before RequestException,
        # their common base class, or their handlers can never run.
        # The message is passed as unicode: formatting an encoded byte string
        # into a unicode template would force an implicit ASCII decode.
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except Exception:
            # last-resort boundary: log with traceback and give up on this url
            self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self
예제 #12
0
    def scrape_for_fulltext_link(self, find_pdf_link=True):
        landing_url = self.url

        if DEBUG_SCRAPING:
            logger.info(
                u"checking to see if {} says it is open".format(landing_url))

        start = time()
        try:
            self.r = http_get(landing_url,
                              stream=True,
                              publisher=self.publisher,
                              session_id=self.session_id,
                              ask_slowly=self.ask_slowly)
            resolved_landing_url = self.r.url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                        self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
                return

            # example 10.1007/978-3-642-01445-1
            if u"crossref.org/_deleted-doi/" in resolved_landing_url:
                logger.info(u"this is a deleted doi")
                return

            # if our landing_url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(
                        u"this is a PDF. success! [{}]".format(landing_url))
                self.scraped_pdf_url = landing_url
                self.open_version_source_string = "open (via free pdf)"
                # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
                return

            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"landing page is not a PDF for {}.  continuing more checks"
                        .format(landing_url))

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                [script.extract() for script in soup('script')]
                page = str(soup)
            except HTMLParseError as e:
                logger.error(
                    u'error parsing html, skipped script removal: {}'.format(
                        e))

            # Look for a pdf link. If we find one, look for a license.

            pdf_download_link = self.find_pdf_link(
                page) if find_pdf_link else None

            if pdf_download_link is not None:
                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via free pdf)"

                    # set the license if we can find one
                    scraped_license = find_normalized_license(page)
                    if scraped_license:
                        self.scraped_license = scraped_license

            # Look for patterns that indicate availability but not necessarily openness and make this a bronze location.

            bronze_url_snippet_patterns = [
                ('sciencedirect.com/',
                 u'<div class="OpenAccessLabel">open archive</div>'),
            ]

            for (url_snippet, pattern) in bronze_url_snippet_patterns:
                if url_snippet in resolved_landing_url.lower() and re.findall(
                        pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via free article)"

            bronze_publisher_patterns = [
                ("New England Journal of Medicine (NEJM/MMS)",
                 u'<meta content="yes" name="evt-free"'),
                ("Massachusetts Medical Society",
                 u'<meta content="yes" name="evt-free"'),
            ]

            for (publisher, pattern) in bronze_publisher_patterns:
                if self.is_same_publisher(publisher) and re.findall(
                        pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via free article)"

            # Look for some license-like patterns that make this a hybrid location.

            hybrid_url_snippet_patterns = [
                ('projecteuclid.org/',
                 u'<strong>Full-text: Open access</strong>'),
                ('sciencedirect.com/',
                 u'<div class="OpenAccessLabel">open access</div>'),
                ('journals.ametsoc.org/',
                 ur'src="/templates/jsp/_style2/_ams/images/access_free\.gif"'
                 ),
                ('apsjournals.apsnet.org',
                 ur'src="/products/aps/releasedAssets/images/open-access-icon\.png"'
                 ),
                ('psychiatriapolska.pl', u'is an Open Access journal:'),
                ('journals.lww.com', u'<span class="[^>]*ejp-indicator--free'),
            ]

            for (url_snippet, pattern) in hybrid_url_snippet_patterns:
                if url_snippet in resolved_landing_url.lower() and re.findall(
                        pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            hybrid_publisher_patterns = [
                ("Informa UK Limited", u"/accessOA.png"),
                ("Oxford University Press (OUP)",
                 u"<i class='icon-availability_open'"),
                ("Institute of Electrical and Electronics Engineers (IEEE)",
                 ur'"isOpenAccess":true'),
                ("Institute of Electrical and Electronics Engineers (IEEE)",
                 ur'"openAccessFlag":"yes"'),
                ("Informa UK Limited", u"/accessOA.png"),
                ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
                ("Cambridge University Press (CUP)",
                 u'<span class="icon access open-access cursorDefault">'),
            ]

            for (publisher, pattern) in hybrid_publisher_patterns:
                if self.is_same_publisher(publisher) and re.findall(
                        pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = landing_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            # Look for more license-like patterns that make this a hybrid location.
            # Extract the specific license if present.

            license_patterns = [
                ur"(creativecommons.org/licenses/[a-z\-]+)",
                u"distributed under the terms (.*) which permits",
                u"This is an open access article under the terms (.*) which permits",
                u"This is an open access article published under (.*) which permits",
                u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
            ]

            for pattern in license_patterns:
                matches = re.findall(pattern, page, re.IGNORECASE)
                if matches:
                    self.scraped_open_metadata_url = landing_url
                    normalized_license = find_normalized_license(matches[0])
                    self.scraped_license = normalized_license or 'implied-oa'
                    if normalized_license:
                        self.open_version_source_string = 'open (via page says license)'
                    else:
                        self.open_version_source_string = 'open (via page says Open Access)'

            if self.is_open:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"we've decided this is open! took {} seconds [{}]".
                        format(elapsed(start), landing_url))
                return True
            else:
                if DEBUG_SCRAPING:
                    logger.info(
                        u"we've decided this doesn't say open. took {} seconds [{}]"
                        .format(elapsed(start), landing_url))
                return False
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
            logger.info(self.error)
            return False
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
                landing_url,
                unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return False
예제 #13
0
    def scrape_for_fulltext_link(self):
        """Fetch ``self.url`` and look for a PDF or Word document link.

        The response is bound to ``self.r`` and closed when the ``with``
        block exits.  On success this sets ``self.scraped_pdf_url`` and/or
        ``self.scraped_open_metadata_url``, and ``self.scraped_license``
        when a license is found on the page.  Request failures are appended
        to ``self.error`` and logged instead of being raised; any other
        exception type propagates to the caller.

        Returns:
            ``None`` when the method stops early (do-not-scrape URL, non-200
            status, a document was found, the page is too large, or a
            handled exception was caught); ``self`` when the checks run to
            the end without an early return.
        """
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"pubmed",
            u"elar.rsvpu.ru",  #these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru"
        ]
        if any(url_fragment in url for url_fragment in dont_scrape_list):
            logger.info(
                u"not scraping {} because is on our do not scrape list.".
                format(url))
            return

        try:
            with closing(
                    http_get(url,
                             stream=True,
                             related_pub=self.related_pub,
                             ask_slowly=self.ask_slowly)) as self.r:

                if self.r.status_code != 200:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                        self.r.status_code, url)
                    return

                # if our url redirects to a pdf, we're done.
                # = open repo http://hdl.handle.net/2060/20140010374
                if self.is_a_pdf_page():
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"this is a PDF. success! [{}]".format(url))
                    self.scraped_pdf_url = url
                    return

                if DEBUG_SCRAPING:
                    logger.info(
                        u"is not a PDF for {}.  continuing more checks".
                        format(url))

                # now, before reading the content, bail if it is too large
                if is_response_too_large(self.r):
                    logger.info(u"landing page is too large, skipping")
                    return

                # get the HTML tree
                page = self.r.content

                # set the license if we can find one
                scraped_license = find_normalized_license(page)
                if scraped_license:
                    self.scraped_license = scraped_license

                # special exception for citeseer because we want the pdf link where
                # the copy is on the third party repo, not the cached link, if we can get it
                if u"citeseerx.ist.psu.edu/" in url:
                    matches = re.findall(
                        u'<h3>Download Links</h3>.*?href="(.*?)"', page,
                        re.DOTALL)
                    if matches:
                        # the first download link is taken as-is, without
                        # checking that it actually returns a PDF
                        self.scraped_pdf_url = unicode(matches[0], "utf-8")
                        self.scraped_open_metadata_url = url
                        return

                pdf_download_link = self.find_pdf_link(page)
                if pdf_download_link is not None:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"found a PDF download link: {} {} [{}]".format(
                                pdf_download_link.href,
                                pdf_download_link.anchor, url))

                    pdf_url = get_link_target(pdf_download_link.href,
                                              self.r.url)
                    # if they are linking to a PDF, we need to follow the link to make sure it's legit
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"checking to see the PDF link actually gets a PDF [{}]"
                            .format(url))
                    if self.gets_a_pdf(pdf_download_link, self.r.url):
                        self.scraped_pdf_url = pdf_url
                        self.scraped_open_metadata_url = url
                        return

                # try this later because we would rather get a pdf
                # if they are linking to a .docx or similar, this is open.
                doc_link = find_doc_download_link(page)
                if doc_link is not None:
                    if DEBUG_SCRAPING:
                        logger.info(
                            u"found a .doc download link {} [{}]".format(
                                get_link_target(doc_link.href, self.r.url),
                                url))
                    self.scraped_open_metadata_url = url
                    return

        # The specific requests exceptions must come before RequestException,
        # their common base class, or their handlers can never run.
        # The message is passed as unicode: formatting an encoded byte string
        # into a unicode template would force an implicit ASCII decode.
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
                url,
                unicode(e.message))
            logger.info(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(
                u"found no PDF download link.  end of the line. [{}]".format(
                    url))

        return self