def is_a_pdf_page(response, page_publisher):
    if is_pdf_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a PDF {}".format(
                response.request.url))
        return True

    # everything below here needs to look at the content
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(u"response is too big for more checks in is_a_pdf_page")
        return False

    content = response.content_big()

    # PDFs start with this signature
    if re.match(u"%PDF", content):
        return True

    if page_publisher:
        says_free_publisher_patterns = [
            ("Wiley-Blackwell", u'<span class="freeAccess" title="You have free access to this content">'),
            ("Wiley-Blackwell", u'<iframe id="pdfDocument"'),
            ("JSTOR", ur'<li class="download-pdf-button">.*Download PDF.*</li>'),
            ("Institute of Electrical and Electronics Engineers (IEEE)",
             ur'<frame src="http://ieeexplore.ieee.org/.*?pdf.*?</frameset>'),
            ("IOP Publishing", ur'Full Refereed Journal Article')
        ]
        for (publisher, pattern) in says_free_publisher_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
            if is_same_publisher(page_publisher, publisher) and matches:
                return True

    return False
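# Illustrative sketch (hypothetical helper, not part of this codebase):
# the re.match(u"%PDF", content) test above works because re.match anchors
# at position 0 and every well-formed PDF begins with a "%PDF-x.y" header.
# A regex-free equivalent:
def has_pdf_signature(content):
    # a PDF file starts with e.g. "%PDF-1.7"
    return content[:4] == "%PDF"

# e.g. has_pdf_signature(open("paper.pdf", "rb").read()) -> True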
def is_a_word_doc(response):
    if not (response.url.endswith('.doc') or response.url.endswith('.docx')):
        return False

    if is_a_word_doc_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a word doc {}".format(
                response.request.url))
        return True

    # everything below here needs to look at the content
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(u"response is too big for more checks in is_a_word_doc")
        return False

    content = response.content_big()

    # .docx is a zip archive; a zip file ends with a 22-byte
    # end-of-central-directory record that starts with the "PK" signature
    if content[-22:].startswith('PK'):
        return True

    return False
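# Illustrative sketch (hypothetical helper, not part of this codebase):
# the content[-22:] test above leans on the zip format. A .docx file is a
# zip archive, and a zip archive ends with a 22-byte end-of-central-directory
# record (when there is no trailing archive comment) whose signature is
# "PK\x05\x06". Checking the full signature is slightly stricter:
def ends_with_zip_eocd(content):
    # "PK\x05\x06" marks the end-of-central-directory record of a zip file
    return content[-22:].startswith('PK\x05\x06')

# e.g. ends_with_zip_eocd(open("paper.docx", "rb").read()) -> True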
def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do-not-scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        # now before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # special exception for citeseer because we want the pdf link where
        # the copy is on the third-party repo, not the cached link, if we can get it
        if url and u"citeseerx.ist.psu.edu/" in url:
            matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
            if matches:
                pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")

        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because we'd rather get a PDF
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a .doc download link {} [{}]".format(
                    get_link_target(doc_link.href, self.r.url), url))
            self.scraped_open_metadata_url = url
            return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    # ChunkedEncodingError subclasses RequestException, so it must be caught first
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception:
        self.error += u"ERROR: Exception in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
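# Illustrative sketch (an assumption about behavior, not the real code):
# get_link_target(), used throughout this file, presumably resolves a
# possibly-relative href against the final response URL. urljoin covers that:
from urlparse import urljoin  # urllib.parse.urljoin on Python 3

def get_link_target_sketch(href, base_url):
    # e.g. get_link_target_sketch(u"/pdf/1.pdf", u"http://repo.example.edu/record/1")
    #   -> u"http://repo.example.edu/pdf/1.pdf"
    return urljoin(base_url, href)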
def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do-not-scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        # now before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        if page and u"osf-cookie" in unicode(page, "utf-8"):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")

        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because we'd rather get a PDF
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a .doc download link {} [{}]".format(
                    get_link_target(doc_link.href, self.r.url), url))
            self.scraped_open_metadata_url = url
            return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    # ChunkedEncodingError subclasses RequestException, so it must be caught first
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception:
        self.error += u"ERROR: Exception in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
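# Illustrative sketch (speculative; the real find_normalized_license() is
# defined elsewhere): one plausible shape is scanning the page for Creative
# Commons license URLs and normalizing to a short label, most specific first:
import re

def find_normalized_license_sketch(text):
    license_patterns = [
        (r'creativecommons\.org/licenses/by-nc-nd/', u'cc-by-nc-nd'),
        (r'creativecommons\.org/licenses/by-nc-sa/', u'cc-by-nc-sa'),
        (r'creativecommons\.org/licenses/by-nc/', u'cc-by-nc'),
        (r'creativecommons\.org/licenses/by-sa/', u'cc-by-sa'),
        (r'creativecommons\.org/licenses/by/', u'cc-by'),
    ]
    for (pattern, label) in license_patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return label
    return None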
def scrape_for_fulltext_link(self, find_pdf_link=True):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do-not-scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)
        resolved_url = self.r.url

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        if is_a_word_doc(self.r):
            if DEBUG_SCRAPING:
                logger.info(u"this is a word doc. success! [{}]".format(url))
            self.scraped_open_metadata_url = url
            return

        # now before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # remove script tags
        try:
            soup = BeautifulSoup(page, 'html.parser')
            for script in soup('script'):
                script.extract()
            page = str(soup)
        except HTMLParseError as e:
            logger.error(u'error parsing html, skipped script removal: {}'.format(e))

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # special exception for citeseer because we want the pdf link where
        # the copy is on the third-party repo, not the cached link, if we can get it
        if url and u"citeseerx.ist.psu.edu/" in url:
            matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
            if matches:
                pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")

        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because we'd rather get a PDF
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            absolute_doc_url = get_link_target(doc_link.href, resolved_url)
            if DEBUG_SCRAPING:
                logger.info(u"found a possible .doc download link [{}]".format(absolute_doc_url))
            if self.gets_a_word_doc(doc_link, self.r.url):
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this is a word doc. [{}]".format(absolute_doc_url))
                self.scraped_open_metadata_url = url
                return
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this ain't a word doc. [{}]".format(absolute_doc_url))

        bhl_link = find_bhl_view_link(resolved_url, page)
        if bhl_link is not None:
            logger.info('found a BHL document link: {}'.format(
                get_link_target(bhl_link.href, resolved_url)))
            self.scraped_open_metadata_url = url
            return

        if _trust_repo_license(resolved_url) and self.scraped_license:
            logger.info(u'trusting license {}'.format(self.scraped_license))
            self.scraped_open_metadata_url = self.url

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    # ChunkedEncodingError subclasses RequestException, so it must be caught first
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception:
        self.error += u"ERROR: Exception in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
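# Illustrative sketch (an assumption, not the real code): the header-based
# word-doc check used by is_a_word_doc() earlier presumably inspects the
# Content-Type response header for the .doc / .docx MIME types:
def is_a_word_doc_from_header_sketch(response):
    content_type = response.headers.get('Content-Type', '')
    word_types = [
        'application/msword',  # legacy .doc
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # .docx
    ]
    return any(word_type in content_type.lower() for word_type in word_types)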
def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because it is on our do-not-scrape list.".format(url))
            return

    try:
        with closing(http_get(url, stream=True, related_pub=self.related_pub,
                              ask_slowly=self.ask_slowly)) as self.r:

            if self.r.status_code != 200:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"is not a PDF for {}. continuing more checks".format(url))

            # now before reading the content, bail if it's too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            # special exception for citeseer because we want the pdf link where
            # the copy is on the third-party repo, not the cached link, if we can get it
            if u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
                if matches:
                    self.scraped_pdf_url = unicode(matches[0], "utf-8")
                    self.scraped_open_metadata_url = url
                    return

            pdf_download_link = self.find_pdf_link(page)
            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we'd rather get a PDF
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    # ChunkedEncodingError subclasses RequestException, so it must be caught first
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
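# Illustrative sketch (an assumption; the 10 MB threshold is invented):
# because the responses above are streamed, is_response_too_large() can be
# implemented against the Content-Length header without reading the body:
def is_response_too_large_sketch(r, max_bytes=10 * 1024 * 1024):
    content_length = r.headers.get('Content-Length')
    if content_length is None:
        # no header, so the size is unknown; let the caller decide
        return False
    return int(content_length) > max_bytes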
def scrape_for_fulltext_link(self):
    url = self.url
    is_journal = u"/doi/" in url or u"10." in url

    if DEBUG_SCRAPING:
        print u"in scrape_for_fulltext_link, getting URL: {}".format(url)

    if u"ncbi.nlm.nih.gov" in url:
        print u"not scraping {} because it is on our do-not-scrape list.".format(url)
        if "ncbi.nlm.nih.gov/pmc/articles/PMC" in url:
            # pmc has fulltext
            self.scraped_open_metadata_url = url
            pmcid_matches = re.findall(".*(PMC\d+).*", url)
            if pmcid_matches:
                pmcid = pmcid_matches[0]
                self.scraped_pdf_url = u"https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf".format(pmcid)
        else:
            # is an nlm page but not a pmc page, so is not full text
            return

    try:
        with closing(http_get(url, stream=True, read_timeout=10, doi=self.doi)) as r:

            if is_response_too_large(r):
                print "landing page is too large, skipping"
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if resp_is_pdf_from_header(r):
                if DEBUG_SCRAPING:
                    print u"the head says this is a PDF. success! [{}]".format(url)
                self.scraped_pdf_url = url
                return
            else:
                if DEBUG_SCRAPING:
                    print u"head says not a PDF for {}. continuing more checks".format(url)

            # get the HTML tree
            page = r.content

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = find_pdf_link(page, url)
            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    print u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url)

                pdf_url = get_link_target(pdf_download_link, r.url)
                if is_journal:
                    # if they are linking to a PDF, we need to follow the link to make sure it's legit
                    if DEBUG_SCRAPING:
                        print u"this is a journal. checking to see the PDF link actually gets a PDF [{}]".format(url)
                    if gets_a_pdf(pdf_download_link, r.url, self.doi):
                        self.scraped_pdf_url = pdf_url
                        self.scraped_open_metadata_url = url
                        return
                else:
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because we'd rather get a PDF
            # if they are linking to a .docx or similar, this is open.
            # this only works for repos... a ".doc" in a journal is not the article. example:
            # = closed journal http://doi.org/10.1007/s10822-012-9571-0
            if not is_journal:
                doc_link = find_doc_download_link(page)
                if doc_link is not None:
                    if DEBUG_SCRAPING:
                        print u"found a .doc download link {} [{}]".format(
                            get_link_target(doc_link, r.url), url)
                    self.scraped_open_metadata_url = url
                    return

    except requests.exceptions.ConnectionError:
        print u"ERROR: connection error on {} in scrape_for_fulltext_link, skipping.".format(url)
        return
    except requests.Timeout:
        print u"ERROR: timeout error on {} in scrape_for_fulltext_link, skipping.".format(url)
        return
    except requests.exceptions.InvalidSchema:
        print u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link, skipping.".format(url)
        return
    except requests.exceptions.RequestException:
        print u"ERROR: RequestException error on {} in scrape_for_fulltext_link, skipping.".format(url)
        return

    if DEBUG_SCRAPING:
        print u"found no PDF download link. end of the line. [{}]".format(url)

    return self
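# Illustrative sketch (an assumption about DuckLink, which is defined
# elsewhere): the scraping code only ever reads .href and .anchor from a
# link, so anything that duck-types those two attributes works, e.g.:
from collections import namedtuple

DuckLinkSketch = namedtuple('DuckLinkSketch', ['href', 'anchor'])

# e.g. DuckLinkSketch(u"http://osf.io/tyhqm/download", u"download").href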
def gets_a_pdf(link, base_url, doi=None):

    if is_purchase_link(link):
        return False

    absolute_url = get_link_target(link, base_url)
    if DEBUG_SCRAPING:
        print u"checking to see if {} is a pdf".format(absolute_url)

    start = time()
    try:
        with closing(http_get(absolute_url, stream=True, read_timeout=10, doi=doi)) as r:

            if resp_is_pdf_from_header(r):
                if DEBUG_SCRAPING:
                    print u"http header says this is a PDF. took {}s {}".format(
                        elapsed(start), absolute_url)
                return True

            # everything below here needs to look at the content
            # so bail here if the page is too big
            if is_response_too_large(r):
                if DEBUG_SCRAPING:
                    print u"response is too big for more checks in gets_a_pdf"
                return False

            # some publishers send a pdf back wrapped in an HTML page using frames.
            # this is where we detect that, using each publisher's idiosyncratic templates.
            # we only check based on a whitelist of publishers, because downloading this whole
            # page (r.content) is expensive to do for everyone.
            if 'onlinelibrary.wiley.com' in absolute_url:
                # = closed journal http://doi.org/10.1111/ele.12585
                # = open journal http://doi.org/10.1111/ele.12587 cc-by
                if '<iframe' in r.content:
                    if DEBUG_SCRAPING:
                        print u"this is a Wiley 'enhanced PDF' page. took {}s [{}]".format(
                            elapsed(start), absolute_url)
                    return True

            elif 'ieeexplore' in absolute_url:
                # (this is a good example of one dissem.in misses)
                # = open journal http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6740844
                # = closed journal http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6045214
                if '<frame' in r.content:
                    if DEBUG_SCRAPING:
                        print u"this is an IEEE 'enhanced PDF' page. took {}s [{}]".format(
                            elapsed(start), absolute_url)
                    return True

            elif 'sciencedirect' in absolute_url:
                if u"does not support the use of the crawler software" in r.content:
                    return True

        if DEBUG_SCRAPING:
            print u"we've decided this ain't a PDF. took {}s [{}]".format(
                elapsed(start), absolute_url)
        return False

    except requests.exceptions.ConnectionError:
        print u"ERROR: connection error in gets_a_pdf, skipping."
        return False
    except requests.Timeout:
        print u"ERROR: timeout error in gets_a_pdf, skipping."
        return False
    except requests.exceptions.InvalidSchema:
        print u"ERROR: InvalidSchema error in gets_a_pdf, skipping."
        return False
    except requests.exceptions.RequestException:
        print u"ERROR: RequestException error in gets_a_pdf, skipping."
        return False
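# Illustrative sketch (an assumption, not the real code): the header checks
# resp_is_pdf_from_header() / is_pdf_from_header() presumably look at the
# Content-Type header, with Content-Disposition as a fallback for servers
# that send a generic type but name a .pdf attachment:
def resp_is_pdf_from_header_sketch(r):
    content_type = r.headers.get('Content-Type', '')
    if 'application/pdf' in content_type.lower():
        return True
    content_disposition = r.headers.get('Content-Disposition', '')
    return '.pdf' in content_disposition.lower()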