def set_local_lookup_oa(self):
    start_time = time()

    evidence = None
    fulltext_url = self.url
    license = "unknown"

    if oa_local.is_open_via_doaj_issn(self.issns):
        license = oa_local.is_open_via_doaj_issn(self.issns)
        evidence = "oa journal (via issn in doaj)"
    elif oa_local.is_open_via_doaj_journal(self.journal):
        license = oa_local.is_open_via_doaj_journal(self.journal)
        evidence = "oa journal (via journal title in doaj)"
    elif oa_local.is_open_via_datacite_prefix(self.doi):
        evidence = "oa repository (via datacite prefix)"
    elif oa_local.is_open_via_doi_fragment(self.doi):
        evidence = "oa repository (via doi prefix)"
    elif oa_local.is_open_via_url_fragment(self.url):
        evidence = "oa repository (via url prefix)"
    elif oa_local.is_open_via_license_urls(self.crossref_license_urls):
        freetext_license = oa_local.is_open_via_license_urls(self.crossref_license_urls)
        license = oa_local.find_normalized_license(freetext_license)
        # oa_color depends on this including the word "hybrid"
        evidence = "hybrid journal (via crossref license url)"

    if evidence:
        self.fulltext_url = fulltext_url
        self.evidence = evidence
        self.license = license

    if self.fulltext_url and self.license and self.license != "unknown":
        self.response_done = True

def ask_local_lookup(self):
    start_time = time()

    evidence = None
    fulltext_url = self.url
    license = "unknown"

    if oa_local.is_open_via_doaj_issn(self.issns):
        license = oa_local.is_open_via_doaj_issn(self.issns)
        evidence = "oa journal (via issn in doaj)"
    elif oa_local.is_open_via_doaj_journal(self.journal):
        license = oa_local.is_open_via_doaj_journal(self.journal)
        evidence = "oa journal (via journal title in doaj)"
    elif oa_local.is_open_via_datacite_prefix(self.doi):
        evidence = "oa repository (via datacite prefix)"
    elif oa_local.is_open_via_doi_fragment(self.doi):
        evidence = "oa repository (via doi prefix)"
    elif oa_local.is_open_via_url_fragment(self.url):
        evidence = "oa repository (via url prefix)"
    elif oa_local.is_open_via_license_urls(self.crossref_license_urls):
        freetext_license = oa_local.is_open_via_license_urls(self.crossref_license_urls)
        license = oa_local.find_normalized_license(freetext_license)
        # oa_color depends on this including the word "hybrid"
        evidence = "hybrid journal (via crossref license url)"

    if evidence:
        my_version = OpenVersion()
        my_version.metadata_url = fulltext_url
        my_version.license = license
        my_version.source = evidence
        my_version.doi = self.doi
        self.open_versions.append(my_version)

def update_with_local_info(self):
    scrape_version_old = self.scrape_version
    scrape_license_old = self.scrape_license

    # if this repo has told us they will never have submitted, set default to be accepted
    if self.endpoint and self.endpoint.policy_promises_no_submitted and self.scrape_version != "publishedVersion":
        self.scrape_version = "acceptedVersion"

    # now look at the pmh record
    if self.pmh_record:
        # trust accepted in a variety of formats
        accepted_patterns = [
            re.compile(ur"accepted.?version", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"version.?accepted", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"accepted.?manuscript", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"<dc:type>peer.?reviewed</dc:type>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
        ]
        for pattern in accepted_patterns:
            if pattern.findall(self.pmh_record.api_raw):
                self.scrape_version = "acceptedVersion"
        # print u"version for is {}".format(self.scrape_version)

        # trust a strict version of published version
        published_patterns = [
            re.compile(ur"<dc:type>.*publishedVersion</dc:type>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"<free_to_read>.*published.*</free_to_read>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
        ]
        for published_pattern in published_patterns:
            if published_pattern.findall(self.pmh_record.api_raw):
                self.scrape_version = "publishedVersion"

        # get license if it is in pmh record
        rights_pattern = re.compile(ur"<dc:rights>(.*)</dc:rights>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
        rights_matches = rights_pattern.findall(self.pmh_record.api_raw)
        for rights_text in rights_matches:
            open_license = find_normalized_license(rights_text)
            # only overwrite it if there is one, so doesn't overwrite anything scraped
            if open_license:
                self.scrape_license = open_license

        self.scrape_version = _scrape_version_override().get(self.pmh_record.pmh_id, self.scrape_version)

    if scrape_version_old != self.scrape_version or scrape_license_old != self.scrape_license:
        self.updated = datetime.datetime.utcnow().isoformat()
        print u"based on OAI-PMH metadata, updated {} {} for {} {}".format(
            self.scrape_version, self.scrape_license, self.url, self.id)
        return True

    # print u"based on metadata, assuming {} {} for {} {}".format(self.scrape_version, self.scrape_license, self.url, self.id)
    return False

def set_info_for_pmc_page(self):
    if not self.pmcid:
        return

    result_list = query_pmc(self.pmcid)
    if not result_list:
        return
    result = result_list[0]

    has_pdf = result.get("hasPDF", None)
    is_author_manuscript = result.get("authMan", None)
    is_open_access = result.get("isOpenAccess", None)
    raw_license = result.get("license", None)

    self.scrape_metadata_url = u"http://europepmc.org/articles/{}".format(self.pmcid)
    if has_pdf == u"Y":
        self.scrape_pdf_url = u"http://europepmc.org/articles/{}?pdf=render".format(self.pmcid)
    if is_author_manuscript == u"Y":
        self.scrape_version = u"acceptedVersion"
    else:
        self.scrape_version = u"publishedVersion"

    if raw_license:
        self.scrape_license = find_normalized_license(raw_license)
    elif is_open_access == "Y":
        self.scrape_license = u"implied-oa"

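# query_pmc isn't defined in this section. A minimal sketch of what it plausibly
# looks like, reconstructed from the inline Europe PMC request in the next
# variant of set_info_for_pmc_page below: the URL template, tool parameter, and
# JSON path are taken from that code; pulling them into a helper with this name
# and signature is the assumption.
def query_pmc(pmcid):
    url_template = u"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={}&resulttype=core&format=json&tool=oadoi"
    url = url_template.format(pmcid)
    r = http_get(url)
    data = r.json()
    return data["resultList"]["result"]
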
def set_info_for_pmc_page(self):
    if not self.pmcid:
        return

    url_template = u"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={}&resulttype=core&format=json&tool=oadoi"
    url = url_template.format(self.pmcid)

    # try:
    r = http_get(url)
    data = r.json()
    result_list = data["resultList"]["result"]
    if not result_list:
        return
    result = result_list[0]

    has_pdf = result.get("hasPDF", None)
    is_author_manuscript = result.get("authMan", None)
    is_open_access = result.get("isOpenAccess", None)
    raw_license = result.get("license", None)

    self.scrape_metadata_url = u"http://europepmc.org/articles/{}".format(self.pmcid)
    if has_pdf == u"Y":
        self.scrape_pdf_url = u"http://europepmc.org/articles/{}?pdf=render".format(self.pmcid)
    if is_author_manuscript == u"Y":
        self.scrape_version = u"acceptedVersion"
    else:
        self.scrape_version = u"publishedVersion"

    if raw_license:
        self.scrape_license = find_normalized_license(raw_license)
    elif is_open_access == "Y":
        self.scrape_license = u"implied-oa"

def set_version_and_license(self, r=None):
    self.updated = datetime.datetime.utcnow().isoformat()

    if self.is_pmc:
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = "submittedVersion"

    is_updated = self.update_with_local_info()

    # now try to see what we can get out of the pdf itself
    if not r:
        logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(
            self.url, self.scrape_version, self.scrape_license))
        return

    try:
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r, max_pages=25)
        # logger.info(text)

        if text and self.scrape_version == "submittedVersion":
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                    self.scrape_version = "publishedVersion"

        if not self.scrape_license:
            open_license = find_normalized_license(text)
            if open_license:
                logger.info(u'found license in PDF: {}'.format(open_license))
                self.scrape_license = open_license

    except Exception as e:
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

    logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(
        self.url, self.scrape_version, self.scrape_license))

def set_version_and_license(self, r=None):
    if self.is_pmc:
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = "submittedVersion"

    if not r:
        return

    try:
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r)
        # logger.info(text)

        if text and self.scrape_version == "submittedVersion":
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright \d{4}", re.IGNORECASE),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE),
                re.compile(ur"this is an open access article", re.IGNORECASE),
            ]
            for pattern in patterns:
                matches = pattern.findall(text)
                if matches:
                    self.scrape_version = "publishedVersion"

        logger.info(u"returning {} with scrape_version: {}".format(self.url, self.scrape_version))

        open_license = find_normalized_license(text)
        if open_license:
            self.scrape_license = open_license

    except Exception as e:
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

def update_with_local_info(self):
    scrape_version_old = self.scrape_version
    scrape_license_old = self.scrape_license

    # if this repo has told us they will never have submitted, set default to be accepted
    if self.endpoint and self.endpoint.policy_promises_no_submitted and self.scrape_version != "publishedVersion":
        self.scrape_version = "acceptedVersion"

    # now look at the pmh record
    if self.pmh_record:
        # trust accepted in a variety of formats
        accepted_patterns = [
            re.compile(ur"accepted.?version", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"version.?accepted", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"accepted.?manuscript", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"<dc:type>peer.?reviewed</dc:type>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
        ]
        for pattern in accepted_patterns:
            if pattern.findall(self.pmh_record.api_raw):
                self.scrape_version = "acceptedVersion"
        # print u"version for is {}".format(self.scrape_version)

        # trust a strict version of published version
        published_patterns = [
            re.compile(ur"<dc:type>.*publishedVersion</dc:type>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"<dc:type\.version>.*publishedVersion</dc:type\.version>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"<free_to_read>.*published.*</free_to_read>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
        ]
        for published_pattern in published_patterns:
            if published_pattern.findall(self.pmh_record.api_raw):
                self.scrape_version = "publishedVersion"

        # get license if it is in pmh record
        rights_pattern = re.compile(ur"<dc:rights>(.*)</dc:rights>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
        rights_matches = rights_pattern.findall(self.pmh_record.api_raw)
        rights_license_pattern = re.compile(ur"<dc:rights\.license>(.*)</dc:rights\.license>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
        rights_matches.extend(rights_license_pattern.findall(self.pmh_record.api_raw))

        for rights_text in rights_matches:
            open_license = find_normalized_license(rights_text)
            # only overwrite it if there is one, so doesn't overwrite anything scraped
            if open_license:
                self.scrape_license = open_license

        self.scrape_version = _scrape_version_override().get(self.pmh_record.pmh_id, self.scrape_version)

    if self.scrape_pdf_url and re.search(ur'^https?://rke\.abertay\.ac\.uk', self.scrape_pdf_url):
        if re.search(ur'Publishe[dr]_?\d\d\d\d\.pdf$', self.scrape_pdf_url):
            self.scrape_version = "publishedVersion"

    if scrape_version_old != self.scrape_version or scrape_license_old != self.scrape_license:
        self.updated = datetime.datetime.utcnow().isoformat()
        return True

    return False

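# _scrape_version_override isn't shown in this section. From its call sites it
# returns a dict mapping a PMH record ID to a hardcoded version string, so that
# records whose metadata is known to be mislabeled can be pinned. A minimal
# sketch consistent with that use; the record IDs below are hypothetical
# placeholders, not real overrides.
def _scrape_version_override():
    return {
        # hypothetical examples only
        u"oai:example.repo.edu:10000": u"publishedVersion",
        u"oai:example.repo.edu:10001": u"acceptedVersion",
    }
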
def update_with_local_info(self):
    scrape_version_old = self.scrape_version
    scrape_license_old = self.scrape_license

    # if this repo has told us they will never have submitted, set default to be accepted
    if self.endpoint and self.endpoint.policy_promises_no_submitted:
        self.scrape_version = "acceptedVersion"

    # now look at the pmh record
    if self.pmh_record:
        # trust accepted in a variety of formats
        accepted_patterns = [
            re.compile(ur"accepted.?version", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"version.?accepted", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"accepted.?manuscript", re.IGNORECASE | re.MULTILINE | re.DOTALL),
        ]
        for pattern in accepted_patterns:
            if pattern.findall(self.pmh_record.api_raw):
                self.scrape_version = "acceptedVersion"
        # print u"version for is {}".format(self.scrape_version)

        # trust a strict version of published version
        published_patterns = [
            re.compile(ur"<dc:type>.*publishedVersion</dc:type>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            re.compile(ur"<free_to_read>.*published.*</free_to_read>", re.IGNORECASE | re.MULTILINE | re.DOTALL),
        ]
        for published_pattern in published_patterns:
            if published_pattern.findall(self.pmh_record.api_raw):
                self.scrape_version = "publishedVersion"

        # get license if it is in pmh record
        rights_pattern = re.compile(ur"<dc:rights>(.*)</dc:rights>", re.IGNORECASE | re.MULTILINE | re.DOTALL)
        rights_matches = rights_pattern.findall(self.pmh_record.api_raw)
        for rights_text in rights_matches:
            open_license = find_normalized_license(rights_text)
            # only overwrite it if there is one, so doesn't overwrite anything scraped
            if open_license:
                self.scrape_license = open_license

    if scrape_version_old != self.scrape_version or scrape_license_old != self.scrape_license:
        self.updated = datetime.datetime.utcnow().isoformat()
        print u"based on OAI-PMH metadata, updated {} {} for {} {}".format(
            self.scrape_version, self.scrape_license, self.url, self.id)
        return True

    # print u"based on metadata, assuming {} {} for {} {}".format(self.scrape_version, self.scrape_license, self.url, self.id)
    return False

def set_version_and_license(self, r=None):
    # set as default
    self.scrape_version = "submittedVersion"

    if self.is_pmc:
        # todo: handle PMC pages properly instead of crashing here
        raise NotImplementedError(u"implement PMC version properly")

    if r:
        try:
            text = convert_pdf_to_txt(r)
            # logger.info(text)
            if text:
                patterns = [
                    re.compile(ur"©.?\d{4}", re.UNICODE),
                    re.compile(ur"copyright \d{4}", re.IGNORECASE),
                    re.compile(ur"all rights reserved", re.IGNORECASE),
                    re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE),
                    re.compile(ur"this is an open access article", re.IGNORECASE),
                ]
                for pattern in patterns:
                    matches = pattern.findall(text)
                    if matches:
                        self.scrape_version = "publishedVersion"

                logger.info(u"returning with scrape_version={}".format(self.scrape_version))

                open_license = find_normalized_license(text)
                if open_license:
                    self.scrape_license = open_license
        except Exception as e:
            self.error += u"Exception doing convert_pdf_to_txt on {}! investigate! {}".format(
                self.scrape_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)

def set_fulltext_urls(self):
    # first set license if there is one originally. overwrite it later if scraped a better one.
    if "license" in self.doc and self.doc["license"]:
        self.license = oa_local.find_normalized_license(self.doc["license"])

    for my_webpage in self.open_webpages:
        if my_webpage.has_fulltext_url:
            self.fulltext_url_dicts += [{
                "free_pdf_url": my_webpage.scraped_pdf_url,
                "pdf_landing_page": my_webpage.url,
            }]
            if not self.license or self.license == "unknown":
                self.license = my_webpage.scraped_license
        else:
            print "{} has no fulltext url alas".format(my_webpage)

    if self.license == "unknown":
        self.license = None

def ask_local_lookup(self):
    start_time = time()

    evidence = None
    fulltext_url = self.url
    license = None

    if oa_local.is_open_via_doaj_issn(self.issns, self.year):
        license = oa_local.is_open_via_doaj_issn(self.issns, self.year)
        evidence = "oa journal (via issn in doaj)"
    elif not self.issns and oa_local.is_open_via_doaj_journal(self.all_journals, self.year):
        license = oa_local.is_open_via_doaj_journal(self.all_journals, self.year)
        evidence = "oa journal (via journal title in doaj)"
    elif oa_local.is_open_via_publisher(self.publisher):
        evidence = "oa journal (via publisher name)"
    elif oa_local.is_open_via_doi_fragment(self.doi):
        evidence = "oa repository (via doi prefix)"
    elif oa_local.is_open_via_url_fragment(self.url):
        evidence = "oa repository (via url prefix)"
    elif oa_local.is_open_via_license_urls(self.crossref_license_urls):
        freetext_license = oa_local.is_open_via_license_urls(self.crossref_license_urls)
        license = oa_local.find_normalized_license(freetext_license)
        # logger.info(u"freetext_license: {} {}".format(freetext_license, license))
        evidence = "open (via crossref license)"

    if evidence:
        my_location = OpenLocation()
        my_location.metadata_url = fulltext_url
        my_location.license = license
        my_location.evidence = evidence
        my_location.updated = datetime.datetime.utcnow()
        my_location.doi = self.doi
        my_location.version = "publishedVersion"
        self.open_locations.append(my_location)

def get_fulltext_webpages_from_our_base_doc(doc):
    response = []

    license = doc.get("fulltext_license", None)

    # workaround for a bug there was in the normalized license
    license_string_in_doc = doc.get("license", "")
    if license_string_in_doc:
        if "orks not in the public domain" in license_string_in_doc:
            license = None
        if not license:
            license = find_normalized_license(license_string_in_doc)

    if "fulltext_url_dicts" in doc:
        for scrape_results in doc["fulltext_url_dicts"]:
            my_webpage = WebpageInOpenRepo(url=scrape_results.get("pdf_landing_page", None))
            my_webpage.scraped_pdf_url = scrape_results.get("free_pdf_url", None)
            my_webpage.scraped_open_metadata_url = scrape_results.get("pdf_landing_page", None)
            my_webpage.scraped_license = license
            response.append(my_webpage)

    # eventually these will have fulltext_url_dicts populated as well
    if doc["oa"] == 1:
        for url in get_urls_from_our_base_doc(doc):
            my_webpage = WebpageInOpenRepo(url=url)
            my_webpage.scraped_open_metadata_url = url

            # this will get handled when the oa1 urls get added
            pmcid_matches = re.findall(".*(PMC\d+).*", url)
            if pmcid_matches:
                pmcid = pmcid_matches[0]
                my_webpage.scraped_pdf_url = u"https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf".format(pmcid)

            my_webpage.scraped_license = license
            response.append(my_webpage)

    return response

def scrape_for_fulltext_link(self, find_pdf_link=True):
    landing_url = self.url

    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} says it is open".format(landing_url))

    start = time()
    try:
        self.r = http_get(landing_url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)
        resolved_landing_url = self.r.url

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                    self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
            return

        # example 10.1007/978-3-642-01445-1
        if u"crossref.org/_deleted-doi/" in resolved_landing_url:
            logger.info(u"this is a deleted doi")
            return

        # if our landing_url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(landing_url))
            self.scraped_pdf_url = landing_url
            self.open_version_source_string = "open (via free pdf)"
            # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"landing page is not a PDF for {}. continuing more checks".format(landing_url))

        # get the HTML tree
        page = self.r.content_small()

        # remove script tags
        try:
            soup = BeautifulSoup(page, 'html.parser')
            [script.extract() for script in soup('script')]
            page = str(soup)
        except HTMLParseError as e:
            logger.error(u'error parsing html, skipped script removal: {}'.format(e))

        # Look for a pdf link. If we find one, look for a license.
        pdf_download_link = self.find_pdf_link(page) if find_pdf_link else None

        if pdf_download_link is not None:
            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via free pdf)"

                # set the license if we can find one
                scraped_license = find_normalized_license(page)
                if scraped_license:
                    self.scraped_license = scraped_license

        # Look for patterns that indicate availability but not necessarily openness, and make this a bronze location.
        bronze_url_snippet_patterns = [
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
        ]
        for (url_snippet, pattern) in bronze_url_snippet_patterns:
            if url_snippet in resolved_landing_url.lower() and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via free article)"

        bronze_publisher_patterns = [
            ("New England Journal of Medicine (NEJM/MMS)", u'<meta content="yes" name="evt-free"'),
            ("Massachusetts Medical Society", u'<meta content="yes" name="evt-free"'),
        ]
        for (publisher, pattern) in bronze_publisher_patterns:
            if self.is_same_publisher(publisher) and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via free article)"

        # Look for some license-like patterns that make this a hybrid location.
        hybrid_url_snippet_patterns = [
            ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
            ('journals.ametsoc.org/', ur'src="/templates/jsp/_style2/_ams/images/access_free\.gif"'),
            ('apsjournals.apsnet.org', ur'src="/products/aps/releasedAssets/images/open-access-icon\.png"'),
            ('psychiatriapolska.pl', u'is an Open Access journal:'),
            ('journals.lww.com', u'<span class="[^>]*ejp-indicator--free'),
        ]
        for (url_snippet, pattern) in hybrid_url_snippet_patterns:
            if url_snippet in resolved_landing_url.lower() and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        hybrid_publisher_patterns = [
            ("Informa UK Limited", u"/accessOA.png"),
            ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
            ("Informa UK Limited", u"/accessOA.png"),
            ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
            ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
        ]
        for (publisher, pattern) in hybrid_publisher_patterns:
            if self.is_same_publisher(publisher) and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        # Look for more license-like patterns that make this a hybrid location.
        # Extract the specific license if present.
        license_patterns = [
            ur"(creativecommons.org/licenses/[a-z\-]+)",
            u"distributed under the terms (.*) which permits",
            u"This is an open access article under the terms (.*) which permits",
            u"This is an open access article published under (.*) which permits",
            u'<div class="openAccess-articleHeaderContainer(.*?)</div>',
        ]
        for pattern in license_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if matches:
                self.scraped_open_metadata_url = landing_url
                normalized_license = find_normalized_license(matches[0])
                self.scraped_license = normalized_license or 'implied-oa'
                if normalized_license:
                    self.open_version_source_string = 'open (via page says license)'
                else:
                    self.open_version_source_string = 'open (via page says Open Access)'

        if self.is_open:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return True
        else:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return False

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    # ChunkedEncodingError subclasses RequestException, so it must come first to be reachable
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
        logger.info(self.error)
        return False
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return False

def set_version_and_license(self, r=None):
    self.updated = datetime.datetime.utcnow().isoformat()

    if self.is_pmc:
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = "submittedVersion"

    is_updated = self.update_with_local_info()

    # now try to see what we can get out of the pdf itself
    if not r:
        logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(
            self.url, self.scrape_version, self.scrape_license))
        return

    try:
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r, max_pages=25)
        # logger.info(text)

        if text and self.scrape_version == "submittedVersion":
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    self.scrape_version = "publishedVersion"

        if not self.scrape_license:
            open_license = find_normalized_license(text)
            if open_license:
                self.scrape_license = open_license

    except Exception as e:
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

    logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(
        self.url, self.scrape_version, self.scrape_license))

def scrape_green(self):
    # handle these special cases, where we compute the pdf rather than looking for it
    if "oai:arXiv.org" in self.pmh_id:
        self.scrape_metadata_url = self.url
        self.scrape_pdf_url = self.url.replace("abs", "pdf")

    if self.is_pmc:
        self.set_info_for_pmc_page()
        return

    # https://ink.library.smu.edu.sg/do/oai/
    if self.endpoint and self.endpoint.id == 'ys9xnlw27yogrfsecedx' and u'ink.library.smu.edu.sg' in self.url:
        if u'viewcontent.cgi?' in self.url:
            return
        if self.pmh_record and find_normalized_license(self.pmh_record.license):
            self.scrape_metadata_url = self.url
            self.set_version_and_license()
            return

    if not self.scrape_pdf_url or not self.scrape_version:
        with PmhRepoWebpage(url=self.url, scraped_pdf_url=self.scrape_pdf_url, repo_id=self.repo_id) as my_webpage:
            if not self.scrape_pdf_url:
                my_webpage.scrape_for_fulltext_link()
                self.error += my_webpage.error
                if my_webpage.is_open:
                    logger.info(u"** found an open copy! {}".format(my_webpage.fulltext_url))
                    self.scrape_updated = datetime.datetime.utcnow().isoformat()
                    self.scrape_metadata_url = self.url
                    if my_webpage.scraped_pdf_url:
                        self.scrape_pdf_url = my_webpage.scraped_pdf_url
                    if my_webpage.scraped_open_metadata_url:
                        self.scrape_metadata_url = my_webpage.scraped_open_metadata_url
                    if my_webpage.scraped_license:
                        self.scrape_license = my_webpage.scraped_license
                    if my_webpage.scraped_version:
                        self.scrape_version = my_webpage.scraped_version
            if self.scrape_pdf_url and not self.scrape_version:
                self.set_version_and_license(r=my_webpage.r)

    if self.scrape_pdf_url and not self.scrape_version:
        with PmhRepoWebpage(url=self.url, scraped_pdf_url=self.scrape_pdf_url, repo_id=self.repo_id) as my_webpage:
            my_webpage.set_r_for_pdf()
            self.set_version_and_license(r=my_webpage.r)

    if self.is_open and not self.scrape_version:
        self.scrape_version = self.default_version()

    # associate certain landing page URLs with PDFs
    # https://repository.uantwerpen.be
    if self.endpoint and self.endpoint.id == 'mmv3envg3kaaztya9tmo':
        if self.scrape_pdf_url and self.scrape_pdf_url == self.scrape_metadata_url and self.pmh_record:
            logger.info(u'looking for landing page for {}'.format(self.scrape_pdf_url))
            landing_urls = [u for u in self.pmh_record.urls if u'hdl.handle.net' in u]
            if len(landing_urls) == 1:
                logger.info(u'trying landing page {}'.format(landing_urls[0]))
                try:
                    if http_get(landing_urls[0]).status_code == 200:
                        self.scrape_metadata_url = landing_urls[0]
                except:
                    pass
                if self.scrape_metadata_url:
                    logger.info(u'set landing page {}'.format(self.scrape_metadata_url))

    # https://lirias.kuleuven.be
    if (self.endpoint
            and self.endpoint.id == 'ycf3gzxeiyuw3jqwjmx3'
            and self.scrape_pdf_url == self.scrape_metadata_url
            and self.scrape_pdf_url
            and 'lirias.kuleuven.be' in self.scrape_pdf_url):
        if self.pmh_record and self.pmh_record.bare_pmh_id and 'oai:lirias2repo.kuleuven.be:' in self.pmh_record.bare_pmh_id:
            self.scrape_metadata_url = 'https://lirias.kuleuven.be/handle/{}'.format(
                self.pmh_record.bare_pmh_id.replace('oai:lirias2repo.kuleuven.be:', ''))

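# A minimal usage sketch (not from the source) of how scrape_green is driven:
# build or load a page object of the class that defines it, run the scrape,
# then read back the fields it fills in. load_page and the example PMH ID are
# hypothetical; the attributes printed are the ones scrape_green sets.
page = load_page(pmh_id=u"oai:arXiv.org:1234.56789")  # hypothetical loader and ID
page.scrape_green()
print u"pdf: {}, landing: {}".format(page.scrape_pdf_url, page.scrape_metadata_url)
print u"version: {}, license: {}".format(page.scrape_version, page.scrape_license)
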
def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru",
    ]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because is on our do not scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        # now before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        if page and u"osf-cookie" in unicode(page, "utf-8"):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")
        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because would rather get a pdf
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a .doc download link {} [{}]".format(
                    get_link_target(doc_link.href, self.r.url), url))
            self.scraped_open_metadata_url = url
            return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    # ChunkedEncodingError subclasses RequestException, so it must come first to be reachable
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception as e:
        self.error += u"ERROR: Exception error on in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self

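# DuckLink isn't defined in this section. From its use above it's a duck-typed
# stand-in for a parsed <a> tag, carrying only the two attributes the scrapers
# read (href and anchor). A minimal sketch consistent with that use:
class DuckLink(object):
    def __init__(self, href, anchor):
        self.href = href
        self.anchor = anchor
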
def scrape_for_fulltext_link(self):
    landing_url = self.url

    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} says it is open".format(landing_url))

    start = time()
    try:
        self.r = http_get(landing_url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                    self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
            return

        # example 10.1007/978-3-642-01445-1
        if u"crossref.org/_deleted-doi/" in self.r.url:
            logger.info(u"this is a deleted doi")
            return

        # if our landing_url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(landing_url))
            self.scraped_pdf_url = landing_url
            self.open_version_source_string = "open (via free pdf)"
            # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"landing page is not a PDF for {}. continuing more checks".format(landing_url))

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = self.find_pdf_link(page)
        if pdf_download_link is not None:
            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via free pdf)"

        # now look and see if it is not just free, but open!
        license_patterns = [
            u"(creativecommons.org\/licenses\/[a-z\-]+)",
            u"distributed under the terms (.*) which permits",
            u"This is an open access article under the terms (.*) which permits",
            u"This is an open access article published under (.*) which permits",
            u'<div class="openAccess-articleHeaderContainer(.*?)</div>',
        ]
        for pattern in license_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if matches:
                self.scraped_license = find_normalized_license(matches[0])
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via page says license)"

        says_open_url_snippet_patterns = [
            ("projecteuclid.org/", u'<strong>Full-text: Open access</strong>'),
        ]
        for (url_snippet, pattern) in says_open_url_snippet_patterns:
            matches = re.findall(pattern, self.r.content_small(), re.IGNORECASE)
            if url_snippet in self.r.request.url.lower() and matches:
                self.scraped_open_metadata_url = self.r.request.url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        says_open_access_patterns = [
            ("Informa UK Limited", u"/accessOA.png"),
            ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
            ("Informa UK Limited", u"/accessOA.png"),
            ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
            ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
        ]
        for (publisher, pattern) in says_open_access_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
            if self.is_same_publisher(publisher) and matches:
                self.scraped_license = "implied-oa"
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"

        if self.is_open:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return True
        else:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return False

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    # ChunkedEncodingError subclasses RequestException, so it must come first to be reachable
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
        logger.info(self.error)
        return False
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return False

def scrape_for_fulltext_link(url):
    if DEBUG_SCRAPING:
        print u"getting URL: {}".format(url)

    license = "unknown"
    is_journal = is_doi_url(url) or (u"/doi/" in url)

    if u"ncbi.nlm.nih.gov" in url:
        print u"not scraping {} because is on our do not scrape list.".format(url)
        if "ncbi.nlm.nih.gov/pmc/articles/PMC" in url:
            # pmc has fulltext
            return (url, license)
        else:
            # is an nlm page but not a pmc page, so is not full text
            return (None, license)

    if DEBUG_SCRAPING:
        print u"in scrape_for_fulltext_link"

    with closing(http_get(url, stream=True, timeout=10)) as r:
        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if resp_is_pdf(r):
            if DEBUG_SCRAPING:
                print u"the head says this is a PDF. success! [{}]".format(url)
            return (url, license)
        else:
            if DEBUG_SCRAPING:
                print u"head says not a PDF. continuing more checks"

        # get the HTML tree
        page = r.content
        license = find_normalized_license(page)

        # if they are linking to a .docx or similar, this is open.
        # this only works for repos... a ".doc" in a journal is not the article. example:
        # = closed journal http://doi.org/10.1007/s10822-012-9571-0
        if not is_journal:
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    print u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link, r.url), url)
                return (url, license)

        pdf_download_link = find_pdf_link(page, url)
        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                print u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url)

            pdf_url = get_link_target(pdf_download_link, r.url)
            if is_journal:
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    print u"this is a journal. checking to see the PDF link actually gets a PDF [{}]".format(url)
                if gets_a_pdf(pdf_download_link, r.url):
                    return (pdf_url, license)
            else:
                return (pdf_url, license)

    if license != "unknown":
        # = open 10.1136/bmj.i2716 cc-by
        # = open 10.1136/bmj.i1209 cc-by-nc
        # print "FOUND A LICENSE!", license, url
        return (None, license)

    if DEBUG_SCRAPING:
        print u"found no PDF download link [{}]".format(url)

    return (None, license)

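# Usage sketch for the early tuple-returning variant above (not from the
# source): the caller gets back (fulltext_url, license), where fulltext_url can
# be None and license defaults to "unknown". The example URL comes from the
# comments in the function itself.
(fulltext_url, license) = scrape_for_fulltext_link(u"http://hdl.handle.net/2060/20140010374")
if fulltext_url:
    print u"found fulltext at {} with license {}".format(fulltext_url, license)
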
def set_version_and_license(self, r=None):
    self.updated = datetime.datetime.utcnow().isoformat()

    if self.is_pmc:
        self.set_info_for_pmc_page()
        return

    # set as default
    self.scrape_version = self.default_version()

    is_updated = self.update_with_local_info()

    # now try to see what we can get out of the pdf itself
    version_is_from_strict_metadata = self.pmh_record and self.pmh_record.api_raw and re.compile(
        ur"<dc:type>{}</dc:type>".format(self.scrape_version),
        re.IGNORECASE | re.MULTILINE | re.DOTALL
    ).findall(self.pmh_record.api_raw)

    if version_is_from_strict_metadata or not r:
        logger.info(u"before scrape returning {} with scrape_version: {}, license {}".format(
            self.url, self.scrape_version, self.scrape_license))
        return

    try:
        # http://crossmark.dyndns.org/dialog/?doi=10.1016/j.jml.2012 at http://dspace.mit.edu/bitstream/1721.1/102417/1/Gibson_The%20syntactic.pdf
        if re.findall(u"crossmark\.[^/]*\.org/", r.content_big(), re.IGNORECASE):
            self.scrape_version = "publishedVersion"

        text = convert_pdf_to_txt(r, max_pages=25)
        # logger.info(text)

        if text and self.scrape_version != "publishedVersion" and not version_is_from_strict_metadata:
            patterns = [
                re.compile(ur"©.?\d{4}", re.UNICODE),
                re.compile(ur"\(C\).?\d{4}", re.IGNORECASE),
                re.compile(ur"copyright.{0,6}\d{4}", re.IGNORECASE),
                re.compile(ur"received.{0,100}revised.{0,100}accepted.{0,100}publication", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"all rights reserved", re.IGNORECASE),
                re.compile(ur"This article is distributed under the terms of the Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is licensed under a Creative Commons", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"this is an open access article", re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur"This article is brought to you for free and open access by Works.", re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is published version'.format(pattern.pattern))
                    self.scrape_version = "publishedVersion"

        if text and self.scrape_version != 'acceptedVersion':
            patterns = [
                re.compile(ur'This is a post-peer-review, pre-copyedit version', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'This is the peer reviewed version of the following article', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'The present manuscript as of \d\d \w+ \d\d\d\d has been accepted', re.IGNORECASE | re.MULTILINE | re.DOTALL),
                re.compile(ur'Post-peer-review, pre-copyedit version of accepted manuscript', re.IGNORECASE | re.MULTILINE | re.DOTALL),
            ]
            for pattern in patterns:
                if pattern.findall(text):
                    logger.info(u'found {}, decided PDF is accepted version'.format(pattern.pattern))
                    self.scrape_version = "acceptedVersion"

            if r and r.url and '61RMIT_INST' in r.url:
                if 'Version: Accepted' in text:
                    logger.info(u'found Version: Accepted, decided PDF is accepted version')
                    self.scrape_version = "acceptedVersion"

            heading_text = text[0:50].lower()
            accepted_headings = [
                "final accepted version",
                "accepted manuscript",
            ]
            for heading in accepted_headings:
                if heading in heading_text:
                    logger.info(u'found {} in heading, decided PDF is accepted version'.format(heading))
                    self.scrape_version = "acceptedVersion"
                    break

        if not self.scrape_license:
            open_license = find_normalized_license(text)
            if open_license:
                logger.info(u'found license in PDF: {}'.format(open_license))
                self.scrape_license = open_license

    except Exception as e:
        logger.exception(u"exception in convert_pdf_to_txt for {}".format(self.url))
        self.error += u"Exception doing convert_pdf_to_txt!"
        logger.info(self.error)

    if self.pmh_record:
        self.scrape_version = _scrape_version_override().get(self.pmh_record.bare_pmh_id, self.scrape_version)

    logger.info(u"scrape returning {} with scrape_version: {}, license {}".format(
        self.url, self.scrape_version, self.scrape_license))

def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru",
    ]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because is on our do not scrape list.".format(url))
            return

    try:
        with closing(http_get(url, stream=True, related_pub=self.related_pub, ask_slowly=self.ask_slowly)) as self.r:
            if self.r.status_code != 200:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if DEBUG_SCRAPING:
                    logger.info(u"this is a PDF. success! [{}]".format(url))
                self.scraped_pdf_url = url
                return
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"is not a PDF for {}. continuing more checks".format(url))

            # now before reading the content, bail if it's too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            # special exception for citeseer because we want the pdf link where
            # the copy is on the third party repo, not the cached link, if we can get it
            if u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
                if matches:
                    self.scraped_pdf_url = unicode(matches[0], "utf-8")
                    self.scraped_open_metadata_url = url
                    return

            pdf_download_link = self.find_pdf_link(page)
            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because would rather get a pdf
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)
            if doc_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a .doc download link {} [{}]".format(
                        get_link_target(doc_link.href, self.r.url), url))
                self.scraped_open_metadata_url = url
                return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    # ChunkedEncodingError subclasses RequestException, so it must come first to be reachable
    except requests.exceptions.ChunkedEncodingError as e:
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self

def call_base(products):
    if not products:
        # print "empty product list so not calling base"
        return

    titles = []
    # may be more than one product for a given title, so is a dict of lists
    titles_to_products = defaultdict(list)
    for p in products:
        p.license_string = ""
        p.base_dcoa = None
        p.repo_urls = {"urls": []}

        title = p.best_title
        titles_to_products[normalize(title)].append(p)

        title = title.lower()
        # can't just replace all punctuation because ' replaced with ? gets no hits
        title = title.replace('"', "?")
        title = title.replace('#', "?")
        title = title.replace('=', "?")
        title = title.replace('&', "?")
        title = title.replace('%', "?")
        title = title.replace('-', "*")

        # only bother looking up titles that are at least 3 words long
        title_words = title.split()
        if len(title_words) >= 3:
            # only look up the first 12 words
            title_to_query = u" ".join(title_words[0:12])
            titles.append(title_to_query)

    # now do the lookup in base
    titles_string = u"%20OR%20".join([u'%22{}%22'.format(title) for title in titles])
    # print u"{}: calling base with query string of length {}, utf8 bits {}".format(self.id, len(titles_string), 8*len(titles_string.encode('utf-8')))

    url_template = u"https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi?func=PerformSearch&query=(dcoa:1%20OR%20dcoa:2)%20AND%20dctitle:({titles_string})&fields=dctitle,dccreator,dcyear,dcrights,dcprovider,dcidentifier,dcoa,dclink&hits=100000&format=json"
    url = url_template.format(titles_string=titles_string)
    # print u"calling base with {}".format(url)

    start_time = time()
    proxy_url = os.getenv("STATIC_IP_PROXY")
    proxies = {"https": proxy_url}
    r = None
    try:
        r = requests.get(url, proxies=proxies, timeout=6)
        # print u"** querying with {} titles took {}s".format(len(titles), elapsed(start_time))
    except requests.exceptions.ConnectionError:
        print u"connection error in set_fulltext_urls, skipping."
    except requests.Timeout:
        print u"timeout error in set_fulltext_urls, skipping."

    if r != None and r.status_code != 200:
        print u"problem searching base! status_code={}".format(r.status_code)
        for p in products:
            p.base_dcoa = u"base query error: status_code={}".format(r.status_code)
    else:
        try:
            data = r.json()["response"]
            # print "number found:", data["numFound"]

            for doc in data["docs"]:
                base_dcoa = str(doc["dcoa"])
                try:
                    # print "normalize(doc['dctitle'])", normalize(doc["dctitle"]), doc["dctitle"], doc["dcidentifier"]
                    # print "titles", titles
                    matching_products = titles_to_products[normalize(doc["dctitle"])]
                except KeyError:
                    matching_products = []

                for p in matching_products:
                    if base_dcoa == "1":
                        # got a 1 hit. yay! overwrite no matter what.
                        if p.fulltext_url:
                            urls_to_choose_from = [p.fulltext_url] + doc["dcidentifier"]
                        else:
                            urls_to_choose_from = doc["dcidentifier"]
                        # print "urls_to_choose_from", urls_to_choose_from
                        p.fulltext_url = pick_best_base_url(urls_to_choose_from)
                        p.evidence = "oa repository (via base-search.net oa url)"
                        p.repo_urls["urls"] = {}
                        p.base_dcoa = base_dcoa
                        if "dcrights" in doc:
                            p.license_string += u"{};".format(doc["dcrights"])
                    elif base_dcoa == "2" and p.base_dcoa != "1":
                        # got a 2 hit. use only if we don't already have a 1.
                        p.repo_urls["urls"] += doc["dcidentifier"]
                        p.base_dcoa = base_dcoa

        except ValueError:  # includes simplejson.decoder.JSONDecodeError
            print u'decoding JSON has failed base response'
            for p in products:
                p.base_dcoa = u"base lookup error: json response parsing"
        except AttributeError:  # no json
            # print u"no hit with title {}".format(doc["dctitle"])
            # print u"normalized: {}".format(normalize(doc["dctitle"]))
            pass

    for p in products:
        if p.repo_urls["urls"]:
            p.repo_urls["urls"] = sorted(p.repo_urls["urls"], key=lambda x: base_url_sort_score(x))
        if p.license_string:
            p.license = oa_local.find_normalized_license(p.license_string)
        if p.best_title and (normalize(p.best_title) in BASE_RESULT_OVERRIDE):
            p.fulltext_url = BASE_RESULT_OVERRIDE[normalize(p.best_title)]

    print u"finished base step of set_fulltext_urls with {} titles in {}s".format(
        len(titles_to_products), elapsed(start_time, 2))

def scrape_for_fulltext_link(self):
    url = self.url
    is_journal = u"/doi/" in url or u"10." in url

    if DEBUG_SCRAPING:
        print u"in scrape_for_fulltext_link, getting URL: {}".format(url)

    if u"ncbi.nlm.nih.gov" in url:
        print u"not scraping {} because is on our do not scrape list.".format(url)
        if "ncbi.nlm.nih.gov/pmc/articles/PMC" in url:
            # pmc has fulltext
            self.scraped_open_metadata_url = url
            pmcid_matches = re.findall(".*(PMC\d+).*", url)
            if pmcid_matches:
                pmcid = pmcid_matches[0]
                self.scraped_pdf_url = u"https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf".format(pmcid)
        else:
            # is an nlm page but not a pmc page, so is not full text
            return

    try:
        with closing(http_get(url, stream=True, read_timeout=10, doi=self.doi)) as r:
            if is_response_too_large(r):
                print "landing page is too large, skipping"
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if resp_is_pdf_from_header(r):
                if DEBUG_SCRAPING:
                    print u"the head says this is a PDF. success! [{}]".format(url)
                self.scraped_pdf_url = url
                return
            else:
                if DEBUG_SCRAPING:
                    print u"head says not a PDF for {}. continuing more checks".format(url)

            # get the HTML tree
            page = r.content

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = find_pdf_link(page, url)
            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    print u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url)

                pdf_url = get_link_target(pdf_download_link, r.url)
                if is_journal:
                    # if they are linking to a PDF, we need to follow the link to make sure it's legit
                    if DEBUG_SCRAPING:
                        print u"this is a journal. checking to see the PDF link actually gets a PDF [{}]".format(url)
                    if gets_a_pdf(pdf_download_link, r.url, self.doi):
                        self.scraped_pdf_url = pdf_url
                        self.scraped_open_metadata_url = url
                        return
                else:
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = url
                    return

            # try this later because would rather get a pdf
            # if they are linking to a .docx or similar, this is open.
            # this only works for repos... a ".doc" in a journal is not the article. example:
            # = closed journal http://doi.org/10.1007/s10822-012-9571-0
            if not is_journal:
                doc_link = find_doc_download_link(page)
                if doc_link is not None:
                    if DEBUG_SCRAPING:
                        print u"found a .doc download link {} [{}]".format(
                            get_link_target(doc_link, r.url), url)
                    self.scraped_open_metadata_url = url
                    return

    except requests.exceptions.ConnectionError:
        print u"ERROR: connection error on {} in scrape_for_fulltext_link, skipping.".format(url)
        return
    except requests.Timeout:
        print u"ERROR: timeout error on {} in scrape_for_fulltext_link, skipping.".format(url)
        return
    except requests.exceptions.InvalidSchema:
        print u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link, skipping.".format(url)
        return
    except requests.exceptions.RequestException as e:
        print u"ERROR: RequestException error on {} in scrape_for_fulltext_link, skipping.".format(url)
        return

    if DEBUG_SCRAPING:
        print u"found no PDF download link. end of the line. [{}]".format(url)

    return self

def scrape_for_fulltext_link(self):
    landing_url = self.url

    if DEBUG_SCRAPING:
        logger.info(u"checking to see if {} says it is open".format(landing_url))

    start = time()
    try:
        self.r = http_get(landing_url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)
        resolved_landing_url = self.r.url

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # is unauthorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(
                    self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
            return

        # example 10.1007/978-3-642-01445-1
        if u"crossref.org/_deleted-doi/" in self.r.url:
            logger.info(u"this is a deleted doi")
            return

        # if our landing_url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(landing_url))
            self.scraped_pdf_url = landing_url
            self.open_version_source_string = "open (via free pdf)"
            # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"landing page is not a PDF for {}. continuing more checks".format(landing_url))

        # get the HTML tree
        page = self.r.content_small()

        # remove script tags
        try:
            soup = BeautifulSoup(page, 'html.parser')
            [script.extract() for script in soup('script')]
            page = str(soup)
        except HTMLParseError as e:
            logger.error(u'error parsing html, skipped script removal: {}'.format(e))

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = self.find_pdf_link(page)
        if pdf_download_link is not None:
            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via free pdf)"

        # now look and see if it is not just free, but open!
        says_open_url_snippet_patterns = [
            ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
            ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
        ]
        for (url_snippet, pattern) in says_open_url_snippet_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if url_snippet in resolved_landing_url.lower() and matches:
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"
                self.scraped_license = "implied-oa"

        says_open_access_patterns = [
            ("Informa UK Limited", u"/accessOA.png"),
            ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
            ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
            ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
            ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
        ]
        for (publisher, pattern) in says_open_access_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE | re.DOTALL)
            if self.is_same_publisher(publisher) and matches:
                self.scraped_license = "implied-oa"
                self.scraped_open_metadata_url = landing_url
                self.open_version_source_string = "open (via page says Open Access)"

        license_patterns = [
            ur"(creativecommons.org/licenses/[a-z\-]+)",
            u"distributed under the terms (.*) which permits",
            u"This is an open access article under the terms (.*) which permits",
            u"This is an open access article published under (.*) which permits",
            u'<div class="openAccess-articleHeaderContainer(.*?)</div>'
        ]
        for pattern in license_patterns:
            matches = re.findall(pattern, page, re.IGNORECASE)
            if matches:
                self.scraped_license = find_normalized_license(matches[0])
                self.scraped_open_metadata_url = self.url
                self.open_version_source_string = "open (via page says license)"

        if self.is_open:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return True
        else:
            if DEBUG_SCRAPING:
                logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                    elapsed(start), landing_url))
            return False

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.ChunkedEncodingError as e:
        # ChunkedEncodingError subclasses RequestException, so it must be caught first
        self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
        logger.info(self.error)
        return False
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(
            landing_url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return False
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return False
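# Unlike the other versions here, this one returns True/False rather than
# self, so a caller can branch on the result directly. A hypothetical usage
# sketch (the class name and constructor below are assumptions for
# illustration, not the real API):
webpage = PublisherWebpage(url=u"https://doi.org/10.1016/example")
if webpage.scrape_for_fulltext_link():
    logger.info(u"open: {} ({})".format(
        webpage.scraped_open_metadata_url, webpage.open_version_source_string))
else:
    logger.info(u"no open-access signals found; errors: {}".format(webpage.error))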
def scrape_for_fulltext_link(self):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because is on our do not scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        # now, before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # special exception for citeseer because we want the pdf link where
        # the copy is on the third-party repo, not the cached link, if we can get it
        if url and u"citeseerx.ist.psu.edu/" in url:
            matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
            if matches:
                pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")
        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")
        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because would rather get a pdf
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a .doc download link {} [{}]".format(
                    get_link_target(doc_link.href, self.r.url), url))
            self.scraped_open_metadata_url = url
            return

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.ChunkedEncodingError as e:
        # ChunkedEncodingError subclasses RequestException, so it must be caught first
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
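# DuckLink is used in the citeseer and osf special cases above but defined
# elsewhere. Its usage (constructed with an href and an anchor, then read via
# .href and .anchor) implies a duck-typed stand-in for the link objects that
# find_pdf_link returns; a minimal sketch consistent with that usage:
class DuckLink(object):
    def __init__(self, href, anchor):
        self.href = href      # absolute or relative link target
        self.anchor = anchor  # anchor text, e.g. "download"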
def scrape_for_fulltext_link(self, find_pdf_link=True):
    url = self.url

    dont_scrape_list = [
        u"ncbi.nlm.nih.gov",
        u"europepmc.org",
        u"/europepmc/",
        u"pubmed",
        u"elar.rsvpu.ru",  # these ones based on complaint in email
        u"elib.uraic.ru",
        u"elar.usfeu.ru",
        u"elar.urfu.ru",
        u"elar.uspu.ru"
    ]
    for url_fragment in dont_scrape_list:
        if url_fragment in url:
            logger.info(u"not scraping {} because is on our do not scrape list.".format(url))
            return

    try:
        self.r = http_get(url, stream=True, publisher=self.publisher,
                          session_id=self.session_id, ask_slowly=self.ask_slowly)
        resolved_url = self.r.url

        if self.r.status_code != 200:
            if self.r.status_code in [401]:
                # not authorized, so not open
                pass
            else:
                self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(
                    self.r.status_code, url)
            return

        # if our url redirects to a pdf, we're done.
        # = open repo http://hdl.handle.net/2060/20140010374
        if self.is_a_pdf_page():
            if DEBUG_SCRAPING:
                logger.info(u"this is a PDF. success! [{}]".format(url))
            self.scraped_pdf_url = url
            return
        else:
            if DEBUG_SCRAPING:
                logger.info(u"is not a PDF for {}. continuing more checks".format(url))

        if is_a_word_doc(self.r):
            if DEBUG_SCRAPING:
                logger.info(u"this is a word doc. success! [{}]".format(url))
            self.scraped_open_metadata_url = url
            return

        # now, before reading the content, bail if it's too large
        if is_response_too_large(self.r):
            logger.info(u"landing page is too large, skipping")
            return

        # get the HTML tree
        page = self.r.content_small()

        # remove script tags
        try:
            soup = BeautifulSoup(page, 'html.parser')
            [script.extract() for script in soup('script')]
            page = str(soup)
        except HTMLParseError as e:
            logger.error(u'error parsing html, skipped script removal: {}'.format(e))

        # set the license if we can find one
        scraped_license = find_normalized_license(page)
        if scraped_license:
            self.scraped_license = scraped_license

        pdf_download_link = None
        # special exception for citeseer because we want the pdf link where
        # the copy is on the third-party repo, not the cached link, if we can get it
        if url and u"citeseerx.ist.psu.edu/" in url:
            matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
            if matches:
                pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")
        # osf doesn't have their download link in their pages
        # so look at the page contents to see if it is osf-hosted
        # if so, compute the url. example: http://osf.io/tyhqm
        elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
            pdf_download_link = DuckLink(u"{}/download".format(url), "download")
        # otherwise look for it the normal way
        else:
            pdf_download_link = self.find_pdf_link(page)

        if pdf_download_link is not None:
            if DEBUG_SCRAPING:
                logger.info(u"found a PDF download link: {} {} [{}]".format(
                    pdf_download_link.href, pdf_download_link.anchor, url))

            pdf_url = get_link_target(pdf_download_link.href, self.r.url)
            # if they are linking to a PDF, we need to follow the link to make sure it's legit
            if DEBUG_SCRAPING:
                logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))
            if self.gets_a_pdf(pdf_download_link, self.r.url):
                self.scraped_pdf_url = pdf_url
                self.scraped_open_metadata_url = url
                return

        # try this later because would rather get a pdf
        # if they are linking to a .docx or similar, this is open.
        doc_link = find_doc_download_link(page)
        if doc_link is not None:
            absolute_doc_url = get_link_target(doc_link.href, resolved_url)
            if DEBUG_SCRAPING:
                logger.info(u"found a possible .doc download link [{}]".format(absolute_doc_url))
            if self.gets_a_word_doc(doc_link, self.r.url):
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this is a word doc. [{}]".format(absolute_doc_url))
                self.scraped_open_metadata_url = url
                return
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this ain't a word doc. [{}]".format(absolute_doc_url))

        bhl_link = find_bhl_view_link(resolved_url, page)
        if bhl_link is not None:
            logger.info('found a BHL document link: {}'.format(
                get_link_target(bhl_link.href, resolved_url)))
            self.scraped_open_metadata_url = url
            return

        if _trust_repo_license(resolved_url) and self.scraped_license:
            logger.info(u'trusting license {}'.format(self.scraped_license))
            self.scraped_open_metadata_url = self.url

    except requests.exceptions.ConnectionError as e:
        self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.Timeout as e:
        self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.InvalidSchema as e:
        self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.ChunkedEncodingError as e:
        # ChunkedEncodingError subclasses RequestException, so it must be caught first
        self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except requests.exceptions.RequestException as e:
        self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
        logger.info(self.error)
        return
    except NoDoiException as e:
        self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(
            url, unicode(e.message).encode("utf-8"))
        logger.info(self.error)
        return
    except Exception as e:
        self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
        logger.exception(self.error)
        return

    if DEBUG_SCRAPING:
        logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

    return self
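# is_a_word_doc and gets_a_word_doc are referenced above but defined
# elsewhere. A minimal sketch of the header-based check only, under stated
# assumptions: the real helpers may also sniff magic bytes from the body or
# check the URL extension.
def is_a_word_doc(resp):
    # treat the response as a Word document if the server's Content-Type says so
    doc_content_types = [
        "application/msword",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ]
    content_type = resp.headers.get("Content-Type", "").lower()
    return any(doc_type in content_type for doc_type in doc_content_types)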