def get_paper_info(doi=None, url=None):
    """Resolve a DOI or article URL into a populated PaperInfo object.

    Uses the PyPub ``pub_resolve`` helpers to find the publisher site,
    then runs the publisher-specific scraper via PaperInfo.

    Parameters
    ----------
    doi : str, optional
        DOI of the article to look up.
    url : str, optional
        Direct URL of the article page.

    Returns
    -------
    PaperInfo
        Object populated with entry and reference information.
    """
    # Resolve DOI or URL through PyPub pub_resolve methods
    base_url, article_url = pub_resolve.get_publisher_urls(doi=doi, url=url)
    site_info = pub_resolve.get_publisher_site_info(base_url)

    # Hold everything in a PaperInfo object and invoke the matching scraper
    info = PaperInfo(doi=doi, scraper_obj=site_info['object'], url=article_url)
    info.populate_info()

    return info
def science_direct(self):
    """Scrape a sample ScienceDirect article and pickle the PaperInfo."""
    # Sample journal article
    article_url = 'http://www.sciencedirect.com/science/article/pii/S0006899313013048'

    # Build a PaperInfo object from the live site information
    info = PaperInfo(url=article_url, scraper_obj='sciencedirect_selenium')
    info.populate_info()
    # The publisher interface is not picklable, so drop it before saving
    info.publisher_interface = None

    # Persist the PaperInfo object for offline comparison in tests
    out_path = os.path.join(self.dirname, 'sd_info.txt')
    with open(out_path, 'wb') as out_file:
        pickle.dump(info, out_file)
def springer(self):
    """Scrape a sample Springer article and pickle the PaperInfo."""
    # Sample journal article
    article_url = 'http://link.springer.com/article/10.1007/s10237-015-0706-9'
    article_doi = '10.1007/s10237-015-0706-9'

    # Build a PaperInfo object from the live site information
    info = PaperInfo(url=article_url, doi=article_doi, scraper_obj='springer')
    info.populate_info()
    # The publisher interface is not picklable, so drop it before saving
    info.publisher_interface = None

    # Persist the PaperInfo object for offline comparison in tests
    out_path = os.path.join(self.dirname, 'sp_info.txt')
    with open(out_path, 'wb') as out_file:
        pickle.dump(info, out_file)
def nature_nrg(self):
    """Scrape a sample Nature Reviews Genetics article and pickle the PaperInfo."""
    # Sample journal article
    article_url = 'http://www.nature.com/nrg/journal/v15/n5/full/nrg3686.html'
    article_doi = '10.1038/nrg3686'

    # Build a PaperInfo object from the live site information
    info = PaperInfo(url=article_url, doi=article_doi, scraper_obj='nature_nrg')
    info.populate_info()
    # The publisher interface is not picklable, so drop it before saving
    info.publisher_interface = None

    # Persist the PaperInfo object for offline comparison in tests
    out_path = os.path.join(self.dirname, 'nrg_info.txt')
    with open(out_path, 'wb') as out_file:
        pickle.dump(info, out_file)
def _construct_object(self, json):
    """Build a PaperInfo object from a Scopus API JSON response.

    Parameters
    ----------
    json : dict or None
        Raw JSON response from the Scopus retrieval API. (The parameter
        name shadows the stdlib ``json`` module but is kept for
        backward compatibility with existing keyword callers.)

    Returns
    -------
    PaperInfo or None
        ``None`` when no response is supplied; otherwise a PaperInfo
        holding the entry and its reference list.
    """
    if json is None:
        return None

    entry = models.ScopusEntry(json=json)

    # Get references from the API response; the list may be absent.
    ref_list = BibliographyRetrieval._refs_from_json(json=json)
    references = ([models.ScopusRef(ref_json) for ref_json in ref_list]
                  if ref_list is not None else [])

    paper_info = PaperInfo()
    paper_info.entry = entry
    paper_info.references = references
    paper_info.doi = getattr(entry, 'doi', None)
    # Scopus responses carry no scraper/publisher info, so null these out.
    paper_info.pdf_link = None
    paper_info.publisher_interface = None
    paper_info.scraper_obj = None
    return paper_info
def doi_to_info(doi=None, url=None):
    """
    Gets entry and references information for an article DOI.

    Uses saved dicts matching DOI prefixes to publishers and web scrapers
    to retrieve information. Will fail if DOI prefix hasn't been saved
    with a publisher link or if a scraper for a specific publisher site
    hasn't been built.

    Parameters
    ----------
    doi : str
        Unique ID assigned to a journal article.
    url : str
        The CrossRef URL to the article page.
        I.e. http://dx.doi.org/10.######

    Returns
    -------
    paper_info : PaperInfo
        Class containing parameters including the following:

        entry_dict : dict
            Contains information about the paper referenced by the DOI.
            Includes title, authors, affiliations, publish date, journal
            title, volume, and pages, and keywords. Some values are other
            dicts (for example, the author info with affiliation values).
            Formatted to be JSON serializable.
        refs_dicts : list of dicts
            Each list item is a dict corresponding to an individual
            reference from the article's reference list. Includes title,
            authors, publishing date, journal title, volume, and pages
            (if listed), and any external URL links available (i.e. to
            where it is hosted on other sites, or pdf links).
        full_url : str
            URL to the journal article page on publisher's website.
    """
    # Resolve DOI or URL through PyPub pub_resolve methods
    base_url, article_url = pub_resolve.get_publisher_urls(doi=doi, url=url)
    site_info = pub_resolve.get_publisher_site_info(base_url)

    # Hold everything in a PaperInfo object and invoke the matching scraper
    info = PaperInfo(doi=doi, scraper_obj=site_info['object'], url=article_url)
    info.populate_info()

    return info
def taylor_francis(self):
    """Scrape a sample Taylor & Francis article and pickle the PaperInfo."""
    # NOTE: The current version of the T&F scraper is for a deprecated
    # version of the site. All of the HTML tags need to be changed.

    # Sample journal article
    article_url = 'http://www.tandfonline.com/doi/full/10.1080/21624054.2016.1184390'
    article_doi = '10.1080/21624054.2016.1184390'

    # Build a PaperInfo object from the live site information
    info = PaperInfo(url=article_url, doi=article_doi, scraper_obj='taylorfrancis')
    info.populate_info()
    # The publisher interface is not picklable, so drop it before saving
    info.publisher_interface = None

    # Persist the PaperInfo object for offline comparison in tests
    out_path = os.path.join(self.dirname, 'tf_info.txt')
    with open(out_path, 'wb') as out_file:
        pickle.dump(info, out_file)
def wiley(self):
    """Scrape a sample Wiley article and pickle the PaperInfo."""
    # Sample journal article
    article_url = 'http://onlinelibrary.wiley.com/doi/10.1002/biot.201400046/references'
    article_doi = '10.1002/biot.201400046'

    # Build a PaperInfo object from the live site information.
    # publisher_interface must be cleared afterwards or the object
    # cannot be pickled.
    info = PaperInfo(url=article_url, doi=article_doi, scraper_obj='wiley')
    info.populate_info()
    info.publisher_interface = None

    # Persist the PaperInfo object for offline comparison in tests
    out_path = os.path.join(self.dirname, 'wy_info.txt')
    with open(out_path, 'wb') as out_file:
        pickle.dump(info, out_file)
def __init__(self):
    """Fetch a live ScienceDirect article and load its pickled counterpart.

    On scraper/network failure the live attributes (``pi``,
    ``entry_dict``) are left as None so the saved fixture can still be
    inspected.
    """
    self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
    self.link = 'http://www.sciencedirect.com/science/article/pii/S0006899313013048'
    # NOTE(review): this value is a PII, not a DOI ('10.xxxx/...') —
    # confirm it is intentional.
    self.doi = 'S0006899313013048'

    # Make a PaperInfo object from the live site information
    try:
        pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='sciencedirect')
        pi.populate_info()
    except Exception:
        self.pi = None
        self.entry_dict = None
    else:
        self.pi = pi
        self.entry_dict = self.pi.entry.__dict__

    # Load saved version of the PaperInfo object.
    # BUG FIX: previously loaded 'sp_info.txt' (the Springer fixture);
    # the ScienceDirect fixture is written as 'sd_info.txt'.
    # Also use a context manager so the file handle is closed.
    saved_dir = os.path.join(self.curpath, 'saved_info')
    saved_file_path = os.path.join(saved_dir, 'sd_info.txt')
    with open(saved_file_path, 'rb') as saved_file:
        self.saved_pi = pickle.load(saved_file)

    # Make the saved version into a dict
    self.saved_entry_dict = self.saved_pi.entry.__dict__
def __init__(self):
    """Fetch a live Nature (NRG) article and load its pickled counterpart.

    On scraper/network failure the live attributes (``pi``,
    ``entry_dict``) are left as None so the saved fixture can still be
    inspected.
    """
    self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
    self.link = 'http://www.nature.com/nrg/journal/v15/n5/full/nrg3686.html'
    self.doi = '10.1038/nrg3686'

    # Make a PaperInfo object from the live site information
    # NOTE(review): scraper_obj here is 'nature' but the fixture writer
    # uses 'nature_nrg' — confirm which key the scraper registry expects.
    try:
        pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='nature')
        pi.populate_info()
    except Exception:
        self.pi = None
        self.entry_dict = None
    else:
        self.pi = pi
        self.entry_dict = self.pi.entry.__dict__

    # Load saved version of the PaperInfo object.
    # BUG FIX: previously loaded 'sp_info.txt' (the Springer fixture);
    # the Nature NRG fixture is written as 'nrg_info.txt'.
    # Also use a context manager so the file handle is closed.
    saved_dir = os.path.join(self.curpath, 'saved_info')
    saved_file_path = os.path.join(saved_dir, 'nrg_info.txt')
    with open(saved_file_path, 'rb') as saved_file:
        self.saved_pi = pickle.load(saved_file)

    # Make the saved version into a dict
    self.saved_entry_dict = self.saved_pi.entry.__dict__
def __init__(self):
    """Fetch a live Springer article and load its pickled counterpart.

    On scraper/network failure the live attributes (``pi``,
    ``entry_dict``) are left as None so the saved fixture can still be
    inspected.
    """
    self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
    self.link = 'http://link.springer.com/article/10.1186/s12984-016-0150-9'
    self.doi = '10.1186/s12984-016-0150-9'

    # Make a PaperInfo object from the live site information.
    # BUG FIX: scraper_obj was 'Springer' (capitalized) while the fixture
    # writer and the rest of the codebase use lowercase 'springer';
    # a case-sensitive scraper lookup would fail on the old value.
    try:
        pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='springer')
        pi.populate_info()
    except Exception:
        self.pi = None
        self.entry_dict = None
    else:
        self.pi = pi
        self.entry_dict = self.pi.entry.__dict__

    # Load saved version of the PaperInfo object; use a context manager
    # so the file handle is closed.
    saved_dir = os.path.join(self.curpath, 'saved_info')
    saved_file_path = os.path.join(saved_dir, 'sp_info.txt')
    with open(saved_file_path, 'rb') as saved_file:
        self.saved_pi = pickle.load(saved_file)

    # Make the saved version into a dict
    self.saved_entry_dict = self.saved_pi.entry.__dict__
def __init__(self):
    """Fetch a live Wiley article and load its pickled counterpart.

    On scraper/network failure the live attributes (``pi``,
    ``entry_dict``) are left as None so the saved fixture can still be
    inspected.
    """
    self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
    self.link = 'http://onlinelibrary.wiley.com/doi/10.1002/biot.201400046/references'
    self.doi = '10.1002/biot.201400046'

    # Make a PaperInfo object from the live site information
    try:
        pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='wiley')
        pi.populate_info()
    except Exception:
        self.pi = None
        self.entry_dict = None
    else:
        self.pi = pi
        self.entry_dict = self.pi.entry.__dict__

    # Load saved version of the PaperInfo object.
    # FIX: use a context manager instead of a bare pickle.load(open(...))
    # so the file handle is always closed.
    saved_dir = os.path.join(self.curpath, 'saved_info')
    saved_file_path = os.path.join(saved_dir, 'wy_info.txt')
    with open(saved_file_path, 'rb') as saved_file:
        self.saved_pi = pickle.load(saved_file)

    # Make the saved version into a dict
    self.saved_entry_dict = self.saved_pi.entry.__dict__
def resolve_doi(doi):
    """
    Gets the paper and references information from an article DOI.

    Parameters
    ----------
    doi : str
        DOI = digital object identifier.
        Unique ID assigned to a journal article.
        Example: 10.1002/biot.201400046

    Returns
    -------
    paper_info : PaperInfo
        See resolve_citation for description.
    """
    # Same steps as in resolve_citation: return cached info if this DOI
    # has been resolved and saved before.
    # (FIX: removed an unused `doi_prefix = doi[0:7]` computation.)
    saved_info = get_saved_info(doi)
    if saved_info is not None:
        saved_paper_info = PaperInfo()
        for k, v in saved_info.items():
            setattr(saved_paper_info, k, v)
        saved_paper_info.make_interface_object()
        return saved_paper_info

    # Not cached: scrape fresh info, tag it with a new ID, and log it.
    paper_info = doi_to_info(doi)
    paper_info.idnum = assign_id()
    log_info(paper_info)
    return paper_info
def get_paper_info(self):
    """Assemble a PaperInfo populated with this object's entry data.

    Returns
    -------
    PaperInfo
        Object with its ``entry`` attribute filled from
        ``self.get_entry_info()``.
    """
    paper_info = PaperInfo()
    paper_info.entry = self.get_entry_info()
    # BUG FIX: the object was built but never returned, so callers
    # always received None.
    # NOTE(review): references are not populated here — confirm whether
    # a references lookup is also intended.
    return paper_info
def resolve_citation(citation):
    """
    Gets the paper and references information from a plaintext citation.

    Uses a search to CrossRef.org to retrieve the paper DOI.

    Parameters
    ----------
    citation : str
        Full journal article citation.
        Example: Senís, Elena, et al. "CRISPR/Cas9‐mediated genome
        engineering: An adeno‐associated viral (AAV) vector toolbox.
        Biotechnology journal 9.11 (2014): 1402-1412.

    Returns
    -------
    paper_info : PaperInfo
        Class containing relevant paper meta-information and references
        list. Information about the paper itself is in 'entry' value, and
        is a dict (with str and dict values). References list is in
        'references' value and is a list of dicts (each with str and dict
        values). Must call .__dict__ to be JSON-serializable.
    """
    # Encode raw citation for use as a URL query parameter
    citation = urllib_quote(citation)

    # Search for citation on CrossRef.org to try to get a DOI link.
    # NOTE(review): an empty result list raises IndexError here —
    # confirm whether a friendlier error is wanted.
    api_search_url = 'http://search.labs.crossref.org/dois?q=' + citation
    response = requests.get(api_search_url).json()
    doi = response[0]['doi']
    # (FIX: removed a leftover debug print of the DOI.)

    # If crossref returns a http://dx.doi.org/ link, retrieve the doi from
    # it and save the URL to pass to doi_to_info
    url = None
    if doi.startswith('http://dx.doi.org/'):
        url = doi
        doi = doi[18:]

    # Check if this DOI has been searched and saved before.
    # If it has, return saved information.
    saved_info = get_saved_info(doi)
    if saved_info is not None:
        saved_paper_info = PaperInfo()
        for k, v in saved_info.items():
            setattr(saved_paper_info, k, v)
        saved_paper_info.make_interface_object()
        return saved_paper_info

    # BUG FIX: doi_to_info takes (doi, url); the old call passed a third
    # positional argument (doi_prefix), which would raise TypeError.
    paper_info = doi_to_info(doi=doi, url=url)
    paper_info.idnum = assign_id()
    log_info(paper_info)
    return paper_info