import os
import pickle

# NOTE: the import paths below are assumed; adjust them to match the package layout.
import pub_resolve
from paper_info import PaperInfo


def get_paper_info(doi=None, url=None):
    # Resolve DOI or URL through PyPub pub_resolve methods
    publisher_base_url, full_url = pub_resolve.get_publisher_urls(doi=doi, url=url)
    pub_dict = pub_resolve.get_publisher_site_info(publisher_base_url)

    # Create a PaperInfo object to hold all information and call the appropriate scraper
    paper_info = PaperInfo(doi=doi, scraper_obj=pub_dict['object'], url=full_url)
    paper_info.populate_info()

    return paper_info
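# A minimal usage sketch for get_paper_info, assuming the DOI's prefix is
# already registered with pub_resolve. The DOI below is one of the sample
# articles used in the fixtures further down; PaperInfo.entry exposing its
# fields via __dict__ follows the pattern those fixtures rely on.
def _demo_get_paper_info():
    info = get_paper_info(doi='10.1007/s10237-015-0706-9')
    print(info.entry.__dict__)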
def science_direct(self):
    # Sample journal article
    sd_link = 'http://www.sciencedirect.com/science/article/pii/S0006899313013048'

    # Make a PaperInfo object from the live site information
    pi = PaperInfo(url=sd_link, scraper_obj='sciencedirect_selenium')
    pi.populate_info()
    pi.publisher_interface = None

    # Write saved version of the PaperInfo object
    file_path_name = os.path.join(self.dirname, 'sd_info.txt')
    with open(file_path_name, 'wb') as file:
        pickle.dump(pi, file)
def springer(self):
    # Sample journal article
    sp_link = 'http://link.springer.com/article/10.1007/s10237-015-0706-9'
    sp_doi = '10.1007/s10237-015-0706-9'

    # Make a PaperInfo object from the live site information
    pi = PaperInfo(url=sp_link, doi=sp_doi, scraper_obj='springer')
    pi.populate_info()
    pi.publisher_interface = None

    # Write saved version of the PaperInfo object
    file_path_name = os.path.join(self.dirname, 'sp_info.txt')
    with open(file_path_name, 'wb') as file:
        pickle.dump(pi, file)
def nature_nrg(self):
    # Sample journal article
    nrg_link = 'http://www.nature.com/nrg/journal/v15/n5/full/nrg3686.html'
    nrg_doi = '10.1038/nrg3686'

    # Make a PaperInfo object from the live site information
    pi = PaperInfo(url=nrg_link, doi=nrg_doi, scraper_obj='nature_nrg')
    pi.populate_info()
    pi.publisher_interface = None

    # Write saved version of the PaperInfo object
    file_path_name = os.path.join(self.dirname, 'nrg_info.txt')
    with open(file_path_name, 'wb') as file:
        pickle.dump(pi, file)
def doi_to_info(doi=None, url=None):
    """
    Gets entry and references information for an article DOI.

    Uses saved dicts matching DOI prefixes to publishers and web scrapers
    to retrieve information. Will fail if the DOI prefix hasn't been saved
    with a publisher link or if a scraper for the specific publisher site
    hasn't been built.

    Parameters
    ----------
    doi : str
        Unique ID assigned to a journal article.
    url : str
        The CrossRef URL to the article page, i.e. http://dx.doi.org/10.######

    Returns
    -------
    paper_info : PaperInfo
        Class containing parameters including the following:

        entry_dict : dict
            Contains information about the paper referenced by the DOI.
            Includes title, authors, affiliations, publish date, journal
            title, volume, pages, and keywords. Some values are other dicts
            (for example, the author info with affiliation values).
            Formatted to be JSON serializable.
        refs_dicts : list of dicts
            Each list item is a dict corresponding to an individual
            reference from the article's reference list. Includes title,
            authors, publishing date, journal title, volume, and pages
            (if listed), and any external URL links available (i.e. to
            where it is hosted on other sites, or PDF links).
        full_url : str
            URL to the journal article page on the publisher's website.
    """
    # Resolve DOI or URL through PyPub pub_resolve methods
    publisher_base_url, full_url = pub_resolve.get_publisher_urls(doi=doi, url=url)
    pub_dict = pub_resolve.get_publisher_site_info(publisher_base_url)

    # Create a PaperInfo object to hold all information and call the appropriate scraper
    paper_info = PaperInfo(doi=doi, scraper_obj=pub_dict['object'], url=full_url)
    paper_info.populate_info()

    return paper_info
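# A sketch of consuming doi_to_info's return value, following the docstring
# above: entry_dict and refs_dicts are assumed attributes of PaperInfo, the
# JSON dump relies on entry_dict being JSON serializable as documented, and
# the 'title' key follows the reference fields listed in the docstring.
def _demo_doi_to_info():
    import json
    paper_info = doi_to_info(doi='10.1038/nrg3686')
    print(json.dumps(paper_info.entry_dict, indent=2))
    for ref in paper_info.refs_dicts:
        print(ref.get('title'))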
def taylor_francis(self):
    # NOTE: The current version of the T&F scraper is for a deprecated
    # version of the site. All of the HTML tags need to be changed.

    # Sample journal article
    tf_link = 'http://www.tandfonline.com/doi/full/10.1080/21624054.2016.1184390'
    tf_doi = '10.1080/21624054.2016.1184390'

    # Make a PaperInfo object from the live site information
    pi = PaperInfo(url=tf_link, doi=tf_doi, scraper_obj='taylorfrancis')
    pi.populate_info()
    pi.publisher_interface = None

    # Write saved version of the PaperInfo object
    file_path_name = os.path.join(self.dirname, 'tf_info.txt')
    with open(file_path_name, 'wb') as file:
        pickle.dump(pi, file)
def wiley(self):
    # Sample journal article
    wy_link = 'http://onlinelibrary.wiley.com/doi/10.1002/biot.201400046/references'
    wy_doi = '10.1002/biot.201400046'

    # Make a PaperInfo object from the live site information.
    # pi.publisher_interface must be set to None before pickling;
    # otherwise the object cannot be saved.
    pi = PaperInfo(url=wy_link, doi=wy_doi, scraper_obj='wiley')
    pi.populate_info()
    pi.publisher_interface = None

    # Write saved version of the PaperInfo object
    file_path_name = os.path.join(self.dirname, 'wy_info.txt')
    with open(file_path_name, 'wb') as file:
        pickle.dump(pi, file)
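# Counterpart to the save methods above: a sketch of reloading a pickled
# PaperInfo fixture. The filename argument is hypothetical (e.g. 'wy_info.txt').
# Note that publisher_interface was nulled before pickling, so the loaded
# object is suitable for field comparisons only, not for re-scraping.
def load_saved_info(self, filename):
    file_path_name = os.path.join(self.dirname, filename)
    with open(file_path_name, 'rb') as file:
        return pickle.load(file)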
def __init__(self):
    self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
    self.link = 'http://www.sciencedirect.com/science/article/pii/S0006899313013048'
    # NOTE: this value is the article's PII, not a true DOI (DOIs begin with '10.')
    self.doi = 'S0006899313013048'

    # Make a PaperInfo object from the live site information
    try:
        pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='sciencedirect')
        pi.populate_info()
    except Exception:
        self.pi = None
        self.entry_dict = None
    else:
        self.pi = pi
        self.entry_dict = self.pi.entry.__dict__

    # Load saved version of the PaperInfo object
    # (sd_info.txt is the fixture written by science_direct above)
    saved_dir = os.path.join(self.curpath, 'saved_info')
    saved_file_path = os.path.join(saved_dir, 'sd_info.txt')
    with open(saved_file_path, 'rb') as file:
        self.saved_pi = pickle.load(file)

    # Make the saved version into a dict
    self.saved_entry_dict = self.saved_pi.entry.__dict__
def __init__(self):
    self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
    self.link = 'http://www.nature.com/nrg/journal/v15/n5/full/nrg3686.html'
    self.doi = '10.1038/nrg3686'

    # Make a PaperInfo object from the live site information
    try:
        pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='nature')
        pi.populate_info()
    except Exception:
        self.pi = None
        self.entry_dict = None
    else:
        self.pi = pi
        self.entry_dict = self.pi.entry.__dict__

    # Load saved version of the PaperInfo object
    # (nrg_info.txt is the fixture written by nature_nrg above)
    saved_dir = os.path.join(self.curpath, 'saved_info')
    saved_file_path = os.path.join(saved_dir, 'nrg_info.txt')
    with open(saved_file_path, 'rb') as file:
        self.saved_pi = pickle.load(file)

    # Make the saved version into a dict
    self.saved_entry_dict = self.saved_pi.entry.__dict__
def __init__(self):
    self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
    self.link = 'http://link.springer.com/article/10.1186/s12984-016-0150-9'
    self.doi = '10.1186/s12984-016-0150-9'

    # Make a PaperInfo object from the live site information
    try:
        pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='springer')
        pi.populate_info()
    except Exception:
        self.pi = None
        self.entry_dict = None
    else:
        self.pi = pi
        self.entry_dict = self.pi.entry.__dict__

    # Load saved version of the PaperInfo object
    # NOTE: the saved fixture (sp_info.txt) was generated from a different
    # Springer article (10.1007/s10237-015-0706-9) than the live link above.
    saved_dir = os.path.join(self.curpath, 'saved_info')
    saved_file_path = os.path.join(saved_dir, 'sp_info.txt')
    with open(saved_file_path, 'rb') as file:
        self.saved_pi = pickle.load(file)

    # Make the saved version into a dict
    self.saved_entry_dict = self.saved_pi.entry.__dict__
def __init__(self):
    self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
    self.link = 'http://onlinelibrary.wiley.com/doi/10.1002/biot.201400046/references'
    self.doi = '10.1002/biot.201400046'

    # Make a PaperInfo object from the live site information
    try:
        pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='wiley')
        pi.populate_info()
    except Exception:
        self.pi = None
        self.entry_dict = None
    else:
        self.pi = pi
        self.entry_dict = self.pi.entry.__dict__

    # Load saved version of the PaperInfo object
    saved_dir = os.path.join(self.curpath, 'saved_info')
    saved_file_path = os.path.join(saved_dir, 'wy_info.txt')
    with open(saved_file_path, 'rb') as file:
        self.saved_pi = pickle.load(file)

    # Make the saved version into a dict
    self.saved_entry_dict = self.saved_pi.entry.__dict__
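# The __init__ fixtures above pair a live scrape with a pickled snapshot; a
# test would then compare the two entry dicts. A minimal sketch of that check,
# assuming both dicts use the same keys (the helper name is hypothetical):
def entries_match(live_entry_dict, saved_entry_dict):
    if live_entry_dict is None:
        return False  # live scrape failed, so there is nothing to compare
    return all(live_entry_dict.get(key) == value
               for key, value in saved_entry_dict.items())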