Example #1
def get_paper_info(doi=None, url=None):
    # Resolve DOI or URL through PyPub pub_resolve methods
    publisher_base_url, full_url = pub_resolve.get_publisher_urls(doi=doi, url=url)
    pub_dict = pub_resolve.get_publisher_site_info(publisher_base_url)

    # Create a PaperInfo object to hold all information and call appropriate scraper
    paper_info = PaperInfo(doi=doi, scraper_obj=pub_dict['object'], url=full_url)
    paper_info.populate_info()

    return paper_info
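
A minimal usage sketch for this helper, assuming get_paper_info and its pub_resolve dependencies are importable from the surrounding package; the DOI is illustrative, taken from Example #8 below:

# Hedged usage sketch: the DOI is an arbitrary example from this page
info = get_paper_info(doi='10.1002/biot.201400046')
print(info.doi)    # the DOI passed in
print(info.entry)  # paper metadata populated by the scraper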
Example #2
    def science_direct(self):
        # Sample journal article
        sd_link = 'http://www.sciencedirect.com/science/article/pii/S0006899313013048'

        # Make a PaperInfo object from the live site information
        pi = PaperInfo(url=sd_link, scraper_obj='sciencedirect_selenium')
        pi.populate_info()
        pi.publisher_interface = None

        # Write saved version of the PaperInfo object
        file_path_name = os.path.join(self.dirname, 'sd_info.txt')
        with open(file_path_name, 'wb') as file:
            pickle.dump(pi, file)
Example #3
    def springer(self):
        # Sample journal article
        sp_link = 'http://link.springer.com/article/10.1007/s10237-015-0706-9'
        sp_doi = '10.1007/s10237-015-0706-9'

        # Make a PaperInfo object from the live site information
        pi = PaperInfo(url=sp_link, doi=sp_doi, scraper_obj='springer')
        pi.populate_info()
        pi.publisher_interface = None

        # Write saved version of the PaperInfo object
        file_path_name = os.path.join(self.dirname, 'sp_info.txt')
        with open(file_path_name, 'wb') as file:
            pickle.dump(pi, file)
Example #4
    def nature_nrg(self):
        # Sample journal article
        nrg_link = 'http://www.nature.com/nrg/journal/v15/n5/full/nrg3686.html'
        nrg_doi = '10.1038/nrg3686'

        # Make a PaperInfo object from the live site information
        pi = PaperInfo(url=nrg_link, doi=nrg_doi, scraper_obj='nature_nrg')
        pi.populate_info()
        pi.publisher_interface = None

        # Write saved version of the PaperInfo object
        file_path_name = os.path.join(self.dirname, 'nrg_info.txt')
        with open(file_path_name, 'wb') as file:
            pickle.dump(pi, file)
Example #5
    def _construct_object(self, json):
        if json is None:
            return None

        entry = models.ScopusEntry(json=json)

        # Get references from the API response
        ref_list = BibliographyRetrieval._refs_from_json(json=json)
        references = []
        if ref_list is not None:
            for ref_json in ref_list:
                references.append(models.ScopusRef(ref_json))

        paper_info = PaperInfo()
        #paper_info.entry = utils.convert_to_dict(entry)
        #paper_info.references = utils.refs_to_list(references)
        paper_info.entry = entry
        paper_info.references = references

        paper_info.doi = getattr(entry, 'doi', None)
        paper_info.pdf_link = None
        paper_info.publisher_interface = None
        paper_info.scraper_obj = None

        return paper_info
Example #6
def doi_to_info(doi=None, url=None):
    """
    Gets entry and references information for an article DOI.

    Uses saved dicts matching DOI prefixes to publishers and web scrapers
    to retrieve information. Will fail if the DOI prefix hasn't been saved
    with a publisher link, or if a scraper for the specific publisher
    site hasn't been built.

    Parameters
    ----------
    doi : str
        Unique ID assigned to a journal article.
    url : str
        The CrossRef URL to the article page,
        e.g. http://dx.doi.org/10.######

    Returns
    -------
    paper_info : PaperInfo
        Class containing parameters including the following:

        entry_dict : dict
            Contains information about the paper referenced by the DOI.
            Includes title, authors, affiliations, publication date, journal
            title, volume, pages, and keywords. Some values are themselves
            dicts (for example, the author info with affiliation values).
            Formatted to be JSON-serializable.

        refs_dicts : list of dicts
            Each list item is a dict corresponding to an individual reference
            from the article's reference list. Includes title, authors,
            publishing date, journal title, volume, and pages (if listed),
            and any external URL links available (e.g. to where it is hosted
            on other sites, or PDF links).

        full_url : str
            URL to the journal article page on publisher's website.

    """
    # Resolve DOI or URL through PyPub pub_resolve methods
    publisher_base_url, full_url = pub_resolve.get_publisher_urls(doi=doi, url=url)
    pub_dict = pub_resolve.get_publisher_site_info(publisher_base_url)

    # Create a PaperInfo object to hold all information and call appropriate scraper
    paper_info = PaperInfo(doi=doi, scraper_obj=pub_dict['object'], url=full_url)
    paper_info.populate_info()

    return paper_info
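
A hedged usage sketch of doi_to_info, following the docstring above; the attribute names (.entry, .references) match how PaperInfo is used in the other examples on this page:

# Sketch only: assumes pub_resolve knows this publisher's DOI prefix
paper_info = doi_to_info(doi='10.1007/s10237-015-0706-9')
entry = paper_info.entry            # paper metadata (title, authors, ...)
references = paper_info.references  # one item per reference in the article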
Example #7
    def taylor_francis(self):
        # NOTE: The current version of the T&F scraper is for a deprecated version
        # of the site. All of the HTML tags need to be changed.
        # Sample journal article
        tf_link = 'http://www.tandfonline.com/doi/full/10.1080/21624054.2016.1184390'
        tf_doi = '10.1080/21624054.2016.1184390'

        # Make a PaperInfo object from the live site information
        pi = PaperInfo(url=tf_link, doi=tf_doi, scraper_obj='taylorfrancis')
        pi.populate_info()
        pi.publisher_interface = None

        # Write saved version of the PaperInfo object
        file_path_name = os.path.join(self.dirname, 'tf_info.txt')
        with open(file_path_name, 'wb') as file:
            pickle.dump(pi, file)
Example #8
    def wiley(self):
        # Sample journal article
        wy_link = 'http://onlinelibrary.wiley.com/doi/10.1002/biot.201400046/references'
        wy_doi = '10.1002/biot.201400046'

        # Make a PaperInfo object from the live site information.
        # pi.publisher_interface needs to be set to None or else
        # the object cannot be pickled.
        pi = PaperInfo(url=wy_link, doi=wy_doi, scraper_obj='wiley')
        pi.populate_info()
        pi.publisher_interface = None

        # Write saved version of the PaperInfo object
        file_path_name = os.path.join(self.dirname, 'wy_info.txt')
        with open(file_path_name, 'wb') as file:
            pickle.dump(pi, file)
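
The comment in this example explains why publisher_interface is cleared before saving. Below is a small round-trip sketch of the same pattern; the helper names are hypothetical, and the assumption (from the comment above) is that publisher_interface holds a live, unpicklable object:

import pickle

def save_paper_info(pi, path):
    # Clear the live interface first; pickling fails on it otherwise
    pi.publisher_interface = None
    with open(path, 'wb') as file:
        pickle.dump(pi, file)

def load_paper_info(path):
    with open(path, 'rb') as file:
        return pickle.load(file)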
Example #9
    def _construct_object(self, json):
        if json is None:
            return None

        entry = models.ScopusEntry(json=json)

        # Get references from the API response
        ref_list = BibliographyRetrieval._refs_from_json(json=json)
        references = []
        if ref_list is not None:
            for ref_json in ref_list:
                references.append(models.ScopusRef(ref_json))

        paper_info = PaperInfo()
        #paper_info.entry = utils.convert_to_dict(entry)
        #paper_info.references = utils.refs_to_list(references)
        paper_info.entry = entry
        paper_info.references = references

        paper_info.doi = getattr(entry, 'doi', None)
        paper_info.pdf_link = None
        paper_info.publisher_interface = None
        paper_info.scraper_obj = None

        return paper_info
Example #10
    def __init__(self):
        self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
        self.link = 'http://www.sciencedirect.com/science/article/pii/S0006899313013048'
        # NOTE: this identifier is the Elsevier PII from the URL, not a true DOI
        self.doi = 'S0006899313013048'

        # Make a PaperInfo object from the live site information
        try:
            pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='sciencedirect')
            pi.populate_info()
        except Exception:
            self.pi = None
            self.entry_dict = None
        else:
            self.pi = pi
            self.entry_dict = self.pi.entry.__dict__

        # Load saved version of the PaperInfo object
        saved_dir = os.path.join(self.curpath, 'saved_info')
        saved_file_path = os.path.join(saved_dir, 'sd_info.txt')
        with open(saved_file_path, 'rb') as file:
            self.saved_pi = pickle.load(file)

        # Make the saved versions into dicts
        self.saved_entry_dict = self.saved_pi.entry.__dict__
Example #11
    def __init__(self):
        self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
        self.link = 'http://www.nature.com/nrg/journal/v15/n5/full/nrg3686.html'
        self.doi = '10.1038/nrg3686'

        # Make a PaperInfo object from the live site information
        try:
            pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='nature')
            pi.populate_info()
        except Exception:
            self.pi = None
            self.entry_dict = None
        else:
            self.pi = pi
            self.entry_dict = self.pi.entry.__dict__

        # Load saved version of the PaperInfo object
        saved_dir = os.path.join(self.curpath, 'saved_info')
        saved_file_path = os.path.join(saved_dir, 'nrg_info.txt')
        with open(saved_file_path, 'rb') as file:
            self.saved_pi = pickle.load(file)

        # Make the saved versions into dicts
        self.saved_entry_dict = self.saved_pi.entry.__dict__
Example #12
    def __init__(self):
        self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
        self.link = 'http://link.springer.com/article/10.1186/s12984-016-0150-9'
        self.doi = '10.1186/s12984-016-0150-9'

        # Make a PaperInfo object from the live site information
        try:
            pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='springer')
            pi.populate_info()
        except Exception:
            self.pi = None
            self.entry_dict = None
        else:
            self.pi = pi
            self.entry_dict = self.pi.entry.__dict__

        # Load saved version of the PaperInfo object
        saved_dir = os.path.join(self.curpath, 'saved_info')
        saved_file_path = os.path.join(saved_dir, 'sp_info.txt')
        with open(saved_file_path, 'rb') as file:
            self.saved_pi = pickle.load(file)

        # Make the saved versions into dicts
        self.saved_entry_dict = self.saved_pi.entry.__dict__
Example #13
    def __init__(self):
        self.curpath = str(os.path.dirname(os.path.abspath(__file__)))
        self.link = 'http://onlinelibrary.wiley.com/doi/10.1002/biot.201400046/references'
        self.doi = '10.1002/biot.201400046'

        # Make a PaperInfo object from the live site information
        try:
            pi = PaperInfo(url=self.link, doi=self.doi, scraper_obj='wiley')
            pi.populate_info()
        except Exception:
            self.pi = None
            self.entry_dict = None
        else:
            self.pi = pi
            self.entry_dict = self.pi.entry.__dict__

        # Load saved version of the PaperInfo object
        saved_dir = os.path.join(self.curpath, 'saved_info')
        saved_file_path = os.path.join(saved_dir, 'wy_info.txt')
        with open(saved_file_path, 'rb') as file:
            self.saved_pi = pickle.load(file)

        # Make the saved versions into dicts
        self.saved_entry_dict = self.saved_pi.entry.__dict__
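
Examples #10 through #13 each pair a live PaperInfo with a pickled fixture. A hedged sketch of the comparison a test built on one of these fixtures might run; the fixture class name is hypothetical:

# Hypothetical fixture class wrapping the __init__ above
fixture = WileyFixture()
if fixture.entry_dict is not None:
    # Compare the live scrape against the saved snapshot
    assert fixture.entry_dict.keys() == fixture.saved_entry_dict.keys()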
Example #14
def resolve_doi(doi):
    """
    Gets the paper and references information from an article DOI.

    Parameters
    ----------
    doi : str
        DOI = digital object identifier.
        Unique ID assigned to a journal article.
        Example: 10.1002/biot.201400046

    Returns
    -------
    paper_info : PaperInfo
        See resolve_citation for description.

    """

    # Same steps as in resolve_citation
    saved_info = get_saved_info(doi)
    if saved_info is not None:
        saved_paper_info = PaperInfo()
        for k, v in saved_info.items():
            setattr(saved_paper_info, k, v)
        saved_paper_info.make_interface_object()
        return saved_paper_info



    paper_info = doi_to_info(doi)
    idnum = assign_id()
    paper_info.idnum = idnum

    log_info(paper_info)

    return paper_info
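
A brief usage sketch for resolve_doi, assuming get_saved_info, assign_id, and log_info exist in the same module (they are referenced but not shown here); the DOI comes from Example #4:

paper_info = resolve_doi('10.1038/nrg3686')
print(paper_info.doi)
# idnum is only assigned on the non-cached path
print(getattr(paper_info, 'idnum', None))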
Example #15
    def get_paper_info(self):
        paper_info = PaperInfo()
        paper_info.entry = self.get_entry_info()
        return paper_info
Example #16
def resolve_citation(citation):
    """
    Gets the paper and references information from
    a plaintext citation.

    Uses a CrossRef.org search to retrieve the paper's DOI.

    Parameters
    ----------
    citation : str
        Full journal article citation.
        Example: Senís, Elena, et al. "CRISPR/Cas9‐mediated genome
                engineering: An adeno‐associated viral (AAV) vector
                toolbox." Biotechnology journal 9.11 (2014): 1402-1412.

    Returns
    -------
    paper_info : PaperInfo
        Class containing relevant paper meta-information and
        references list.
        Information about the paper itself is in the 'entry' value, a dict
        (with str and dict values). The references list is in the 'references'
        value and is a list of dicts (each with str and dict values).
        Call .__dict__ on the object to get a JSON-serializable dict.

    """
    # Encode raw citation
    citation = urllib_quote(citation)

    # Search for citation on CrossRef.org to try to get a DOI link
    api_search_url = 'http://search.labs.crossref.org/dois?q=' + citation
    response = requests.get(api_search_url).json()
    doi = response[0]['doi']
    print(doi)

    # If crossref returns a http://dx.doi.org/ link, retrieve the doi from it
    # and save the URL to pass to doi_to_info
    url = None
    if doi[0:18] == 'http://dx.doi.org/':
        url = doi
        doi = doi[18:]

    # Check if this DOI has been searched and saved before.
    # If it has, return saved information
    saved_info = get_saved_info(doi)
    if saved_info is not None:
        saved_paper_info = PaperInfo()
        for k, v in saved_info.items():
            setattr(saved_paper_info, k, v)
        saved_paper_info.make_interface_object()
        return saved_paper_info

    paper_info = doi_to_info(doi=doi, url=url)

    idnum = assign_id()
    paper_info.idnum = idnum

    log_info(paper_info)

    return paper_info
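
A usage sketch mirroring the docstring's example citation; live network access to the CrossRef search API is assumed:

citation = ('Senís, Elena, et al. "CRISPR/Cas9-mediated genome engineering: '
            'An adeno-associated viral (AAV) vector toolbox." '
            'Biotechnology journal 9.11 (2014): 1402-1412.')
paper_info = resolve_citation(citation)
print(paper_info.doi)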