示例#1
0
def get_article_xml(article_file, tag_path_elements=None):
    """
    Read a local article XML file and return the nodes at an xpath location.

    If the file cannot be opened as given (``OSError``), one alternative path is
    derived and parsed instead:

    * a value accepted by ``validate_doi`` is converted with ``doi_to_path``;
    * a trailing ``xml`` is swapped for ``XML`` (and vice versa);
    * otherwise an ``.xml`` extension is appended (only ``xml`` when the name
      already ends with a dot).

    Defaults to reading the tree location for uncorrected proofs/versions of record.

    :param article_file: the xml file for a single article; a DOI is also accepted
    :param tag_path_elements: xpath location in the XML tree of the article file,
        given as a sequence of path elements that are joined with '/'
    :return: content of article file at that xpath location
    :raises OSError: presumably, when the alternative path cannot be opened either;
        the second parse attempt is not guarded, so its error propagates
    """
    if tag_path_elements is None:
        tag_path_elements = ('/', 'article', 'front', 'article-meta',
                             'custom-meta-group', 'custom-meta', 'meta-value')

    try:
        article_tree = et.parse(article_file)
    except OSError:
        if validate_doi(article_file):
            article_file = doi_to_path(article_file)
        elif article_file.endswith('xml'):
            # Also covers names ending in 'nxml', since those end in 'xml' too,
            # so no separate 'nxml' case is needed.
            article_file = article_file[:-3] + 'XML'
        elif article_file.endswith('XML'):
            article_file = article_file[:-3] + 'xml'
        elif not article_file.endswith('.'):
            article_file = article_file + '.xml'
        else:
            article_file = article_file + 'xml'
        # Single retry with the derived path; failures here are not caught.
        article_tree = et.parse(article_file)
    article_root = article_tree.getroot()
    tag_location = '/'.join(tag_path_elements)
    return article_root.xpath(tag_location)
示例#2
0
 def doi(self, d):
     """
     Store a new DOI on the article after checking its format.

     The value is checked with ``validate_doi`` first; when that check returns
     False an ``Exception`` is raised and nothing is changed. Otherwise
     ``reset_memoized_attrs()`` is called and the DOI is saved on ``self._doi``.
     """
     doi_is_valid = validate_doi(d)
     if doi_is_valid is False:
         raise Exception("Invalid format for PLOS DOI")
     # Reset memoized attributes before the new DOI is stored.
     self.reset_memoized_attrs()
     self._doi = d
示例#3
0
def validate_corpus(corpusdir=corpusdir):
    """
    Validate the DOIs returned by ``get_all_plos_dois()``, the URLs derived from
    them, and the local article files, in terms of regular expressions.

    Checks run in order (DOIs, URLs, filenames, file existence). The function
    stops at the first failing check, prints what failed and returns False.

    :param corpusdir: directory passed to ``listdir_nohidden`` to list the local
        article files
    :return: boolean of whether corpus passed validity checks
    """
    # check DOIs
    plos_dois = get_all_plos_dois()
    plos_valid_dois = [doi for doi in plos_dois if validate_doi(doi)]
    if set(plos_dois) != set(plos_valid_dois):
        print("Invalid DOIs: {}".format(set(plos_dois) - set(plos_valid_dois)))
        return False

    # check urls
    plos_urls = [doi_to_url(doi) for doi in plos_valid_dois]
    plos_valid_urls = [url for url in plos_urls if validate_url(url)]
    if (set(plos_urls) != set(plos_valid_urls)
            or len(plos_valid_urls) != len(plos_valid_dois)):
        print("Invalid URLs: {}".format(set(plos_urls) - set(plos_valid_urls)))
        return False

    # check files and filenames
    plos_files = listdir_nohidden(corpusdir)
    if not plos_files:
        print(
            "Corpus directory empty. Re-download by running create_local_plos_corpus()"
        )
        return False

    # Partition the listing in a single pass so the offending names can be reported.
    plos_valid_filenames = []
    plos_invalid_filenames = set()
    for article in plos_files:
        if validate_file(article):
            plos_valid_filenames.append(article)
        else:
            plos_invalid_filenames.add(article)

    if len(plos_valid_dois) != len(plos_valid_filenames):
        if plos_invalid_filenames:
            # Report the filenames that failed validation. Subtracting filenames
            # from DOIs would compare unrelated strings and list every DOI.
            print("Invalid filenames: {}".format(plos_invalid_filenames))
        else:
            print("Number of valid filenames ({}) does not match number of valid DOIs ({})".format(
                len(plos_valid_filenames), len(plos_valid_dois)))
        return False

    # Every validly named entry must also exist as a regular file.
    invalid_files = {
        article for article in plos_valid_filenames
        if not os.path.isfile(article)
    }
    if not invalid_files:
        return True

    if len(invalid_files) > max_invalid_files_to_print:
        print("Too many invalid files to print: {}".format(
            len(invalid_files)))
    else:
        print("Invalid files: {}".format(invalid_files))
    return False
示例#4
0
def doi_to_path(doi, directory=corpusdir):
    """
    For a given PLOS DOI, return the path to that local article file.

    DOIs starting with ``annotation_doi`` map to
    ``plos.correction.<last DOI segment><suffix_lower>``; all other valid DOIs map
    to the DOI with the leading ``prefix`` removed plus ``suffix_lower``. The
    filename is joined onto ``directory``.
    Uses ``validate_doi`` to make sure it's a DOI and not a file; an input that
    validates as a filename instead is returned unchanged.

    Example (assuming ``directory`` is 'allofplos_xml' and the usual PLOS prefix/suffix):
    doi_to_path('10.1371/journal.pone.1000001') = 'allofplos_xml/journal.pone.1000001.xml'

    :param doi: full unique identifier for a PLOS article
    :param directory: defaults to corpusdir, containing article files
    :return: path to local XML file
    :raises ValueError: if ``doi`` is neither a valid DOI nor a valid filename
    """
    if validate_doi(doi):
        if doi.startswith(annotation_doi):
            filename = "plos.correction." + doi.split('/')[-1] + suffix_lower
        else:
            # Remove the prefix by slicing: str.lstrip() strips a *set of
            # characters*, not a leading substring, and could eat into the
            # article identifier.
            stem = doi[len(prefix):] if doi.startswith(prefix) else doi
            filename = stem + suffix_lower
        return os.path.join(directory, filename)
    # NOTE: The following check is weird, a DOI should never validate as a file name.
    if validate_filename(doi):
        return doi
    # Fail with an explicit error rather than an unbound local on return.
    raise ValueError("Not a valid PLOS DOI or article filename: {!r}".format(doi))
示例#5
0
def check_if_doi_resolves(doi, plos_valid=True):
    """
    Check whether a DOI resolves through dx.doi.org to metadata for the same DOI.

    :param doi: the DOI to look up
    :param plos_valid: when true, first require ``validate_doi(doi)`` to pass
    :return: "Not valid PLOS DOI structure" if the structure check fails,
        "doesn't work" if ``check_if_link_works`` rejects the dx.doi.org URL,
        'works' if the 'DOI' field of the returned JSON equals ``doi``,
        otherwise that 'DOI' field itself
    """
    if plos_valid and validate_doi(doi) is False:
        return "Not valid PLOS DOI structure"

    url = "http://dx.doi.org/" + doi
    if not check_if_link_works(url):
        return "doesn't work"

    # Ask the resolver for citation metadata as JSON and compare its DOI field.
    response = requests.get(
        url, headers={"accept": "application/vnd.citationstyles.csl+json"})
    resolved_doi = response.json()['DOI']
    return 'works' if resolved_doi == doi else resolved_doi
示例#6
0
def filename_to_doi(filename):
    """
    For a local XML file in the corpusdir directory, transform it to the article's DOI.

    Includes transform for the 'annotation' DOIs: a filename containing
    ``correction`` maps to ``prefix + 'annotation/' + <third dot-separated part
    of the base name>``. Any other valid filename maps to ``prefix`` plus the
    base name without its extension.
    Uses ``validate_filename`` to make sure it's a file and not a DOI; an input
    that validates as a DOI instead is returned unchanged.

    Example (assuming the usual PLOS prefix):
    filename_to_doi('journal.pone.1000001.xml') = '10.1371/journal.pone.1000001'

    :param filename: name of, or path to, a local XML article file
    :return: full unique identifier for a PLOS article
    :raises ValueError: if ``filename`` is neither a valid filename nor a valid DOI
    """
    if validate_filename(filename):
        # Work on the base name so dots in the directory part (e.g. './dir/')
        # cannot shift the dot-separated fields.
        base = os.path.basename(filename)
        if correction in base:
            return prefix + 'annotation/' + base.split('.', 4)[2]
        return prefix + os.path.splitext(base)[0]
    # NOTE: A filename should never validate as a DOI, so this fallback is suspect.
    if validate_doi(filename):
        return filename
    # Fail with an explicit error rather than an unbound local on return.
    raise ValueError("Not a valid article filename or PLOS DOI: {!r}".format(filename))
示例#7
0
    def check_if_doi_resolves(self, plos_valid=True):
        """Check whether this article's DOI resolves via dx.doi.org to the same DOI.

        :param plos_valid: when true, first require ``validate_doi(self.doi)`` to pass
        :return: "Not valid PLOS DOI structure" if the structure check fails,
        "doesn't work" if ``self.check_if_link_works()`` is not True,
        "works" if the 'DOI' field of the returned JSON equals self.doi,
        otherwise that 'DOI' field itself
        """
        if plos_valid and validate_doi(self.doi) is False:
            return "Not valid PLOS DOI structure"

        url = "http://dx.doi.org/" + self.doi
        if self.check_if_link_works() is not True:
            return "doesn't work"

        # Ask the resolver for citation metadata as JSON and compare its DOI field.
        response = requests.get(
            url, headers={"accept": "application/vnd.citationstyles.csl+json"})
        metadata_doi = response.json()['DOI']
        return "works" if metadata_doi == self.doi else metadata_doi