Example #1
def test_doi_conversions(self):
    """
    Test that doi_to_path and doi_to_url convert the example DOIs to the
    expected local file paths and article URLs, on both the public site
    and the internal PLOS network (plos_network=True).
    """
    self.assertEqual(os.path.join(corpusdir, example_file), doi_to_path(example_doi),
                     "{0} does not transform to {1}".format(example_doi, example_file))
    self.assertEqual(example_file2, doi_to_path(example_doi2, ''),
                     "{0} does not transform to {1}".format(example_doi2, example_file2))
    self.assertEqual(example_url2, doi_to_url(example_doi2),
                     "{0} does not transform to {1}".format(example_doi2, example_url2))
    self.assertEqual(example_url, doi_to_url(example_doi),
                     "In doi_to_url, {0} does not transform to {1}".format(example_doi, example_url))
    self.assertEqual(example_url2_int, doi_to_url(example_doi2, plos_network=True),
                     "In doi_to_url, {0} does not transform to {1}, but to {2}".format(
                         example_doi2, example_url2_int, doi_to_url(example_doi2)))
    self.assertEqual(example_url_int, doi_to_url(example_doi, plos_network=True),
                     "{0} does not transform to {1}".format(example_doi, example_url_int))
Example #2
def remote_proofs_direct_check(tempdir=newarticledir,
                               article_list=None,
                               plos_network=False):
    """
    Takes a list of DOIs of uncorrected proofs and compares each to the raw XML of the article online.
    If an article's status is now 'vor-update-to-uncorrected-proof', downloads the new copy.
    This will not be necessary once Solr is indexing VOR article information correctly.
    https://developer.plos.org/jira/browse/DPRO-3418
    :param tempdir: temporary directory for downloading articles
    :param article_list: list of DOIs of uncorrected proofs to check for updates
    :param plos_network: whether running on the PLOS internal network (not used here)
    :return: list of DOIs for all articles with an updated VOR
    """
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass
    proofs_download_list = []
    if article_list is None:
        article_list = get_uncorrected_proofs_list()
    for doi in list(set(article_list)):
        file = doi_to_path(doi)
        updated = download_updated_xml(file, vor_check=True)
        if updated:
            proofs_download_list.append(doi)
    if proofs_download_list:
        print(len(proofs_download_list), "VOR articles directly downloaded.")
    else:
        print("No new VOR articles found.")
    return proofs_download_list
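
A usage sketch, assuming the helpers above are importable; the DOI is a placeholder:

# Hypothetical usage of remote_proofs_direct_check
vor_dois = remote_proofs_direct_check(article_list=['10.1371/journal.pone.0052690'])
print(vor_dois)  # DOIs whose VOR copy was downloaded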
Example #3
def get_article_xml(article_file, tag_path_elements=None):
    """
    For a local article file, read its XML tree.
    Can also interpret a DOI in place of a file path.
    Defaults to reading the tree location that distinguishes uncorrected proofs
    from versions of record.
    :param article_file: the XML file for a single article, or its DOI
    :param tag_path_elements: xpath location in the XML tree of the article file
    :return: list of elements of the article file at that xpath location
    """
    if tag_path_elements is None:
        tag_path_elements = ('/', 'article', 'front', 'article-meta',
                             'custom-meta-group', 'custom-meta', 'meta-value')

    try:
        article_tree = et.parse(article_file)
    except OSError:
        # Parsing failed: the argument may be a DOI or a path with the wrong
        # extension, so build a corrected path and retry.
        if validate_doi(article_file):
            article_file = doi_to_path(article_file)
        elif article_file.endswith('nxml'):
            # checked before 'xml', which would otherwise match first
            article_file = article_file[:-4] + 'NXML'
        elif article_file.endswith('xml'):
            article_file = article_file[:-3] + 'XML'
        elif article_file.endswith('XML'):
            article_file = article_file[:-3] + 'xml'
        elif not article_file.endswith('.'):
            article_file = article_file + '.xml'
        else:
            article_file = article_file + 'xml'
        article_tree = et.parse(article_file)
    articleXML = article_tree.getroot()
    tag_location = '/'.join(tag_path_elements)
    return articleXML.xpath(tag_location)
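
A usage sketch with the default xpath, which points at the article's publication-stage metadata; the file path is a placeholder:

# Hypothetical usage of get_article_xml
elements = get_article_xml('allofplos_xml/journal.pone.0052690.xml')
for element in elements:
    print(element.text)  # e.g. 'uncorrected-proof'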
Example #4
def download_updated_xml(article_file, tempdir=newarticledir, vor_check=False):
    """
    For an article file, compare local XML to remote XML
    If they're different, download new version of article
    :param article_file: the filename for a single article
    :param tempdir: directory where files are downloaded to
    :param vor_check: whether to download only if the update turns an uncorrected proof into a VOR
    :return: boolean for whether update was available & downloaded
    """
    doi = filename_to_doi(article_file)
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass
    url = URL_TMP.format(doi)
    articletree_remote = et.parse(url)
    articleXML_remote = et.tostring(articletree_remote,
                                    method='xml',
                                    encoding='unicode')
    if not article_file.endswith('.xml'):
        article_file += '.xml'
    try:
        articletree_local = et.parse(
            os.path.join(corpusdir, os.path.basename(article_file)))
    except OSError:
        article_file_alt = os.path.join(
            tempdir, os.path.basename(doi_to_path(article_file)))
        articletree_local = et.parse(article_file_alt)
    articleXML_local = et.tostring(articletree_local,
                                   method='xml',
                                   encoding='unicode')

    if articleXML_remote == articleXML_local:
        updated = False
        get_new = False
    else:
        get_new = True
        if vor_check:
            # make sure that update is to a VOR for uncorrected proof
            get_new = False
            path_parts = [
                '/', 'article', 'front', 'article-meta', 'custom-meta-group',
                'custom-meta', 'meta-value'
            ]
            r = articletree_remote.xpath("/".join(path_parts))
            for x in r:
                if x.text == 'vor-update-to-uncorrected-proof':
                    get_new = True
                    break
        # default to False; only flip to True if we actually download,
        # so the vor_check path cannot leave 'updated' unassigned
        updated = False
        if get_new:
            article_path = os.path.join(tempdir,
                                        os.path.basename(article_file))
            with open(article_path, 'w') as file:
                file.write(articleXML_remote)
            updated = True
    return updated
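
A usage sketch pairing the download with the VOR check, as remote_proofs_direct_check does above; the filename is a placeholder:

# Hypothetical usage of download_updated_xml
if download_updated_xml('journal.pone.0052690.xml', vor_check=True):
    print('uncorrected proof replaced by a VOR copy')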
Example #5
def check_for_corrected_articles(directory=newarticledir, article_list=None):
    """
    For articles in the temporary download directory, check whether the article type is 'correction'.
    If so, surface the DOI of the article being corrected.
    Use with download_corrected_articles.
    :param directory: directory where the article files are; default is newarticledir
    :param article_list: list of article files to check; defaults to all files in directory
    :return: list of filenames of existing local files for articles issued a correction
    """
    corrected_doi_list = []
    if article_list is None:
        article_list = listdir_nohidden(directory)
    for article_file in article_list:
        article_type = check_article_type(article_file=article_file)
        if article_type == 'correction':
            corrected_article = get_related_article_doi(article_file)[0]
            corrected_doi_list.append(corrected_article)
    # prefer the copy in the corpus directory; fall back to the download directory
    corrected_article_list = [
        doi_to_path(doi) if os.path.exists(doi_to_path(doi))
        else doi_to_path(doi, directory=directory)
        for doi in corrected_doi_list
    ]
    print(len(corrected_article_list), 'corrected articles found.')
    return corrected_article_list
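
A usage sketch over the default download directory:

# Hypothetical usage of check_for_corrected_articles
corrected_files = check_for_corrected_articles()
for article_file in corrected_files:
    print(article_file)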
Example #6
def repo_download(dois, tempdir, ignore_existing=True, plos_network=False):
    """
    Downloads a list of articles by DOI from PLOS's content-repo (crepo) to a temporary directory
    Use in conjunction with get_dois_needed_list
    :param dois: Iterable with DOIs for articles to obtain
    :param tempdir: Temporary directory where files are copied to
    :param ignore_existing: don't re-download files already present in tempdir
    :param plos_network: whether running on the PLOS internal network; if not, sleep between requests
    """
    # make temporary directory, if needed
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass

    if ignore_existing:
        existing_articles = [
            filename_to_doi(file) for file in listdir_nohidden(tempdir)
        ]
        dois = set(dois) - set(existing_articles)

    max_value = len(dois)
    bar = progressbar.ProgressBar(redirect_stdout=True, max_value=max_value)
    for i, doi in enumerate(sorted(dois)):
        url = URL_TMP.format(doi)
        articleXML = et.parse(url)
        article_path = doi_to_path(doi, directory=tempdir)
        # create new local XML files
        if not ignore_existing or not os.path.isfile(article_path):
            with open(article_path, 'w') as file:
                file.write(
                    et.tostring(articleXML, method='xml', encoding='unicode'))
            if not plos_network:
                time.sleep(1)
        bar.update(i + 1)
    bar.finish()
    print(len(listdir_nohidden(tempdir)), "new articles downloaded.")
    logging.info(len(listdir_nohidden(tempdir)))
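
A usage sketch, typically fed by get_dois_needed_list as the docstring suggests; the DOI is a placeholder:

# Hypothetical usage of repo_download
dois_needed = ['10.1371/journal.pone.0052690']
repo_download(dois_needed, tempdir=newarticledir)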
Example #7
def get_article_metadata(article_file, size='small'):
    """
    For an individual article in the PLOS corpus, create a tuple of metadata fields about that article.
    Make it small, medium, or large depending on the number of fields desired
    (the size options are not yet used here; the full field set is always returned).
    :param article_file: individual local PLOS XML article
    :param size: small, medium, or large, i.e. how many fields to return for each article
    :return: tuple of (metadata fields tuple, wrong_date_strings dict), or False on a field-count error
    """
    doi = filename_to_doi(article_file)
    # splitext, not rstrip('.xml'): rstrip strips characters, not a suffix
    filename = os.path.splitext(os.path.basename(doi_to_path(article_file)))[0]
    title = get_article_title(article_file)
    journal = get_plos_journal(article_file)
    jats_article_type = check_article_type(article_file)
    plos_article_type = get_plos_article_type(article_file)
    dtd_version = get_article_dtd(article_file)
    dates, wrong_date_strings = get_article_dates(article_file, string_=True)
    pubdate = dates['epub']
    counts = get_article_counts(article_file)
    body_word_count = get_article_body_word_count(article_file)
    if jats_article_type == 'correction':
        related_article = get_related_article_doi(article_file,
                                                  corrected=True)[0]
    elif jats_article_type == 'retraction':
        related_article = get_related_retraction_article(article_file)[0]
    else:
        related_article = ''
    abstract = get_article_abstract(article_file)
    # optional fields default to empty strings when absent
    collection = dates.get('collection', '')
    received = dates.get('received', '')
    accepted = dates.get('accepted', '')
    fig_count = counts.get('fig-count', '')
    table_count = counts.get('table-count', '')
    page_count = counts.get('page-count', '')
    metadata = [
        doi, filename, title, journal, jats_article_type, plos_article_type,
        dtd_version, pubdate, received, accepted, collection, fig_count,
        table_count, page_count, body_word_count, related_article, abstract
    ]
    metadata = tuple(metadata)
    if len(metadata) == 17:
        return metadata, wrong_date_strings
    else:
        print('Error in {}: {} items'.format(article_file, len(metadata)))
        return False
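
A usage sketch; the filename is a placeholder, and the guard reflects the False return on a field-count error:

# Hypothetical usage of get_article_metadata
result = get_article_metadata('journal.pone.0052690.xml')
if result:
    metadata, wrong_date_strings = result
    doi, filename, title = metadata[:3]
    print(doi, title)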