def test_doi_conversions(self):
    """
    Test that example DOIs convert correctly to local file paths and to
    external and internal (plos_network) PLOS URLs.
    """
    self.assertEqual(os.path.join(corpusdir, example_file),
                     doi_to_path(example_doi),
                     "{0} does not transform to {1}".format(example_doi, example_file))
    self.assertEqual(example_file2,
                     doi_to_path(example_doi2, ''),
                     "{0} does not transform to {1}".format(example_doi2, example_file2))
    self.assertEqual(example_url2,
                     doi_to_url(example_doi2),
                     "{0} does not transform to {1}".format(example_doi2, example_url2))
    self.assertEqual(example_url,
                     doi_to_url(example_doi),
                     "In doi_to_url, {0} does not transform to {1}".format(example_doi, example_url))
    self.assertEqual(example_url2_int,
                     doi_to_url(example_doi2, plos_network=True),
                     "In doi_to_url, {0} does not transform to {1}, but to {2}".format(
                         example_doi2, example_url2_int, doi_to_url(example_doi2)))
    self.assertEqual(example_url_int,
                     doi_to_url(example_doi, plos_network=True),
                     "{0} does not transform to {1}".format(example_doi, example_url_int))
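# Usage sketch: run just this test case with unittest's CLI. The module and
# class names below are assumptions; substitute the actual ones for this suite.
#   python -m unittest test_corpus.TestCorpus.test_doi_conversions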
def remote_proofs_direct_check(tempdir=newarticledir, article_list=None, plos_network=False):
    """
    Takes a list of DOIs of uncorrected proofs and compares each to the raw XML
    of the article online.
    If the article status is now 'vor-update-to-uncorrected-proof', download a new copy.
    This will not be necessary once Solr is indexing VOR article information correctly.
    https://developer.plos.org/jira/browse/DPRO-3418
    :param tempdir: temporary directory for downloading articles
    :param article_list: list of uncorrected proofs to check for updates
    :param plos_network: currently unused
    :return: list of all articles with an updated VOR
    """
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass
    proofs_download_list = []
    if article_list is None:
        article_list = get_uncorrected_proofs_list()
    for doi in list(set(article_list)):
        file = doi_to_path(doi)
        updated = download_updated_xml(file, vor_check=True)
        if updated:
            proofs_download_list.append(doi)
    if proofs_download_list:
        print(len(proofs_download_list), "VOR articles directly downloaded.")
    else:
        print("No new VOR articles found.")
    return proofs_download_list
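# Usage sketch (assumes a populated corpus and proofs list; the DOI shown is
# hypothetical):
#   >>> updated_vors = remote_proofs_direct_check()
#   >>> updated_vors = remote_proofs_direct_check(
#   ...     article_list=['10.1371/journal.pone.0012345'])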
def get_article_xml(article_file, tag_path_elements=None):
    """
    For a local article file, read its XML tree.
    Can also interpret DOIs.
    Defaults to reading the tree location for uncorrected proofs/versions of record.
    :param article_file: the XML file for a single article
    :param tag_path_elements: xpath location in the XML tree of the article file
    :return: content of article file at that xpath location
    """
    if tag_path_elements is None:
        tag_path_elements = ('/',
                             'article',
                             'front',
                             'article-meta',
                             'custom-meta-group',
                             'custom-meta',
                             'meta-value')
    try:
        article_tree = et.parse(article_file)
    except OSError:
        # File not found as given: try interpreting the argument as a DOI,
        # then retry with common extension variants.
        if validate_doi(article_file):
            article_file = doi_to_path(article_file)
        # check 'nxml' before 'xml', since endswith('xml') also matches 'nxml';
        # falling back from '.nxml' to '.xml' is the assumed intent of the
        # original (mis-sliced) suffix logic
        elif article_file.endswith('nxml'):
            article_file = article_file[:-4] + 'xml'
        elif article_file.endswith('xml'):
            article_file = article_file[:-3] + 'XML'
        elif article_file.endswith('XML'):
            article_file = article_file[:-3] + 'xml'
        elif not article_file.endswith('.'):
            article_file = article_file + '.xml'
        else:
            article_file = article_file + 'xml'
        article_tree = et.parse(article_file)
    articleXML = article_tree.getroot()
    tag_location = '/'.join(tag_path_elements)
    return articleXML.xpath(tag_location)
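# Usage sketch (hypothetical filename): with the default tag path, this returns
# the <meta-value> elements under custom-meta-group.
#   >>> values = get_article_xml('journal.pone.0012345.xml')
#   >>> [v.text for v in values]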
def download_updated_xml(article_file, tempdir=newarticledir, vor_check=False):
    """
    For an article file, compare local XML to remote XML.
    If they're different, download the new version of the article.
    :param article_file: the filename for a single article
    :param tempdir: directory where files are downloaded to
    :param vor_check: whether to check if an uncorrected proof has been updated
    :return: boolean for whether update was available & downloaded
    """
    doi = filename_to_doi(article_file)
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass
    url = URL_TMP.format(doi)
    articletree_remote = et.parse(url)
    articleXML_remote = et.tostring(articletree_remote, method='xml', encoding='unicode')
    if not article_file.endswith('.xml'):
        article_file += '.xml'
    try:
        articletree_local = et.parse(os.path.join(corpusdir, os.path.basename(article_file)))
    except OSError:
        article_file_alt = os.path.join(tempdir, os.path.basename(doi_to_path(article_file)))
        articletree_local = et.parse(article_file_alt)
    articleXML_local = et.tostring(articletree_local, method='xml', encoding='unicode')
    if articleXML_remote == articleXML_local:
        get_new = False
    else:
        get_new = True
        if vor_check:
            # make sure that the update is to a VOR for an uncorrected proof
            get_new = False
            path_parts = ['/', 'article', 'front', 'article-meta',
                          'custom-meta-group', 'custom-meta', 'meta-value']
            r = articletree_remote.xpath("/".join(path_parts))
            for x in r:
                if x.text == 'vor-update-to-uncorrected-proof':
                    get_new = True
                    break
    # initialize here so the function can't return an unbound variable when
    # vor_check rejects the remote copy
    updated = False
    if get_new:
        article_path = os.path.join(tempdir, os.path.basename(article_file))
        with open(article_path, 'w') as file:
            file.write(articleXML_remote)
        updated = True
    return updated
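# Usage sketch (hypothetical filename): returns True and writes the fresh XML
# into tempdir only when the remote copy differs (and, with vor_check=True,
# only when the remote is a VOR update to an uncorrected proof).
#   >>> download_updated_xml('journal.pone.0012345.xml', vor_check=True)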
def check_for_corrected_articles(directory=newarticledir, article_list=None):
    """
    For articles in the temporary download directory, check if the article type
    is a correction.
    If it is a correction, surface the DOI of the article being corrected.
    Use with download_corrected_articles.
    :param directory: directory where the article files are, default is newarticledir
    :param article_list: list of article files to check; defaults to every file in directory
    :return: list of filenames to existing local files for articles issued a correction
    """
    corrected_doi_list = []
    if article_list is None:
        article_list = listdir_nohidden(directory)
    for article_file in article_list:
        article_type = check_article_type(article_file=article_file)
        if article_type == 'correction':
            corrected_article = get_related_article_doi(article_file)[0]
            corrected_doi_list.append(corrected_article)
    # prefer the copy in the main corpus; fall back to the new-article directory
    corrected_article_list = [doi_to_path(doi)
                              if os.path.exists(doi_to_path(doi))
                              else doi_to_path(doi, directory=newarticledir)
                              for doi in list(corrected_doi_list)]
    print(len(corrected_article_list), 'corrected articles found.')
    return corrected_article_list
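# Usage sketch: scan the download directory for correction notices and surface
# the local files of the articles they correct.
#   >>> to_recheck = check_for_corrected_articles()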
def repo_download(dois, tempdir, ignore_existing=True, plos_network=False):
    """
    Downloads a list of articles by DOI from PLOS's content-repo (crepo) to a
    temporary directory.
    Use in conjunction with get_dois_needed_list.
    :param dois: iterable of DOIs for articles to obtain
    :param tempdir: temporary directory where files are copied to
    :param ignore_existing: don't re-download to tempdir if already downloaded
    :param plos_network: if True, omit the one-second delay between downloads
    """
    # make temporary directory, if needed
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass
    if ignore_existing:
        existing_articles = [filename_to_doi(file) for file in listdir_nohidden(tempdir)]
        dois = set(dois) - set(existing_articles)
    max_value = len(dois)
    bar = progressbar.ProgressBar(redirect_stdout=True, max_value=max_value)
    for i, doi in enumerate(sorted(dois)):
        url = URL_TMP.format(doi)
        articleXML = et.parse(url)
        article_path = doi_to_path(doi, directory=tempdir)
        # create new local XML files
        if not ignore_existing or not os.path.isfile(article_path):
            with open(article_path, 'w') as file:
                file.write(et.tostring(articleXML, method='xml', encoding='unicode'))
            if not plos_network:
                time.sleep(1)
        bar.update(i + 1)
    bar.finish()
    print(len(listdir_nohidden(tempdir)), "new articles downloaded.")
    logging.info(len(listdir_nohidden(tempdir)))
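# Usage sketch (hypothetical DOI): fetch a handful of articles into a scratch
# directory, skipping any already present there.
#   >>> repo_download(['10.1371/journal.pone.0012345'], tempdir=newarticledir)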
def get_article_metadata(article_file, size='small'):
    """
    For an individual article in the PLOS corpus, create a tuple of a set of
    metadata fields about that article.
    Make it small, medium, or large depending on the number of fields desired.
    :param article_file: individual local PLOS XML article
    :param size: small, medium, or large, i.e. how many fields to return for each article
    :return: tuple of metadata fields tuple, wrong_date_strings dict
    """
    doi = filename_to_doi(article_file)
    # use splitext rather than rstrip('.xml'): rstrip strips a *set* of trailing
    # characters and can eat the end of the filename itself
    filename = os.path.splitext(os.path.basename(doi_to_path(article_file)))[0]
    title = get_article_title(article_file)
    journal = get_plos_journal(article_file)
    jats_article_type = check_article_type(article_file)
    plos_article_type = get_plos_article_type(article_file)
    dtd_version = get_article_dtd(article_file)
    dates, wrong_date_strings = get_article_dates(article_file, string_=True)
    pubdate = dates['epub']
    # optional fields default to empty strings when absent
    collection = dates.get('collection', '')
    received = dates.get('received', '')
    accepted = dates.get('accepted', '')
    counts = get_article_counts(article_file)
    fig_count = counts.get('fig-count', '')
    table_count = counts.get('table-count', '')
    page_count = counts.get('page-count', '')
    body_word_count = get_article_body_word_count(article_file)
    if jats_article_type == 'correction':
        related_article = get_related_article_doi(article_file, corrected=True)[0]
    elif jats_article_type == 'retraction':
        related_article = get_related_retraction_article(article_file)[0]
    else:
        related_article = ''
    abstract = get_article_abstract(article_file)
    metadata = (doi, filename, title, journal, jats_article_type, plos_article_type,
                dtd_version, pubdate, received, accepted, collection, fig_count,
                table_count, page_count, body_word_count, related_article, abstract)
    if len(metadata) == 17:
        return metadata, wrong_date_strings
    else:
        print('Error in {}: {} items'.format(article_file, len(metadata)))
        return False
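# Usage sketch (hypothetical filename): unpack the 17-field tuple, e.g.
#   >>> metadata, wrong_dates = get_article_metadata('journal.pone.0012345.xml')
#   >>> doi, filename, title = metadata[:3]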