def scrub_articles_from_root(source_dir, stops, updater=500):
    """Remove extraneous newlines from every article file under a directory.

    Walks all files yielded by ``Raw.generate_files(source_dir)``, passes each
    file's text through ``scrub_article`` with the given ``stops``, and writes
    the result back to the same path, in place.

    Parameters
    ----------
    source_dir : root directory of article files to scrub.
    stops : stop patterns forwarded to ``scrub_article`` -- presumably the
        newline-matching rules; confirm against ``scrub_article``'s contract.
    updater : int, emit a progress message to stderr every ``updater`` files.
    """
    sys.stderr.write("Scrubbing extraneous newlines...\n")
    for n, path in enumerate(Raw.generate_files(source_dir)):
        # Read, scrub, and rewrite each file in place.  The `with` blocks
        # guarantee both handles are closed -- the original opened every file
        # for reading via codecs.open(...).read() inside a generator and
        # never closed it, leaking one handle per article.
        with codecs.open(path, "r", "utf8") as f_in:
            text = f_in.read()
        with codecs.open(path, "w", "utf8") as f_out:
            f_out.write(scrub_article(text, stops))
        if n % updater == 0:
            sys.stderr.write("%d articles scrubbed..." % n)
    sys.stderr.write("All scrubbed up.\n")
import os
import sys
import itertools
from control import settings
from pmcminer.ProcessFiles import Articles
from pmcminer.ProcessFiles import Raw

# Lazy stream of per-subject raw-article file listings, one dict per subject
# key in settings.DOI_LISTS.  NOTE(review): nothing in this view consumes it
# (nor the `sys`/`itertools` imports) -- presumably used further down the
# file; confirm before removing anything.
articles = ({"articles": Raw.generate_files(os.path.join(settings.DATA_DIR,
                                                         settings.RAW_ARTICLES_DIR,
                                                         subject)),
             "subject": subject} for subject in settings.DOI_LISTS)


class Purify(Articles.ExtractArticleBody):
    """
    Builds an article set for article subjects that are non-overlapping.

    Put a tuple or list in dirSet to specify directories within inPath;
    otherwise the subjects are discovered as the immediate subdirectories
    of inPath.
    """

    def __init__(self, inPath, outPath, dirSet = False):
        # Either trust the caller-supplied subject directories, or discover
        # them as the immediate subdirectories of inPath.
        if dirSet:
            self.subjects = dirSet
        else:
            self.subjects = [subject for subject in os.listdir(inPath)
                             if os.path.isdir(os.path.join(inPath, subject))]
        # File count per subject directory, parallel to self.subjects.
        self.subjectCount = [len(os.listdir(os.path.join(inPath, subject)))
                             for subject in self.subjects]
        # NOTE(review): outPath is accepted but never used in this view --
        # presumably consumed by methods of this class outside this chunk;
        # confirm.  The class likely continues beyond the visible source.
"""

import sys
import os
from pmcminer.ProcessFiles import Raw, Articles
from control import settings


def check_DOI_by_dict(DOI_dict, article_ID, data_dir, raw_article_dir):
    """Generate (subject, DOI, filename) triples for one PMC XML file.

    article_ID is a (ids, filename) pair: ids iterates the article-id values
    extracted from the XML, filename is the source path.  For every id that
    appears in a subject's DOI collection, yield (subject, id, filename).

    NOTE(review): data_dir and raw_article_dir are accepted but unused in
    this body -- confirm against callers before dropping them.
    """
    for subject in DOI_dict:
        for item in article_ID[0]:
            if item in DOI_dict[subject]:
                yield (subject, item, article_ID[1])


# --- module-level pipeline: match archived articles against the DOI lists ---
DOIs = Raw.get_DOI_list(settings.DOI_LISTS)
archived_articles = Raw.generate_files(settings.ARCHIVE_DIR)
# Lazy (parsed-tree, path) pairs, one per archived article.
article_trees = ((Articles.parse_XML(article), article)
                 for article in archived_articles)
# Lazy (article-id list, path) pairs pulled from each tree's front matter.
article_IDs = ((list(Articles.extract_element(tree[0],
                                              "front/article-meta/article-id")),
                tree[1]) for tree in article_trees)
# One empty mapping per subject key; filled as articles are matched below.
check_dict = dict(((key, {}) for key in settings.DOI_LISTS.keys()))
for n, ID in enumerate(article_IDs):
    checker = check_DOI_by_dict(DOIs, ID, settings.DATA_DIR,
                                settings.RAW_ARTICLES_DIR)
    for i in checker:
        try:
            check_dict[i[0]][i[1]].append(i[2])
        except KeyError:
            # NOTE(review): source is truncated here -- the KeyError handler
            # body (presumably seeding check_dict[i[0]][i[1]] with a fresh
            # list) lies outside this view.
#!/usr/bin/python2 import os from control import settings """ Executable for PMCminer app for extracting species and ecological interaction terms from PMCOASS XML files """ if settings.EXTRACT_RAW_ARTICLES_BY_DOI: from pmcminer.ProcessFiles import Raw Raw.extract_raw_articles(data_dir = settings.DATA_DIR, raw_dir = settings.RAW_ARTICLES_DIR, doi_lists = settings.DOI_LISTS, archive_dir = settings.ARCHIVE_DIR) if settings.PURIFY_ARTICLE_SET: from pmcminer.ProcessFiles.Pure import PurifyArticleSet from pmcminer.ProcessFiles.Pure import ArticleScrubber PurifyArticleSet.purify_articles(settings.DATA_DIR, settings.RAW_ARTICLES_DIR, settings.DOI_LISTS, settings.PURE_ARTICLES_DIR) ArticleScrubber.scrub_articles_from_root(source_dir = os.path.join(settings.DATA_DIR, settings.PURE_ARTICLES_DIR), stops = settings.STOPS) if settings.TAG_SPECIES: from pmcminer.Linnaeus.PyLinnaeus import * for subject in range(len(settings.DOI_LISTS)): spp_tagger = Tagger(**lin_opts(0,subject)) spp_tagger.check_directories()