def output_pure_article(self, article, subject):
    """Parse one article, derive its text fields, and write it out.

    The article is read from ``<self.inPath>/<subject>/<article>``.  The
    parsed tree, body text, tokenized sentences and the result of looking
    up ``front/article-meta/article-id`` are stored on the instance as
    ``self.tree``, ``self.body``, ``self.sentences`` and ``self.doi``.
    Finally ``self.write`` is called with ``<self.outPath>/<subject>``,
    which is created first if it does not exist yet.

    article -- file name of the article inside the subject directory
    subject -- name of the subject sub-directory (input and output side)
    """
    self.article = os.path.join(self.inPath, subject, article)

    # Everything below is derived from the parsed tree, in this order.
    self.tree = Articles.parse_XML_no_Table(self.article)
    self.body = self.get_article_body_text(self.tree)
    self.sentences = self.tokenize_text(self.body)
    self.doi = Articles.extract_element(self.tree, "front/article-meta/article-id")

    # Only the last path component is created; the parent (self.outPath)
    # has to exist already because os.mkdir is not recursive.
    target_dir = os.path.join(self.outPath, subject)
    if not os.path.exists(target_dir):
        sys.stderr.write("creating directory %s\n" % target_dir)
        os.mkdir(target_dir)

    self.write(target_dir)
def collect_files_by_DOI(DOI_dict, archive_dir, data_dir, raw_article_dir, updater=200):
    """
    Iterate over all archived articles and pass each article's IDs on to
    check_and_copy_DOI, which is expected to copy files to subject dirs if
    the article's DOIs are in the DOI list.

    DOI_dict        -- DOI lookup handed through to check_and_copy_DOI
    archive_dir     -- directory handed to generate_files to list articles
    data_dir        -- handed through to check_and_copy_DOI
    raw_article_dir -- handed through to check_and_copy_DOI
    updater         -- a progress line is written to stderr whenever the
                       running file index is a multiple of this value
                       (this includes index 0, i.e. the first file)
    """
    archive_paths = generate_files(archive_dir)

    # One empty bucket per configured DOI list; check_and_copy_DOI receives
    # this mapping and its return value replaces it on every iteration.
    # NOTE(review): the mapping is not returned in the code visible here --
    # confirm whether callers need it.
    seen = {subject: {} for subject in settings.DOI_LISTS.keys()}

    for count, path in enumerate(archive_paths):
        # Articles are parsed one at a time so only a single tree is
        # alive at any moment.
        parsed = Articles.parse_XML(path)
        ids = list(Articles.extract_element(parsed, "front/article-meta/article-id"))
        seen = check_and_copy_DOI(DOI_dict, (ids, path), data_dir, raw_article_dir, seen)
        if count % updater == 0:
            sys.stderr.write("%d files processed...\n" % count)
import sys

from pmcminer.ProcessFiles import Raw, Articles
from control import settings


def check_DOI_by_dict(DOI_dict, article_ID, data_dir, raw_article_dir):
    """Generate subject, DOI and filename for a PMC XML file.

    DOI_dict        -- mapping of subject -> container of DOIs
    article_ID      -- pair ``(ids, filename)``; ``ids`` is iterated and each
                       entry is tested for membership in every subject
    data_dir        -- unused here; kept so the signature stays stable
    raw_article_dir -- unused here; kept so the signature stays stable

    Yields one ``(subject, id, filename)`` tuple per id found under a
    subject; an id listed under several subjects is yielded once for each.
    """
    for subject in DOI_dict:
        for item in article_ID[0]:
            if item in DOI_dict[subject]:
                yield (subject, item, article_ID[1])


DOIs = Raw.get_DOI_list(settings.DOI_LISTS)
archived_articles = Raw.generate_files(settings.ARCHIVE_DIR)

# Both stages are generator expressions, so each article is parsed only
# when the loop below asks for it.
article_trees = ((Articles.parse_XML(article), article)
                 for article in archived_articles)
article_IDs = ((list(Articles.extract_element(tree[0], "front/article-meta/article-id")), tree[1])
               for tree in article_trees)

# subject -> {DOI -> [filenames in which that DOI was found]}
check_dict = dict((key, {}) for key in settings.DOI_LISTS.keys())

for n, ID in enumerate(article_IDs):
    checker = check_DOI_by_dict(DOIs, ID, settings.DATA_DIR, settings.RAW_ARTICLES_DIR)
    for subject, doi, filename in checker:
        # Start a new list the first time a DOI is seen, then append.
        check_dict[subject].setdefault(doi, []).append(filename)
    if n % 500 == 0:
        # sys is imported at the top of this script; this branch already
        # fires for the very first file (n == 0).
        sys.stderr.write("%d files processed...\n" % n)

# Are there DOIs which are in multiple files in the directory?