Exemplo n.º 1
0
def scrub_articles_from_root(source_dir, stops, updater = 500):
    """
        Scrub every article file found under source_dir, in place.

        Each path produced by Raw.generate_files(source_dir) is read as
        UTF-8, passed through scrub_article(text, stops) (which removes
        newlines matching stops), and written back to the same path.

        source_dir -- root directory handed to Raw.generate_files.
        stops      -- passed unchanged to scrub_article.
        updater    -- write a progress line to stderr every `updater`
                      articles; a falsy value (0 / None) disables the
                      progress lines.
    """
    sys.stderr.write("Scrubbing extraneous newlines...\n")
    scrubbed_count = 0
    for path in Raw.generate_files(source_dir):
        # Context managers guarantee both handles are closed, even when
        # reading, scrubbing or writing raises.
        with codecs.open(path, "r", "utf8") as f_in:
            text = f_in.read()
        # Scrub before re-opening for writing: opening with "w" truncates
        # the file, so a failure in scrub_article must happen first.
        scrubbed = scrub_article(text, stops)
        with codecs.open(path, "w", "utf8") as f_out:
            f_out.write(scrubbed)
        scrubbed_count += 1
        # Count completed articles (1-based) so the reported number is
        # accurate, and skip reporting entirely when updater is falsy.
        if updater and scrubbed_count % updater == 0:
            sys.stderr.write("%d articles scrubbed...\n" % scrubbed_count)
    sys.stderr.write("All scrubbed up.\n")
Exemplo n.º 2
0
import os
import sys
import itertools

from control import settings

from pmcminer.ProcessFiles import Articles
from pmcminer.ProcessFiles import Raw






# Lazy stream with one record per entry of settings.DOI_LISTS.  Each record
# pairs the subject with whatever Raw.generate_files returns for that
# subject's directory under DATA_DIR/RAW_ARTICLES_DIR.
articles = (
    {
        "articles": Raw.generate_files(
            os.path.join(settings.DATA_DIR,
                         settings.RAW_ARTICLES_DIR,
                         name)),
        "subject": name,
    }
    for name in settings.DOI_LISTS
)

class Purify(Articles.ExtractArticleBody):
    """
        Builds an article set for article subjects that are non-overlapping.

        inPath  -- directory whose sub-directories are the subjects.
        outPath -- not used by the constructor code visible here;
                   presumably the destination directory (TODO confirm).
        dirSet  -- optional tuple or list of directory names within inPath
                   to use as the subjects.  When falsy (the default),
                   every sub-directory of inPath is used.
    """
    def __init__(self, inPath, outPath, dirSet = False):
        # self.subjects: names of the subject directories to process.
        if dirSet:
            self.subjects = dirSet
        else:
            # No explicit selection: take every directory directly under inPath.
            self.subjects = [subject for subject in os.listdir(inPath) if 
                               os.path.isdir(os.path.join(inPath, subject))]
        # self.subjectCount: number of directory entries in each subject
        # directory, in the same order as self.subjects.
        self.subjectCount = [len(os.listdir(os.path.join(inPath, subject))) for subject in self.subjects]
Exemplo n.º 3
0
"""
import sys
import os

from pmcminer.ProcessFiles import Raw, Articles
from  control import settings

def check_DOI_by_dict(DOI_dict, article_ID, data_dir, raw_article_dir):
    """
    Yield (subject, ID, filename) for every article ID listed under a subject.

    DOI_dict maps each subject to a container of IDs.  article_ID is a pair:
    element 0 is the iterable of IDs found in the article, element 1 is the
    article's filename.  For each subject, every ID of the article that is
    contained in that subject's entry produces one tuple.

    data_dir and raw_article_dir are accepted but not used here.
    """
    for subject in DOI_dict:
        listed = DOI_dict[subject]
        matches = (found for found in article_ID[0] if found in listed)
        for match in matches:
            yield (subject, match, article_ID[1])
                

# Passed to check_DOI_by_dict below as its DOI_dict argument.
DOIs = Raw.get_DOI_list(settings.DOI_LISTS)

# Lazy pipeline over the archive: file -> (parsed XML, file) ->
# (list of article-id elements, file).  Nothing is parsed until
# article_IDs is iterated.
archived_articles = Raw.generate_files(settings.ARCHIVE_DIR)
article_trees = (
    (Articles.parse_XML(path), path)
    for path in archived_articles
)
article_IDs = (
    (list(Articles.extract_element(parsed,
                                   "front/article-meta/article-id")),
     path)
    for parsed, path in article_trees
)

# One empty dict per key of settings.DOI_LISTS, to be filled in below.
check_dict = dict((subject, {}) for subject in settings.DOI_LISTS.keys())

#for ID in article_IDs:
for n, ID in enumerate(article_IDs):
    checker = check_DOI_by_dict(DOIs, ID, settings.DATA_DIR, settings.RAW_ARTICLES_DIR)
    for i in checker:
        try:
            check_dict[i[0]][i[1]].append(i[2])
        except KeyError:
Exemplo n.º 4
0
#!/usr/bin/python2

import os

from  control import settings

"""
    Executable for PMCminer app for extracting species and ecological 
    interaction terms from PMCOASS XML files
"""

# Optional step, switched on by settings.EXTRACT_RAW_ARTICLES_BY_DOI.
if settings.EXTRACT_RAW_ARTICLES_BY_DOI:
    from pmcminer.ProcessFiles import Raw
    Raw.extract_raw_articles(
        data_dir = settings.DATA_DIR,
        raw_dir = settings.RAW_ARTICLES_DIR,
        doi_lists = settings.DOI_LISTS,
        archive_dir = settings.ARCHIVE_DIR
    )

# Optional step, switched on by settings.PURIFY_ARTICLE_SET: purify the raw
# articles, then scrub the files under DATA_DIR/PURE_ARTICLES_DIR.
if settings.PURIFY_ARTICLE_SET:
    from pmcminer.ProcessFiles.Pure import PurifyArticleSet, ArticleScrubber
    PurifyArticleSet.purify_articles(
        settings.DATA_DIR,
        settings.RAW_ARTICLES_DIR,
        settings.DOI_LISTS,
        settings.PURE_ARTICLES_DIR
    )
    ArticleScrubber.scrub_articles_from_root(
        source_dir = os.path.join(settings.DATA_DIR, settings.PURE_ARTICLES_DIR),
        stops = settings.STOPS
    )
# Optional step, switched on by settings.TAG_SPECIES.
if settings.TAG_SPECIES:
    # Wildcard import: Tagger and lin_opts used below presumably come from
    # this module (TODO confirm).
    from pmcminer.Linnaeus.PyLinnaeus import *
    # `subject` is an integer index (0 .. len(DOI_LISTS) - 1), not a name.
    for subject in range(len(settings.DOI_LISTS)):
        # NOTE(review): the meaning of lin_opts' first argument (0) is not
        # visible here; its result is expanded into Tagger's keyword arguments.
        spp_tagger = Tagger(**lin_opts(0,subject))
        spp_tagger.check_directories()