Пример #1
0
def processAtharCorpus(infiles, outfile):
    """
        Loads the Athar .html corpus and writes its documents to a JSON file

        Every file matching `infiles` is read with an AtharCorpusReader. The
        documents it returns are stored in a dict keyed by each document's
        metadata["guid"], with the document's `data` as the value, and that
        dict is dumped as JSON. If two documents share a guid, the one read
        last wins. The contexts the reader also returns are not written.

        Args:
            infiles: file mask (glob pattern) of HTML files to load
            outfile: name of .json file to write to
    """
    import glob
    cp.useLocalCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac")

    reader=AtharCorpusReader()
    doc_dict={}
    for f in glob.glob(infiles):
        print("Loading ",f)
        # readFile returns (docs, contexts); only the docs end up in the output
        docs,_contexts=reader.readFile(f)
        for doc in docs:
            doc_dict[doc.metadata["guid"]]=doc.data

    # open() works on both Python 2 and 3 (the old `file` builtin is gone in
    # Python 3), and the with-block guarantees the file is flushed and closed.
    with open(outfile,"w") as out:
        json.dump(doc_dict,out)
Пример #2
0
def main():
    """
        Connects to the local PMC corpus and sets up a list of sample files

        The list of .nxml paths is only assigned here: the calls that would
        consume it are commented out, so no file is actually processed.
    """
    drive_letter="g"
    corpus_root=drive_letter+":\\nlp\\phd\\pmc"

    cp.useLocalCorpus()
    cp.Corpus.connectCorpus(corpus_root)

    # Hand-picked input files, kept around for debugging runs
    debugging_files=[
        r"articles.I-N\J_Contemp_Brachytherapy\J_Contemp_Brachytherapy_2012_Sep_29_4(3)_176-181.nxml",
        r"articles.O-Z\PLoS_ONE\PLoS_One_2013_Dec_20_8(12)_e85076.nxml",
        r"articles.C-H\Gastroenterol_Rep_(Oxf)\Gastroenterol_Rep_(Oxf)_2013_Sep_17_1(2)_149-152.nxml",
    ]

    # Currently disabled ways of choosing / using the file list:
##    debugging_files=cp.Corpus.selectRandomInputFiles(500,"*.nxml")
##    generateSideBySide(debugging_files)
##    inspectFiles(debugging_files)
Пример #3
0
def main():
    """
        Connects to the local pmc_coresc corpus and calls
        read_jatsxml.generateSideBySide() on a fixed list of annotated files
    """
    import minerva.db.corpora as cp

    cp.useLocalCorpus()
    drive_letter="g"
    corpus_root=drive_letter+":\\nlp\\phd\\pmc_coresc\\"
    cp.Corpus.connectCorpus(corpus_root)

    # Imported only after the corpus is connected, same as before
    import read_jatsxml

    # Files to process; the ## entries are alternatives that are switched off
    files_to_render=[
        r"data\scratch\mpx245\epmc\output\Out_PMC3184115_PMC3205799.xml.gz.gz\3187739_annotated.xml",
##        r"Out_PMC549041_PMC1240567.xml.gz.gz\555959_done.xml",
##        r"Out_PMC549041_PMC1240567.xml.gz.gz\555763_done.xml",
    ]

    read_jatsxml.generateSideBySide(files_to_render)
Пример #4
0
def basicTest():
    """
        Reads one JATS XML article from the PMC corpus and prints the text
        of each of its citations followed by its bibliography, both obtained
        from a CSLRenderer created with the "ama" style argument
    """
    print(__file__)

    import minerva.db.corpora as cp
    cp.useLocalCorpus()
    drive_letter="g"
    cp.Corpus.connectCorpus(drive_letter+":\\nlp\\phd\\pmc")

    from minerva.proc.general_utils import loadFileText
    from minerva.scidoc.xmlformats.read_jatsxml import JATSXMLReader

    # NOTE(review): this raw string has a doubled backslash before the file
    # name, and an absolute drive that ignores drive_letter -- kept as found.
    article_path=r"G:\NLP\PhD\pmc\inputXML\articles.O-Z\PLoS_ONE\\PLoS_One_2013_Dec_20_8(12)_e85076.nxml"

    jats_reader=JATSXMLReader()
    article_xml=loadFileText(article_path)
    doc=jats_reader.read(article_xml,"one")

    # Earlier variant that pointed at the .csl file explicitly:
##    renderer=CSLRenderer(doc,".." + os.sep + "cit_styles" + os.sep + 'ama.csl')
    renderer=CSLRenderer(doc,"ama")

    print("Citations\n\n")
    for citation in doc.citations:
        citation_text=renderer.getCitationText(citation)
        print(citation_text)

    print("Bibliography\n\n")
    for bib_line in renderer.getBibliography():
        print(bib_line)