def processAtharCorpus(infiles, outfile):
    """
    Loads the Athar .html corpus and dumps its documents into a JSON file.

    Every file matching `infiles` is parsed with AtharCorpusReader.readFile().
    The output is a single JSON object mapping each document's
    metadata["guid"] to that document's `data`. If two documents share a
    guid, the one read last wins.

    NOTE(review): readFile() also returns a list of contexts, but nothing
    is done with them here; only the documents are written out.

    Args:
        infiles: file mask of HTML files to load (expanded with glob.glob)
        outfile: name of .json file to write to
    """
    import glob

    cp.useLocalCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac")

    reader = AtharCorpusReader()
    all_docs = []

    for f in glob.glob(infiles):
        print("Loading ",f)
        # The second return value (contexts) is intentionally discarded.
        docs, _contexts = reader.readFile(f)
        all_docs.extend(docs)

    doc_dict = {doc.metadata["guid"]: doc.data for doc in all_docs}

    # open() + `with` instead of the Python-2-only file() builtin: the handle
    # is flushed and closed even if json.dump() raises half-way through.
    with open(outfile, "w") as out_handle:
        json.dump(doc_dict, out_handle)
def main():
    """
    Debugging entry point: connects to the local PMC corpus.

    Only the corpus connection has an effect. The list of sample .nxml
    files is built but not consumed, because the calls that would use it
    are commented out below.

    NOTE(review): another main() is defined further down in this source;
    if both live in the same module, the later definition replaces this one.
    """
    cp.useLocalCorpus()

    drive_letter = "g"
    corpus_root = drive_letter + ":\\nlp\\phd\\pmc"
    cp.Corpus.connectCorpus(corpus_root)

    # Hand-picked articles, relative to the corpus input directory.
    sample_files = [
        r"articles.I-N\J_Contemp_Brachytherapy\J_Contemp_Brachytherapy_2012_Sep_29_4(3)_176-181.nxml",
        r"articles.O-Z\PLoS_ONE\PLoS_One_2013_Dec_20_8(12)_e85076.nxml",
        r"articles.C-H\Gastroenterol_Rep_(Oxf)\Gastroenterol_Rep_(Oxf)_2013_Sep_17_1(2)_149-152.nxml",
    ]

    # Disabled debugging steps, kept for reference:
    ##    sample_files=cp.Corpus.selectRandomInputFiles(500,"*.nxml")
    ##    generateSideBySide(sample_files)
    ##    inspectFiles(sample_files)
def main():
    """
    Debugging entry point for the pmc_coresc corpus.

    Connects to the local corpus, then passes a fixed list of annotated
    XML files to read_jatsxml.generateSideBySide().
    """
    import minerva.db.corpora as cp

    cp.useLocalCorpus()
    drive_letter = "g"
    cp.Corpus.connectCorpus(drive_letter + ":\\nlp\\phd\\pmc_coresc\\")

    # Imported only after the corpus is connected, as in the original order.
    import read_jatsxml

    files_to_render = [
        r"data\scratch\mpx245\epmc\output\Out_PMC3184115_PMC3205799.xml.gz.gz\3187739_annotated.xml",
        # Other candidates, currently disabled:
        ##    r"Out_PMC549041_PMC1240567.xml.gz.gz\555959_done.xml",
        ##    r"Out_PMC549041_PMC1240567.xml.gz.gz\555763_done.xml",
    ]

    read_jatsxml.generateSideBySide(files_to_render)
def basicTest():
    """
    Manual smoke test: reads one JATS XML article and prints its references.

    Connects to the local PMC corpus and parses a single hard-coded PLoS ONE
    article with JATSXMLReader. It then prints, via a CSLRenderer created
    with the "ama" style, the text of every citation in the document followed
    by every line of the bibliography.
    """
    print(__file__)

    import minerva.db.corpora as cp

    cp.useLocalCorpus()
    drive_letter = "g"
    cp.Corpus.connectCorpus(drive_letter + ":\\nlp\\phd\\pmc")

    from minerva.proc.general_utils import loadFileText
    from minerva.scidoc.xmlformats.read_jatsxml import JATSXMLReader

    article_path = r"G:\NLP\PhD\pmc\inputXML\articles.O-Z\PLoS_ONE\\PLoS_One_2013_Dec_20_8(12)_e85076.nxml"
    xml_reader = JATSXMLReader()
    article_text = loadFileText(article_path)
    doc = xml_reader.read(article_text, "one")

    # Alternative kept for reference: point the renderer at a .csl file path.
    ##    csl=CSLRenderer(doc,".." + os.sep + "cit_styles" + os.sep + 'ama.csl')
    csl = CSLRenderer(doc, "ama")

    print("Citations\n\n")
    for citation in doc.citations:
        citation_text = csl.getCitationText(citation)
        print(citation_text)

    print("Bibliography\n\n")
    for entry in csl.getBibliography():
        print(entry)