def fixPaperReferences(annotated_file, pmc_file, pmc_id, original_text=None):
    """
    Overwrite the <ref-list> section of `annotated_file` with the one taken
    from `pmc_file`.

    Verifying that both files are actually the same paper is the caller's job.
    If selectRefListSection raises ValueError for either text, the function
    returns None without writing anything.

    Args:
        annotated_file: path of the file to patch; it is rewritten in place
        pmc_file: path of the file the reference list is taken from
        pmc_id: id handed to selectRefListSection together with `pmc_file`
        original_text: contents of `pmc_file` if the caller already has them;
            when falsy the file is loaded from disk
    """
    target_text = loadFileText(annotated_file)
    source_text = original_text if original_text else loadFileText(pmc_file)

    # Locate the reference list in the PMC original
    try:
        src_from, src_to = selectRefListSection(source_text, pmc_file, pmc_id)
    except ValueError:
        return
    replacement_refs = source_text[src_from:src_to]

    # Locate the section to be replaced in the annotated file
    try:
        dst_from, dst_to = selectRefListSection(
            target_text, annotated_file, getFilePMCID(annotated_file)
        )
    except ValueError:
        return

    head = target_text[:dst_from]
    tail = target_text[dst_to:]
    writeFileText(head + replacement_refs + tail, annotated_file)
def generateSideBySide(doc_list):
    """
    Generates side-by-side visualizations of a list of Paper XML files.

    For every file name in `doc_list` two files are written to the output
    directory:
        <basename>_1.html: the raw text of the input file, unchanged
        <basename>_2.html: the HTML produced by SciDocRenderer after the
            text has been loaded through PaperXMLReader

    Afterwards `file_data.json` is written to the same directory. It holds a
    `file_data=[...];` assignment listing each [_1, _2] file name pair.

    Args:
        doc_list: iterable of file names; each one is appended to
            cp.Corpus.paths.inputXML to build the input path
    """
    reader = PaperXMLReader()
    output_dir = "g:\\nlp\\phd\\aac\\conversion_visualization\\"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    file_list = []
    for filename in doc_list:
        print("Converting %s" % filename)
        input_file = cp.Corpus.paths.inputXML + filename

        # Build both output names from the base name directly, so a base name
        # that happens to contain "_1.html" cannot corrupt the second one.
        base_name = os.path.basename(filename)
        output_file = output_dir + "%s_1.html" % base_name
        output_file2 = output_dir + "%s_2.html" % base_name

        input_text = loadFileText(input_file)
        writeFileText(input_text, output_file)

        doc = reader.read(input_text, input_file)
        # Diagnostic only: report documents that cannot be serialized, then
        # carry on. `Exception` rather than a bare except, so Ctrl-C and
        # SystemExit are no longer swallowed.
        try:
            json.dumps(doc.data)
        except Exception:
            print("Not JSON Serializable!!!!")

        html = SciDocRenderer(doc).prettyPrintDocumentHTML(True, True, True, True)
        writeFileText(html, output_file2)

        file_list.append([os.path.basename(output_file), os.path.basename(output_file2)])

    file_list_json = "file_data=%s;" % json.dumps(file_list)
    writeFileText(file_list_json, output_dir + "file_data.json")
def convertAANmetadata(infile):
    """
    Load the AAN metadata text format and convert it to a dict keyed by
    paper id.

    WARNING: breaks backwards compatibility

    Only records made of consecutive `id`, `author`, `title`, `venue` and
    `year` lines of the form `key = {value}` are picked up, and every value
    must be non-empty and on a single line; anything else is skipped.

    Args:
        infile: path to acl-metadata.txt
    Returns:
        dict where [lower-cased id] = {
            "authors": list of {"given", "family"} dicts,
            "surnames": list of family names,
            "title", "conference", "year": strings as found in the file,
            "corpus_id": the same lower-cased id}
    """
    alltext = loadFileText(infile)

    # One record = five consecutive "key = {value}" lines
    record_pattern = (
        r"id\s\=\s\{(.+?)\}\n"
        r"author\s\=\s\{(.+?)\}\n"
        r"title\s\=\s\{(.+?)\}\n"
        r"venue\s\=\s\{(.+?)}\n"
        r"year\s\=\s\{(.+?)\}"
    )

    filedict = {}
    for match in re.finditer(record_pattern, alltext, re.IGNORECASE):
        fn = match.group(1).lower()

        # Authors are ";"-separated, each one written as "Family, Given"
        surnames = []
        parsed_authors = []
        for author in match.group(2).split(";"):
            bits = author.split(",")
            family = bits[0].strip()
            surnames.append(family)
            parsed_authors.append({"given": "".join(bits[1:]).strip(), "family": family})

        filedict[fn] = {
            "authors": parsed_authors,
            "surnames": surnames,
            "title": match.group(3),
            "conference": match.group(4),
            "year": match.group(5),
            "corpus_id": fn,
        }

    return filedict
def readFile(self, filename):
    """
    Load the contents of `filename` and pass them on to self.read().

    Args:
        filename: full path to file to read
    Returns:
        whatever self.read(text, filename) returns
    """
    file_contents = loadFileText(filename)
    return self.read(file_contents, filename)
def getPaperPMCID(filename):
    """
    Loads a JATS file and looks for its <article-id pub-id-type="pmcid">
    element (case-insensitive match).

    Args:
        filename: path of the file to load
    Returns:
        (pmcid, loaded text) tuple when the element is found; None otherwise,
        after printing a message naming the file
    """
    original_text = loadFileText(filename)
    id_match = re.search(r"<article-id pub-id-type=\"pmcid\">(.*?)</article-id>", original_text, re.IGNORECASE)

    if id_match is not None:
        return id_match.group(1), original_text

    print("File %s has no original pmcid " % filename)
    return None
def fixPaperReferences(annotated_file, pmc_file, original_text=None, pmc_id=None):
    """
    Replaces the <ref-list> section in `annotated_file` with that from
    `pmc_file`.

    Checking that they are actually the same paper is done outside.

    Args:
        annotated_file: path of the file to patch; it is rewritten in place
        pmc_file: path of the file the reference list is copied from
        original_text: contents of `pmc_file` if already loaded; when falsy
            the file is read from disk
        pmc_id: id passed to selectRefListSection. Optional: when None it is
            looked up in `original_text` via the
            <article-id pub-id-type="pmcid"> element, and stays None if that
            element is absent.
    Raises:
        Anything selectRefListSection raises is propagated; in that case
        nothing is written.
    """
    # Function-scope import keeps this block independent of file-level imports
    import re

    annotated_text = loadFileText(annotated_file)
    if not original_text:
        original_text = loadFileText(pmc_file)

    # `pmc_id` used to be referenced without ever being defined (NameError on
    # every call). It is now an optional argument with a fallback lookup.
    if pmc_id is None:
        id_match = re.search(r"<article-id pub-id-type=\"pmcid\">(.*?)</article-id>", original_text, re.IGNORECASE)
        if id_match:
            pmc_id = id_match.group(1)

    orig_start, orig_end = selectRefListSection(original_text, pmc_file, pmc_id)
    original_refs = original_text[orig_start:orig_end]

    # The section to replace must be located in the ANNOTATED text; offsets
    # taken from the original text do not apply to it.
    # NOTE(review): the same pmc_id is passed for both files because they are
    # expected to be the same paper - confirm selectRefListSection only needs
    # it for identification.
    annot_start, annot_end = selectRefListSection(annotated_text, annotated_file, pmc_id)

    annotated_text = annotated_text[:annot_start] + original_refs + annotated_text[annot_end:]
    writeFileText(annotated_text, annotated_file)
def readFile(self, filename):
    """
    Read an XML file from disk and load it through self.read().

    Args:
        filename: full path to file to read
    Returns:
        SciDoc instance, i.e. the result of self.read(text, filename)
    """
    xml_text = loadFileText(filename)
    loaded_doc = self.read(xml_text, filename)
    return loaded_doc
def basicTest():
    """
    Manual smoke test: connects to a local corpus, reads one hard-coded JATS
    file, then prints the text CSLRenderer produces for every citation in the
    document, followed by its bibliography lines, using the "ama" style.

    The paths are hard-coded to one specific Windows drive layout.
    """
    print(__file__)

    import minerva.db.corpora as cp
    drive_letter = "g"
    cp.useLocalCorpus()
    cp.Corpus.connectCorpus(drive_letter + ":\\nlp\\phd\\pmc")

    from minerva.proc.general_utils import loadFileText
    from minerva.scidoc.xmlformats.read_jatsxml import JATSXMLReader

    jats_reader = JATSXMLReader()
    sample_text = loadFileText(r"G:\NLP\PhD\pmc\inputXML\articles.O-Z\PLoS_ONE\\PLoS_One_2013_Dec_20_8(12)_e85076.nxml")
    doc = jats_reader.read(sample_text, "one")

    # The style is given by name; an explicit path was used previously:
    # CSLRenderer(doc, ".." + os.sep + "cit_styles" + os.sep + 'ama.csl')
    csl_renderer = CSLRenderer(doc, "ama")

    print("Citations\n\n")
    for citation in doc.citations:
        print(csl_renderer.getCitationText(citation))

    print("Bibliography\n\n")
    for bib_line in csl_renderer.getBibliography():
        print(bib_line)