def convert(inFiles, inFormat, outFile, outFormat):
    """Convert a set of input files into a single BioC XML or text file.

    Every file in ``inFiles`` is parsed with ``docs2bioc`` and the resulting
    documents are appended, in order, to ``outFile``. For 'txt' output each
    passage's text is written followed by a blank line.

    :param inFiles: Iterable of input file paths
    :param inFormat: Format of the inputs (must be in ``acceptedInFormats``)
    :param outFile: Path of the output file to create
    :param outFormat: Output format (must be in ``acceptedOutFormats``)
    :raises AssertionError: If either format is not an accepted one
    """
    assert inFormat in acceptedInFormats, "%s is not an accepted input format. Options are: %s" % (
        inFormat, "/".join(acceptedInFormats))
    assert outFormat in acceptedOutFormats, "%s is not an accepted output format. Options are: %s" % (
        outFormat, "/".join(acceptedOutFormats))

    outBiocHandle, outTxtHandle = None, None
    if outFormat == 'biocxml':
        outBiocHandle = bioc.BioCXMLDocumentWriter(outFile)
    elif outFormat == 'txt':
        # The third positional argument of open() is `buffering`, not the
        # encoding, so the encoding has to be given by keyword.
        outTxtHandle = open(outFile, 'w', encoding='utf-8')

    try:
        for inFile in inFiles:
            for biocDoc in docs2bioc(inFile, inFormat):
                if outFormat == 'biocxml':
                    outBiocHandle.write_document(biocDoc)
                elif outFormat == 'txt':
                    for passage in biocDoc.passages:
                        outTxtHandle.write(passage.text)
                        outTxtHandle.write("\n\n")
    finally:
        # Release the output handle even when a conversion fails part-way
        if outBiocHandle is not None:
            outBiocHandle.close()
        if outTxtHandle is not None:
            outTxtHandle.close()
def pubmedxml2bioc(pubmedxmlFilename, biocFilename):
    """Convert a PubMed XML file into a BioC XML file.

    Each record yielded by ``processMedlineFile`` becomes one BioC document.
    Its metadata is stored in the document infons and every title/abstract
    text block becomes a passage, with offsets accumulated over the trimmed
    text lengths.

    :param pubmedxmlFilename: Path of the PubMed XML file to read
    :param biocFilename: Path of the BioC XML file to write
    """
    # (infon key, record key) pairs that are copied over unchanged
    leadingFields = (
        ('pmid', 'pmid'),
        ('year', 'pubYear'),
        ('month', 'pubMonth'),
        ('day', 'pubDay'),
        ('journal', 'journal'),
        ('journalISO', 'journalISO'),
    )
    trailingFields = ('chemicals', 'meshHeadings')

    with bioc.BioCXMLDocumentWriter(biocFilename) as biocWriter:
        for record in processMedlineFile(pubmedxmlFilename):
            document = bioc.BioCDocument()
            document.id = record["pmid"]

            document.infons['title'] = " ".join(record["title"])
            for infonKey, recordKey in leadingFields:
                document.infons[infonKey] = record[recordKey]
            document.infons['authors'] = ", ".join(record["authors"])
            for sharedKey in trailingFields:
                document.infons[sharedKey] = record[sharedKey]

            cursor = 0
            for sectionName in ("title", "abstract"):
                for rawText in record[sectionName]:
                    trimmedText = trimSentenceLengths(rawText)

                    passage = bioc.BioCPassage()
                    passage.infons['section'] = sectionName
                    passage.text = trimmedText
                    passage.offset = cursor
                    cursor += len(trimmedText)

                    document.add_passage(passage)

            biocWriter.write_document(document)
def marcxml2bioc(marcxmlFilename, biocFilename):
    """Convert a MARC XML file into a BioC XML file.

    The input is opened in binary mode and handed to ``pymarc.map_xml``; each
    record it produces is passed, together with the output writer, to
    ``writeMarcXMLRecordToBiocFile``.

    :param marcxmlFilename: Path of the MARC XML file to read
    :param biocFilename: Path of the BioC XML file to write
    """
    with open(marcxmlFilename, 'rb') as marcStream:
        with bioc.BioCXMLDocumentWriter(biocFilename) as biocWriter:
            pymarc.map_xml(
                lambda marcRecord: writeMarcXMLRecordToBiocFile(marcRecord, biocWriter),
                marcStream,
            )
def splitBioc(inBioc, outDir, maxLength, stripAnnotations=False):
    """Split one BioC XML file into a series of smaller BioC XML files.

    Output files are named ``%08d.bioc.xml`` (numbered from 0) inside
    ``outDir``. A new file is started whenever adding the next document would
    push the text length of the current file above ``maxLength``. Documents
    whose 'pmid' infon was already seen are skipped.

    :param inBioc: Path of the BioC XML file to split (must exist)
    :param outDir: Existing directory that receives the output files
    :param maxLength: Maximum total passage text length per output file. A
        falsy value disables splitting.
    :param stripAnnotations: If True, annotations and relations are removed
        from every passage before writing
    :raises AssertionError: If the input file or output directory is missing,
        or a document has no passages or no text
    """
    assert os.path.isfile(inBioc)
    assert os.path.isdir(outDir)

    pmids = set()
    textLength = 0

    docNumber = 0
    docName = os.path.join(outDir, "%08d.bioc.xml" % docNumber)
    writer = bioc.BioCXMLDocumentWriter(docName)

    try:
        with open(inBioc, 'rb') as f:
            for doc in bioc.BioCXMLDocumentReader(f):
                if 'pmid' in doc.infons:
                    if doc.infons['pmid'] in pmids:
                        continue
                    pmids.add(doc.infons['pmid'])

                thisDocLength = sum(len(passage.text) for passage in doc.passages)
                assert len(doc.passages) > 0 and thisDocLength > 0, \
                    "Corpus file cannot contain empty documents"

                if stripAnnotations:
                    for passage in doc.passages:
                        passage.annotations = []
                        passage.relations = []

                # Roll over to the next file, but never leave a file empty
                if textLength > 0 and maxLength and (textLength + thisDocLength) > maxLength:
                    textLength = 0
                    docNumber += 1
                    docName = os.path.join(outDir, "%08d.bioc.xml" % docNumber)
                    writer.close()
                    # Cleared so the finally clause cannot close it twice if
                    # opening the next writer fails
                    writer = None
                    writer = bioc.BioCXMLDocumentWriter(docName)

                textLength += thisDocLength
                writer.write_document(doc)
    finally:
        # Close the current output file even if reading or validation failed
        if writer is not None:
            writer.close()

    # Nothing was written to the last (only) file, so do not leave it behind
    if textLength == 0:
        os.remove(docName)
def test_BioCXMLDocumentWriter_io():
    """Round-trip a collection through a writer backed by an in-memory buffer."""
    source_collection = _get_collection()

    buffer = io.BytesIO()
    xml_writer = bioc.BioCXMLDocumentWriter(buffer)
    xml_writer.write_collection_info(source_collection)
    for doc in source_collection.documents:
        xml_writer.write_document(doc)
    xml_writer.close()

    # Parse what was written and check it against the expected content
    xml_text = buffer.getvalue().decode('utf-8')
    reloaded = bioc.loads(xml_text)
    assert_everything(reloaded)
def mergeBioc(inDir, outBioc):
    """Merge every BioC XML file in a directory into a single BioC XML file.

    Files whose name ends in '.xml' (case-insensitive) are used, except those
    ending in '.ga.xml'. They are processed in sorted path order.

    :param inDir: Directory containing the BioC XML files to merge
    :param outBioc: Path of the merged BioC XML file to write
    """
    sourcePaths = []
    for entry in os.listdir(inDir):
        lowered = entry.lower()
        if lowered.endswith('.xml') and not lowered.endswith('.ga.xml'):
            sourcePaths.append(os.path.join(inDir, entry))
    sourcePaths.sort()

    with bioc.BioCXMLDocumentWriter(outBioc) as mergedWriter:
        for sourcePath in sourcePaths:
            with open(sourcePath, 'rb') as stream:
                for document in bioc.BioCXMLDocumentReader(stream):
                    mergedWriter.write_document(document)
def test_BioCXMLDocumentWriter_file():
    """Round-trip a collection through a writer that is given a file path."""
    import os

    collection = _get_collection()

    # tempfile.mktemp() is deprecated and race-prone, and the file it named
    # was never deleted. A temporary directory gives a private path that is
    # cleaned up automatically.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp = os.path.join(tmp_dir, 'collection.xml')

        with bioc.BioCXMLDocumentWriter(tmp) as writer:
            writer.write_collection_info(collection)
            for document in collection.documents:
                writer.write_document(document)

        with open(tmp, encoding='utf8') as fp:
            collection = bioc.load(fp)

    assert_everything(collection)
def write_bioc_collection(filename: str, collection: bioc.BioCCollection):
    """Serialise a BioC collection to an XML file.

    The collection-level information is written first, followed by each of
    the collection's documents in order.

    :param filename: path of the XML file to write
    :param collection: the bioc collection to serialise
    :returns: always 1
    """
    with bioc.BioCXMLDocumentWriter(filename) as xml_writer:
        xml_writer.write_collection_info(collection)
        for doc in collection.documents:
            xml_writer.write_document(doc)
    return 1
def save(corpus, dataFormat, path):
    """
    Write a corpus to disk in one of the supported formats

    :param corpus: The corpus of documents to save
    :param dataFormat: Format to save in ('standoff', 'biocxml', 'pubannotation' or 'csv')
    :param path: Destination path. Must be an existing directory for 'standoff' and must not be an existing directory for the other formats.
    :type corpus: kindred.Corpus
    :type dataFormat: str
    :type path: str
    """
    assert dataFormat in ['standoff', 'biocxml', 'pubannotation', 'csv']
    assert isinstance(corpus, kindred.Corpus)

    if dataFormat == 'standoff':
        assert os.path.isdir(path), "Path must be an existing directory"

        for index, document in enumerate(corpus.documents):
            # Documents without a source filename get a zero-padded index
            stem = "%08d" % index if document.sourceFilename is None else document.sourceFilename
            txtPath, a1Path, a2Path = [
                os.path.join(path, template % stem)
                for template in ('%s.txt', '%s.a1', '%s.a2')
            ]
            saveDocToSTFormat(document, txtPath, a1Path, a2Path)
        return

    if dataFormat == 'biocxml':
        assert not os.path.isdir(path), "Path cannot be an existing directory for 'biocxml'."

        biocCollection = convertKindredCorpusToBioCCollection(corpus)
        with bioc.BioCXMLDocumentWriter(path) as biocWriter:
            for biocDoc in biocCollection.documents:
                biocWriter.write_document(biocDoc)
        return

    if dataFormat == 'pubannotation':
        assert not os.path.isdir(path), "Path cannot be an existing directory for 'pubannotation'."
        saveCorpusToPubAnnotationFormat(corpus, path)
        return

    if dataFormat == 'csv':
        assert not os.path.isdir(path), "Path cannot be an existing directory for 'csv'."
        saveCorpusToCSVFormat(corpus, path)
def pmcxml2bioc(pmcxmlFilename, biocFilename):
    """Convert a PMC XML file into a BioC XML file.

    Each article yielded by ``processPMCFile`` becomes one BioC document with
    its metadata stored as infons. Every text block becomes a passage tagged
    with its section (the text-source group name) and the most recent
    subsection heading seen within that group.

    :param pmcxmlFilename: Path of the PMC XML file to read
    :param biocFilename: Path of the BioC XML file to write
    :raises RuntimeError: If the PMC XML cannot be parsed
    """
    # (infon key, article key) pairs that are copied over unchanged
    copiedFields = (
        ('pmid', 'pmid'),
        ('pmcid', 'pmcid'),
        ('doi', 'doi'),
        ('year', 'pubYear'),
        ('month', 'pubMonth'),
        ('day', 'pubDay'),
        ('journal', 'journal'),
        ('journalISO', 'journalISO'),
    )

    try:
        with bioc.BioCXMLDocumentWriter(biocFilename) as biocWriter:
            for article in processPMCFile(pmcxmlFilename):
                document = bioc.BioCDocument()
                document.id = article["pmid"]

                textSources = article["textSources"]
                document.infons['title'] = " ".join(textSources["title"])
                for infonKey, articleKey in copiedFields:
                    document.infons[infonKey] = article[articleKey]

                position = 0
                for groupName, groupTexts in textSources.items():
                    # The subsection carries forward until the next heading
                    currentSubsection = None
                    for rawText in groupTexts:
                        text = trimSentenceLengths(rawText)
                        passage = bioc.BioCPassage()

                        # Headings are matched after dropping numbering
                        heading = text.lower().strip('01234567890. ')
                        if heading in allowedSubsections:
                            currentSubsection = heading

                        passage.infons['section'] = groupName
                        passage.infons['subsection'] = currentSubsection
                        passage.text = text
                        passage.offset = position
                        position += len(text)

                        document.add_passage(passage)

                biocWriter.write_document(document)
    except etree.ParseError:
        raise RuntimeError("Parsing error in PMC xml file: %s" % pmcxmlFilename)
def convertFiles(inFiles, inFormat, outFile, outFormat, idFilterfiles=None):
    """Convert a set of input files into a single BioC XML or text file.

    Each input is first converted to BioC XML in a temporary file, which is
    then appended to the output with ``mergeBioc`` ('bioc') or ``bioc2txt``
    ('txt').

    :param inFiles: List of input file paths
    :param inFormat: One of 'bioc', 'pubmedxml', 'marcxml', 'pmcxml' or 'uimaxmi'
    :param outFile: Path of the output file to create
    :param outFormat: 'bioc' or 'txt'
    :param idFilterfiles: Optional list, parallel to ``inFiles``, of paths to
        files holding one ID per line. Each set of IDs is passed on as the
        filter for the matching input. A None entry (or None for the whole
        argument) means no filter.
    :raises RuntimeError: If the input or output format is unknown
    """
    outBiocHandle, outTxtHandle = None, None

    if outFormat == 'bioc':
        outBiocHandle = bioc.BioCXMLDocumentWriter(outFile)
    elif outFormat == 'txt':
        outTxtHandle = codecs.open(outFile, 'w', 'utf-8')

    if idFilterfiles is None:
        idFilterfiles = [None] * len(inFiles)

    print("Converting %d files to %s" % (len(inFiles), outFile))
    try:
        for inFile, idFilterfile in zip(inFiles, idFilterfiles):
            if idFilterfile is None:
                idFilter = None
            else:
                with open(idFilterfile) as f:
                    idFilter = {line.strip() for line in f}

            with tempfile.NamedTemporaryFile() as temp:
                if inFormat == 'bioc':
                    shutil.copyfile(inFile, temp.name)
                elif inFormat == 'pubmedxml':
                    pubmedxml2bioc(inFile, temp.name)
                elif inFormat == 'marcxml':
                    marcxml2bioc(inFile, temp.name)
                elif inFormat == 'pmcxml':
                    pmcxml2bioc(inFile, temp.name)
                elif inFormat == 'uimaxmi':
                    uimaxmi2bioc(inFile, temp.name)
                else:
                    raise RuntimeError("Unknown input format: %s" % inFormat)

                if outFormat == 'bioc':
                    mergeBioc(temp.name, outBiocHandle, idFilter)
                elif outFormat == 'txt':
                    bioc2txt(temp.name, outTxtHandle, idFilter)
                else:
                    raise RuntimeError("Unknown output format: %s" % outFormat)
    finally:
        # The output handles were never closed before, so the BioC writer
        # was not finalised and the text file was not explicitly flushed.
        # Close them whether or not the conversion succeeded.
        if outBiocHandle is not None:
            outBiocHandle.close()
        if outTxtHandle is not None:
            outTxtHandle.close()

    print("Output to %s complete" % outFile)
def convert(in_files, in_format, out_file, out_format, **kwargs):
    """Convert a set of input files into a single BioC XML or text file.

    Every file in ``in_files`` is parsed with ``docs2bioc`` and the resulting
    documents are appended, in order, to ``out_file``. For "txt" output each
    passage's text is written followed by a blank line.

    :param in_files: Iterable of input file paths
    :param in_format: Format of the inputs (must be in ``accepted_in_formats``)
    :param out_file: Path of the output file to create
    :param out_format: Output format (must be in ``accepted_out_formats``)
    :param kwargs: Extra keyword arguments forwarded to ``docs2bioc``
    :raises AssertionError: If either format is not an accepted one
    """
    assert (
        in_format in accepted_in_formats
    ), "%s is not an accepted input format. Options are: %s" % (
        in_format,
        "/".join(accepted_in_formats),
    )
    assert (
        out_format in accepted_out_formats
    ), "%s is not an accepted output format. Options are: %s" % (
        out_format,
        "/".join(accepted_out_formats),
    )

    out_bioc_handle, out_txt_handle = None, None
    if out_format == "biocxml":
        out_bioc_handle = bioc.BioCXMLDocumentWriter(out_file)
    elif out_format == "txt":
        out_txt_handle = open(out_file, "w", encoding="utf-8")

    try:
        for in_file in in_files:
            for bioc_doc in docs2bioc(in_file, in_format, **kwargs):
                if out_format == "biocxml":
                    out_bioc_handle.write_document(bioc_doc)
                elif out_format == "txt":
                    for passage in bioc_doc.passages:
                        out_txt_handle.write(passage.text)
                        out_txt_handle.write("\n\n")
    finally:
        # Release the output handle even when a conversion fails part-way
        if out_bioc_handle is not None:
            out_bioc_handle.close()
        if out_txt_handle is not None:
            out_txt_handle.close()
def mergeBiocWithMetadata(metaDir, inDir, outBioc):
    """Merge BioC XML files, copying infons from same-named metadata files.

    Every '.xml' file in ``inDir`` (excluding '.ga.xml', case-insensitive) is
    paired with the file of the same name in ``metaDir``. Documents and
    passages are matched by position. The metadata infons are merged into the
    input document, which is then written to ``outBioc``.

    :param metaDir: Directory holding the BioC XML files that carry metadata
    :param inDir: Directory holding the BioC XML files to merge
    :param outBioc: Path of the merged BioC XML file to write
    :raises AssertionError: If paired documents differ in passage count or
        passage text
    """
    xmlNames = []
    for entry in os.listdir(inDir):
        lowered = entry.lower()
        if lowered.endswith('.xml') and not lowered.endswith('.ga.xml'):
            xmlNames.append(entry)
    xmlNames.sort()

    with bioc.BioCXMLDocumentWriter(outBioc) as mergedWriter:
        for xmlName in xmlNames:
            textPath = os.path.join(inDir, xmlName)
            metaPath = os.path.join(metaDir, xmlName)

            with open(textPath, 'rb') as textStream:
                with open(metaPath, 'rb') as metaStream:
                    textDocs = bioc.BioCXMLDocumentReader(textStream)
                    metaDocs = bioc.BioCXMLDocumentReader(metaStream)

                    for textDoc, metaDoc in zip(textDocs, metaDocs):
                        assert len(textDoc.passages) == len(metaDoc.passages)

                        passagePairs = zip(textDoc.passages, metaDoc.passages)
                        for textPassage, metaPassage in passagePairs:
                            assert textPassage.text == metaPassage.text
                            textPassage.infons.update(metaPassage.infons)

                        textDoc.infons.update(metaDoc.infons)
                        mergedWriter.write_document(textDoc)
def uimaxmi2bioc(xmiFilename, biocFilename):
    """Convert a UIMA XMI file into a BioC XML file holding one document.

    The document title is read from the DocumentMetaData element and the text
    from the Sofa element's ``sofaString`` attribute. The text is stored as a
    single passage in section 'article' at offset 0.

    :param xmiFilename: Path of the UIMA XMI file to read
    :param biocFilename: Path of the BioC XML file to write
    """
    xmiRoot = etree.parse(xmiFilename).getroot()

    title = xmiRoot.find(
        '{http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore}DocumentMetaData'
    ).attrib['documentTitle']
    sofaText = xmiRoot.find('{http:///uima/cas.ecore}Sofa').attrib['sofaString']

    with bioc.BioCXMLDocumentWriter(biocFilename) as biocWriter:
        document = bioc.BioCDocument()
        document.id = None
        document.infons['title'] = title

        articlePassage = bioc.BioCPassage()
        articlePassage.infons['section'] = 'article'
        articlePassage.text = sofaText
        articlePassage.offset = 0
        document.add_passage(articlePassage)

        biocWriter.write_document(document)
pmidToAnnotations = defaultdict(list) with open(args.annotations) as f: for line in f: split = line.strip('\n').split('\t') pmid,annotationType,conceptid,mentions,database = split mentions = mentions.strip() pmid = int(pmid) if len(mentions) > 0 and pmid in pmids: pmidToAnnotations[pmid].append((annotationType,conceptid,mentions)) print("Starting text alignment...") currentID = 1 writer = bioc.BioCXMLDocumentWriter(args.outBioc) #with bioc.BioCXMLDocumentReader(args.inBioc) as parser: with open(args.inBioc,'rb') as f: parser = bioc.BioCXMLDocumentReader(f) for i,doc in enumerate(parser): for passage in doc.passages: passage.annotations = [] if 'pmid' in doc.infons and doc.infons['pmid']: pmid = int(doc.infons['pmid']) #print(now(),i,pmid) #sys.stdout.flush() for passage in doc.passages: candidates = defaultdict(lambda : defaultdict(list))
import argparse import bioc if __name__ == '__main__': parser = argparse.ArgumentParser(description='Make some minor fixes to BioC files to make them play nicely with some NER tools') parser.add_argument('--inBiocXML',type=str,required=True,help='Input BioC XML file') parser.add_argument('--outBiocXML',type=str,required=True,help='Output BioC XML file') args = parser.parse_args() pmids = set() textLength = 0 with open(args.inBiocXML,'rb') as f, bioc.BioCXMLDocumentWriter(args.outBiocXML) as writer: parser = bioc.BioCXMLDocumentReader(f) for i,doc in enumerate(parser): if doc.infons['pmid'] in pmids: continue pmids.add(doc.infons['pmid']) for passage in doc.passages: if 'section' in passage.infons: passage.infons['type'] = passage.infons['section'] else: passage.infons['type'] = 'unknown' passage.text = passage.text.strip() thisDocLength = sum( len(passage.text) for passage in doc.passages ) if len(doc.passages) == 0 or thisDocLength == 0:
parser.add_argument('--outFile',required=True,type=str,help='File to save to')
args = parser.parse_args()

# 'biocxml' is the only format this script accepts
assert args.format == 'biocxml'

# groupings.json lists, per block, the source archive and the files to
# extract from it
grouping_file = os.path.join(args.pmcDir,'groupings.json')
with open(grouping_file) as f:
    block = json.load(f)['groups'][args.block]
source = os.path.join(args.pmcDir, block['src'])
files_to_extract = block['group']

# The archive is opened as a context manager so it is closed once every
# file has been converted (it used to be left open)
with bioc.BioCXMLDocumentWriter(args.outFile) as writer, tarfile.open(source) as tar:
    for filename in files_to_extract:
        member = tar.getmember(filename)
        file_handle = tar.extractfile(member)
        data = file_handle.read().decode('utf-8')

        for biocDoc in pmcxml2bioc(io.StringIO(data)):
            writer.write_document(biocDoc)

print("Saved %d documents to %s" % (len(files_to_extract), args.outFile))
parser.add_argument('--outFile',required=True,type=str,help='File to save to')
parser.add_argument('--db',action='store_true',help="Whether to output as an SQLite database")
args = parser.parse_args()

# 'biocxml' is the only format this script accepts
assert args.format == 'biocxml'

# groupings.json lists, per block, the source archive and the files to
# extract from it
grouping_file = os.path.join(args.pmcDir,'groupings.json')
with open(grouping_file) as f:
    block = json.load(f)['groups'][args.block]
source = os.path.join(args.pmcDir, block['src'])
files_to_extract = block['group']

with tempfile.NamedTemporaryFile() as tf_out:
    # With --db the BioC XML goes to the temporary file rather than outFile
    out_file = tf_out.name if args.db else args.outFile

    # The archive is opened as a context manager so it is closed once every
    # file has been converted (it used to be left open)
    with bioc.BioCXMLDocumentWriter(out_file) as writer, tarfile.open(source) as tar:
        for filename in files_to_extract:
            try:
                member = tar.getmember(filename)
            except KeyError:
                print("WARNING. Didn't find %s in %s. Skipping" % (filename,source))
                continue

            file_handle = tar.extractfile(member)
            data = file_handle.read().decode('utf-8')

            for bioc_doc in pmcxml2bioc(io.StringIO(data)):
                writer.write_document(bioc_doc)