import bioc
import codecs

def convertBioC2TSV(biocFilename, tsvFilename):
    with bioc.iterparse(biocFilename) as inBioC, codecs.open(tsvFilename, 'w', 'utf-8') as outTSV:
        # Output the headers to the TSV file
        headers = ['pmid', 'year', 'title', 'abstract']
        outTSV.write("\t".join(headers) + "\n")

        # Iterate over every document (PubMed citation)
        for docCount, biocDoc in enumerate(inBioC):
            # Get some metadata for the citation
            pmid = biocDoc.infons['pmid']
            year = biocDoc.infons['year']
            title = biocDoc.infons['title']

            # Get the abstract text (it may span multiple passages)
            abstract = [passage.text for passage in biocDoc.passages
                        if passage.infons['section'] == 'abstract']
            if len(abstract) > 0:
                abstract = " ".join(abstract).replace('\n', '').replace('\r', '')
            else:
                abstract = ""

            # Save to the file
            output = [pmid, year, title, abstract]
            outTSV.write("\t".join(output) + "\n")

            if (docCount + 1) % 1000 == 0:
                print("Processed %d documents..." % (docCount + 1))

    print("Complete")
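A minimal way to call it (a sketch; both filenames are hypothetical, and the documents are assumed to carry the 'pmid', 'year' and 'title' infons the function reads):

convertBioC2TSV('pubmed_citations.bioc.xml', 'pubmed_citations.tsv')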
import bioc

def bioc2txt(biocFilename, txtHandle, idFilter):
    # Write the text of every passage in the matching documents, separated by blank lines
    with bioc.iterparse(biocFilename) as parser:
        for biocDoc in parser:
            if idFilter is None or biocDoc.id in idFilter:
                for passage in biocDoc.passages:
                    txtHandle.write(passage.text)
                    txtHandle.write("\n\n")
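It takes an already-open text handle and an optional ID filter, for instance (a sketch; the filename and PMIDs are made up):

with open('passages.txt', 'w', encoding='utf-8') as txtHandle:
    bioc2txt('collection.bioc.xml', txtHandle, idFilter={'12345678', '23456789'})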
import os

import bioc
import kindred

def iterLoad(dataFormat, path, corpusSizeCutoff=500):
    """
    Iteratively load sections of a (presumably large) corpus. This creates a generator
    that provides kindred.Corpus objects, each a subset of the larger corpus. Use it to
    lower the memory requirements, so that the entire file doesn't need to be loaded
    into memory at one time.

    :param dataFormat: Format of the data files to load (only 'biocxml' is currently supported)
    :param path: Path to data. Can be a directory or an individual file (for bioc, json or simpletag)
    :param corpusSizeCutoff: Approximate maximum number of documents in each corpus subset
    :type dataFormat: str
    :type path: str
    :type corpusSizeCutoff: int
    :return: Subsets of the corpus
    :rtype: A kindred.Corpus generator
    """
    assert dataFormat == 'biocxml'

    corpus = kindred.Corpus()

    if os.path.isdir(path):
        filenames = [os.path.join(path, x) for x in os.listdir(path) if x.endswith('bioc.xml')]
    else:
        filenames = [path]

    for filename in filenames:
        with bioc.iterparse(filename) as parser:
            for document in parser:
                # Yield the current subset once it reaches the cutoff, then start a fresh one
                if len(corpus.documents) >= corpusSizeCutoff:
                    yield corpus
                    corpus = kindred.Corpus()

                kindredDocs = convertBiocDocToKindredDocs(document)
                for kindredDoc in kindredDocs:
                    corpus.addDocument(kindredDoc)

    # Yield any remaining documents
    if len(corpus.documents) > 0:
        yield corpus
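Consuming the generator might look like this (a sketch; the directory path is hypothetical, and convertBiocDocToKindredDocs is assumed to be defined in the same module):

for corpusSubset in iterLoad('biocxml', '/data/pubmed_bioc/', corpusSizeCutoff=500):
    print("Loaded a subset with %d documents" % len(corpusSubset.documents))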
import bioc

def triage2doc(corpus, mode, stop_words=True):
    """
    Extract the title and abstract from a BioC collection.

    :param corpus: path to a BioC collection file
    :param mode: if 'eval', relevance labels are not read from the documents
    :param stop_words: True if we want to remove stop words
    :return: lists of document texts, labels and ids
    """
    ids = []
    texts = []
    labels = []

    # Optionally load a stop word list from disk
    stopwords = []
    if stop_words:
        for line in open('stopwords.txt'):
            stopwords.append(line.strip())

    with bioc.iterparse(corpus) as parser:
        for document in parser:
            ids.append(document.id)
            texts.append(extract_text(document, stopwords))
            if mode != 'eval':
                relevant = document.infons['relevant']
                labels.append(0 if relevant == 'no' else 1)

    return texts, labels, ids
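A call might look like this (a sketch; the filename is hypothetical, extract_text is assumed to be defined alongside the function, and a stopwords.txt file must exist when stop_words is True):

texts, labels, ids = triage2doc('triage_collection.bioc.xml', mode='train')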
import bioc
import codecs

def findSmallText(biocFilename, tsvFilename):
    with bioc.iterparse(biocFilename) as inBioC, codecs.open(tsvFilename, 'w', 'utf-8') as outTSV:
        # Iterate over every document (PubMed citation)
        for docCount, biocDoc in enumerate(inBioC):
            # Get some metadata for the citation
            pmid = biocDoc.infons['pmid']
            journal = biocDoc.infons['journal']

            # Find article passages with suspiciously short text (under 30 characters)
            shortArticleSections = [passage.text for passage in biocDoc.passages
                                    if passage.infons['section'] == 'article' and len(passage.text) < 30]

            for shortArticleSection in shortArticleSections:
                # Save to the file
                output = [pmid, journal, shortArticleSection]
                outTSV.write("\t".join(output) + "\n")

    print("Complete")
import bioc
import kindred

def iterLoadDataFromBioc(filename, corpusSizeCutoff=500):
    """
    Iteratively load documents from a BioC file. This creates a generator that provides
    kindred.Corpus objects, each a subset of the entire BioC file. Use it to lower the
    memory requirements, so that the entire file doesn't need to be loaded into memory
    at one time.

    :param filename: Path of the BioC file
    :param corpusSizeCutoff: Approximate maximum number of documents in each corpus subset
    :type filename: str
    :type corpusSizeCutoff: int
    :return: Subsets of the BioC file
    :rtype: A kindred.Corpus generator
    """
    corpus = kindred.Corpus()

    with bioc.iterparse(filename) as parser:
        for document in parser:
            # Yield the current subset once it reaches the cutoff, then start a fresh one
            if len(corpus.documents) >= corpusSizeCutoff:
                yield corpus
                corpus = kindred.Corpus()

            kindredDocs = convertBiocDocToKindredDocs(document)
            for kindredDoc in kindredDocs:
                corpus.addDocument(kindredDoc)

    # Yield any remaining documents
    if len(corpus.documents) > 0:
        yield corpus
""" Test file for counting num docs, this library does not suite our needs Gives memory blowup because of bad etree management """ import bioc if __name__ == "__main__": file_name = r"/home/daniel/Downloads/abstracts_collection.xml" # dtd = r"C:\Users\Danie\Downloads\BioC.dtd" with bioc.iterparse(file_name) as parser: count = 0 # collection_info = parser.get_collection_info() # print(collection_info.source) # print(collection_info.date) # print(collection_info.key) for document in parser: # print(document.id) # for key in document.infons: # print(key + "\n" + document.infons[key]) # for passage in document.passages: # print(passage.infons) # print(passage.text) # if count > 0: # break count += 1 if count % 1000 == 0: print(count) print(count)
import bioc

def mergeBioc(biocFilename, outBiocWriter, idFilter):
    # Copy documents (optionally filtered by ID) from a BioC file into an open BioC writer
    with bioc.iterparse(biocFilename) as parser:
        for biocDoc in parser:
            if idFilter is None or biocDoc.id in idFilter:
                outBiocWriter.writedocument(biocDoc)
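One way to drive it is to merge several files into a single output. The writer construction below is an assumption: it presumes the older bioc API (the same vintage as the writedocument call above), where bioc.iterwrite acts as a context manager yielding a document-level writer; the filenames are hypothetical:

with bioc.iterwrite('merged.bioc.xml') as outBiocWriter:  # assumed older bioc API
    for biocFilename in ['part1.bioc.xml', 'part2.bioc.xml']:
        mergeBioc(biocFilename, outBiocWriter, idFilter=None)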
import argparse
from collections import defaultdict

import bioc

parser = argparse.ArgumentParser()
parser.add_argument('--inBioc', required=True, type=str, help='Input BioC file')
parser.add_argument('--annotations', required=True, type=str, help='Pre-pickled annotations')
parser.add_argument('--outBioc', required=True, type=str, help='Output BioC file')
args = parser.parse_args()

# Collect the PMIDs present in the input BioC file
pmids = set()
with bioc.iterparse(args.inBioc) as parser:
    for i, doc in enumerate(parser):
        if 'pmid' in doc.infons and doc.infons['pmid'] != 'None':
            pmid = int(doc.infons['pmid'])
            pmids.add(pmid)

# Load the annotations, keeping only those whose PMID appears in the BioC file
pmidToAnnotations = defaultdict(list)
with open(args.annotations) as f:
    for line in f:
        split = line.strip('\n').split('\t')
        pmid, annotationType, conceptid, mentions, database = split
        mentions = mentions.strip()
        pmid = int(pmid)
        if len(mentions) > 0 and pmid in pmids:
            pmidToAnnotations[pmid].append((annotationType, conceptid, mentions))
import bioc

def get_bioc_file(filename):
    # Load all documents from a BioC file into a list
    list_result = []
    with bioc.iterparse(filename) as parser:
        for document in parser:
            list_result.append(document)
    return list_result
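For files small enough to fit in memory, it can be called directly (a sketch; the filename is hypothetical):

documents = get_bioc_file('collection.bioc.xml')
print("Loaded %d documents" % len(documents))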
# Fragment from inside searchForTumorAntigens (called below); `search` is a regex match
# over a sentence `s` of the document
        if search:
            term = search.groupdict()['term']
            termLower = term.lower()
            if termLower in acceptableTerms and termLower not in stoplist:
                out = [term, pmid, title, journal, year, s]
                outTxt = "\t".join(out)
                print(outTxt)
                #print("%%s\t%s" % (pmid,term,s))
                #print(document.infons)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Search for mentions of tumor antigens')
    parser.add_argument('--biocFile', required=True, type=str, help='Filename of BioC file to search')
    parser.add_argument('--acceptableTermsFile', required=True, type=str, help='List of terms to help filter')
    args = parser.parse_args()

    with open(args.acceptableTermsFile) as f:
        acceptableTerms = set([line.strip().lower() for line in f])

    with bioc.iterparse(args.biocFile) as parser:
        for document in parser:
            searchForTumorAntigens(document, acceptableTerms)
import os

import bioc

def read_made_data(made_base_dir, index_tokenizer, ignore_labels=['PHI'], verbose=False):
    annotated_docs = []

    corpus_path = os.path.join(made_base_dir, 'corpus')
    for f in os.listdir(corpus_path):
        corpus_file_path = os.path.join(corpus_path, f)
        if not os.path.isfile(corpus_file_path):
            continue

        corpus_file = open(corpus_file_path, 'r', encoding='utf-8')
        text = corpus_file.read()
        corpus_file.close()

        annotated_doc = AnnotatedDocument()
        annotated_doc.text = text
        annotated_doc.filename = f

        # Now tokenize by sentence and tokens and store this
        annotated_doc.tokenized_doc = index_tokenizer.tokenize_document(text)

        # We also need to read in the BioC annotations...
        bioc_file_name = os.path.join(made_base_dir, 'annotations', f + '.bioc.xml')

        # Add this even if we don't end up finding a BioC file
        annotated_docs.append(annotated_doc)

        if not os.path.isfile(bioc_file_name):
            print('No BIOC annotations for file : {}'.format(bioc_file_name))
            continue

        if verbose:
            print('Loading annotations for {}'.format(bioc_file_name))

        with bioc.iterparse(bioc_file_name) as parser:
            collection_info = parser.get_collection_info()
            for document in parser:
                for passage in document.passages:
                    for bioc_annotation in passage.annotations:
                        locations = bioc_annotation.locations
                        if len(locations) != 1:
                            print('Expected annotation to have 1 location, but got {}'.format(len(locations)))
                            continue

                        # Just use the first since we expect only 1
                        location = locations[0]
                        anno = Annotation()
                        anno.start_index = location.offset
                        anno.end_index = location.offset + location.length
                        anno.spanned_text = bioc_annotation.text

                        if 'type' in bioc_annotation.infons:
                            anno.type = bioc_annotation.infons['type']
                            # Let's see if we should ignore this type
                            for ignore_label in ignore_labels:
                                if anno.type == ignore_label:
                                    anno.type = 'O'
                                    break
                        else:
                            anno.type = 'UNK'

                        annotated_doc.annotations.append(anno)

    print('Total annotated documents loaded : {}'.format(len(annotated_docs)))
    return annotated_docs
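A usage sketch (the MADE directory path is hypothetical; AnnotatedDocument, Annotation and the tokenizer are assumed to come from the surrounding project, and IndexTokenizer here is a made-up stand-in for whatever index_tokenizer the caller supplies):

index_tokenizer = IndexTokenizer()  # hypothetical stand-in for the project's tokenizer
docs = read_made_data('/data/MADE-1.0', index_tokenizer, ignore_labels=['PHI'], verbose=True)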