import bioc
import codecs

def convertBioC2TSV(biocFilename, tsvFilename):
	with bioc.iterparse(biocFilename) as inBioC, codecs.open(tsvFilename,'w','utf-8') as outTSV:
		# Output the headers to the TSV file
		headers = ['pmid','year','title','abstract']
		outTSV.write("\t".join(headers) + "\n")

		# Iterate over every document (Pubmed citation)
		for docCount,biocDoc in enumerate(inBioC):
			# Get some metadata for the citation
			pmid = biocDoc.infons['pmid']
			year = biocDoc.infons['year']
			title = biocDoc.infons['title']

			# Get the abstract text
			abstract = [ passage.text for passage in biocDoc.passages if passage.infons['section'] == 'abstract' ]
			if len(abstract) > 0:
				abstract = " ".join(abstract).replace('\n','').replace('\r','')
			else:
				abstract = ""
			
			# Save to the file
			output = [pmid,year,title,abstract]
			outTSV.write("\t".join(output) + "\n")

			if ((docCount+1) % 1000) == 0:
				print("Processed %d documents..." % (docCount+1))

	print("Complete")
Example #2
import bioc

def bioc2txt(biocFilename, txtHandle, idFilter):
	with bioc.iterparse(biocFilename) as parser:
		for biocDoc in parser:
			if idFilter is None or biocDoc.id in idFilter:
				for passage in biocDoc.passages:
					txtHandle.write(passage.text)
					txtHandle.write("\n\n")
Example #3
import os

import bioc
import kindred

def iterLoad(dataFormat, path, corpusSizeCutoff=500):
	"""
	Iteratively load sections of a (presumably large) corpus. This will create a generator that provides kindred.Corpus objects that are subsets of the larger corpus. This should be used to lower the memory requirements (so that the entire file doesn't need to be loaded into memory at one time).

	:param dataFormat: Format of the data files to load (only 'biocxml' is currently supported)
	:param path: Path to the data. Can be a directory or an individual file (for bioc, json or simpletag)
	:param corpusSizeCutoff: Approximate maximum number of documents to be in each corpus subset
	:type dataFormat: str
	:type path: str
	:type corpusSizeCutoff: int
	:return: Subsets of the BioC file
	:rtype: A kindred.Corpus generator
	"""
	assert dataFormat == 'biocxml'

	corpus = kindred.Corpus()

	if os.path.isdir(path):
		filenames = [ os.path.join(path,x) for x in os.listdir(path) if x.endswith('bioc.xml') ]
	else:
		filenames = [path]

	for filename in filenames:
		with bioc.iterparse(filename) as parser:
			for document in parser:
				if len(corpus.documents) >= corpusSizeCutoff:
					yield corpus
					corpus = kindred.Corpus()
				kindredDocs = convertBiocDocToKindredDocs(document)
				for kindredDoc in kindredDocs:
					corpus.addDocument(kindredDoc)

	if len(corpus.documents) > 0:
		yield corpus
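A minimal driver for this generator (the directory path is illustrative):

# Each yielded kindred.Corpus holds roughly corpusSizeCutoff documents.
for corpusSubset in iterLoad('biocxml', '/data/pubmed_bioc/'):
    print(len(corpusSubset.documents))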
Example #4
import bioc

def triage2doc(corpus, mode, stop_words=True):
    """
    Extract title and abstract from a BioC collection.

    :param corpus: path to a BioC collection file
    :param mode: processing mode; when set to 'eval', relevance labels are not read
    :param stop_words: True if we want to remove stop words
    :return: three parallel lists: document texts, labels (empty in 'eval' mode) and document ids
    """
    ids = []
    texts = []
    labels = []
    stopwords = []
    if stop_words:
        with open('stopwords.txt') as f:
            for line in f:
                stopwords.append(line.strip())

    with bioc.iterparse(corpus) as parser:
        for document in parser:
            ids.append(document.id)
            texts.append(extract_text(document, stopwords))
            if mode != 'eval':
                relevant = document.infons['relevant']
                labels.append(0 if relevant == 'no' else 1)

    return texts, labels, ids
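A hedged usage sketch for the function above; the filename and mode value are illustrative, and extract_text must be importable from the source project:

texts, labels, ids = triage2doc('collection.bioc.xml', mode='train')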
Example #5
import bioc
import codecs

def findSmallText(biocFilename, tsvFilename):
    with bioc.iterparse(biocFilename) as inBioC, codecs.open(
            tsvFilename, 'w', 'utf-8') as outTSV:
        # Iterate over every document (Pubmed citation)
        for docCount, biocDoc in enumerate(inBioC):
            # Get some metadata for the citation
            pmid = biocDoc.infons['pmid']
            journal = biocDoc.infons['journal']

            shortArticleSections = [
                passage.text for passage in biocDoc.passages
                if passage.infons['section'] == 'article'
                and len(passage.text) < 30
            ]

            for shortArticleSection in shortArticleSections:
                # Save to the file
                output = [pmid, journal, shortArticleSection]
                outTSV.write("\t".join(output) + "\n")

    print("Complete")
Example #6
import bioc
import kindred

def iterLoadDataFromBioc(filename, corpusSizeCutoff=500):
    """
    Iteratively load documents from a BioC file. This will create a generator that provides kindred.Corpus objects that are subsets of the entire BioC file. This should be used to lower the memory requirements (so that the entire file doesn't need to be loaded into memory at one time).

    :param filename: Path of the BioC file
    :param corpusSizeCutoff: Approximate maximum number of documents to be in each corpus subset
    :type filename: str
    :type corpusSizeCutoff: int
    :return: Subsets of the BioC file
    :rtype: A kindred.Corpus generator
    """
    corpus = kindred.Corpus()
    with bioc.iterparse(filename) as parser:
        for document in parser:
            if len(corpus.documents) >= corpusSizeCutoff:
                yield corpus
                corpus = kindred.Corpus()
            kindredDocs = convertBiocDocToKindredDocs(document)
            for kindredDoc in kindredDocs:
                corpus.addDocument(kindredDoc)

    if len(corpus.documents) > 0:
        yield corpus
"""
Test file for counting num docs, this library does not suite our needs
Gives memory blowup because of bad etree management
"""
import bioc

if __name__ == "__main__":
    file_name = r"/home/daniel/Downloads/abstracts_collection.xml"
    # dtd = r"C:\Users\Danie\Downloads\BioC.dtd"
    with bioc.iterparse(file_name) as parser:
        count = 0
        # collection_info = parser.get_collection_info()
        # print(collection_info.source)
        # print(collection_info.date)
        # print(collection_info.key)
        for document in parser:
            # print(document.id)
            # for key in document.infons:
            #     print(key + "\n" + document.infons[key])
            # for passage in document.passages:
            #     print(passage.infons)
            #     print(passage.text)
            # if count > 0:
            #     break
            count += 1
            if count % 1000 == 0:
                print(count)
    print(count)
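The memory blow-up noted in the docstring is the classic iterparse pitfall: parsed elements stay attached to the growing tree unless they are cleared. A minimal sketch of the standard workaround using lxml directly (the tag name and file name are illustrative, not part of the bioc API):

from lxml import etree

count = 0
for _, elem in etree.iterparse('abstracts_collection.xml', tag='document'):
    count += 1
    elem.clear()  # release the element's children
    while elem.getprevious() is not None:
        del elem.getparent()[0]  # drop already-processed siblings from the root
print(count)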
Example #8
import bioc

def mergeBioc(biocFilename, outBiocWriter, idFilter):
	with bioc.iterparse(biocFilename) as parser:
		for biocDoc in parser:
			if idFilter is None or biocDoc.id in idFilter:
				outBiocWriter.writedocument(biocDoc)
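A driver sketch for the merge helper above. It assumes a bioc version whose XML writer exposes writedocument, matching the call above (newer releases renamed it write_document), so verify against your installed version:

keptIds = {'12345678', '23456789'}  # illustrative PMIDs; pass None to keep all documents
with bioc.iterwrite('merged.bioc.xml') as writer:  # assumption: bioc 1.x iterwrite API
    mergeBioc('part1.bioc.xml', writer, keptIds)
    mergeBioc('part2.bioc.xml', writer, keptIds)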
Example #9
File: align.py Project: flywind2/pgxmine
    parser.add_argument('--inBioc',
                        required=True,
                        type=str,
                        help='Input BioC file')
    parser.add_argument('--annotations',
                        required=True,
                        type=str,
                        help='Pre-pickled annotations')
    parser.add_argument('--outBioc',
                        required=True,
                        type=str,
                        help='Output BioC file')
    args = parser.parse_args()

    pmids = set()

    with bioc.iterparse(args.inBioc) as parser:
        for i, doc in enumerate(parser):
            if 'pmid' in doc.infons and doc.infons['pmid'] != 'None':
                pmid = int(doc.infons['pmid'])
                pmids.add(pmid)

    pmidToAnnotations = defaultdict(list)
    with open(args.annotations) as f:
        for line in f:
            split = line.strip('\n').split('\t')
            pmid, annotationType, conceptid, mentions, database = split
            mentions = mentions.strip()
            pmid = int(pmid)
            if len(mentions) > 0 and pmid in pmids:
                pmidToAnnotations[pmid].append(
                    (annotationType, conceptid, mentions))
Example #10
import bioc

def get_bioc_file(filename):
    list_result = []
    with bioc.iterparse(filename) as parser:
        for document in parser:
            list_result.append(document)
    return list_result
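Usage is a one-liner (the filename is illustrative). Unlike the generator-based loaders above, this returns every document at once, trading iterparse's streaming memory profile for convenience:

documents = get_bioc_file('collection.bioc.xml')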
Example #11
                if search:
                    term = search.groupdict()['term']
                    termLower = term.lower()
                    if termLower in acceptableTerms and termLower not in stoplist:
                        out = [term, pmid, title, journal, year, s]
                        outTxt = "\t".join(out)
                        print(outTxt)
                        #print("%%s\t%s" % (pmid,term,s))
    #print(document.infons)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Search for mentions of tumor antigens')
    parser.add_argument('--biocFile',
                        required=True,
                        type=str,
                        help='Filename of BioC file to search')
    parser.add_argument('--acceptableTermsFile',
                        required=True,
                        type=str,
                        help='List of terms to help filter')
    args = parser.parse_args()

    with open(args.acceptableTermsFile) as f:
        acceptableTerms = set([line.strip().lower() for line in f])

    with bioc.iterparse(args.biocFile) as parser:
        for document in parser:
            searchForTumorAntigens(document, acceptableTerms)
Example #12
import os

import bioc

# AnnotatedDocument and Annotation are classes defined in the source project.
def read_made_data(made_base_dir,
                   index_tokenizer,
                   ignore_labels=['PHI'],
                   verbose=False):

    annotated_docs = []
    corpus_path = os.path.join(made_base_dir, 'corpus')
    for f in os.listdir(corpus_path):
        corpus_file_path = os.path.join(corpus_path, f)
        if not os.path.isfile(corpus_file_path):
            continue

        with open(corpus_file_path, 'r', encoding='utf-8') as corpus_file:
            text = corpus_file.read()

        annotated_doc = AnnotatedDocument()
        annotated_doc.text = text
        annotated_doc.filename = f
        # now tokenize by sentence and tokens and store this
        annotated_doc.tokenized_doc = index_tokenizer.tokenize_document(text)

        #print('Document total Sentences : {}'.format(len(annotated_doc.tokenized_doc.sentences)))

        # we also need to read in the BIOC annotations...
        bioc_file_name = os.path.join(made_base_dir, 'annotations',
                                      f + '.bioc.xml')

        # add this even if we don't end up finding a BIOC file
        annotated_docs.append(annotated_doc)

        if not os.path.isfile(bioc_file_name):
            print('No BIOC annotations for file : {}'.format(bioc_file_name))
            continue

        if verbose:
            print('Loading annotations for {}'.format(bioc_file_name))

        with bioc.iterparse(bioc_file_name) as parser:
            collection_info = parser.get_collection_info()
            for document in parser:
                #print(document)

                for passage in document.passages:
                    #print(len(passage.annotations))
                    for bioc_annotation in passage.annotations:
                        locations = bioc_annotation.locations
                        if len(locations) != 1:
                            print(
                                'Expected annotation to have 1 location, but got {}'
                                .format(len(locations)))
                            continue
                        # just use the first since we expect only 1
                        location = locations[0]
                        anno = Annotation()
                        anno.start_index = location.offset
                        anno.end_index = location.offset + location.length
                        anno.spanned_text = bioc_annotation.text

                        #print(bioc_infons)
                        if 'type' in bioc_annotation.infons:
                            anno.type = bioc_annotation.infons['type']
                            # let's see if we should ignore this type
                            for ignore_label in ignore_labels:
                                if anno.type == ignore_label:
                                    anno.type = 'O'
                                    break
                        else:
                            anno.type = 'UNK'

                        #print(anno)
                        #break

                        annotated_doc.annotations.append(anno)

    print('Total annotated documents loaded : {}'.format(len(annotated_docs)))

    return annotated_docs
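A hedged usage sketch for the loader above. The corpus path is illustrative, and WhitespaceTokenizer is a hypothetical stand-in for any object providing the tokenize_document(text) method the code relies on:

tokenizer = WhitespaceTokenizer()  # hypothetical; must provide tokenize_document()
docs = read_made_data('/data/MADE-1.0', tokenizer, ignore_labels=['PHI'])
print(len(docs))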