Exemplo n.º 1
0
def main():
    """Annotate every passage of a BioC collection with Porter stems.

    The input path is BIOC_IN unless a first command-line argument is
    given. Each passage text is split with wordpunct_tokenize, every token
    is stemmed, and one annotation per stem is appended to the passage.
    Annotation ids are consecutive integers (as strings) starting at "1"
    and running across all passages and documents. The resulting
    collection is written to standard output and then saved through the
    BioCWriter created for BIOC_OUT.
    """
    # A path given on the command line overrides the default input file.
    source_path = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # The reader is handed DTD_FILE as its dtd_valid_file; the writer
    # targets BIOC_OUT.
    reader = BioCReader(source_path, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)
    porter = PorterStemmer()

    # Parse the input and hand the collection over to the writer so the
    # annotations added below end up in the output.
    reader.read()
    writer.collection = reader.collection

    last_id = 0
    for doc in writer.collection.documents:
        for passage in doc:
            # Stem all tokens first, then annotate in token order.
            stemmed_tokens = list(
                map(porter.stem, wordpunct_tokenize(passage.text)))
            for stemmed in stemmed_tokens:
                last_id += 1

                # One annotation per token, flagged as a stemmed form.
                stem_annotation = BioCAnnotation()
                stem_annotation.text = stemmed
                stem_annotation.id = str(last_id)
                stem_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(stem_annotation)

    # Echo the document without a trailing newline (redirectable to a
    # file), then persist it to disk.
    sys.stdout.write(str(writer))
    writer.write()
Exemplo n.º 2
0
    def test_should_get_gene_names_normalised(self, list_dict, expected_genes):
        """Verify the normalised gene names collected from a single passage.

        A document with one passage is built; every entry of ``list_dict``
        becomes the infons of its own annotation on that passage. The value
        returned by ``get_gene_names_normalised`` must equal
        ``set(expected_genes)``.
        """
        # Arrange
        sut = BiocAnnotationGenes()
        document = BioCDocument()
        passage = BioCPassage()
        document.add_passage(passage)

        for infons in list_dict:
            gene_annotation = BioCAnnotation()
            gene_annotation.infons = infons
            passage.add_annotation(gene_annotation)

        # Act
        actual = sut.get_gene_names_normalised(document)

        # Assert
        expected = set(expected_genes)
        self.assertEqual(expected, actual)
Exemplo n.º 3
0
def brat2bioc_entity(bratentity: BratEntity) -> BioCAnnotation:
    """Convert a brat entity into a BioC annotation.

    The entity's id and text are copied, its type is stored under the
    ``'type'`` infon, and each span in ``bratentity.locations`` becomes a
    ``BioCLocation`` built from ``(begin, end - begin)``.
    """
    annotation = BioCAnnotation()
    annotation.infons['type'] = bratentity.type
    annotation.text = bratentity.text
    annotation.id = bratentity.id

    for span in bratentity.locations:
        # brat spans carry begin/end; the location takes start and length.
        span_length = span.end - span.begin
        location = BioCLocation(span.begin, span_length)
        annotation.add_location(location)

    return annotation
 def to_bioc(self):
     """Return this entity as a BioCAnnotation.

     The annotation carries the entity text, the id converted to ``str``,
     the type under the ``'type'`` infon, and a single location built from
     ``self.start`` and the length of the text.
     """
     annotation = BioCAnnotation()
     annotation.id = str(self.id)
     annotation.text = self.text
     annotation.infons['type'] = self.type
     annotation.add_location(BioCLocation(self.start, len(self.text)))
     return annotation
Exemplo n.º 5
0
 def __parse_annotation(self, tree):
     """Build a BioCAnnotation from the element ``tree``.

     Reads the ``id`` attribute, the infons (via ``__parse_infons``), the
     text of the ``text`` child, and one location per ``location`` child
     using its ``offset`` and ``length`` attributes converted to ``int``.
     """
     result = BioCAnnotation()
     result.id = tree.attrib['id']
     result.infons = self.__parse_infons(tree)
     result.text = tree.findtext('text')

     for location_elem in tree.findall('location'):
         attributes = location_elem.attrib
         offset = int(attributes['offset'])
         length = int(attributes['length'])
         result.add_location(BioCLocation(offset, length))

     return result
Exemplo n.º 6
0
    def test_should_get_gene_names_to_normalised_dict(
            self, list_gene_dict_in_passage, expected_dict):
        """Verify the gene-name mapping built from a multi-passage document.

        Each element of ``list_gene_dict_in_passage`` yields one passage;
        every dict inside it becomes an annotation whose text is the dict's
        ``"text"`` entry and whose infons is the dict itself. The value
        returned by ``get_gene_names_to_normalised_dict`` must equal
        ``expected_dict``.
        """
        # Arrange
        sut = BiocAnnotationGenes()
        document = BioCDocument()
        for passage_genes in list_gene_dict_in_passage:
            passage = BioCPassage()
            document.add_passage(passage)

            for gene_info in passage_genes:
                gene_annotation = BioCAnnotation()
                gene_annotation.text = gene_info["text"]
                gene_annotation.infons = gene_info
                passage.add_annotation(gene_annotation)

        # Act
        actual = sut.get_gene_names_to_normalised_dict(document)

        # Assert
        self.assertEqual(expected_dict, actual)
Exemplo n.º 7
0
def main():
    """Read a BioC file, add one stem annotation per token, and write it out.

    The file to read is the first command-line argument when one is given,
    otherwise BIOC_IN. Tokens come from wordpunct_tokenize applied to each
    passage text and are stemmed with the NLTK Porter stemmer. Annotation
    ids form a single running counter over the whole collection. The
    result is printed to standard output and written through the
    BioCWriter created for BIOC_OUT.
    """
    # Fall back to the default input unless a path was passed on the CLI.
    if len(sys.argv) >= 2:
        input_path = sys.argv[1]
    else:
        input_path = BIOC_IN

    # Reader (with DTD_FILE as dtd_valid_file), writer and stemmer.
    reader = BioCReader(input_path, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)
    stem_token = PorterStemmer().stem

    # Load the collection and share it with the writer; annotations added
    # below are therefore part of what gets written.
    reader.read()
    writer.collection = reader.collection

    counter = 0
    for doc in writer.collection.documents:
        for passage in doc:
            # All stems are computed before any annotation is attached.
            passage_stems = [
                stem_token(word) for word in wordpunct_tokenize(passage.text)
            ]
            for passage_stem in passage_stems:
                counter += 1

                # Each token gets its own annotation marked as a stemmed
                # surface form.
                annotation = BioCAnnotation()
                annotation.id = str(counter)
                annotation.text = passage_stem
                annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(annotation)

    # Print without a trailing newline so the output can be redirected
    # into a file, then write to disk.
    sys.stdout.write(str(writer))
    writer.write()
Exemplo n.º 8
0
def add_annotation(triple, annotation_id):
    """Create a BioC annotation for one matched MeSH term.

    Args:
        triple: Indexable holding the start offset at ``[0]``, the term
            length at ``[1]`` and the matched term text at ``[2]``.
        annotation_id: Identifier for the annotation; stored as ``str``.

    Returns:
        A ``BioCAnnotation`` with the term text, a ``'type'`` infon of
        ``'MeSH term'`` and one location whose offset and length are
        stored as strings.
    """
    start, span_length, term = triple[0], triple[1], triple[2]

    annotation = BioCAnnotation()
    annotation.text = term
    annotation.id = str(annotation_id)
    annotation.put_infon('type', 'MeSH term')

    # The location fields are assigned as strings, not integers.
    location = BioCLocation()
    location.length = str(span_length)
    location.offset = str(start)
    annotation.add_location(location)

    return annotation
def add_annotation(triple, annotation_id):
    """Build the annotation element describing a single MeSH term hit.

    ``triple`` is read by index: ``triple[0]`` is the start offset,
    ``triple[1]`` the length of the term and ``triple[2]`` the term text.
    ``annotation_id`` is converted to ``str`` and used as the annotation
    id. The returned ``BioCAnnotation`` has the infon
    ``'type' -> 'MeSH term'`` and exactly one location.
    """
    mesh_annotation = BioCAnnotation()
    mesh_location = BioCLocation()

    # Position of the term: both values are stored in string form.
    mesh_location.offset = str(triple[0])
    mesh_location.length = str(triple[1])

    # Term text, identifier and type marker.
    mesh_annotation.id = str(annotation_id)
    mesh_annotation.text = triple[2]
    mesh_annotation.put_infon('type', 'MeSH term')

    mesh_annotation.add_location(mesh_location)
    return mesh_annotation
Exemplo n.º 10
0
 def __read_annotation(self, start_elem):
     """Consume parser events until the current annotation is closed.

     The id is taken from ``start_elem``. Only ``'end'`` events are acted
     upon: a closing ``text`` sets the text, ``infon`` adds a key/value
     pair, ``location`` adds a location from its integer ``offset`` and
     ``length`` attributes, and ``annotation`` returns the finished
     object. Raises ``RuntimeError`` if the events run out first.
     """
     annotation = BioCAnnotation()
     annotation.id = start_elem.get('id')

     while self.__has_next():
         event, elem = self.__next_event()
         # Every event other than 'end' is ignored.
         if event != 'end':
             continue

         tag = elem.tag
         if tag == 'annotation':
             return annotation
         if tag == 'text':
             annotation.text = elem.text
         elif tag == 'infon':
             annotation.infons[elem.get('key')] = elem.text
         elif tag == 'location':
             offset = int(elem.get('offset'))
             length = int(elem.get('length'))
             annotation.add_location(BioCLocation(offset, length))

     raise RuntimeError("should not reach here")  # pragma: no cover
Exemplo n.º 11
0
def bioconcepts2pubtator_annotations(tag, index):
    """Bioconcepts to Annotations

    Specifically for bioconcepts2pubtator and converts each annotation
    into an annotation object that BioC can parse.

    Keyword Arguments:
    tag -- the parsed annotation line; read by the keys "type", "tag_id",
           "start" and "term"
    index -- the id of each document specific annotation

    Returns a BioCAnnotation whose id is ``str(index)``, whose text is
    ``tag["term"]`` and which has one location (offset ``tag["start"]``,
    length ``len(tag["term"])``, both stored as strings). The infons hold
    the raw type plus an identifier entry chosen by type:

    * "Gene"                        -> "NCBI Gene"
    * "Species"                     -> "NCBI Species"
    * type containing Mutation/SNP  -> "tmVar"
    * anything else                 -> one entry per identifier, keyed by
      the prefix before the first ":" (or "MESH" when there is none);
      with several "|"-separated identifiers the key gets the position
      appended ("MESH 0", "MESH 1", ...). A missing identifier is stored
      as ``"MESH": "Unknown"``.
    """

    annt = BioCAnnotation()
    annt.id = str(index)
    annt.infons["type"] = tag["type"]

    # A missing type is treated as the empty string so the substring
    # checks below do not fail on None.
    tag_type = tag['type'] or ''
    tag_id = tag['tag_id']

    if tag_type == "Gene":
        annt.infons["NCBI Gene"] = tag_id

    elif tag_type == "Species":
        annt.infons["NCBI Species"] = tag_id

    elif "Mutation" in tag_type or "SNP" in tag_type:
        annt.infons["tmVar"] = tag_id

    elif not tag_id:
        # No identifier was supplied for this annotation.
        annt.infons["MESH"] = "Unknown"

    elif "|" in tag_id:
        # Several identifiers: write out each one as its own numbered tag.
        for tag_num, single_id in enumerate(tag_id.split("|")):
            if ":" in single_id:
                # Split once only, so an id containing further colons is
                # kept intact instead of raising on unpacking.
                term_type, term_id = single_id.split(":", 1)
                annt.infons["{} {}".format(term_type, tag_num)] = term_id
            else:
                # Some ids lack the MESH:#### form; store this single id
                # (not the whole "|"-joined string) under a MESH key.
                annt.infons["MESH {}".format(tag_num)] = single_id

    elif ":" in tag_id:
        term_type, term_id = tag_id.split(":", 1)
        annt.infons[term_type] = term_id

    else:
        # Some ids lack the MESH:#### form; default the key to MESH.
        annt.infons["MESH"] = tag_id

    location = BioCLocation()
    location.offset = str(tag["start"])
    location.length = str(len(tag["term"]))
    annt.locations.append(location)
    annt.text = tag["term"]
    return annt