示例#1
0
def main():
    """Stem every token of a BioC collection and record the stems.

    The input path is the first command-line argument when one is
    given, otherwise BIOC_IN.  The text of each passage is split with
    wordpunct_tokenize, every token is run through the NLTK Porter
    stemmer, and each resulting stem is attached to its passage as a
    BioCAnnotation carrying the infon 'surface form' = 'stemmed token'.
    Annotation ids are consecutive integers (stored as strings) that
    start at 1 and keep counting across all passages and documents.

    The string form of the writer is sent to stdout without a trailing
    newline, and afterwards the writer created for BIOC_OUT is asked to
    write its output.
    """
    source_path = BIOC_IN
    if sys.argv[1:]:
        # An explicit path on the command line overrides the default
        source_path = sys.argv[1]

    # Reader (handed DTD_FILE as its validation DTD), writer and stemmer
    reader = BioCReader(source_path, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)
    porter = PorterStemmer()

    # Load the input and let the writer work on the very same collection
    reader.read()
    writer.collection = reader.collection

    # Number of annotation ids handed out so far
    issued = 0
    for document in writer.collection.documents:
        for passage in document:
            # All stems of the passage, in token order
            stemmed = list(map(porter.stem, wordpunct_tokenize(passage.text)))

            # One annotation per stem, appended to the passage in order
            for position, stem in enumerate(stemmed, start=1):
                note = BioCAnnotation()
                note.text = stem
                note.id = str(issued + position)
                note.put_infon('surface form', 'stemmed token')
                passage.add_annotation(note)

            issued += len(stemmed)

    # Show the result on screen (redirectable into a file) ...
    sys.stdout.write(str(writer))

    # ... and let the writer store it as well
    writer.write()
示例#2
0
文件: stemmer.py 项目: 2mh/PyBioC
def main():
    """Annotate a BioC document with the Porter stem of each token.

    Reads the BioC XML file named by the first command-line argument
    (falling back to BIOC_IN), tokenizes the text of every passage with
    wordpunct_tokenize and stems each token with NLTK's PorterStemmer.
    For every stem a BioCAnnotation is added to the passage; its text
    is the stem, its id is a running number (as a string, starting at 1
    and shared by the whole collection) and it carries the infon
    'surface form' = 'stemmed token'.

    Finally the writer's string form is written to stdout without a
    trailing newline and the writer set up for BIOC_OUT is told to
    write.
    """
    arguments = sys.argv
    bioc_in = BIOC_IN
    if len(arguments) > 1:
        # A user-supplied input file replaces the default one
        bioc_in = arguments[1]

    # Input side: reader that is given the DTD file for validation
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    # Output side: writer for the annotated data
    bioc_writer = BioCWriter(BIOC_OUT)
    stemmer = PorterStemmer()

    bioc_reader.read()

    # The writer operates on the collection that was just read
    bioc_writer.collection = bioc_reader.collection

    annotation_id = 0
    for document in bioc_writer.collection.documents:
        for passage in document:
            tokens = wordpunct_tokenize(passage.text)
            stems = [stemmer.stem(token) for token in tokens]

            # Reserve a contiguous range of ids for this passage
            first_id = annotation_id + 1
            annotation_id += len(stems)
            id_range = range(first_id, annotation_id + 1)

            # Annotations follow the order of the tokens in the text
            for number, stem in zip(id_range, stems):
                bioc_annotation = BioCAnnotation()
                bioc_annotation.text = stem
                bioc_annotation.id = str(number)
                bioc_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(bioc_annotation)

    # Screen output, no trailing newline (can be redirected to a file)
    sys.stdout.write(str(bioc_writer))

    # Output through the writer itself
    bioc_writer.write()
def add_annotation(triple, annotation_id):
    """Build a BioCAnnotation describing one MeSH term match.

    Args:
        triple: Indexable holding, in this order, the start offset of
            the match, its length, and the matched term text.
        annotation_id: Identifier for the annotation; stored as a string.

    Returns:
        A new BioCAnnotation whose text is the term, whose infon 'type'
        is 'MeSH term', and which has a single BioCLocation with the
        offset and length stored as strings.
    """
    start, span, term = triple[0], triple[1], triple[2]

    # The annotation itself: term text, id and type infon
    annotation = BioCAnnotation()
    annotation.text = term
    annotation.id = str(annotation_id)
    annotation.put_infon('type', 'MeSH term')

    # Where the term was found: length and start position
    location = BioCLocation()
    location.length = str(span)
    location.offset = str(start)
    annotation.add_location(location)

    return annotation
示例#4
0
def add_annotation(triple, annotation_id):
    """Create the BioC annotation for a single MeSH term occurrence.

    ``triple[0]`` is used as the location offset, ``triple[1]`` as the
    location length and ``triple[2]`` as the annotation text.  Offset,
    length and ``annotation_id`` are all converted with ``str`` before
    being stored.  The annotation is tagged with the infon
    'type' = 'MeSH term' and returned with exactly one location added.
    """
    result = BioCAnnotation()

    # Text of the term as given in the triple
    result.text = triple[2]
    result.id = str(annotation_id)
    result.put_infon('type', 'MeSH term')

    # Span of the term: length first, then start position (offset)
    span = BioCLocation()
    span.length = str(triple[1])
    span.offset = str(triple[0])

    result.add_location(span)
    return result