Exemplo n.º 1
0
def main():
    """Stem every token of a BioC document and record the stems as annotations.

    The input path is the first command-line argument when one is given,
    otherwise ``BIOC_IN``. The annotated collection is printed to standard
    output and written through a ``BioCWriter`` created for ``BIOC_OUT``.
    """
    # A path given on the command line overrides the default input file.
    bioc_in = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # Reader for the BioC XML input; the DTD file is handed over for
    # validation.
    reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)

    # Writer that will emit the annotated data.
    writer = BioCWriter(BIOC_OUT)

    # NLTK Porter stemmer.
    porter = PorterStemmer()

    # Parse the input, then let the writer share the parsed collection so
    # that every annotation added below ends up in the output.
    reader.read()
    writer.collection = reader.collection

    # Annotation ids are numbered consecutively across all documents and
    # passages, starting at 1.
    last_id = 0
    for document in writer.collection.documents:
        for passage in document:
            # One annotation per token, in token order, carrying the
            # stemmed form of that token.
            for token in wordpunct_tokenize(passage.text):
                last_id += 1
                annotation = BioCAnnotation()
                annotation.text = porter.stem(token)
                annotation.id = str(last_id)
                annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(annotation)

    # Print to screen without a trailing newline (can be redirected into a
    # file, e.g. output_bioc.xml), then write to disk.
    sys.stdout.write(str(writer))
    writer.write()
Exemplo n.º 2
0
def main():
    """Annotate each passage of a BioC file with the stems of its tokens.

    Reads the file named by the first command-line argument (falling back
    to ``BIOC_IN``), adds one annotation per token holding its Porter stem,
    prints the resulting collection and writes it via ``BioCWriter``.
    """
    # Default input unless another file is named on the command line.
    bioc_in = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # Set up the reader (validated against the BioC DTD), the writer for
    # the annotated output, and the NLTK Porter stemmer.
    source = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    sink = BioCWriter(BIOC_OUT)
    stemmer = PorterStemmer()

    # Load the input and hand its collection over to the writer.
    source.read()
    sink.collection = source.collection
    documents = sink.collection.documents

    # Lazily walk documents -> passages -> tokens, pairing each stem with
    # the passage it belongs to.
    stemmed_tokens = (
        (passage, stemmer.stem(token))
        for document in documents
        for passage in document
        for token in wordpunct_tokenize(passage.text)
    )

    # Ids run 1, 2, 3, ... across the whole collection.
    for annotation_id, (passage, stem) in enumerate(stemmed_tokens, 1):
        # Each token gets an annotation providing the surface form of a
        # 'stemmed token'.
        stem_annotation = BioCAnnotation()
        stem_annotation.text = stem
        stem_annotation.id = str(annotation_id)
        stem_annotation.put_infon('surface form', 'stemmed token')
        passage.add_annotation(stem_annotation)

    # Print to screen without a trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml).
    sys.stdout.write(str(sink))

    # Write to disk.
    sink.write()
Exemplo n.º 3
0
def main():
    """Read ``test_file`` as BioC XML and write it back out unchanged.

    The collection parsed by the reader is handed to a writer targeting
    'output_bioc.xml'; the writer is then printed.
    """
    reader = BioCReader(test_file, dtd_valid_file=dtd_file)
    reader.read()

    # Debugging aid (disabled): iterate over
    # reader.collection.documents[0].passages[0].sentences and print each
    # sentence's offset.

    writer = BioCWriter('output_bioc.xml')
    writer.collection = reader.collection
    writer.write()
    print(writer)
Exemplo n.º 4
0
def main():
    """Round-trip a BioC file: parse ``test_file`` and re-emit its collection.

    The output goes through a ``BioCWriter`` created for 'output_bioc.xml',
    which is written first and printed afterwards.
    """
    source = BioCReader(test_file, dtd_valid_file=dtd_file)
    source.read()

    # Disabled debugging snippet: loop over the sentences of the first
    # passage of the first document and print each sentence offset.

    sink = BioCWriter('output_bioc.xml')
    sink.collection = source.collection
    sink.write()
    print(sink)
 def write_text_bioc(self, output_path):
     """Write the texts held in ``self.abstract_dict`` as a BioC file.

     A new collection with a single new document is built; every value of
     ``self.abstract_dict`` becomes the text of one passage of that
     document. The collection is then written through a ``BioCWriter``
     created for ``output_path``.

     Arguments:
     output_path -- destination handed to ``BioCWriter``
     """
     bioc_writer = BioCWriter(output_path)
     bioc_collection = BioCCollection()
     # TODO: insert an option for writing either text only or annotations?
     # To keep the document as it is, add ``self.document`` to the
     # collection instead of building a new one.
     bioc_document = BioCDocument()
     # Only the texts are needed, so iterate over the values directly.
     for passage_text in self.abstract_dict.values():
         bioc_passage = BioCPassage()
         bioc_passage.text = passage_text
         bioc_document.add_passage(bioc_passage)
     bioc_collection.add_document(bioc_document)

     # A single pre-formatted argument prints the same text whether
     # ``print`` is the Python 2 statement or the Python 3 function; the
     # former ``print a, b`` statement is a SyntaxError on Python 3.
     print('BioC output path %s' % (output_path,))
     bioc_writer.collection = bioc_collection
     bioc_writer.write()
    input_file  = options.i
    dtd_file    = options.b
    output_file = options.o

    # open input files
    try:
        bioc_reader = BioCReader(input_file, dtd_valid_file=dtd_file)
        bioc_reader.read()
    except:
        ## debug:
        #raise
        sys.exit("Probably, your input file contains an empty passage. Maybe one of the PubMed-IDs does not have an abstract. Please, remove empty passage and document tags. No output file was written.")

    # the elements <date> and <key> will not be changed or updated by this script (it only adds (MeSH) annotations)
    # define output file
    bioc_writer = BioCWriter(output_file)
    # initialization for reading input file
    bioc_writer.collection = bioc_reader.collection
    # get documents (one PubMed-ID with title and text equals one document)
    docs = bioc_writer.collection.documents
    # different annotation IDs can be confusing - add a type to the iterating number
    annotation_type = "_MeSH"
    # iteration over PubMed abstracts with ID, title, and text
    for doc in docs:
        # get MeSH terms from PostgreSQL database
        mesh_terms = get_MeSH_terms(doc.id)
        # use annotation IDs starting at 0 for each document
        annotation_id = 0
        # if at least one MeSH term was found for this PubMed-ID (doc.id)
        if mesh_terms:
            # abstract title is one passage and abstract text is one passage
Exemplo n.º 7
0
def convert_pubtator(input_path, output_path):
    """Convert a PubTator annotation listing to BioC XML.

    The collection header is serialized once, then every article is built,
    serialized and written on its own, and finally the closing collection
    tag is appended.

    Keyword Arguments:
    input_path -- the path of the PubTator annotation file
    output_path -- the path to output the BioC XML file
    """

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    opener = utilities.get_opener(output_path)
    with opener(output_path, 'wb') as xml_file:

        # Have to manually do this because hangs otherwise
        # Write the head of the xml file: every line of the serialized
        # empty collection except the last one, which is held back and
        # written after all documents.
        xml_shell = writer.tostring('UTF-8')
        *xml_head, xml_tail = xml_shell.rstrip().split(b'\n')
        for line in xml_head:
            xml_file.write(line + b'\n')

        article_generator = read_bioconcepts2pubtator_offsets(input_path)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["pubmed_id"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset is the position where the abstract starts, not
            # the abstract text itself (which was stored here before).
            # NOTE(review): assumes PubTator offsets count the title plus
            # one separator character -- confirm against the input format.
            abstract_passage.offset = str(len(article["title"]) + 1)
            abstract_passage.text = article["abstract"]

            # Annotation ids are numbered consecutively per document:
            # title annotations first, then abstract annotations.
            id_index = 0
            passage_tags = (
                (title_passage, article["title_annot"]),
                (abstract_passage, article["abstract_annot"]),
            )
            for passage, tags in passage_tags:
                for tag in tags:
                    passage.annotations.append(
                        bioconcepts2pubtator_annotations(tag, id_index)
                    )
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Build the document under a throwaway <collection> element,
            # serialize only the document, then drop it again.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            xml_file.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        xml_file.write(xml_tail + b'\n')
Exemplo n.º 8
0
def convert_pubtator(input_file, output_file=None):
    """Convert a PubTator annotation listing to BioC XML.

    The collection header is serialized once, then every article is built,
    serialized and written on its own, and finally the closing collection
    tag is appended.

    Keyword Arguments:
    input_file -- the path of the PubTator annotation file
    output_file -- the path to output the converted text
                   (default "bioc-converted-docs.xml")
    """
    if output_file is None:
        output_file = "bioc-converted-docs.xml"

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    with open(output_file, 'wb') as g:

        # Have to manually do this because hangs otherwise
        # Write the head of the xml file: the serialized empty collection
        # minus its closing tag, which is written after all documents.
        xml_header = writer.tostring('UTF-8')
        # The file is opened in binary mode, so the tail must be bytes;
        # writing a text string here raises TypeError on Python 3.
        xml_tail = b'</collection>\n'
        xml_head = xml_header[:-len(xml_tail)]
        g.write(xml_head)

        article_generator = bioconcepts2pubtator_offsets(input_file)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["Document ID"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["Title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset is the position where the abstract starts, not
            # the abstract text itself (which was stored here before).
            # NOTE(review): assumes PubTator offsets count the title plus
            # one separator character -- confirm against the input format.
            abstract_passage.offset = str(len(article["Title"]) + 1)
            abstract_passage.text = article["Abstract"]

            # Annotation ids are numbered consecutively per document:
            # title annotations first, then abstract annotations.
            id_index = 0
            passage_tags = (
                (title_passage, article["Title_Annot"]),
                (abstract_passage, article["Abstract_Annot"]),
            )
            for passage, tags in passage_tags:
                for tag in tags:
                    passage.annotations.append(
                        bioconcepts2pubtator_annotations(tag, id_index)
                    )
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Build the document under a throwaway <collection> element,
            # serialize only the document, then drop it again.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            g.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        g.write(xml_tail)
Exemplo n.º 9
0
    output_file = options.o

    # open input files
    try:
        bioc_reader = BioCReader(input_file, dtd_valid_file=dtd_file)
        bioc_reader.read()
    except:
        ## debug:
        #raise
        sys.exit(
            "Probably, your input file contains an empty passage. Maybe one of the PubMed-IDs does not have an abstract. Please, remove empty passage and document tags. No output file was written."
        )

    # the elements <date> and <key> will not be changed or updated by this script (it only adds (MeSH) annotations)
    # define output file
    bioc_writer = BioCWriter(output_file)
    # initialization for reading input file
    bioc_writer.collection = bioc_reader.collection
    # get documents (one PubMed-ID with title and text equals one document)
    docs = bioc_writer.collection.documents
    # different annotation IDs can be confusing - add a type to the iterating number
    annotation_type = "_MeSH"
    # iteration over PubMed abstracts with ID, title, and text
    for doc in docs:
        # get MeSH terms from PostgreSQL database
        mesh_terms = get_MeSH_terms(doc.id)
        # use annotation IDs starting at 0 for each document
        annotation_id = 0
        # if at least one MeSH term was found for this PubMed-ID (doc.id)
        if mesh_terms:
            # abstract title is one passage and abstract text is one passage