示例#1
0
def extract_annotations(xml_path, tsv_path):
    """
    Extract the annotations from a pubtator xml formatted file.

    Outputs a TSV file with the following header fields:
    pubmed_id - the corresponding pubmed id
    type - the type of term (i.e. Chemical, Disease, Gene etc.)
    identifier - the appropriate MESH or NCBI ID if known
    offset - the character position where the term starts
    end - the character position where the term ends

    Keyword arguments:
    xml_path -- the path to the xml data file
    tsv_path -- the path to output the formatted data
    """
    xml_opener = utilities.get_opener(xml_path)
    csv_opener = utilities.get_opener(tsv_path)
    with xml_opener(xml_path, "rb") as xml_file, csv_opener(tsv_path, "wt") as tsv_file:
        fieldnames = ['pubmed_id', 'type', 'identifier', 'offset', 'end']
        writer = csv.DictWriter(tsv_file, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        # stream <document> elements so the whole file is never held in memory
        tag_generator = ET.iterparse(xml_file, tag="document", recover=True, encoding="utf-8")

        try:
            for event, document in tqdm.tqdm(tag_generator):

                # first child of a <document> is its <id> element
                pubmed_id = document[0].text

                # cycle through all the annotation tags contained within document tag
                for annotation in document.iter('annotation'):

                    # not all annotations will contain an ID
                    if len(annotation) <= 3:
                        continue

                    # reset per annotation: without this, an annotation whose
                    # infons are missing/empty would silently reuse the values
                    # from the previous annotation (or raise NameError on the
                    # very first one)
                    ant_type = None
                    ant_id = None

                    for infon in annotation.iter('infon'):
                        if infon.attrib["key"] == "type":
                            ant_type = infon.text
                        elif infon.text:
                            # strip the vocabulary prefix, keeping the bare id
                            ant_id = re.sub(r"(MESH:|CVCL:)", "", str(infon.text))

                    # exactly one <location> is expected; unpacking enforces it
                    location, = annotation.iter('location')
                    offset = int(location.attrib['offset'])
                    end = offset + int(location.attrib['length'])
                    row = {'pubmed_id': pubmed_id, 'type': ant_type, 'identifier': ant_id, 'offset': offset, 'end': end}
                    writer.writerow(row)

                # prevent memory overload
                document.clear()

        except Exception as e:
            # best-effort diagnostics: report the error and the offending
            # document id, then stop processing
            print(e)
            print(document[0].text)
示例#2
0
def read_bioconcepts2pubtator_offsets(path):
    """Bioconcepts to pubtator

    Yields an article that is a dictionary described in the article generator
    function.

    Keywords:
    path - the path to the bioconcepts2putator_offset file (obtained from pubtator's ftp site: ftp://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator/)
    """
    opener = utilities.get_opener(path)

    # "with" guarantees the file is closed even when the consumer abandons
    # this generator early or an exception propagates — the old trailing
    # f.close() only ran after a fully-exhausted loop
    with opener(path, "rt") as f:
        lines = (line.rstrip() for line in f)

        # Group lines by truthiness: empty lines act as article separators,
        # so only the non-empty groups are turned into articles.
        for nonblank, stanza in groupby(lines, key=bool):
            if nonblank:
                yield pubtator_stanza_to_article(list(stanza))
示例#3
0
def convert_pubtator(input_path, output_path):
    """Convert pubtators annotation list to BioC XML

    Keyword Arguments:
    input_path -- the path of pubtators annotation file
    output_path -- the path to output the BioC XML file
    """

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    opener = utilities.get_opener(output_path)
    with opener(output_path, 'wb') as xml_file:

        # Have to manually do this because hangs otherwise
        # Write the head of the xml file
        xml_shell = writer.tostring('UTF-8')
        *xml_head, xml_tail = xml_shell.rstrip().split(b'\n')
        for line in xml_head:
            xml_file.write(line + b'\n')

        article_generator = read_bioconcepts2pubtator_offsets(input_path)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["pubmed_id"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # BioC offsets are character positions: the abstract begins after
            # the title text plus one separator character. The original code
            # assigned the abstract *text* here, which is not an offset.
            abstract_passage.offset = str(len(article["title"]) + 1)
            abstract_passage.text = article["abstract"]

            # annotation ids must be unique across both passages of a document
            id_index = 0
            for tag in article["title_annot"]:
                title_passage.annotations.append(bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            for tag in article["abstract_annot"]:
                abstract_passage.annotations.append(bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # serialize one document at a time through a throwaway parent so
            # the full collection never accumulates in memory
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            xml_file.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        xml_file.write(xml_tail + b'\n')
def filter_tags(infile, outfile):
    """Filter pubtator tags down to only hetnet tags.

    Chemical tags are mapped to drugbank ids, disease tags to disease
    ontology (DOID) codes, and gene tags are kept only when they are
    human genes found in the hetnet gene table.

    Keyword arguments:
    infile -- the name of the file to read
    outfile -- the name of the output file
    """

    output_columns = ["pubmed_id", "type", "identifier", "offset", "end"]
    print_header = True
    hetnet_chemical_df = load_chemical_df()
    hetnet_disease_df = load_disease_df()
    hetnet_gene_df = load_gene_df()
    csv_opener = utilities.get_opener(outfile)

    with csv_opener(outfile, "wt") as tsv_file:
        for extracted_tag_df in tqdm.tqdm(get_tag_chunks(infile)):

            # Convert chemical IDs to drugbank identifiers
            chemical_merged_df = pd.merge(
                extracted_tag_df[extracted_tag_df["type"] == "Chemical"],
                hetnet_chemical_df[["drugbank_id", "identifier"]],
                left_on="identifier",
                right_on="identifier")
            chemical_merged_df = chemical_merged_df.drop_duplicates()
            chemical_merged_df["type"] = "Compound"
            chemical_merged_df = chemical_merged_df[[
                "pubmed_id", "type", "offset", "end", "drugbank_id"
            ]].rename(columns={"drugbank_id": "identifier"})

            # Convert Disease IDs to disease-ontology codes
            disease_merged_df = pd.merge(
                extracted_tag_df[extracted_tag_df["type"] == "Disease"],
                hetnet_disease_df[["doid_code", "resource_id"]],
                left_on="identifier",
                right_on="resource_id")
            disease_merged_df = disease_merged_df.drop_duplicates()
            disease_merged_df = disease_merged_df[[
                "pubmed_id", "type", "offset", "end", "doid_code"
            ]].rename(columns={"doid_code": "identifier"})

            # Verify Gene IDs are human genes
            gene_df = extracted_tag_df[extracted_tag_df["type"] == "Gene"]
            gene_final_df = gene_df[gene_df["identifier"].isin(
                hetnet_gene_df["GeneID"])]

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported way to stack the three filtered frames
            final_df = pd.concat(
                [gene_final_df, chemical_merged_df, disease_merged_df],
                sort=False)

            # only the first chunk writes the header row
            (final_df[output_columns]
             .sort_values(["pubmed_id", "offset"])
             .to_csv(tsv_file, sep="\t", index=False, header=print_header))
            print_header = False