Exemplo n.º 1
0
def get_transmir_gold_ann_set(goldpath, entitytype):
    """Load gold entities and relations from a tab-separated TransmiR file.

    Only rows whose last column equals "human" (case-insensitively) are used.
    For such a row, column 3 is normalised with ``mirna_graph.map_label``,
    column 0 with ``get_uniprot_name``, and column 8 is split on ";" to get
    the document ids the row applies to.

    Args:
        goldpath: path of the TSV file to read.
        entitytype: "mirna" or "protein" selects which normalised name is
            collected as an entity; any other value collects no entities.

    Returns:
        A ``(gold_entities, gold_relations)`` pair. ``gold_entities`` is a set
        of ``("PMID<id>", "0", "0", lowercased name)`` tuples.
        ``gold_relations`` maps ``("PMID<id>", mirna, gene, "mirna=>gene")``
        to a one-element list holding the un-normalised "mirna=>gene" string.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    gold_entities = set()
    gold_relations = {}
    with open(goldpath, 'r') as goldfile:
        for row in goldfile:
            fields = row.strip().split("\t")
            if fields[-1].lower() != "human":
                continue
            doc_ids = fields[8].split(";")
            norm_mirna = mirna_graph.map_label(fields[3])
            # NOTE(review): this compares the whole map_label result with an
            # int; presumably a match score was meant -- confirm against
            # map_label before changing it.
            if norm_mirna < 99:
                norm_mirna[0] = fields[3]
            norm_gene = get_uniprot_name(fields[0])
            original_relation = fields[3] + "=>" + fields[0]
            for doc_id in doc_ids:
                pmid = "PMID" + doc_id
                if entitytype == "mirna":
                    gold_entities.add((pmid, "0", "0", norm_mirna[0].lower()))
                elif entitytype == "protein":
                    gold_entities.add((pmid, "0", "0", norm_gene[0].lower()))
                relation_key = (pmid, norm_mirna[0], norm_gene[0],
                                norm_mirna[0] + "=>" + norm_gene[0])
                gold_relations[relation_key] = [original_relation]
    return gold_entities, gold_relations
Exemplo n.º 2
0
def get_ddi_mirna_gold_ann_set(goldpath, entitytype, pairtype):
    """Load gold entities and pairs from a DDI-style XML corpus file.

    Every ``document`` element is walked sentence by sentence. Entity offsets
    are shifted by the length of the document text accumulated so far and the
    end offset is incremented by one. Entities whose ``charOffset`` contains
    ";" are skipped.

    Side effect: for every accepted pair, a "<mirna>\\t<gene>" line (names as
    returned by ``mirna_graph.map_label`` / ``get_uniprot_name``) is written
    to ``corpora/miRNACorpus/miRNAcorpus_relations.txt``, which is truncated
    on every call.

    Args:
        goldpath: path of the XML file to parse.
        entitytype: only entities whose ``type_match``-mapped type equals this
            value are added to the returned entity set.
        pairtype: only pairs whose ``type_match``-mapped type equals this
            value and whose ``interaction`` attribute is "True" are returned.

    Returns:
        A ``(gold_offsets, gold_pairs)`` pair. ``gold_offsets`` is a set of
        ``(doc id, start, end, text)`` tuples. ``gold_pairs`` is a set of
        ``(doc id, (start, end) of e1, (start, end) of e2,
        "<e1 text>=<pair type>><e2 text>")`` tuples.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    gold_offsets = set()
    gold_pairs = set()
    original_id_to_offset = {}
    original_id_to_text = {}
    root = ET.parse(goldpath).getroot()
    # The context manager guarantees the relations file is flushed and closed
    # even when parsing or normalisation raises part-way through.
    with open("corpora/miRNACorpus/miRNAcorpus_relations.txt", 'w') as rfile:
        for doc in root.findall("document"):
            did = doc.get('id')
            doctext = ""
            for sentence in doc.findall('sentence'):
                sentence_text = sentence.get('text')
                # Sentence-relative offsets become document-relative ones.
                shift = len(doctext)
                for entity in sentence.findall('entity'):
                    entity_offset = entity.get('charOffset')
                    if ";" in entity_offset:
                        # Discontinuous span: not representable as one range.
                        continue
                    offsets = entity_offset.split("-")
                    start = int(offsets[0]) + shift
                    end = int(offsets[1]) + shift + 1
                    etype = type_match.get(entity.get("type"))
                    original_id_to_offset[entity.get("id")] = (start, end)
                    original_id_to_text[entity.get("id")] = entity.get("text")
                    if etype == entitytype:
                        gold_offsets.add((did, start, end, entity.get("text")))
                for pair in sentence.findall('pair'):
                    try:
                        p_type = type_match[pair.get("type")]
                    except KeyError:
                        # Pair types without a mapping are ignored.
                        continue
                    if p_type != pairtype or pair.get("interaction") != "True":
                        continue
                    e1, e2 = pair.get("e1"), pair.get("e2")
                    if (e1 not in original_id_to_offset
                            or e2 not in original_id_to_offset):
                        # The entity was skipped above (discontinuous offset)
                        # or is missing; there is no span to report.
                        logging.warning(
                            "skipping pair {}-{} in {}: unknown entity".format(
                                e1, e2, did))
                        continue
                    e1_text = original_id_to_text[e1]
                    e2_text = original_id_to_text[e2]
                    gold_pairs.add((did, original_id_to_offset[e1],
                                    original_id_to_offset[e2],
                                    "{}={}>{}".format(e1_text, p_type, e2_text)))
                    norm_mirna = mirna_graph.map_label(e1_text)
                    # NOTE(review): this compares the whole map_label result
                    # with an int; presumably a match score was meant --
                    # confirm against map_label before changing it.
                    if norm_mirna < 99:
                        norm_mirna[0] = e1_text
                    norm_gene = get_uniprot_name(e2_text)
                    rfile.write("{}\t{}\n".format(norm_mirna[0], norm_gene[0]))
                # generate the full text of this document
                doctext += " " + sentence_text
    return gold_offsets, gold_pairs
Exemplo n.º 3
0
def get_ddi_mirna_gold_ann_set(goldpath, entitytype, pairtype):
    """Collect gold entity spans and interacting pairs from a DDI-style XML file.

    Documents are processed sentence by sentence. An entity's sentence-level
    ``charOffset`` is shifted by the length of the document text built so
    far, and one is added to its end offset. Entities whose ``charOffset``
    contains ";" are ignored.

    Side effect: each accepted pair also produces a "<mirna>\\t<gene>" line
    (names as returned by ``mirna_graph.map_label`` / ``get_uniprot_name``) in
    ``corpora/miRNACorpus/miRNAcorpus_relations.txt``; the file is opened in
    'w' mode, so it is rewritten on every call.

    Args:
        goldpath: path of the XML file to parse.
        entitytype: entities are returned only when ``type_match`` maps their
            ``type`` attribute to this value.
        pairtype: pairs are returned only when ``type_match`` maps their
            ``type`` attribute to this value and ``interaction`` is "True".

    Returns:
        ``(gold_offsets, gold_pairs)``: a set of ``(doc id, start, end, text)``
        tuples and a set of ``(doc id, e1 span, e2 span,
        "<e1 text>=<pair type>><e2 text>")`` tuples.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    gold_offsets = set()
    gold_pairs = set()
    original_id_to_offset = {}
    original_id_to_text = {}
    tree = ET.parse(goldpath)
    root = tree.getroot()
    relations_path = "corpora/miRNACorpus/miRNAcorpus_relations.txt"
    # Using ``with`` closes the output file on every exit path, including
    # exceptions raised while walking the corpus.
    with open(relations_path, 'w') as rfile:
        for doc in root.findall("document"):
            did = doc.get('id')
            doctext = ""
            for sentence in doc.findall('sentence'):
                sentence_text = sentence.get('text')
                base = len(doctext)  # offset of this sentence in the document
                for entity in sentence.findall('entity'):
                    entity_offset = entity.get('charOffset')
                    if ";" in entity_offset:
                        # Discontinuous entity: no single (start, end) span.
                        continue
                    first, last = entity_offset.split("-")[:2]
                    start, end = int(first) + base, int(last) + base + 1
                    entity_id = entity.get("id")
                    entity_text = entity.get("text")
                    etype = type_match.get(entity.get("type"))
                    original_id_to_offset[entity_id] = (start, end)
                    original_id_to_text[entity_id] = entity_text
                    if etype == entitytype:
                        gold_offsets.add((did, start, end, entity_text))
                for pair in sentence.findall('pair'):
                    try:
                        p_type = type_match[pair.get("type")]
                    except KeyError:
                        # Unmapped pair types are not part of the gold set.
                        continue
                    p_true = pair.get("interaction")
                    if not (p_type == pairtype and p_true == "True"):
                        continue
                    e1, e2 = pair.get("e1"), pair.get("e2")
                    if (e1 not in original_id_to_offset
                            or e2 not in original_id_to_offset):
                        # One side was skipped above (discontinuous offset) or
                        # never declared, so the pair cannot be located.
                        logging.warning(
                            "skipping pair {}-{} in {}: unknown entity".format(
                                e1, e2, did))
                        continue
                    mirna_text = original_id_to_text[e1]
                    gene_text = original_id_to_text[e2]
                    gold_pair = (did, original_id_to_offset[e1],
                                 original_id_to_offset[e2],
                                 "{}={}>{}".format(mirna_text, p_type,
                                                   gene_text))
                    gold_pairs.add(gold_pair)
                    norm_mirna = mirna_graph.map_label(mirna_text)
                    # NOTE(review): the whole map_label result is compared
                    # with an int here; presumably a match score was meant --
                    # confirm against map_label before changing it.
                    if norm_mirna < 99:
                        norm_mirna[0] = mirna_text
                    norm_gene = get_uniprot_name(gene_text)
                    rfile.write("{}\t{}\n".format(norm_mirna[0], norm_gene[0]))
                # generate the full text of this document
                doctext += " " + sentence_text
    return gold_offsets, gold_pairs
Exemplo n.º 4
0
def get_mirtex_gold_ann_set(goldpath, entitytype, pairtype):
    """Load gold entities and relations from a directory of miRTex annotations.

    Entities come from the ``*.ann`` files in ``goldpath``: every line that
    starts with "T" is split into id, annotation and text, and the annotation
    into type, start and end. Relations come from ``annotations.tsv`` in the
    same directory; rows with fewer than three tab-separated fields are
    ignored, and fields 1 and 2 are ";"-separated miRNA and gene lists.

    Args:
        goldpath: directory holding the ``.ann`` files and ``annotations.tsv``.
        entitytype: only entities whose ``type_match``-mapped type equals this
            value are returned.
        pairtype: "all" keeps every relation row; otherwise a row is kept only
            when ``type_match`` maps its last two fields (joined by a space)
            to this value.

    Returns:
        A ``(gold_offsets, gold_relations)`` pair. ``gold_offsets`` is a set
        of ``(doc id, start, end, text)`` tuples. ``gold_relations`` maps
        ``(doc id, mirna, gene, "mirna=>gene")`` to an empty list. Doc ids
        have the form ``goldpath + '/' + <name without extension>``.

    Raises:
        KeyError: if an entity type in a ``.ann`` file has no ``type_match``
            entry.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    annfiles = [
        goldpath + '/' + f for f in os.listdir(goldpath) if f.endswith('.ann')
    ]
    gold_offsets = set()
    for annpath in annfiles:
        # Drop only the extension. Splitting the whole path on "." would cut
        # the id at the first dot in goldpath itself (e.g. "./corpus"), and it
        # would then no longer match the relation ids built below.
        did = os.path.splitext(annpath)[0]
        with open(annpath, 'r') as txt:
            for line in txt:
                if not line.startswith("T"):
                    continue
                _tid, ann, etext = line.strip().split("\t")
                etype, dstart, dend = ann.split(" ")
                if entitytype == type_match[etype]:
                    gold_offsets.add((did, int(dstart), int(dend), etext))
    gold_relations = {}
    with open(goldpath + "/" + "annotations.tsv") as afile:
        for l in afile:
            v = l.strip().split("\t")
            if len(v) < 3:
                continue
            did = goldpath + '/' + v[0]
            if not (pairtype == "all"
                    or type_match.get(" ".join(v[-2:])) == pairtype):
                continue
            # The gene list is the same for every miRNA of the row, so clean
            # it once instead of once per miRNA.
            genes = [gene.replace('"', '') for gene in v[2].split(";")]
            for mirna in v[1].split(";"):
                mirna = mirna.replace('"', '')
                norm_mirna = mirna_graph.map_label(mirna)
                # NOTE(review): this compares the whole map_label result with
                # an int; presumably a match score was meant -- confirm
                # against map_label before changing it.
                if norm_mirna < 99:
                    norm_mirna[0] = mirna
                for gene in genes:
                    norm_gene = get_uniprot_name(gene)
                    gold_relations[(did, norm_mirna[0], norm_gene[0],
                                    norm_mirna[0] + "=>" +
                                    norm_gene[0])] = []
    return gold_offsets, gold_relations
Exemplo n.º 5
0
def get_mirtex_gold_ann_set(goldpath, entitytype, pairtype):
    """Read miRTex gold entities and relations from the directory ``goldpath``.

    Entity spans are taken from the "T" lines of every ``*.ann`` file
    (tab-separated id, "type start end", text). Relations are taken from
    ``annotations.tsv``: rows with fewer than three tab-separated fields are
    skipped, and fields 1 and 2 hold ";"-separated miRNA and gene names.

    Args:
        goldpath: directory containing the ``.ann`` files and
            ``annotations.tsv``.
        entitytype: entities are kept only when ``type_match`` maps their type
            to this value.
        pairtype: "all" keeps every relation row; otherwise a row is kept only
            when ``type_match`` maps its last two fields (joined by a space)
            to this value.

    Returns:
        ``(gold_offsets, gold_relations)``: a set of
        ``(doc id, start, end, text)`` tuples and a dict mapping
        ``(doc id, mirna, gene, "mirna=>gene")`` to an empty list. Doc ids
        are ``goldpath + '/' + <name without extension>``.

    Raises:
        KeyError: if an entity type in a ``.ann`` file has no ``type_match``
            entry.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    gold_offsets = set()
    ann_names = [name for name in os.listdir(goldpath) if name.endswith('.ann')]
    for name in ann_names:
        ann_path = goldpath + '/' + name
        # os.path.splitext removes just the ".ann" suffix. Splitting the full
        # path on "." would truncate the id at any dot in goldpath (e.g.
        # "./corpus"), so it would not match the relation ids built below.
        did = os.path.splitext(ann_path)[0]
        with open(ann_path, 'r') as ann_file:
            for line in ann_file:
                if line.startswith("T"):
                    _tid, ann, etext = line.strip().split("\t")
                    etype, dstart, dend = ann.split(" ")
                    if entitytype == type_match[etype]:
                        gold_offsets.add((did, int(dstart), int(dend), etext))
    gold_relations = {}
    with open(goldpath + "/" + "annotations.tsv") as afile:
        for row in afile:
            v = row.strip().split("\t")
            if len(v) < 3:
                continue
            did = goldpath + '/' + v[0]
            wanted = (pairtype == "all"
                      or type_match.get(" ".join(v[-2:])) == pairtype)
            if not wanted:
                continue
            # Clean the gene names once per row rather than once per miRNA.
            genes = [gene.replace('"', '') for gene in v[2].split(";")]
            for mirna in v[1].split(";"):
                mirna = mirna.replace('"', '')
                norm_mirna = mirna_graph.map_label(mirna)
                # NOTE(review): the whole map_label result is compared with an
                # int here; presumably a match score was meant -- confirm
                # against map_label before changing it.
                if norm_mirna < 99:
                    norm_mirna[0] = mirna
                for gene in genes:
                    norm_gene = get_uniprot_name(gene)
                    relation_key = (did, norm_mirna[0], norm_gene[0],
                                    norm_mirna[0] + "=>" + norm_gene[0])
                    gold_relations[relation_key] = []
    return gold_offsets, gold_relations
Exemplo n.º 6
0
def get_transmir_gold_ann_set(goldpath, entitytype):
    """Build the gold entity set and relation dict from a TransmiR TSV file.

    Rows are used only when their last column is "human" (any letter case).
    Column 0 is passed to ``get_uniprot_name``, column 3 to
    ``mirna_graph.map_label``, and column 8 is a ";"-separated list of
    document ids; one entity/relation is emitted per document id.

    Args:
        goldpath: path of the TSV file to read.
        entitytype: "mirna" collects the normalised column-3 names, "protein"
            the normalised column-0 names; other values collect no entities.

    Returns:
        ``(gold_entities, gold_relations)``. Entities are
        ``("PMID<id>", "0", "0", lowercased name)`` tuples. Relations map
        ``("PMID<id>", mirna, gene, "mirna=>gene")`` to a single-item list
        with the original (un-normalised) "mirna=>gene" text.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    gold_entities = set()
    gold_relations = {}
    with open(goldpath, 'r') as goldfile:
        for record in goldfile:
            cols = record.strip().split("\t")
            is_human = cols[-1].lower() == "human"
            if not is_human:
                continue
            pmids = cols[8].split(";")
            norm_mirna = mirna_graph.map_label(cols[3])
            # NOTE(review): the whole map_label result is compared with an
            # int here; presumably a match score was meant -- confirm against
            # map_label before changing it.
            if norm_mirna < 99:
                norm_mirna[0] = cols[3]
            norm_gene = get_uniprot_name(cols[0])
            for did in pmids:
                doc_key = "PMID" + did
                if entitytype == "mirna":
                    entity = (doc_key, "0", "0", norm_mirna[0].lower())
                    gold_entities.add(entity)
                elif entitytype == "protein":
                    entity = (doc_key, "0", "0", norm_gene[0].lower())
                    gold_entities.add(entity)
                normalised = norm_mirna[0] + "=>" + norm_gene[0]
                gold_relations[(doc_key, norm_mirna[0], norm_gene[0],
                                normalised)] = [cols[3] + "=>" + cols[0]]
    return gold_entities, gold_relations
Exemplo n.º 7
0
# read transmir database and generate corpus
from text.mirna_entity import mirna_graph
from text.protein_entity import get_uniprot_name

# Tab-separated TransmiR dump read by the loop below, one record per line.
db_name = "data/transmir_v1.2.tsv"
# Accumulators that are filled only from rows whose last column is "human".
tfs = set()
diseases = set()
funcs = set()
pmids = set()
# NOTE: the two dicts below are never written to in the code visible here.
mirnas = {}  # mirname: (function, disease)
entries = {}  # (tfname, mirname): active
with open(db_name, 'r') as dbfile:
    for line in dbfile:
        tsv = line.strip().split("\t")
        # Both names are normalised for every row, before the "human" filter
        # is applied; only the first element of each helper result is kept.
        tfname = get_uniprot_name(tsv[0])
        mirname = mirna_graph.map_label(tsv[3])
        tfname = tfname[0]
        mirname = mirname[0]
        # Columns 5, 6 and 8 are ";"-separated lists; judging by the variable
        # names they hold functions, diseases and PubMed ids -- TODO confirm
        # against the TransmiR file format.
        func = tsv[5].split(";")
        disease = tsv[6].split(";")
        active = tsv[7]  # not used by the code visible here
        pmid = tsv[8].split(";")
        # The last column is presumably the organism; verify against the data.
        if tsv[-1].lower() == "human":
            tfs.add(tfname.replace("-", ""))  # uniform TF names
            for f in func:
                funcs.add(f.strip())
            for d in disease:
                # Skip the placeholder entry that only points to HMDD. The
                # comparison is made before stripping whitespace.
                if d != "see HMDD (http://cmbi.bjmu.edu.cn/hmdd)":
                    diseases.add(d.strip())
            for p in pmid:
                pmids.add(p.strip())
Exemplo n.º 8
0
# read transmir database and generate corpus
from text.mirna_entity import mirna_graph
from text.protein_entity import get_uniprot_name

# TransmiR dump (tab-separated, one record per line) consumed by the loop below.
db_name = "data/transmir_v1.2.tsv"
# Sets collected from the rows whose last column equals "human".
tfs = set()
diseases = set()
funcs = set()
pmids = set()
# NOTE: neither of these dicts is populated by the code visible here.
mirnas = {} # mirname: (function, disease)
entries = {} # (tfname, mirname): active
with open(db_name, 'r') as dbfile:
    for line in dbfile:
        tsv = line.strip().split("\t")
        # The helpers run on every row, including rows that the "human" check
        # below discards; element 0 of each result is used as the name.
        tfname = get_uniprot_name(tsv[0])
        mirname = mirna_graph.map_label(tsv[3])
        tfname = tfname[0]
        mirname = mirname[0]
        # ";"-separated list columns; the variable names suggest functions,
        # diseases and PubMed ids -- TODO confirm against the file format.
        func = tsv[5].split(";")
        disease = tsv[6].split(";")
        active = tsv[7]  # read but not used by the code visible here
        pmid = tsv[8].split(";")
        # Presumably the organism column; verify against the data.
        if tsv[-1].lower() == "human":
            tfs.add(tfname.replace("-", "")) # uniform TF names
            for f in func:
                funcs.add(f.strip())
            for d in disease:
                # Ignore the placeholder that merely refers the reader to
                # HMDD. The test is applied to the unstripped value.
                if d != "see HMDD (http://cmbi.bjmu.edu.cn/hmdd)":
                    diseases.add(d.strip())
            for p in pmid:
                pmids.add(p.strip())