Example #1
def main(go_file, uniprot_file, filter_exp, prop_annots, cafa_targets,
         out_file):
    go = Ontology(go_file, with_rels=True)

    proteins, accessions, sequences, annotations, interpros, orgs = load_data(
        uniprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs
    })

    if filter_exp:
        logging.info('Filtering proteins with experimental annotations')
        index = []
        annotations = []
        for i, row in enumerate(df.itertuples()):
            annots = []
            for annot in row.annotations:
                go_id, code = annot.split('|')
                if is_exp_code(code):
                    annots.append(go_id)
            # Ignore proteins without experimental annotations
            if len(annots) == 0:
                continue
            index.append(i)
            annotations.append(annots)
        df = df.iloc[index]
        df = df.reset_index()
        df['annotations'] = annotations

    if cafa_targets:
        logging.info('Filtering cafa target proteins')
        index = []
        for i, row in enumerate(df.itertuples()):
            if is_cafa_target(row.orgs):
                index.append(i)
        df = df.iloc[index]
        df = df.reset_index()

    if prop_annots:
        prop_annotations = []
        for i, row in df.iterrows():
            # Propagate annotations
            annot_set = set()
            annots = row['annotations']
            for go_id in annots:
                go_id = go_id.split('|')[0]  # In case the ID still carries an evidence code
                annot_set |= go.get_anchestors(go_id)
            annots = list(annot_set)
            prop_annotations.append(annots)
        df['annotations'] = prop_annotations

    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % (len(df), ))
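
Note: `is_exp_code` and `is_cafa_target` are used above but not defined in this listing. A minimal sketch, assuming the standard experimental GO evidence codes and an illustrative subset of CAFA target taxa:

# Hypothetical helpers for Example #1; the exact code/taxon sets are assumptions.
EXP_CODES = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'}
CAFA_TARGETS = {'9606', '10090', '10116', '7227', '559292'}  # illustrative subset

def is_exp_code(code):
    # True if the GO evidence code denotes an experimental annotation
    return code in EXP_CODES

def is_cafa_target(org):
    # True if the organism taxon ID is among the CAFA targets
    return org in CAFA_TARGETS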
Example #2
def main(go_file, swissprot_file, out_file):
    go = Ontology(go_file, with_rels=True)

    if swissprot_file.endswith("gz"):
        proteins, accessions, sequences, annotations, interpros, orgs = load_data_gzip(
            swissprot_file)
    else:
        proteins, accessions, sequences, annotations, interpros, orgs = load_data(
            swissprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs
    })

    logging.info('Filtering proteins with experimental annotations')
    index = []
    annotations = []
    for i, row in enumerate(df.itertuples()):
        annots = []
        for annot in row.annotations:
            go_id, code = annot.split('|')
            if is_exp_code(code):
                annots.append(go_id)
        # Ignore proteins without experimental annotations
        if len(annots) == 0:
            continue
        index.append(i)
        annotations.append(annots)
    df = df.iloc[index]
    df = df.reset_index()
    df['exp_annotations'] = annotations

    prop_annotations = []
    for i, row in df.iterrows():
        # Propagate annotations
        annot_set = set()
        annots = row['exp_annotations']
        for go_id in annots:
            annot_set |= go.get_anchestors(go_id)
        annots = list(annot_set)
        prop_annotations.append(annots)
    df['prop_annotations'] = prop_annotations

    cafa_target = []
    for row in df.itertuples():
        cafa_target.append(is_cafa_target(row.orgs))
    df['cafa_target'] = cafa_target

    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % (len(df), ))
Example #3
def main(go_file, uniprot_file, filter_exp, prop_annots, out_file):
    go = Ontology(go_file, with_rels=True)

    proteins, accessions, sequences, annotations, interpros, orgs, genes, gene_names = load_data(uniprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs,
        'genes': genes,
        'gene_names': gene_names
    })

    # Keep only human proteins (NCBI taxon 9606)
    df = df[df['orgs'] == '9606']
    
    logging.info('Propagating experimental and IEA annotations')
    annotations = []
    iea_annotations = []
    for row in df.itertuples():
        annots = set()
        iea_annots = set()
        for annot in row.annotations:
            go_id, code = annot.split('|')
            anch_set = go.get_anchestors(go_id)
            if is_exp_code(code):
                annots |= anch_set
            iea_annots |= anch_set
        annots = list(annots)
        iea_annots = list(iea_annots)
        annotations.append(annots)
        iea_annotations.append(iea_annots)
    df['exp_annotations'] = annotations
    df['iea_annotations'] = iea_annotations

    
    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % (len(df), ))
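
Note: `Ontology.get_anchestors` (spelled this way throughout these scripts) is assumed to return a term together with all of its ancestors. A sketch, assuming the ontology stores an `is_a` parent list per term:

from collections import deque

def get_anchestors(self, term_id):
    # Sketch: breadth-first closure over parent links; assumes self.ont maps
    # a term ID to a dict with an 'is_a' list of parent IDs.
    if term_id not in self.ont:
        return set()
    term_set = set()
    queue = deque([term_id])
    while queue:
        t_id = queue.popleft()
        if t_id not in term_set:
            term_set.add(t_id)
            queue.extend(self.ont[t_id]['is_a'])
    return term_set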
Example #4
def main(train_data_file, test_data_file, diamond_scores_file, ont):

    go_rels = Ontology('data/go.obo', with_rels=True)

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['prop_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(test_data_file)
    test_annotations = test_df['prop_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = sorted(allgos)
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score

        blast_preds.append(annots)

    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in blast_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)

            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
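
Note: `evaluate_annotations` is not shown in these listings; some examples unpack 4 return values, others 8. A sketch of the 4-value variant, assuming CAFA-style protein-centric Fmax and Smin built from the ontology's information content:

import math

def evaluate_annotations(go, real_annots, pred_annots):
    # Sketch of CAFA-style protein-centric evaluation; the variants in these
    # scripts differ in how many values they return (4 here, 8 elsewhere).
    total = len(real_annots)
    p = 0.0
    r = 0.0
    p_total = 0
    ru = 0.0
    mi = 0.0
    for real, pred in zip(real_annots, pred_annots):
        tp = real.intersection(pred)
        fp = pred - tp
        fn = real - tp
        # Remaining uncertainty (missed terms) and misinformation (wrong
        # terms), weighted by information content
        ru += sum(go.get_ic(go_id) for go_id in fn)
        mi += sum(go.get_ic(go_id) for go_id in fp)
        if len(real) > 0:
            r += len(tp) / len(real)
        if len(pred) > 0:
            p_total += 1
            p += len(tp) / len(pred)
    ru /= total
    mi /= total
    r /= total
    p = p / p_total if p_total > 0 else 0.0
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s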
Example #5
def main(train_data_file, test_data_file, out_file, terms_file, root_class, fold):
    # Cross validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file
    
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)

    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc): 
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: code below this early return never executes
    
    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)

        preds = []
        for i, row in enumerate(test_df.itertuples()):
            preds.append(new_annots)
        
    
        # Filter classes
        
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            pmax = prec
            rmax = rec
            max_preds = preds
        if smin > s:
            smin = s
    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
Example #6
def main(train_data_file, test_data_file, terms_file, out_file, root_class,
         fold):
    # Cross validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file

    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i, row in enumerate(test_df.itertuples()):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = row.preds[terms_dict[hp_id]]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: code below this early return never executes
    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            gene_id = row.genes
            annots_dict = {}

            for j, score in enumerate(row.preds):
                hp_id = terms[j]
                # score = score * (1 - alpha)
                if hp_id in annots_dict:
                    annots_dict[hp_id] += score
                else:
                    annots_dict[hp_id] = score

            annots = set()
            for hp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(hp_id)
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)
            new_annots = new_annots.intersection(hp_set)
            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
            pmax = prec
            rmax = rec
        if smin > s:
            smin = s
    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
Example #7
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont,
         alpha):

    alpha /= 100.0
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    # Collect IC values of terms (not printed here)
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = sorted(allgos)
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    # print(len(go_set))
    deep_preds = []
    alphas = {
        NAMESPACES['mf']: 0.55,
        NAMESPACES['bp']: 0.59,
        NAMESPACES['cc']: 0.46
    }
    for i, row in enumerate(test_df.itertuples()):
        annots_dict = blast_preds[i].copy()
        for go_id in annots_dict:
            annots_dict[go_id] *= alphas[go_rels.get_namespace(go_id)]
        for j, score in enumerate(row.preds):
            go_id = terms[j]
            score *= 1 - alphas[go_rels.get_namespace(go_id)]
            if go_id in annots_dict:
                annots_dict[go_id] += score
            else:
                annots_dict[go_id] = score
        deep_preds.append(annots_dict)
    print('AUTHOR DeepGOPlus')
    print('MODEL 1')
    print('KEYWORDS sequence alignment.')
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        for go_id, score in deep_preds[i].items():
            print(f'{prot_id}\t{go_id}\t{score:.2f}')
    print('END')
    return  # NOTE: code below this early return never executes
    # Propagate scores
    # deepgo_preds = []
    # for annots_dict in deep_preds:
    #     annots = {}
    #     for go_id, score in annots_dict.items():
    #         for a_id in go_rels.get_anchestors(go_id):
    #             if a_id in annots:
    #                 annots[a_id] = max(annots[a_id], score)
    #             else:
    #                 annots[a_id] = score
    #     deepgo_preds.append(annots)

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    rus = []
    mis = []
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in deep_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)

            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
            go_rels, labels, preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(
            map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(
            f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
        )
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
Example #8
def main(go_file, mp_file, mp_annots_file, deepgo_annots_file, id_mapping_file,
         data_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')
    mp = Ontology(mp_file, with_rels=True)
    logging.info('MP loaded')

    logging.info('Load MP2Uniprot mapping')
    gene2prot = {}
    with open(id_mapping_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            if it[0] not in gene2prot:
                gene2prot[it[0]] = []
            gene2prot[it[0]] += list(it[6].split())
    logging.info('Loading MP annotations')
    mp_annots = {}
    df = pd.read_pickle(data_file)
    acc2prot = {}
    for row in df.itertuples():
        p_id = row.proteins
        acc_ids = row.accessions.split('; ')
        for acc_id in acc_ids:
            acc2prot[acc_id] = p_id
    with open(mp_annots_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            for mgi in it[6].split('|'):
                if mgi not in gene2prot:
                    continue
                prot_ids = gene2prot[mgi]
                mp_id = it[4]
                for prot_id in prot_ids:
                    if prot_id not in acc2prot:
                        continue
                    prot_id = acc2prot[prot_id]
                    if prot_id not in mp_annots:
                        mp_annots[prot_id] = set()
                    if mp.has_term(mp_id):
                        mp_annots[prot_id] |= mp.get_anchestors(mp_id)
    print('MP Annotations', len(mp_annots))
    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            annots = []
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots.append(go_id)
            dg_annots[prot_id] = it[1:]
            gos |= set(annots)
    print('DeepGO Annotations', len(dg_annots))
    print('Number of GOs', len(gos))
    go_df = pd.DataFrame({'gos': list(gos)})
    go_df.to_pickle('data/gos.pkl')

    logging.info('Processing annotations')

    cnt = Counter()
    for prot_id, annots in mp_annots.items():
        for term in annots:
            cnt[term] += 1

    deepgo_annots = []
    go_annots = []
    mpos = []
    prots = []
    sequences = []
    for row in df.itertuples():
        p_id = row.proteins
        if p_id in mp_annots:
            prots.append(p_id)
            mpos.append(mp_annots[p_id])
            go_annots.append(row.annotations)
            deepgo_annots.append(dg_annots[p_id])
            sequences.append(row.sequences)

    prots_set = set(prots)
    for key, val in mp_annots.items():
        if key not in prots_set:
            print(key)

    df = pd.DataFrame({
        'proteins': prots,
        'mp_annotations': mpos,
        'go_annotations': go_annots,
        'deepgo_annotations': deepgo_annots,
        'sequences': sequences
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Filter terms with annotations more than min_count
    res = {}
    for key, val in cnt.items():
        if key == 'MP:0000001':
            continue
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val

    logging.info(f'Number of terms {len(terms)}')

    # Save the list of terms
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)
Example #9
def helper(train_df, test_df, ont):
    go = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-cafa/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = train_df.rename(columns={"gos": "annotations"})
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = test_df.rename(columns={"gos": "annotations"})

    # Annotations
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.annotations:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(1, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])

            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print('Fscore: {}, S: {}, threshold: {}'.format(fscore, s, threshold))
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print('Fmax: {:0.3f}, Smin: {:0.3f}, threshold: {}'.format(
        fmax, smin, tmax))
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print('AUPR: {:0.3f}'.format(aupr))

    return [recalls, precisions, aupr]
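
A hypothetical caller for this helper (the pickle paths and layout are assumptions), overlaying the precision-recall curves of the three GO sub-ontologies on one plot:

import matplotlib.pyplot as plt
import pandas as pd

# Hypothetical usage: compare PR curves across the three GO sub-ontologies.
train_df = pd.read_pickle('data-cafa/train_data.pkl')  # assumed path
test_df = pd.read_pickle('data-cafa/test_data.pkl')    # assumed path
plt.figure()
for ont in ('mf', 'bp', 'cc'):
    recalls, precisions, aupr = helper(train_df, test_df, ont)
    plt.plot(recalls, precisions, lw=2, label=f'{ont} (AUPR = {aupr:0.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='lower right')
plt.savefig('aupr_all.pdf')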
Example #10
def main(train_data_file, valid_data_file, terms_file, diamond_scores_file,
         ont):

    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    valid_df = pd.read_pickle(valid_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    valid_annotations = valid_df['annotations'].values
    valid_annotations = list(map(lambda x: set(x), valid_annotations))
    go_rels.calculate_ic(annotations + valid_annotations)
    # Collect IC values of terms (not printed here)
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(valid_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = sorted(allgos)
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = valid_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    best_fmax = 0.0
    best_alpha = 0.0
    for alpha in range(44, 70):
        alpha /= 100.0
        deep_preds = []
        for i, row in enumerate(valid_df.itertuples()):
            annots_dict = blast_preds[i].copy()
            for go_id in annots_dict:
                annots_dict[go_id] *= alpha
            for j, score in enumerate(row.preds):
                go_id = terms[j]
                score *= 1 - alpha
                if go_id in annots_dict:
                    annots_dict[go_id] += score
                else:
                    annots_dict[go_id] = score
            deep_preds.append(annots_dict)

        fmax = 0.0
        tmax = 0.0
        precisions = []
        recalls = []
        smin = 1000000.0
        rus = []
        mis = []
        for t in range(14, 20):
            threshold = t / 100.0
            preds = []
            for i, row in enumerate(valid_df.itertuples()):
                annots = set()
                for go_id, score in deep_preds[i].items():
                    if score >= threshold:
                        annots.add(go_id)

                new_annots = set()
                for go_id in annots:
                    new_annots |= go_rels.get_anchestors(go_id)
                preds.append(new_annots)

            # Filter classes
            preds = list(
                map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

            fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
                go_rels, labels, preds)
            avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
            avg_ic = sum(
                map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                    fps)) / len(fps)
            print(
                f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
            )
            if fmax < fscore:
                fmax = fscore
                tmax = threshold
            if smin > s:
                smin = s
        if best_fmax < fmax:
            best_fmax = fmax
            best_alpha = alpha
        print(
            f'Alpha: {alpha} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
        )
    print(f'{best_alpha} {best_fmax}')
Example #11
def main(benchmark_file, train_data_file, hpo_file, terms_file, root_class):

    hp = Ontology(hpo_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    noknowledge_prots = set()
    with open('data-cafa/noknowledge_targets.txt') as f:
        for line in f:
            noknowledge_prots.add(line.strip())

    bench_annots = {}
    with open(benchmark_file) as f:
        for line in f:
            it = line.strip().split('\t')
            t_id = it[0]
            if t_id not in noknowledge_prots:
                continue
            hp_id = it[1]
            if t_id not in bench_annots:
                bench_annots[t_id] = set()
            bench_annots[t_id] |= hp.get_anchestors(hp_id)

    train_df = pd.read_pickle(train_data_file)
    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = []
    for t_id, hps in bench_annots.items():
        labels.append(hps)
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(bench_annots), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(bench_annots), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: code below this early return never executes

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)
        preds = []
        for t_id, hps in bench_annots.items():
            preds.append(new_annots)

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            pmax = prec
            rmax = rec
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
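
Note: `compute_roc` is not defined in these listings. A minimal sketch, assuming it wraps scikit-learn's ROC utilities over the flattened label/score arrays (the callers above guard against NaN results for degenerate columns):

from sklearn.metrics import auc, roc_curve

def compute_roc(labels, preds):
    # Sketch: ROC AUC over the flattened label/score arrays
    fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
    return auc(fpr, tpr)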
Example #12
def main(train_data_file, test_data_file, terms_file, rules_file):

    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)

    rule_annots = {}
    with open(rules_file) as f:
        for line in f:
            it = line.strip().split()
            go_id = it[0].replace('_', ':')
            hp_id = it[1].replace('_', ':')
            if go_id not in rule_annots:
                rule_annots[go_id] = set()
            rule_annots[go_id].add(hp_id)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)

    hp_set_anch = set()
    for hp_id in hp_set:
        hp_set_anch |= hp.get_anchestors(hp_id)

    labels = test_annotations
    # labels = list(map(lambda x: set(filter(lambda y: y in hp_set_anch, x)), labels))

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            gene_id = row.genes
            annots = set()
            for item in row.deepgo_annotations:
                go_id, score = item.split('|')
                score = float(score)
                if score >= threshold and go_id in rule_annots:
                    annots |= rule_annots[go_id]
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)

            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    test_df['hp_preds'] = max_preds
    test_df.to_pickle('data/predictions_max.pkl')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
Example #13
def main(train_data_file, preds_file, ont):

    go = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(preds_file)
    # Annotations
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.gos:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for threshold in np.arange(0.005, 1, .01):
        # threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])

            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
Example #14
def main(go_file, train_sequences_file, train_annotations_file,
         test_sequences_file, test_annotations_file, out_terms_file,
         train_data_file, test_data_file, min_count):
    logging.info('Loading GO')
    go = Ontology(go_file, with_rels=True)

    logging.info('Loading training annotations')
    train_annots = {}
    with open(train_annotations_file, 'r') as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            if prot_id not in train_annots:
                train_annots[prot_id] = set()
            go_id = it[1]
            train_annots[prot_id].add(go_id)

    logging.info('Loading training sequences')
    info, seqs = read_fasta(train_sequences_file)
    proteins = []
    sequences = []
    annotations = []
    for prot_info, sequence in zip(info, seqs):
        prot_id = prot_info.split()[0]
        if prot_id in train_annots:
            proteins.append(prot_id)
            sequences.append(sequence)
            annotations.append(train_annots[prot_id])

    prop_annotations = []
    cnt = Counter()
    for annots in annotations:
        # Propagate annotations
        annots_set = set()
        for go_id in annots:
            annots_set |= go.get_anchestors(go_id)
        prop_annotations.append(annots_set)
        for go_id in annots_set:
            cnt[go_id] += 1

    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'annotations': prop_annotations,
    })
    logging.info(f'Train proteins: {len(df)}')
    logging.info(f'Saving training data to {train_data_file}')
    df.to_pickle(train_data_file)

    # Filter terms with annotations more than min_count
    res = {}
    for key, val in cnt.items():
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        terms += val

    logging.info(f'Number of terms {len(terms)}')
    logging.info(f'Saving terms to {out_terms_file}')

    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)

    logging.info('Loading testing annotations')
    test_annots = {}
    with open(test_annotations_file, 'r') as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            if prot_id not in test_annots:
                test_annots[prot_id] = set()
            go_id = it[1]
            test_annots[prot_id].add(go_id)

    logging.info('Loading testing sequences')
    info, seqs = read_fasta(test_sequences_file)
    proteins = []
    sequences = []
    annotations = []
    for prot_info, sequence in zip(info, seqs):
        prot_id = prot_info.split()[0]
        if prot_id in test_annots:
            proteins.append(prot_id)
            sequences.append(sequence)
            annotations.append(test_annots[prot_id])

    prop_annotations = []
    for annots in annotations:
        # Propagate annotations
        annots_set = set()
        for go_id in annots:
            annots_set |= go.get_anchestors(go_id)
        prop_annotations.append(annots_set)

    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'annotations': prop_annotations,
    })
    logging.info(f'Test proteins {len(df)}')
    logging.info(f'Saving testing data to {test_data_file}')
    df.to_pickle(test_data_file)
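
Note: `read_fasta` here returns two parallel lists (header lines and sequences). A sketch under that assumption:

def read_fasta(filename):
    # Sketch of the FASTA reader assumed above: parallel lists of header
    # lines (without the leading '>') and sequences.
    info, seqs = [], []
    inf, seq = '', ''
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if seq:
                    info.append(inf)
                    seqs.append(seq)
                inf, seq = line[1:], ''
            else:
                seq += line
    if seq:
        info.append(inf)
        seqs.append(seq)
    return info, seqs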
Example #15
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont,
         alpha):

    alpha /= 100.0
    mp = Ontology('data/mp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['mp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['mp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    mp.calculate_ic(annotations)
    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # GO2HP preds
    rules = {}
    with open('data/go2hp.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            go_id = it[0].replace('_', ':')
            mp_ids = list(map(lambda x: x.replace('_', ':'), it[1:]))
            rules[go_id] = mp_ids  # NOTE: a repeated go_id overwrites earlier lines
    pheno2go_preds = {}
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        if prot_id not in pheno2go_preds:
            pheno2go_preds[prot_id] = {}
        for item in row.deepgo_annotations:
            go_id, score = item.split('|')
            if go_id in rules:
                for mp_id in rules[go_id]:
                    pheno2go_preds[prot_id][mp_id] = max(
                        float(score), pheno2go_preds[prot_id].get(mp_id, 0))

    labels = test_annotations
    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            prot_id = row.proteins
            annots_dict = {}  # pheno2go_preds[prot_id] disabled here
            for j, score in enumerate(row.preds):
                mp_id = terms[j]
                annots_dict[mp_id] = max(score, annots_dict.get(mp_id, 0))

            annots = set()
            for mp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(mp_id)
            new_annots = set()
            for mp_id in annots:
                new_annots |= mp.get_anchestors(mp_id)
            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(mp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
Example #16
def main(in_file, out_file, go_file, model_file, terms_file, annotations_file,
         chunk_size, diamond_file, threshold, batch_size, alpha):
    # Load GO and read list of all terms
    go = Ontology(go_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    # Read known experimental annotations
    annotations = {}
    df = pd.read_pickle(annotations_file)
    for row in df.itertuples():
        annotations[row.proteins] = set(row.prop_annotations)

    go.calculate_ic(annotations.values())

    diamond_preds = {}
    mapping = {}
    with gzip.open(diamond_file, 'rt') as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in mapping:
                mapping[it[0]] = {}
            mapping[it[0]][it[1]] = float(it[2])
    for prot_id, sim_prots in mapping.items():
        annots = {}
        allgos = set()
        total_score = 0.0
        for p_id, score in sim_prots.items():
            allgos |= annotations[p_id]
            total_score += score
        allgos = sorted(allgos)
        sim = np.zeros(len(allgos), dtype=np.float32)
        for j, go_id in enumerate(allgos):
            s = 0.0
            for p_id, score in sim_prots.items():
                if go_id in annotations[p_id]:
                    s += score
            sim[j] = s / total_score
        for go_id, score in zip(allgos, sim):
            annots[go_id] = score
        diamond_preds[prot_id] = annots
    
    # Load CNN model
    model = load_model(model_file)
    # Alphas for the latest model
    alphas = {NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46}
    # Alphas for the cafa2 model
    # alphas = {NAMESPACES['mf']: 0.63, NAMESPACES['bp']: 0.68, NAMESPACES['cc']: 0.48}
    
    start_time = time.time()
    total_seq = 0
    w = gzip.open(out_file, 'wt')
    for prot_ids, sequences in read_fasta(in_file, chunk_size):
        total_seq += len(prot_ids)
        deep_preds = {}
        ids, data = get_data(sequences)

        preds = model.predict(data, batch_size=batch_size)
        assert preds.shape[1] == len(terms)
        for i, j in enumerate(ids):
            prot_id = prot_ids[j]
            if prot_id not in deep_preds:
                deep_preds[prot_id] = {}
            for l in range(len(terms)):
                if preds[i, l] >= 0.01: # Filter out very low scores
                    if terms[l] not in deep_preds[prot_id]:
                        deep_preds[prot_id][terms[l]] = preds[i, l]
                    else:
                        deep_preds[prot_id][terms[l]] = max(
                            deep_preds[prot_id][terms[l]], preds[i, l])
        # Combine diamond preds and deepgo
        for prot_id in prot_ids:
            annots = {}
            if prot_id in diamond_preds:
                for go_id, score in diamond_preds[prot_id].items():
                    annots[go_id] = score * alphas[go.get_namespace(go_id)]
            for go_id, score in deep_preds[prot_id].items():
                if go_id in annots:
                    annots[go_id] += (1 - alphas[go.get_namespace(go_id)]) * score
                else:
                    annots[go_id] = (1 - alphas[go.get_namespace(go_id)]) * score
            # Propagate scores with ontology structure
            gos = list(annots.keys())
            for go_id in gos:
                for g_id in go.get_anchestors(go_id):
                    if g_id in annots:
                        annots[g_id] = max(annots[g_id], annots[go_id])
                    else:
                        annots[g_id] = annots[go_id]
                
            sannots = sorted(annots.items(), key=lambda x: x[1], reverse=True)
            for go_id, score in sannots:
                if score >= threshold:
                    w.write(f"{prot_id}\t{go_id}\t{go.get_term(go_id)['name']}\t"
                            f"{go.get_ic(go_id):.2f}\t{score:.3f}\n")
            w.write('\n')
    w.close()
    total_time = time.time() - start_time
    print('Total prediction time for %d sequences is %d seconds' % (total_seq, total_time))
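
Note: in this example `read_fasta(in_file, chunk_size)` is consumed as a generator of batches, unlike the two-list variant in Example #14. A sketch of such a chunked reader (the gzip handling is an assumption):

import gzip

def read_fasta(filename, chunk_size):
    # Sketch of a chunked FASTA reader: yields (prot_ids, sequences) batches
    # of at most chunk_size records.
    open_fn = gzip.open if filename.endswith('.gz') else open
    ids, seqs = [], []
    prot_id, seq = None, ''
    with open_fn(filename, 'rt') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if prot_id is not None:
                    ids.append(prot_id)
                    seqs.append(seq)
                if len(ids) == chunk_size:
                    yield ids, seqs
                    ids, seqs = [], []
                prot_id, seq = line[1:].split()[0], ''
            else:
                seq += line
    if prot_id is not None:
        ids.append(prot_id)
        seqs.append(seq)
    if ids:
        yield ids, seqs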
Example #17
def main(go_file, hp_file, hp_annots_file, deepgo_annots_file, data_file,
         expressions_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    print('GO loaded')
    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')

    iea_annots = {}
    go_annots = {}
    seqs = {}
    df = pd.read_pickle(data_file)
    df = df[df['orgs'] == '9606']

    acc2prot = {}
    name2prot = {}
    for i, row in df.iterrows():
        accs = row['accessions'].split('; ')
        names = row['gene_names']
        p_id = row['proteins']
        for acc in accs:
            acc2prot[acc] = p_id
        for name in names:
            name = name.upper()
            if name not in name2prot:
                name2prot[name] = set()
            name2prot[name].add(p_id)
        if p_id not in go_annots:
            go_annots[p_id] = set()
        if p_id not in iea_annots:
            iea_annots[p_id] = set()
        go_annots[p_id] |= set(row.exp_annotations)
        iea_annots[p_id] |= set(row.iea_annotations)
        seqs[p_id] = row.sequences

    print('GO Annotations', len(go_annots))

    print('Loading HP annotations')
    hp_annots = {}
    unrev = set()
    with open(hp_annots_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            acc_id = it[0]
            hp_id = it[1]
            if acc_id not in acc2prot:
                unrev.add(acc_id)
                continue
            p_id = acc2prot[acc_id]
            if p_id not in hp_annots:
                hp_annots[p_id] = set()
            if hp.has_term(hp_id):
                hp_annots[p_id] |= hp.get_anchestors(hp_id)

    print('HP Annotations', len(hp_annots))
    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            p_id = it[0]
            annots = dg_annots.get(p_id, {})
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots[go_id] = max(score, annots.get(go_id, 0))
            dg_annots[p_id] = annots
            gos |= set(annots.keys())
    print('DeepGO Annotations', len(dg_annots))
    deepgo_annots = {}
    for g_id, annots in dg_annots.items():
        deepgo_annots[g_id] = [
            go_id + '|' + str(score) for go_id, score in annots.items()
        ]
    print('Number of GOs', len(gos))
    df = pd.DataFrame({'gos': list(gos)})
    #df.to_pickle('data-cafa/gos.pkl')

    logging.info('Processing annotations')

    cnt = Counter()
    for p_id, annots in hp_annots.items():
        for term in annots:
            cnt[term] += 1

    gene_exp = {}
    max_val = 0
    with open(expressions_file) as f:
        for line in f:
            if line.startswith('#') or line.startswith('Gene'):
                continue
            it = line.strip().split('\t')
            gene_name = it[1].upper()
            if gene_name in name2prot:
                exp = np.zeros((53, ), dtype=np.float32)
                for i in range(len(it[2:])):
                    exp[i] = float(it[2 + i]) if it[2 + i] != '' else 0.0
                for p_id in name2prot[gene_name]:
                    gene_exp[p_id] = exp / np.max(exp)

    print('Expression values', len(gene_exp))

    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    proteins = []
    sequences = []
    expressions = []
    mis_exp = 0
    for p_id, phenos in hp_annots.items():
        if p_id not in dg_annots:
            continue
        proteins.append(p_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[p_id])
        iea_annotations.append(iea_annots[p_id])
        deepgo_annotations.append(deepgo_annots[p_id])
        sequences.append(seqs[p_id])
        if p_id in gene_exp:
            expressions.append(gene_exp[p_id])
        else:
            expressions.append(np.zeros((53, ), dtype=np.float32))
            mis_exp += 1

    print('Missing expressions', mis_exp)
    df = pd.DataFrame({
        'proteins': proteins,
        'hp_annotations': hpos,
        'go_annotations': go_annotations,
        'iea_annotations': iea_annotations,
        'deepgo_annotations': deepgo_annotations,
        'sequences': sequences,
        'expressions': expressions
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    test_annots = {}
    tar2prot = {}
    with open('data-cafa/tar2prot.txt') as f:
        for line in f:
            it = line[1:].strip().split()
            tar2prot[it[0]] = it[1]

    unknown_prots = set()
    with open('data-cafa/benchmark/groundtruth/leafonly_HPO.txt') as f:
        for line in f:
            it = line.strip().split()
            p_id = tar2prot[it[0]]
            if p_id in hp_annots:
                continue
            unknown_prots.add(it[0])
            hp_id = it[1]
            if p_id not in test_annots:
                test_annots[p_id] = set()
            if hp.has_term(hp_id):
                test_annots[p_id] |= hp.get_anchestors(hp_id)
    with open('data-cafa/noknowledge_targets.txt', 'w') as f:
        for t_id in unknown_prots:
            f.write(t_id + '\n')

    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    proteins = []
    sequences = []
    expressions = []
    mis_exp = 0
    for p_id, phenos in test_annots.items():
        if p_id not in dg_annots:
            continue
        proteins.append(p_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[p_id])
        iea_annotations.append(iea_annots[p_id])
        deepgo_annotations.append(deepgo_annots[p_id])
        sequences.append(seqs[p_id])
        if p_id in gene_exp:
            expressions.append(gene_exp[p_id])
        else:
            expressions.append(np.zeros((53, ), dtype=np.float32))
            mis_exp += 1

    df = pd.DataFrame({
        'proteins': proteins,
        'hp_annotations': hpos,
        'go_annotations': go_annotations,
        'iea_annotations': iea_annotations,
        'deepgo_annotations': deepgo_annotations,
        'sequences': sequences,
        'expressions': expressions
    })
    print('Missing expressions test', mis_exp)
    df.to_pickle('data-cafa/human_test.pkl')
    print(f'Number of test proteins {len(df)}')

    # Filter terms with annotations more than min_count
    terms_set = set()
    all_terms = []
    for key, val in cnt.items():
        if key == 'HP:0000001':
            continue
        all_terms.append(key)
        if val >= min_count:
            terms_set.add(key)
    terms = []
    for t_id in hp.get_ordered_terms():
        if t_id in terms_set:
            terms.append(t_id)

    logging.info(f'Number of terms {len(terms)}')
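
The listing stops mid-script here. By analogy with Examples #8 and #14, a plausible (assumed) continuation would persist the ordered term list:

    # Assumed continuation, not part of the original listing: save the
    # selected terms for later training, as the other preprocessing scripts do.
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)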