Example #1
def main(train_data_file, test_data_file, ont):

  go_rels = Ontology('data/go.obo', with_rels=True)
  # terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
  # terms = terms_df['functions'].values.flatten()
  # terms_dict = {v: i for i, v in enumerate(terms)}

  train_df = pd.read_pickle(train_data_file)
  annotations = train_df['annotations'].values
  annotations = list(map(lambda x: set(x), annotations))

  test_df = pd.read_pickle(test_data_file)
  test_annotations = test_df['annotations'].values
  test_annotations = list(map(lambda x: set(x), test_annotations))

  go_rels.calculate_ic(annotations + test_annotations)

  go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
  go_set.remove(FUNC_DICT[ont])

  annotations = list(map(lambda x: set(filter(lambda y: y in go_set, x)), annotations))

  cnt = Counter()
  max_n = 0
  for x in annotations:
    cnt.update(x)
  print(cnt.most_common(10))
  max_n = cnt.most_common(1)[0][1]
  print(max_n)
  scores = {}
  for go_id, n in cnt.items():
    score = n / max_n
    scores[go_id] = score  # relative frequency of the term in the training set (naive baseline score, not IC)

  prot_index = {}
  for i, row in enumerate(train_df.itertuples()):
    prot_index[row.proteins] = i


  labels = test_annotations
  labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
  print(len(go_set))
  fmax = 0.0
  tmax = 0.0
  smin = 1000.0
  precisions = []
  recalls = []
  for threshold in np.arange(0.005, 0.5, 0.01):
    # threshold = t / 100.0
    preds = []
    annots = set()
    for go_id, score in scores.items():
      if score >= threshold:
        annots.add(go_id)
      # new_annots = set()
      # for go_id in annots:
      #     new_annots |= go_rels.get_anchestors(go_id)
      # new_annots = set(filter(lambda y: y in go_set, new_annots))
    for i, row in enumerate(test_df.itertuples()):
      preds.append(annots.copy())

    fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
    precisions.append(prec)
    recalls.append(rec)
    print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
    if fmax < fscore:
      fmax = fscore
      tmax = threshold
    if smin > s:
      smin = s
  print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
  precisions = np.array(precisions)
  recalls = np.array(recalls)
  sorted_index = np.argsort(recalls)
  recalls = recalls[sorted_index]
  precisions = precisions[sorted_index]
  aupr = np.trapz(precisions, recalls)
  print(f'AUPR: {aupr:0.3f}')
  plt.figure()
  lw = 2
  plt.plot(recalls, precisions, color='darkorange',
       lw=lw, label=f'AUPR curve (area = {aupr:0.3f})')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.title('Area Under the Precision-Recall curve')
  plt.legend(loc="lower right")
  plt.savefig('aupr.pdf')
  plt.show()
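
All of these examples call an evaluate_annotations helper that is not reproduced on this page. The sketch below is a hedged guess at the protein-centric evaluation it performs, with the signature and the (fscore, precision, recall, s) return order inferred from the calls above; Examples #6, #8 and #10 unpack an extended variant, sketched after Example #6.

import math

def evaluate_annotations(go, real_annots, pred_annots):
    # Protein-centric precision/recall (CAFA-style) and semantic distance S.
    # Assumes go.get_ic(term) returns the information content of a term,
    # as computed by Ontology.calculate_ic above.
    total = 0       # number of evaluated proteins
    p_total = 0     # proteins with at least one prediction
    p = 0.0         # summed precision
    r = 0.0         # summed recall
    ru = 0.0        # remaining uncertainty (IC of false negatives)
    mi = 0.0        # misinformation (IC of false positives)
    for labels, preds in zip(real_annots, pred_annots):
        tp = labels & preds
        fp = preds - tp
        fn = labels - tp
        mi += sum(go.get_ic(t) for t in fp)
        ru += sum(go.get_ic(t) for t in fn)
        total += 1
        if labels:
            r += len(tp) / len(labels)
        if preds:
            p_total += 1
            p += len(tp) / len(preds)
    ru /= total
    mi /= total
    r /= total
    p = p / p_total if p_total else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s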
Example #2
def main(train_data_file, test_data_file, diamond_scores_file, ont):

    go_rels = Ontology('data/go.obo', with_rels=True)

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['prop_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(test_data_file)
    test_annotations = test_df['prop_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score

        blast_preds.append(annots)

    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in blast_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)

            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
Example #3
def main(in_file, out_file, go_file, model_file, terms_file, annotations_file,
         chunk_size, diamond_file, threshold, batch_size, alpha):
    # Load GO and read list of all terms
    go = Ontology(go_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    # Read known experimental annotations
    annotations = {}
    df = pd.read_pickle(annotations_file)
    for row in df.itertuples():
        annotations[row.proteins] = set(row.prop_annotations)

    go.calculate_ic(annotations.values())

    diamond_preds = {}
    mapping = {}
    with gzip.open(diamond_file, 'rt') as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in mapping:
                mapping[it[0]] = {}
            mapping[it[0]][it[1]] = float(it[2])
    for prot_id, sim_prots in mapping.items():
        annots = {}
        allgos = set()
        total_score = 0.0
        for p_id, score in sim_prots.items():
            allgos |= annotations[p_id]
            total_score += score
        allgos = list(sorted(allgos))
        sim = np.zeros(len(allgos), dtype=np.float32)
        for j, go_id in enumerate(allgos):
            s = 0.0
            for p_id, score in sim_prots.items():
                if go_id in annotations[p_id]:
                    s += score
            sim[j] = s / total_score
        for go_id, score in zip(allgos, sim):
            annots[go_id] = score
        diamond_preds[prot_id] = annots
    
    # Load CNN model
    model = load_model(model_file)
    # Alphas for the latest model
    alphas = {NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46}
    # Alphas for the cafa2 model
    # alphas = {NAMESPACES['mf']: 0.63, NAMESPACES['bp']: 0.68, NAMESPACES['cc']: 0.48}
    
    start_time = time.time()
    total_seq = 0
    w = gzip.open(out_file, 'wt')
    for prot_ids, sequences in read_fasta(in_file, chunk_size):
        total_seq += len(prot_ids)
        deep_preds = {}
        ids, data = get_data(sequences)

        preds = model.predict(data, batch_size=batch_size)
        assert preds.shape[1] == len(terms)
        for i, j in enumerate(ids):
            prot_id = prot_ids[j]
            if prot_id not in deep_preds:
                deep_preds[prot_id] = {}
            for l in range(len(terms)):
                if preds[i, l] >= 0.01: # Filter out very low scores
                    if terms[l] not in deep_preds[prot_id]:
                        deep_preds[prot_id][terms[l]] = preds[i, l]
                    else:
                        deep_preds[prot_id][terms[l]] = max(
                            deep_preds[prot_id][terms[l]], preds[i, l])
        # Combine diamond preds and deepgo
        for prot_id in prot_ids:
            annots = {}
            if prot_id in diamond_preds:
                for go_id, score in diamond_preds[prot_id].items():
                    annots[go_id] = score * alphas[go.get_namespace(go_id)]
            for go_id, score in deep_preds[prot_id].items():
                if go_id in annots:
                    annots[go_id] += (1 - alphas[go.get_namespace(go_id)]) * score
                else:
                    annots[go_id] = (1 - alphas[go.get_namespace(go_id)]) * score
            # Propagate scores with ontology structure
            gos = list(annots.keys())
            for go_id in gos:
                for g_id in go.get_anchestors(go_id):
                    if g_id in annots:
                        annots[g_id] = max(annots[g_id], annots[go_id])
                    else:
                        annots[g_id] = annots[go_id]
                
            sannots = sorted(annots.items(), key=lambda x: x[1], reverse=True)
            for go_id, score in sannots:
                if score >= threshold:
                    w.write(f"{prot_id}\t{go_id}\t{go.get_term(go_id)['name']}\t{go.get_ic(go_id):.2f}\t{score:.3f}\n")
            w.write('\n')
    w.close()
    total_time = time.time() - start_time
    print('Total prediction time for %d sequences is %.2f seconds' % (total_seq, total_time))
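
Example #3 (and Example #4 below) call get_data to turn protein sequences into model input. A hedged sketch of what it is assumed to do — one-hot encode each sequence up to a fixed length and return the indices of the sequences that were kept — is shown here; MAXLEN, the alphabet and the handling of unknown residues are assumptions, not the original implementation:

import numpy as np

MAXLEN = 2000  # assumed maximum sequence length accepted by the CNN
AALETTER = 'ARNDCQEGHILKMFPSTWYV'
AAINDEX = {aa: i + 1 for i, aa in enumerate(AALETTER)}  # 0 is left for padding/unknown

def get_data(sequences):
    # Returns (ids, data): @ids are the positions of the kept sequences in the
    # input, @data is a one-hot tensor of shape (n, MAXLEN, 21).
    ids, data = [], []
    for i, seq in enumerate(sequences):
        if len(seq) > MAXLEN:
            continue  # skip sequences longer than the model input
        onehot = np.zeros((MAXLEN, len(AAINDEX) + 1), dtype=np.float32)
        for j, aa in enumerate(seq):
            onehot[j, AAINDEX.get(aa, 0)] = 1.0
        ids.append(i)
        data.append(onehot)
    return np.array(ids), np.array(data)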
Example #4
def main(model_file, terms_file, annotations_file):

    go_rels = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    df = pd.read_pickle(annotations_file)
    annotations = df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    go_rels.calculate_ic(annotations)

    # df = df[df['orgs'] == '559292']
    sl = 0

    annotations = df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    prot_ids = df['proteins'].values
    ids, data = get_data(df['sequences'])

    # Load CNN model
    model = load_model(model_file)

    preds = model.predict(data, batch_size=100, verbose=1)
    assert preds.shape[1] == len(terms)
    mf_set = go_rels.get_namespace_terms(NAMESPACES['mf'])
    # terms = ['GO:0008047']
    for l in range(len(terms)):
        # if terms[l] not in mf_set:
        #     continue
        deep_preds = {}
        for i, j in enumerate(ids):
            prot_id = prot_ids[j]
            if prot_id not in deep_preds:
                deep_preds[prot_id] = {}
            if preds[i, l] >= 0.01:  # Filter out very low scores
                if terms[l] not in deep_preds[prot_id]:
                    deep_preds[prot_id][terms[l]] = preds[i, l]
                else:
                    deep_preds[prot_id][terms[l]] = max(
                        deep_preds[prot_id][terms[l]], preds[i, l])

        go_set = set([terms[l]])
        # go_set.remove(FUNC_DICT['mf'])
        labels = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), annotations))
        bin_labels = list(map(lambda x: len(x), labels))
        pos_cnt = sum(bin_labels)
        fmax = 0.0
        tmax = 0.0
        smin = 1000
        for t in range(0, 100):
            threshold = t / 100.0
            predictions = []
            for i, row in enumerate(df.itertuples()):
                annots_dict = deep_preds.get(row.proteins) or {}  # avoid KeyError for sequences skipped by get_data

                annots = set()
                for go_id, score in annots_dict.items():
                    if score >= threshold:
                        annots.add(go_id)
                # new_annots = set()
                # for go_id in annots:
                #     new_annots |= go_rels.get_anchestors(go_id)
                predictions.append(annots)

            # Filter classes
            predictions = list(
                map(lambda x: set(filter(lambda y: y in go_set, x)),
                    predictions))

            fscore, prec, rec, s = evaluate_annotations(
                go_rels, labels, predictions)
            # print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
            if fmax < fscore:
                fmax = fscore
                tmax = threshold
            if smin > s:
                smin = s
        print(
            f'{terms[l]} {pos_cnt} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
        )
Example #5
def main(train_data_file, test_data_file, out_file, terms_file, root_class, fold):
    # Cross validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file
    
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)

    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc): 
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: only the AUROC above is computed; the threshold sweep below is never reached
    
    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)

        preds = []
        for i, row in enumerate(test_df.itertuples()):
            preds.append(new_annots)
        
    
        # Filter classes
        
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            pmax = prec
            rmax = rec
            max_preds = preds
        if smin > s:
            smin = s
    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
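
The macro AUROC loop above relies on a compute_roc helper. A minimal sketch based on scikit-learn (an assumption about how it is implemented) is:

import numpy as np
from sklearn.metrics import auc, roc_curve

def compute_roc(labels, preds):
    # ROC AUC for one term over all test proteins (labels are 0/1, preds are scores).
    fpr, tpr, _ = roc_curve(np.asarray(labels).flatten(), np.asarray(preds).flatten())
    return auc(fpr, tpr)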
Example #6
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont,
         alpha):

    alpha /= 100.0
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    # Print IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    # print(len(go_set))
    deep_preds = []
    alphas = {
        NAMESPACES['mf']: 0.55,
        NAMESPACES['bp']: 0.59,
        NAMESPACES['cc']: 0.46
    }
    for i, row in enumerate(test_df.itertuples()):
        annots_dict = blast_preds[i].copy()
        for go_id in annots_dict:
            annots_dict[go_id] *= alphas[go_rels.get_namespace(go_id)]
        for j, score in enumerate(row.preds):
            go_id = terms[j]
            score *= 1 - alphas[go_rels.get_namespace(go_id)]
            if go_id in annots_dict:
                annots_dict[go_id] += score
            else:
                annots_dict[go_id] = score
        deep_preds.append(annots_dict)
    print('AUTHOR DeepGOPlus')
    print('MODEL 1')
    print('KEYWORDS sequence alignment.')
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        for go_id, score in deep_preds[i].items():
            print(f'{prot_id}\t{go_id}\t{score:.2f}')
    print('END')
    return  # NOTE: the CAFA-format predictions above are printed; the evaluation below is never reached
    # Propagate scores
    # deepgo_preds = []
    # for annots_dict in deep_preds:
    #     annots = {}
    #     for go_id, score in annots_dict.items():
    #         for a_id in go_rels.get_anchestors(go_id):
    #             if a_id in annots:
    #                 annots[a_id] = max(annots[a_id], score)
    #             else:
    #                 annots[a_id] = score
    #     deepgo_preds.append(annots)

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    rus = []
    mis = []
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in deep_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)

            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
            go_rels, labels, preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(
            map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(
            f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
        )
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
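
Examples #6, #8 and #10 unpack an extended evaluate_annotations that also returns remaining uncertainty, misinformation and the per-protein false positive/negative sets. A hedged sketch of that variant (an extension of the minimal one shown after Example #1, not the original code):

import math

def evaluate_annotations(go, real_annots, pred_annots):
    # Extended variant: returns (fscore, prec, rec, s, ru, mi, fps, fns).
    total, p_total = 0, 0
    p = r = ru = mi = 0.0
    fps, fns = [], []
    for labels, preds in zip(real_annots, pred_annots):
        tp = labels & preds
        fp = preds - tp
        fn = labels - tp
        fps.append(fp)
        fns.append(fn)
        mi += sum(go.get_ic(t) for t in fp)
        ru += sum(go.get_ic(t) for t in fn)
        total += 1
        if labels:
            r += len(tp) / len(labels)
        if preds:
            p_total += 1
            p += len(tp) / len(preds)
    ru /= total
    mi /= total
    r /= total
    p = p / p_total if p_total else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s, ru, mi, fps, fns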
Example #7
def main(train_data_file, test_data_file, terms_file, out_file, root_class,
         fold):
    # Cross validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file

    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i, row in enumerate(test_df.itertuples()):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = row.preds[terms_dict[hp_id]]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: only the AUROC above is computed; the threshold sweep below is never reached
    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            gene_id = row.genes
            annots_dict = {}

            for j, score in enumerate(row.preds):
                hp_id = terms[j]
                # score = score * (1 - alpha)
                if hp_id in annots_dict:
                    annots_dict[hp_id] += score
                else:
                    annots_dict[hp_id] = score

            annots = set()
            for hp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(hp_id)
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)
            new_annots = new_annots.intersection(hp_set)
            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
            pmax = prec
            rmax = rec
        if smin > s:
            smin = s
    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
Example #8
def main(train_data_file, valid_data_file, terms_file, diamond_scores_file,
         ont):

    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    valid_df = pd.read_pickle(valid_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    valid_annotations = valid_df['annotations'].values
    valid_annotations = list(map(lambda x: set(x), valid_annotations))
    go_rels.calculate_ic(annotations + valid_annotations)
    # Print IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(valid_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = valid_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    best_fmax = 0.0
    best_alpha = 0.0
    for alpha in range(44, 70):
        alpha /= 100.0
        deep_preds = []
        for i, row in enumerate(valid_df.itertuples()):
            annots_dict = blast_preds[i].copy()
            for go_id in annots_dict:
                annots_dict[go_id] *= alpha
            for j, score in enumerate(row.preds):
                go_id = terms[j]
                score *= 1 - alpha
                if go_id in annots_dict:
                    annots_dict[go_id] += score
                else:
                    annots_dict[go_id] = score
            deep_preds.append(annots_dict)

        fmax = 0.0
        tmax = 0.0
        precisions = []
        recalls = []
        smin = 1000000.0
        rus = []
        mis = []
        for t in range(14, 20):
            threshold = t / 100.0
            preds = []
            for i, row in enumerate(valid_df.itertuples()):
                annots = set()
                for go_id, score in deep_preds[i].items():
                    if score >= threshold:
                        annots.add(go_id)

                new_annots = set()
                for go_id in annots:
                    new_annots |= go_rels.get_anchestors(go_id)
                preds.append(new_annots)

            # Filter classes
            preds = list(
                map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

            fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
                go_rels, labels, preds)
            avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
            avg_ic = sum(
                map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                    fps)) / len(fps)
            print(
                f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
            )
            if fmax < fscore:
                fmax = fscore
                tmax = threshold
            if smin > s:
                smin = s
        if best_fmax < fmax:
            best_fmax = fmax
            best_alpha = alpha
        print(
            f'Alpha: {alpha} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
        )
    print(f'{best_alpha} {best_fmax}')
Example #9
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont,
         alpha):

    alpha /= 100.0
    mp = Ontology('data/mp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['mp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['mp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    mp.calculate_ic(annotations)
    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # GO2HP preds
    rules = {}
    with open('data/go2hp.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            go_id = it[0].replace('_', ':')
            mp_ids = list(map(lambda x: x.replace('_', ':'), it[1:]))
            if go_id not in rules:
                rules[go_id] = []
            rules[go_id] = mp_ids
    pheno2go_preds = {}
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        if prot_id not in pheno2go_preds:
            pheno2go_preds[prot_id] = {}
        for item in row.deepgo_annotations:
            go_id, score = item.split('|')
            if go_id in rules:
                for mp_id in rules[go_id]:
                    pheno2go_preds[prot_id][mp_id] = max(
                        float(score), pheno2go_preds[prot_id].get(mp_id, 0))

    labels = test_annotations
    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            prot_id = row.proteins
            annots_dict = {}  #pheno2go_preds[prot_id]
            for j, score in enumerate(row.preds):
                mp_id = terms[j]
                annots_dict[mp_id] = max(score, annots_dict.get(mp_id, 0))

            annots = set()
            for mp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(mp_id)
            new_annots = set()
            for mp_id in annots:
                new_annots |= mp.get_anchestors(mp_id)
            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(mp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
Example #10
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont,
         alpha):

    alpha /= 100.0
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    # terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))

    # NOTE: @annotations and @test_annotations are used to compute IC scores, so they must not be pre-filtered.
    go_rels.calculate_ic(annotations + test_annotations)

    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])  # all terms of the selected namespace (MF, BP or CC)

    # Optionally restrict @terms to the selected namespace only:
    # terms = [t for t in terms if t in go_set]
    # print('number of terms kept from terms_file {}'.format(len(terms)))

    # Print IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)
    # Save the IC value table for later use
    with open("data-cafa/ICsValueTable.pickle", "wb") as fh:
        pickle.dump(ics, fh)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    ####
    # BLAST Similarity (Diamond); the same call can be reused since its output is already available
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    ####
    # DeepGOPlus

    # go_set = go_rels.get_namespace_terms(NAMESPACES[ont])  # already computed above
    go_set.remove(FUNC_DICT[ont])
    labels = test_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)),
                      labels))  # filter the true labels by @go_set
    print("number of terms considered: {}".format(len(go_set)))

    deep_preds = []
    # alphas = {NAMESPACES['mf']: 0.55, NAMESPACES['bp']: 0.59, NAMESPACES['cc']: 0.46}
    for i, row in enumerate(test_df.itertuples()):  # read the neural-network predictions
        annots_dict = {}
        # annots_dict = blast_preds[i].copy()  # start from the BLAST scores
        # for go_id in annots_dict:  # zero out (or scale down by alpha) the BLAST scores
        #     annots_dict[go_id] = 0  # *= alphas[go_rels.get_namespace(go_id)]
        for j, score in enumerate(row.preds):  # network predictions stored in @test_df
            go_id = terms[j]
            # if go_id not in go_set:  # faster label filtering, since ancestors are not added anyway
            #     continue
            # score *= 1 - alphas[go_rels.get_namespace(go_id)]
            # if go_id in annots_dict:
            #     annots_dict[go_id] += score  # add to the BLAST score
            # else:
            annots_dict[go_id] = score  # use the network score directly (no BLAST component)
        deep_preds.append(annots_dict)  # only @deep_preds is used from here on

    # print('AUTHOR DeepGOPlus')
    # print('MODEL 1')
    # print('KEYWORDS sequence alignment.')
    # for i, row in enumerate(test_df.itertuples()):
    #     prot_id = row.proteins
    #     for go_id, score in deep_preds[i].items():
    #         print(f'{prot_id}\t{go_id}\t{score:.2f}')
    # print('END')
    # return

    # Propagate scores
    # deepgo_preds = []
    # for annots_dict in deep_preds:
    #     annots = {}
    #     for go_id, score in annots_dict.items():
    #         for a_id in go_rels.get_anchestors(go_id):
    #             if a_id in annots:
    #                 annots[a_id] = max(annots[a_id], score)
    #             else:
    #                 annots[a_id] = score
    #     deepgo_preds.append(annots)

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    rus = []
    mis = []

    print('\nontology {}\n'.format(ont))

    ####

    for threshold in np.arange(0.005, .4, .01):  # np.arange(0.005,1,.01)
        # threshold = t / 100.0
        print('\n')
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in deep_preds[i].items():
                if go_id not in go_set:  # faster label filtering, since ancestors are not added anyway
                    continue
                if score >= threshold:
                    annots.add(go_id)

            preds.append(annots)

            # Optionally propagate predictions to ancestor terms (disabled here):
            # new_annots = set()
            # for go_id in annots:
            #     new_annots |= go_rels.get_anchestors(go_id)
            # preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        # print ('see 1 prediction')
        # print (preds[10])
        # print ('see 1 label')
        # print (labels[10])

        fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
            go_rels, labels, preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(
            map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(
            f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, RU: {ru}, MI: {mi} threshold: {threshold}'
        )
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'\nFmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
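
The IC value table pickled above can be reloaded later, for example to weight terms without recomputing calculate_ic. A small hedged usage sketch:

import pickle

# Reload the IC table saved by the example above.
with open("data-cafa/ICsValueTable.pickle", "rb") as fh:
    ics = pickle.load(fh)
# ics maps GO term ids to their information content, e.g. ics.get('GO:0008047').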
Example #11
def main(train_data_file, test_data_file, terms_file, diamond_scores_file,
         ont):

    go_rels = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    print("Length of test set: " + str(len(test_df)))

    annotations = train_df['prop_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['prop_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    # Print IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST Similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    #print('Diamond preds')
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            ind = np.argsort(-sim)
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    last_release_metadata = 'metadata/last_release.json'

    with open(last_release_metadata, 'r') as f:
        last_release_data = json.load(f)

    last_release_data['alphas'][ont] = find_alpha(ont, test_df, blast_preds,
                                                  go_rels, terms)

    with open(last_release_metadata, 'w') as f:
        json.dump(last_release_data, f)
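
find_alpha used above is not shown on this page. The sketch below is a hedged reconstruction that mirrors the grid search of Example #8: it assumes test_df carries 'prop_annotations' and per-protein 'preds' columns, and that an evaluate_annotations whose first return value is the F-score (see the sketches above) is in scope.

def find_alpha(ont, test_df, blast_preds, go_rels, terms):
    # Grid-search the mixing weight between BLAST-based and network predictions
    # that maximises Fmax on the given set (a sketch, not the original code).
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = [set(a for a in annots if a in go_set)
              for annots in test_df['prop_annotations'].values]
    best_fmax, best_alpha = 0.0, 0.0
    for alpha in (a / 100.0 for a in range(44, 70)):
        fmax = 0.0
        for t in range(14, 20):
            threshold = t / 100.0
            preds = []
            for i, row in enumerate(test_df.itertuples()):
                annots_dict = {g: s * alpha for g, s in blast_preds[i].items()}
                for j, score in enumerate(row.preds):
                    go_id = terms[j]
                    annots_dict[go_id] = annots_dict.get(go_id, 0.0) + (1 - alpha) * score
                annots = set()
                for go_id, score in annots_dict.items():
                    if score >= threshold:
                        annots |= go_rels.get_anchestors(go_id)
                preds.append(annots & go_set)
            fscore = evaluate_annotations(go_rels, labels, preds)[0]
            fmax = max(fmax, fscore)
        if fmax > best_fmax:
            best_fmax, best_alpha = fmax, alpha
    return best_alpha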
Example #12
def main(benchmark_file, train_data_file, hpo_file, terms_file, root_class):

    hp = Ontology(hpo_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    noknowledge_prots = set()
    with open('data-cafa/noknowledge_targets.txt') as f:
        for line in f:
            noknowledge_prots.add(line.strip())

    bench_annots = {}
    with open(benchmark_file) as f:
        for line in f:
            it = line.strip().split('\t')
            t_id = it[0]
            if t_id not in noknowledge_prots:
                continue
            hp_id = it[1]
            if t_id not in bench_annots:
                bench_annots[t_id] = set()
            bench_annots[t_id] |= hp.get_anchestors(hp_id)

    train_df = pd.read_pickle(train_data_file)
    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = []
    for t_id, hps in bench_annots.items():
        labels.append(hps)
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(bench_annots), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(bench_annots), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: only the AUROC above is computed; the threshold sweep below is never reached

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)
        preds = []
        for t_id, hps in bench_annots.items():
            preds.append(new_annots)

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            pmax = prec
            rmax = rec
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
Example #13
def main(train_data_file, test_data_file, terms_file, rules_file):

    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)

    rule_annots = {}
    with open(rules_file) as f:
        for line in f:
            it = line.strip().split()
            go_id = it[0].replace('_', ':')
            hp_id = it[1].replace('_', ':')
            if go_id not in rule_annots:
                rule_annots[go_id] = set()
            rule_annots[go_id].add(hp_id)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)

    hp_set_anch = set()
    for hp_id in hp_set:
        hp_set_anch |= hp.get_anchestors(hp_id)

    labels = test_annotations
    # labels = list(map(lambda x: set(filter(lambda y: y in hp_set_anch, x)), labels))

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            gene_id = row.genes
            annots = set()
            for item in row.deepgo_annotations:
                go_id, score = item.split('|')
                score = float(score)
                if score >= threshold and go_id in rule_annots:
                    annots |= rule_annots[go_id]
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)

            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    test_df['hp_preds'] = max_preds
    test_df.to_pickle('data/predictions_max.pkl')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
Example #14
def main(train_data_file, preds_file, ont):

    go = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(preds_file)
    # Annotations
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.gos:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for threshold in np.arange(0.005, 1, .01):
        # threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])

            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
Example #15
def helper(train_df, test_df, ont):
    go = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-cafa/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = train_df.rename(columns={"gos": "annotations"})
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = test_df.rename(columns={"gos": "annotations"})

    # Annotations
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.annotations:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(1, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])

            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)

        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))

        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print('Fscore: {}, S: {}, threshold: {}'.format(fscore, s, threshold))
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print('Fmax: {:0.3f}, Smin: {:0.3f}, threshold: {}'.format(
        fmax, smin, tmax))
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print('AUPR: {:0.3f}'.format(aupr))

    return [recalls, precisions, aupr]
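
helper returns the raw curve data instead of plotting, so several runs can be drawn on one figure. A small hedged usage sketch (train_df and test_df are assumed to be DataFrames already loaded with the columns used above):

import matplotlib.pyplot as plt

recalls, precisions, aupr = helper(train_df, test_df, 'mf')  # train_df/test_df assumed loaded
plt.figure()
plt.plot(recalls, precisions, color='darkorange', lw=2,
         label=f'AUPR curve (area = {aupr:0.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='lower right')
plt.savefig('aupr_mf.pdf')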
Example #16
def main(model_file, terms_file, annotations_file):

    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}
    df = pd.read_pickle(annotations_file)
    annotations = df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    go_rels.calculate_ic(annotations)

    go_id = 'GO:0008047'
    go_idx = terms_dict[go_id]
    # df = df[df['orgs'] == '559292']

    index = []
    seq_lengths = []
    for i, row in enumerate(df.itertuples()):
        if go_id in row.annotations:
            index.append(i)
            seq_lengths.append(len(row.sequences))
    df = df.iloc[index]
    
    annotations = df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    
    
    prot_ids = df['proteins'].values
    ids, data = get_data(df['sequences'])
    # for i, row in df.iterrows():
    #     ipros = '\t'.join(row['interpros'])
    #     print(f'{row["proteins"]}\t{ipros}')
    # Load CNN model
    model = load_model(model_file)
    model.summary()
    return  # NOTE: only the model summary is printed; the filter analysis below is never reached
    int_model = Model(inputs=model.input, outputs=model.layers[-2].output)
    dense = model.layers[-1]
    W = dense.get_weights()[0][:, go_idx]
    b = dense.get_weights()[1][go_idx]
    print(np.argsort(-W), b)
    preds = int_model.predict(data, batch_size=100, verbose=0)
    filters = np.argsort(preds, axis=1)
    filter_cnt = Counter()
    for f in filters:
        filter_cnt.update(f[:20])
    AALETTER = np.array([
        '*', 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
        'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'])
    print(filter_cnt)
    return  # NOTE: the filter counts above are printed; the motif extraction below is never reached
    for f_id, cnt in filter_cnt.most_common(10):
        conv_id = f_id // 512
        fl_id = f_id % 512
        conv_layer = model.layers[conv_id + 1]
        weights = conv_layer.get_weights()
        w1 = weights[0]
        w2 = weights[1]
        motif = ''.join(AALETTER[np.argmax(w1[:, :, fl_id], axis=1)])
        print(f'>{f_id}')
        print(motif)
        conv_model = Model(inputs=model.input, outputs=conv_layer.output)
        preds = conv_model.predict(data, batch_size=100, verbose=0)
        f_out = preds[:, :, fl_id]
        f_length = conv_layer.kernel_size[0]
        starts = np.argmax(f_out, axis=1)
        ends = starts + f_length
        for i in range(starts.shape[0]):
            seq = data[i, starts[i]:ends[i], :]
            seq_ind = np.argmax(seq, axis=1)
            motif = ''.join(AALETTER[seq_ind])
            print(f'>{f_id}_{i}')
            print(motif.replace('*', ''))