import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# `Ontology` is expected to come from the project's own utility module;
# minimal sketches of the undefined helpers `compute_roc` and
# `evaluate_annotations` appear later in this listing.


def main(train_data_file, test_data_file, terms_file, out_file, root_class,
         fold):
    # Cross validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file

    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i, row in enumerate(test_df.itertuples()):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = row.preds[terms_dict[hp_id]]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            gene_id = row.genes
            annots_dict = {}

            for j, score in enumerate(row.preds):
                hp_id = terms[j]
                # score = score * (1 - alpha)
                if hp_id in annots_dict:
                    annots_dict[hp_id] += score
                else:
                    annots_dict[hp_id] = score

            annots = set()
            for hp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(hp_id)
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)
            new_annots = new_annots.intersection(hp_set)
            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
            pmax = prec
            rmax = rec
        if smin > s:
            smin = s
    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
    plt.figure()
    lw = 2
    plt.plot(recalls,
             precisions,
             color='darkorange',
             lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
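

# compute_roc() is called in the AUROC loops but never defined in this
# listing. Below is a minimal sketch consistent with how it is used here
# (per-term binary labels and prediction scores, returning a scalar AUROC);
# the scikit-learn dependency is an assumption, not something confirmed by
# the listing itself.
from sklearn.metrics import auc, roc_curve


def compute_roc(labels, preds):
    # ROC curve over all test examples for a single HP term
    fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
    return auc(fpr, tpr)
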


def main(train_data_file, test_data_file, out_file, terms_file, root_class, fold):
    # Cross validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file
    
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)

    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc): 
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)

        # The naive predictor assigns the same annotation set to every gene
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            preds.append(new_annots)

        # Filter classes

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            pmax = prec
            rmax = rec
            max_preds = preds
        if smin > s:
            smin = s
    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
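

# evaluate_annotations() is also not defined in this listing. The sketch below
# follows the standard CAFA-style, example-centric evaluation implied by the
# call sites: it returns (F-measure, precision, recall, semantic distance),
# using the ontology's information content for remaining uncertainty (ru) and
# misinformation (mi). Treat it as an assumed stand-in, not the repository's
# exact helper; `ont.get_ic` is assumed to be available once calculate_ic()
# has been called.
def evaluate_annotations(ont, real_annots, pred_annots):
    total = 0        # examples with at least one real annotation
    p_total = 0      # examples with at least one predicted annotation
    p, r, ru, mi = 0.0, 0.0, 0.0, 0.0
    for real, pred in zip(real_annots, pred_annots):
        if len(real) == 0:
            continue
        total += 1
        tp = real.intersection(pred)
        fp = pred - tp
        fn = real - tp
        # IC-weighted errors accumulate into the semantic distance (Smin)
        for hp_id in fp:
            mi += ont.get_ic(hp_id)
        for hp_id in fn:
            ru += ont.get_ic(hp_id)
        r += len(tp) / (len(tp) + len(fn))
        if len(pred) > 0:
            p_total += 1
            p += len(tp) / (len(tp) + len(fp))
    ru /= total
    mi /= total
    r /= total
    if p_total > 0:
        p /= p_total
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s
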
def main(benchmark_file, train_data_file, hpo_file, terms_file, root_class):

    hp = Ontology(hpo_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    noknowledge_prots = set()
    with open('data-cafa/noknowledge_targets.txt') as f:
        for line in f:
            noknowledge_prots.add(line.strip())

    bench_annots = {}
    with open(benchmark_file) as f:
        for line in f:
            it = line.strip().split('\t')
            t_id = it[0]
            if t_id not in noknowledge_prots:
                continue
            hp_id = it[1]
            if t_id not in bench_annots:
                bench_annots[t_id] = set()
            bench_annots[t_id] |= hp.get_anchestors(hp_id)

    train_df = pd.read_pickle(train_data_file)
    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = []
    for t_id, hps in bench_annots.items():
        labels.append(hps)
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(bench_annots), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(bench_annots), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)
        preds = []
        for t_id, hps in bench_annots.items():
            preds.append(new_annots)

        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            pmax = prec
            rmax = rec
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
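

# None of the examples above actually invoke main(); the argument names
# suggest a CLI wrapper in the original project. A minimal, hypothetical
# invocation of the CAFA-benchmark variant (the last main() defined above);
# every path and the root class below are illustrative placeholders, not
# values confirmed by this listing.
if __name__ == '__main__':
    main('data-cafa/benchmark.txt',  # benchmark_file
         'data/train_data.pkl',      # train_data_file
         'data/hp.obo',              # hpo_file
         'data/terms.pkl',           # terms_file
         'HP:0000118')               # root_class (Phenotypic abnormality)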