import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Ontology, compute_roc and evaluate_annotations come from the project's utility code;
# their imports are not shown in this section (hedged sketches of the two helper
# functions are included further below).


def main(train_data_file, test_data_file, terms_file, out_file, root_class, fold):
    # Cross-validation evaluation of the model predictions for one fold
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)

    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    # Restrict evaluation to the terms below the chosen root class
    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i, row in enumerate(test_df.itertuples()):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = row.preds[terms_dict[hp_id]]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC over terms with at least one positive label
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            gene_id = row.genes
            annots_dict = {}
            for j, score in enumerate(row.preds):
                hp_id = terms[j]
                # score = score * (1 - alpha)
                if hp_id in annots_dict:
                    annots_dict[hp_id] += score
                else:
                    annots_dict[hp_id] = score
            annots = set()
            for hp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(hp_id)
            # Propagate predictions to ancestor terms and filter to the evaluation class set
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)
            new_annots = new_annots.intersection(hp_set)
            preds.append(new_annots)
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
            pmax = prec
            rmax = rec
        if smin > s:
            smin = s

    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, '
        f'Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
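# compute_roc is called above but not defined in this section. A minimal sketch,
# assuming per-term ROC AUC is computed with scikit-learn; the helper name comes from
# the calls above, the implementation below is an assumption, not the project's code.
from sklearn.metrics import roc_curve, auc


def compute_roc(labels, preds):
    # labels: binary ground-truth vector for one term; preds: prediction scores.
    fpr, tpr, _ = roc_curve(labels.flatten(), preds.flatten())
    return auc(fpr, tpr)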
def main(train_data_file, test_data_file, out_file, terms_file, root_class, fold):
    # Cross-validation evaluation of the naive baseline for one fold
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    # Naive prediction score of a term: its annotation frequency in the training set
    train_df = pd.read_pickle(train_data_file)
    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    # Restrict evaluation to the terms below the chosen root class
    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC over terms with at least one positive label
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        # The naive prediction is the same term set for every gene at a given threshold
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            preds.append(new_annots)
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            pmax = prec
            rmax = rec
            max_preds = preds
        if smin > s:
            smin = s

    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, '
        f'Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
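# evaluate_annotations(hp, labels, preds) is also used but not defined in this section.
# A minimal sketch of a CAFA-style protein-centric evaluation (Fmax components plus the
# Smin semantic distance), assuming the Ontology object exposes a get_ic(term) accessor
# for the information content computed by calculate_ic above; that accessor name and
# this implementation are assumptions, not the project's code.
def evaluate_annotations(ont, real_annots, pred_annots):
    total = 0          # entities with at least one true annotation
    p_total = 0        # entities with at least one predicted annotation
    p_sum, r_sum = 0.0, 0.0
    ru, mi = 0.0, 0.0  # remaining uncertainty and misinformation for Smin
    for real, pred in zip(real_annots, pred_annots):
        if len(real) == 0:
            continue
        tp = real & pred
        fp = pred - tp
        fn = real - tp
        ru += sum(ont.get_ic(t) for t in fn)
        mi += sum(ont.get_ic(t) for t in fp)
        total += 1
        r_sum += len(tp) / len(real)
        if len(pred) > 0:
            p_total += 1
            p_sum += len(tp) / len(pred)
    rec = r_sum / total if total > 0 else 0.0
    prec = p_sum / p_total if p_total > 0 else 0.0
    fscore = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    s = math.sqrt((ru / total) ** 2 + (mi / total) ** 2) if total > 0 else 0.0
    return fscore, prec, rec, s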
def main(benchmark_file, train_data_file, hpo_file, terms_file, root_class):
    # Naive baseline evaluation on the CAFA benchmark (no-knowledge targets only)
    hp = Ontology(hpo_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    noknowledge_prots = set()
    with open('data-cafa/noknowledge_targets.txt') as f:
        for line in f:
            noknowledge_prots.add(line.strip())

    # Benchmark annotations, propagated to ancestor terms
    bench_annots = {}
    with open(benchmark_file) as f:
        for line in f:
            it = line.strip().split('\t')
            t_id = it[0]
            if t_id not in noknowledge_prots:
                continue
            hp_id = it[1]
            if t_id not in bench_annots:
                bench_annots[t_id] = set()
            bench_annots[t_id] |= hp.get_anchestors(hp_id)

    # Naive prediction score of a term: its annotation frequency in the training set
    train_df = pd.read_pickle(train_data_file)
    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    hp.calculate_ic(annotations)

    # Restrict evaluation to the terms below the chosen root class
    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = []
    for t_id, hps in bench_annots.items():
        labels.append(hps)
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(bench_annots), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(bench_annots), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC over terms with at least one positive label
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        # The naive prediction is the same term set for every target at a given threshold
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)
        preds = []
        for t_id, hps in bench_annots.items():
            preds.append(new_annots)
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            pmax = prec
            rmax = rec
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(
        f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, '
        f'Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}'
    )
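# Hypothetical invocation of the CAFA benchmark evaluation above; every file path and
# the root class below are placeholders for illustration, not values from the source.
if __name__ == '__main__':
    main('data-cafa/benchmark.txt', 'data/train_data.pkl', 'data/hp.obo',
         'data/terms.pkl', 'HP:0000118')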