def main(go_file, uniprot_file, filter_exp, prop_annots, cafa_targets, out_file):
    go = Ontology(go_file, with_rels=True)
    proteins, accessions, sequences, annotations, interpros, orgs = load_data(
        uniprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs
    })

    if filter_exp:
        logging.info('Filtering proteins with experimental annotations')
        index = []
        annotations = []
        for i, row in enumerate(df.itertuples()):
            annots = []
            for annot in row.annotations:
                go_id, code = annot.split('|')
                if is_exp_code(code):
                    annots.append(go_id)
            # Ignore proteins without experimental annotations
            if len(annots) == 0:
                continue
            index.append(i)
            annotations.append(annots)
        df = df.iloc[index]
        df = df.reset_index()
        df['annotations'] = annotations

    if cafa_targets:
        logging.info('Filtering CAFA target proteins')
        index = []
        for i, row in enumerate(df.itertuples()):
            if is_cafa_target(row.orgs):
                index.append(i)
        df = df.iloc[index]
        df = df.reset_index()

    if prop_annots:
        prop_annotations = []
        for i, row in df.iterrows():
            # Propagate annotations up the ontology
            annot_set = set()
            annots = row['annotations']
            for go_id in annots:
                go_id = go_id.split('|')[0]  # Strip the evidence code if present
                annot_set |= go.get_anchestors(go_id)
            annots = list(annot_set)
            prop_annotations.append(annots)
        df['annotations'] = prop_annotations

    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % len(df))
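# is_exp_code() and is_cafa_target() are used above but defined elsewhere in
# the project. A minimal sketch follows, assuming the standard GO experimental
# evidence codes and CAFA target taxon IDs; the exact sets in the project's
# utils module may differ (e.g. it may also include high-throughput codes).
EXP_CODES = set(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS', 'IC'])
# Assumed: NCBI taxonomy IDs of CAFA target organisms (illustrative subset).
CAFA_TARGETS = set(['9606', '10090', '10116', '3702', '7227', '559292', '83333'])

def is_exp_code(code):
    return code in EXP_CODES

def is_cafa_target(org):
    return org in CAFA_TARGETS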
def main(go_file, swissprot_file, out_file):
    go = Ontology(go_file, with_rels=True)
    if swissprot_file.endswith("gz"):
        proteins, accessions, sequences, annotations, interpros, orgs = load_data_gzip(
            swissprot_file)
    else:
        proteins, accessions, sequences, annotations, interpros, orgs = load_data(
            swissprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs
    })

    logging.info('Filtering proteins with experimental annotations')
    index = []
    annotations = []
    for i, row in enumerate(df.itertuples()):
        annots = []
        for annot in row.annotations:
            go_id, code = annot.split('|')
            if is_exp_code(code):
                annots.append(go_id)
        # Ignore proteins without experimental annotations
        if len(annots) == 0:
            continue
        index.append(i)
        annotations.append(annots)
    df = df.iloc[index]
    df = df.reset_index()
    df['exp_annotations'] = annotations

    prop_annotations = []
    for i, row in df.iterrows():
        # Propagate annotations up the ontology
        annot_set = set()
        annots = row['exp_annotations']
        for go_id in annots:
            annot_set |= go.get_anchestors(go_id)
        annots = list(annot_set)
        prop_annotations.append(annots)
    df['prop_annotations'] = prop_annotations

    cafa_target = []
    for i, row in enumerate(df.itertuples()):
        cafa_target.append(is_cafa_target(row.orgs))
    df['cafa_target'] = cafa_target

    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % len(df))
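# load_data_gzip() is referenced above but not shown. A hedged sketch,
# assuming it shares a record parser with load_data(); parse_swissprot() is a
# hypothetical name for that shared parser.
import gzip

def load_data_gzip(swissprot_file):
    # Same flat-file format as load_data(), read through a gzip text stream.
    with gzip.open(swissprot_file, 'rt') as f:
        return parse_swissprot(f)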
def main(go_file, uniprot_file, filter_exp, prop_annots, out_file):
    go = Ontology(go_file, with_rels=True)
    proteins, accessions, sequences, annotations, interpros, orgs, genes, gene_names = load_data(
        uniprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs,
        'genes': genes,
        'gene_names': gene_names
    })

    # Keep only human proteins
    df = df[df['orgs'] == '9606']

    logging.info('Propagating experimental and electronic annotations')
    annotations = []
    iea_annotations = []
    for i, row in enumerate(df.itertuples()):
        annots = set()
        iea_annots = set()
        for annot in row.annotations:
            go_id, code = annot.split('|')
            anch_set = go.get_anchestors(go_id)
            if is_exp_code(code):
                annots |= anch_set
            iea_annots |= anch_set
        annotations.append(list(annots))
        iea_annotations.append(list(iea_annots))
    df['exp_annotations'] = annotations
    df['iea_annotations'] = iea_annotations

    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % len(df))
def main(train_data_file, test_data_file, diamond_scores_file, ont):
    go_rels = Ontology('data/go.obo', with_rels=True)

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['prop_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(test_data_file)
    test_annotations = test_df['prop_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))

    go_rels.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN: score each GO class by the bitscore-weighted fraction of
        # similar training proteins annotated with it
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in blast_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)
            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s = evaluate_annotations(go_rels, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
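# NAMESPACES, FUNC_DICT and the 4-value evaluate_annotations() used above come
# from the project's utils module; sketches follow. The constants map ontology
# keys to GO namespaces and their root classes (these root IDs are standard GO).
NAMESPACES = {
    'cc': 'cellular_component',
    'mf': 'molecular_function',
    'bp': 'biological_process',
}
FUNC_DICT = {
    'cc': 'GO:0005575',
    'mf': 'GO:0003674',
    'bp': 'GO:0008150',
}

# A hedged sketch of the protein-centric CAFA evaluation: precision is averaged
# over proteins with at least one prediction, recall over all proteins, the
# F-measure is taken from those averages, and s = sqrt(ru^2 + mi^2) combines
# the IC-weighted false negatives (remaining uncertainty) and false positives
# (misinformation).
import math

def evaluate_annotations(go, real_annots, pred_annots):
    total = 0
    p = 0.0
    r = 0.0
    p_total = 0
    ru = 0.0
    mi = 0.0
    for real, pred in zip(real_annots, pred_annots):
        tp = real.intersection(pred)
        fp = pred - tp
        fn = real - tp
        for go_id in fp:
            mi += go.get_ic(go_id)
        for go_id in fn:
            ru += go.get_ic(go_id)
        total += 1
        r += len(tp) / (len(tp) + len(fn)) if real else 0.0
        if len(pred) > 0:
            p_total += 1
            p += len(tp) / (len(tp) + len(fp))
    ru /= total
    mi /= total
    r /= total
    if p_total > 0:
        p /= p_total
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s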
def main(train_data_file, test_data_file, out_file, terms_file, root_class, fold):
    # Cross-validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    # Naive predictor: each class is predicted with its training frequency
    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: early exit; remove to run the Fmax/Smin evaluation below

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        # The naive prediction is identical for every gene
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            preds.append(new_annots)
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            pmax = prec
            rmax = rec
            max_preds = preds
        if smin > s:
            smin = s

    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, '
          f'Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
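# compute_roc() is not defined in this file. A minimal sketch, assuming the
# usual scikit-learn computation of a per-term ROC AUC:
from sklearn import metrics

def compute_roc(labels, preds):
    fpr, tpr, _ = metrics.roc_curve(labels.flatten(), preds.flatten())
    return metrics.auc(fpr, tpr)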
def main(train_data_file, test_data_file, terms_file, out_file, root_class, fold):
    # Cross-validation evaluation
    out_file = f'fold{fold}_' + out_file
    test_data_file = f'fold{fold}_' + test_data_file
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(test_df), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(test_df), len(hp_set)), dtype=np.int32)
    for i, row in enumerate(test_df.itertuples()):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = row.preds[terms_dict[hp_id]]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: early exit; remove to run the Fmax/Smin evaluation below

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots_dict = {}
            for j, score in enumerate(row.preds):
                hp_id = terms[j]
                # score = score * (1 - alpha)
                if hp_id in annots_dict:
                    annots_dict[hp_id] += score
                else:
                    annots_dict[hp_id] = score
            annots = set()
            for hp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(hp_id)
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)
            new_annots = new_annots.intersection(hp_set)
            preds.append(new_annots)
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
            pmax = prec
            rmax = rec
        if smin > s:
            smin = s

    test_df['hp_preds'] = max_preds
    test_df.to_pickle(out_file)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, '
          f'Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont, alpha):
    alpha /= 100.0
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    go_rels.calculate_ic(annotations + test_annotations)

    # IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(test_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))

    # Combine BlastKNN and CNN scores with per-namespace weights
    deep_preds = []
    alphas = {
        NAMESPACES['mf']: 0.55,
        NAMESPACES['bp']: 0.59,
        NAMESPACES['cc']: 0.46
    }
    for i, row in enumerate(test_df.itertuples()):
        annots_dict = blast_preds[i].copy()
        for go_id in annots_dict:
            annots_dict[go_id] *= alphas[go_rels.get_namespace(go_id)]
        for j, score in enumerate(row.preds):
            go_id = terms[j]
            score *= 1 - alphas[go_rels.get_namespace(go_id)]
            if go_id in annots_dict:
                annots_dict[go_id] += score
            else:
                annots_dict[go_id] = score
        deep_preds.append(annots_dict)

    # Export predictions in CAFA submission format
    print('AUTHOR DeepGOPlus')
    print('MODEL 1')
    print('KEYWORDS sequence alignment.')
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        for go_id, score in deep_preds[i].items():
            print(f'{prot_id}\t{go_id}\t{score:.2f}')
    print('END')
    return  # NOTE: early exit; remove to run the Fmax/Smin evaluation below

    # Propagate scores
    # deepgo_preds = []
    # for annots_dict in deep_preds:
    #     annots = {}
    #     for go_id, score in annots_dict.items():
    #         for a_id in go_rels.get_anchestors(go_id):
    #             if a_id in annots:
    #                 annots[a_id] = max(annots[a_id], score)
    #             else:
    #                 annots[a_id] = score
    #     deepgo_preds.append(annots)

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    rus = []
    mis = []
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for go_id, score in deep_preds[i].items():
                if score >= threshold:
                    annots.add(go_id)
            new_annots = set()
            for go_id in annots:
                new_annots |= go_rels.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
            go_rels, labels, preds)
        avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
        avg_ic = sum(
            map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                fps)) / len(fps)
        print(f'{avg_fp} {avg_ic}')
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, '
              f'RU: {ru}, MI: {mi} threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
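# The call above unpacks eight values, so this script assumes an extended
# evaluate_annotations() that additionally returns the averaged remaining
# uncertainty (ru), misinformation (mi), and the per-protein false-positive /
# false-negative sets. A hedged sketch mirroring the 4-value variant shown
# earlier:
import math

def evaluate_annotations(go, real_annots, pred_annots):
    total = 0
    p, r, ru, mi = 0.0, 0.0, 0.0, 0.0
    p_total = 0
    fps, fns = [], []
    for real, pred in zip(real_annots, pred_annots):
        tp = real.intersection(pred)
        fp = pred - tp
        fn = real - tp
        fps.append(fp)
        fns.append(fn)
        for go_id in fp:
            mi += go.get_ic(go_id)
        for go_id in fn:
            ru += go.get_ic(go_id)
        total += 1
        r += len(tp) / (len(tp) + len(fn)) if real else 0.0
        if len(pred) > 0:
            p_total += 1
            p += len(tp) / (len(tp) + len(fp))
    ru /= total
    mi /= total
    r /= total
    if p_total > 0:
        p /= p_total
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    s = math.sqrt(ru * ru + mi * mi)
    return f, p, r, s, ru, mi, fps, fns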
def main(go_file, mp_file, mp_annots_file, deepgo_annots_file, id_mapping_file,
         data_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')
    mp = Ontology(mp_file, with_rels=True)
    logging.info('MP loaded')

    logging.info('Loading MGI to UniProt mapping')
    gene2prot = {}
    with open(id_mapping_file) as f:
        next(f)  # skip header
        for line in f:
            it = line.strip().split('\t')
            if it[0] not in gene2prot:
                gene2prot[it[0]] = []
            gene2prot[it[0]] += list(it[6].split())

    logging.info('Loading MP annotations')
    mp_annots = {}
    df = pd.read_pickle(data_file)
    acc2prot = {}
    for row in df.itertuples():
        p_id = row.proteins
        acc_ids = row.accessions.split('; ')
        for acc_id in acc_ids:
            acc2prot[acc_id] = p_id
    with open(mp_annots_file) as f:
        next(f)  # skip header
        for line in f:
            it = line.strip().split('\t')
            for mgi in it[6].split('|'):
                if mgi not in gene2prot:
                    continue
                prot_ids = gene2prot[mgi]
                mp_id = it[4]
                for prot_id in prot_ids:
                    if prot_id not in acc2prot:
                        continue
                    prot_id = acc2prot[prot_id]
                    if prot_id not in mp_annots:
                        mp_annots[prot_id] = set()
                    if mp.has_term(mp_id):
                        mp_annots[prot_id] |= mp.get_anchestors(mp_id)
    print('MP Annotations', len(mp_annots))

    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            annots = []
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots.append(go_id)
            dg_annots[prot_id] = it[1:]
            gos |= set(annots)
    print('DeepGO Annotations', len(dg_annots))

    print('Number of GOs', len(gos))
    go_df = pd.DataFrame({'gos': list(gos)})
    go_df.to_pickle('data/gos.pkl')

    logging.info('Processing annotations')
    cnt = Counter()
    annotations = list()
    for prot_id, annots in mp_annots.items():
        for term in annots:
            cnt[term] += 1

    deepgo_annots = []
    go_annots = []
    mpos = []
    prots = []
    sequences = []
    for row in df.itertuples():
        p_id = row.proteins
        if p_id in mp_annots:
            prots.append(p_id)
            mpos.append(mp_annots[p_id])
            go_annots.append(row.annotations)
            deepgo_annots.append(dg_annots[p_id])
            sequences.append(row.sequences)
    prots_set = set(prots)
    # Report annotated proteins that are missing from the data file
    for key, val in mp_annots.items():
        if key not in prots_set:
            print(key)

    df = pd.DataFrame({
        'proteins': prots,
        'mp_annotations': mpos,
        'go_annotations': go_annots,
        'deepgo_annotations': deepgo_annots,
        'sequences': sequences
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Keep only terms with at least min_count annotations
    res = {}
    for key, val in cnt.items():
        if key == 'MP:0000001':
            continue
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val
    logging.info(f'Number of terms {len(terms)}')

    # Save the list of terms
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)
def helper(train_df, test_df, ont):
    go = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-cafa/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = train_df.rename(columns={"gos": "annotations"})
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_df = test_df.rename(columns={"gos": "annotations"})

    # Annotations
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.annotations:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(1, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])
            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print('Fscore: {}, S: {}, threshold: {}'.format(fscore, s, threshold))
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print('Fmax: {:0.3f}, Smin: {:0.3f}, threshold: {}'.format(fmax, smin, tmax))

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print('AUPR: {:0.3f}'.format(aupr))
    return [recalls, precisions, aupr]
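# helper() returns the PR points and AUPR instead of plotting, so a caller can
# overlay several ontologies on one figure. A hypothetical driver (plot_all and
# its file name are assumptions, not part of the original scripts):
import matplotlib.pyplot as plt

def plot_all(train_df, test_df):
    plt.figure()
    for ont in ('mf', 'bp', 'cc'):
        recalls, precisions, aupr = helper(train_df, test_df, ont)
        plt.plot(recalls, precisions, lw=2,
                 label=f'{ont.upper()} (AUPR = {aupr:0.3f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower right')
    plt.savefig('aupr_all.pdf')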
def main(train_data_file, valid_data_file, terms_file, diamond_scores_file, ont):
    go_rels = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    valid_df = pd.read_pickle(valid_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    valid_annotations = valid_df['annotations'].values
    valid_annotations = list(map(lambda x: set(x), valid_annotations))
    go_rels.calculate_ic(annotations + valid_annotations)

    # IC values of terms
    ics = {}
    for term in terms:
        ics[term] = go_rels.get_ic(term)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # BLAST similarity (Diamond)
    diamond_scores = {}
    with open(diamond_scores_file) as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in diamond_scores:
                diamond_scores[it[0]] = {}
            diamond_scores[it[0]][it[1]] = float(it[2])

    blast_preds = []
    for i, row in enumerate(valid_df.itertuples()):
        annots = {}
        prot_id = row.proteins
        # BlastKNN
        if prot_id in diamond_scores:
            sim_prots = diamond_scores[prot_id]
            allgos = set()
            total_score = 0.0
            for p_id, score in sim_prots.items():
                allgos |= annotations[prot_index[p_id]]
                total_score += score
            allgos = list(sorted(allgos))
            sim = np.zeros(len(allgos), dtype=np.float32)
            for j, go_id in enumerate(allgos):
                s = 0.0
                for p_id, score in sim_prots.items():
                    if go_id in annotations[prot_index[p_id]]:
                        s += score
                sim[j] = s / total_score
            for go_id, score in zip(allgos, sim):
                annots[go_id] = score
        blast_preds.append(annots)

    # DeepGOPlus
    go_set = go_rels.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = valid_df['annotations'].values
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    # Grid search for the BlastKNN/CNN mixing weight alpha on the validation set
    best_fmax = 0.0
    best_alpha = 0.0
    for alpha in range(44, 70):
        alpha /= 100.0
        deep_preds = []
        for i, row in enumerate(valid_df.itertuples()):
            annots_dict = blast_preds[i].copy()
            for go_id in annots_dict:
                annots_dict[go_id] *= alpha
            for j, score in enumerate(row.preds):
                go_id = terms[j]
                score *= 1 - alpha
                if go_id in annots_dict:
                    annots_dict[go_id] += score
                else:
                    annots_dict[go_id] = score
            deep_preds.append(annots_dict)

        fmax = 0.0
        tmax = 0.0
        precisions = []
        recalls = []
        smin = 1000000.0
        rus = []
        mis = []
        for t in range(14, 20):  # narrowed threshold range around the optimum
            threshold = t / 100.0
            preds = []
            for i, row in enumerate(valid_df.itertuples()):
                annots = set()
                for go_id, score in deep_preds[i].items():
                    if score >= threshold:
                        annots.add(go_id)
                new_annots = set()
                for go_id in annots:
                    new_annots |= go_rels.get_anchestors(go_id)
                preds.append(new_annots)
            # Filter classes
            preds = list(
                map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
            fscore, prec, rec, s, ru, mi, fps, fns = evaluate_annotations(
                go_rels, labels, preds)
            avg_fp = sum(map(lambda x: len(x), fps)) / len(fps)
            avg_ic = sum(
                map(lambda x: sum(map(lambda go_id: go_rels.get_ic(go_id), x)),
                    fps)) / len(fps)
            print(f'Fscore: {fscore}, Precision: {prec}, Recall: {rec} S: {s}, '
                  f'RU: {ru}, MI: {mi} threshold: {threshold}')
            if fmax < fscore:
                fmax = fscore
                tmax = threshold
            if smin > s:
                smin = s
        if best_fmax < fmax:
            best_fmax = fmax
            best_alpha = alpha
        print(f'Alpha: {alpha} Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
    print(f'{best_alpha} {best_fmax}')
def main(benchmark_file, train_data_file, hpo_file, terms_file, root_class):
    hp = Ontology(hpo_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    noknowledge_prots = set()
    with open('data-cafa/noknowledge_targets.txt') as f:
        for line in f:
            noknowledge_prots.add(line.strip())

    bench_annots = {}
    with open(benchmark_file) as f:
        for line in f:
            it = line.strip().split('\t')
            t_id = it[0]
            if t_id not in noknowledge_prots:
                continue
            hp_id = it[1]
            if t_id not in bench_annots:
                bench_annots[t_id] = set()
            bench_annots[t_id] |= hp.get_anchestors(hp_id)

    train_df = pd.read_pickle(train_data_file)
    # Naive predictor: each class is predicted with its training frequency
    naive_annots = {}
    for i, row in train_df.iterrows():
        for hp_id in row.hp_annotations:
            if hp_id in naive_annots:
                naive_annots[hp_id] += 1
            else:
                naive_annots[hp_id] = 1
    for hp_id in naive_annots:
        naive_annots[hp_id] /= 1.0 * len(train_df)

    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    all_classes = hp.get_term_set(root_class)
    hp_set = hp_set.intersection(all_classes)
    hp_set.discard(root_class)
    print(len(hp_set))

    labels = []
    for t_id, hps in bench_annots.items():
        labels.append(hps)
    labels = list(map(lambda x: set(filter(lambda y: y in hp_set, x)), labels))

    # Compute AUC
    auc_terms = list(hp_set)
    auc_terms_dict = {v: i for i, v in enumerate(auc_terms)}
    auc_preds = np.zeros((len(bench_annots), len(hp_set)), dtype=np.float32)
    auc_labels = np.zeros((len(bench_annots), len(hp_set)), dtype=np.int32)
    for i in range(len(labels)):
        for j, hp_id in enumerate(auc_terms):
            auc_preds[i, j] = naive_annots[hp_id]
            if hp_id in labels[i]:
                auc_labels[i, j] = 1
    # Compute macro AUROC
    roc_auc = 0.0
    total = 0
    for i, hp_id in enumerate(auc_terms):
        if np.sum(auc_labels[:, i]) == 0:
            continue
        total += 1
        auc = compute_roc(auc_labels[:, i], auc_preds[:, i])
        if not math.isnan(auc):
            roc_auc += auc
        else:
            roc_auc += 1
    roc_auc /= total
    print(roc_auc)
    return  # NOTE: early exit; remove to run the Fmax/Smin evaluation below

    fmax = 0.0
    tmax = 0.0
    pmax = 0.0
    rmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        # The naive prediction is identical for every target
        annots = set()
        for hp_id, score in naive_annots.items():
            if score >= threshold:
                annots.add(hp_id)
        new_annots = set()
        for hp_id in annots:
            new_annots |= hp.get_anchestors(hp_id)
        preds = []
        for t_id, hps in bench_annots.items():
            preds.append(new_annots)
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            pmax = prec
            rmax = rec
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUROC: {roc_auc:0.3f}, AUPR: {aupr:0.3f}, Fmax: {fmax:0.3f}, '
          f'Prec: {pmax:0.3f}, Rec: {rmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')
def main(train_data_file, test_data_file, terms_file, rules_file):
    hp = Ontology('data/hp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    # Load GO-to-HP rules
    rule_annots = {}
    with open(rules_file) as f:
        for line in f:
            it = line.strip().split()
            go_id = it[0].replace('_', ':')
            hp_id = it[1].replace('_', ':')
            if go_id not in rule_annots:
                rule_annots[go_id] = set()
            rule_annots[go_id].add(hp_id)

    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['hp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['hp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    hp.calculate_ic(annotations)

    hp_set = set(terms)
    hp_set_anch = set()
    for hp_id in hp_set:
        hp_set_anch |= hp.get_anchestors(hp_id)

    labels = test_annotations
    # labels = list(map(lambda x: set(filter(lambda y: y in hp_set_anch, x)), labels))

    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    max_preds = None
    for t in range(0, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for item in row.deepgo_annotations:
                go_id, score = item.split('|')
                score = float(score)
                if score >= threshold and go_id in rule_annots:
                    annots |= rule_annots[go_id]
            new_annots = set()
            for hp_id in annots:
                new_annots |= hp.get_anchestors(hp_id)
            preds.append(new_annots)
        fscore, prec, rec, s = evaluate_annotations(hp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
            max_preds = preds
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    test_df['hp_preds'] = max_preds
    test_df.to_pickle('data/predictions_max.pkl')
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')
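# Inferred from the parsing code above (layout is an assumption): rules_file
# is expected to hold one whitespace-separated GO-to-HP pair per line, with
# ':' written as '_', e.g.:
#
#   GO_nnnnnnn    HP_nnnnnnn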
def main(train_data_file, preds_file, ont):
    go = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(preds_file)
    # Annotations
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.gos:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # DeepGO
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for threshold in np.arange(0.005, 1, .01):
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])
            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
def main(go_file, train_sequences_file, train_annotations_file,
         test_sequences_file, test_annotations_file, out_terms_file,
         train_data_file, test_data_file, min_count):
    logging.info('Loading GO')
    go = Ontology(go_file, with_rels=True)

    logging.info('Loading training annotations')
    train_annots = {}
    with open(train_annotations_file, 'r') as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            if prot_id not in train_annots:
                train_annots[prot_id] = set()
            go_id = it[1]
            train_annots[prot_id].add(go_id)

    logging.info('Loading training sequences')
    info, seqs = read_fasta(train_sequences_file)
    proteins = []
    sequences = []
    annotations = []
    for prot_info, sequence in zip(info, seqs):
        prot_id = prot_info.split()[0]
        if prot_id in train_annots:
            proteins.append(prot_id)
            sequences.append(sequence)
            annotations.append(train_annots[prot_id])

    prop_annotations = []
    cnt = Counter()
    for annots in annotations:
        # Propagate annotations
        annots_set = set()
        for go_id in annots:
            annots_set |= go.get_anchestors(go_id)
        prop_annotations.append(annots_set)
        for go_id in annots_set:
            cnt[go_id] += 1

    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'annotations': prop_annotations,
    })
    logging.info(f'Train proteins: {len(df)}')
    logging.info(f'Saving training data to {train_data_file}')
    df.to_pickle(train_data_file)

    # Keep only terms with at least min_count annotations
    res = {}
    for key, val in cnt.items():
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        terms += val
    logging.info(f'Number of terms {len(terms)}')
    logging.info(f'Saving terms to {out_terms_file}')
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)

    logging.info('Loading testing annotations')
    test_annots = {}
    with open(test_annotations_file, 'r') as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            if prot_id not in test_annots:
                test_annots[prot_id] = set()
            go_id = it[1]
            test_annots[prot_id].add(go_id)

    logging.info('Loading testing sequences')
    info, seqs = read_fasta(test_sequences_file)
    proteins = []
    sequences = []
    annotations = []
    for prot_info, sequence in zip(info, seqs):
        prot_id = prot_info.split()[0]
        if prot_id in test_annots:
            proteins.append(prot_id)
            sequences.append(sequence)
            annotations.append(test_annots[prot_id])

    prop_annotations = []
    for annots in annotations:
        # Propagate annotations
        annots_set = set()
        for go_id in annots:
            annots_set |= go.get_anchestors(go_id)
        prop_annotations.append(annots_set)

    df = pd.DataFrame({
        'proteins': proteins,
        'sequences': sequences,
        'annotations': prop_annotations,
    })
    logging.info(f'Test proteins {len(df)}')
    logging.info(f'Saving testing data to {test_data_file}')
    df.to_pickle(test_data_file)
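# read_fasta() is used above but defined elsewhere. A minimal sketch of the
# variant that returns parallel lists of headers and sequences (the prediction
# script further below uses a different, chunked variant):
def read_fasta(filename):
    info = []
    seqs = []
    seq = ''
    inf = ''
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if seq != '':
                    seqs.append(seq)
                    info.append(inf)
                inf = line[1:]
                seq = ''
            else:
                seq += line
    if seq != '':
        seqs.append(seq)
        info.append(inf)
    return info, seqs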
def main(train_data_file, test_data_file, terms_file, diamond_scores_file, ont, alpha):
    alpha /= 100.0
    mp = Ontology('data/mp.obo', with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    test_df = pd.read_pickle(test_data_file)
    annotations = train_df['mp_annotations'].values
    annotations = list(map(lambda x: set(x), annotations))
    test_annotations = test_df['mp_annotations'].values
    test_annotations = list(map(lambda x: set(x), test_annotations))
    mp.calculate_ic(annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # GO2HP preds
    rules = {}
    with open('data/go2hp.txt') as f:
        for line in f:
            it = line.strip().split('\t')
            go_id = it[0].replace('_', ':')
            mp_ids = list(map(lambda x: x.replace('_', ':'), it[1:]))
            if go_id not in rules:
                rules[go_id] = []
            rules[go_id] = mp_ids
    pheno2go_preds = {}
    for i, row in enumerate(test_df.itertuples()):
        prot_id = row.proteins
        if prot_id not in pheno2go_preds:
            pheno2go_preds[prot_id] = {}
        for item in row.deepgo_annotations:
            go_id, score = item.split('|')
            if go_id in rules:
                for mp_id in rules[go_id]:
                    pheno2go_preds[prot_id][mp_id] = max(
                        float(score), pheno2go_preds[prot_id].get(mp_id, 0))

    labels = test_annotations
    fmax = 0.0
    tmax = 0.0
    precisions = []
    recalls = []
    smin = 1000000.0
    for t in range(101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            prot_id = row.proteins
            annots_dict = {}  # pheno2go_preds[prot_id]
            for j, score in enumerate(row.preds):
                mp_id = terms[j]
                annots_dict[mp_id] = max(score, annots_dict.get(mp_id, 0))
            annots = set()
            for mp_id, score in annots_dict.items():
                if score >= threshold:
                    annots.add(mp_id)
            new_annots = set()
            for mp_id in annots:
                new_annots |= mp.get_anchestors(mp_id)
            preds.append(new_annots)
        fscore, prec, rec, s = evaluate_annotations(mp, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig(f'aupr_{ont}_{alpha:0.2f}.pdf')
    df = pd.DataFrame({'precisions': precisions, 'recalls': recalls})
    df.to_pickle(f'PR_{ont}_{alpha:0.2f}.pkl')
def main(in_file, out_file, go_file, model_file, terms_file, annotations_file,
         chunk_size, diamond_file, threshold, batch_size, alpha):
    # Load GO and the list of predicted terms
    go = Ontology(go_file, with_rels=True)
    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()

    # Read known experimental annotations
    annotations = {}
    df = pd.read_pickle(annotations_file)
    for row in df.itertuples():
        annotations[row.proteins] = set(row.prop_annotations)
    go.calculate_ic(annotations.values())

    # BlastKNN predictions from Diamond hits against annotated proteins
    diamond_preds = {}
    mapping = {}
    with gzip.open(diamond_file, 'rt') as f:
        for line in f:
            it = line.strip().split()
            if it[0] not in mapping:
                mapping[it[0]] = {}
            mapping[it[0]][it[1]] = float(it[2])
    for prot_id, sim_prots in mapping.items():
        annots = {}
        allgos = set()
        total_score = 0.0
        for p_id, score in sim_prots.items():
            allgos |= annotations[p_id]
            total_score += score
        allgos = list(sorted(allgos))
        sim = np.zeros(len(allgos), dtype=np.float32)
        for j, go_id in enumerate(allgos):
            s = 0.0
            for p_id, score in sim_prots.items():
                if go_id in annotations[p_id]:
                    s += score
            sim[j] = s / total_score
        for go_id, score in zip(allgos, sim):
            annots[go_id] = score
        diamond_preds[prot_id] = annots

    # Load CNN model
    model = load_model(model_file)
    # Alphas for the latest model
    alphas = {
        NAMESPACES['mf']: 0.55,
        NAMESPACES['bp']: 0.59,
        NAMESPACES['cc']: 0.46
    }
    # Alphas for the cafa2 model
    # alphas = {NAMESPACES['mf']: 0.63, NAMESPACES['bp']: 0.68, NAMESPACES['cc']: 0.48}

    start_time = time.time()
    total_seq = 0
    w = gzip.open(out_file, 'wt')
    for prot_ids, sequences in read_fasta(in_file, chunk_size):
        total_seq += len(prot_ids)
        deep_preds = {}
        ids, data = get_data(sequences)
        preds = model.predict(data, batch_size=batch_size)
        assert preds.shape[1] == len(terms)
        for i, j in enumerate(ids):
            prot_id = prot_ids[j]
            if prot_id not in deep_preds:
                deep_preds[prot_id] = {}
            for l in range(len(terms)):
                if preds[i, l] >= 0.01:  # Filter out very low scores
                    if terms[l] not in deep_preds[prot_id]:
                        deep_preds[prot_id][terms[l]] = preds[i, l]
                    else:
                        deep_preds[prot_id][terms[l]] = max(
                            deep_preds[prot_id][terms[l]], preds[i, l])
        # Combine Diamond preds and DeepGO
        for prot_id in prot_ids:
            annots = {}
            if prot_id in diamond_preds:
                for go_id, score in diamond_preds[prot_id].items():
                    annots[go_id] = score * alphas[go.get_namespace(go_id)]
            for go_id, score in deep_preds[prot_id].items():
                if go_id in annots:
                    annots[go_id] += (1 - alphas[go.get_namespace(go_id)]) * score
                else:
                    annots[go_id] = (1 - alphas[go.get_namespace(go_id)]) * score
            # Propagate scores with ontology structure
            gos = list(annots.keys())
            for go_id in gos:
                for g_id in go.get_anchestors(go_id):
                    if g_id in annots:
                        annots[g_id] = max(annots[g_id], annots[go_id])
                    else:
                        annots[g_id] = annots[go_id]
            sannots = sorted(annots.items(), key=lambda x: x[1], reverse=True)
            for go_id, score in sannots:
                if score >= threshold:
                    w.write(prot_id + '\t' + go_id + '\t' +
                            go.get_term(go_id)['name'] +
                            '\t%.2f' % go.get_ic(go_id) + '\t%.3f\n' % score)
            w.write('\n')
    w.close()
    total_time = time.time() - start_time
    print('Total prediction time for %d sequences is %d' % (total_seq, total_time))
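# The prediction loop above relies on a chunked read_fasta() generator and a
# get_data() encoder, both defined elsewhere. Hedged sketches follow: MAXLEN,
# AAINDEX and the exact encoding are assumptions. The real get_data() may also
# split sequences longer than MAXLEN into pieces, which is why the loop takes
# a max over rows mapping to the same protein id.
import numpy as np

MAXLEN = 2000  # assumed CNN input length
AAINDEX = {aa: i + 1 for i, aa in enumerate('ARNDCQEGHILKMFPSTWYV')}

def read_fasta(filename, chunk_size):
    # Yield (prot_ids, sequences) batches of at most chunk_size records.
    prot_ids, seqs = [], []
    seq, inf = '', ''
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if seq:
                    prot_ids.append(inf)
                    seqs.append(seq)
                    if len(prot_ids) == chunk_size:
                        yield prot_ids, seqs
                        prot_ids, seqs = [], []
                inf = line[1:].split()[0]
                seq = ''
            else:
                seq += line
    if seq:
        prot_ids.append(inf)
        seqs.append(seq)
    if prot_ids:
        yield prot_ids, seqs

def get_data(sequences):
    # One-hot encode each sequence, truncated to MAXLEN; ids[i] maps row i of
    # the encoded batch back to the index of its source sequence.
    ids = np.arange(len(sequences))
    data = np.zeros((len(sequences), MAXLEN, 21), dtype=np.float32)
    for i, seq in enumerate(sequences):
        for j, aa in enumerate(seq[:MAXLEN]):
            data[i, j, AAINDEX.get(aa, 0)] = 1.0
    return ids, data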
def main(go_file, hp_file, hp_annots_file, deepgo_annots_file, data_file,
         expressions_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    print('GO loaded')
    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')

    iea_annots = {}
    go_annots = {}
    seqs = {}
    df = pd.read_pickle(data_file)
    df = df[df['orgs'] == '9606']  # human proteins only
    acc2prot = {}
    name2prot = {}
    for i, row in df.iterrows():
        accs = row['accessions'].split('; ')
        names = row['gene_names']
        p_id = row['proteins']
        for acc in accs:
            acc2prot[acc] = p_id
        for name in names:
            name = name.upper()
            if name not in name2prot:
                name2prot[name] = set()
            name2prot[name].add(p_id)
        if p_id not in go_annots:
            go_annots[p_id] = set()
        if p_id not in iea_annots:
            iea_annots[p_id] = set()
        go_annots[p_id] |= set(row.exp_annotations)
        iea_annots[p_id] |= set(row.iea_annotations)
        seqs[p_id] = row.sequences
    print('GO Annotations', len(go_annots))

    print('Loading HP annotations')
    hp_annots = {}
    unrev = set()
    with open(hp_annots_file) as f:
        next(f)  # skip header
        for line in f:
            it = line.strip().split('\t')
            acc_id = it[0]
            hp_id = it[1]
            if acc_id not in acc2prot:
                unrev.add(acc_id)
                continue
            p_id = acc2prot[acc_id]
            if p_id not in hp_annots:
                hp_annots[p_id] = set()
            if hp.has_term(hp_id):
                hp_annots[p_id] |= hp.get_anchestors(hp_id)
    print('HP Annotations', len(hp_annots))

    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            p_id = it[0]
            annots = dg_annots.get(p_id, {})
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots[go_id] = max(score, annots.get(go_id, 0))
            dg_annots[p_id] = annots
            gos |= set(annots.keys())
    print('DeepGO Annotations', len(dg_annots))

    deepgo_annots = {}
    for g_id, annots in dg_annots.items():
        deepgo_annots[g_id] = [
            go_id + '|' + str(score) for go_id, score in annots.items()
        ]

    print('Number of GOs', len(gos))
    df = pd.DataFrame({'gos': list(gos)})
    # df.to_pickle('data-cafa/gos.pkl')

    logging.info('Processing annotations')
    cnt = Counter()
    annotations = list()
    for p_id, annots in hp_annots.items():
        for term in annots:
            cnt[term] += 1

    # Gene expression values (53 tissues), normalized by the per-gene maximum
    gene_exp = {}
    max_val = 0
    with open(expressions_file) as f:
        for line in f:
            if line.startswith('#') or line.startswith('Gene'):
                continue
            it = line.strip().split('\t')
            gene_name = it[1].upper()
            if gene_name in name2prot:
                exp = np.zeros((53, ), dtype=np.float32)
                for i in range(len(it[2:])):
                    exp[i] = float(it[2 + i]) if it[2 + i] != '' else 0.0
                for p_id in name2prot[gene_name]:
                    gene_exp[p_id] = exp / np.max(exp)
    print('Expression values', len(gene_exp))

    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    proteins = []
    sequences = []
    expressions = []
    mis_exp = 0
    for p_id, phenos in hp_annots.items():
        if p_id not in dg_annots:
            continue
        proteins.append(p_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[p_id])
        iea_annotations.append(iea_annots[p_id])
        deepgo_annotations.append(deepgo_annots[p_id])
        sequences.append(seqs[p_id])
        if p_id in gene_exp:
            expressions.append(gene_exp[p_id])
        else:
            expressions.append(np.zeros((53, ), dtype=np.float32))
            mis_exp += 1
    print('Missing expressions', mis_exp)

    df = pd.DataFrame({
        'proteins': proteins,
        'hp_annotations': hpos,
        'go_annotations': go_annotations,
        'iea_annotations': iea_annotations,
        'deepgo_annotations': deepgo_annotations,
        'sequences': sequences,
        'expressions': expressions
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Build the CAFA test set from benchmark targets unseen in training
    test_annots = {}
    tar2prot = {}
    with open('data-cafa/tar2prot.txt') as f:
        for line in f:
            it = line[1:].strip().split()
            tar2prot[it[0]] = it[1]
    unknown_prots = set()
    with open('data-cafa/benchmark/groundtruth/leafonly_HPO.txt') as f:
        for line in f:
            it = line.strip().split()
            p_id = tar2prot[it[0]]
            if p_id in hp_annots:
                continue
            unknown_prots.add(it[0])
            hp_id = it[1]
            if p_id not in test_annots:
                test_annots[p_id] = set()
            if hp.has_term(hp_id):
                test_annots[p_id] |= hp.get_anchestors(hp_id)
    with open('data-cafa/noknowledge_targets.txt', 'w') as f:
        for t_id in unknown_prots:
            f.write(t_id + '\n')

    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    proteins = []
    sequences = []
    expressions = []
    mis_exp = 0
    for p_id, phenos in test_annots.items():
        if p_id not in dg_annots:
            continue
        proteins.append(p_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[p_id])
        iea_annotations.append(iea_annots[p_id])
        deepgo_annotations.append(deepgo_annots[p_id])
        sequences.append(seqs[p_id])
        if p_id in gene_exp:
            expressions.append(gene_exp[p_id])
        else:
            expressions.append(np.zeros((53, ), dtype=np.float32))
            mis_exp += 1
    df = pd.DataFrame({
        'proteins': proteins,
        'hp_annotations': hpos,
        'go_annotations': go_annotations,
        'iea_annotations': iea_annotations,
        'deepgo_annotations': deepgo_annotations,
        'sequences': sequences,
        'expressions': expressions
    })
    print('Missing expressions test', mis_exp)
    df.to_pickle('data-cafa/human_test.pkl')
    print(f'Number of test proteins {len(df)}')

    # Keep only terms with at least min_count annotations, ordered by the
    # ontology's term order
    terms_set = set()
    all_terms = []
    for key, val in cnt.items():
        if key == 'HP:0000001':
            continue
        all_terms.append(key)
        if val >= min_count:
            terms_set.add(key)
    terms = []
    for t_id in hp.get_ordered_terms():
        if t_id in terms_set:
            terms.append(t_id)
    logging.info(f'Number of terms {len(terms)}')
    # The original ends here; saving the terms, mirroring the other scripts'
    # pattern with the otherwise-unused out_terms_file parameter:
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)
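# All of the scripts above share the Ontology helper. A condensed, hedged
# sketch of its core methods follows; the real class also parses OBO stanzas
# (the load() call below stands in for that) and implements get_namespace,
# get_namespace_terms, get_term_set and get_ordered_terms. Note that
# get_anchestors keeps the project's original spelling and includes the term
# itself.
import math
from collections import Counter, deque

class Ontology(object):
    def __init__(self, filename, with_rels=False):
        # self.ont maps term id -> {'is_a': [...], 'namespace': ..., ...};
        # when with_rels=True, part_of parents are merged into 'is_a'.
        self.ont = self.load(filename, with_rels)
        self.ic = None

    def has_term(self, term_id):
        return term_id in self.ont

    def get_term(self, term_id):
        return self.ont.get(term_id)

    def get_anchestors(self, term_id):
        # Transitive closure of the parent relation, including term_id.
        if term_id not in self.ont:
            return set()
        term_set = set()
        q = deque([term_id])
        while q:
            t_id = q.popleft()
            if t_id not in term_set:
                term_set.add(t_id)
                q.extend(self.ont[t_id]['is_a'])
        return term_set

    def calculate_ic(self, annots):
        # IC of a class from its annotation frequency relative to the
        # smallest parent frequency.
        cnt = Counter()
        for x in annots:
            cnt.update(x)
        self.ic = {}
        for go_id, n in cnt.items():
            parents = [p for p in self.ont[go_id]['is_a'] if p in cnt]
            min_n = min([cnt[p] for p in parents]) if parents else n
            self.ic[go_id] = math.log(min_n / n, 2)

    def get_ic(self, go_id):
        return self.ic.get(go_id, 0.0) if self.ic is not None else 0.0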