# Shared imports for the scripts below. Ontology, NAMESPACES, FUNC_DICT and
# evaluate_annotations are assumed to come from the repository's utils helpers.
import logging
import math
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import Ontology, NAMESPACES, FUNC_DICT, evaluate_annotations


def main(hp_file, train_data_file, terms_file, dis_phenotypes, omim_file,
         predictions_file, gene_annots_file, dis_annots_file, fold):
    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')

    terms_df = pd.read_pickle(terms_file)
    terms = terms_df['terms'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    # Prefix all per-fold files with the fold identifier
    predictions_file = f'fold{fold}_exp-' + predictions_file
    gene_annots_file = f'fold{fold}_exp-' + gene_annots_file
    dis_annots_file = f'fold{fold}_exp-' + dis_annots_file
    real_annots_file = f'fold{fold}_exp-data/gene_annotations_real.tab'

    # Collect OMIM disease ids and their associated gene symbols
    diseases = set()
    genes = set()
    with open(omim_file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            it = line.strip().split('\t')
            omim_id = it[0].split(', ')[-1].split()[0]
            gene_symbols = it[1].split(', ')
            genes |= set(gene_symbols)
            diseases.add('OMIM:' + omim_id)
    print(len(diseases), len(genes))

    # Disease -> HPO phenotype annotations, restricted to the OMIM diseases above
    dis_annots = {}
    with open(dis_phenotypes) as f:
        for line in f:
            it = line.strip().split('\t')
            dis_id = it[0] + ':' + it[1]
            if dis_id not in diseases:
                continue
            hp_id = it[4]
            if not hp.has_term(hp_id):
                continue
            if dis_id not in dis_annots:
                dis_annots[dis_id] = set()
            dis_annots[dis_id].add(hp_id)

    with open(dis_annots_file, 'w') as w:
        for dis_id, annots in dis_annots.items():
            w.write(dis_id)
            for hp_id in annots:
                w.write('\t' + hp_id)
            w.write('\n')

    # Export predicted and real gene annotations as tab-separated files
    df = pd.read_pickle(predictions_file)
    with open(gene_annots_file, 'w') as w:
        for i, row in df.iterrows():
            w.write(row['genes'])
            for hp_id in row['hp_preds']:
                w.write('\t' + hp_id)
            w.write('\n')

    with open(real_annots_file, 'w') as w:
        for i, row in df.iterrows():
            w.write(row['genes'])
            for hp_id in row['hp_annotations']:
                w.write('\t' + hp_id)
            w.write('\n')
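# A minimal invocation sketch for the exporter above. Every file name below is
# an illustrative assumption, not a path taken from the repository (the real
# script presumably wires these parameters through a CLI).
if __name__ == '__main__':
    main(
        hp_file='data/hp.obo',
        train_data_file='data/train_data.pkl',
        terms_file='data/terms.pkl',
        dis_phenotypes='data/phenotype_annotations.tab',
        omim_file='data/omim_genes.txt',
        predictions_file='data/predictions.pkl',
        gene_annots_file='data/gene_annotations.tab',
        dis_annots_file='data/dis_annotations.tab',
        fold=1,
    )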
def main(go_file, mp_file, mp_annots_file, deepgo_annots_file, id_mapping_file,
         data_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    logging.info('GO loaded')
    mp = Ontology(mp_file, with_rels=True)
    logging.info('MP loaded')

    logging.info('Loading MGI to UniProt mapping')
    # MGI gene id -> UniProt accessions (column 7 of the mapping file).
    # Note: the original code initialized this dict as prot2gene but used
    # gene2prot everywhere, which would raise a NameError; fixed here.
    gene2prot = {}
    with open(id_mapping_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            if it[0] not in gene2prot:
                gene2prot[it[0]] = []
            gene2prot[it[0]] += list(it[6].split())

    logging.info('Loading MP annotations')
    mp_annots = {}
    df = pd.read_pickle(data_file)
    acc2prot = {}
    for row in df.itertuples():
        p_id = row.proteins
        acc_ids = row.accessions.split('; ')
        for acc_id in acc_ids:
            acc2prot[acc_id] = p_id

    with open(mp_annots_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            for mgi in it[6].split('|'):
                if mgi not in gene2prot:
                    continue
                prot_ids = gene2prot[mgi]
                mp_id = it[4]
                for prot_id in prot_ids:
                    if prot_id not in acc2prot:
                        continue
                    prot_id = acc2prot[prot_id]
                    if prot_id not in mp_annots:
                        mp_annots[prot_id] = set()
                    # Propagate each annotation to all of its ancestors
                    if mp.has_term(mp_id):
                        mp_annots[prot_id] |= mp.get_anchestors(mp_id)
    print('MP Annotations', len(mp_annots))

    # DeepGO function predictions: protein -> list of 'GO:xxx|score' items
    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            prot_id = it[0]
            annots = []
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots.append(go_id)
            dg_annots[prot_id] = it[1:]
            gos |= set(annots)
    print('DeepGO Annotations', len(dg_annots))
    print('Number of GOs', len(gos))

    go_df = pd.DataFrame({'gos': list(gos)})
    go_df.to_pickle('data/gos.pkl')

    logging.info('Processing annotations')
    cnt = Counter()
    annotations = list()
    for prot_id, annots in mp_annots.items():
        for term in annots:
            cnt[term] += 1

    # Assemble the training data frame from proteins that have MP annotations
    deepgo_annots = []
    go_annots = []
    mpos = []
    prots = []
    sequences = []
    for row in df.itertuples():
        p_id = row.proteins
        if p_id in mp_annots:
            prots.append(p_id)
            mpos.append(mp_annots[p_id])
            go_annots.append(row.annotations)
            deepgo_annots.append(dg_annots[p_id])
            sequences.append(row.sequences)

    # Report annotated proteins that are missing from the data frame
    prots_set = set(prots)
    for key, val in mp_annots.items():
        if key not in prots_set:
            print(key)

    df = pd.DataFrame({
        'proteins': prots,
        'mp_annotations': mpos,
        'go_annotations': go_annots,
        'deepgo_annotations': deepgo_annots,
        'sequences': sequences
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Filter terms with at least min_count annotations, grouped by ontology prefix
    res = {}
    for key, val in cnt.items():
        if key == 'MP:0000001':  # skip the MP root term
            continue
        if val >= min_count:
            ont = key.split(':')[0]
            if ont not in res:
                res[ont] = []
            res[ont].append(key)
    terms = []
    for key, val in res.items():
        print(key, len(val))
        terms += val
    logging.info(f'Number of terms {len(terms)}')

    # Save the list of terms
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)
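# Why the propagation step above matters: annotating a protein with a term
# implies annotations to all of that term's ancestors, so term counts include
# implied annotations. A toy sketch of the union-of-ancestors idea;
# toy_ancestors is a hypothetical stand-in for Ontology.get_anchestors, which
# returns a term together with its ancestor closure up to the root.
toy_ancestors = {
    'MP:0000002': {'MP:0000002', 'MP:0000001'},  # hypothetical term plus root
}
propagated = set()
for mp_id in ['MP:0000002']:
    propagated |= toy_ancestors[mp_id]
assert propagated == {'MP:0000002', 'MP:0000001'}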
def helper(train_df, test_df, ont):
    go = Ontology('data-cafa/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-cafa/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = train_df.rename(columns={"gos": "annotations"})
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = test_df.rename(columns={"gos": "annotations"})
    # Propagate test annotations to their ancestors
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.annotations:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # Restrict evaluation to the chosen sub-ontology, minus its root term
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    # Sweep prediction thresholds to find Fmax and Smin
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for t in range(1, 101):
        threshold = t / 100.0
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])
            # Propagate predictions to ancestor terms
            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print('Fscore: {}, S: {}, threshold: {}'.format(fscore, s, threshold))
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print('Fmax: {:0.3f}, Smin: {:0.3f}, threshold: {}'.format(
        fmax, smin, tmax))

    # AUPR from the precision-recall pairs collected over all thresholds
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print('AUPR: {:0.3f}'.format(aupr))
    return [recalls, precisions, aupr]
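# A sketch of how helper() might be driven across the three GO sub-ontologies
# to plot their precision-recall curves together. The train_files/pred_files
# dicts of pickle paths are assumptions for this sketch.
def plot_all(train_files, pred_files):
    plt.figure()
    for ont in ('mf', 'bp', 'cc'):
        train_df = pd.read_pickle(train_files[ont])
        test_df = pd.read_pickle(pred_files[ont])
        recalls, precisions, aupr = helper(train_df, test_df, ont)
        plt.plot(recalls, precisions, lw=2,
                 label=f'{ont.upper()} (AUPR = {aupr:0.3f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower right')
    plt.savefig('pr_curves.pdf')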
def main(train_data_file, preds_file, ont):
    go = Ontology('data/go.obo', with_rels=True)
    terms_df = pd.read_pickle('data-deepgo/' + ont + '.pkl')
    terms = terms_df['functions'].values.flatten()
    terms_dict = {v: i for i, v in enumerate(terms)}

    train_df = pd.read_pickle(train_data_file)
    annotations = train_df['annotations'].values
    annotations = list(map(lambda x: set(x), annotations))

    test_df = pd.read_pickle(preds_file)
    # Propagate test annotations to their ancestors
    test_annotations = []
    for i, row in enumerate(test_df.itertuples()):
        annots = set()
        for go_id in row.gos:
            if go.has_term(go_id):
                annots |= go.get_anchestors(go_id)
        test_annotations.append(annots)
    go.calculate_ic(annotations + test_annotations)

    prot_index = {}
    for i, row in enumerate(train_df.itertuples()):
        prot_index[row.proteins] = i

    # Restrict evaluation to the chosen sub-ontology, minus its root term
    go_set = go.get_namespace_terms(NAMESPACES[ont])
    go_set.remove(FUNC_DICT[ont])
    labels = test_annotations
    labels = list(map(lambda x: set(filter(lambda y: y in go_set, x)), labels))
    print(len(go_set))

    # Sweep prediction thresholds to find Fmax and Smin
    fmax = 0.0
    tmax = 0.0
    smin = 1000.0
    precisions = []
    recalls = []
    for threshold in np.arange(0.005, 1, .01):
        preds = []
        for i, row in enumerate(test_df.itertuples()):
            annots = set()
            for j, score in enumerate(row.predictions):
                if score >= threshold:
                    annots.add(terms[j])
            # Propagate predictions to ancestor terms
            new_annots = set()
            for go_id in annots:
                new_annots |= go.get_anchestors(go_id)
            preds.append(new_annots)
        # Filter classes
        preds = list(
            map(lambda x: set(filter(lambda y: y in go_set, x)), preds))
        fscore, prec, rec, s = evaluate_annotations(go, labels, preds)
        precisions.append(prec)
        recalls.append(rec)
        print(f'Fscore: {fscore}, S: {s}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
        if smin > s:
            smin = s
    print(f'Fmax: {fmax:0.3f}, Smin: {smin:0.3f}, threshold: {tmax}')

    # AUPR and the precision-recall curve
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    sorted_index = np.argsort(recalls)
    recalls = recalls[sorted_index]
    precisions = precisions[sorted_index]
    aupr = np.trapz(precisions, recalls)
    print(f'AUPR: {aupr:0.3f}')

    plt.figure()
    lw = 2
    plt.plot(recalls, precisions, color='darkorange', lw=lw,
             label=f'AUPR curve (area = {aupr:0.3f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Area Under the Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.savefig('aupr.pdf')
    plt.show()
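# For reference, a sketch of what an evaluate_annotations-style function
# computes at each threshold, following the CAFA definitions: protein-centric
# recall averaged over all proteins, precision averaged only over proteins
# with at least one prediction, and the IC-weighted semantic distance
# S = sqrt(ru^2 + mi^2). This is an illustrative re-derivation, not the
# repository's implementation; `ic` maps each term to its information content.
def toy_evaluate(ic, labels, preds):
    # labels/preds: parallel lists of term sets, one pair per protein;
    # assumes every protein has at least one true annotation.
    n, p_total = 0, 0
    p_sum, r_sum, ru, mi = 0.0, 0.0, 0.0, 0.0
    for true_set, pred_set in zip(labels, preds):
        tp = true_set & pred_set
        ru += sum(ic.get(t, 0.0) for t in true_set - tp)  # remaining uncertainty
        mi += sum(ic.get(t, 0.0) for t in pred_set - tp)  # misinformation
        n += 1
        r_sum += len(tp) / len(true_set)
        if pred_set:
            p_total += 1
            p_sum += len(tp) / len(pred_set)
    s = math.sqrt((ru / n) ** 2 + (mi / n) ** 2)
    rec = r_sum / n
    prec = p_sum / p_total if p_total else 0.0
    f = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    return f, prec, rec, s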
def main(go_file, hp_file, hp_annots_file, deepgo_annots_file, data_file,
         expressions_file, out_data_file, out_terms_file, min_count):
    go = Ontology(go_file, with_rels=True)
    print('GO loaded')
    hp = Ontology(hp_file, with_rels=True)
    print('HP loaded')

    # Collect GO annotations, sequences and id mappings for human proteins
    iea_annots = {}
    go_annots = {}
    seqs = {}
    df = pd.read_pickle(data_file)
    df = df[df['orgs'] == '9606']
    acc2prot = {}
    name2prot = {}
    for i, row in df.iterrows():
        accs = row['accessions'].split('; ')
        names = row['gene_names']
        p_id = row['proteins']
        for acc in accs:
            acc2prot[acc] = p_id
        for name in names:
            name = name.upper()
            if name not in name2prot:
                name2prot[name] = set()
            name2prot[name].add(p_id)
        if p_id not in go_annots:
            go_annots[p_id] = set()
        if p_id not in iea_annots:
            iea_annots[p_id] = set()
        go_annots[p_id] |= set(row.exp_annotations)
        iea_annots[p_id] |= set(row.iea_annotations)
        seqs[p_id] = row.sequences
    print('GO Annotations', len(go_annots))

    print('Loading HP annotations')
    hp_annots = {}
    unrev = set()
    with open(hp_annots_file) as f:
        next(f)
        for line in f:
            it = line.strip().split('\t')
            acc_id = it[0]
            hp_id = it[1]
            if acc_id not in acc2prot:
                unrev.add(acc_id)
                continue
            p_id = acc2prot[acc_id]
            if p_id not in hp_annots:
                hp_annots[p_id] = set()
            # Propagate each annotation to all of its ancestors
            if hp.has_term(hp_id):
                hp_annots[p_id] |= hp.get_anchestors(hp_id)
    print('HP Annotations', len(hp_annots))

    # DeepGO predictions: keep the maximum score per GO term
    dg_annots = {}
    gos = set()
    with open(deepgo_annots_file) as f:
        for line in f:
            it = line.strip().split('\t')
            p_id = it[0]
            annots = dg_annots.get(p_id, {})
            for item in it[1:]:
                go_id, score = item.split('|')
                score = float(score)
                annots[go_id] = max(score, annots.get(go_id, 0))
            dg_annots[p_id] = annots
            gos |= set(annots.keys())
    print('DeepGO Annotations', len(dg_annots))

    deepgo_annots = {}
    for g_id, annots in dg_annots.items():
        deepgo_annots[g_id] = [
            go_id + '|' + str(score) for go_id, score in annots.items()
        ]
    print('Number of GOs', len(gos))
    df = pd.DataFrame({'gos': list(gos)})
    # df.to_pickle('data-cafa/gos.pkl')

    logging.info('Processing annotations')
    cnt = Counter()
    annotations = list()
    for p_id, annots in hp_annots.items():
        for term in annots:
            cnt[term] += 1

    # Gene expression values (53 tissues), normalized by their maximum
    gene_exp = {}
    with open(expressions_file) as f:
        for line in f:
            if line.startswith('#') or line.startswith('Gene'):
                continue
            it = line.strip().split('\t')
            gene_name = it[1].upper()
            if gene_name in name2prot:
                exp = np.zeros((53, ), dtype=np.float32)
                for i in range(len(it[2:])):
                    exp[i] = float(it[2 + i]) if it[2 + i] != '' else 0.0
                for p_id in name2prot[gene_name]:
                    gene_exp[p_id] = exp / np.max(exp)
    print('Expression values', len(gene_exp))

    # Assemble the training data frame
    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    proteins = []
    sequences = []
    expressions = []
    mis_exp = 0
    for p_id, phenos in hp_annots.items():
        if p_id not in dg_annots:
            continue
        proteins.append(p_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[p_id])
        iea_annotations.append(iea_annots[p_id])
        deepgo_annotations.append(deepgo_annots[p_id])
        sequences.append(seqs[p_id])
        if p_id in gene_exp:
            expressions.append(gene_exp[p_id])
        else:
            expressions.append(np.zeros((53, ), dtype=np.float32))
            mis_exp += 1
    print('Missing expressions', mis_exp)

    df = pd.DataFrame({
        'proteins': proteins,
        'hp_annotations': hpos,
        'go_annotations': go_annotations,
        'iea_annotations': iea_annotations,
        'deepgo_annotations': deepgo_annotations,
        'sequences': sequences,
        'expressions': expressions
    })
    df.to_pickle(out_data_file)
    print(f'Number of proteins {len(df)}')

    # Build the CAFA test set from the benchmark ground truth
    test_annots = {}
    tar2prot = {}
    with open('data-cafa/tar2prot.txt') as f:
        for line in f:
            it = line[1:].strip().split()
            tar2prot[it[0]] = it[1]
    unknown_prots = set()
    with open('data-cafa/benchmark/groundtruth/leafonly_HPO.txt') as f:
        for line in f:
            it = line.strip().split()
            p_id = tar2prot[it[0]]
            if p_id in hp_annots:
                continue
            unknown_prots.add(it[0])
            hp_id = it[1]
            if p_id not in test_annots:
                test_annots[p_id] = set()
            if hp.has_term(hp_id):
                test_annots[p_id] |= hp.get_anchestors(hp_id)
    with open('data-cafa/noknowledge_targets.txt', 'w') as f:
        for t_id in unknown_prots:
            f.write(t_id + '\n')

    # Assemble the test data frame for the no-knowledge targets
    deepgo_annotations = []
    go_annotations = []
    iea_annotations = []
    hpos = []
    proteins = []
    sequences = []
    expressions = []
    mis_exp = 0
    for p_id, phenos in test_annots.items():
        if p_id not in dg_annots:
            continue
        proteins.append(p_id)
        hpos.append(phenos)
        go_annotations.append(go_annots[p_id])
        iea_annotations.append(iea_annots[p_id])
        deepgo_annotations.append(deepgo_annots[p_id])
        sequences.append(seqs[p_id])
        if p_id in gene_exp:
            expressions.append(gene_exp[p_id])
        else:
            expressions.append(np.zeros((53, ), dtype=np.float32))
            mis_exp += 1

    df = pd.DataFrame({
        'proteins': proteins,
        'hp_annotations': hpos,
        'go_annotations': go_annotations,
        'iea_annotations': iea_annotations,
        'deepgo_annotations': deepgo_annotations,
        'sequences': sequences,
        'expressions': expressions
    })
    print('Missing expressions test', mis_exp)
    df.to_pickle('data-cafa/human_test.pkl')
    print(f'Number of test proteins {len(df)}')

    # Filter terms with at least min_count annotations, in ontology order
    terms_set = set()
    all_terms = []
    for key, val in cnt.items():
        if key == 'HP:0000001':  # skip the HP root term
            continue
        all_terms.append(key)
        if val >= min_count:
            terms_set.add(key)
    terms = []
    for t_id in hp.get_ordered_terms():
        if t_id in terms_set:
            terms.append(t_id)
    logging.info(f'Number of terms {len(terms)}')

    # The original snippet ended without writing the selected terms, leaving
    # out_terms_file unused; saving them here mirrors the term-saving step of
    # the MP data script above (an assumption about the intended behavior).
    df = pd.DataFrame({'terms': terms})
    df.to_pickle(out_terms_file)
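# A minimal invocation sketch for the data-preparation script above. Every
# path and the min_count value are illustrative assumptions, not values taken
# from the repository.
if __name__ == '__main__':
    main(
        go_file='data/go.obo',
        hp_file='data/hp.obo',
        hp_annots_file='data-cafa/hp_annotations.tab',
        deepgo_annots_file='data-cafa/deepgo_annotations.tab',
        data_file='data/swissprot.pkl',
        expressions_file='data-cafa/expressions.tsv',
        out_data_file='data-cafa/human.pkl',
        out_terms_file='data-cafa/terms.pkl',
        min_count=10,
    )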