def _group_rows_by_root_ipro(df, root_ids):
    """Collect, per root InterPro id, the rows of `df` annotated with it.

    Walks `df` exactly once and returns a dict mapping each id in
    `root_ids` that occurs in some row's 'ipros' list to a list of
    (labels, predictions, gos) tuples, in row order.
    """
    grouped = {}
    for _, row in df.iterrows():
        row_ipros = row['ipros']
        # Rows whose 'ipros' value is not a list are skipped; presumably
        # these are proteins the left merge could not match -- confirm.
        if not isinstance(row_ipros, list):
            continue
        entry = (row['labels'], row['predictions'], row['gos'])
        # The intersection yields each id at most once, so a row that lists
        # the same id twice is still counted once for it.
        # NOTE(review): assumes the ids in 'ipros' are hashable (strings).
        for ipro_id in root_ids.intersection(row_ipros):
            grouped.setdefault(ipro_id, []).append(entry)
    return grouped


def performanc_by_interpro():
    """Log protein-averaged F-measure, precision and recall per InterPro id.

    Reads the pickled test predictions for FUNCTION from DATA_ROOT, left-joins
    them with the protein/InterPro table on 'proteins', and evaluates every
    InterPro id whose 'parents' entry is empty.  For each such id with usable
    proteins it logs one tab-separated line via `logging.info`:
    id, number of proteins, F, precision, recall.

    Per protein, the false-negative count is increased by the number of
    ancestor terms of its annotated GO ids (restricted to `all_functions`)
    that are neither GO_ID nor in `func_set`.  Precision is averaged over
    proteins with at least one true positive; recall is averaged over all
    proteins for which tp, fp and fn are not all zero.

    Returns None; the only output is the log.
    """
    pred_df = pd.read_pickle(DATA_ROOT + 'test-' + FUNCTION + '-preds.pkl')
    ipro_df = load_prot_ipro()
    df = pred_df.merge(ipro_df, on='proteins', how='left')
    ipro = get_ipro()

    root_ids = set(
        ipro_id for ipro_id in ipro if len(ipro[ipro_id]['parents']) == 0)
    # One pass over the frame instead of a full scan per InterPro id.
    rows_by_ipro = _group_rows_by_root_ipro(df, root_ids)

    # Iterate `ipro` itself so lines are logged in its iteration order.
    for ipro_id in ipro:
        if ipro_id not in root_ids:
            continue
        rows = rows_by_ipro.get(ipro_id)
        if not rows:
            continue

        pr = 0
        rc = 0
        total = 0
        p_total = 0
        for labels, predictions, gos in rows:
            tp = np.sum(labels * predictions)
            fp = np.sum(predictions) - tp
            fn = np.sum(labels) - tp

            # Ancestor terms that fall outside func_set are added to the
            # false negatives for this protein.
            missing = set()
            for go_id in gos:
                if go_id in all_functions:
                    missing |= get_anchestors(go, go_id)
            missing.discard(GO_ID)
            missing -= func_set
            fn += len(missing)

            if tp == 0 and fp == 0 and fn == 0:
                continue
            total += 1
            if tp != 0:
                p_total += 1
                pr += tp / (1.0 * (tp + fp))
                rc += tp / (1.0 * (tp + fn))

        if total > 0 and p_total > 0:
            rc /= total
            pr /= p_total
            if pr + rc > 0:
                f = 2 * pr * rc / (pr + rc)
                # Arguments are passed separately so formatting only happens
                # when the record is actually emitted.
                logging.info(
                    '%s\t%d\t%f\t%f\t%f', ipro_id, len(rows), f, pr, rc)
def ipro_table():
    """Print LaTeX-style table rows for InterPro ids present in all result files.

    Loads the InterPro metadata and the 'ipro_cc.res', 'ipro_mf.res' and
    'ipro_bp.res' result tables, keeps the ids found in all three whose
    first field is at least 50 in every table, and prints one row per id:
    id, name, then fields 1-3 of the bp, mf and cc records.  Rows are
    ordered by the first bp value, highest first.

    NOTE(review): field 0 looks like a protein count and fields 1-3 like the
    F-measure/precision/recall logged by performanc_by_interpro -- confirm
    against get_ipro_data.
    """
    min_support = 50
    row_format = (
        '%s & %s & %.2f & %.2f & %.2f & %.2f & %.2f & %.2f & %.2f & %.2f & %.2f \\\\')

    ipro = get_ipro()
    cc = get_ipro_data('ipro_cc.res')
    mf = get_ipro_data('ipro_mf.res')
    bp = get_ipro_data('ipro_bp.res')

    shared_ids = set(cc) & set(mf) & set(bp)

    rows = []
    for ipro_id in shared_ids:
        # Column order in the table is bp, mf, cc.
        records = (bp[ipro_id], mf[ipro_id], cc[ipro_id])
        if not all(record[0] >= min_support for record in records):
            continue
        scores = tuple(
            record[field] for record in records for field in (1, 2, 3))
        rows.append((ipro_id, ipro[ipro_id]['name']) + scores)

    # Index 2 is the first bp score; the sort is stable, so ties keep
    # their collection order.
    rows.sort(key=lambda row: row[2], reverse=True)

    for row in rows:
        print(row_format % row)