def ppis_gold_standard(ppis, cxs_splits, species): pdppis = pd.PairDict([p[:3] for p in ppis]) print len(pdppis.d), "predicted interactions" ppi_cxs,_,all_cxs = ppi.load_training_complexes(species, None,'') #conv doesn't matter pdcorum = pd.PairDict([(i[0],i[1],'gold') for i in co.pairs_from_complexes(ut.i1(all_cxs))]) print len(pdcorum.d), "total gold standard" pdcomb = pd.pd_union_disjoint_vals(pdppis, pdcorum) unmr_splits = cp.unmerged_splits_from_merged_splits(ppi_cxs,cxs_splits) print "unmerged split assignment lengths", [len(s) for s in unmr_splits] pdtrainpos = pd.PairDict([(t[0],t[1]) for t in co.pairs_from_complexes(unmr_splits[0])]) print len(pdtrainpos.d), "total train interactions" counterrs = 0 for tpair in pdtrainpos.d: cpair = pdcomb.find(tpair) #assert cpair is not None, "Gold standard problem--filter_methods changed since run?" if cpair is None or pdcomb.d[cpair][1] != 'gold': #print 'error: train should be subset', tpair counterrs += 1 else: pdcomb.d[cpair][1] = 'train' if counterrs: print "number of training not found in gold std:", counterrs comblist = [list(k)+list(v) for k,v in pdcomb.d.items()] print (len([1 for p in comblist if p[2] and p[3]=='gold']), "ppis in gold not train") print len([1 for p in comblist if p[2] and p[3]=='train']), "ppis in train" # only return those that are predictions return [p for p in comblist if p[2]]
def triple_venn_consv():
    """Draw a three-way Venn diagram of conserved interaction pairs from
    the map23 result set, the havug interactions, and the corum training
    complexes (Hs, with Dm as the conservation species)."""
    hints = co.load_havug_ints()
    ppi_cxs, clean_cxs, corconsv = ppi.load_training_complexes("Hs", "Dm")
    cints = co.pairs_from_complexes(ut.i1(ppi_cxs)) # exclude huge ones
    # Second element of the loaded file is the cxppis list.
    ints23 = ut.loadpy(ut.bigd("../23_collapsenodes/Hs_filtorth025_withsc_2sp_refilt2sp_cxs_cxppis_clust27_532cxs"))[1]
    # NOTE(review): h2d is a free variable, not defined in this function --
    # presumably a module-level Hs->Dm ortholog mapping; confirm it is in
    # scope at call time or this raises NameError.
    # NOTE(review): `for i in ints23, hints, cints` is python2-only
    # tuple-in-comprehension syntax.
    ints3 = [cp.consv_pairs(i, h2d) for i in ints23, hints, cints]
    cp.triple_venn(ints3, ["map23", "havug", "corum"])
def tested_ppis(gold_cxs, ppis):
    """Attach a 1/0 test label to each predicted interaction.

    Args:
        gold_cxs: gold-standard complexes (lists/sets of member ids).
        ppis: prediction tuples whose first three items are (id1, id2, score).

    Returns:
        (labeled_ppis, ntest_pos): labeled_ppis is a list of
        (id1, id2, score, is_true) tuples; ntest_pos is the number of
        gold-standard pairs.
    """
    true_pairs = co.pairs_from_complexes(gold_cxs)
    pd_true = pd.PairDict(true_pairs)
    labeled = []
    for p in ppis:
        is_true = 1 if pd_true.contains(tuple(p[:2])) else 0
        labeled.append((p[0], p[1], p[2], is_true))
    return labeled, len(true_pairs)
def arrfeats_prep_all_data(arrfeats, ppis, sp="Hs", gold_consv="Dm", cutoff=0.5): print "Adding species summary." arrfeats = fe.arr_add_spsummary(arrfeats, cutoff) print "Adding ppis." arrfeats = fe.arrfeats_add_ppis(arrfeats, ppis) _, _, all_cxs = ppi.load_training_complexes(sp, None, gold_consv) pdgold = pd.PairDict(co.pairs_from_complexes(ut.i1(all_cxs))) print "Setting trues." arrfeats = fe.arrfeats_set_gold(arrfeats, pdgold) return arrfeats
def hpa_stats(ppis, locs, max_set_size=None): s = attr_to_sets(locs) if max_set_size is not None: s = [c for c in s if len(c) < max_set_size] plocs = co.pairs_from_complexes(s) ppiprots = set(ut.i0(ppis)+ut.i1(ppis)) anprots = set(ut.i0(locs)) intprots = set.intersection(ppiprots, anprots) print len(ppiprots), len(anprots), len(intprots) return ppis_stats(ppis, plocs, intprots)
def _filter_ints(inlist, cxs):
    """Return the tuples from inlist whose first two items form a pair
    that occurs within any of the complexes in cxs."""
    allowed = pd.PairDict(co.pairs_from_complexes(cxs))
    kept = []
    for tup in inlist:
        if allowed.contains((tup[0], tup[1])):
            kept.append(tup)
    return kept
def clique_score(cx, pdints):
    """Return the edge density of complex cx with respect to pdints: the
    fraction of the complex's possible pairs that pdints contains.

    Args:
        cx: a single complex (collection of member ids).
        pdints: a PairDict of supporting interactions.

    Returns:
        Float in [0, 1]; 0.0 for a complex with no possible pairs.

    Fix: the original divided two ints, which under python2 integer
    division floors the score to 0 or 1 (unless `from __future__ import
    division` is in effect at module top, which is not visible here).
    Forcing float division is a no-op if true division was already
    active, and a correctness fix otherwise. Also guards the singleton
    complex, which previously raised ZeroDivisionError.
    """
    cx_ints = co.pairs_from_complexes([cx])
    if not cx_ints:
        # A complex with < 2 members has no possible edges.
        return 0.0
    supported = len([1 for edge in cx_ints if pdints.contains(edge)])
    return supported / float(len(cx_ints))
def gold_label_ppis(ppis, merged_splits, sp, gold_nsp):
    """Label predicted ppis against the training split of the gold
    standard complexes.

    Args:
        ppis: prediction tuples to label.
        merged_splits: merged complex splits; the first unmerged split is
            taken as the training set.
        sp: species code for loading training complexes.
        gold_nsp: number of species; > 1 selects the 'Dm'-conserved gold
            standard, otherwise the unconserved one.

    Returns:
        The labeled ppis from cv.gold_label_ppis.
    """
    consv = 'Dm' if gold_nsp > 1 else ''
    ppi_cxs, _, _ = ppi.load_training_complexes(sp, '', consv)
    # First unmerged split holds the training complexes.
    train_cxs = unmerged_splits_from_merged_splits(ppi_cxs, merged_splits)[0]
    train_pairs = co.pairs_from_complexes(train_cxs)
    return cv.gold_label_ppis(ppis, train_pairs)