def ppis_gold_standard(ppis, cxs_splits, species): pdppis = pd.PairDict([p[:3] for p in ppis]) print len(pdppis.d), "predicted interactions" ppi_cxs,_,all_cxs = ppi.load_training_complexes(species, None,'') #conv doesn't matter pdcorum = pd.PairDict([(i[0],i[1],'gold') for i in co.pairs_from_complexes(ut.i1(all_cxs))]) print len(pdcorum.d), "total gold standard" pdcomb = pd.pd_union_disjoint_vals(pdppis, pdcorum) unmr_splits = cp.unmerged_splits_from_merged_splits(ppi_cxs,cxs_splits) print "unmerged split assignment lengths", [len(s) for s in unmr_splits] pdtrainpos = pd.PairDict([(t[0],t[1]) for t in co.pairs_from_complexes(unmr_splits[0])]) print len(pdtrainpos.d), "total train interactions" counterrs = 0 for tpair in pdtrainpos.d: cpair = pdcomb.find(tpair) #assert cpair is not None, "Gold standard problem--filter_methods changed since run?" if cpair is None or pdcomb.d[cpair][1] != 'gold': #print 'error: train should be subset', tpair counterrs += 1 else: pdcomb.d[cpair][1] = 'train' if counterrs: print "number of training not found in gold std:", counterrs comblist = [list(k)+list(v) for k,v in pdcomb.d.items()] print (len([1 for p in comblist if p[2] and p[3]=='gold']), "ppis in gold not train") print len([1 for p in comblist if p[2] and p[3]=='train']), "ppis in train" # only return those that are predictions return [p for p in comblist if p[2]]
def ppis_scatter(ppis1, ppis2, useinds=None):
    """Align two ppi lists pairwise for scatter plotting.

    ppis1, ppis2: iterables of ppi tuples.
    useinds: indices of each ppi tuple to keep; the first two must be the
        pair ids, the rest are values.  Defaults to the first three fields
        (id1, id2, score).  Set to [0,1,3,2] to take ppi.learning_examples
        output into (score, t/f) tuples; [0,1,3] to exclude the class.

    Returns (v1s, v2s): the value columns from each input, aligned on the
    union of pairs, with 0 filled in for pairs missing from one side.
    """
    # None-sentinel instead of a mutable default argument (range(3) is a
    # list in python 2 and would be shared across calls).
    if useinds is None:
        useinds = range(3)
    pd1, pd2 = [pd.PairDict([[p[i] for i in useinds] for p in ppis])
                for ppis in (ppis1, ppis2)]
    nvals = len(useinds) - 2
    # Union of pairs; pairs absent from one side get 0 for each value slot.
    pdcomb = pd.pd_union_disjoint_vals(pd1, pd2, adefaults=[0] * nvals,
                                       bdefaults=[0] * nvals)
    # Transpose to columns: first nvals columns came from pd1, rest from pd2.
    vals = zip(*ut.i1(pdcomb.d.items()))
    v1s, v2s = zip(*vals[:nvals]), zip(*vals[nvals:])
    v1s, v2s = [ut.i0(x) for x in (v1s, v2s)]
    return v1s, v2s