# External imports used by the functions below; project-local helpers
# (ut, pd, co, cp, ppi, fe, cl, pu, orth, filter_valpairs, best_match, ...)
# are assumed to be imported elsewhere in this codebase.
from collections import defaultdict
import itertools as it
import operator
import random

import numpy as np
from pylab import scatter, hist, title  # matplotlib's pylab interface (assumed)


def scatter_blake(a, b, which='circles', classes=[0,1], colors=['k','r'],
        maxval=None, **kwargs):
    if maxval:
        a,b = filter_valpairs(a, b, maxval)
    defaults = {'s': 50, 'alpha': .2, 'lw': 0}
    kwargs = ut.dict_set_defaults(kwargs, defaults)
    c = 'k' # default edge color for the 'circles' style
    if type(a[0]) == list or type(a[0]) == tuple:
        # Second value is presumed to be class--should be 0 or 1, which will be
        # mapped to the given colors.
        # Also need to clean a and b to just be values rather than values and
        # classes.
        print 'using classes'
        assert ut.i1(a) == ut.i1(b), "Classes not the same between a and b"
        kwargs['c'] = [colors[0] if x==classes[0] else colors[1]
                for x in ut.i1(a)]
        a,b = ut.i0(a), ut.i0(b)
    if which=='pointcloud':
        # Faint, dense underlayer plus the regular points on top.
        scatter(a, b, s=50, alpha=0.08, lw=0)
        scatter(a, b, **kwargs)
    elif which=='points':
        scatter(a, b, **kwargs)
    elif which=='fadepoints':
        scatter(a, b, **kwargs)
    elif which=='circles':
        del kwargs['lw']
        # Per-class colors (if provided) become the edge colors so faces stay empty.
        c = kwargs.pop('c', c)
        scatter(a, b, facecolors='none', edgecolors=c, **kwargs)
    title('R-squared: %0.3f' % ut.r_squared(a,b))

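# Example calls (hypothetical xs/ys/cls lists): values may be passed either as
# plain numbers or as (value, class) pairs, in which case points are colored by class.
# >>> scatter_blake(xs, ys)                                 # open circles, R-squared title
# >>> scatter_blake(zip(xs, cls), zip(ys, cls), which='points')
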
def result_gold(splits, species, split_inds, make_unmerged=False,
        consv_sp='Dm'):
    if make_unmerged:
        print "Converting to unmerged using conserved:", (consv_sp if consv_sp
                else "None")
        ppi_corum,_,_ = ppi.load_training_complexes(species, '', consv_sp)
        splits = unmerged_splits_from_merged_splits(ut.i1(ppi_corum),
                [[ut.i1(s) for s in split] for split in splits])
    gold = ut.i1(reduce(operator.add, [splits[i] for i in split_inds]))
    return gold

def unmerged_splits_from_merged_splits(unmerged, merged_splits):
    """
    Unmerged: list of enumerated/named complex sets, usually from
    ppi.load_training_complexes.
    Merged_splits: A list of sets for each split (often 3 splits).
    """
    merged_splits = [ut.i1(ms) for ms in merged_splits]
    unmerged = ut.i1(unmerged)
    usplits = [[] for ms in merged_splits] + [[]] # extra for non matches
    for u in unmerged:
        # best_match/sensitivity: matching helpers assumed defined elsewhere in
        # this module; each unmerged complex goes to the split it overlaps best.
        matches = [best_match(u, ms, sensitivity) for ms in merged_splits]
        which_split = np.argmax(matches) if max(matches) > 0 else -1
        usplits[which_split].append(u)
    return usplits

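# Sketch of the expected shapes (hypothetical complexes; assumes sensitivity scores
# the overlap between an unmerged complex and a merged one):
# >>> unmerged = [('cxA', set(['p1','p2'])), ('cxB', set(['p3','p4']))]
# >>> merged_splits = [[('m1', set(['p1','p2','p5']))], [('m2', set(['p3','p4']))]]
# >>> unmerged_splits_from_merged_splits(unmerged, merged_splits)
# [[set(['p1','p2'])], [set(['p3','p4'])], []]    # trailing list holds non-matches
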
def triple_venn(three_ppis, names=['a','b','c']):
    # Can send output to venn3(sizes, names) for plotting
    ppis_names = zip(three_ppis, names)
    full_sizes = [len(p) for p in three_ppis]
    print zip(names, full_sizes)
    trip = ints_overlap(three_ppis)
    print names, trip
    intersects_2 = []
    for (a,namea),(b,nameb) in it.combinations(ppis_names, 2):
        intersect = ints_overlap([a,b])
        print namea, nameb, "--", intersect
        intersect_2 = intersect - trip
        print namea, nameb, "-only-", intersect_2
        intersects_2.append((set([namea,nameb]), intersect_2))
    only_singles = []
    for i,(a,namea) in enumerate(ppis_names):
        #print namea, len(a), trip, intersects_2 # debug
        only_single = (len(a) - trip
                - sum([x[1] for x in intersects_2 if namea in x[0]]))
        print namea, "only:", only_single
        only_singles.append(only_single)
    # For output into matplotlib_venn format:
    # (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    set_sizes = [0] * 7
    set_sizes[6] = trip
    set_sizes[:2], set_sizes[3] = only_singles[:2], only_singles[2]
    set_sizes[2], set_sizes[4], set_sizes[5] = ut.i1(intersects_2)
    return set_sizes, names

def pds_alloverlaps(named_pds):
    """
    Input: [(name, pairdict), ...]
    """
    for num in range(2, len(named_pds)+1):
        for n_pd in it.combinations(named_pds, num):
            print ut.i0(n_pd), pds_overlap(ut.i1(n_pd))

def triple_venn_consv():
    hints = co.load_havug_ints()
    ppi_cxs, clean_cxs, corconsv = ppi.load_training_complexes("Hs", "Dm")
    cints = co.pairs_from_complexes(ut.i1(ppi_cxs)) # exclude huge ones
    ints23 = ut.loadpy(ut.bigd("../23_collapsenodes/Hs_filtorth025_withsc_2sp_refilt2sp_cxs_cxppis_clust27_532cxs"))[1]
    # h2d: presumably an Hs->Dm ortholog mapping, loaded elsewhere in the calling scope.
    ints3 = [cp.consv_pairs(i, h2d) for i in ints23, hints, cints]
    cp.triple_venn(ints3, ["map23", "havug", "corum"])

def ppis_gold_standard(ppis, cxs_splits, species):
    pdppis = pd.PairDict([p[:3] for p in ppis])
    print len(pdppis.d), "predicted interactions"
    ppi_cxs,_,all_cxs = ppi.load_training_complexes(species, None, '') # conv doesn't matter
    pdcorum = pd.PairDict([(i[0], i[1], 'gold') for i in
            co.pairs_from_complexes(ut.i1(all_cxs))])
    print len(pdcorum.d), "total gold standard"
    pdcomb = pd.pd_union_disjoint_vals(pdppis, pdcorum)
    unmr_splits = cp.unmerged_splits_from_merged_splits(ppi_cxs, cxs_splits)
    print "unmerged split assignment lengths", [len(s) for s in unmr_splits]
    pdtrainpos = pd.PairDict([(t[0], t[1]) for t in
            co.pairs_from_complexes(unmr_splits[0])])
    print len(pdtrainpos.d), "total train interactions"
    counterrs = 0
    for tpair in pdtrainpos.d:
        cpair = pdcomb.find(tpair)
        #assert cpair is not None, "Gold standard problem--filter_methods changed since run?"
        if cpair is None or pdcomb.d[cpair][1] != 'gold':
            #print 'error: train should be subset', tpair
            counterrs += 1
        else:
            pdcomb.d[cpair][1] = 'train'
    if counterrs:
        print "number of training not found in gold std:", counterrs
    comblist = [list(k)+list(v) for k,v in pdcomb.d.items()]
    print len([1 for p in comblist if p[2] and p[3]=='gold']), "ppis in gold not train"
    print len([1 for p in comblist if p[2] and p[3]=='train']), "ppis in train"
    # Only return those that are predictions.
    return [p for p in comblist if p[2]]

def prot_counts_pep2prots(peplist, only_uniques, pep2prots):
    """
    - pep2prots: use supplied dict of {peptide: set(proteinids)} instead of the
      protein ids on the lines in the pep_list file, in which case sum up the
      counts for each peptide-sample combination.
    """
    assert only_uniques, "Only handles only_uniques=True so far."
    exclude_peps = (set([pep for pep,prots in pep2prots.items() if len(prots)>1])
            if only_uniques else set([]))
    print "%s non-unique peptides to exclude." % len(exclude_peps)
    pep_samp_counts = defaultdict(float)
    for _,sample,pep,count in peplist:
        if pep not in exclude_peps:
            pep_samp_counts[(pep,sample)] += float(count)
    # Currently ignoring peptides without a mapping.
    prots = sorted(list(reduce(set.union, [pep2prots[pep]
            for (pep,_),_ in pep_samp_counts.items() if pep in pep2prots])))
    samples = sorted(list(set(ut.i1(peplist))))
    print "%s unique proteins. %s samples." % (len(prots), len(samples))
    dprots, dsamples = [ut.list_inv_to_dict(lst) for lst in prots, samples]
    counts = np.zeros((len(prots), len(samples)), dtype='float32')
    for (pep,sample),count in pep_samp_counts.items():
        if pep in pep2prots: # Currently ignoring peptides without a mapping.
            assert len(pep2prots[pep])==1, "Non-unique peptide found"
            counts[dprots[list(pep2prots[pep])[0]], dsamples[sample]] += count
    totals = counts.sum(axis=1)
    nonzero = totals > 0
    prots, totals = [list(np.array(lst)[nonzero]) for lst in prots, totals]
    counts = counts[nonzero,:]
    return prots, samples, counts, totals

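# Sketch of the expected inputs (hypothetical rows; peplist lines are unpacked as
# (_, sample, peptide, count), and only uniquely-mapping peptides are kept):
# >>> peplist = [('f1', 'samp1', 'PEPTIDEK', '2'), ('f1', 'samp2', 'PEPTIDEK', '3')]
# >>> pep2prots = {'PEPTIDEK': set(['ProtX'])}
# >>> prots, samples, counts, totals = prot_counts_pep2prots(peplist, True, pep2prots)
# prots -> ['ProtX']; samples -> ['samp1', 'samp2']; totals -> [5.0]
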
def hist_prot_counts(fs, in_prots_sets=[None], **kwargs):
    """
    Usually set linewidth=3, histtype='step', range=(0,18), bins=36
    """
    pcounts = prot_counts(fs).items()
    for prots in in_prots_sets:
        if prots:
            pcounts = [(p,c) for p,c in pcounts if p in prots]
        hist(np.log2(ut.i1(pcounts)), **kwargs)

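# Typical call, using the styling suggested in the docstring (fs is whatever
# prot_counts expects in this codebase, presumably a list of peptide-count files):
# >>> hist_prot_counts(fs, linewidth=3, histtype='step', range=(0,18), bins=36)
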
def attr_to_sets(atts):
    """
    Given a list of items with attributes, return a list of item sets.
    Input: [(item1, attr1), (item2, attr2), (item3, attr1), ...]
    Output: [{item1,item3}, {item2}, ...]
    """
    datts = defaultdict(set)
    for i,a in atts:
        datts[a].add(i)
    return ut.i1(datts.items())

def arrfeats_prep_all_data(arrfeats, ppis, sp="Hs", gold_consv="Dm",
        cutoff=0.5):
    print "Adding species summary."
    arrfeats = fe.arr_add_spsummary(arrfeats, cutoff)
    print "Adding ppis."
    arrfeats = fe.arrfeats_add_ppis(arrfeats, ppis)
    _, _, all_cxs = ppi.load_training_complexes(sp, None, gold_consv)
    pdgold = pd.PairDict(co.pairs_from_complexes(ut.i1(all_cxs)))
    print "Setting trues."
    arrfeats = fe.arrfeats_set_gold(arrfeats, pdgold)
    return arrfeats

def nonpairs_gen(pairs, n):
    items = list(set(ut.i0(pairs) + ut.i1(pairs)))
    exclude = set(pairs)
    pdexclude = pd.PairDict(exclude)
    count = 0
    while count < n:
        pair = (random.choice(items), random.choice(items))
        if not pdexclude.contains(pair):
            yield pair
            count += 1

def hpa_stats(ppis, locs, max_set_size=None):
    s = attr_to_sets(locs)
    if max_set_size is not None:
        s = [c for c in s if len(c) < max_set_size]
    plocs = co.pairs_from_complexes(s)
    ppiprots = set(ut.i0(ppis) + ut.i1(ppis))
    anprots = set(ut.i0(locs))
    intprots = set.intersection(ppiprots, anprots)
    print len(ppiprots), len(anprots), len(intprots)
    return ppis_stats(ppis, plocs, intprots)

def ppis_scatter(ppis1, ppis2, useinds=range(3)):
    """
    useinds: set to [0,1,3,2] to take ppi.learning_examples output into
    (score, t/f) tuples; [0,1,3] to exclude the class.
    """
    pd1,pd2 = [pd.PairDict([[p[i] for i in useinds] for p in ppis])
            for ppis in ppis1,ppis2]
    nvals = len(useinds)-2
    pdcomb = pd.pd_union_disjoint_vals(pd1, pd2, adefaults=[0]*nvals,
            bdefaults=[0]*nvals)
    vals = zip(*ut.i1(pdcomb.d.items()))
    v1s,v2s = zip(*vals[:nvals]), zip(*vals[nvals:])
    v1s,v2s = [ut.i0(x) for x in v1s,v2s]
    return v1s,v2s

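# Example usage (assuming ppis rows shaped like (id1, id2, score, ...) as elsewhere
# in this codebase); the outputs pair up scores for shared interactions and can be
# fed straight to scatter_blake:
# >>> v1s, v2s = ppis_scatter(ppis_a, ppis_b, useinds=[0,1,2])
# >>> scatter_blake(v1s, v2s)
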
def stats(gold, cxs_list, cxppis_list=None, conv_to_sets=True, funcs=None):
    if conv_to_sets:
        gold = [set(c) for c in gold]
        cxs_list = [[set(c) for c in cxs] for cxs in cxs_list]
    # Allow omitting cxppis_list when the selected funcs don't need it.
    cxppis_list = cxppis_list or [None]*len(cxs_list)
    funcs = funcs or ut.i0(cp_funcs)
    use_funcs = [f for f in cp_funcs if f[0] in funcs]
    print funcs
    arr = np.zeros(len(cxs_list), dtype=','.join(['f8']*len(funcs)))
    arr.dtype.names = funcs
    print '%s total maps.' % len(cxs_list)
    for i, (cxs, cxppis) in enumerate(zip(cxs_list, cxppis_list)):
        #sys.stdout.write(str(i))
        print i
        arr[i] = tuple([f(gold, cxs, cxppis) for f in ut.i1(use_funcs)])
    return arr

def merge_atonce(psets, cutoff, func, sep):
    """
    One difference seems to be that a series of overlapping doubles can't
    string together with this approach, whereas it can with the iter approach.
    """
    to_merge = list(psets)
    merged = []
    while len(to_merge) > 1:
        c1,c1ps = random.choice(to_merge)
        to_merge.remove((c1,c1ps))
        matches = [(c,ps) for c,ps in to_merge if func(ps,c1ps) > cutoff]
        for m in matches:
            to_merge.remove(m)
        newname = sep.join([c1] + ut.i0(matches))
        newps = reduce(set.union, [c1ps] + ut.i1(matches))
        merged.append((newname,newps))
    if to_merge: # everything may already have been consumed by the last merge
        merged.append(to_merge[0])
    return merged

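# Illustration of the docstring's caveat (hypothetical sets and overlap func): with
# psets = [('c1', set(['A','B'])), ('c2', set(['B','C'])), ('c3', set(['C','D']))] and a
# cutoff that only adjacent pairs exceed, picking 'c1' first merges it with 'c2' but
# leaves 'c3' alone, whereas an iterative approach could then chain the merged set with 'c3'.
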
def load_kegg_sequentials(fname, do_convert=True):
    dkegg = load_kegg_brite(fname)
    kegg_paths = [ut.i1(v) for v in dkegg.values() if v]
    def path_pairs(list_path):
        return [(list_path[i], list_path[i+1]) for i in range(len(list_path)-1)]
    group_pairs = ut.flatten([path_pairs(lpath) for lpath in kegg_paths])
    #if return_groups:
        #if conv_dict:
            #return convert_groups_singles(labeled_pairs, conv_dict)
        #else:
            #return labeled_pairs
    single_pairs = [(xi,yi) for x,y in group_pairs for xi in x for yi in y]
    unique_pairs = pu.dedupe(single_pairs)
    print "%s total, %s single, %s unique pairs returned" % (
            len(group_pairs), len(single_pairs), len(unique_pairs))
    if do_convert:
        conv_dict = ut.dict_inverse_sets(orth.convert_dict('Hs','Hs_entrez'))
        conv_pairs = convert_pairs_singles(unique_pairs, conv_dict)
        print "%s converted pairs with 1-1 matches" % len(conv_pairs)
        return conv_pairs
    else:
        return unique_pairs

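# The group->single expansion above turns each adjacent pair of pathway steps (each
# step a group of genes) into all cross-group gene pairs, e.g. (hypothetical ids):
# group_pairs = [(['g1','g2'], ['g3'])]  ->  single_pairs = [('g1','g3'), ('g2','g3')]
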
def load_havug_cxppis():
    hints = load_havug_ppis()
    hcxs = load_havug_cxs()
    hints = cl._filter_ints(hints, ut.i1(hcxs))
    return hints

def random_pairs(pairs, npairs):
    ps = list(set(ut.i0(pairs) + ut.i1(pairs)))
    rpairs = [(random.choice(ps), random.choice(ps))
            for i in range(int(npairs*1.5))]
    return pu.dedupe(rpairs)[:npairs]

# Equivalent helpers: each returns the set of unique items appearing in a pair list.
def set_from_pairs(ppis):
    return set(ut.i0(ppis) + ut.i1(ppis))

def unique_items(pairs):
    return set(ut.i0(pairs) + ut.i1(pairs))

def items_from_pairs(pairs):
    return set(ut.i0(pairs) + ut.i1(pairs))