def scatter_blake(a, b, which='circles', classes=[0,1], colors=['k','r'],
        maxval=None, **kwargs):
    if maxval:
        a,b = filter_valpairs(a,b,maxval)
    defaults = {'s': 50, 'alpha': .2, 'lw': 0}
    kwargs = ut.dict_set_defaults(kwargs, defaults)
    c = 'k' # edge color for 'circles'; per-class colors override this below
    if type(a[0]) == list or type(a[0]) == tuple:
        # The second value is presumed to be the class--should be 0 or 1,
        # which is mapped to the corresponding entry in colors. Also clean a
        # and b to just be values rather than (value, class) pairs.
        print 'using classes'
        assert ut.i1(a) == ut.i1(b), "Classes not the same between a and b"
        kwargs['c'] = [colors[0] if x==classes[0] else colors[1]
                for x in ut.i1(a)]
        a,b = ut.i0(a), ut.i0(b)
    if which=='pointcloud':
        scatter(a, b, s=50, alpha=0.08, lw=0)
        scatter(a, b, **kwargs)
    elif which=='points' or which=='fadepoints':
        scatter(a, b, **kwargs)
    elif which=='circles':
        del kwargs['lw']
        scatter(a, b, facecolors='none', edgecolors=c, **kwargs)
    title('R-squared: %0.3f' % ut.r_squared(a,b))
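# A minimal usage sketch (hypothetical data, not part of the original
# module): (value, class) tuples trigger the per-class coloring branch in
# scatter_blake above.
def _example_scatter_blake():
    a = [(0.1, 0), (0.5, 1), (0.9, 0)]
    b = [(0.2, 0), (0.4, 1), (0.8, 0)]
    scatter_blake(a, b, which='points')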
def pds_alloverlaps(named_pds):
    """
    input: [(name, pairdict), ...]
    Print the overlap for every combination of two or more of the named
    pairdicts.
    """
    for num in range(2, len(named_pds)+1):
        for n_pd in it.combinations(named_pds, num):
            print ut.i0(n_pd), pds_overlap(ut.i1(n_pd))
def hpa_stats(ppis, locs, max_set_size=None):
    s = attr_to_sets(locs)
    if max_set_size is not None:
        s = [c for c in s if len(c) < max_set_size]
    plocs = co.pairs_from_complexes(s)
    ppiprots = set(ut.i0(ppis)+ut.i1(ppis))
    anprots = set(ut.i0(locs))
    intprots = set.intersection(ppiprots, anprots)
    print "ppi proteins, annotated proteins, overlap:", \
            len(ppiprots), len(anprots), len(intprots)
    return ppis_stats(ppis, plocs, intprots)
def nonpairs_gen(pairs, n):
    """
    Yield n random pairs of items drawn from those appearing in pairs,
    excluding any pair already present in the input (membership judged by
    pd.PairDict).
    """
    items = list(set(ut.i0(pairs) + ut.i1(pairs)))
    exclude = set(pairs)
    pdexclude = pd.PairDict(exclude)
    count = 0
    while count < n:
        pair = (random.choice(items), random.choice(items))
        if not pdexclude.contains(pair):
            yield pair
            count += 1
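# A minimal usage sketch (hypothetical pairs, not part of the original
# module): draw negative examples that avoid the observed pair list.
def _example_nonpairs():
    observed = [('a','b'), ('b','c'), ('c','d')]
    return list(nonpairs_gen(observed, 3))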
def load_pepcount(f):
    # Assumes two header rows: the first is printed and discarded, and the
    # second holds the sample names from the third column onward.
    lol = ut.load_lol(f)
    print "Omitting header:", lol[0]
    lol = lol[1:]
    peps = ut.i0(lol[1:])
    samples = lol[0][2:]
    arr = np.zeros((len(peps), len(samples)))
    for i,row in enumerate(lol[1:]):
        arr[i,:] = row[2:]
    return peps, samples, arr
def ppis_scatter(ppis1, ppis2, useinds=range(3)):
    """
    useinds: set to [0,1,3,2] to take ppi.learning_examples output into
    (score, t/f) tuples; [0,1,3] to exclude the class.
    """
    pd1,pd2 = [pd.PairDict([[p[i] for i in useinds] for p in ppis])
            for ppis in ppis1,ppis2]
    nvals = len(useinds)-2
    pdcomb = pd.pd_union_disjoint_vals(pd1, pd2, adefaults=[0]*nvals,
            bdefaults=[0]*nvals)
    vals = zip(*ut.i1(pdcomb.d.items()))
    v1s,v2s = zip(*vals[:nvals]), zip(*vals[nvals:])
    v1s,v2s = [ut.i0(x) for x in v1s,v2s]
    return v1s,v2s
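# A minimal sketch (hypothetical scored pairs, not from the original data):
# aligns two scored-pair sets on the union of their pairs, filling missing
# scores with 0, so the returned vectors can be scattered against each other.
def _example_ppis_scatter():
    ppis1 = [('a','b',0.9), ('b','c',0.5)]
    ppis2 = [('a','b',0.8), ('c','d',0.4)]
    v1s, v2s = ppis_scatter(ppis1, ppis2)
    scatter_blake(v1s, v2s, which='points')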
def collapse_alternatives(ppis):
    """
    Go through the whole set of interactions and group together nodes whose
    sets of interaction partners are identical, as candidates for collapsing
    into a single node. Returns the groups as lists of
    (node, {(partner, score), ...}); scores are carried along but not yet
    compared--the intended within-20%-or-0.01 score check is not implemented.
    """
    d_ints = defaultdict(set)
    for node1,node2,score in ppis:
        d_ints[node1].add((node2,score))
        d_ints[node2].add((node1,score))
    alt_groups = []
    for node1,ints1 in d_ints.items():
        # First check against the existing groups.
        found_group = False
        for group in alt_groups:
            anode1,an1ints = group[0]
            if (len(ints1)==len(an1ints) and
                    set(ut.i0(ints1))==set(ut.i0(an1ints))):
                group.append((node1,ints1))
                found_group = True
                break
        if not found_group:
            alt_groups.append([(node1,ints1)])
    return alt_groups
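# A minimal sketch (hypothetical interactions, not from the original data):
# 'b' and 'c' share the identical partner set {'a'}, so they land in one
# group; 'a' forms its own group.
def _example_collapse_alternatives():
    ppis = [('a','b',0.9), ('a','c',0.91)]
    return collapse_alternatives(ppis)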
def stats(gold, cxs_list, cxppis_list=None, conv_to_sets=True, funcs=None):
    if conv_to_sets:
        gold = [set(c) for c in gold]
        cxs_list = [[set(c) for c in cxs] for cxs in cxs_list]
    funcs = funcs or ut.i0(cp_funcs)
    use_funcs = [f for f in cp_funcs if f[0] in funcs]
    print funcs
    arr = np.zeros(len(cxs_list), dtype=','.join(['f8']*len(funcs)))
    arr.dtype.names = funcs
    print '%s total maps.' % len(cxs_list)
    # Guard the default case of no accompanying ppi lists, so zip still works.
    cxppis_list = cxppis_list or [None]*len(cxs_list)
    for i, (cxs, cxppis) in enumerate(zip(cxs_list, cxppis_list)):
        print i
        # Structured-array rows are assigned from tuples.
        arr[i] = tuple(f(gold, cxs, cxppis) for f in ut.i1(use_funcs))
    return arr
def load_paralogs(sp_base, sp_other, ogroup_max, other_ogroup_max):
    """
    Get all pairwise base species paralogs from sharing orthogroups in the
    inParanoid orthology files between the base and other species.
    """
    ogs = orth.load_ogroups(sp_base, sp_other)
    use_ogs = []
    if ogroup_max:
        for og_base, og_other in ogs:
            if (len(og_base) <= ogroup_max and
                    (other_ogroup_max is None or
                     len(og_other) <= other_ogroup_max)):
                use_ogs.append(og_base)
    else:
        # 'base_ogs' was undefined here; take the base-species side of each
        # orthogroup instead.
        use_ogs = ut.i0(ogs)
    base_paralogs = pu.groups_to_pairs(use_ogs)
    return base_paralogs
def merge_atonce(psets, cutoff, func, sep):
    """
    One difference seems to be that a series of overlapping doubles can't
    string together with this approach, whereas it can with the iter
    approach.
    """
    to_merge = list(psets)
    merged = []
    while len(to_merge) > 1:
        c1,c1ps = random.choice(to_merge)
        to_merge.remove((c1,c1ps))
        matches = [(c,ps) for c,ps in to_merge if func(ps,c1ps) > cutoff]
        for m in matches:
            to_merge.remove(m)
        newname = sep.join([c1]+ut.i0(matches))
        newps = reduce(set.union, [c1ps]+ut.i1(matches))
        merged.append((newname,newps))
    # An iteration can consume every remaining pset, leaving nothing here.
    if to_merge:
        merged.append(to_merge[0])
    return merged
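# A minimal sketch (hypothetical sets; overlap_func is a stand-in scoring
# function, not from the original module): c1 and c2 overlap enough to merge
# into a single 'c1-c2' pset, while c3 survives on its own.
def _example_merge_atonce():
    def overlap_func(s1, s2):
        # Jaccard overlap between two sets.
        return len(s1 & s2) / float(len(s1 | s2))
    psets = [('c1', set([1,2,3])), ('c2', set([2,3,4])), ('c3', set([7,8]))]
    return merge_atonce(psets, 0.4, overlap_func, '-')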
def merge(proj_dir, dirnames, pq_new_path):
    """
    Combine pepquant quantitation from project_1 (etc) PQ_FILE into
    project+PQ_NEW.
    """
    if not os.path.exists(proj_dir):
        os.mkdir(proj_dir)
    proj_name = ut.shortname(proj_dir)
    assert not os.path.exists(pq_new_path), "%s exists. Exiting." % pq_new_path
    dirnames = ut.i0(sort_numbered(dirnames))
    #print "Sorted dirnames:", dirnames
    pq_files = [os.path.join(d, PQ_FILE) for d in dirnames]
    for f in pq_files:
        if not os.path.exists(f):
            print "No Elution File:", f
    eluts = (el.load_elution(f) for f in pq_files if os.path.exists(f))
    merged = reduce(el.combine_elutions, eluts)
    el.write_elution(merged, pq_new_path)
def select_best(clstruct,
        scorenames=['sensitivity','mmr','aupr','cliqueness_3_20',
            'nonov_iter','n_proteins','n_complexes_3_20'],
        rfunc=operator.add, use_norm=False, dispn=15, score_factors=None,
        use_ranks=True, output_ranks=False, print_ranks=False,
        require_scores=None):
    cxstructs, stats = clstruct.cxstructs, clstruct.stats
    clusts = [cxstr.cxs for cxstr in cxstructs]
    scorenames = scorenames or list(stats.dtype.names)
    stats = stats[scorenames]
    ranks = rank_columns(stats)
    if use_ranks:
        stats = ranks
    else:
        if use_norm:
            stats = norm_columns(stats)
        if score_factors:
            stats = rescale_columns(stats, score_factors)
    inds = np.argsort(reduce(rfunc, [stats[n] for n in scorenames]))[::-1]
    if require_scores is not None:
        for req_name,thresh in require_scores:
            thresh = (np.median(clstruct.stats[req_name]) if thresh is None
                    else thresh)
            inds = [i for i in inds if clstruct.stats[req_name][i] > thresh]
    nstats = len(stats)
    def filt_params(s):
        return " ".join([p[:2]+p.split('=')[1] for p in s.split(',')])
    show_columns = (scorenames if require_scores is None else
            scorenames+ut.i0(require_scores))
    d = DataFrame(clstruct.stats[inds[:dispn]][show_columns],
            index=["#%s: %sc %si %s" % (i, len(clusts[i]),
                len(cxstructs[i].cxppis), filt_params(cxstructs[i].params))
                for i in inds[:dispn]])
    print d.head(dispn)
    if print_ranks:
        for i in inds[:dispn]:
            print i, [nstats-s for s in ranks[i]]
    if output_ranks:
        return inds
    else:
        return clusts[inds[0]], cxstructs[inds[0]].cxppis, inds[0]
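# A minimal call sketch (illustrative parameter choices only): keep
# clusterings whose 'mmr' beats the median across clusterings (thresh=None
# falls back to the median, per the logic above), then take the top
# combined-rank result.
def _example_select_best(clstruct):
    return select_best(clstruct, require_scores=[('mmr', None)], dispn=10)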
def set_from_pairs(ppis):
    return set(ut.i0(ppis) + ut.i1(ppis))
    # Plot the feature importances of the trees and of the forest
    if do_plot:
        import pylab as pl
        pl.figure()
        pl.title("Feature importances")
        for tree in forest.estimators_:
            pl.plot(indnums, tree.feature_importances_[indices], "r")
        pl.plot(indnums, importances[indices], "b")
        pl.show()
    feats, weights = zip(*ranked)
    return list(feats), list(weights)

if __name__ == '__main__':
    # Five arguments are required: argv[5] is read below.
    if len(sys.argv) < 6:
        sys.exit("usage: python ml.py train_test feats_f clf_type "
                "donorm kwarg1_val1-kwarg2_val2")
    ttf = sys.argv[1]
    tt = np.load(ttf)
    feats = ut.loadpy(sys.argv[2])
    k = sys.argv[3]
    do_norm = sys.argv[4]
    kvs = sys.argv[5]
    kwargs = (dict([tuple(kv.split('_')) for kv in kvs.split('-')])
            if kvs else {})
    clf = tree(**kwargs) if k=='tree' else svm(kernel=k, **kwargs)
    ts = [('%s features, %s kernel, norm: %s, %s' % (n, k, do_norm, kvs),
            fit_and_test([fe.keep_cols(t, ut.i0(feats[:n])) for t in tt],
                clf, norm=do_norm))
            for n in 20,30,40,50]
    ut.savepy(ts, 'ts_%s_%s_%s_%s' % (k, do_norm, kvs, ttf))
def items_from_pairs(pairs):
    return set(ut.i0(pairs) + ut.i1(pairs))
def random_pairs(pairs, npairs):
    """
    Sample npairs random pairs from the items appearing in pairs. Oversamples
    by 1.5x to allow for deduplication, so it may still return slightly fewer
    than npairs.
    """
    ps = list(set(ut.i0(pairs) + ut.i1(pairs)))
    rpairs = [(random.choice(ps), random.choice(ps))
            for i in range(int(npairs*1.5))]
    return pu.dedupe(rpairs)[:npairs]
def unique_items(pairs):
    return set(ut.i0(pairs) + ut.i1(pairs))