def combine_corrs(e1, e2, allprots, combine_func, default_val=None):
    """
    Combine two symmetric correlation matrices element-wise.

    Entry [row, col] of the result is combine_func(val1, val2), where val1
    and val2 are the correlations of (allprots[row], allprots[col]) taken
    from e1.corr and e2.corr.  When either protein of the pair is missing
    from a data set, default_val stands in for that data set's value.

    - e1, e2: objects with .prots (protein ids) and .corr (matrix indexed
      through ut.list_inv_to_dict(.prots)).
    - allprots: sized sequence of protein ids; fixes the output ordering.
    - combine_func: called as combine_func(val1, val2), e.g. max.
    - default_val: value for missing pairs.  If None, -1 is used when the
      name of combine_func contains "max", otherwise 0.

    Returns an np.matrix of shape (len(allprots), len(allprots)).
    """
    if default_val is None:
        # Only None means "choose automatically"; an explicit 0 is honored.
        default_val = -1 if "max" in combine_func.__name__ else 0
    nprots = len(allprots)
    corr = np.matrix(np.zeros((nprots, nprots)))
    dprots1 = ut.list_inv_to_dict(e1.prots)
    dprots2 = ut.list_inv_to_dict(e2.prots)
    # Resolve each protein's index in both data sets once rather than once
    # per matrix cell; None marks a protein absent from that data set.
    inds = [(dprots1.get(p), dprots2.get(p)) for p in allprots]
    for row, (r1, r2) in enumerate(inds):
        for col, (c1, c2) in enumerate(inds):
            val1 = (default_val if r1 is None or c1 is None
                    else e1.corr[r1, c1])
            # The second value must come from e2, not e1.
            val2 = (default_val if r2 is None or c2 is None
                    else e2.corr[r2, c2])
            corr[row, col] = combine_func(val1, val2)
    return corr
def prot_counts_pep2prots(peplist, only_uniques, pep2prots):
    """
    Sum spectral counts per protein and sample, assigning peptides to
    proteins through pep2prots instead of the protein ids in peplist.

    - peplist: sequence of 4-item records (ignored, sample, peptide, count).
      It is read a second time (passed to ut.i1) after the counting loop,
      so a one-shot iterator is presumably not suitable.
    - only_uniques: must be true; peptides mapped to more than one protein
      are then excluded.
    - pep2prots: dict of {peptide: set(proteinids)}.  Counts for the same
      peptide-sample combination are summed; peptides without a mapping
      are ignored.

    Returns (prots, samples, counts, totals): counts is a float32 array with
    one row per protein and one column per sample, totals holds the row
    sums, and proteins whose total is not above 0 are dropped from prots,
    counts and totals.

    Raises AssertionError if only_uniques is false or a counted peptide does
    not map to exactly one protein.
    """
    assert only_uniques, "Only handles only_uniques=True so far."
    if only_uniques:
        exclude_peps = set(pep for pep, pep_prots in pep2prots.items()
                           if len(pep_prots) > 1)
    else:
        exclude_peps = set()
    print("%s non-unique peptides to exclude." % len(exclude_peps))

    pep_samp_counts = defaultdict(float)
    for _, sample, pep, count in peplist:
        if pep not in exclude_peps:
            pep_samp_counts[(pep, sample)] += float(count)

    # Currently ignoring peptides without a mapping.
    # Accumulating into a set (rather than reduce(set.union, ...)) also
    # works when no peptide has a mapping, where reduce raised TypeError.
    prot_set = set()
    for pep, _sample in pep_samp_counts:
        if pep in pep2prots:
            prot_set.update(pep2prots[pep])
    prots = sorted(prot_set)
    samples = sorted(set(ut.i1(peplist)))
    print("%s unique proteins. %s samples." % (len(prots), len(samples)))

    dprots, dsamples = [ut.list_inv_to_dict(lst) for lst in (prots, samples)]
    counts = np.zeros((len(prots), len(samples)), dtype='float32')
    for (pep, sample), count in pep_samp_counts.items():
        if pep in pep2prots:  # Currently ignoring peptides without a mapping.
            assert len(pep2prots[pep]) == 1, "Non-unique peptide found"
            prot = list(pep2prots[pep])[0]
            counts[dprots[prot], dsamples[sample]] += count

    # Keep only the proteins that actually received counts.
    totals = counts.sum(axis=1)
    nonzero = totals > 0
    prots, totals = [list(np.array(lst)[nonzero]) for lst in (prots, totals)]
    counts = counts[nonzero, :]
    return prots, samples, counts, totals
def __init__(self, filename, sp_base="Hs", norm_rows=False, norm_cols=False):
    """
    Load the elution stored in filename and derive the lookups kept on the
    instance.

    - filename: passed to load_elution.
    - sp_base: base species code handed to sc.orth_indices.
    - norm_rows, norm_cols: forwarded to ut.normalize_fracs.
    """
    elut = load_elution(filename)
    self.prots = elut.prots
    self.filename = elut.filename
    self.normarr = ut.normalize_fracs(
        elut.mat, norm_rows=norm_rows, norm_cols=norm_cols
    )
    self.pinv = ut.list_inv_to_dict(elut.prots)
    # The target species code is the first two characters of the short
    # name of the loaded file.
    short_name = ut.shortname(elut.filename)
    sp_target = short_name[:2]
    self.baseid2inds = sc.orth_indices(sp_base, sp_target, elut.prots, False)
def __init__(self, filename, sp_base='Hs', norm_rows=False, norm_cols=False):
    """
    Read an elution from filename and store its proteins, normalized
    matrix and index lookups on the instance.

    - filename: elution file handed to load_elution.
    - sp_base: base species code used for the ortholog index lookup.
    - norm_rows, norm_cols: normalization switches for ut.normalize_fracs.
    """
    loaded = load_elution(filename)
    self.prots = loaded.prots
    self.filename = loaded.filename
    norm_opts = dict(norm_rows=norm_rows, norm_cols=norm_cols)
    self.normarr = ut.normalize_fracs(loaded.mat, **norm_opts)
    self.pinv = ut.list_inv_to_dict(loaded.prots)
    # Two-character prefix of the file's short name is used as the target
    # species code.
    sp_target = ut.shortname(loaded.filename)[:2]
    self.baseid2inds = sc.orth_indices(sp_base, sp_target, loaded.prots,
                                       False)
def combine_corrs(e1, e2, allprots, combine_func, default_val=None):
    """
    Combine the symmetric correlation matrices of e1 and e2 using the
    specified element-wise function, in the ordering given by allprots.

    - e1, e2: objects with .prots (protein ids) and .corr (matrix indexed
      through ut.list_inv_to_dict(.prots)).
    - allprots: sized sequence of protein ids; fixes row/column order of
      the result.
    - combine_func: called as combine_func(val1, val2), e.g. max.
    - default_val: used in place of a data set's value when either protein
      of a pair is missing from that data set.  If None, -1 is chosen when
      the name of combine_func contains 'max', otherwise 0.

    Returns an np.matrix of shape (len(allprots), len(allprots)).
    """
    if default_val is None:
        # Test against None so that an explicitly passed 0 is respected.
        default_val = -1 if 'max' in combine_func.__name__ else 0
    dprots1 = ut.list_inv_to_dict(e1.prots)
    dprots2 = ut.list_inv_to_dict(e2.prots)

    def pair_val(corrmat, dprots, p1, p2):
        # Correlation of (p1, p2) in one data set, or the default if absent.
        if p1 in dprots and p2 in dprots:
            return corrmat[dprots[p1], dprots[p2]]
        return default_val

    nprots = len(allprots)
    corr = np.matrix(np.zeros((nprots, nprots)))
    for row, p1 in enumerate(allprots):
        for col, p2 in enumerate(allprots):
            val1 = pair_val(e1.corr, dprots1, p1, p2)
            # e2's indices must be applied to e2's own matrix.
            val2 = pair_val(e2.corr, dprots2, p1, p2)
            corr[row, col] = combine_func(val1, val2)
    return corr
def ppis_add_sp_ppis(ppis, arrfeats):
    """
    Extend every ppi with the values of the '_ppi_score' columns of its
    row in arrfeats.

    A column qualifies when its name, minus the first two characters,
    starts with '_ppi_score'.  Rows are located by the first two items of
    each ppi.

    Returns (newppis, cols): the extended tuples and the column names used.
    """
    row_of_pair = ut.list_inv_to_dict(((row[0], row[1]) for row in arrfeats))
    score_cols = []
    for name in arrfeats.dtype.names:
        if name[2:].startswith('_ppi_score'):
            score_cols.append(name)

    def with_scores(ppi):
        at = row_of_pair[(ppi[0], ppi[1])]
        # Cut down to the single row before selecting the score columns.
        one_row = arrfeats[at:at + 1]
        return tuple(ppi) + tuple(one_row[score_cols][0])

    return [with_scores(ppi) for ppi in ppis], score_cols
def elut_gene_maxes(elutfs, geneids):
    """
    For each elution file, find the maximum value in the row of every
    requested gene id present in that file.

    Returns {filename: {geneid: max}}.  A file that contains none of the
    gene ids gets no entry.
    """
    maxes = {}
    for fname in elutfs:
        elut = el.load_elution(fname)
        row_of = ut.list_inv_to_dict(elut.prots)
        for gid in geneids:
            if gid not in row_of:
                continue
            row_max = np.max(elut.mat[row_of[gid]])
            maxes.setdefault(fname, {})[gid] = row_max
    return maxes
def ppis_add_splist(ppis, arrfeats, cutoff):
    """
    Extend every ppi with the result of fe.passing_species_separate for its
    row in arrfeats at the given cutoff.

    Rows are located by the first two items of each ppi.

    Returns (newppis, sps), where sps is fe.species_list applied to the
    arrfeats column names from the fourth one on.
    """
    row_of_pair = ut.list_inv_to_dict(((row[0], row[1]) for row in arrfeats))
    print("Cutoff: %s" % cutoff)

    def with_species(ppi):
        at = row_of_pair[(ppi[0], ppi[1])]
        passing = fe.passing_species_separate(arrfeats[at:at + 1], cutoff)
        return tuple(ppi) + tuple(passing)

    extended = [with_species(ppi) for ppi in ppis]
    species = fe.species_list(arrfeats.dtype.names[3:])
    return extended, species
def filter_matching_elution(edata, efilter, remove_data_ending=".map"):
    """
    Use efilter as a mask on edata: a value of edata is kept only where the
    matching protein/fraction of efilter is above 0, and is 0 otherwise.

    Fractions are matched by name after every occurrence of
    remove_data_ending has been removed from the edata fraction name.
    Fractions and proteins of edata that efilter lacks are zeroed.

    Returns the masked values as an np.matrix shaped like edata.mat.
    """
    data_arr = np.asarray(edata.mat)
    filt_arr = np.asarray(efilter.mat)
    n_filt_rows = efilter.mat.shape[0]
    n_cols = edata.mat.shape[1]
    masked = np.matrix(np.zeros(edata.mat.shape))

    # Step 1: line the filter's columns up with the data's columns.
    filt_col_of = ut.list_inv_to_dict(efilter.fractions)
    stripped = [frac.replace(remove_data_ending, "")
                for frac in edata.fractions]
    aligned = np.zeros((n_filt_rows, n_cols))
    for col, name in enumerate(stripped):
        if name in filt_col_of:
            aligned[:, col] = filt_arr[:, filt_col_of[name]]
        else:
            aligned[:, col] = np.zeros(n_filt_rows)

    # Step 2: mask the data one protein (row) at a time.
    filt_row_of = ut.list_inv_to_dict(efilter.prots)
    for row, prot in enumerate(edata.prots):
        masked[row, :] = (
            data_arr[row, :] * (aligned[filt_row_of[prot], :] > 0).astype(int)
            if prot in filt_row_of
            else np.zeros(n_cols)
        )
    return masked
def complex_arr(cxs, prots):
    """
    Build a len(prots) x len(prots) array holding 1 at [p, partner] for
    every interaction reported by co.corum_ints_duped on the enumerated
    complexes, and 0 elsewhere.

    Proteins and partners not present in prots are skipped.
    """
    size = len(prots)
    adjacency = np.zeros((size, size))
    ints_dict = co.corum_ints_duped(list(enumerate(cxs)))
    index_of = ut.list_inv_to_dict(prots)
    for prot, partners in ints_dict.items():
        if prot not in index_of:
            continue
        row = index_of[prot]
        for partner in partners:
            if partner in index_of:
                adjacency[row, index_of[partner]] = 1
    return adjacency
def profiles_cxs(e, cxs, **kwargs):
    """
    Show the log2 profiles of the proteins of the given complexes with
    imshow and return the plotted values.

    Rows follow the order of proteins within cxs; proteins absent from
    e.prots are skipped.  Values are log2 of e.mat clipped to [0, 100].
    kwargs go to imshow, on top of the defaults set below.
    """
    # blue/yellow/red map: 'jet'
    plot_opts = ut.dict_set_defaults(
        kwargs, {'interpolation': 'nearest', 'cmap':'hot', 'vmin':1})
    data = np.array(e.mat)
    row_of = ut.list_inv_to_dict(e.prots)
    rows = []
    for cx in cxs:
        for prot in cx:
            if prot in row_of:
                rows.append(row_of[prot])
    vals = np.clip(np.log2(data[rows, :]), 0, 100)
    imshow(vals, **plot_opts)
    return vals
def filter_matching_elution(edata, efilter, remove_data_ending='.map'):
    """
    Mask edata with efilter, zeroing every edata value whose counterpart in
    efilter is not above 0.

    edata fraction names have all occurrences of remove_data_ending removed
    before being looked up among efilter's fractions.  Columns and rows of
    edata without a counterpart in efilter come out as zeros.

    Returns an np.matrix with the shape of edata.mat.
    """
    n_rows_filt = efilter.mat.shape[0]
    n_cols_data = edata.mat.shape[1]
    result = np.matrix(np.zeros(edata.mat.shape))

    # Column-matched copy of the filter, one column per data fraction.
    col_of_frac = ut.list_inv_to_dict(efilter.fractions)
    frac_names = [frac.replace(remove_data_ending,"")
                  for frac in edata.fractions]
    colmatched = np.zeros((n_rows_filt, n_cols_data))
    for col, frac in enumerate(frac_names):
        if frac in col_of_frac:
            column = np.asarray(efilter.mat)[:, col_of_frac[frac]]
        else:
            column = np.zeros(n_rows_filt)
        colmatched[:, col] = column

    # Row-by-row masking of the data.
    row_of_prot = ut.list_inv_to_dict(efilter.prots)
    for row, prot in enumerate(edata.prots):
        if prot not in row_of_prot:
            result[row, :] = np.zeros(n_cols_data)
            continue
        keep = (colmatched[row_of_prot[prot], :] > 0).astype(int)
        result[row, :] = np.asarray(edata.mat)[row, :] * keep
    return result
def orth_indices(sp_base, sp_target, prot_list, remove_multi_base):
    """
    Map base species gene ids to sets of indices into prot_list.

    prot_list holds target species gene ids (the rows of the target
    species score matrix), so the indices also address rows/columns of the
    square interaction score matrix.  When both species are the same, each
    id maps to the one-element set of its own index.  Otherwise the
    orthology from orth.odict(sp_base, sp_target) is composed with that
    mapping and base ids left with no index are dropped.

    - remove_multi_base: if true, the orthology is first passed through
      remove_multi_keys.
    """
    targ2inds = {}
    for targ, ind in ut.list_inv_to_dict(prot_list).items():
        targ2inds[targ] = set([ind])
    if sp_base == sp_target:
        return targ2inds

    base2targ = orth.odict(sp_base, sp_target)
    if remove_multi_base:
        base2targ = remove_multi_keys(base2targ)
    composed = ut.compose_dict_sets(base2targ, targ2inds)
    return dict((base, inds) for base, inds in composed.items()
                if len(inds) > 0)
def prot_counts(peplist, only_uniques):
    """
    Deprecated 20130628.  pep_list doesn't seem consistent for protein
    assignments.

    Mainly returns the counts array with protein quantitations.

    - peplist: sequence of 4-item records (protein, sample, peptide, count).
    - only_uniques: if true, peptides returned by non_unique_peps(peplist)
      are excluded from the counts.

    Returns (prots, samples, counts, totals): counts is a float32 array with
    one row per protein and one column per sample, totals holds the row
    sums, and proteins whose total is not above 0 are dropped from prots,
    counts and totals.
    """
    print("**Deprecated 20130628**")
    exclude_peps = non_unique_peps(peplist) if only_uniques else set()
    prots, samples = [sorted(set(lst))
                      for lst in zip(*[rec[:2] for rec in peplist])]
    dprots, dsamples = [ut.list_inv_to_dict(lst) for lst in (prots, samples)]
    counts = np.zeros((len(prots), len(samples)), dtype='float32')
    # The row is looked up with the protein bound by this loop; the earlier
    # code bound a different name than the one it indexed with.
    for prot, sample, pep, count in peplist:
        if pep not in exclude_peps:
            counts[dprots[prot], dsamples[sample]] += float(count)
    # Keep only the proteins that actually received counts.
    totals = counts.sum(axis=1)
    nonzero = totals > 0
    prots, totals = [list(np.array(lst)[nonzero]) for lst in (prots, totals)]
    counts = counts[nonzero, :]
    return prots, samples, counts, totals
def pair2ind(items):
    """
    Return ut.list_inv_to_dict over the (first, second) pair of every item.
    """
    pairs = ((item[0], item[1]) for item in items)
    return ut.list_inv_to_dict(pairs)