def getNchanged(self):
    """Return the number of annotations changed by the model
    (sum of included and excluded genes).
    """
    i_use = SP.setxor1d(SP.arange(self.Pi.shape[1]),
                        SP.hstack([self.iLatentSparse, self.iLatent]))
    nChanged = SP.sum((self.Pi > .5) != (self.W.C[:, :, 0] > .5), 0)[i_use] * 1.0
    nChangedRel = nChanged / SP.sum((self.Pi > .5), 0)[i_use]
    return (nChanged, nChangedRel)
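# Hedged usage sketch (assumes a fitted model `FA`; not part of this module):
# nChanged, nChangedRel = FA.getNchanged()
# `nChanged` counts, per annotated factor, the genes whose set membership the
# model flipped relative to the prior annotation; `nChangedRel` is that count
# divided by the prior gene-set size.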
def regressOut(self, idx=None, terms=None, use_latent=False, use_lm=False, Yraw=None):
    """Regress out unwanted variation

    Args:
        idx (vector_like): Indices of factors to be regressed out
        terms (list): Names of terms to be regressed out (alternative to `idx`)
        use_latent (bool): Boolean variable indicating whether to regress out
            the unwanted variation on the low-dimensional latent space or the
            high-dimensional gene expression space.
        use_lm (bool): Regress out the factors by fitting a linear model for
            each gene
        Yraw (array_like): Optionally, a gene expression array can be passed
            from which the factors are regressed out

    Returns:
        A matrix containing the corrected expression values.
    """
    #if (idx is None) and (terms is None):
    #    raise Exception('Provide either indices or terms to regress out')
    if terms is None:
        idx = SP.array(idx)
    else:
        idx = self.getTermIndex(terms)

    if use_lm == False and (Yraw is None):
        isOn = (self.W.C[:, :, 0] > .5) * 1.0
        if use_latent == False:
            Ycorr = self.Z.E1 - SP.dot(self.S.E1[:, idx],
                                       (isOn[:, idx] * self.W.E1[:, idx]).T)
        else:
            idx_use = SP.setxor1d(SP.arange(self.S.E1.shape[1]), idx)
            Ycorr = SP.dot(self.S.E1[:, idx_use],
                           (isOn[:, idx_use] * self.W.E1[:, idx_use]).T)
    else:
        if Yraw is None:
            Y = self.Z.E1.copy()  # bug fix: was `self.Z.E1.shape`, which breaks the indexing below
        else:
            Y = Yraw.copy()
        Ycorr = SP.zeros(Y.shape)
        if terms is None:
            X = self.S.E1[:, idx]
        else:
            X = self.getX(terms=terms)
        for ig in SP.arange(Y.shape[1]):
            lm = LinearRegression()
            lm.fit(X, Y[:, ig])
            Ycorr[:, ig] = Y[:, ig] - lm.predict(X)
    return Ycorr
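# Hedged usage sketch (assumed workflow; the term name below is a placeholder,
# actual names depend on the annotation database used at initialisation):
# Ycorr = FA.regressOut(terms=['G2M checkpoint'])   # correct expression space
# Scorr = FA.regressOut(idx=[0], use_latent=True)   # correct latent space instead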
def getTerms(self, annotated=True, unannotated=True, unannotated_sparse=True):
    """Get the names of the model's terms (factors), optionally restricted to
    annotated, unannotated, or unannotated-sparse factors.
    """
    terms = list()
    if unannotated_sparse == True:
        terms.extend(self.terms[self.iLatentSparse])
    if unannotated == True:
        terms.extend(self.terms[self.iLatent])
    if annotated == True:
        terms.extend(self.terms[SP.setxor1d(
            SP.hstack([
                SP.where(self.terms == 'bias')[0], self.iLatentSparse,
                self.iLatent
            ]), SP.arange(len(self.terms)))])
    return terms
def findDuplicateVectors(vec, tol=vTol, equivPM=False):
    """
    Find vectors in an array that are equivalent to within a
    specified tolerance

    USAGE:

        eqv, uid = findDuplicateVectors(vec, tol, equivPM)

    INPUT:

        1) vec is n x m, a double array of m horizontally concatenated
           n-dimensional vectors.
       *2) tol is 1 x 1, a scalar tolerance.  If not specified, the default
           tolerance is 1e-14.
       *3) set equivPM to True if vec and -vec are to be treated as
           equivalent

    OUTPUT:

        1) eqv is 1 x p, a list of p equivalence relationships.

    NOTES:

        Each equivalence relationship is a 1 x q vector of indices that
        represent the locations of duplicate columns/entries in the array
        vec.  For example:

              | 1  2  2  2  1  2  7 |
        vec = |                     |
              | 2  3  5  3  2  3  3 |

        eqv = [[1x2 double] [1x3 double]], where

        eqv[0] = [0  4]
        eqv[1] = [1  3  5]
    """
    vlen = vec.shape[1]
    vlen0 = vlen
    orid = asarray(range(vlen), dtype="int")

    torid = orid.copy()
    tvec = vec.copy()

    eqv = []
    eqvTot = 0
    uid = 0

    ii = 1
    while vlen > 1 and ii < vlen0:
        dupl = tile(tvec[:, 0], (vlen, 1))

        if not equivPM:
            diff = abs(tvec - dupl.T).sum(0)
            match = abs(diff[1:]) <= tol  # logical to find duplicates
        else:
            diffn = abs(tvec - dupl.T).sum(0)
            matchn = abs(diffn[1:]) <= tol
            diffp = abs(tvec + dupl.T).sum(0)
            matchp = abs(diffp[1:]) <= tol
            match = matchn + matchp

        kick = hstack([True, match])  # pick self too

        if kick.sum() > 1:
            eqv += [torid[kick].tolist()]
            eqvTot = hstack([eqvTot, torid[kick]])
            uid = hstack([uid, torid[kick][0]])

        cmask = ones((vlen, ))
        cmask[kick] = 0
        cmask = cmask != 0

        tvec = tvec[:, cmask]
        torid = torid[cmask]

        vlen = tvec.shape[1]
        ii += 1

    if len(eqv) == 0:
        eqvTot = []
        uid = []
    else:
        eqvTot = eqvTot[1:].tolist()
        uid = uid[1:].tolist()

    # find all single-instance vectors
    singles = sort(setxor1d(eqvTot, range(vlen0)))

    # now construct list of unique vector column indices
    uid = int_(sort(union1d(uid, singles))).tolist()

    # make sure is a 1D list
    if not hasattr(uid, '__len__'):
        uid = [uid]

    return eqv, uid
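# Worked self-check based on the docstring example above. It assumes this
# module's numpy imports (asarray, tile, hstack, ones, sort, setxor1d,
# union1d, int_) are present, as the function body implies.
if __name__ == "__main__":
    from numpy import array
    vec = array([[1., 2., 2., 2., 1., 2., 7.],
                 [2., 3., 5., 3., 2., 3., 3.]])
    eqv, uid = findDuplicateVectors(vec, tol=1e-14)
    print(eqv)  # expected: [[0, 4], [1, 3, 5]]
    print(uid)  # expected: [0, 1, 2, 6] -- first occurrence of each distinct column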
def plotRelevance(FA,
                  Nactive=20,
                  stacked=True,
                  madFilter=0.4,
                  annotated=True,
                  unannotated=False,
                  unannotated_sparse=False):
    """Plot results of f-scLVM

    Identified factors and corresponding gene set size, ordered by relevance
    (white = low relevance; black = high relevance). Top panel: gene set
    augmentation, showing the number of genes added (red) and removed (blue)
    by the model for each factor.

    Args:
        FA (:class:`fscLVM.CSparseFA`): Factor analysis object, usually
            generated using the `initFA` function
        Nactive (int): Number of terms to be plotted
        stacked (bool): Boolean variable indicating whether bars should be
            stacked
        madFilter (float): Filter factors by this mean absolute deviation to
            exclude outliers. For large datasets this can be set to 0.
        annotated (bool): Indicates whether annotated factors should be
            plotted. Defaults to True.
        unannotated (bool): Indicates whether unannotated factors should be
            plotted. Defaults to False.
        unannotated_sparse (bool): Indicates whether unannotated sparse
            factors should be plotted. Defaults to False.
    """

    pltparams = {
        'backend': 'pdf',
        'axes.labelsize': 12,
        'font.size': 12,
        'legend.fontsize': 13,
        'xtick.labelsize': 14,
        'ytick.labelsize': 12,
        'text.usetex': False
    }
    plt.rcParams.update(pltparams)

    pattern_hidden = re.compile('hidden*')
    pattern_bias = re.compile('bias')

    terms = FA.getTerms(annotated=annotated,
                        unannotated=unannotated,
                        unannotated_sparse=unannotated_sparse)
    i_use = list()
    if unannotated_sparse == True:
        i_use.extend(FA.iLatentSparse)
    if unannotated == True:
        i_use.extend(FA.iLatent)
    if annotated == True:
        i_use.extend(
            SP.setxor1d(
                SP.hstack([
                    SP.where(FA.terms == 'bias')[0], FA.iLatentSparse,
                    FA.iLatent
                ]), SP.arange(len(FA.terms))))
    i_use = SP.array(i_use)

    X = FA.getX()[:, i_use]
    Iprior = FA.getAnnotations()[:, i_use]
    Iposterior = FA.getZ()[:, i_use] > .5
    rel = FA.getRelevance()[i_use]

    MAD = mad(X)
    R = (MAD > madFilter) * (rel)
    terms = SP.array(terms)

    Nactive = min(SP.sum(R > 0), Nactive)

    # term changes (set sizes, gains, losses)
    Nprior = Iprior.sum(axis=0)
    # gains
    Ngain = (Iposterior & (~Iprior)).sum(axis=0)
    # losses
    Nloss = ((~Iposterior) & Iprior).sum(axis=0)

    # sort terms by relevance
    Iactive = R.argsort()[::-1][0:Nactive]
    RM = R[Iactive, SP.newaxis]

    xticks_range = SP.arange(Nactive)
    terms[terms == 'hidden'] = 'Unannotated'
    terms[terms == 'hiddenSparse'] = 'Unannotated-sparse'
    xticks_text = list(terms[Iactive])

    n_gain = []
    n_loss = []
    n_prior = []
    for i in range(Nactive):
        n_gain += [Ngain[Iactive[i]]]
        n_loss += [-1.0 * Nloss[Iactive[i]]]
        n_prior += [Nprior[Iactive[i]]]

    width = 0.6
    left = SP.arange(Nactive) - 0.5 + (1. - width) / 2.
    fig = plt.figure(2, figsize=(10, 6))
    fig.subplots_adjust(bottom=0.3)
    gs = mpl.gridspec.GridSpec(2, 2,
                               height_ratios=[2., 1.],
                               width_ratios=[1., 0.05])
    gs.update(hspace=0.1)
    #fig.text(0.06, 0.6, 'Number of annotated genes', ha='center', va='center',
    #         rotation='vertical', fontsize=17)

    #############################################################################
    # lower panel: gene set sizes shaded by relevance
    ax1 = plt.subplot(gs[1, 0])
    simpleaxis(ax1)
    ax1.set_xlabel('Active pathways', fontsize=15)
    ax1.set_ylabel('Gene set size', fontsize=13.5)

    #im = ax1.imshow(SP.append(RM.T, [[0]], axis=1), origin=[0, 0],
    #                interpolation='nearest', cmap='Greys', aspect='auto')
    minima = 0
    maxima = RM.max()  # scalar max (builtin max over a 2-D array is fragile)
    norm = mpl.colors.Normalize(vmin=minima, vmax=maxima, clip=True)
    mapper = mpl.cm.ScalarMappable(norm=norm, cmap='Greys')
    colors = []
    for v in RM.flatten():
        colors += [mapper.to_rgba(v)]

    y_max = Nprior[Iactive].max() + 100.
    bar_rel_importance = ax1.bar(left=SP.arange(Nactive) - 0.5,
                                 width=1.05,
                                 height=[y_max] * len(n_prior),
                                 bottom=0,
                                 color=colors,
                                 log=True,
                                 edgecolor='none')
    bar_annotated = ax1.bar(left=left,
                            width=width,
                            height=n_prior,
                            bottom=0,
                            color='w',
                            log=True,
                            alpha=0.6,
                            edgecolor='k')

    ax1.set_ylim([10, y_max])
    ax1.set_xlim([0, Nactive])
    plt.xticks(xticks_range, xticks_text, rotation=45, fontsize=14, ha='right')

    color_bar_ax = plt.subplot(gs[1, 1])
    mpl.colorbar.ColorbarBase(color_bar_ax,
                              cmap='Greys',
                              norm=norm,
                              orientation='vertical',
                              ticks=[minima, maxima])
    color_bar_ax.set_yticklabels([0, 1])
    #color_bar_ax.set_ylabel('Rel. importance')

    #############################################################################
    # upper panel: gene set augmentation (gains/losses)
    ax0 = plt.subplot(gs[0, 0], sharex=ax1)
    simpleaxis(ax0)
    if stacked:
        bar_gain = ax0.bar(left=left, width=width, height=n_gain,
                           bottom=0, color='#861608')
        bar_loss = ax0.bar(left=left, width=width, height=n_loss,
                           bottom=0, color='#0c09a0')
    else:
        bar_gain = ax0.bar(left=SP.arange(Nactive) - 0.5, width=0.5,
                           height=n_gain, bottom=0, color='#861608')
        bar_loss = ax0.bar(left=SP.arange(Nactive), width=0.5,
                           height=n_loss, bottom=0, color='#0c09a0')

    # figure out range to make ylim symmetric
    ax0.axhline(y=0, linestyle='-', color='gray')
    #ax0.set_yscale('symlog')
    gap = SP.ceil(max(max(n_gain), abs(min(n_loss))) / 4.)
    y_max = SP.ceil(max(n_gain) / gap)
    y_min = SP.floor(min(n_loss) / gap)
    yticks = SP.arange(y_min * gap, y_max * gap, gap)
    ax0.set_yticks(yticks)
    ax0.set_ylabel('Gene set augmentation', fontsize=13.5)

    ax0.legend((bar_gain[0], bar_loss[0]), ('Gain', 'Loss'),
               ncol=1,
               loc='center left',
               bbox_to_anchor=(1, 0.5),
               frameon=False,
               fontsize=15)
    plt.setp(ax0.get_xticklabels(), visible=False)
    plt.show()
    return fig
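# Hedged usage sketch (assumed workflow; `FA` is a fitted fscLVM.CSparseFA
# model as above -- not code from this module):
# fig = plotRelevance(FA, Nactive=15)
# fig.savefig('factor_relevance.pdf')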
    print(i)  # (tail of the preceding progress-print loop)

spike_samples_clean = pl.delete(spike_samples_clean, 0)
pl.save(os.path.join(memap_folder, 'spike_samples_clean.npy'), spike_samples_clean)

# assign each spike to the channel with the largest amplitude at its sample
channels = np.empty(0)
for i in pl.arange(0, pl.size(spike_samples_clean)):
    data = np.array(data_probe_hp[:, spike_samples_clean[i]].tolist())
    channels = np.append(channels, np.argmax(data))
    if i % 100 == 0:
        print(i)
channels_spikes_df = pd.DataFrame([(channels, spike_samples_clean)],
                                  columns=['Channels', 'Samples'])

# split spikes by shaft according to channel ranges (masks combined with `&`;
# the original chained two mismatched-length boolean masks)
spike_times_shaftA = channels_spikes_df.Samples[0][
    (channels_spikes_df.Channels[0] > 7) & (channels_spikes_df.Channels[0] < 16)]
spike_times_shaftB = channels_spikes_df.Samples[0][channels_spikes_df.Channels[0] > 23]
spike_times_shaftD = channels_spikes_df.Samples[0][channels_spikes_df.Channels[0] < 8]
spike_times_shaftC = sp.setxor1d(
    spike_samples_clean,
    sp.union1d(spike_times_shaftA,
               sp.union1d(spike_times_shaftB, spike_times_shaftD)))
pl.save(os.path.join(memap_folder, 'spike_times_shaftA.npy'), spike_times_shaftA)
pl.save(os.path.join(memap_folder, 'spike_times_shaftC.npy'), spike_times_shaftC)

# ----------Analysis---------------------
f_ecog = f_sampling / (int(f_sampling / f_subsample))
spike_times_shaftA_ecog = np.array(spike_times_shaftA * f_ecog / f_sampling, dtype='int')
spike_times_shaftC_ecog = np.array(spike_times_shaftC * f_ecog / f_sampling, dtype='int')

data_ecog_lp_ss_clean = np.delete(data_ecog_lp_ss, ecog_bad_channels, axis=0)

# Generate eMUA for each shaft
time_around_spike = 2
time_points_around_spike = int(time_around_spike * f_sampling)
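# Hedged sketch (an assumed next step, not the original script): with the
# window length defined above, a per-shaft eMUA estimate could average the
# rectified high-passed probe signal around each spike. `emua_around_spikes`
# is a hypothetical helper; `np` is numpy as imported by this script.
def emua_around_spikes(data, spike_samples, half_window):
    # collect a window of samples around every spike that fits in the recording
    windows = [data[:, int(s) - half_window:int(s) + half_window]
               for s in spike_samples
               if half_window <= s < data.shape[1] - half_window]
    # average the rectified windows -> channels x time mean activity
    return np.mean(np.abs(np.array(windows)), axis=0)

# Example call (arrays assumed from earlier in the script):
# emua_shaftA = emua_around_spikes(data_probe_hp, spike_times_shaftA,
#                                  time_points_around_spike)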