def _func(
    self,
    key,
    signature_datasets,
    signature_profile,
    signature_CUTOFF,
    pyvis,
    silent=1,  # promoted from a hard-coded local; default 1 keeps old behavior
    # WORKDIR,
):
    """Score samples against a signature profile and return the top targets.

    Parameters
    ----------
    key : str
        Label used to name the temp output file (``_temp-{key}-.it``).
    signature_datasets : DataFrame-like
        Rows are targets; dotted with ``signature_profile`` to get a score.
    signature_profile : vector
        Weights for the signature score.
    signature_CUTOFF : float
        Percentile cutoff (e.g. 0.99 keeps the top 1%% of targets).
    pyvis : module
        Plotting helper providing ``abline`` (used only when not silent).
    silent : int, optional
        When falsy, draw a diagnostic score-vs-percentile scatter plot.

    Returns
    -------
    Index of targets whose score percentile exceeds ``signature_CUTOFF``.
    Side effect: writes the target list to ``_temp-{key}-.it``.
    """
    vdf = signature_datasets
    signature_score = vdf.dot(signature_profile)
    # percentile rank of each score (assumes pyext.dist2ppf maps to [0, 1])
    ppf = pyext.dist2ppf(signature_score)
    if not silent:
        fig, ax = plt.subplots(1, 1)
        ax.set_ylabel('signature_score')
        ax.set_xlabel('percentage')
        plt.scatter(ppf, signature_score)
        ax = plt.gca()
        # zoom onto the top few percent where the cutoff lives
        ax.set_xlim(0.95, 1.01)
        ax.grid(1)
        pyvis.abline(x0=signature_CUTOFF)
    # e.g. CUTOFF = 0.99 keeps the top 1% of targets
    _targets = vdf.index[ppf > signature_CUTOFF]
    # with pyext.getPathStack([WORKDIR,key],force=1) as stack:
    pyext.printlines(_targets, pyext.f('_temp-{key}-.it'))
    return _targets
def qc_libsize(dfc0, silent=1, ax=None, n=20):
    '''Correct the lib size deviation using lowly varying genes.

    Scans ``n`` candidate percentile thresholds ``per`` in (0, 1]; for each,
    the genes with ``per_SD < per`` (lowly varying) define a per-sample
    constant (median over those genes) that is subtracted from the matrix.
    The threshold minimizing the median residual SD is selected.

    Parameters
    ----------
    dfc0 : scount.countMatrix or DataFrame-like
        Count matrix; coerced to ``scount.countMatrix`` if needed.
    silent : int, optional
        When falsy, plot the loss curves against the scanned thresholds.
    ax : matplotlib axis, optional
        Axis for the diagnostic plot (defaults to ``plt.gca()``).
    n : int, optional
        Number of thresholds scanned on a uniform grid over (0, 1].

    Returns
    -------
    (const, vdf) : the per-sample correction constants and the corrected
    count matrix at the optimal threshold.
    '''
    if not isinstance(dfc0, scount.countMatrix):
        dfc0 = scount.countMatrix(dfc0.copy())
    dfc0.qc_Avg()

    def getLoss(per, debug=0, ax=None, estimator=np.median):
        # Loss for one candidate threshold: subtract the estimator of the
        # lowly-varying genes, then measure the residual spread.
        vdf = dfc0.copy()
        index = vdf.summary.query('per_SD < %s ' % per).index
        # index = vdf.qc_Avg().summary.query('per_M < 0.3 ').index
        # vdf = sutil.meanNorm(vdf)
        vals = vdf.reindex(index).values
        const = estimator(vals, axis=0)[None]
        vdf = vdf.setDF(vdf.values - const)
        # sd = vdf.summary.reindex(index)['SD']
        sd = vdf.qc_Avg().summary['SD']
        lossA, lossB = sd.median(), sd.mean()
        if debug == 1:
            # fix: guard was commented out, so ax=None crashed this path
            if ax is None:
                ax = plt.gca()
            vv = vals.T[-4]
            pyvis.histoLine(vv, 30)
            ax.plot(vv.mean(), 0.05, 'x')
            ax.plot(np.median(vv), 0.07, 'x')
            return vdf
        if debug == 2:
            return const, vdf
        return lossA, lossB

    xs = np.linspace(0, 1, n + 1)[1:]
    # fix: materialize — Python 3 map() is lazy and np.array(map_obj)
    # would yield a 0-d object array instead of an (n, 2) loss table
    res = np.array([getLoss(x) for x in xs])
    xmin = xs[np.argmin(res.T[0])]
    if not silent:
        if ax is None:
            ax = plt.gca()
        ax.plot(xs, res.T[0])
        # second loss (mean SD) on a twin axis for comparison
        ax.twinx().plot(xs, res.T[1], 'go')
        ax.set_title(res.min(axis=0))
        ax.grid(1)
        pyvis.abline(x0=xmin)
    const, vdf = getLoss(xmin, debug=2)
    return const, vdf
def worker(task):
    """Fit one vMF mixture clustering run and emit diagnostic figures.

    Parameters
    ----------
    task : tuple(int, int)
        ``(i, r)`` — the task index and the random seed for this run.
        (Was Python-2 tuple parameter unpacking ``def worker((i, r))``,
        removed in Python 3 by PEP 3113; the call contract ``worker((i, r))``
        is unchanged.)

    Side effects
    ------------
    Reads module globals ``tdf`` (data) and ``figs`` (figure registry);
    writes ``cluster.csv`` and stores two figures in ``figs``.

    Returns
    -------
    (alias, fig) : the run's alias string and the diagnostic figure.
    """
    i, r = task
    # betas = [3.0] * 25
    # betas = getBeta(i)
    nIter = 100
    alias = 'i-%d_r-%d' % (i, r)
    mdl0 = pyjob.job__cluster__mixtureVMF__incr(
        normalizeSample=0,   #### set to 1 to normalize the vector lenght
        tdf=tdf,
        meanNorm=1,          ##### perform X = X-E(X)_
        weighted=True,
        init_method='random',
        nIter=nIter,
        start=0.2,           #### specify temperature range
        end=0.7,
        # betas = betas,     #### alternatively, pass a callable for temperature
        randomState=r,
        alias='mdl_' + alias,  #### filename of cache produced
        verbose=2,
        K=60,
    )
    ##### produce diagnostic plot
    YCUT = entropy_cutoff = 2.5
    XCUT = step = 30
    axs = pycbk.qc__vmf__speed(
        mdl0,
        # XCUT=step,YCUT=entropy_cutoff  ### not working yet
    )
    fig = plt.gcf()
    ax = fig.axes[0]
    pyvis.abline(y0=YCUT, k=0, ax=ax)
    pyvis.abline(x0=XCUT, k=0, ax=ax)
    figs['diagnostic-plot'] = plt.gcf()
    #### using the last model to predict cluster
    mdls = mdl0.callback.mdls    #### models is recorded for each point
    mdl = mdls[step][-1]         #### getting the model at step
    clu = mdl.predictClu(tdf, entropy_cutoff=entropy_cutoff)
    clu.to_csv('cluster.csv')    ### getting cluster assignment
    pyvis.heatmap(tdf.reindex(clu.sort_values('clu').index), figsize=[14, 7])
    figs['clustered-heatmap'] = plt.gcf()
    return (alias, fig)