def routine_combineCSV(fnames, CUTOFF=1, idCol='Gene ID'):
    '''Read several CSV tables, pad each against a shared gene reference,
    and return the frames restricted to the valid genes.'''
    print '[PROG] Starting to read files'
    dfs, (geneRef, geneValid) = combine_csv(fnames, CUTOFF=CUTOFF, idCol=idCol)
    print '[PROG] Finished reading files'

    print '[PROG] Starting to pad'
    f = pyutil.functools.partial(padWithRef, ref=geneRef)
    lst = pyutil.mp_map(f, dfs, n_cpu=1)

    #### Sanity checks: after padding, every frame must share shape and gene order
    SHP = np.array([df.shape for df in lst])
    assert np.all(SHP == SHP[0:1]), 'Arrays not sharing shape:%s' % SHP
    gids = np.array([df[idCol] for df in lst])
    assert np.all(gids == gids[0:1]), 'Gene ID columns are not aligned'
    print '[PROG] Finished padding'

    dfs = lst
    dfs = [df.iloc[geneValid.index] for df in dfs]
    return dfs, (geneRef, geneValid.reset_index().drop('index', axis=1))
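#### Usage sketch for routine_combineCSV (a minimal example, not the canonical
#### pipeline entry point). File names below are hypothetical; the function relies
#### on combine_csv() and padWithRef() from this module and on CSVs that share the
#### idCol column.
def _demo_routine_combineCSV():
    fnames = ['exprA.csv', 'exprB.csv', 'exprC.csv']  # hypothetical input tables
    dfs, (geneRef, geneValid) = routine_combineCSV(fnames, CUTOFF=1, idCol='Gene ID')
    # After padding, every frame shares the same shape and gene ordering, so the
    # tables can be concatenated column-wise into one expression matrix.
    merged = pd.concat([df.set_index('Gene ID') for df in dfs], axis=1)
    return merged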
def job__md5sum(dfc, column='fname', n_cpu=1, **kwargs):
    worker = pyutil.functools.partial(worker__md5sum, column=column)
    it = (x.__dict__ for x in dfc.itertuples())
    res = pyutil.mp_map(worker, it, n_cpu=n_cpu, **kwargs)
    # res = pd.DataFrame(res)
    return res
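#### Usage sketch for job__md5sum. Assumes worker__md5sum() (defined elsewhere in
#### this module) accepts a per-row dict with a 'fname' key; file names are
#### hypothetical.
def _demo_job__md5sum():
    dfc = pd.DataFrame({'fname': ['track1.bw', 'track2.bw']})  # hypothetical files
    res = job__md5sum(dfc, column='fname', n_cpu=1)
    # res holds one record per row of dfc; wrap it in a DataFrame if a table is
    # wanted (mirrors the commented-out pd.DataFrame(res) inside job__md5sum).
    return pd.DataFrame(res)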
def extract_bigwig_multiple(bedFile=None, peakFile=None,
                            bwFiles=None, fnames=None,
                            radius=None,
                            callback=lambda x: pd.concat(x, axis=1),
                            NCORE=1,
                            stepSize=20,
                            outIndex=pyutil.basename,
                            center_summit=0,
                            **kwargs):
    '''extract_bigwig() for multiple bwFiles
    '''
    #########################
    #### Legacy support #####
    if bedFile is None:
        assert peakFile is not None
        bedFile = peakFile
    else:
        assert peakFile is None
    if bwFiles is None:
        assert fnames is not None
        bwFiles = fnames
    #### Legacy support #####
    #########################

    if radius is not None:
        bedFile = npk_expandSummit(fname=bedFile, radius=radius,
                                   center_summit=center_summit)
    print '[L]', pyutil.lineCount(bedFile)

    #### Compute a coverage matrix for each bigwig track
    worker = pyutil.functools.partial(
        extract_bigwig,
        stepSize=stepSize,
        bedFile=bedFile,
        **kwargs)
    bws = pyutil.mp_map(worker, bwFiles, n_cpu=NCORE)

    for i, (bwFile, out) in enumerate(zip(bwFiles, bws)):
        #### Decide the label for the outer column level of this track
        if outIndex is None:
            replaceCol = None
        elif callable(outIndex):
            replaceCol = outIndex(bwFile)
        else:
            replaceCol = outIndex[i]

        if replaceCol is not None:
            #### Replace the outer index; could be simplified with rename()
            tmp = out.T
            tmp['ind'] = replaceCol
            tmp.set_index('ind', append=1, inplace=True)
            tmp = tmp.reorder_levels(['ind', None])
            tmp.index.names = ['bwFile', 'pos']
            out = tmp.T
        out.index.name = 'acc'
        bws[i] = out

    dfc = bws
    #### Concatenation is absorbed into callback()
    if callback is not None:
        dfc = callback(dfc)
    return dfc
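#### Usage sketch for extract_bigwig_multiple. Paths are hypothetical; with the
#### default callback the per-track matrices are concatenated column-wise.
def _demo_extract_bigwig_multiple():
    bwFiles = ['cond1.bw', 'cond2.bw']  # hypothetical bigwig tracks
    bedFile = 'peaks.bed'               # hypothetical peak intervals
    dfc = extract_bigwig_multiple(
        bedFile=bedFile,
        bwFiles=bwFiles,
        radius=2000,                # expand each summit by +/-2kb before extraction
        stepSize=20,                # bin width in bp
        outIndex=pyutil.basename,   # label the outer column level by track name
        NCORE=2,
    )
    # dfc: rows are peak accessions ('acc'), columns carry a (bwFile, pos) MultiIndex.
    return dfc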
def qc_silhouette(
        D,
        method='complete',
        nClu=40,
        NCORE=10,
        silent=1,
        axs=None):
    Ds = D
    if D.ndim == 2:
        D = spdist.squareform(D, checks=0)
    if Ds.ndim != 2:
        Ds = spdist.squareform(Ds, checks=0)

    Z = sphier.linkage(D, method=method)

    nClus = range(0, nClu)
    worker = pyutil.functools.partial(worker_cut_tree, Z=Z)
    clus = pyutil.mp_map(worker, nClus, n_cpu=NCORE)

    worker = pyutil.functools.partial(worker_silhouette, Ds=Ds)
    lst = pyutil.mp_map(
        worker,
        clus,
        # n_cpu=min(NCORE,4)
        n_cpu=1)
    lst = [x for x in lst if x is not None]
    shind = np.array(lst)

    if not silent:
        if axs is None:
            fig, axs = plt.subplots(1, 1, figsize=[6, 4])
            axs = [axs]
        ax = axs[0]
        X, Y = shind.T
        ax.plot(X, Y)
        ax.set_xlim(left=0)

        wid = 3
        rid = wid // 2
        movavg = map(
            np.mean,
            pyutil.window(Y, n=wid, step=1),
        )
        ax.plot(X[rid:-rid], movavg, 'x--', label='Moving average')
        ax.grid(1)
        ax.legend()
    return shind
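#### Usage sketch for qc_silhouette. The random data is purely illustrative, and
#### spdist is assumed to alias scipy.spatial.distance as used in qc_silhouette
#### above (spdist.pdist is an assumption on that alias).
def _demo_qc_silhouette():
    X = np.random.rand(50, 10)              # 50 hypothetical samples, 10 features
    D = spdist.squareform(spdist.pdist(X))  # square distance matrix
    shind = qc_silhouette(D, method='complete', nClu=20, NCORE=1, silent=0)
    # shind pairs each tested cluster count with its silhouette score; a peak in
    # the plotted curve suggests a reasonable number of clusters.
    return shind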
def main(
        #### necessary
        bedFile=None,
        bwFiles=None,
        ####
        DIR=None,
        figsize=[14, 14],
        debug=0,
        ylim=[0, 10],
        radius=2000,
        stepSize=10,
        NCORE=4,
        silent=0,
        gtfFile=None,
        cdsFile=None,
        annotation=None,
        GSIZE=None,
        center_summit=0,
        trackNames=None,
        backend='fluff',
        ext='png',
        **kwargs):
    figsize = map(int, figsize)

    #### Build and create the output directory
    prefix = 'PROG=chipShots_bedFile='
    bname = pyutil.os.path.basename(bedFile)
    odname = prefix + bname
    if DIR == 'inplace':
        #### place the output directory next to the bed file
        DIR = pyutil.os.path.join(pyutil.os.path.dirname(bedFile), odname)
    elif DIR is None:
        DIR = odname
    pyutil.shellexec('mkdir -p %s' % DIR, silent=silent)
    DIR = pyutil.os.path.abspath(DIR)

    #### cdsFile is only needed by the 'synotil' backend
    if cdsFile is None and gtfFile is not None:
        cdsFile = gtfFile + '.cds'

    if backend == 'synotil':
        nearFile = synotil.filterByCDS.main(
            peakFile=bedFile,
            cdsFile=cdsFile,
            downStream=radius,
            upStream=radius,
            peakRadius=1,
            GSIZE=GSIZE,
            center_summit=center_summit,
        )
        df_near = pyutil.readData(nearFile)

        stderrLine('[MSG] Loading bed intervals from bigwig tracks....')
        chipTracks = sutil.extract_bigwig_multiple(
            fnames=bwFiles,
            bedFile=bedFile,
            radius=radius,
            stepSize=stepSize,
            callback=None,
            outIndex=trackNames,
            center_summit=center_summit,
            shift=0,  #### use positive coordinates
            stranded=False,
            NCORE=NCORE)

        if ylim is None:
            ylim = pyutil.span(
                pyutil.np.hstack([x.values.flat for x in chipTracks]), 99)
            ylim = list(ylim)
            ylim[0] = 0.

        callback = lambda x: [prepare_chipTrack(ele, vlim=ylim) for ele in x]
        chipTracks = callback(chipTracks)
        if debug:
            stderrLine(chipTracks[0].columns)

        gtf = pyutil.readData(gtfFile, ext='tsv', header=None, guess_index=0)
        gtf = scount.countMatrix(gtf, look='gtf')
        gtfs = [gtf]

        bedDF = sutil.extract_peak(bedFile)

        worker = pyutil.functools.partial(
            worker__drawPeak,
            DIR=DIR,
            chipTracks=chipTracks,
            df_near=df_near,
            gtfs=gtfs,
            radius=radius,
            figsize=figsize,
            ylim=ylim,
            debug=debug,
        )
        ofnames = pyutil.mp_map(
            worker,
            bedDF.acc,
            n_cpu=NCORE,
        )

    elif backend == 'fluff':
        bedDF = sdio.extract_peak(bedFile)
        argDF = bedDF.copy()
        argDF = sdio.bed__addCol__interval(argDF)
        tracks = list(bwFiles)
        argDF['tracks'] = [tracks] * len(bedDF)
        argDF['annotation'] = annotation
        argDF['DIR'] = DIR
        argDF['ext'] = ext
        if trackNames is not None:
            argDF['labels'] = [list(trackNames)] * len(bedDF)
        ofnames = pyutil.mp_map(
            worker__fluff,
            (vars(x) for x in argDF.itertuples()),
            n_cpu=NCORE,
        )
    else:
        raise ValueError('Unknown backend: %r' % backend)

    #### Write the figure index and (optionally) an HTML gallery
    bedDF['img'] = ofnames
    indexFile = '%s/%s.index.tsv' % (DIR, bname)
    pyutil.to_tsv(bedDF, indexFile)
    indexFile = '%s/figureIndex.tsv' % (DIR)
    pyutil.to_tsv(bedDF, indexFile)

    try:
        import synotil.shot2html as shot2html
        htmlFile = shot2html.shot2html(indexFile, localPath=True)
    except Exception as e:
        stderrLine('[WARN] cannot produce html: %s' % e)
        htmlFile = None

    print(indexFile)
    print(htmlFile)
    return (indexFile, htmlFile)
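#### Usage sketch for main() with the default 'fluff' backend. All paths are
#### hypothetical; fluff and its dependencies must be available for worker__fluff
#### to render the per-peak figures.
def _demo_main():
    indexFile, htmlFile = main(
        bedFile='peaks.bed',                # hypothetical peak file
        bwFiles=['cond1.bw', 'cond2.bw'],   # hypothetical signal tracks
        trackNames=['cond1', 'cond2'],
        annotation='genes.gtf',             # hypothetical annotation track
        DIR='inplace',                      # write figures next to the bed file
        radius=2000,
        NCORE=4,
        backend='fluff',
    )
    # indexFile lists one row per peak with its rendered image; htmlFile is the
    # optional gallery produced by synotil.shot2html (None if that step fails).
    return indexFile, htmlFile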