def qc_Sort(df=None, fname=None, cname='test', vlim=[-2, 2], title=None,
            xlim=None, ylim=None, figsize2=[14, 6], nMax=5000, **heatargs):
    '''QC a sample-by-feature matrix: summary stats plus sorted heatmaps.

    Runs ``qcAvg`` to obtain per-row Average (M), Variance (V) and CV, then
    draws three heatmaps of the matrix with rows sorted (descending) by
    Variance, CV and Average respectively.

    Params
    ------
    df : DataFrame or ndarray; loaded from ``fname`` via pyutil.readData when None
    fname : source file, only used when ``df`` is None
    vlim : [vmin, vmax] colour limits forwarded to pyvis.heatmap
    nMax : forwarded to qcAvg
    **heatargs : extra keyword args forwarded to pyvis.heatmap

    Returns
    -------
    ((M, V, CV), figs) : the qcAvg statistics and an OrderedDict holding the
    'qcAvg' and 'qcSort' figures.
    '''
    figs = collections.OrderedDict()
    vmin, vmax = vlim
    if df is None:
        df = pyutil.readData(fname)
    if title is None:
        title = '[file]%s' % fname
    heatargs.update({
        'vmin': vmin,
        'vmax': vmax,
        'cname': cname,
        'vlim': vlim,
    })
    # Accept either a DataFrame or a raw array.
    if isinstance(df, pd.DataFrame):
        C = df.values
    else:
        C = df
    (M, V, CV), axsLst = qcAvg(C, silent=0, xlim=xlim, ylim=ylim, nMax=nMax)
    figs['qcAvg'] = plt.gcf()
    plt.suptitle(title)
    # Negative stride: subsample to ~1000 rows AND reverse, so each heatmap
    # shows rows in descending order of the sort key.
    inter = -len(C) // 1000
    fig, axs = plt.subplots(3, 1, figsize=figsize2,
                            gridspec_kw={'hspace': 0.3})
    axs = axs.flat
    # [FIX] corrected typo in panel title: 'Varaince' -> 'Variance'
    pyvis.heatmap(C[V.argsort()][::inter], transpose=1,
                  main='sorted by Variance', ax=axs[0], **heatargs)
    pyvis.heatmap(C[CV.argsort()][::inter], transpose=1,
                  main='sorted by CV', ax=axs[1], **heatargs)
    pyvis.heatmap(C[M.argsort()][::inter], transpose=1,
                  main='sorted by Average', ax=axs[2], **heatargs)
    axsLst = np.hstack([axsLst, axs])
    figs['qcSort'] = plt.gcf()
    return (M, V, CV), figs
def extract_peak(fname, ext='tsv', header=None, guess_index=0, **kwargs):
    """Load a peak/bed-like file and normalise its column names.

    The leading columns are renamed to the standard ``bedHeader`` fields;
    any columns beyond ``bedHeader`` keep their original labels.
    """
    peaks = pyutil.readData(fname, ext=ext, header=header,
                            guess_index=guess_index, **kwargs)
    ncol = len(peaks.columns)
    peaks.columns = bedHeader[:ncol] + list(peaks.columns)[len(bedHeader):]
    return peaks
def closestAnnotation(
    bedFile,
    RANGE=1000,
    ANNOTATION_FILE=None,
    GSIZE=None,
    silent=True,
):
    '''Use bedtools to find, for each region in ``bedFile``, the closest
    feature contained in ``ANNOTATION_FILE``.

    Each annotation interval is first expanded by ``RANGE`` bp on both
    sides (``bedtools slop``) before querying.  A chrom.sizes file must be
    supplied as ``GSIZE`` to make bedtools happy.

    Only direct overlaps (distance == 0) are kept; they are written to a
    TSV file whose name is returned.
    '''
    # NOTE(review): this first assignment is dead code -- FOUT is
    # unconditionally overwritten on the next statement.
    FOUT = bedFile.split('/')[-1]
    FOUT = 'type=closest_bed=%s_feat=%s.tsv' % (
        pyutil.basename(bedFile), pyutil.basename(ANNOTATION_FILE))
    # Expand + sort the annotation, sort the query bed, then run
    # `bedtools closest -d` (appends the distance as the last column).
    cmd = '''
bedtools slop -b {RANGE} -i {ANNO} -g {GSIZE} |bedtools sort > {ANNOBASE}.{RANGE}
bedtools sort -i {bedFile} |\
bedtools closest -d -a - -b {ANNOBASE}.{RANGE} | tee {FOUT}.tmp
'''.format(
        GSIZE=GSIZE,
        ANNO=ANNOTATION_FILE,
        ANNOBASE=ANNOTATION_FILE.split('/')[-1],
        bedFile=bedFile,
        RANGE=RANGE,
        FOUT=FOUT,
    ).strip()
    buf = StringIO.StringIO(pyutil.shellexec(cmd, silent=silent))
    # ``.len`` is a Python-2 StringIO attribute: total size of the buffer.
    if buf.len:
        buf.seek(0)
        # Header = bed columns of the query + 'feat'-prefixed columns of the
        # annotation, then the trailing distance column added by bedtools.
        header = sum([
            guessBedHeader(x, prefix=k)
            for k, x in [('', bedFile), ('feat', ANNOTATION_FILE)]
        ], [])
        header += [
            'distance',
        ]
        df = pyutil.readData(buf, header=None, ext='tsv', guess_index=False)
        df.columns = header
        # df = parseBedClosest(fname = buf)
        # os.system('rm %s.tmp' % FOUT)
    else:
        assert 0, ' Buffer is empty, check error msg'
    # Keep only features that actually overlap the (expanded) query.
    df = df[df['distance'] == 0]
    df.to_csv(FOUT, sep='\t', index=0)
    return FOUT
def parseBedmap(
    fname=None,
    df=None,
):
    """Parse the output of bedMap.

    The last column ('hit') holds ';'-separated accessions; it is exploded
    into one row per accession and re-joined with the original bed columns.
    """
    if df is None:
        df = pyutil.readData(fname, header=None, ext='tsv', guess_index=False)
    df = df.dropna()
    df.columns = bedHeader + ['hit']
    exploded = pyutil.explode(df, 'hit', 'acc', ';')
    joined = exploded.merge(df.drop('hit', 1), on='acc')
    return joined
def extract_closest(
    fname=None,
    df=None,
):
    """Parse the output of 'bedtools closest'.

    Expects the query bed columns followed by the feature bed columns and a
    trailing distance column; only the first 18 columns are retained.
    """
    if df is None:
        df = pyutil.readData(fname, header=None, ext='tsv', guess_index=False)
    fullHeader = bedHeader + pyutil.paste0([['feature_'], bedHeader]).tolist()
    df = df.iloc[:, :18]
    df.columns = fullHeader[:17] + ['distance']
    df['hit'] = df['feature_acc']
    return df
def bed__checkValid(bed, GSIZE, force=0):
    """Validate that every bed interval lies within its chromosome.

    ``bed`` may be a DataFrame or a filename.  With ``force`` falsy, assert
    all intervals are valid (returns None).  With ``force`` truthy, drop the
    invalid rows; if the input was a filename, write the cleaned bed next to
    it and return the new filename, otherwise return the cleaned DataFrame.
    """
    fname = None
    if not isinstance(bed, pd.DataFrame):
        fname = bed
        bed = sdio.extract_peak(fname)
    sizes = pyutil.readData(GSIZE, ext='tsv', header=None, guess_index=0)
    sizes.columns = ['chrom', 'length']
    merged = sizes.merge(bed)
    merged['valid'] = merged.eval('start > 0 and end <= length')
    if not force:
        assert merged.valid.all()
        return
    kept = merged.query('valid').drop(columns=['valid', 'length'])
    if fname is None:
        return kept
    ofname = '%s__valid.bed' % fname.rsplit('.', 1)[0]
    pyutil.to_tsv(kept, ofname)
    return ofname
def guessBedHeader(fname, silent=True, ext='tsv', guess_index=0, prefix='',
                   **kwargs):
    """Infer column names for a bed-like file by peeking at its first rows.

    Reads the first 5 lines via ``head``, counts the columns, and returns
    ``bedHeader`` truncated or extended to match, optionally with every name
    prefixed by ``prefix + '_'``.
    """
    sample = StringIO.StringIO(
        pyutil.shellexec('head -n5 %s' % fname, silent=silent))
    probe = pyutil.readData(sample, ext=ext, header=None,
                            guess_index=guess_index, **kwargs)
    ncol = len(probe.columns)
    if ncol > len(bedHeader):
        header = bedHeader + list(probe.columns)[len(bedHeader):]
    else:
        header = bedHeader[:ncol]
    if prefix:
        header = ['%s_%s' % (prefix, name) for name in header]
    return map(str, header)
def main(
        bedFile,  # = None,
        bwFiles=None,
        bwTrackFile=None,
        relistByGene=0,  #### potentially takes a long time
        stepSize=50,
        radius=None,
        center_summit=0,
        NCORE=1,
):
    '''Segment bed regions into 2 classes with a GaussianHMM fitted on the
    per-region standard deviation of the bigwig signal, then write the
    cluster assignments back out as a bed file.

    Either ``bwTrackFile`` (a precomputed track matrix) or ``bwFiles``
    (bigwig files to extract over ``bedFile``) must be given.

    Returns the output bed filename (``<bname>__HVPeak.bed``).
    '''
    if bedFile is not None:
        refBed = sdio.extract_peak(bedFile)
    if bwTrackFile is not None:
        dfc = pyutil.readData(bwTrackFile, )
    else:
        # [FIX] the extracted track matrix was previously discarded, leaving
        # ``dfc`` undefined on this branch (NameError at listByGene below).
        dfc = sdio.extract_bigwig_multiple(
            bedFile=bedFile,
            bwFiles=bwFiles,
            center_summit=center_summit,
            NCORE=NCORE,
            stepSize=stepSize,
            radius=radius,
        )
        relistByGene = 1
    if relistByGene:
        dfc = sdio.listByGene(dfc)
    dfc0 = dfc
    # Per-region variability of the signal, used as the 1-D HMM observation.
    tdf = np.std(dfc.values, axis=1, keepdims=1)
    lr = hlhmm.GaussianHMM(n_components=2,
                           covariance_type="diag",
                           init_params="cmt",
                           params="cmt")
    lr.fit(tdf)
    seg = lr.predict(tdf)
    segDF = pd.DataFrame(seg, index=dfc0.index, columns=['clu'])
    # 'HVPeak' presumably = high-variability peaks -- TODO confirm naming.
    ofname = pyutil.getBname(bedFile) + '__HVPeak.bed'
    ofname = sdio.clu2bed(segDF, ofname)
    return ofname
def from_DataFrame(cls, df=None, fname=None, name=None, index_col=None,
                   **kwargs):
    """Alternate constructor: build an instance from a DataFrame or a file.

    Exactly one of ``df`` / ``fname`` should be supplied; when loading from
    ``fname`` the instance name defaults to the file's basename (without
    extension).  ``index_col`` optionally selects a column as the index
    (kept as a column too).
    """
    if df is None:
        assert fname is not None, '[ERR] must specify one of "df" or "fname" '
        df = pyutil.readData(fname, **kwargs)
        name = pyutil.os.path.basename(fname).rsplit('.', 1)[0]
    elif isinstance(df, pd.Series):
        df = df.to_frame()
    if index_col is not None:
        assert index_col in df
        df.set_index(index_col, drop=0, inplace=1)
    return cls(
        C=df.values,
        colName=df.columns,
        rowName=df.index,
        name=name,
        fname=fname,
    )
def job__render__panelPlot(tracks=None,
                           clu=None,
                           order=None,
                           index=None,
                           aliasFmt='{alias}',
                           alias=None,
                           baseFile=0,
                           figsize=None,
                           panel_kw=panel_kw_dft,
                           how='left',
                           debug=0,
                           extra={},
                           **kwargs):
    '''Render a panel plot for a set of tracks, optionally loading every
    argument (tracks / clu / order / panel_kw) from a file when given as a
    string, and return ``(alias, figure)``.

    When ``alias`` is None, an alias is auto-built from the basenames of the
    file-typed arguments and, optionally, formatted through ``aliasFmt``.
    ``index`` given as a string is evaluated (see NOTE below).
    '''
    if figsize is not None:
        # NOTE(review): this mutates the (module-level) panel_kw_dft default
        # dict when panel_kw is not explicitly passed -- confirm intended.
        panel_kw['figsize'] = figsize
    autoAli = alias is None
    if autoAli:
        alias = ''
    # Each string-typed argument is treated as a filename: load it and
    # append its basename to the auto-alias.
    if isinstance(clu, basestring):
        alias += pyext.getBname(clu)
        clu = pyutil.readData(clu, baseFile=baseFile).get(['clu'])
    if isinstance(order, basestring):
        alias += pyext.getBname(order)
        order = pyutil.readData(order, baseFile=baseFile)
    if isinstance(tracks, basestring):
        alias += pyext.getBname(tracks)
        tracks = pyutil.readData(tracks, baseFile=baseFile)
        tracks = list(tracks)
    if isinstance(panel_kw, basestring):
        alias += pyext.getBname(panel_kw)
        panel_kw = pyutil.read__buffer(panel_kw, ext='json', typ='rec',
                                       guess_index=0).to_dict()
    # 'order' takes precedence as the cluster source; otherwise derive the
    # ordering from 'clu'.
    if order is not None:
        clu = order.get(['clu'])
    else:
        assert clu is not None
        order = pd.DataFrame(clu)
    if isinstance(index, basestring):
        alias += pyutil.sanitise_query(index)
        # NOTE(review): locals().update() is not guaranteed to affect the
        # local namespace inside a function (CPython ignores it), so 'extra'
        # may never reach eval() -- confirm this path is exercised.
        locals().update(extra)
        # NOTE(review): eval() on a caller-supplied string -- do not pass
        # untrusted input here.
        index = eval(index)
    cluTrack = spanel.fixCluster(clu.get(['clu']))
    alias = aliasFmt.format(**locals())
    # cluFile_clean = 'clean_%s.csv' % alias
    # cluc.to_csv(cluFile_clean)
    tracks = pyext.list__realise(tracks, locals())
    ##### Output heatmap
    pp = spanel.panelPlot(tracks, **panel_kw)
    pp.compile(how=how, index=index, **kwargs)
    pp.compile(order=order)
    if debug:
        return pp
    fig = pp.render()
    return (alias, fig)
def count__getGeneHeader(fname, ext='tsv', pipeline=None, silent=1, **kwargs):
    """Return the ``gene_id`` column from the header block of a count table.

    The ``ext`` argument is intentionally overridden: the header is always
    parsed as TSV.
    """
    ext = 'tsv'  ### hard set
    headerBuf = file__header(fname, silent=silent)
    headerDF = pyutil.readData(headerBuf, ext=ext, guess_index=0)
    return headerDF.gene_id.tolist()
def tsv__getColumns(fname, ext='tsv', silent=1):
    """Return the column names of a tabular file by parsing its header.

    Params
    ------
    fname : path of the tabular file
    ext : format hint passed to pyutil.readData
    silent : forwarded to file__header (suppresses shell output)
    """
    # [FIX] ``silent`` was referenced but never defined (NameError on every
    # call); it is now a keyword parameter, defaulting to 1 to match the
    # sibling helper count__getGeneHeader.
    res = file__header(fname, silent=silent)
    df = pyutil.readData(res, ext=ext)
    return df.columns.tolist()
def summitDist(peak1, peak2, CUTOFF=400, silent=1, GSIZE=None, as_fname=0,
               **kwargs):
    '''Find nearby summits within a distance cutoff.

    Each summit in the two input bed files is inflated to a radius of
    ``CUTOFF // 2 - 1`` bp (``bedtools slop``), then overlapping pairs are
    found with ``bedtools intersect -wo``.  The overlap column is converted
    back into a summit-to-summit distance and the result written to a TSV.

    Returns the output filename when ``as_fname`` is truthy, otherwise the
    parsed DataFrame.  ``GSIZE`` (chrom.sizes) defaults to the $GSIZE
    environment variable and is required.
    '''
    if GSIZE is None:
        GSIZE = pyutil.os.environ.get('GSIZE', None)
        assert GSIZE is not None
    # Inflation radius: two summits overlap iff their distance < CUTOFF.
    RANGE = CUTOFF // 2 - 1
    infiles = [peak1, peak2]
    # def file_ncol(fname):
    #     cmd = 'wc -l %s'%(fname)
    #     res = pyutil.shellexec(cmd,silent=silent)
    #     ncol = res[0].strip().split('\t')
    # incols = incols = map(pyutil.file_ncol, infiles)
    ### padding/inflate the summit to have radius
    lst = []
    for infile in infiles:
        ofile = "{infile}.{RANGE}".format(**locals()).split('/')[-1]
        lst += [ofile]
        cmd = "bedtools slop -g {GSIZE} -b {RANGE} -i {infile} \
            | tee {ofile}".format(**locals())
        _ = pyutil.shellexec(cmd, silent=silent)
    slop1, slop2 = lst
    FOUT = 'infiles:'+ ":".join(map(pyutil.basename,infiles)) \
        + "__cutoff:{}.tsv".format(CUTOFF)
    # ### bed format 1=chrom, 2=start, 3=end
    # cols = ','.join(map(str,[2,3,] + [x + incols[0] for x in [2,3]]))
    # cmd = "bedtools closest -a {slop1} -b {slop2} \
    #     | bedtools overlap -cols {cols} \
    #     | tee {FOUT}".format(**locals())
    cmd = "bedtools intersect -wo -a {slop1} -b {slop2} \
        | tee {FOUT}".format(**locals())
    buf = pyutil.shellexec(cmd, silent=silent)
    ### [TBC]Memory-intensive, Replace with awk mutation in the future
    columns = header_closest(peak1, peak2)
    df = pyutil.readData(StringIO.StringIO(buf), header=None, ext='tsv',
                         guess_index=False, columns=columns)
    # intersect -wo reports the overlap length; convert it back to the
    # distance between the two original summits.
    df.distance = CUTOFF - df.distance
    df.to_csv(FOUT, sep='\t', index=False)
    if as_fname:
        return FOUT
    else:
        return df
def qc_narrowPeak(
        qfile,
        cutoff=0.98,
        ax=None,
        silent=1,
        keyFile=None,
        ofname=None,
        cutoff_key='per_FC',
        # cutoff = {'per_FC':0.98}
):
    '''Visualise the fold-change distribution of a narrowPeak-like file and
    apply a percentile cutoff.

    Params
    ------
    qfile : peak file; auto-detects whether it already carries a header row
    cutoff : percentile threshold applied on ``cutoff_key``
    cutoff_key : column to threshold ('per_FC' or 'per_score')
    ofname : output TSV filename; auto-derived from ``qfile`` when None
    silent : when falsy, also draw the percentile-vs-value scatter plot
    keyFile : optional key table displayed joined with the kept peaks

    Returns
    -------
    (ofname, ax) : the written filename and the plot axis (None if unused).
    '''
    # [FIX] use a context manager so the handle is closed even on error.
    with open(qfile) as f:
        fline = f.readline()
    if fline.split('\t')[0] == 'chrom':
        # File already has a header row.
        qres = pyutil.readData(qfile, guess_index=0)
    else:
        qres = sdio.extract_peak(qfile)
    # Percentile ranks of fold-change and score.
    qres['per_FC'] = pyutil.dist2ppf(qres.FC)
    qres['per_score'] = pyutil.dist2ppf(qres.score)
    dfc = qres.query('%s > %.3f' % (cutoff_key, cutoff))
    # [FIX] the ``ofname`` argument was silently ignored; honour it and only
    # derive a default name when it is not supplied.
    if ofname is None:
        ofname = '%s_chipTarg.tsv' % pyutil.basename(qfile)
    dfc.reset_index(drop=1).to_csv(ofname, sep='\t', index=None, header=None)
    print(qres.shape, dfc.shape)
    if keyFile is not None:
        keyDF = pyutil.readData(keyFile)
        dfcc = dfc.set_index('feature_acc', drop=0)
        dfcc = dfcc.loc[~dfcc.index.duplicated(), ]
        keyTarg = pd.concat([dfcc[['FC']], keyDF], axis=1, join='inner')
        pyutil.ipd.display(keyTarg)
    if not silent:
        if ax is None:
            fig, axs = plt.subplots(1, 2, figsize=[12, 4])
            ax = axs[0]
        plt.sca(ax)
        # e.g. 'per_FC' -> raw column 'FC'
        raw_key = cutoff_key.split('_')[-1]
        ax.plot(qres[cutoff_key], qres[raw_key], 'x')
        ax.set_xlim(0.5, 1.1)
        ax.grid(1)
        ax.vlines(cutoff, *ax.get_ylim())
        ax.set_xlabel('percentile')
        ax.set_ylabel(raw_key)
        title = 'All=%d' % len(qres) + ', keep=%d' % len(dfc)
        ax.set_title(title)
    return ofname, ax
# execfile('/home/feng/headers/header__import.py') import pymisca.util as pyutil dfc = pyutil.readData(pyutil.base__file('TOUCHED.list'),ext='tsv',header=None) ind = dfc.query('~index.duplicated()').sort_index() # print (ind.to_csv()) ind.to_csv(pyutil.base__file('file.index',force=1)) pyutil.shellexec(''' cd $BASE cat file.index | grep ^RNA | xargs tar -cvzf RNA-seq.tar.gz ''') pyutil.shellexec(''' cd $BASE echo > tracking.index { cat file.index | grep -v ^RNA-seq echo *.tar.gz echo *.index echo *.txt echo *.list echo "Snakefile README" } >> tracking.index ''') pyutil.shellexec(''' cd $BASE echo mkdir -p dist; cat tracking.index | xargs cp -avuf --parents -t dist ''')
def main(
        #### necessary
        bedFile=None,  # = None,
        bwFiles=None,
        ####
        DIR=None,
        figsize=[14, 14],
        debug=0,
        ylim=[0, 10],
        radius=2000,
        stepSize=10,
        NCORE=4,
        silent=0,
        gtfFile=None,
        cdsFile=None,
        annotation=None,
        GSIZE=None,
        center_summit=0,
        trackNames=None,
        backend='fluff',
        ext='png',
        **kwargs):
    '''Render per-peak genome-browser shots for every interval in
    ``bedFile`` using either the 'synotil' or the 'fluff' backend, write an
    image index TSV (and, when possible, an HTML gallery).

    Returns ``(indexFile, htmlFile)``; ``htmlFile`` is None if the HTML
    step fails.
    '''
    # vlim = ylim
    figsize = map(int, figsize)
    # for peakAcc in df_near.acc.unique()[:1]:
    # Output directory: '<prefix><bed basename>', created next to the bed
    # file when DIR == 'inplace', in the CWD when DIR is None.
    prefix = 'PROG=chipShots_bedFile='
    # bname = pyutil.basename(bedFile)
    bname = pyutil.os.path.basename(bedFile)
    odname = prefix + bname
    if DIR == 'inplace':
        DIR = pyutil.os.path.dirname(bedFile) + odname
    elif DIR is None:
        DIR = odname
    pyutil.shellexec('mkdir -p %s' % DIR, silent=silent)
    DIR = pyutil.os.path.abspath(DIR)
    # odname = pyutil.ospath
    if cdsFile is None:
        cdsFile = gtfFile + '.cds'
    if backend == 'synotil':
        # nearFile = './DE2017/type=closest_bed=lux22_radius=1_feat=genes.gtf.cds.tsv'
        # import synotil.filterByCDS
        # Find genes near each peak (within +/- radius of the CDS).
        nearFile = synotil.filterByCDS.main(
            peakFile=bedFile,
            cdsFile=cdsFile,
            downStream=radius,
            upStream=radius,
            peakRadius=1,
            GSIZE=GSIZE,
            center_summit=center_summit,
        )
        df_near = pyutil.readData(nearFile, )
        stderrLine('[MSG]Loading bed intervals from bigwig tracks....')
        chipTracks = sutil.extract_bigwig_multiple(
            fnames=bwFiles,
            bedFile=bedFile,
            radius=radius,
            stepSize=stepSize,
            callback=None,
            outIndex=trackNames,
            # callback=callback,
            center_summit=center_summit,
            shift=0,  #### use positive coordinate
            stranded=False,
            NCORE=NCORE)
        if ylim is None:
            # Auto y-limits: 99th-percentile span over all tracks, floor at 0.
            ylim = pyutil.span(
                pyutil.np.hstack([x.values.flat for x in chipTracks]), 99)
            ylim = list(ylim)
            ylim[0] = 0.
        callback = lambda x: [prepare_chipTrack(ele, vlim=ylim) for ele in x]
        chipTracks = callback(chipTracks)
        if debug:
            stderrLine(chipTracks[0].columns)
        gtf = pyutil.readData(gtfFile, ext='tsv', header=None, guess_index=0)
        gtf = scount.countMatrix(gtf, look='gtf')
        gtfs = [gtf]
        # uniqPeak = df_near.acc.unique()
        # bedDF = pyutil.readData(bedFile,header=None,guess_index=0)
        # bedDF.columns = sutil.bedHeader[:len(bedDF.columns)]
        bedDF = sutil.extract_peak(bedFile)
        # uniqPeak
        # uniqPeak = bedDF[bedDF.columns]
        # One image per peak accession, drawn in parallel.
        worker = pyutil.functools.partial(
            worker__drawPeak,
            DIR=DIR,
            chipTracks=chipTracks,
            df_near=df_near,
            gtfs=gtfs,
            radius=radius,
            figsize=figsize,
            ylim=ylim,
            debug=debug,
        )
        ofnames = pyutil.mp_map(
            worker,
            bedDF.acc,
            n_cpu=NCORE,
        )
    elif backend == 'fluff':
        # Build one argument record per peak and delegate drawing to fluff.
        bedDF = sdio.extract_peak(bedFile)
        argDF = bedDF.copy()
        argDF = sdio.bed__addCol__interval(argDF)
        tracks = list(bwFiles)
        argDF['tracks'] = [tracks] * len(bedDF)
        argDF['annotation'] = annotation
        argDF['DIR'] = DIR
        argDF['ext'] = ext
        if trackNames is not None:
            argDF['labels'] = [list(trackNames)] * len(bedDF)
        ofnames = pyutil.mp_map(
            # ofnames = map(
            worker__fluff,
            (vars(x) for x in argDF.itertuples()),
            n_cpu=NCORE,
        )
        # ofnames =
    # NOTE(review): if 'backend' is neither 'synotil' nor 'fluff', bedDF and
    # ofnames are undefined here and this raises NameError -- confirm only
    # the two backends are ever passed.
    bedDF['img'] = ofnames
    # The index is written twice: once under the bed basename and once under
    # the fixed name used by the HTML generator.
    indexFile = '%s/%s.index.tsv' % (DIR, bname)
    pyutil.to_tsv(bedDF, indexFile)
    indexFile = '%s/figureIndex.tsv' % (DIR)
    pyutil.to_tsv(bedDF, indexFile)
    try:
        import synotil.shot2html as shot2html
        htmlFile = shot2html.shot2html(indexFile, localPath=True)
    except Exception as e:
        stderrLine('[WARN]:cannot produce html :%s' % e)
        htmlFile = None
    # print ('[OUTPUT]:',)
    # print ('html:',htmlFile)
    # print ('index:',indexFile)
    print(indexFile)
    print(htmlFile)
    return (indexFile, htmlFile)