def bed__checkValid(bed, GSIZE, force=0):
    '''Check that every interval falls within its chromosome bounds.
    With force=0 (default) raise an AssertionError on any out-of-bounds
    interval; with force=1 drop invalid intervals and return the filtered
    result (a filename if the input was a filename, else a DataFrame).
    '''
    fname = None
    if not isinstance(bed, pd.DataFrame):
        fname = bed
        bed = sdio.extract_peak(bed)
    sizeDF = pyutil.readData(GSIZE, ext='tsv', header=None, guess_index=0)
    sizeDF.columns = ['chrom', 'length']
    bedDF = sizeDF.merge(bed)
    bedDF['valid'] = bedDF.eval('start > 0 and end <= length')
    if not force:
        assert bedDF.valid.all()
    else:
        resDF = bedDF.query('valid').drop(columns=['valid', 'length'])
        if fname is not None:
            ofname = '%s__valid.bed' % fname.rsplit('.', 1)[0]
            pyutil.to_tsv(resDF, ofname)
            return ofname
        else:
            return resDF
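
#### Usage sketch for bed__checkValid (hedged: 'peaks.bed' and 'genome.sizes'
#### are hypothetical paths, not files shipped with this module):
def _example__bed__checkValid():
    #### force=0 (default) only asserts; force=1 drops out-of-bounds
    #### intervals and, for a filename input, writes '<name>__valid.bed'
    #### and returns that filename
    return bed__checkValid('peaks.bed', GSIZE='genome.sizes', force=1)
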
def job__combinePeak(
        bwCurr,
        featSummit='/home/feng/ref/Arabidopsis_thaliana_TAIR10/annotation/genes.gtf.cds.summit',
        GSIZE='/home/feng/ref/Arabidopsis_thaliana_TAIR10/genome.sizes',
        CUTOFF=4000,
        head=1000,
        alias='testPeaks',
        center_summit=1,
):
    bwCurr = bwCurr.dropna(subset=['npkFile'])
    bwCurr['npkFileLine'] = bwCurr.eval("npkFile.map(@pyutil.lineCount)")
    print(bwCurr[['bname', 'npkFileLine']].sort_values('bname'))

    dfs = zip(bwCurr.index, map(sdio.extract_peak, bwCurr.npkFile))
    dfs = dict(dfs)
    dfs = {k: v.sort_values('score', ascending=False) for k, v in dfs.items()}

    #### keep the strongest `head` peaks from each sample, then pool them
    bed = pd.concat([df.head(head) for df in dfs.values()], axis=0, sort=False)
    bed = bed.dropna(axis=1)
    ofname = pyutil.to_tsv(bed, '%s__combined.bed' % alias)
    if (len(dfs) > 1) and center_summit:
        ofname = sdio.bed__merge(ofname, silent=0)
    print(ofname, pyutil.lineCount(ofname))

    bedFile = ofname
    peakSummit = sdio.npk_expandSummit(fname=bedFile, radius=1,
                                       center_summit=center_summit)
    peak2geneFile = sdio.job__nearAUG(featSummit=featSummit,
                                      peakSummit=peakSummit,
                                      GSIZE=GSIZE,
                                      peakWid=1,
                                      CUTOFF=CUTOFF)
    pyutil.fileDict__save('files.json', d=locals(),
                          keys=['bedFile', 'peakSummit', 'peak2geneFile'])
    return 'files.json'
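
#### Usage sketch for job__combinePeak (hedged: `bwMeta` is a hypothetical
#### sample sheet carrying the 'bname' and 'npkFile' columns the code
#### expects; the default featSummit/GSIZE paths from the signature apply):
def _example__job__combinePeak(bwMeta):
    #### pools the top `head` peaks per sample, merges overlaps when more
    #### than one sample is given, and links summits to the nearest AUG;
    #### output paths are recorded in files.json
    jsonFile = job__combinePeak(bwMeta, head=1000, alias='myPeaks')
    return jsonFile
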
def job__chipTargPaired(
        bwCurr=None,
        bwMeta=None,
        control=None,
        treatment=None,
        xlab=None,
        ylab=None,
        name=None,
        NCORE=2,
        params__peakBW=None,
        CUTOFF_FC=3.0,
        CUTOFF_CHIPDIFF=0.7,
        innerRadius=100,
):
    figs = pyutil.collections.OrderedDict()

    #### resolve the (control, treatment) pair into (xlab, ylab)
    if control is not None and treatment is not None:
        xlab, ylab = control, treatment
    if xlab is None or ylab is None:
        xlab, ylab = bwCurr.index
    elif bwCurr is None:
        bwCurr = bwMeta.reindex([xlab, ylab])

    if params__peakBW is None:
        params__peakBW = dict(
            outerRadius=500,
            innerRadius=innerRadius,
            NCORE=NCORE,
            outIndex=bwCurr.header,
        )
    params__peakBW['innerRadius'] = innerRadius

    if name is None:
        name = '{xlab}-{ylab}'.format(**locals())

    #### rank-percentile plot of fold change for each sample
    dfs = map(sdio.extract_peak, bwCurr.npkFile)
    fig, ax = plt.subplots(1, 1, figsize=[7, 7])
    for df in dfs:
        df['per_FC'] = pyutil.dist2ppf(df.FC)
        df.plot.scatter('per_FC', 'FC', ax=ax)

    #### keep peaks above the fold-change cutoff, pool and re-centre them
    fnames = [pyutil.queryCopy(infile=fname,
                               query='FC>%.3f' % CUTOFF_FC,
                               reader=sdio.extract_peak,
                               inplace=False)
              for fname in bwCurr.npkFile]
    peakFlat = ' '.join(fnames)
    ofname = '%s-combined.bed' % ('-'.join(bwCurr.index))
    pyutil.shellexec('cat {peakFlat}>{ofname}'.format(**locals()))
    ofname = sdio.npk_expandSummit(fname=ofname, radius=1)
    pyutil.lineCount(ofname)
    peakFileOrig = peakFile = ofname

    res = sjob.figs__peakBW(peakFile=peakFile,
                            bwFiles=bwCurr.RPKMFile,
                            name=name,
                            **params__peakBW)
    figs.update(res[0])
    bwTrack, bwAvg = res[1]
    bwAvg.columns = bwAvg.columns.map(
        pyutil.df2mapper(bwCurr, 'header', 'index').get)
    xs, ys = bwAvg[[xlab, ylab]].values.T

    #### call differential peaks with a fixed CHIP-difference cutoff
    clu = pd.DataFrame(pyutil.df__pad(bwAvg))
    query = ' val_{ylab} - val_{xlab} > {CUTOFF_CHIPDIFF} '.format(**locals())
    qsans = pyutil.sanitise_query(query)
    peakIndex = clu.query(query).index
    clu['clu'] = clu.eval('index in @peakIndex')

    stats = sdio.extract_peak(peakFile).set_index('acc', drop=0)
    stats['CHIPDIFF'] = clu.eval(query.split('>')[0])

    pyvis.qc_2var(xs, ys, clu=clu.clu, xlab=xlab, ylab=ylab)
    figs['scatterPlot__%s' % name] = plt.gcf()

    cluFile = ofname = qsans + '.csv'
    clu.to_csv(ofname)
    print(ofname, pyutil.lineCount(ofname))

    peakBase = pyutil.getBname(peakFile)
    ofname = '{peakBase}-{qsans}.bed'.format(**locals())
    peakFile = pyutil.to_tsv(stats.reindex(peakIndex), ofname)
    pyutil.shellexec('mkdir -p output/')
    pyutil.file__link(ofname, 'output/%s.bed' % name, force=True)

    pyutil.fileDict__save(d=locals(),
                          keys=['cluFile', 'peakFile', 'peakFileOrig'],
                          fname='FILE.json')
    return figs, clu
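
#### Usage sketch for job__chipTargPaired (hedged: `bwMeta`, 'WT' and
#### 'mutant' are hypothetical; bwMeta must be indexed by sample name and
#### carry the 'npkFile', 'RPKMFile' and 'header' columns used above):
def _example__job__chipTargPaired(bwMeta):
    #### compares ChIP signal between a control/treatment pair and calls
    #### peaks gained in the treatment at the default CHIPDIFF cutoff of 0.7
    figs, clu = job__chipTargPaired(bwMeta=bwMeta,
                                    control='WT',
                                    treatment='mutant')
    return figs, clu
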
def clu2bed(segDF, ofname=None):
    '''Convert a per-bin cluster assignment into bed intervals.
    Must have columns: ('acc','pos','clu')
    '''
    segDF = segDF.reset_index()
    #### infer the bin size from the first two positions
    stepSize = np.diff(segDF['pos'].values[:2], axis=0)[0]
    vals = segDF[['clu', 'acc']].values
    isDiff = (vals[1:] != vals[:-1]).any(axis=1)
    segDF['isDiff'] = np.concatenate([[True], isDiff], axis=0)
    #### itertuples() yields namedtuples; _asdict() turns each into kwargs
    it = (pyutil.util_obj(**x._asdict()) for x in segDF.itertuples())

    peak = pyutil.collections.OrderedDict((
        ('chrom', None),
        ('start', None),
        ('end', None),
        ('acc', None),
    ))
    peaks = []

    def savePeakStart():
        peak['chrom'] = rec.acc
        peak['start'] = rec.pos

    def savePeakEnd():
        peak['end'] = oldPos + stepSize
        peak['acc'] = 'summitPos%d' % ((peak['start'] + peak['end']) // 2)
        assert peak['end'] > peak['start'], peak
        peaks.append(peak.copy())

    def changed():
        if idx != 0:
            if oldClu == 1:
                savePeakEnd()
            if rec.clu == 1:
                if (oldClu == 0) | (oldAcc != rec.acc):
                    savePeakStart()
        else:
            if rec.clu == 1:
                savePeakStart()

    #### scan the bins, opening/closing a peak at every state change
    oldClu = 0
    for idx, rec in enumerate(it):
        if (idx == 0):
            changed()
        elif (rec.clu != oldClu) or (rec.acc != oldAcc):
            changed()
        oldClu = rec.clu
        oldPos = rec.pos
        oldAcc = rec.acc
    changed()  #### close a peak that runs to the last bin

    resDF = pd.DataFrame(peaks)
    if ofname is not None:
        try:
            pyutil.to_tsv(resDF, ofname)
            return ofname
        except Exception as e:
            print(e)
    return resDF
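
#### Minimal sketch for clu2bed on a toy segmentation: three 10-bp bins on
#### one contig where only the middle bin belongs to cluster 1.
def _example__clu2bed():
    segDF = pd.DataFrame({
        'acc': ['chr1'] * 3,
        'pos': [0, 10, 20],
        'clu': [0, 1, 0],
    })
    #### expected single interval: chrom=chr1, start=10, end=20
    return clu2bed(segDF)
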
def bed__embed(outerBed, innerBed, debug=0, ofname=None, mergeAcc=1):
    '''Lift intervals in innerBed, whose 'chrom' refers to a named interval
    ('acc') in outerBed, back onto absolute genomic coordinates.
    '''
    if not isinstance(outerBed, pd.DataFrame):
        outerBed = sdio.extract_peak(outerBed)
    if not isinstance(innerBed, pd.DataFrame):
        innerBed = sdio.extract_peak(innerBed)
    assert 'acc' in outerBed, 'Reference bed file must be named'
    if 'acc' in innerBed:
        if mergeAcc:
            innerBed['acc'] = pyutil.df__paste0(innerBed, ['chrom', 'acc'],
                                                sep='_').tolist()

    #### default to the '+' strand when strand information is missing
    for df in [outerBed, innerBed]:
        if 'strand' not in df.columns:
            df['strand'] = '+'
        df['strand'] = df.strand.fillna('+')
        df['strandVal'] = df.strand.isin(['+', '*'])

    res = innerBed.merge(
        outerBed,
        how='left',
        left_on='chrom',
        right_on='acc',
        suffixes=['Inner', 'Outer'],
    )
    #### the embedded interval is on '+' iff both strands agree
    res['strandValFinal'] = ~((res['strandValOuter']) ^ (res['strandValInner']))
    isNeg = res.strandValFinal == 0
    (res.loc[isNeg, 'startInner'], res.loc[isNeg, 'endInner']) = \
        (-res.loc[isNeg, 'endInner'], -res.loc[isNeg, 'startInner'])
    res['valid'] = res.eval('startInner<=endInner')
    assert res.valid.all()

    if debug == 2:
        return res
    if debug == 1:
        return res[[
            'valid', 'chromInner', 'startInner', 'endInner',
            'strandValFinal', 'strandValInner', 'strandValOuter'
        ]]

    #### anchor at the outer start ('+') or outer end ('-') and shift
    res['shift'] = 0
    isOuterNeg = res.strandValOuter == 0
    res.loc[isOuterNeg, 'shift'] = res.loc[isOuterNeg, 'endOuter']
    res.loc[~isOuterNeg, 'shift'] = res.loc[~isOuterNeg, 'startOuter']
    res['start'] = res['shift'] + res['startInner']
    res['end'] = res['shift'] + res['endInner']
    res['chrom'] = res['chromOuter']
    res['acc'] = res['accInner']
    resDF = res[['chrom', 'start', 'end', 'acc']]

    if ofname is not None:
        try:
            pyutil.to_tsv(resDF, ofname)
            return ofname
        except Exception as e:
            print(e)
    return resDF
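
#### Minimal sketch for bed__embed: lift a motif hit defined relative to a
#### named peak ('peak1') back onto genomic coordinates.
def _example__bed__embed():
    outerBed = pd.DataFrame({'chrom': ['chr1'], 'start': [100],
                             'end': [200], 'acc': ['peak1']})
    innerBed = pd.DataFrame({'chrom': ['peak1'], 'start': [10],
                             'end': [20], 'acc': ['motifA']})
    #### mergeAcc=0 keeps the inner acc as-is instead of prefixing it with
    #### the outer name; expected output interval: chr1 110-120
    return bed__embed(outerBed, innerBed, mergeAcc=0)
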
def main(
        #### necessary
        bedFile=None,
        bwFiles=None,
        ####
        DIR=None,
        figsize=[14, 14],
        debug=0,
        ylim=[0, 10],
        radius=2000,
        stepSize=10,
        NCORE=4,
        silent=0,
        gtfFile=None,
        cdsFile=None,
        annotation=None,
        GSIZE=None,
        center_summit=0,
        trackNames=None,
        backend='fluff',
        ext='png',
        **kwargs):
    figsize = list(map(int, figsize))

    #### build the output directory name from the bed file name
    prefix = 'PROG=chipShots_bedFile='
    bname = pyutil.os.path.basename(bedFile)
    odname = prefix + bname
    if DIR == 'inplace':
        DIR = pyutil.os.path.join(pyutil.os.path.dirname(bedFile), odname)
    elif DIR is None:
        DIR = odname
    pyutil.shellexec('mkdir -p %s' % DIR, silent=silent)
    DIR = pyutil.os.path.abspath(DIR)

    if cdsFile is None and gtfFile is not None:
        cdsFile = gtfFile + '.cds'

    if backend == 'synotil':
        import synotil.filterByCDS
        nearFile = synotil.filterByCDS.main(
            peakFile=bedFile,
            cdsFile=cdsFile,
            downStream=radius,
            upStream=radius,
            peakRadius=1,
            GSIZE=GSIZE,
            center_summit=center_summit,
        )
        df_near = pyutil.readData(nearFile)

        stderrLine('[MSG]Loading bed intervals from bigwig tracks....')
        chipTracks = sutil.extract_bigwig_multiple(
            fnames=bwFiles,
            bedFile=bedFile,
            radius=radius,
            stepSize=stepSize,
            callback=None,
            outIndex=trackNames,
            center_summit=center_summit,
            shift=0,  #### use positive coordinates
            stranded=False,
            NCORE=NCORE)
        if ylim is None:
            ylim = pyutil.span(
                pyutil.np.hstack([x.values.flat for x in chipTracks]), 99)
            ylim = list(ylim)
            ylim[0] = 0.
        callback = lambda x: [prepare_chipTrack(ele, vlim=ylim) for ele in x]
        chipTracks = callback(chipTracks)
        if debug:
            stderrLine(chipTracks[0].columns)

        gtf = pyutil.readData(gtfFile, ext='tsv', header=None, guess_index=0)
        gtf = scount.countMatrix(gtf, look='gtf')
        gtfs = [gtf]

        bedDF = sutil.extract_peak(bedFile)
        worker = pyutil.functools.partial(
            worker__drawPeak,
            DIR=DIR,
            chipTracks=chipTracks,
            df_near=df_near,
            gtfs=gtfs,
            radius=radius,
            figsize=figsize,
            ylim=ylim,
            debug=debug,
        )
        ofnames = pyutil.mp_map(worker, bedDF.acc, n_cpu=NCORE)

    elif backend == 'fluff':
        #### one fluff job per peak; each row of argDF holds one job's kwargs
        bedDF = sdio.extract_peak(bedFile)
        argDF = bedDF.copy()
        argDF = sdio.bed__addCol__interval(argDF)
        tracks = list(bwFiles)
        argDF['tracks'] = [tracks] * len(bedDF)
        argDF['annotation'] = annotation
        argDF['DIR'] = DIR
        argDF['ext'] = ext
        if trackNames is not None:
            argDF['labels'] = [list(trackNames)] * len(bedDF)
        ofnames = pyutil.mp_map(
            worker__fluff,
            (x._asdict() for x in argDF.itertuples()),
            n_cpu=NCORE,
        )

    bedDF['img'] = ofnames
    indexFile = '%s/%s.index.tsv' % (DIR, bname)
    pyutil.to_tsv(bedDF, indexFile)
    indexFile = '%s/figureIndex.tsv' % (DIR)
    pyutil.to_tsv(bedDF, indexFile)

    try:
        import synotil.shot2html as shot2html
        htmlFile = shot2html.shot2html(indexFile, localPath=True)
    except Exception as e:
        stderrLine('[WARN]:cannot produce html :%s' % e)
        htmlFile = None

    print(indexFile)
    print(htmlFile)
    return (indexFile, htmlFile)
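
#### Usage sketch for main (hedged: the bed/bigwig/annotation paths below
#### are hypothetical placeholders, not files shipped with this module):
def _example__chipShots():
    #### renders one screenshot per peak with the fluff backend and writes
    #### figureIndex.tsv plus an optional HTML gallery into the output DIR
    indexFile, htmlFile = main(
        bedFile='peaks.bed',
        bwFiles=['sample1.bw', 'sample2.bw'],
        annotation='genes.gtf',
        backend='fluff',
        NCORE=4,
    )
    return indexFile, htmlFile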