def df__deposit(dfc, runID='000R', ext='excel.count', DIR='.', init=1, silent=0, sep='\t', header=1):
    '''Write every column of `dfc` to its own delimited file.

    Columns are renamed to the file names returned by `assign__filename`;
    each column is then dumped as a `gene_id`-indexed table whose single
    value column is called 'TPM'.  `reset_index()` must therefore yield
    a 'gene_id' column.

    Returns the renamed column index (i.e. the file names).
    '''
    table = dfc.copy()
    table.columns = assign__filename(
        table.columns, runID=runID, ext=ext, DIR=DIR, init=init)
    table.columns.name = 'fname'

    long_form = table.reset_index().melt(value_name='TPM', id_vars=['gene_id'])
    for fname, chunk in long_form.groupby('fname'):
        per_file = chunk.drop(columns='fname').set_index('gene_id')
        # make sure the destination directory exists before writing
        pyutil.shellexec(
            'mkdir -p `dirname {fname}`'.format(fname=fname),
            silent=silent,
        )
        per_file.to_csv(fname, sep=sep, header=header)
    return table.columns
def bed__merge(bedFile, silent=1, opt='-c 4 -o first'):
    '''Sort `bedFile` and merge its intervals with bedtools.

    The result is written next to the input as 'merged__<basename>' and
    that path is returned.  `opt` is passed verbatim to `bedtools merge`.
    '''
    osp = pyutil.os.path
    directory, base = osp.split(bedFile)
    ofname = osp.join(directory, 'merged__%s' % base)
    cmd = 'bedtools sort -i {bedFile} | bedtools merge -i - {opt} > {ofname}'.format(
        bedFile=bedFile, opt=opt, ofname=ofname)
    pyutil.shellexec(cmd, silent=silent)
    return ofname
def bed_randomise(infile, GSIZE=None, silent=1):
    '''Create a randomly distributed bed file.

    Runs `bedtools random` against the genome file `GSIZE` (required),
    asking for as many intervals as `infile` has lines, and returns the
    output file name '<basename>_type=random.bed'.
    '''
    ofile = pyutil.basename(infile) + '_type=random.bed'
    assert GSIZE is not None
    n_lines = pyutil.lineCount(infile)
    template = "bedtools random -g {GSIZE} -l 2 -n {LC} | tee {ofile}"
    pyutil.shellexec(
        template.format(GSIZE=GSIZE, LC=n_lines, ofile=ofile),
        silent=silent,
    )
    return ofile
def bed__makewindows(bedFile, windowSize=100, stepSize=None, silent=1):
    '''Tile the intervals of `bedFile` with `bedtools makewindows`.

    `stepSize` defaults to half of `windowSize`.  Output goes to
    '<bedFile>.w<windowSize>s<stepSize>', whose name is returned.
    '''
    stepSize = windowSize // 2 if stepSize is None else stepSize
    fields = dict(bedFile=bedFile, windowSize=windowSize, stepSize=stepSize)
    ofname = '{bedFile}.w{windowSize}s{stepSize}'.format(**fields)
    cmd = "bedtools makewindows -i srcwinnum -w {windowSize} -s {stepSize} -b {bedFile} > {ofname} ".format(
        ofname=ofname, **fields)
    pyutil.shellexec(cmd, silent=silent)
    return ofname
def bed__summit(peakFile, GSIZE=None, silent=1, opt='-s -l -0.5 -r -0.5 -pct', inplace=True):
    '''Pipe `peakFile` through `bedtools slop` with the options in `opt`.

    `GSIZE` (genome file) falls back to the GSIZE environment variable and
    must resolve to something.  Output is '<peakFile>.summit'; with
    `inplace` false only its basename is used, so the file lands in the
    current directory.  Returns the output file name.
    '''
    if GSIZE is None:
        GSIZE = pyutil.os.environ.get('GSIZE', None)
    assert GSIZE is not None

    ofname = '%s.summit' % peakFile
    if not inplace:
        ofname = pyutil.os.path.basename(ofname)

    # pipeline stages are joined by plain spaces; the shell ignores the
    # exact amount of whitespace between them
    cmd = ('cat {peakFile} '
           '| bedtools slop -g {GSIZE} {opt} -i - '
           '> {ofname}').format(
               peakFile=peakFile, GSIZE=GSIZE, opt=opt, ofname=ofname)
    pyutil.shellexec(cmd, silent=silent)
    return ofname
def bed__totalLength(bedFile, silent=1):
    '''Return the summed interval length (col3 - col2) of a BED file.

    Source: https://www.biostars.org/p/68283/#68292
    '''
    # the tab escape is expanded by Python, so awk receives a literal tab
    awk_sum = "cat %s | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'"
    out = pyutil.shellexec(awk_sum % bedFile, silent=silent)
    return int(out.strip())
def wig2bigwig(fname, chromSizes='chrom.sizes', silent=1):
    '''Convert a wig file with `wigToBigWig`.

    The output name is '<pyutil.getBname(fname)>.bw'; it is returned.
    '''
    ofname = '%s.bw' % pyutil.getBname(fname)
    cmd = '''wigToBigWig {fname} {chromSizes} {ofname} '''.format(
        fname=fname, chromSizes=chromSizes, ofname=ofname)
    pyutil.shellexec(cmd, silent=silent)
    return ofname
def fig__fluffProfile(interval, tracks, ofname=None, annotation=None, scaleOpt=None, fragmentSize=0, labels=None, silent=0):
    '''Render `interval` over `tracks` with `fluff profile`.

    interval     : region string handed to `-i`
    tracks       : iterable of track files (`-d`)
    ofname       : output image, defaults to '<interval>.svg'
    annotation   : optional annotation file (`-a`)
    scaleOpt     : raw scale option, defaults to ' -s 1:<len(tracks)> '
    fragmentSize : value for `-f`
    labels       : optional iterable of track labels (`-l`)

    Returns the output file name.
    '''
    trackFlat = u' '.join(tracks)
    if scaleOpt is None:
        scaleOpt = ' -s 1:%d ' % (len(tracks))
    if ofname is None:
        ofname = interval + '.svg'

    pieces = [' fluff profile ', scaleOpt, ' -f {fragmentSize} ']
    if annotation is not None:
        pieces.append(' -a {annotation} ')
    if labels is not None:
        labelFlat = u' '.join(labels)
        pieces.append(' -l {labelFlat}')
    pieces.extend([
        ' -o {ofname} -i {interval} -d {trackFlat} ',
        ' -b white ',
        ' 2>&1 ',
    ])
    # the placeholders are filled from the local namespace in one pass
    cmd = ''.join(pieces).format(**locals())
    pyutil.shellexec(cmd, silent=silent)
    return ofname
def bam__getHeader(fname, grepKey='SQ', silent=1, head=100):
    '''Return the output of `samtools view -H` for `fname`.

    When `grepKey` is not None the header is filtered with grep, and when
    `head` is not None it is truncated to that many lines.
    '''
    stages = [u'samtools view -H %s' % fname]
    if grepKey is not None:
        stages.append(u'grep {grepKey}'.format(grepKey=grepKey))
    if head is not None:
        stages.append(u'head -n{head}'.format(head=head))
    return pyutil.shellexec(u' | '.join(stages), silent=silent)
def bed__guessWidth(bedFile, silent=1, head=100):
    '''Estimate the interval width of a BED file from its first lines.

    Reads the first `head` lines, returns the median of (end - start) as
    an int, and warns on stderr when the standard error of the widths
    exceeds 10% of the median.
    '''
    preview = pyutil.shellexec(
        'head -n{head} {bedFile}'.format(head=head, bedFile=bedFile),
        silent=silent)
    peaks = extract_peak(StringIO.StringIO(preview))
    widths = (peaks.end - peaks.start).values.ravel()
    median_width = np.median(widths)
    std_err = widths.std() / len(widths)**0.5
    if std_err > 0.1 * median_width:
        pyutil.sys.stderr.write('[WARN]:estimation may be unstable\n')
    return int(median_width)
def file__concat(bedFiles, silent=1, ofname='concated_file', ext=None):
    '''Concatenate a list of files with `cat`.

    bedFiles : non-empty sequence of input paths
    ofname   : output name without extension
    ext      : extension to append to `ofname`; when None it is taken from
               the file name of the first input (nothing is appended if
               that name has no dot)

    Returns the name of the concatenated file.
    '''
    if ext is None:
        # Only the file name may contribute the extension: a dot inside a
        # directory component (e.g. 'run.v2/peaks') is not an extension.
        first_name = bedFiles[0].split('/')[-1]
        parts = first_name.rsplit('.', 1)
        if len(parts) == 2:
            ext = parts[-1]
    if ext is not None:
        ofname = '.'.join([ofname, ext])

    flatName = ' '.join(bedFiles)
    cmd = 'cat {flatName} >{ofname}'.format(flatName=flatName, ofname=ofname)
    pyutil.shellexec(cmd, silent=silent)
    return ofname
def closestAnnotation(
        bedFile,
        RANGE=1000,
        ANNOTATION_FILE=None,
        GSIZE=None,
        silent=True,
):
    '''Use bedtools to find the feature closest to each region of `bedFile`.

    The annotation is expanded by {RANGE} bp on both sides before being
    queried; chrom.sizes must be supplied as {GSIZE} to make bedtools
    happy.  Only hits with distance 0 (i.e. overlapping the expanded
    annotation) are kept.

    Side effects: writes '<annotation basename>.<RANGE>' and '<FOUT>.tmp'
    in the current directory, plus the final table FOUT.

    Returns the name of the resulting tsv file.
    Raises AssertionError when bedtools produced no output.
    '''
    FOUT = 'type=closest_bed=%s_feat=%s.tsv' % (
        pyutil.basename(bedFile), pyutil.basename(ANNOTATION_FILE))

    # two shell commands: pad + sort the annotation, then query it
    cmd = '\n'.join([
        'bedtools slop -b {RANGE} -i {ANNO} -g {GSIZE} |bedtools sort > {ANNOBASE}.{RANGE}',
        'bedtools sort -i {bedFile} | bedtools closest -d -a - -b {ANNOBASE}.{RANGE} | tee {FOUT}.tmp',
    ]).format(
        GSIZE=GSIZE,
        ANNO=ANNOTATION_FILE,
        ANNOBASE=ANNOTATION_FILE.split('/')[-1],
        bedFile=bedFile,
        RANGE=RANGE,
        FOUT=FOUT,
    )

    res = pyutil.shellexec(cmd, silent=silent)
    if not res:
        # explicit raise: a bare `assert` would vanish under `python -O`
        raise AssertionError(' Buffer is empty, check error msg')

    # column names: bed columns, then 'feat_'-prefixed annotation columns,
    # then the distance reported by `bedtools closest -d`
    header = []
    for prefix, path in [('', bedFile), ('feat', ANNOTATION_FILE)]:
        header += list(guessBedHeader(path, prefix=prefix))
    header += ['distance']

    df = pyutil.readData(StringIO.StringIO(res),
                         header=None,
                         ext='tsv',
                         guess_index=False)
    df.columns = header

    df = df[df['distance'] == 0]
    df.to_csv(FOUT, sep='\t', index=0)
    return FOUT
def readModels(DIR):
    '''Load every '*randomState*.npy' file under `DIR` whose path matches
    'normF'.

    Returns a DataFrame built from `pyutil.flat2meta` of the file names
    (path separators replaced by '_', extension dropped), with two extra
    columns: 'fname_' (the path) and 'obj' (the object returned by
    `scount.countMatrix.from_npy`).
    '''
    root = DIR.rstrip('/')
    listing = pyutil.shellexec(
        "find %s/*randomState*.npy | grep normF" % root)
    fnames = listing.splitlines()
    models = [scount.countMatrix.from_npy(path) for path in fnames]

    flat_keys = [path.replace('/', '_').rsplit('.', 1)[0] for path in fnames]
    # keep only well-formed (key, value) pairs of each parsed name
    records = [
        dict(pair for pair in entry if len(pair) == 2)
        for entry in pyutil.flat2meta(flat_keys)
    ]

    meta = pd.DataFrame(records)
    meta['fname_'] = list(fnames)
    meta['obj'] = models
    return meta
def rawFile__combineChunk(dfc, silent=0):
    '''Combine chunked files in `dfc` according to "fname" and
    "fnameCombined".

    All rows must share one "fnameCombined"; the files listed in "fname"
    are concatenated into it with `cat`.  Returns the de-duplicated
    `idKeys` + ['fnameCombined', 'fnameCombinedSize'] columns, where the
    size is the byte size of the combined file.
    '''
    table = dfc.copy()
    # one input per line, joined by shell line continuations
    sources = ' \\\n'.join(table.fname)
    ofnames = table.fnameCombined.unique()
    assert len(ofnames)==1,\
        'contains mulitple fnameCombined!:%s'%ofnames
    target = ofnames[0]

    pyutil.shellexec(
        'cat {fnameFlat} > {ofname}'.format(fnameFlat=sources, ofname=target),
        silent=silent)

    table['fnameCombinedSize'] = pyutil.os.path.getsize(target)
    wanted = idKeys + ['fnameCombined', 'fnameCombinedSize']
    return table[wanted].drop_duplicates()
def findPromoter(
        INFILE=None,
        upStream=1000,
        downStream=500,
        opt='-s -i -',
        filterKey='CDS',
        OFILE=None,
        inplace=0,
        GSIZE=None,
        silent=1,
):
    '''Find the promoter from a GTF file.

    Lines of `INFILE` (optionally grepped for `filterKey`) are passed
    through two `bedtools slop` calls, the second extending by
    `upStream`/`downStream`; double quotes are stripped from the output.

    GSIZE   : chrom-sizes file; falls back to the GSIZE environment
              variable and must exist
    OFILE   : output name, defaults to '<basename(INFILE)>.promoter'
    inplace : when true, OFILE is placed in the directory of INFILE

    Returns OFILE.
    '''
    if GSIZE is None:
        TRY = os.environ.get('GSIZE', None)
        assert TRY is not None, 'Please specify chromosizes'
        GSIZE = TRY
    assert os.path.exists(GSIZE), 'File does not exist:"%s"' % GSIZE

    if OFILE is None:
        OFILE = os.path.basename(INFILE) + '.promoter'
    if inplace:
        OFILE = os.path.join(os.path.dirname(INFILE), OFILE)

    stages = ['cat %s' % INFILE]
    if filterKey is not None:
        stages.append('| grep {} \\\n'.format(filterKey))

    # each stage ends in a shell line continuation
    pipeline = [
        '| bedtools slop -s -l 0 -r -1.0 -pct {opt}',
        '| bedtools slop -s -l {upStream} -r {downStream} {opt}',
        r'| sed "s/\"//g"',
        '>{OFILE}',
    ]
    stages.append(' \\\n'.join(pipeline).format(
        OFILE=OFILE,
        upStream=upStream,
        downStream=downStream,
        opt='%s -g %s' % (opt, GSIZE),
    ))

    pyutil.shellexec(''.join(stages), silent=silent)
    return OFILE
def guessBedHeader(fname, silent=True, ext='tsv', guess_index=0, prefix='', **kwargs):
    '''Guess column names for a BED-like file from its first 5 lines.

    The first columns take their names from `bedHeader`; columns beyond
    that keep the labels assigned by `pyutil.readData`.  A non-empty
    `prefix` is prepended as '<prefix>_'.  Extra keyword arguments are
    forwarded to `pyutil.readData`.

    Returns a list of str.
    '''
    preview = pyutil.shellexec('head -n5 %s' % fname, silent=silent)
    df = pyutil.readData(StringIO.StringIO(preview),
                         ext=ext,
                         header=None,
                         guess_index=guess_index,
                         **kwargs)
    n_col = len(df.columns)
    n_std = len(bedHeader)
    if n_col > n_std:
        header = bedHeader + list(df.columns)[n_std:]
    else:
        header = bedHeader[:n_col]
    if prefix:
        header = ['%s_%s' % (prefix, col) for col in header]
    return [str(col) for col in header]
def job__chipTargPaired(
        bwCurr=None,
        bwMeta=None,
        control=None,
        treatment=None,
        xlab=None,
        ylab=None,
        name=None,
        # bwMeta,
        NCORE=2,
        params__peakBW=None,
        CUTOFF_FC=3.0,
        CUTOFF_CHIPDIFF=0.7,
        innerRadius=100,
):
    '''Compare two ChIP tracks over the union of their filtered peaks.

    bwCurr           : two-row table with at least `npkFile`, `RPKMFile`
                       and `header` columns (these are the attributes
                       read below); built from `bwMeta` when omitted
    bwMeta           : table reindexed by [xlab, ylab] when `bwCurr` is
                       None
    control/treatment: when both are given they override xlab/ylab
    xlab, ylab       : index labels of the two tracks; default to the
                       index of `bwCurr`
    name             : label for figures/outputs, default '<xlab>-<ylab>'
    params__peakBW   : keyword arguments for `sjob.figs__peakBW`
    CUTOFF_FC        : peaks are kept when FC > CUTOFF_FC
    CUTOFF_CHIPDIFF  : rows are flagged when val_ylab - val_xlab exceeds it
    innerRadius      : written into `params__peakBW['innerRadius']`

    Side effects: writes several files in the current directory, links
    the selected peaks to 'output/<name>.bed' and records
    cluFile/peakFile/peakFileOrig in 'FILE.json'.

    Returns (figs, clu): an OrderedDict of figures and the DataFrame
    holding the per-peak values plus the boolean 'clu' column.

    NOTE: local variable names matter here -- they are consumed through
    `locals()` by `.format` / `pyutil.fileDict__save`, and `@peakIndex`
    is resolved by pandas from this frame.
    '''
    figs = pyutil.collections.OrderedDict()
    if control is not None and treatment is not None:
        xlab, ylab = control, treatment
    if xlab is None or ylab is None:
        xlab, ylab = bwCurr.index
    elif bwCurr is None:
        bwCurr = bwMeta.reindex([xlab, ylab])

    if params__peakBW is None:
        params__peakBW = dict(
            outerRadius=500,
            innerRadius=innerRadius,
            NCORE=NCORE,
            outIndex=bwCurr.header,
            # detailByCHIP = 0,
        )
    # NOTE(review): applied unconditionally here, so a caller-supplied
    # dict is mutated and its innerRadius overridden -- confirm intended
    params__peakBW['innerRadius'] = innerRadius
    if name is None:
        name = '{xlab}-{ylab}'.format(**locals())
    # bwCurr = bwMeta
    # bwCurr = bwCurr.loc[[xlab,ylab]]
    # bwCurr.npkFile

    # QC plot: fold change against its percentile for each peak file
    dfs = map(
        sdio.extract_peak,
        bwCurr.npkFile,
    )
    fig, ax = plt.subplots(1, 1, figsize=[7, 7])
    # ax = plt.gca()
    for df in dfs:
        df['per_FC'] = pyutil.dist2ppf(df.FC)
        df.plot.scatter('per_FC', 'FC', ax=ax)

    # presumably queryCopy returns the name of the filtered copy; the
    # results are joined and fed to `cat` below
    fnames = [
        pyutil.queryCopy(infile=fname,
                         query='FC>%.3f' % CUTOFF_FC,
                         reader=sdio.extract_peak,
                         inplace=False) for fname in bwCurr.npkFile
    ]
    # dfs[1]
    peakFlat = ' '.join(fnames)
    ofname = '%s-combined.bed' % ('-'.join(bwCurr.index))
    pyutil.shellexec('cat {peakFlat}>{ofname}'.format(**locals()))
    ofname = sdio.npk_expandSummit(fname=ofname, radius=1)
    # return value is discarded
    pyutil.lineCount(ofname)
    peakFileOrig = peakFile = ofname

    res = sjob.figs__peakBW(peakFile=peakFile,
                            bwFiles=bwCurr.RPKMFile,
                            name=name,
                            **params__peakBW)
    figs.update(res[0])
    bwTrack, bwAvg = res[1]
    # translate column labels from `header` values back to index labels
    bwAvg.columns = bwAvg.columns.map(
        pyutil.df2mapper(bwCurr, 'header', 'index').get)
    # .set_index('RPKMFile').loc[bwAvg.columns].
    # bwAvg.columns = bwCurr.index
    xs, ys = bwAvg[[xlab, ylab]].values.T
    # clu = None
    # peakIndex = pyutil.df__pad(bwAvg).query(query).index
    clu = pd.DataFrame(pyutil.df__pad(bwAvg))
    # the query uses 'val_'-prefixed columns, assumed to come from
    # df__pad -- TODO confirm
    query = ' val_{ylab} - val_{xlab} > {CUTOFF_CHIPDIFF} '.format(**locals())
    qsans = pyutil.sanitise_query(query)
    peakIndex = clu.query(query).index
    clu['clu'] = clu.eval('index in @peakIndex')
    stats = sdio.extract_peak(peakFile).set_index('acc', drop=0)
    # left-hand side of the query, i.e. the difference itself
    stats['CHIPDIFF'] = clu.eval(query.split('>')[0])
    pyvis.qc_2var(xs, ys, clu=clu.clu, xlab=xlab, ylab=ylab)
    figs['scatterPlot__%s' % name] = plt.gcf()
    cluFile = ofname = qsans + '.csv'
    clu.to_csv(ofname)
    print(ofname, pyutil.lineCount(ofname))
    peakBase = pyutil.getBname(peakFile)
    ofname = '{peakBase}-{qsans}.bed'.format(**locals())
    peakFile = pyutil.to_tsv(stats.reindex(peakIndex), ofname)
    pyutil.shellexec('mkdir -p output/')
    pyutil.file__link(ofname, 'output/%s.bed' % name, force=True)
    # peakFile = pyutil.queryCopy(peakFile,
    #     query='acc in @peakIndex',
    #     reader=sdio.extract_peak,
    #     peakIndex=peakIndex,
    # )
    # peakFile = '{peakFile}-{qsans}.bed'
    # pyutil.fileDict__main(ofname='FILE.json',
    #     **pyutil.dictFilter(locals(),
    #         keys=['cluFile','peakFile',
    #               'peakFileOrig']
    #     ))
    pyutil.fileDict__save(d=locals(),
                          keys=['cluFile', 'peakFile', 'peakFileOrig'],
                          fname='FILE.json')
    return figs, clu
def main(
        #### necessary
        bedFile=None,
        bwFiles=None,
        ####
        DIR=None,
        figsize=[14, 14],
        debug=0,
        ylim=[0, 10],
        radius=2000,
        stepSize=10,
        NCORE=4,
        silent=0,
        gtfFile=None,
        cdsFile=None,
        annotation=None,
        GSIZE=None,
        center_summit=0,
        trackNames=None,
        backend='fluff',
        ext='png',
        **kwargs):
    '''Draw one snapshot per interval of `bedFile` over the bigwig tracks.

    bedFile    : BED/peak file listing the intervals (required)
    bwFiles    : iterable of bigwig files (required)
    DIR        : output directory; None -> 'PROG=chipShots_bedFile=<bname>'
                 in the current directory, 'inplace' -> the same name
                 inside the directory of `bedFile`
    backend    : 'fluff' (default) or 'synotil'
    gtfFile    : annotation, read by the 'synotil' backend only
    cdsFile    : defaults to '<gtfFile>.cds' when a gtfFile is given
    annotation, ext, trackNames : forwarded to the fluff worker
    figsize, ylim, radius, stepSize, GSIZE, center_summit, debug :
                 forwarded to the synotil helpers
    NCORE      : number of worker processes

    The list defaults of `figsize`/`ylim` are never mutated in place.

    Returns (indexFile, htmlFile); htmlFile is None when the html page
    could not be produced.
    Raises ValueError for an unknown `backend`.
    '''
    if backend not in ('synotil', 'fluff'):
        # previously fell through to a NameError on `ofnames`
        raise ValueError('unknown backend: %r' % (backend,))

    # a real list (not a lazy map object) so it survives being shipped to
    # worker processes
    figsize = [int(x) for x in figsize]

    prefix = 'PROG=chipShots_bedFile='
    bname = pyutil.os.path.basename(bedFile)
    odname = prefix + bname
    if DIR == 'inplace':
        # join, not '+': otherwise 'a/b.bed' produced 'aPROG=...' next to
        # the directory instead of inside it
        DIR = pyutil.os.path.join(pyutil.os.path.dirname(bedFile), odname)
    elif DIR is None:
        DIR = odname
    pyutil.shellexec('mkdir -p %s' % DIR, silent=silent)
    DIR = pyutil.os.path.abspath(DIR)

    # only derive the cds file when there is a gtf to derive it from; the
    # fluff backend needs neither
    if cdsFile is None and gtfFile is not None:
        cdsFile = gtfFile + '.cds'

    if backend == 'synotil':
        nearFile = synotil.filterByCDS.main(
            peakFile=bedFile,
            cdsFile=cdsFile,
            downStream=radius,
            upStream=radius,
            peakRadius=1,
            GSIZE=GSIZE,
            center_summit=center_summit,
        )
        df_near = pyutil.readData(nearFile, )

        stderrLine('[MSG]Loading bed intervals from bigwig tracks....')
        chipTracks = sutil.extract_bigwig_multiple(
            fnames=bwFiles,
            bedFile=bedFile,
            radius=radius,
            stepSize=stepSize,
            callback=None,
            outIndex=trackNames,
            center_summit=center_summit,
            shift=0,  #### use positive coordinate
            stranded=False,
            NCORE=NCORE)

        if ylim is None:
            ylim = pyutil.span(
                pyutil.np.hstack([x.values.flat for x in chipTracks]), 99)
            ylim = list(ylim)
            ylim[0] = 0.
        chipTracks = [prepare_chipTrack(ele, vlim=ylim) for ele in chipTracks]

        if debug:
            stderrLine(chipTracks[0].columns)

        gtf = pyutil.readData(gtfFile, ext='tsv', header=None, guess_index=0)
        gtf = scount.countMatrix(gtf, look='gtf')
        gtfs = [gtf]

        bedDF = sutil.extract_peak(bedFile)

        worker = pyutil.functools.partial(
            worker__drawPeak,
            DIR=DIR,
            chipTracks=chipTracks,
            df_near=df_near,
            gtfs=gtfs,
            radius=radius,
            figsize=figsize,
            ylim=ylim,
            debug=debug,
        )
        ofnames = pyutil.mp_map(
            worker,
            bedDF.acc,
            n_cpu=NCORE,
        )

    else:  # backend == 'fluff'
        bedDF = sdio.extract_peak(bedFile)
        argDF = bedDF.copy()
        argDF = sdio.bed__addCol__interval(argDF)

        tracks = list(bwFiles)
        argDF['tracks'] = [tracks] * len(bedDF)
        argDF['annotation'] = annotation
        argDF['DIR'] = DIR
        argDF['ext'] = ext
        if trackNames is not None:
            argDF['labels'] = [list(trackNames)] * len(bedDF)

        # one mapping per row; `_asdict()` is the portable spelling of
        # `vars()` on a namedtuple (the latter fails on Python 3)
        ofnames = pyutil.mp_map(
            worker__fluff,
            (row._asdict() for row in argDF.itertuples()),
            n_cpu=NCORE,
        )

    bedDF['img'] = ofnames

    # the index is written twice: once named after the bed file, once
    # under the fixed name that is handed to shot2html
    indexFile = '%s/%s.index.tsv' % (DIR, bname)
    pyutil.to_tsv(bedDF, indexFile)
    indexFile = '%s/figureIndex.tsv' % (DIR)
    pyutil.to_tsv(bedDF, indexFile)

    # the html page is best-effort: failure is reported, not fatal
    try:
        import synotil.shot2html as shot2html
        htmlFile = shot2html.shot2html(indexFile, localPath=True)
    except Exception as e:
        stderrLine('[WARN]:cannot produce html :%s' % e)
        htmlFile = None

    print(indexFile)
    print(htmlFile)
    return (indexFile, htmlFile)
# execfile('/home/feng/headers/header__import.py')
import pymisca.util as pyutil

# Build a de-duplicated, sorted index of the touched files.
dfc = pyutil.readData(pyutil.base__file('TOUCHED.list'), ext='tsv', header=None)
ind = dfc.query('~index.duplicated()').sort_index()
ind.to_csv(pyutil.base__file('file.index', force=1))

# Shell steps, run in order from $BASE:
#   1. pack every indexed path starting with 'RNA' into RNA-seq.tar.gz
#   2. rebuild tracking.index from the remaining entries plus the
#      archives, index, txt and list files
#   3. copy everything listed in tracking.index into dist/
# NOTE(review): step 3 runs `echo mkdir -p dist`, which only prints the
# command; presumably `mkdir -p dist` was intended -- confirm.
for script in (
        '''
cd $BASE
cat file.index | grep ^RNA | xargs tar -cvzf RNA-seq.tar.gz
''',
        '''
cd $BASE
echo > tracking.index
{
cat file.index | grep -v ^RNA-seq
echo *.tar.gz
echo *.index
echo *.txt
echo *.list
echo "Snakefile README"
} >> tracking.index
''',
        '''
cd $BASE
echo mkdir -p dist;
cat tracking.index | xargs cp -avuf --parents -t dist
''',
):
    pyutil.shellexec(script)
def file__header(fname, head=10, silent=1):
    '''Return the first `head` lines of `fname` wrapped in a StringIO
    buffer.
    '''
    cmd = 'head -n{head} {fname}'.format(head=head, fname=fname)
    return pyutil.StringIO.StringIO(pyutil.shellexec(cmd, silent=silent))
dct = msg.raw.copy() #### record time to precision # keys = ['create_time','receive_time','latency'] # for k in keys: # dct[k] = getattr(msg,k) json.dump(dct, f) f.write('\n') except Exception as e: print('[ERROR]', e) # print (dct) print(msg) # json.dump return dumpMsg if __name__ == '__main__': bot = wxpy.Bot() logDir = '%s-%s' % ( bot.self.uin, bot.self.name, ) pyutil.shellexec(u'mkdir -p {logDir}'.format(**locals())) logFile = '%s/messages.json' % logDir f = open(logFile, 'a', 0) callback = make__dumpMsg(f=f) chats = bot.friends(update=True) + bot.groups(update=True) bot.register(except_self=False, chats=chats)(callback) if not pyutil.hasIPD: wxpy.embed()
def process(k=None, npkFile=None, gPar=None, dbg=0, ANNOTATION_FILE=None):
    '''Filter one narrowPeak file and list the genes hit by its peaks.

    k               : condition key; derived from the file name of
                      `npkFile` (extension dropped) when omitted
    npkFile         : narrowPeak file to filter
    gPar            : parameter mapping; 'PVALUE', 'QVALUE' and
                      'TARGET_RANGE' are read here, the rest by
                      `make_param_string`
    dbg             : unused
    ANNOTATION_FILE : annotation handed to bedtools

    Steps: run PEAK_SELECT_SCRIPT to produce '<k>.snpk'; pad the
    annotation by TARGET_RANGE and run `bedtools closest` (genome file
    taken from $GSIZE); keep distance-0 hits, best FC first, one row per
    'hit'; write '<k>.bedmap.tsv' and '<SUMMARY_DIR>/<k>.gene.txt'.

    Returns a dict describing the produced files.
    Raises AssertionError when `npkFile` is missing while `k` is None, or
    when bedtools produced no output.
    '''
    if k is None:
        assert npkFile, 'must specify one arg'
        k = npkFile.rsplit('.', 1)[0].split('/')[-1]

    outd = {'files': {}}

    ### PeakFiltering
    parameter_string = make_param_string(gPar)
    small_narrowPeak = '{key}.snpk'.format(key=k)
    cmd = 'python {SCRIPT} {INFILE} {PARAM} > {OUTF}'.format(
        SCRIPT=PEAK_SELECT_SCRIPT,
        INFILE=npkFile,
        PARAM=parameter_string,
        OUTF=small_narrowPeak,
    )
    print(cmd)
    os.system(cmd)
    outd['param'] = parameter_string

    #### Fancy Histogram
    fc_thresholds = [x * 0.1 for x in range(10, 10 * MAX_FOLD_CHANGE, 2)]
    # the plotting consumer is disabled; the call itself is kept
    npeak_lst = calc_npeak(fc_thresholds, k + '_peaks.narrowPeak',
                           gPar['PVALUE'], gPar['QVALUE'])
    # rstrip (not strip): only the trailing slash is redundant; stripping
    # the leading one would turn an absolute SUMMARY_DIR into a relative
    # path, unlike `fname` below
    plotName = SUMMARY_DIR.rstrip('/') + '/' + 'npeaks_vs_fc_' + k + '.txt'

    #### Produce geneLists
    file_bedmap = '%s.bedmap.tsv' % k
    # two shell commands: pad + sort the annotation, then query it
    cmd = '\n'.join([
        'bedtools slop -b {RANGE} -i {ANNO} -g $GSIZE |bedtools sort > {ANNOBASE}.{RANGE}',
        'bedtools closest -d -a {SNPK} -b {ANNOBASE}.{RANGE} | tee {FOUT}.tmp',
    ]).format(ANNO=ANNOTATION_FILE,
              ANNOBASE=ANNOTATION_FILE.split('/')[-1],
              SNPK=small_narrowPeak,
              RANGE=gPar['TARGET_RANGE'],
              FOUT=file_bedmap)
    res = pyutil.shellexec(cmd)
    if not res:
        # explicit raise: a bare `assert` would vanish under `python -O`
        raise AssertionError(' Buffer is empty, check error msg')
    df = sutil.parseBedClosest(fname=StringIO.StringIO(res))

    df['condition'] = k
    df = df[df['distance'] == 0]
    df = df.sort_values('FC', ascending=False, inplace=False)
    #### deduplication on gene acc (keeps the highest FC per gene)
    df = df.loc[~df.duplicated('hit')]
    df.to_csv(file_bedmap, sep='\t')

    outd['genes'] = None
    outd['nGene'] = len(df['hit'].unique())
    outd['file_bedmap'] = file_bedmap

    fname = '%s/%s.gene.txt' % (SUMMARY_DIR, k)
    dfc = df.copy()[[
        'hit',
        'FC',
        'acc',
    ]]
    dfc.columns = ['geneAcc', 'maxFoldChange', 'peakAcc']
    dfc.to_csv(fname, sep='\t')

    outd['glst_filename'] = fname
    outd['goenrich_filename'] = 'NotImplemented'
    outd['plot_file'] = plotName
    outd['peak_file'] = small_narrowPeak
    outd['key'] = k
    outd['extra'] = ''
    return outd
def check_DIR(self, DIR=None):
    '''Create `DIR` (default: `self.DIR`) with `mkdir -p` and return it.'''
    if DIR is None:
        DIR = self.DIR
    pyutil.shellexec('mkdir -p %s' % DIR)
    return DIR
def summitDist(peak1, peak2, CUTOFF=400, silent=1, GSIZE=None, as_fname=0, **kwargs):
    '''Find nearby summits within a distance cutoff.

    Both inputs are padded by CUTOFF // 2 - 1 bp with `bedtools slop`
    (padded copies are written to the current directory) and intersected
    with `bedtools intersect -wo`.  The 'distance' column of the result
    is replaced by CUTOFF minus its value.

    GSIZE    : genome file, falls back to the GSIZE environment variable
    as_fname : return the output file name instead of the DataFrame
    kwargs   : accepted but unused

    The table is always written to
    'infiles:<base1>:<base2>__cutoff:<CUTOFF>.tsv'.
    '''
    if GSIZE is None:
        GSIZE = pyutil.os.environ.get('GSIZE', None)
    assert GSIZE is not None

    RANGE = CUTOFF // 2 - 1
    infiles = [peak1, peak2]
    # only needed by the disabled overlap-based variant; still evaluated
    # so behaviour is unchanged
    incols = [pyutil.file_ncol(path) for path in infiles]

    ### padding/inflate the summit to have radius
    padded = []
    for infile in infiles:
        ofile = "{infile}.{RANGE}".format(
            infile=infile, RANGE=RANGE).split('/')[-1]
        padded.append(ofile)
        cmd = ("bedtools slop -g {GSIZE} -b {RANGE} -i {infile} "
               "| tee {ofile}").format(
                   GSIZE=GSIZE, RANGE=RANGE, infile=infile, ofile=ofile)
        pyutil.shellexec(cmd, silent=silent)
    slop1, slop2 = padded

    FOUT = 'infiles:'+ ":".join(map(pyutil.basename,infiles)) \
        + "__cutoff:{}.tsv".format(CUTOFF)
    cmd = ("bedtools intersect -wo -a {slop1} -b {slop2} "
           "| tee {FOUT}").format(slop1=slop1, slop2=slop2, FOUT=FOUT)
    buf = pyutil.shellexec(cmd, silent=silent)

    ### [TBC]Memory-intensive, Replace with awk mutation in the future
    columns = header_closest(peak1, peak2)
    df = pyutil.readData(StringIO.StringIO(buf),
                         header=None,
                         ext='tsv',
                         guess_index=False,
                         columns=columns)
    df.distance = CUTOFF - df.distance
    df.to_csv(FOUT, sep='\t', index=False)

    return FOUT if as_fname else df