def sj02bw(sj0, pathpre, genome, np=12):
    """Write per-strand junction bigwig files from a junction table.

    The work is split by chromosome: each chromosome's rows are handed to
    ``sj02wig`` (through ``UT.process_mp``) together with a file-name
    template ``<pathpre>.<chr>.{0}.wig``. For each strand in '+', '-', '.'
    the per-chromosome wiggle files (template formatted with the strand) are
    concatenated into ``<pathpre>.sj<s>.wig`` and converted by ``wig2bw``
    into ``<pathpre>.sj<s>.bw``, where ``<s>`` is ``STRANDMAP0[strand]``.

    Args:
        sj0: Pandas DataFrame with a 'chr' column and either a 'jcnt' column
            or both 'ucnt' and 'mcnt'. When 'jcnt' is missing it is added
            to ``sj0`` in place as ``ucnt + mcnt``.
        pathpre: path prefix used for every temporary and output file
        genome: genome identifier passed to the ``UT`` chromosome helpers
        np: number of CPU to use

    Returns:
        None. The bigwig files are the output; every intermediate wiggle
        file is removed before returning.
    """
    known = UT.chroms(genome)
    # largest chromosomes first (presumably so the slow ones start early)
    chromdf = UT.chromdf(genome).sort_values('size', ascending=False)
    chroms = [c for c in chromdf['chr'] if c in known]
    chromdic = UT.df2dict(chromdf, 'chr', 'size')
    if 'jcnt' not in sj0:
        sj0['jcnt'] = sj0['ucnt'] + sj0['mcnt']

    # '{{0}}' survives this format() as '{0}', the slot for the strand
    templates = ['{0}.{1}.{{0}}.wig'.format(pathpre, c) for c in chroms]
    args = [(sj0[sj0['chr'] == c], c, chromdic[c], tmpl)
            for c, tmpl in zip(chroms, templates)]
    UT.process_mp(sj02wig, args, np=np, doreduce=False)

    chromsizes = UT.chromsizes(genome)  # same for every strand: look up once
    rmfiles = []
    for strand in ['+', '-', '.']:
        s = STRANDMAP0[strand]
        wig = pathpre + '.sj{0}.wig'.format(s)
        bwpath = pathpre + '.sj{0}.bw'.format(s)
        with open(wig, 'w') as dst:
            for tmpl in templates:
                f = tmpl.format(strand)
                with open(f, 'r') as src:
                    shutil.copyfileobj(src, dst)
                rmfiles.append(f)
        rmfiles.append(wig)
        wig2bw(wig, chromsizes, bwpath)

    # Every merged wig is already in rmfiles, so no separate unlink of the
    # last one is needed (a second unlink of the same path would raise).
    for f in rmfiles:
        if os.path.exists(f):
            os.unlink(f)
def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge bigwig files chromosome by chromosome using multiple CPUs.

    ``merge_bigwigs_chr`` is run once per chromosome and is expected to
    return ``(chrom, wigpath)`` pairs (the results are turned into a dict
    keyed by chromosome). The per-chromosome wiggle files are concatenated
    into ``<dstpath>.wig`` and converted to a bigwig at ``dstpath``.

    Args:
        bwfiles: bigwig inputs, passed through to ``merge_bigwigs_chr``
        genome: genome identifier passed to the ``UT`` chromosome helpers
        dstpath: path of the resulting bigwig; also the prefix of temp files
        scale: passed through unchanged to ``merge_bigwigs_chr``
        np: number of CPU to use

    Returns:
        None. Temporary wiggle files are deleted when they exist.
    """
    def chrom_wig(chrom):
        # temp wiggle path for one chromosome
        return dstpath + '.{0}.wig'.format(chrom)

    chroms = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    chromsizes = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    # biggest chromosomes first, so that a large one (e.g. chrX) is not left
    # running alone at the end while the other workers sit idle
    by_size = sorted(((chromsizes[c], c) for c in chroms), reverse=True)
    chroms = [c for _, c in by_size]

    args = [(bwfiles, c, chromsizes[c], chrom_wig(c), scale) for c in chroms]
    results = UT.process_mp(merge_bigwigs_chr, args, np, doreduce=False)
    produced = dict(results)

    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as dst:
        for c in chroms:
            with open(produced[c], 'rb') as src:
                shutil.copyfileobj(src, dst)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # remove per-chromosome pieces first, then the concatenated wiggle
    leftovers = [chrom_wig(c) for c in chroms]
    leftovers.append(wigpath)
    for path in leftovers:
        if os.path.exists(path):
            os.unlink(path)
def bw2bed_mp(bwfile, bedfile, chroms, th, np=4):
    """ Multi-CPU version of bw2bed.

    Runs ``bw2bed`` once per chromosome, concatenates the files it returns
    and compresses the concatenation with ``UT.compress``.

    Args:
        bwfile: input bigwig path (passed through to ``bw2bed``)
        bedfile: output path; a trailing '.gz' is stripped to get the name
            of the uncompressed concatenation
        chroms: chromosomes to process, one task each
        th: threshold passed through to ``bw2bed``
        np: number of CPU to use

    Returns:
        Whatever ``UT.compress`` returns for the concatenated file
        (presumably the path of the gzipped bed - confirm).
    """
    # the trailing False is presumably bw2bed's "compress" switch - confirm
    args = [(bwfile, bedfile + '.{0}.bed.gz'.format(chrom), [chrom], th, False)
            for chrom in chroms]
    chromfiles = UT.process_mp(bw2bed, args, np=np, doreduce=False)

    if bedfile[-3:] == '.gz':
        bedbase = bedfile[:-3]
    else:
        bedbase = bedfile

    # bedtools gzip problem: it only processes the first member when gzipped
    # files are simply concatenated => concatenate the uncompressed pieces
    # and gzip the whole thing at the end
    with open(bedbase, 'wb') as dst:
        for part in chromfiles:
            with open(part, 'rb') as src:
                shutil.copyfileobj(src, dst)
    compressed = UT.compress(bedbase)

    # per-chromosome pieces are no longer needed
    for part in chromfiles:
        os.unlink(part)
    return compressed
def __call__(self):
    """Run ``filter_sj`` per chromosome and concatenate its outputs.

    One task per chromosome of ``self.genome`` is dispatched with
    ``self.np`` CPUs. Afterwards the files
    ``<bwsjpre>.sjpath.<chr>.filtered.bed.gz`` are copied, in chromosome
    order, into ``<bwsjpre>.sjpath.filtered.bed.gz``.
    """
    chroms = UT.chroms(self.genome)
    sizes = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
    args = [(self.bwsjpre, self.statspath, c, sizes[c], self.params)
            for c in chroms]
    UT.process_mp(filter_sj, args, np=self.np, doreduce=False)

    # NOTE(review): the pieces are copied byte-for-byte; if they really are
    # gzip files the result is a multi-member gzip - confirm that every
    # downstream reader handles that.
    merged = self.bwsjpre + '.sjpath.filtered.bed.gz'
    with open(merged, 'wb') as dst:
        for c in chroms:
            piece = self.bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(c)
            with open(piece, 'rb') as src:
                shutil.copyfileobj(src, dst)
def calculate(self, unionexbed, addcols=['_id','_gidx'], np=10):
    """ Calculate PhyloCSF score, one task per chromosome.

    Args:
        unionexbed: Pandas DataFrame bed containing unioned exons
        addcols: additional cols to copy from unionexbed (other than chr,st,ed)
        np: number of CPU to use

    Returns:
        Pandas DataFrame: the per-chromosome results of ``calc_worker``
        concatenated with a fresh index.
    """
    columns = ['chr', 'st', 'ed'] + addcols
    chrcol = unionexbed['chr']
    # every task gets its own slice copy and its own Phylo60 built from self.path
    args = [
        (unionexbed.loc[chrcol == chrom, columns].copy(), Phylo60(self.path))
        for chrom in chrcol.unique()
    ]
    per_chrom = UT.process_mp(calc_worker, args, np=np, doreduce=False)
    return PD.concat(per_chrom, ignore_index=True)
def estimatecovs(modelpre, bwpre, dstpre, genome, tcovth=1, np=6):
    """Estimate coverages in genomic bundles processed in parallel.

    Reads ``<modelpre>.paths.withse.bed.gz``, unions contiguous intervals
    per chromosome and groups them into batches of about 1000 intervals.
    Each batch becomes one bundle ``(chrom, st, ed)``, padded by 100 on
    both sides and clipped to [0, chromosome size]. ``bundle_estimator`` is
    run on each bundle and ``concatenate_bundles`` is called at the end.

    Args:
        modelpre: path prefix of the model files
        bwpre: passed through to ``bundle_estimator``
        dstpre: destination prefix, passed to ``bundle_estimator`` and
            ``concatenate_bundles``
        genome: genome identifier passed to ``UT.chromdf``
        tcovth: passed through to ``bundle_estimator``
        np: number of CPU to use
    """
    batch = 1000   # intervals per bundle (roughly 30K in total)
    margin = 100   # padding added on both sides of a bundle

    bed = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    csizedic = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    bundles = []
    args = []
    for chrom in bed['chr'].unique():
        sub = bed[bed['chr'] == chrom]
        uc = UT.union_contiguous(sub[['chr', 'st', 'ed']], returndf=True)
        last = len(uc) - 1
        for sti in range(0, len(uc), batch):
            # NOTE(review): edi is used inclusively and equals the next
            # batch's sti, so consecutive bundles share one interval -
            # confirm whether this overlap is intended.
            edi = min(sti + batch, last)
            st = max(uc.iloc[sti]['st'] - margin, 0)
            ed = min(uc.iloc[edi]['ed'] + margin, csizedic[chrom])
            args.append([modelpre, bwpre, chrom, st, ed, dstpre, tcovth])
            bundles.append((chrom, st, ed))
    UT.process_mp(bundle_estimator, args, np=np, doreduce=False)
    concatenate_bundles(bundles, dstpre)
def count_repeats_mp(beddf, genomefastaobj, col='#repbp', returnseq=False, seqcol='seq', idfld='_id', np=4):
    """ MultiCPU version of counts_repeats.

    ``beddf`` is cut into ``np`` consecutive row slices, ``count_repeats``
    is run on each slice, and the results are copied back into ``beddf``
    by matching on the id column.

    Args:
        beddf: Pandas DataFrame; modified in place (id column added when
            missing, result column(s) written)
        genomefastaobj: passed through to ``count_repeats``
        col: name of the result column
        returnseq: when true, ``seqcol`` is copied back as well
        seqcol: name of the sequence column
        idfld: name of the row id column; created as 0..len-1 when absent
        np: number of CPU to use (also the number of slices)

    Returns:
        ``beddf`` itself.
    """
    if idfld not in beddf:
        beddf[idfld] = N.arange(len(beddf))

    # rows per CPU, rounded up so that np slices cover every row
    per_cpu = int(N.ceil(len(beddf) / float(np)))
    args = []
    for i in range(np):
        part = beddf.iloc[i * per_cpu:(i + 1) * per_cpu]
        args.append((part, genomefastaobj, col, returnseq, seqcol))

    parts = UT.process_mp(count_repeats, args, np=np, doreduce=False)
    df = PD.concat(parts, ignore_index=True)

    def copy_back(column):
        # id -> value lookup, applied in beddf's own row order
        lookup = UT.df2dict(df, idfld, column)
        beddf[column] = [lookup[x] for x in beddf[idfld]]

    copy_back(col)
    if returnseq:
        copy_back(seqcol)
    return beddf
def count_repeats_viz_mp(beddf, rmskvizpath, idcol='_id', np=3, prefix=None, expand=0, col='repnames'):
    """Use rmsk-viz track and check each (unioned) exon overlaps with repeats and
    report repeat name(s). Uses Bedtools and calculates chromosome-wise.

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols, when calculating repeats bp
         for genes, unioned bed should be used (use utils.make_unionex)
        idcol: colname for unique row id (default _id)
        rmskvizpath: path to repeat masker viz BED7 file (created using rmskviz2bed7)
        np: number of CPU to use
        prefix: path prefix for temp files; if not None temp files are kept.
         When None a random prefix next to rmskvizpath is used and the temp
         files are deleted at the end. (default None)
        expand: how many bases to expand exon region in each side (default 0)
        col: column name to put in overlapping repeat names (if multiple comma separated)

    Outputs:
        are put into beddf columns with colname col (default repnames);
        beddf itself is returned.
    """
    cleanup = prefix is None
    if cleanup:
        prefix = os.path.join(os.path.dirname(rmskvizpath), str(uuid.uuid4()) + '_')

    def rmsk_chr_path(chrom):
        # per-chromosome repeat file, reused between runs
        return rmskvizpath + '.{0}.bed.gz'.format(chrom)

    # chrom-wise
    chroms = sorted(beddf['chr'].unique())

    # the repeat track has to be split when any per-chromosome file is missing
    splitrmsk = not all(os.path.exists(rmsk_chr_path(c)) for c in chroms)
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args, bfiles, ofiles = [], [], []
    for chrom in chroms:
        bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        rpath = rmsk_chr_path(chrom)
        opath = prefix + 'out.{0}.bed'.format(chrom)

        bchr = beddf[beddf['chr'] == chrom]
        if expand > 0:
            # work on a copy so that beddf keeps its original coordinates
            bchr = bchr.copy()
            bchr['st'] = bchr['st'] - expand
            bchr['ed'] = bchr['ed'] + expand
            bchr.loc[bchr['st'] < 0, 'st'] = 0
        UT.write_pandas(bchr[['chr', 'st', 'ed', idcol]], bpath, '')
        bfiles.append(bpath)

        if splitrmsk:
            rchr = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(rchr[['chr', 'st', 'ed', 'name', 'strand']], rpath, '')

        ofiles.append(opath)
        args.append([bpath, rpath, opath])

    UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

    # gather outputs
    cols = ['name', 'repnames']
    df = PD.concat([UT.read_pandas(f, names=cols) for f in ofiles],
                   ignore_index=True)
    # ids are compared as strings on both sides of the lookup
    df['name'] = df['name'].astype(str)
    i2rn = UT.df2dict(df, 'name', 'repnames')
    beddf[col] = [i2rn[str(x)] for x in beddf[idcol]]

    # cleanup (only the temp target/output files, never the split rmsk files)
    if cleanup:
        for f in bfiles + ofiles:
            os.unlink(f)
    return beddf