Example #1
File: bedtools.py Project: king1212/jGEM
def sj02bw(sj0, pathpre, genome, np=12):
    chroms = UT.chroms(genome)
    # order chromosomes largest-first so the big ones are dispatched to workers first
    chromdf = UT.chromdf(genome).sort_values('size', ascending=False)
    chroms = [x for x in chromdf['chr'] if x in chroms]
    chromdic = UT.df2dict(chromdf, 'chr', 'size')
    if 'jcnt' not in sj0:
        # jcnt: total junction count (ucnt + mcnt)
        sj0['jcnt'] = sj0['ucnt'] + sj0['mcnt']
    files = []
    args = []
    for c in chroms:
        # per-chromosome wig path template; '{0}' is filled in with the strand later
        f = '{0}.{1}.{{0}}.wig'.format(pathpre, c)
        args.append((sj0[sj0['chr'] == c], c, chromdic[c], f))
        files.append(f)
    # write the per-chromosome, per-strand wig files in parallel
    rslts = UT.process_mp(sj02wig, args, np=np, doreduce=False)
    rmfiles = []
    for strand in ['+', '-', '.']:
        s = STRANDMAP0[strand]
        wig = pathpre + '.sj{0}.wig'.format(s)
        bwpath = pathpre + '.sj{0}.bw'.format(s)
        # concatenate the per-chromosome wigs for this strand into a single wig ...
        with open(wig, 'w') as dst:
            for tmpl in files:
                f = tmpl.format(strand)
                with open(f, 'r') as src:
                    shutil.copyfileobj(src, dst)
                rmfiles.append(f)
        rmfiles.append(wig)
        # ... then convert the merged wig to bigwig
        wig2bw(wig, UT.chromsizes(genome), bwpath)
    # clean up temporary files (the merged per-strand wigs are already in rmfiles)
    for f in rmfiles:
        os.unlink(f)
    
Example #2
File: bigwig.py Project: king1212/jGEM
def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    chroms = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    chromsizes = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    # reorder chroms largest-first so a big chromosome (e.g. chrX) doesn't end up
    # being processed alone at the end, wasting MP time (illustrated after this example)
    tmp = sorted([(chromsizes[c], c) for c in chroms])[::-1]
    chroms = [x[1] for x in tmp]
    args = [(bwfiles, c, chromsizes[c], dstpath + '.{0}.wig'.format(c), scale)
            for c in chroms]

    rslts = UT.process_mp(merge_bigwigs_chr, args, np, doreduce=False)

    dic = dict(rslts)
    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as dst:
        for c in chroms:
            with open(dic[c], 'rb') as src:
                shutil.copyfileobj(src, dst)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # clean up
    for c in chroms:
        f = dstpath + '.{0}.wig'.format(c)
        if os.path.exists(f):
            os.unlink(f)
    if os.path.exists(wigpath):
        os.unlink(wigpath)
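
The size-descending reorder flagged in the comment above is what keeps the worker pool busy: the largest chromosomes are dispatched first and the small ones fill the gaps at the end. A standalone sketch of that reorder, with invented chromosome names and sizes:

# toy illustration of the load-balancing reorder used in merge_bigwigs_mp;
# the chromosome sizes below are made up for the example
chromsizes = {'chr1': 195000000, 'chrX': 171000000, 'chr19': 61000000, 'chrM': 16000}
chroms = ['chr19', 'chrM', 'chrX', 'chr1']
tmp = sorted([(chromsizes[c], c) for c in chroms])[::-1]   # sort by (size, name), largest first
chroms = [x[1] for x in tmp]                               # keep only the names
print(chroms)  # ['chr1', 'chrX', 'chr19', 'chrM']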
Example #3
File: bigwig.py Project: king1212/jGEM
def bw2bed_mp(bwfile, bedfile, chroms, th, np=4):
    """ multi CPU version of bw2bed """

    args = []
    files = []
    for chrom in chroms:
        bedchromfile = bedfile + '.{0}.bed.gz'.format(chrom)
        files.append(bedchromfile)
        args.append((bwfile, bedchromfile, [chrom], th, False))

    rslts = UT.process_mp(bw2bed, args, np=np, doreduce=False)

    # concatenate gz files
    bedbase = bedfile[:-3] if bedfile[-3:] == '.gz' else bedfile
    with open(bedbase, 'wb') as dst:
        for f in rslts:
            with open(f, 'rb') as src:
                shutil.copyfileobj(src, dst)
    # !!! bedtools gzip problem again !!!
    # bedtools only processes the first gzip member if gzipped files are simply concatenated
    # => concatenate the uncompressed files and gzip the whole thing at the end
    # (the workaround is sketched after this example)
    bedfile = UT.compress(bedbase)

    # clean up temp files
    for f in rslts:
        os.unlink(f)

    return bedfile
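
The comments above describe the workaround for the bedtools gzip issue: concatenating gzipped files yields a multi-member gzip stream, which bedtools reportedly stops reading after the first member, so the helper concatenates the uncompressed chunks and compresses once at the end. A minimal standalone sketch of that pattern using only the standard library (the file names are placeholders):

import gzip
import shutil

parts = ['tmp.chr1.bed', 'tmp.chr2.bed']        # placeholder per-chromosome text files
merged = 'merged.bed'
with open(merged, 'wb') as dst:                 # 1) concatenate the *uncompressed* chunks
    for p in parts:
        with open(p, 'rb') as src:
            shutil.copyfileobj(src, dst)
with open(merged, 'rb') as src:                 # 2) gzip the whole file once at the end
    with gzip.open(merged + '.gz', 'wb') as dst:
        shutil.copyfileobj(src, dst)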
Example #4
File: merge2.py Project: king1212/jGEM
    def __call__(self):
        # filter splice junctions chromosome-wise in parallel
        chroms = UT.chroms(self.genome)
        csizedic = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
        args = []
        for c in chroms:
            csize = csizedic[c]
            args.append((self.bwsjpre, self.statspath, c, csize, self.params))
        rslts = UT.process_mp(filter_sj, args, np=self.np, doreduce=False)

        # concatenate the per-chromosome outputs into one gzipped BED
        dstpath = self.bwsjpre + '.sjpath.filtered.bed.gz'
        with open(dstpath, 'wb') as dst:
            for c in chroms:
                srcpath = self.bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(c)
                with open(srcpath, 'rb') as src:
                    shutil.copyfileobj(src, dst)
Example #5
File: phylo.py Project: king1212/jGEM
    def calculate(self, unionexbed, addcols=['_id','_gidx'], np=10):
        """ Calculate PhyloCSF score.

        Args:
            unionexbed: Pandas DataFrame (BED-like) containing unioned exons
            addcols: additional columns to copy from unionexbed (besides chr, st, ed)
            np: number of CPUs to use

        """
        # process chromosome-wise; each worker gets its own Phylo60 copy
        args = []
        for chrom in unionexbed['chr'].unique():
            uechr = unionexbed[unionexbed['chr']==chrom][['chr','st','ed']+addcols].copy()
            mycopy = Phylo60(self.path)
            args.append((uechr, mycopy))

        rslts = UT.process_mp(calc_worker, args, np=np, doreduce=False)

        df = PD.concat(rslts, ignore_index=True)
        return df
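
Going by the docstring, a call might look like the sketch below. The instance name scorer, the DataFrame contents, and the CPU count are placeholders; calculate is assumed to be a method of the class that owns self.path (the code above constructs fresh Phylo60(self.path) copies for its workers), and PD is pandas as elsewhere in the project.

import pandas as PD

# hypothetical unioned-exon table with the columns the docstring asks for
unionexbed = PD.DataFrame({
    'chr': ['chr1', 'chr1', 'chr2'],
    'st': [1000, 5000, 300],
    'ed': [1300, 5400, 900],
    '_id': [1, 2, 3],
    '_gidx': [10, 10, 11],
})
# 'scorer' stands for an already-constructed instance of the scoring class
df = scorer.calculate(unionexbed, addcols=['_id', '_gidx'], np=4)
# df concatenates the per-chromosome results returned by calc_worker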
Example #6
File: merge2.py Project: king1212/jGEM
def estimatecovs(modelpre, bwpre, dstpre, genome, tcovth=1, np=6):
    bed = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    chroms = bed['chr'].unique()
    csizedic = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    bundles = []
    args = []
    for chrom in chroms:
        sub = bed[(bed['chr'] == chrom)]
        uc = UT.union_contiguous(sub[['chr', 'st', 'ed']], returndf=True)
        # ~30K regions in total => make batches of ~1000
        n = len(uc)
        nb = int(N.ceil(n / 1000.))
        for i in range(nb):
            sti = 1000 * i
            edi = min(1000 * (i + 1), len(uc) - 1)
            st = max(uc.iloc[sti]['st'] - 100, 0)
            ed = min(uc.iloc[edi]['ed'] + 100, csizedic[chrom])
            args.append([modelpre, bwpre, chrom, st, ed, dstpre, tcovth])
            bundles.append((chrom, st, ed))

    rslts = UT.process_mp(bundle_estimator, args, np=np, doreduce=False)

    concatenate_bundles(bundles, dstpre)
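
The batching noted in the comment above ("~30K regions => batches of ~1000") reduces to simple index arithmetic. A toy illustration with a made-up region count:

import numpy as N

n = 2500                                      # pretend there are 2,500 unioned regions
nb = int(N.ceil(n / 1000.))                   # -> 3 bundles
bounds = [(1000 * i, min(1000 * (i + 1), n - 1)) for i in range(nb)]
print(bounds)                                 # [(0, 1000), (1000, 2000), (2000, 2499)]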
Example #7
File: repeats.py Project: king1212/jGEM
def count_repeats_mp(beddf,
                     genomefastaobj,
                     col='#repbp',
                     returnseq=False,
                     seqcol='seq',
                     idfld='_id',
                     np=4):
    """ MultiCPU version of counts_repeats """
    # only send relevant part i.e. chr,st,ed,id
    if not idfld in beddf:
        beddf[idfld] = N.arange(len(beddf))
    # number per CPU
    n = int(N.ceil(len(beddf) / float(np)))  # per CPU
    args = [(beddf.iloc[i * n:(i + 1) * n], genomefastaobj, col, returnseq,
             seqcol) for i in range(np)]
    rslts = UT.process_mp(count_repeats, args, np=np, doreduce=False)
    df = PD.concat(rslts, ignore_index=True)
    i2c = UT.df2dict(df, idfld, col)
    beddf[col] = [i2c[x] for x in beddf[idfld]]
    if returnseq:
        i2s = UT.df2dict(df, idfld, seqcol)
        beddf[seqcol] = [i2s[x] for x in beddf[idfld]]
    return beddf
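
The split above sends ceil(len(beddf)/np) consecutive rows to each worker; the last slice is simply shorter (or empty) when the rows do not divide evenly. A toy illustration of the slicing with made-up data:

import numpy as N
import pandas as PD

beddf = PD.DataFrame({'chr': ['chr1'] * 10,
                      'st': list(range(10)),
                      'ed': list(range(1, 11))})
np_ = 4                                        # pretend we use 4 CPUs
n = int(N.ceil(len(beddf) / float(np_)))       # 3 rows per worker
chunks = [beddf.iloc[i * n:(i + 1) * n] for i in range(np_)]
print([len(c) for c in chunks])                # [3, 3, 3, 1]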
Example #8
File: repeats.py Project: king1212/jGEM
def count_repeats_viz_mp(beddf,
                         rmskvizpath,
                         idcol='_id',
                         np=3,
                         prefix=None,
                         expand=0,
                         col='repnames'):
    """Use rmsk-viz track and check each (unioned) exon overlaps with repeats and report repeat name(s).
    Uses Bedtools and calculates chromosome-wise.  

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols, when calculating repeats bp
         for genes, unioned bed should be used (use utils.make_unionex)
        idcol: colname for unique row id (default _id)
        rmskvizpath: path to repeat masker viz BED7 file (created using rmskviz2bed7)
        np: number of CPU to use
        prefix: path prefix for temp file, if not None temp files are kept. (default None)
        expand: how many bases to expand exon region in each side (default 0)
        col: column name to put in overlapping repeat names (if multiple comma separated)

    Outputs:
        are put into beddf columns with colname col(default repnames)

    """
    cleanup = False
    if prefix is None:
        cleanup = True
        prefix = os.path.join(os.path.dirname(rmskvizpath),
                              str(uuid.uuid4()) + '_')

    # chrom-wise
    chroms = sorted(beddf['chr'].unique())
    # check whether rmskviz is already split
    splitrmsk = False
    for chrom in chroms:
        rpath = rmskvizpath + '.{0}.bed.gz'.format(chrom)  # reuse
        if not os.path.exists(rpath):
            splitrmsk = True
            break
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args = []
    bfiles = []
    ofiles = []
    for chrom in chroms:
        bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        rpath = rmskvizpath + '.{0}.bed.gz'.format(chrom)  # reuse
        if expand > 0:
            bchr = beddf[beddf['chr'] == chrom].copy()
            bchr['st'] = bchr['st'] - expand
            bchr['ed'] = bchr['ed'] + expand
            bchr.loc[bchr['st'] < 0, 'st'] = 0
        else:
            bchr = beddf[beddf['chr'] == chrom]
        UT.write_pandas(bchr[['chr', 'st', 'ed', idcol]], bpath, '')
        bfiles.append(bpath)
        if splitrmsk:
            rchr = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(rchr[['chr', 'st', 'ed', 'name', 'strand']], rpath,
                            '')
        opath = prefix + 'out.{0}.bed'.format(chrom)
        ofiles.append(opath)
        args.append([bpath, rpath, opath])

    rslts = UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

    # gather outputs
    cols = ['name', 'repnames']
    outs = [UT.read_pandas(f, names=cols) for f in ofiles]
    df = PD.concat(outs, ignore_index=True)
    df['name'] = df['name'].astype(str)
    i2rn = UT.df2dict(df, 'name', 'repnames')
    beddf[col] = [i2rn[str(x)] for x in beddf[idcol]]

    # cleanup
    if cleanup:
        for f in bfiles:
            os.unlink(f)
        for f in ofiles:
            os.unlink(f)

    return beddf
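
Based on the docstring, an invocation could look like the sketch below; the DataFrame contents and the rmsk-viz BED7 path are placeholders (a real call would use a table from utils.make_unionex and a file produced by rmskviz2bed7), and PD is pandas as elsewhere in the project.

import pandas as PD

# hypothetical unioned-exon table; coordinates are invented
uex = PD.DataFrame({'chr': ['chr1', 'chr2'],
                    'st': [10000, 20000],
                    'ed': [10500, 20600],
                    '_id': [0, 1]})
uex = count_repeats_viz_mp(uex, '/path/to/rmskviz.bed7.gz',   # placeholder BED7 path
                           idcol='_id', np=2, expand=0, col='repnames')
# afterwards uex['repnames'] holds comma-separated names of overlapping repeats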