Пример #1
0
def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge BigWig files into a single BigWig, one chromosome per worker task.

    Every chromosome is handed to ``merge_bigwigs_chr`` through
    ``UT.process_mp``. The per-chromosome wiggle files it reports are
    concatenated into ``dstpath + '.wig'``, which is then converted to
    BigWig at ``dstpath`` with ``BT.wig2bw``. All intermediate wiggle files
    are deleted afterwards.

    Args:
        bwfiles: input BigWig files, passed through to ``merge_bigwigs_chr``
        genome: genome name used to look up chromosome names and sizes
        dstpath: path of the BigWig file to create
        scale: passed through to ``merge_bigwigs_chr``
        np: number of processes given to ``UT.process_mp``
    """
    chroms = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    chromsizes = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    def part_path(name):
        # per-chromosome intermediate wiggle file
        return dstpath + '.{0}.wig'.format(name)

    # Biggest chromosomes first, so that a large one (e.g. chrX) is not left
    # running alone at the end while the other workers sit idle.
    chroms = sorted(chroms, key=lambda name: (chromsizes[name], name),
                    reverse=True)
    jobs = [(bwfiles, name, chromsizes[name], part_path(name), scale)
            for name in chroms]

    # results are (chromosome, wiggle path) pairs
    produced = dict(UT.process_mp(merge_bigwigs_chr, jobs, np, doreduce=False))

    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as merged:
        for name in chroms:
            with open(produced[name], 'rb') as part:
                shutil.copyfileobj(part, merged)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # clean up: per-chromosome pieces first, then the concatenated wiggle
    for leftover in [part_path(name) for name in chroms] + [wigpath]:
        if os.path.exists(leftover):
            os.unlink(leftover)
Пример #2
0
def bw2bed(bwfile, bedfile, chroms, th, compress=True):
    """Transform BigWig genomeCov to binary BED by thresholding.

    The 3-tuples produced by ``apply_threshold(bwfile, th, chroms)`` are
    written as tab-separated lines (``%s\\t%i\\t%i``) to ``bedfile`` with any
    trailing '.gz' removed. The result is optionally compressed.

    Args:
        bwfile: path to BigWig file
        bedfile: path of the BED file to generate; a trailing '.gz' is
            stripped to obtain the path that is actually written
        chroms: list of chromosome names
        th: coverage threshold
        compress: if True (default) the written file is handed to
            ``UT.compress`` and its return value is returned

    Returns:
        path to generated BED file. If ``UT.notstale`` reports that the
        gzipped output is already up to date with respect to ``bwfile``,
        the '.gz' path is returned immediately without regenerating it
        (this check is made regardless of ``compress``).

    Raises:
        RuntimeError: if ``bwfile`` does not exist
    """
    bedbase = bedfile[:-3] if bedfile[-3:] == '.gz' else bedfile
    gzpath = bedbase + '.gz'
    if UT.notstale(bwfile, gzpath):
        return gzpath
    # make sure bwfile exists
    if not os.path.exists(bwfile):
        raise RuntimeError('BigWig file {0} does not exist.'.format(bwfile))
    processor = apply_threshold(bwfile, th, chroms)
    UT.makedirs(os.path.dirname(bedfile))
    # The context manager closes the file even if the processor raises;
    # writelines streams the records instead of joining them all in memory.
    # No extra trailing newline is written: per the original author's note,
    # an empty line between chromosomes makes bedtools stop at chr1.
    with open(bedbase, 'w') as out:
        out.writelines('%s\t%i\t%i\n' % x for x in processor)
    if compress:
        return UT.compress(bedbase)
    return bedbase
Пример #3
0
 def write_iso(self, fname, maxisonum):
     """Write the rows of ``self.gen_iso_all`` as tab-separated text, then compress.

     Args:
         fname: output path; a trailing '.gz' is removed to get the name of
             the text file that is written before ``UT.compress`` is called
         maxisonum: forwarded to ``self.gen_iso_all`` as ``maxisonum``
     """
     target = fname[:-3] if fname.endswith('.gz') else fname
     UT.makedirs(os.path.dirname(target))
     with open(target, 'w') as out:
         out.writelines(
             '\t'.join(str(field) for field in row) + '\n'
             for row in self.gen_iso_all(maxisonum=maxisonum)
         )
     # the file is compressed whether or not fname ended with '.gz'
     UT.compress(target)
Пример #4
0
def wig2bw(wigpath, chromsizes, bwpath):
    """Convert a WIGGLE file to BigWig by running Kent's ``wigToBigWig``.

    Args:
        wigpath: path to the input wiggle file
        chromsizes: chromosome sizes argument handed to ``wigToBigWig``
        bwpath: path of the BigWig file to create; its directory is created
            first

    Returns:
        the exit status returned by ``subprocess.call``
    """
    UT.makedirs(os.path.dirname(bwpath))
    return subprocess.call(['wigToBigWig', wigpath, chromsizes, bwpath])
Пример #5
0
def write_ggb(df, fname, cols, mode='w'):
    """Write selected columns of ``df`` as headerless, unquoted tab-separated text.

    Args:
        df: dataframe with at least 'st' and 'ed' columns plus ``cols``
        fname: output path; if it ends with '.gz' the text is written to the
            path without that suffix and then passed to ``UT.compress``
        cols: columns to write, in order
        mode: mode used to open the output file

    Returns:
        the value returned by ``UT.compress`` when compression was requested,
        otherwise the path that was written
    """
    compress = fname[-3:] == '.gz'
    if compress:
        fname = fname[:-3]
    if any(df.dtypes[col] != int for col in ('st', 'ed')):
        # cast on a copy so the caller's dataframe is left untouched
        LOG.warning('st,ed not integer: copy and converting')
        df = df.copy()
        for col in ('st', 'ed'):
            df[col] = df[col].astype(int)
    UT.makedirs(os.path.dirname(fname))
    with open(fname, mode) as out:
        df[cols].to_csv(out, index=False, header=False, sep='\t',
                        quoting=csv.QUOTE_NONE)
    return UT.compress(fname) if compress else fname
Пример #6
0
 def save(self):
     """Persist the accumulated results as three output files.

     All paths come from ``self.en2.fname2(<name>, decode, category='output')``
     where ``decode`` is ``'<self.en1.code>.<self.datacode>'``:

     * ``stats.json``    -- ``self.stats`` dumped as JSON
     * ``ratios.txt.gz`` -- every entry of ``self.ratios`` concatenated into
       one table, each tagged with its dictionary key in a ``kind`` column
     * ``dp.txt.gz``     -- the result of ``self.get_detection_percentages()``

     Note that the ``kind`` column is added to the objects stored in
     ``self.ratios`` themselves, not to copies.
     """
     decode = '{0}.{1}'.format(self.en1.code, self.datacode)

     def outpath(basename):
         return self.en2.fname2(basename, decode, category='output')

     # Light-weight statistics (author's note: auc, detected1, ..., sigmoid,
     # ..., maxx, avgx, avgy, ...) kept in a dict so others can reuse them.
     stats_path = outpath('stats.json')
     UT.makedirs(os.path.dirname(stats_path))
     with open(stats_path, 'w') as fp:
         json.dump(self.stats, fp)

     # Author's note: [i,5,5b,3,3b,s,sb,j] hold cov(x), ratio(y) and
     # [glc,ecc,jcc] hold gcov(x), ratio(y); everything goes into a single
     # dataframe with columns (kind, id, x, y).
     ratios_path = outpath('ratios.txt.gz')
     for kind, frame in self.ratios.items():
         frame['kind'] = kind
     combined = PD.concat(self.ratios.values(), ignore_index=True)
     UT.write_pandas(combined, ratios_path, 'h')

     # detection percentages
     dp_table = self.get_detection_percentages()
     dp_path = outpath('dp.txt.gz')
     UT.write_pandas(dp_table, dp_path, 'ih')
Пример #7
0
 def wrap(*args,**kwargs):
     """Call ``func`` with an uncompressed output path and gzip afterwards if asked.

     ``func``, ``outname``, ``pos`` and ``noerr`` come from the enclosing
     scope. The output path is taken from ``kwargs[outname]`` when present,
     otherwise from ``args[pos]``. A trailing '.gz' is stripped before the
     path is handed to ``func``; its directory is created first.

     Returns:
         the value of ``UT.compress`` on the written path when '.gz' was
         requested, otherwise the path itself

     Raises:
         RuntimeError: carrying ``func.__name__`` when ``func`` returns
             something other than ``noerr``
     """
     by_keyword = outname in kwargs
     if by_keyword:
         opath = kwargs[outname]
     else:
         opath = args[pos]
         args = list(args)  # tuples are immutable; need a list to swap the path
     # check output '.gz'
     compress = opath[-3:] == '.gz'
     if compress:
         opath = opath[:-3]
     UT.makedirs(os.path.dirname(opath))
     if by_keyword:
         kwargs[outname] = opath
     else:
         args[pos] = opath
     err = func(*args, **kwargs)
     if err != noerr:
         LOG.warning('bederror:{0}, err={1}'.format(func.__name__, err))
         raise RuntimeError(func.__name__)
     return UT.compress(opath) if compress else opath
Пример #8
0
def bam2bw(fpath, chromsizes, bpath, aligned=None):
    """
    Generate normalized coverage from BAM

    Coverage is computed with ``genomeCoverageBed -split -bg`` scaled by
    ``1e6 / aligned`` and written to the temporary file ``bpath + '.wig'``,
    which is then converted to ``bpath`` with ``wigToBigWig`` and removed.

    Args:
        fpath (str): path to BAM
        chromsizes (str): path to chromsizes file
        bpath (str): path to BIGWIG
        aligned (int): number of aligned reads, if None uses ``cnt_bam`` to
            find it from BAM

    Requires Bedtools (genomeCoverageBed) and Kent Tool (wigToBigWig)

    The exit statuses of the two external tools are not checked, and nothing
    is returned.
    """
    # count reads
    if aligned is None:
        aligned = cnt_bam(fpath)
    scale = 1000000. / float(aligned)
    # convert to wig
    tpath = bpath + '.wig'
    UT.makedirs(os.path.dirname(tpath))
    cmd1 = [
        'genomeCoverageBed', '-split', '-bg', '-ibam', fpath, '-g', chromsizes,
        '-scale',
        str(scale)
    ]
    try:
        # context manager guarantees the handle is closed even when the
        # external tool cannot be started
        with open(tpath, 'wb') as tfobj:
            subprocess.call(cmd1, stdout=tfobj)

        # convert wig to bigwig
        subprocess.call(['wigToBigWig', tpath, chromsizes, bpath])
    finally:
        # remove temporary file, also when one of the steps above raised
        if os.path.exists(tpath):
            os.remove(tpath)
Пример #9
0
def test_makedirs(tmpdir):
    """Check UT.makedirs on a new path, an existing directory and an existing file.

    It must create nested directories, succeed silently when they already
    exist, and raise OSError when the path is an existing regular file.
    """
    path = os.path.join(str(tmpdir), 'a/b/c')
    UT.makedirs(path)
    assert os.path.exists(path)
    # should not raise
    UT.makedirs(path)

    # make a file; close it explicitly instead of relying on garbage collection
    path2 = os.path.join(str(tmpdir), 'a/b/c/d')
    with open(path2, 'w') as fobj:
        fobj.write('test\n')
    # should raise
    with pytest.raises(OSError):
        UT.makedirs(path2)
Пример #10
0
def process_mapbed(bedpath, dstpre, genome, chromdir, stranded='.', np=3):
    """Split a mapped-read BED by chromosome, process the chromosomes in
    parallel and assemble the genome-wide output files.

    Args:
        bedpath: path to gzipped BED7 file (converted from BAM)
        dstpre: path prefix to destination
        genome: UCSC genome (mm10 etc.)
        chromdir: directory containing chromosome sequence in FASTA
        stranded: passed unchanged to ``_process_mapbed_chr``
        np: number of CPU to use

    Outputs:
        1. dstpre+'.ex.p.bw'
        2. dstpre+'.ex.n.bw'
        3. dstpre+'.ex.u.bw'
        4. dstpre+'.sj.p.bw'
        5. dstpre+'.sj.n.bw'
        6. dstpre+'.sj.u.bw'
        7. dstpre+'.ex.uniq.p.bw'
        8. dstpre+'.ex.uniq.n.bw'
        9. dstpre+'.ex.uniq.u.bw'
        10. dstpre+'.sj.uniq.p.bw'
        11. dstpre+'.sj.uniq.n.bw'
        12. dstpre+'.sj.uniq.u.bw'
        13. dstpre+'.sjpath.bed' BED12 (sc1:ucnt, sc2:jcnt=ucnt+mcnt),
            passed through ``UT.compress`` after concatenation

        A BigWig is only generated when the concatenated wiggle file is not
        empty. ``dstpre+'.dupitems.txt.gz'`` is handed to ``_scan_make_map``
        and is not deleted here.
    """
    chroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome)
    chromsizes = UT.chromsizes(genome)

    # split into chroms
    # NOTE(review): this creates a directory named exactly like the prefix,
    # not its parent directory -- confirm that is what splitbedgz expects.
    UT.makedirs(dstpre)
    splitbedgz(bedpath, dstpre) # ~30sec
    duppath = dstpre+'.dupitems.txt.gz'
    # keep only the chromosomes for which a split BED file exists
    chroms = [c for c in chroms if os.path.exists(dstpre+'.{0}.bed'.format(c))]
    files = [dstpre+'.{0}.bed'.format(c) for c in chroms]
    _scan_make_map(files, duppath)

    files0 = [dstpre+'.{0}.bed'.format(c) for c  in chromdf['chr'].values] # to be deleted
    args = [(dstpre, x, genome, chromdir, stranded) for x in chroms]
    # spread to CPUs; the workers communicate through files, so the return
    # value is not needed
    UT.process_mp2(_process_mapbed_chr, args, np=np, doreduce=False)
    # concatenate chr files
    files1 = []
    dstpath = dstpre+'.sjpath.bed'
    LOG.info('making {0}...'.format(dstpath))
    with open(dstpath, 'wb') as dst:
        for c in chroms:
            srcpath = dstpre+'.{0}.sjpath.bed'.format(c)
            files1.append(srcpath)
            with open(srcpath, 'rb') as src:
                shutil.copyfileobj(src, dst)
    dstpath = UT.compress(dstpath)

    for kind in ['.ex','.sj']:
        for strand in ['.p','.n','.u']:
            for suf in ['','.uniq']:
                # note the order: kind, then the optional '.uniq', then strand
                pre = dstpre+kind+suf+strand
                wigpath = pre+'.wig'
                bwpath = pre+'.bw'
                with open(wigpath, 'wb') as dst:
                    for c in chroms:
                        srcpath = pre+'.{0}.wig'.format(c)
                        files1.append(srcpath)
                        # a chromosome may have no wiggle of this kind
                        if os.path.exists(srcpath):
                            with open(srcpath,'rb') as src:
                                shutil.copyfileobj(src, dst)
                LOG.info('making {0}...'.format(bwpath))
                # skip the conversion when nothing was concatenated
                if os.path.getsize(wigpath)>0:
                    wig2bw(wigpath, chromsizes, bwpath)
                files1.append(wigpath)

    # clean up temp files
    LOG.info('deleting intermediate files...')
    for x in files0+files1:
        if os.path.exists(x):
            LOG.debug('deleting {0}...'.format(x))
            os.unlink(x)
Пример #11
0
def find_genes4(sj,
                ae,
                filepre,
                cachename=None,
                np=1,
                override=False,
                depth=500,
                separatese=True):
    """Group exons into genes (connected components) and label exons and junctions.

    Adds _gidx and gname columns to ae and sj (both are modified in place).
    Connection: 1) by junctions, 2) by overlap in the same strand

    Gene index is ``position in genes + 1``, negated when the strand of the
    gene's first exon is '-'. Gene name is ``'J' + strand + cat + abs(index)``
    where strand is 'P', 'N' or '' and cat is 'S' for a gene with a single
    exon id and 'G' otherwise.

    Args:
        sj: junction dataframe
        ae: exon dataframe; missing '_id', 'cat' and 'a_id' columns are
            filled in through ``UT.set_ids`` / ``UT.set_exon_category`` /
            ``UT.set_ad_info``
        filepre: passed to ``mcore_allcomponents4``
        cachename: optional path of a pickle file used to cache the genes
        np: passed to ``mcore_allcomponents4``
        override: if True, recompute even when the cache file exists
        depth: passed to ``mcore_allcomponents4``
        separatese: if True, ``UT.mese`` splits ae into (me, se); only me is
            given to the component search and every se exon becomes a gene
            of its own

    Returns genes [set([_id,..]), ...]
    """
    if '_id' not in ae.columns:
        LOG.info('setting ex _id...')
        UT.set_ids(ae)
    if '_id' not in sj.columns:
        LOG.info('setting sj _id...')
        UT.set_ids(sj)
    if 'cat' not in ae.columns:
        UT.set_exon_category(sj, ae)
    if 'a_id' not in ae.columns:
        UT.set_ad_info(sj, ae)

    ### FIND GENES
    if cachename and os.path.exists(cachename) and not override:
        LOG.info('loading cached genes (connected components)...')
        # NOTE: unpickling executes arbitrary code; cachename must only ever
        # point at a cache written by this function.
        with open(cachename, 'rb') as fobj:
            genes = pickle.load(fobj)
    else:
        LOG.info('finding genes (connected components)...')
        _sttime = time.time()
        if separatese:
            me, se = UT.mese(ae)
            genes = mcore_allcomponents4(sj, me, filepre, np, depth=depth)
            # SE genes
            genes += [set([x]) for x in se['_id']]
        else:
            genes = mcore_allcomponents4(sj, ae, filepre, np, depth=depth)
        # version 4 graph: uses overlaps in addition to junctions to connect
        # genes = [set([_id's]),...]
        if cachename:
            UT.makedirs(os.path.dirname(cachename))
            # the context manager flushes and closes the cache deterministically
            with open(cachename, 'wb') as fobj:
                pickle.dump(genes, fobj)
        LOG.info(' time: {0:.3f}s'.format(time.time() - _sttime))

    ### WRITE EXONS W/ GENE number
    LOG.info('assigning gidx...')
    i2g = {}  # eid => _gidx
    i2gn = {}  # eid => gname
    g2gn = {}  # _gidx => gname
    i2s = dict(UT.izipcols(ae, ['_id', 'strand']))  # eid => strand
    s2n = {'+': 'P', '-': 'N', '.': '', '.+': '', '.-': ''}
    for i, ids in enumerate(genes):
        gid = i + 1
        # the strand of the whole gene is taken from one of its exons
        strand = s2n[i2s[list(ids)[0]]]
        cat = 'S' if len(ids) == 1 else 'G'
        if strand == 'N':  # negative strand
            gid = -gid
        gname = 'J{0}{1}{2}'.format(strand, cat, abs(gid))
        g2gn[gid] = gname
        for x in ids:
            i2g[x] = gid
            i2gn[x] = gname

    ae['_gidx'] = [i2g[x] for x in ae['_id']]
    ae['gname'] = [i2gn[x] for x in ae['_id']]

    ## set sj _gidx, use acceptor=>_gidx map (exon a_id, sj a_id)
    # falls back to the donor map, and to 0 when neither side is known
    a2g = dict(UT.izipcols(ae, ['a_id', '_gidx']))
    d2g = dict(UT.izipcols(ae, ['d_id', '_gidx']))
    sj['_gidx'] = [
        a2g.get(x, d2g.get(y, 0))
        for x, y in UT.izipcols(sj, ['a_id', 'd_id'])
    ]
    sj['gname'] = [g2gn.get(x, '') for x in sj['_gidx']]

    # This shouldn't happen
    nmissing = N.sum(ae['_gidx'] == 0)
    if nmissing > 0:
        LOG.warning(
            '###### WARNING!!!!!! exons with no gene assignment:{0}'.format(
                nmissing))

    return genes