Exemplo n.º 1
0
def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge bigwig files into a single bigwig, one chromosome per worker.

    Every chromosome is processed by ``merge_bigwigs_chr`` through
    ``UT.process_mp``.  The per-chromosome wiggle files are concatenated into
    ``dstpath + '.wig'``, converted to a bigwig at ``dstpath`` with
    ``BT.wig2bw`` and the intermediate wiggle files are then removed.

    Args:
        bwfiles: forwarded unchanged to ``merge_bigwigs_chr``.
        genome: genome identifier handed to the ``UT`` chromosome helpers.
        dstpath: path of the resulting bigwig; also the prefix of temp files.
        scale: forwarded unchanged to ``merge_bigwigs_chr``.
        np: number of processes passed to ``UT.process_mp``.
    """
    chroms = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    size_of = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    # Biggest chromosomes first, so that a large one (e.g. chrX) is not left
    # running alone at the end while the other workers are idle.
    chroms = sorted(chroms, key=lambda c: (size_of[c], c), reverse=True)

    part_of = {c: dstpath + '.{0}.wig'.format(c) for c in chroms}
    tasks = [(bwfiles, c, size_of[c], part_of[c], scale) for c in chroms]
    results = UT.process_mp(merge_bigwigs_chr, tasks, np, doreduce=False)

    # presumably each result is a (chrom, wigpath) pair -- verify against
    # merge_bigwigs_chr
    produced = dict(results)

    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as merged:
        for c in chroms:
            with open(produced[c], 'rb') as part:
                shutil.copyfileobj(part, merged)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # Remove the per-chromosome pieces and the concatenated wiggle.
    leftovers = [part_of[c] for c in chroms] + [wigpath]
    for path in leftovers:
        if os.path.exists(path):
            os.unlink(path)
Exemplo n.º 2
0
    def make_dm(self, targetlevel):
        """Build the two distance matrices (logdiff and minmin) for a level.

        The result is stored in ``self.dms[targetlevel]`` as a dict with keys
        ``ts`` (target names), ``g2t``/``t2g`` (mappings between
        ``self.gcovlevel`` and ``targetlevel``), ``dm`` (normalized
        log-difference panel), ``mm`` (pairwise minimum panel) and ``v``
        (mean log2 values restricted to expressed rows).
        """
        sampleinfo = self.si
        srclevel = self.gcovlevel
        cov = self.gcov
        # target names in order of first appearance in the sample table
        targets = sampleinfo.groupby(targetlevel, sort=False).first().index.values

        # mappings between the gcov level and the target level
        src2tgt = UT.df2dict(sampleinfo, srclevel, targetlevel)
        tgt2src = make_dict(sampleinfo, targetlevel, srclevel)

        # mean log2(x + 1) per target
        per_target = N.log2(cov + 1).groupby(src2tgt, axis=1).mean()
        rowmax = per_target.max(axis=1)
        threshold = N.log2(self.maxeth + 1)
        expressed = rowmax[rowmax > threshold].index.values
        logvals = per_target.ix[expressed][targets]  # expressed rows only

        # normalized |log difference| for every pair of targets, per row
        arr = logvals.values
        absdiff = N.abs(arr[:, :, N.newaxis] - arr[:, N.newaxis, :])
        peak = absdiff.max(axis=2).max(axis=1)
        scaled = absdiff / peak[:, N.newaxis, N.newaxis]
        logdiff_panel = PD.Panel(scaled, logvals.index, targets, targets)

        # pairwise minimum of the per-target minima
        lows = cov.ix[expressed].groupby(src2tgt, axis=1).min()[targets].values
        pairmin = N.minimum(lows[:, :, N.newaxis], lows[:, N.newaxis, :])
        minmin_panel = PD.Panel(pairmin, logvals.index, targets, targets)

        self.dms[targetlevel] = {
            'ts': targets,
            'g2t': src2tgt,
            't2g': tgt2src,
            'dm': logdiff_panel,
            'mm': minmin_panel,
            'v': logvals,
        }
Exemplo n.º 3
0
def sj02bw(sj0, pathpre, genome, np=12):
    """Write junction counts as one bigwig per strand.

    For each chromosome ``sj02wig`` is run through ``UT.process_mp`` with a
    wiggle path template; the per-chromosome wiggles of each strand are then
    concatenated to ``pathpre + '.sj<S>.wig'`` and converted to
    ``pathpre + '.sj<S>.bw'``, where ``<S>`` is ``STRANDMAP0[strand]``.
    All intermediate wiggle files are deleted at the end.

    Args:
        sj0: junction dataframe with a 'chr' column; a 'jcnt' column is added
            in place (``ucnt + mcnt``) when it is missing.
        pathpre: prefix of all files written.
        genome: genome identifier handed to the ``UT`` chromosome helpers.
        np: number of processes passed to ``UT.process_mp``.
    """
    chroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome).sort_values('size', ascending=False)
    # keep only known chromosomes, largest first
    chroms = [x for x in chromdf['chr'] if x in chroms]
    chromdic = UT.df2dict(chromdf, 'chr', 'size')
    if 'jcnt' not in sj0:
        sj0['jcnt'] = sj0['ucnt'] + sj0['mcnt']

    # One path template per chromosome; the '{0}' left in the template is
    # filled with the strand when the files are collected below.
    files = ['{0}.{1}.{{0}}.wig'.format(pathpre, c) for c in chroms]
    args = [(sj0[sj0['chr'] == c], c, chromdic[c], f)
            for c, f in zip(chroms, files)]
    UT.process_mp(sj02wig, args, np=np, doreduce=False)

    chromsizes = UT.chromsizes(genome)
    rmfiles = []
    for strand in ['+', '-', '.']:
        s = STRANDMAP0[strand]
        wig = pathpre + '.sj{0}.wig'.format(s)
        bwpath = pathpre + '.sj{0}.bw'.format(s)
        with open(wig, 'w') as dst:
            for tmpl in files:
                f = tmpl.format(strand)
                with open(f, 'r') as src:
                    shutil.copyfileobj(src, dst)
                rmfiles.append(f)
        rmfiles.append(wig)
        wig2bw(wig, chromsizes, bwpath)

    # Every temporary file, including each strand's concatenated wiggle, is
    # listed in rmfiles exactly once.  The original unlinked the last wiggle a
    # second time after this loop, which always raised FileNotFoundError.
    for f in rmfiles:
        os.unlink(f)
    
Exemplo n.º 4
0
def filter_sj(bwsjpre, statspath, chrom, csize, params):
    """Filter the junction paths of one chromosome and write them as BED12.

    Paths are read from ``bwsjpre + '.sjpath.<chrom>.bed.gz'`` when that file
    exists, otherwise from ``bwsjpre + '.sjpath.bed.gz'`` restricted to
    ``chrom``.  Surviving paths are written to
    ``bwsjpre + '.sjpath.<chrom>.filtered.bed.gz'``.

    Args:
        bwsjpre: prefix of the sjpath files and of the bigwigs opened through
            ``A2.SjExBigWigs``.
        statspath: junction statistics table read with ``UT.read_pandas``.
        chrom: chromosome to process.
        csize: end coordinate used when fetching the coverage arrays.
        params: thresholds; keys read are 'th_detected', 'th_maxcnt',
            'th_maxoverhang', 'th_minedgeexon' and 'th_sjratio2'.
    """
    # --- junction statistics for this chromosome
    jstats = UT.read_pandas(statspath)
    if 'chr' not in jstats:
        jstats['chr'] = [locus.split(':')[0] for locus in jstats['locus']]
    if '#detected' in jstats:
        jstats.rename(columns={'#detected': 'detected'}, inplace=True)
    jstats = jstats[jstats['chr'] == chrom].copy()
    if 'pc' not in jstats:
        jstats['pc'] = [locus2pc(locus) for locus in jstats['locus']]
    fields = ['detected', 'maxcnt', 'maxoverhang']
    lookup = {fld: UT.df2dict(jstats, 'pc', fld) for fld in fields}

    # --- junction paths
    chrom_bed = bwsjpre + '.sjpath.{0}.bed.gz'.format(chrom)
    dstpath = bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(chrom)
    if os.path.exists(chrom_bed):
        sj = GGB.read_bed(chrom_bed)
    else:
        whole = GGB.read_bed(bwsjpre + '.sjpath.bed.gz')
        sj = whole[whole['chr'] == chrom].copy()

    # When the first name has more ',' than '|' separators, the first and
    # last comma-separated items (attached exons?) are stripped off.
    first_name = sj.iloc[0]['name']
    if first_name.count('|') < first_name.count(','):
        sj['name'] = [','.join(nm.split(',')[1:-1]) for nm in sj['name']]

    # --- stranded paths only
    sj = sj[sj['strand'].isin(['+', '-'])].copy()

    # --- per-path minimum of every statistic must exceed its threshold
    for fld in fields:
        table = lookup[fld]
        sj[fld] = [
            N.min([table.get(pc, 0) for pc in nm.split(',')])
            for nm in sj['name']
        ]
        sj = sj[sj[fld] > params['th_' + fld]].copy()

    # --- size of the first and last exon ('esizes' ends with a comma, hence
    # the last size sits at index -2)
    sj['eflen'] = [int(es.split(',')[0]) for es in sj['esizes']]
    sj['ellen'] = [int(es.split(',')[-2]) for es in sj['esizes']]
    min_edge = params['th_minedgeexon']
    sj = sj[(sj['eflen'] > min_edge) & (sj['ellen'] > min_edge)].copy()

    # --- sjratio2: sc1 divided by the mean junction+exon coverage over
    # [tst, ted) on the same strand
    bigwigs = A2.SjExBigWigs(bwsjpre, mixunstranded=False)
    for strand in ['+', '-']:
        on_strand = sj['strand'] == strand
        with bigwigs:
            sjcov = bigwigs.bws['sj'][strand].get(chrom, 0, csize)
            excov = bigwigs.bws['ex'][strand].get(chrom, 0, csize)
        cov = sjcov + excov
        rows = sj[on_strand][['sc1', 'tst', 'ted']].values
        sj.loc[on_strand, 'sjratio2'] = [
            cnt / N.mean(cov[int(lo):int(hi)]) for cnt, lo, hi in rows
        ]
    sj = sj[sj['sjratio2'] > params['th_sjratio2']]
    GGB.write_bed(sj, dstpath, ncols=12)
Exemplo n.º 5
0
def count_repeats_mp(beddf,
                     genomefastaobj,
                     col='#repbp',
                     returnseq=False,
                     seqcol='seq',
                     idfld='_id',
                     np=4):
    """Run ``count_repeats`` on ``beddf`` split into ``np`` row chunks.

    ``beddf`` is modified in place and also returned: an ``idfld`` column is
    added when missing, ``col`` is filled from the workers' results and, when
    ``returnseq`` is true, ``seqcol`` as well.

    Args:
        beddf: dataframe to annotate.
        genomefastaobj: forwarded unchanged to ``count_repeats``.
        col: name of the result column.
        returnseq: whether to also copy ``seqcol`` back.
        seqcol: name of the sequence column.
        idfld: column used to match worker results back to rows.
        np: number of chunks and of processes.

    Returns:
        ``beddf`` with the new column(s).
    """
    if idfld not in beddf:
        beddf[idfld] = N.arange(len(beddf))

    # rows handled by each process (the trailing chunks may be empty)
    per_cpu = int(N.ceil(len(beddf) / float(np)))
    args = []
    for i in range(np):
        part = beddf.iloc[i * per_cpu:(i + 1) * per_cpu]
        args.append((part, genomefastaobj, col, returnseq, seqcol))

    pieces = UT.process_mp(count_repeats, args, np=np, doreduce=False)
    merged = PD.concat(pieces, ignore_index=True)

    # copy the results back by id
    id2val = UT.df2dict(merged, idfld, col)
    beddf[col] = [id2val[rid] for rid in beddf[idfld]]
    if returnseq:
        id2seq = UT.df2dict(merged, idfld, seqcol)
        beddf[seqcol] = [id2seq[rid] for rid in beddf[idfld]]
    return beddf
Exemplo n.º 6
0
 def assign_tcode_sj(self):
     """Attach junction-level and gene-level codes to the target junctions.

     Loads the 'sj' models of ``self.cn_tgt`` and ``self.cn_ref`` (kept as
     ``self.sj_tgt`` / ``self.sj_ref``), makes sure both have a 'locus'
     column, and adds two columns to the target junctions, both suffixed
     with the reference code:

     * ``etcode_<code>``: 'k.me' when the locus also occurs in the reference
       junctions, otherwise 'u.me';
     * ``gtcode_<code>``: the value of the same-named column of
       ``self.ex_tgt`` looked up by '_gidx', defaulting to 'u.me'.
     """
     stgt = self.cn_tgt.model('sj')
     self.sj_tgt = stgt
     sref = self.cn_ref.model('sj')
     self.sj_ref = sref

     for frame in (stgt, sref):
         if 'locus' not in frame.columns:
             frame['locus'] = UT.calc_locus_strand(frame)

     # every reference locus is marked 'k.me'
     known = dict.fromkeys(sref['locus'], 'k.me')
     rcode = self.cn_ref.code
     setfld = 'etcode_' + rcode
     sgtfld = 'gtcode_' + rcode
     stgt[setfld] = [known.get(locus, 'u.me') for locus in stgt['locus']]

     gene2code = UT.df2dict(self.ex_tgt, '_gidx', sgtfld)
     stgt[sgtfld] = [gene2code.get(gidx, 'u.me') for gidx in stgt['_gidx']]
Exemplo n.º 7
0
    def __call__(self):
        """Run ``filter_sj`` on every chromosome and merge the outputs.

        One ``filter_sj`` task per chromosome of ``self.genome`` is executed
        through ``UT.process_mp`` with ``self.np`` processes.  The resulting
        ``<bwsjpre>.sjpath.<chrom>.filtered.bed.gz`` files are then
        concatenated byte-wise, in chromosome order, into
        ``<bwsjpre>.sjpath.filtered.bed.gz``.
        """
        chroms = UT.chroms(self.genome)
        size_of = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
        tasks = [(self.bwsjpre, self.statspath, c, size_of[c], self.params)
                 for c in chroms]
        UT.process_mp(filter_sj, tasks, np=self.np, doreduce=False)

        merged = self.bwsjpre + '.sjpath.filtered.bed.gz'
        parts = [
            self.bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(c)
            for c in chroms
        ]
        with open(merged, 'wb') as dst:
            for part in parts:
                with open(part, 'rb') as src:
                    shutil.copyfileobj(src, dst)
Exemplo n.º 8
0
def estimatecovs(modelpre, bwpre, dstpre, genome, tcovth=1, np=6):
    """Estimate coverages bundle by bundle in parallel and merge the results.

    The paths in ``modelpre + '.paths.withse.bed.gz'`` are merged per
    chromosome into contiguous regions, which are grouped into batches of
    about 1000 regions.  Each batch, padded by 100 on both sides and clipped
    to the chromosome, is processed by ``bundle_estimator``; finally
    ``concatenate_bundles`` is called with the list of batches.

    Args:
        modelpre: prefix of the model files.
        bwpre: forwarded unchanged to ``bundle_estimator``.
        dstpre: destination prefix, forwarded to ``bundle_estimator`` and
            ``concatenate_bundles``.
        genome: genome identifier used to look up chromosome sizes.
        tcovth: forwarded unchanged to ``bundle_estimator``.
        np: number of processes passed to ``UT.process_mp``.
    """
    batch = 1000  # regions per batch (about 30K regions in total)
    pad = 100
    bed = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    size_of = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    bundles = []
    tasks = []
    for chrom in bed['chr'].unique():
        onchrom = bed[bed['chr'] == chrom]
        regions = UT.union_contiguous(onchrom[['chr', 'st', 'ed']],
                                      returndf=True)
        last = len(regions) - 1
        for first in range(0, len(regions), batch):
            # NOTE(review): the end index is inclusive and equals the first
            # index of the next batch, so neighbouring batches share one
            # region -- confirm this overlap is intended.
            final = min(first + batch, last)
            st = max(regions.iloc[first]['st'] - pad, 0)
            ed = min(regions.iloc[final]['ed'] + pad, size_of[chrom])
            tasks.append([modelpre, bwpre, chrom, st, ed, dstpre, tcovth])
            bundles.append((chrom, st, ed))

    UT.process_mp(bundle_estimator, tasks, np=np, doreduce=False)

    concatenate_bundles(bundles, dstpre)
Exemplo n.º 9
0
 def calc_many_specific(self,
                        targetlevel,
                        key2names,
                        scoreth=None,
                        rdratioth=0.6):
     """Run ``calc_one_specific`` for several groups and stack the results.

     Args:
         targetlevel: name or cg1
         key2names: dict mapping a group name (key) to names in targetlevel
         scoreth: when not None, keep only rows with 'score' above it
         rdratioth: when not None, keep rows where 'rd' exceeds it if
             'gcov' > 'gcov2', or where 1 - 'rd' exceeds it otherwise

     Returns:
         The concatenated table, ranked by score within each key, with
         'region', 'key' and 'id' columns in front, after ``self.annotate``.
     """
     tables = []
     for key, names in key2names.items():
         print('{0}...'.format(key))
         tbl = self.calc_one_specific(targetlevel, names)
         # column list of the most recent result; used for the final layout
         cols = list(tbl.columns)

         if scoreth is not None:
             tbl = tbl[tbl['score'] > scoreth].copy()
             print('scoreth{0}:{1}'.format(scoreth, len(tbl)))

         if rdratioth is not None:
             up = (tbl['gcov'] > tbl['gcov2']) & (tbl['rd'] > rdratioth)
             down = (tbl['gcov'] <= tbl['gcov2']) & (
                 (1 - tbl['rd']) > rdratioth)
             tbl = tbl[up | down].copy()
             print('rdratioth{0}:{1}'.format(rdratioth, len(tbl)))

         # rank within the group, best score first
         tbl['key'] = key
         tbl = tbl.sort_values('score', ascending=False)
         tbl['rank'] = N.arange(len(tbl))
         tbl['id'] = tbl['key'] + '.' + tbl['rank'].astype(str)
         tables.append(tbl)

     stacked = PD.concat(tables, ignore_index=True)
     group2cg1 = UT.df2dict(self.si, 'group', 'cg1')
     # keys that are not a known group keep their own name as region
     stacked['region'] = [group2cg1.get(k, k) for k in stacked['key']]
     stacked = stacked[['region', 'key', 'id'] + cols]
     return self.annotate(stacked)
Exemplo n.º 10
0
def gtf_from_bed12(modelpre, dstpath=None, source='.'):
    """Convert the BED12 paths of a model into a GTF of exon records.

    Reads ``modelpre + '.paths.withse.bed.gz'`` and
    ``modelpre + '.ex.txt.gz'``, assigns each path the gene name of its first
    exon and a transcript name ``<gene>.<n>`` (n counts paths per gene from
    1), then writes one 'exon' GTF line per block of each path.  An id map
    (id, chr, name, tname, gname) is also written to
    ``modelpre + '.idmap.txt.gz'``.

    Args:
        modelpre: prefix of the model files.
        dstpath: GTF path to write.  When None, the BED path with '.bed'
            replaced by '.gtf' is used.
        source: value of the GTF source column.

    Returns:
        The GTF dataframe that was written.
    """
    # Fix: the original referenced an undefined ``bedpath`` when dstpath was
    # None, raising NameError.  The BED path is now named up front.
    bedpath = modelpre + '.paths.withse.bed.gz'
    paths = GGB.read_bed(bedpath)
    ex = UT.read_pandas(modelpre + '.ex.txt.gz')

    # Exon names are made unique across chromosomes by prefixing 'chr:'
    # (the same st,ed may occur on different chromosomes).
    ex['id'] = ex['chr'] + ':' + ex['name']
    n2gn = UT.df2dict(ex, 'id', 'gname')
    paths['id'] = paths['chr'] + ':' + paths['name']
    # the gene of a path is looked up through its first '|'-separated element
    paths['id0'] = paths['chr'] + ':' + paths['name'].str.split('|').str[0]
    paths['gname'] = [n2gn[x] for x in paths['id0']]

    # transcript name = gene name + running number within the gene
    g2cnt = {}
    tnames = []
    for gname in paths['gname']:
        i = g2cnt.get(gname, 1)
        tnames.append('{0}.{1}'.format(gname, i))
        g2cnt[gname] = i + 1
    paths['tname'] = tnames

    txt = 'gene_id "{0}"; transcript_id "{1}"; exon_number "{2}";'

    def _gen():
        cols = ['chr', 'st', 'ed', 'gname', 'tname', 'esizes', 'estarts',
                'strand']
        for c, s, e, gn, tn, esi, est, strand in paths[cols].values:
            # both lists end with a trailing comma, hence the [:-1]
            esizes = [int(x) for x in esi.split(',')[:-1]]
            estarts = [int(x) for x in est.split(',')[:-1]]
            for i, (size, offset) in enumerate(zip(esizes, estarts)):
                xst = s + offset
                xed = xst + size
                extra = txt.format(gn, tn, i + 1)
                # the start is shifted by one when written to the GTF
                yield (c, source, 'exon', xst + 1, xed, '.', strand, '.',
                       extra)

    df = PD.DataFrame(list(_gen()), columns=GGB.GTFCOLS)
    if dstpath is None:
        dstpath = bedpath.replace('.bed', '.gtf')
    GGB.write_gtf(df, dstpath)

    idf = paths[['id', 'chr', 'name', 'tname', 'gname']]
    UT.write_pandas(idf, modelpre + '.idmap.txt.gz', 'h')
    return df
Exemplo n.º 11
0
    def prep_sjex(self, en, np=1, savesjex=True, calccovs=True):
        """Assign ecov, gcov and junction counts to the models of ``en``.

        Loads the 'sj' and 'ex' models of ``en`` for ``self.datacode`` and
        fills in whatever is missing: '_id', '_gidx' and 'len' columns, the
        exon/gene coverage columns and the junction count columns (column
        names come from ``self.colname``).

        Args:
            en: object providing ``model``, ``modelpath``, ``fname2``,
                ``savemodel`` and ``code``.
            np: number of processes forwarded to the coverage calculators.
            savesjex: when true, models that gained columns are saved with
                ``en.savemodel``.
            calccovs: when true, missing coverages are computed with
                ``CC.calc_ecov`` / ``CC.calc_gcov``; otherwise missing
                coverage columns are filled with 0.
        """
        dcode = self.datacode
        sj = en.model('sj', dcode)
        ex = en.model('ex', dcode)
        # flags recording whether a model gained columns and should be saved
        savesj = False
        saveex = False
        # check support: keep junctions whose acceptor and donor ids both
        # occur among the exons, and store the filtered table back on en
        if len(sj) > 0:
            dids = set(ex['d_id'].values)
            aids = set(ex['a_id'].values)
            idx = sj['a_id'].isin(aids) & sj['d_id'].isin(dids)
            sj = sj[idx].copy()
            en.sj = sj
        if '_id' not in ex.columns:  # edge case (len(sj)==0)
            ex['_id'] = N.arange(len(ex))
        if '_gidx' not in ex.columns:  # edge case (len(sj)==0)
            ex['_gidx'] = N.arange(len(ex))

        # length
        if 'len' not in sj.columns:
            sj['len'] = sj['ed'] - sj['st']
            savesj = True
        if 'len' not in ex.columns:
            ex['len'] = ex['ed'] - ex['st']
            saveex = True
        # ecov
        if calccovs:
            print('calccov for {0}'.format(en.code))
            ecovname = self.colname('ecov')
            if ecovname not in ex.columns:
                ecov = CC.calc_ecov(
                    expath=en.modelpath('ex'),
                    cipath=en.modelpath('ci'),
                    bwpath=self.bigwig,
                    dstprefix=en.fname2(
                        '', self.datacode),  # cov is data dependent
                    override=False,  # override previous?
                    np=np)
                # align the computed values to the exon rows through 'eid'
                ex[ecovname] = ecov.set_index('eid').ix[
                    ex['_id'].values]['ecov'].values
                saveex = True
            # gcov, glen
            gcovname = self.colname('gcov')
            if gcovname not in ex.columns:
                gcov = CC.calc_gcov(
                    expath=en.modelpath('ex'),
                    cipath=en.modelpath('ci'),
                    bwpath=self.bigwig,
                    dstprefix=en.fname2('', self.datacode),
                    override=False,  # reuse covci from ecov calc
                    np=np)
                # align the per-gene values to the exon rows through '_gidx'
                tmp = gcov.set_index('_gidx').ix[ex['_gidx'].values]
                ex[gcovname] = tmp['gcov'].values
                if 'glen' in tmp:
                    ex['glen'] = tmp[
                        'glen'].values  # glen is only dependent on model not data
                saveex = True
        else:
            # coverage calculation skipped: missing columns become 0
            # (saveex is not set on this path)
            ecovname = self.colname('ecov')
            if ecovname not in ex.columns:
                ex[ecovname] = 0
            gcovname = self.colname('gcov')
            if gcovname not in ex.columns:
                ex[gcovname] = 0
        # sjcnt
        ucntname = self.colname('ucnt')
        mcntname = self.colname('mcnt')
        jcntname = self.colname('jcnt')
        sjfile = self.sjfile
        if ucntname not in sj.columns:
            if sjfile.endswith('.bed') or sjfile.endswith(
                    '.bed.gz'):  # no header
                dsj = UT.read_pandas(sjfile,
                                     names=[
                                         'chr', 'st', 'ed', 'name', 'ucnt',
                                         'strand', 'mcnt'
                                     ])
            else:  # assume txt file with header
                dsj = UT.read_pandas(sjfile)
            # locus based matching: junctions absent from the data get 0
            dsj['locus'] = UT.calc_locus_strand(dsj)
            sj['locus'] = UT.calc_locus_strand(sj)
            l2u = UT.df2dict(dsj, 'locus', 'ucnt')
            l2m = UT.df2dict(dsj, 'locus', 'mcnt')
            sj[ucntname] = [l2u.get(x, 0) for x in sj['locus']]
            sj[mcntname] = [l2m.get(x, 0) for x in sj['locus']]
            # NOTE(review): jcnt is ucnt when that is non-zero, otherwise
            # mcnt -- not their sum; confirm this is intended.
            sj[jcntname] = [x or y for x, y in sj[[ucntname, mcntname]].values]
            savesj = True
        # persist only the models that gained columns
        if saveex and savesjex:
            en.savemodel('ex', dcode, category='output')
        if savesj and savesjex:
            en.savemodel('sj', dcode, category='output')
Exemplo n.º 12
0
    def calc_stats(self):
        """Compute detection statistics per exon category and for junctions.

        For each of 'i', '5', '3', 's', 'j', '5b', '3b', 'sb' the matched
        elements in ``self.e[which]`` are compared with the reference
        elements, and the following are stored:

        * ``self.ratios[which]``: dataframe with x (log2 of coverage or
          junction count + 1), y (length ratio of the closest match for
          exons, 0/1 hit flag for junctions) and name;
        * ``self.stats[which]``: counts ('detected1', 'matched',
          'detected2'), their ratios ('p1', 'p2'), the binned curve ('auc',
          'maxx', 'avgy') and the sigmoid threshold 'xth' (NaN when the fit
          fails);
        * ``self.closest[which]`` (exon categories only): the closest match
          per reference exon.

        Side effects: 'dlen' and 'ratio' columns are added to the tables in
        ``self.e`` and a 'hitj' column is added to ``self.s1``.
        """
        ecovname = self.colname('ecov')
        jcntname = self.colname('jcnt')
        jhitname = self.colname2('jhit', self.en2.code)

        def _findclosest(e, which):
            # Per '_id' keep the overlapping partner with the smallest length
            # difference and remember it in self.closest.
            e['dlen'] = N.abs(e['len'] - e['b_len'].astype(float))
            e['ratio'] = e['b_len'].astype(float) / e['len']
            e = e.sort_values(['_id', 'dlen'], ascending=True)
            f = e.groupby('_id', sort=False).first().reset_index()
            self.closest[which] = f
            return f

        def _count(dw, da1, da2, which):
            # Count distinct ids: matched (dw), population 1 (da1, restricted
            # to elements with reads) and population 2 (da2, unrestricted).
            valcol = jcntname if which == 'j' else ecovname
            da1 = da1[da1[valcol] > 0]
            dw = dw[dw[valcol] > 0]
            pop = set(da1['_id'].values)
            hit = set(dw['_id'].values)
            pop2 = set(da2['_id'].values)
            if len(pop) == 0:
                LOG.warning('no elements in {0} for population1'.format(
                    self.abbr[which]))
            if len(pop2) == 0:
                LOG.warning('no elements in {0} for population2'.format(
                    self.abbr[which]))
            if len(hit) == 0:
                LOG.warning('no elements in {0} for match'.format(
                    self.abbr[which]))
            np1, nh, np2 = len(pop), len(hit), len(pop2)
            r1 = float(nh) / max(1, np1)
            r2 = float(nh) / max(1, np2)
            LOG.info(
                '[{5}] detected1:{0},\tmatched:{1},\t(detected2:{2}),\tratio:{3:.2f},\t(ratio2:{4:.2f})'
                .format(np1, nh, np2, r1, r2, which))
            return nh, np1, np2

        for which in ['i', '5', '3', 's', 'j', '5b', '3b', 'sb']:
            LOG.debug(which + '=' * 10)
            cn = 'hit{0}'.format(which)
            if which != 'j':
                e1, e2 = self.e1, self.e2
                # all reference exons of this category
                ea1 = e1[(e1['cat'] == which[0])][['_id', ecovname,
                                                   'name']].copy()
                if len(which) == 1:
                    ea2 = e2[(e2['cat'] == which[0])]
                else:  # 'b' variants: exons of any category are allowed
                    ea2 = e2
                ew = self.e[which]  # matched exons
                nhit, npop, npop2 = _count(ew, ea1, ea2, which)
                ew2 = _findclosest(ew, which)  # calculate ratio
                i2r = UT.df2dict(ew2, '_id', 'ratio')
                ea1[cn] = [i2r.get(x, 0) for x in ea1['_id']]
                ea1 = ea1.set_index('_id')
                x = N.log2(ea1[ecovname] + 1)  # log coverage
                y = ea1[cn]
                ns = ea1['name']
            else:
                sa = self.s1
                nhit, npop, npop2 = _count(self.e['j'], sa, self.s2, which)
                sa[cn] = [1 if x > 0 else 0
                          for x in sa[jhitname]]  # in case of NaN
                sa = sa.set_index('_id')
                x = N.log2(sa[jcntname] + 1)
                y = sa[cn]
                ns = sa['name']

            # only consider ones detected in the reference (en1)
            idx2 = x > 0
            x2 = x[idx2].values
            y4 = N.array(y[idx2] > 0,
                         dtype=int)  # binary detection indicator (ratio>0)

            # A failed fit is tolerated and reported as NaN.  Catch Exception
            # rather than everything so that KeyboardInterrupt/SystemExit are
            # not swallowed; N.nan is used because the N.NaN alias no longer
            # exists in NumPy 2.
            try:
                _, _, xth = UT.fit_sigmoid(x2, y4, (0, 5), 0.99)
            except Exception:
                xth = N.nan
            auc4, maxx4, avgy4, x4, y4 = self._calc_binned(
                x2, y4, self.binsize)
            p1 = float(nhit) / npop if npop > 0 else 0.
            p2 = float(nhit) / npop2 if npop2 > 0 else 0.
            self.ratios[which] = PD.DataFrame({'x': x, 'y': y, 'name': ns})
            self.stats[which] = {
                'detected1': npop,  # int
                'matched': nhit,  # int
                'detected2': npop2,  # int
                'p1': p1,  # float
                'p2': p2,  # float
                'auc': auc4,  # float
                'maxx': list(maxx4),  # list
                'avgy': list(avgy4),  # list
                'xth': xth,  # float
            }
Exemplo n.º 13
0
 def __call__(self):
     """Prepare exon/junction bigwigs and junction paths with a task queue.

     For every chromosome of ``self.genome`` three tasks are queued on a
     ``TQ.Server``: ``prep_exwig_chr``, ``prep_sjwig_chr`` and
     ``prep_sjpath_chr``.  As soon as all chromosomes of one kind have
     reported back, the corresponding merge task (``prep_exbw``,
     ``prep_sjbw`` or ``prep_sjpath``) is queued.  The method returns when
     all three merge tasks are done or ``server.check_error()`` turns false.

     Per-chromosome results are kept in ``self.exstatus``, ``self.sjstatus``
     and ``self.pastatus``.
     """
     # exdf => ex.p, ex.n, ex.u
     # sjdf => sj.p, sj.n, sj.u
     # paths => sjpath.bed
     # divide into tasks (exdf,sjdf,paths) x chroms
     self.server = server = TQ.Server(name='PrepBWSJ', np=self.np)
     self.chroms = chroms = UT.chroms(self.genome)
     csizes = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
     self.exstatus = exstatus = {}
     self.sjstatus = sjstatus = {}
     self.pastatus = pastatus = {}
     exdone = False
     sjdone = False
     padone = False
     with server:
         for chrom in chroms:
             # exon wiggle task
             tname = 'prep_exwig_chr.{0}'.format(chrom)
             args = (self.j2pres, self.libsizes, self.dstpre, chrom,
                     csizes[chrom])
             server.add_task(TQ.Task(tname, prep_exwig_chr, args))
             # junction wiggle task
             tname = 'prep_sjwig_chr.{0}'.format(chrom)
             args = (self.j2pres, self.libsizes, self.dstpre, chrom,
                     csizes[chrom])
             server.add_task(TQ.Task(tname, prep_sjwig_chr, args))
             # junction path task
             tname = 'prep_sjpath_chr.{0}'.format(chrom)
             args = (self.j2pres, self.libsizes, self.dstpre, chrom)
             server.add_task(TQ.Task(tname, prep_sjpath_chr, args))
         while server.check_error():
             try:
                 # wait for the next result (timeout=5), then re-check
                 name, rslt = server.get_result(timeout=5)
             except TQ.Empty:
                 continue
             if name is None:
                 continue
             # The chromosome is everything after the first '.', so that
             # chromosome names which themselves contain a dot are kept
             # intact.  Splitting on every dot truncated such names; the
             # status dicts then never reached len(chroms) and the merge
             # tasks were never queued.
             if name.startswith('prep_exwig_chr.'):
                 chrom = name.split('.', 1)[1]
                 exstatus[chrom] = rslt
                 if len(exstatus) == len(chroms):  # all finished
                     print('$$$$$$$$ putting in prep_exbw $$$$$$$$$$$')
                     tname = 'prep_exbw'
                     args = (self.dstpre, chroms, self.genome)
                     server.add_task(TQ.Task(tname, prep_exbw, args))
             if name.startswith('prep_sjwig_chr.'):
                 chrom = name.split('.', 1)[1]
                 sjstatus[chrom] = rslt
                 if len(sjstatus) == len(chroms):  # all finished
                     print('$$$$$$$$ putting in prep_sjbw $$$$$$$$$$$')
                     tname = 'prep_sjbw'
                     args = (self.dstpre, chroms, self.genome)
                     server.add_task(TQ.Task(tname, prep_sjbw, args))
             if name.startswith('prep_sjpath_chr.'):
                 chrom = name.split('.', 1)[1]
                 pastatus[chrom] = rslt
                 if len(pastatus) == len(chroms):  # all finished
                     print(
                         '$$$$$$$$ putting in prep_sjpath $$$$$$$$$$$')
                     tname = 'prep_sjpath'
                     args = (self.dstpre, chroms)
                     server.add_task(TQ.Task(tname, prep_sjpath, args))
             if name == 'prep_exbw':
                 print('$$$$$$$$ prep_exbw done $$$$$$$$$$$')
                 exdone = True
             if name == 'prep_sjbw':
                 print('$$$$$$$$ prep_sjbw done $$$$$$$$$$$')
                 sjdone = True
             if name == 'prep_sjpath':
                 print('$$$$$$$$ prep_sjpath done $$$$$$$$$$$')
                 padone = True
             if exdone and sjdone and padone:
                 break
         print('Exit Loop')
     print('Done')
Exemplo n.º 14
0
    def calc_completeness(self):
        """Completeness measures how much of the reference gene structure is recovered.

        1. GLC: gene length completeness = max(ratio of gene length covered by overlapping target gene)
        2. ECC: exon count completeness = max(ratio of overlapping exon counts)
        3. JCC: junction count completeness = max(ratio of overlapping junction counts)

        For each measure the per-gene values are stored in
        ``self.ratios[key]`` (columns x = log2(gcov + 1), y = ratio) and a
        summary dict in ``self.stats[key]``, with key 'glc', 'ecc' or 'jcc'.
        A 'b__gidx' column is added to ``self.s1`` and, when missing, a
        '_gidx' column to ``self.s2``.
        """
        ov = self.ov  # all
        if self.exclude_se_from_completeness:
            ov = ov[ov['cat'] != 's']

        # actual overlap with correct strand
        # ('.' in b__gidx marks rows without an overlapping partner)
        ov2 = ov[(ov['b__gidx'] != '.') & (
            (ov['strand'] == ov['b_strand']) | (ov['b_strand'] == '.'))]
        if self.exclude_se_from_completeness:
            ov2 = ov2[ov2['b_cat'] != 's']

        gcovname = self.colname('gcov')
        g2gcov = UT.df2dict(self.e1, '_gidx', gcovname)
        xlim = [0, 6]
        # GLC
        # span of each gene (max end - min start over its rows)
        g1 = ov.groupby('_gidx')
        glc = (g1['ed'].max() - g1['st'].min()).to_frame('glen')
        # span per (gene, partner gene) pair, then the largest per gene
        g2 = ov2.groupby(['_gidx', 'b__gidx'])
        gl2 = (g2['ed'].max() -
               g2['st'].min()).to_frame('b_glen').reset_index()
        gl2 = gl2.groupby('_gidx')['b_glen'].max()
        g2gl2 = UT.series2dict(gl2)
        glc['b_glen'] = [g2gl2.get(x, 0) for x in glc.index]
        glc['y'] = glc['b_glen'] / glc['glen']
        glc['x'] = N.log2(N.array([g2gcov[x] for x in glc.index]) + 1.)
        self.ratios['glc'] = glc[['x', 'y']]
        x, y = glc['x'].values, glc['y'].values
        # only the threshold xth of the fit is used below
        x2, y2, xth = UT.fit_sigmoid(x, y, xlim, 0.99)
        auc, maxx, avgy, x, y = self._calc_binned(x, y, self.binsize)
        self.stats['glc'] = {
            'p1':
            N.sum(glc['b_glen'] > 0) / float(len(glc)),  # float ratio detected
            'auc': auc,  # float
            'maxx': list(maxx),  # list
            'avgy': list(avgy),  # list
            'xth': xth,  # float
        }

        # ECC
        # number of distinct exons per gene
        ecc = ov.groupby([
            '_gidx', '_id'
        ]).first().reset_index().groupby('_gidx').size().to_frame('#exons')
        # distinct exons per (gene, partner gene) pair, then the largest
        # count per gene
        ec2 = ov2.groupby(['_gidx', 'b__gidx', '_id']).first().reset_index()
        ec2 = ec2.groupby(['_gidx',
                           'b__gidx']).size().to_frame('ec').reset_index()
        ec2 = ec2.groupby('_gidx')['ec'].max()
        g2ec2 = UT.series2dict(ec2)
        ecc['b_#exons'] = [g2ec2.get(x, 0) for x in ecc.index]
        ecc['y'] = ecc['b_#exons'] / ecc['#exons']
        ecc['x'] = N.log2(N.array([g2gcov[x] for x in ecc.index]) + 1.)
        self.ratios['ecc'] = ecc[['x', 'y']]
        x, y = ecc['x'].values, ecc['y'].values
        x2, y2, xth = UT.fit_sigmoid(x, y, xlim, 0.99)
        auc, maxx, avgy, x, y = self._calc_binned(x, y, self.binsize)
        self.stats['ecc'] = {
            'p1': N.sum(ecc['b_#exons'] > 0) / float(len(ecc)),
            'auc': auc,
            'maxx': list(maxx),
            'avgy': list(avgy),
            'xth': xth
        }

        # JCC
        s1 = self.s1
        # number of junctions per gene
        jcc = s1.groupby('_gidx').size().to_frame('jc')
        if '_gidx' not in self.s2:  # adapt to old version where sj.txt.gz did not contain _gidx
            # derive the gene of a junction from the exons: acceptor id
            # first, then donor id, else 0
            a2g = UT.df2dict(self.e2, 'a_id', '_gidx')
            d2g = UT.df2dict(self.e2, 'd_id', '_gidx')
            self.s2['_gidx'] = [
                a2g.get(x, d2g.get(y, 0))
                for x, y in self.s2[['a_id', 'd_id']].values
            ]
        # partner gene of each junction, matched by locus ('.' = none)
        l2g2 = UT.df2dict(self.s2, 'locus', '_gidx')
        s1['b__gidx'] = [l2g2.get(x, '.') for x in s1['locus'].values]
        s1o = s1[s1['b__gidx'] != '.']  # overlapping
        # junctions per (gene, partner gene) pair, then the largest per gene
        jc2 = s1o.groupby(['_gidx',
                           'b__gidx']).size().to_frame('jc2').reset_index()
        jc2 = jc2.groupby('_gidx')['jc2'].max()
        g2jc2 = UT.series2dict(jc2)
        jcc['b_jc'] = [g2jc2.get(x, 0) for x in jcc.index]
        jcc['y'] = jcc['b_jc'] / jcc['jc']
        jcc['x'] = N.log2(N.array([g2gcov[x] for x in jcc.index]) + 1.)
        self.ratios['jcc'] = jcc[['x', 'y']]
        x, y = jcc['x'].values, jcc['y'].values
        x2, y2, xth = UT.fit_sigmoid(x, y, xlim, 0.99)
        auc, maxx, avgy, x, y = self._calc_binned(x, y, self.binsize)
        self.stats['jcc'] = {
            'p1': N.sum(jcc['b_jc'] > 0) / float(len(jcc)),
            'auc': auc,
            'maxx': list(maxx),
            'avgy': list(avgy),
            'xth': xth
        }
Exemplo n.º 15
0
    def find_match(self):
        """Find matching exons and junctions between ``self.en1`` and ``self.en2``.

        The exon tables of both models are written to temporary BED-like
        files and intersected with ``BT.bedtoolintersect(..., wao=True)``;
        the overlap table is kept as ``self.ov``.  Matching subsets are then
        selected per exon category and stored in ``self.ei``, ``self.e5``,
        ``self.e3``, ``self.es`` (same category required) and ``self.e5b``,
        ``self.e3b``, ``self.esb`` (any partner category).  Junctions of en1
        are matched to en2 by locus and stored in ``self.sj``.  All subsets
        are also collected in the dict ``self.e``.

        Side effect: a jhit column (named by ``self.colname2``) is added to
        the en1 junction table.
        """
        en1 = self.en1
        en2 = self.en2
        # write internal,3,5,se exons separately for finding match
        a = en1.fname2(
            'emtmp.ex.bed.gz', en2.code
        )  # need to be unique to avoid parallel conflict (en1 ref shared)
        b = en2.fname('emtmp.ex.bed.gz')
        c = en1.fname2('emtmp.ex.ovl.txt.gz', en2.code)
        self.e1 = e1 = en1.model('ex')
        self.e2 = e2 = en2.model('ex')
        ecovname = self.colname('ecov')
        cols = [
            'chr', 'st', 'ed', 'cat', '_id', ecovname, '_gidx', 'len', 'strand'
        ]
        a = UT.write_pandas(e1[cols], a, '')
        b = UT.write_pandas(e2[cols], b, '')
        c = BT.bedtoolintersect(a, b, c, wao=True)
        # columns of the en2 side carry a 'b_' prefix; 'ovl' is the last one
        ocols = cols + ['b_' + x for x in cols] + ['ovl']
        self.ov = ov = UT.read_pandas(c, names=ocols)  # overlaps of exons

        # Boolean masks over the overlap table.  The names of these locals
        # are looked up through locals() in the debug loop below, so they
        # must not be renamed.
        idxchr = ov['chr'] == ov['b_chr']  # str vs. str
        idxstrand = ov['strand'] == ov['b_strand']  # str vs. str
        idxp = (ov['strand'] == '+') & idxstrand
        idxn = (ov['strand'] == '-') & idxstrand
        idxst = ov['st'] == ov['b_st']  # b_st column mixed? type?
        idxed = ov['ed'] == ov['b_ed']  # b_ed column mixed? type?
        idxcat = ov['cat'] == ov['b_cat']
        idxcov = ov[ecovname] > 0  # exons with reads
        LOG.debug(
            '=' * 10 +
            'calculating match between {0} and {1}'.format(en1.code, en2.code))
        LOG.debug('len(ov):{0}'.format(len(ov)))
        # log how many rows satisfy each mask
        for k in [
                'idxchr', 'idxstrand', 'idxp', 'idxn', 'idxst', 'idxed',
                'idxcat', 'idxcov'
        ]:
            v = locals()[k]
            LOG.debug('#{0}:{1}'.format(k, N.sum(v)))

        # internal exon cat='i' and chr,st,ed,strand match
        self.ei = ei = ov[idxchr & idxstrand & idxst & idxed & idxcat &
                          (ov['cat'] == 'i')].copy()
        # 5' cat='5' and chr,donor (+,ed)|(-,st) match, find closest
        self.e5 = e5 = ov[idxchr & ((idxp & idxed) | (idxn & idxst)) & idxcat &
                          (ov['cat'] == '5')].copy()
        # 3' cat='3' and chr,acceptor (+,st)|(-,ed) match
        self.e3 = e3 = ov[idxchr & ((idxn & idxed) | (idxp & idxst)) & idxcat &
                          (ov['cat'] == '3')].copy()
        # se cat='s' and chr, same category
        self.es = es = ov[idxchr & (ov['cat'] == 's') & idxcat].copy()

        # allow overlap to other categories
        # 5' cat='5' and chr,donor (+,ed)|(-,st) match
        self.e5b = e5b = ov[idxchr & ((idxp & idxed) | (idxn & idxst)) &
                            (ov['cat'] == '5')].copy()
        # 3' cat='3' and chr,acceptor (+,st)|(-,ed) match
        self.e3b = e3b = ov[idxchr & ((idxn & idxed) | (idxp & idxst)) &
                            (ov['cat'] == '3')].copy()
        # se cat='s' and chr,
        self.esb = esb = ov[idxchr & (ov['cat'] == 's')].copy()

        # splice junction
        self.s1 = s1 = en1.model('sj')
        self.s2 = s2 = en2.model('sj')
        jcntname = self.colname('jcnt')
        # en2 junction count by locus; loci absent from en2 count as 0
        l2c = UT.df2dict(s2, 'locus', jcntname)
        jhitname = self.colname2('jhit', en2.code)
        s1[jhitname] = [l2c.get(x, 0)
                        for x in s1['locus']]  # corresponding s2 count
        self.sj = sj = s1[
            s1[jhitname] > 0].copy()  # only consider s2 count > 0

        # for batch processing
        self.e = {
            'i': ei,
            '5': e5,
            '3': e3,
            's': es,
            'j': sj,
            '5b': e5b,
            '3b': e3b,
            'sb': esb
        }
Exemplo n.º 16
0
def _process_mapbed_chr(dstpre, chrom, genome, chromdir, stranded):
    """Turn one chromosome's mapped-read BED into coverage wiggles and a splice-path BED.

    Input is ``dstpre + '.<chrom>.bed'``, read as 7 tab-separated columns:
    chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6).  Consecutive
    records with the same mapid are treated as segments of one spliced read;
    the gap between the previous segment's end and the current segment's
    start is recorded as a junction.

    Outputs (any pre-existing file at these paths is removed first):
        * ``dstpre + kind + suf + strand + '.<chrom>.wig'`` for every
          kind in ('.ex', '.sj'), strand in ('.p', '.n', '.u') and
          suf in ('', '.uniq'), written with ``cybw.array2wiggle_chr64``.
        * ``dstpre + '.<chrom>.sjpath.bed'`` with one row per path code.

    Args:
        dstpre: path prefix shared by the input BED, the
            ``.dupitems.txt.gz`` table and all outputs.
        chrom: chromosome name.
        genome: genome identifier handed to ``UT.chromdf`` to look up the
            chromosome size.
        chromdir: directory handed to ``FA.GenomeFASTAChroms``.
        stranded: combined with each read's strand as the key into
            ``STRANDMAP`` to choose the exon-coverage strand bucket.

    Returns:
        None.
    """
    bedpath = dstpre+'.{0}.bed'.format(chrom)
    # names listed in the dupitems table are counted as non-unique reads
    dupids = UT.read_pandas(dstpre+'.dupitems.txt.gz', index_col=[0]).index
    gfc = FA.GenomeFASTAChroms(chromdir)
    chromsize = UT.df2dict(UT.chromdf(genome), 'chr', 'size')[chrom]

    # one chromosome-length accumulator per (kind, strand, suffix)
    wigs = {}
    wigpaths = {}
    for kind in ['.ex','.sj']:
        wigs[kind] = {}
        wigpaths[kind] = {}
        for strand in ['.p','.n','.u']:
            wigs[kind][strand] = {}
            wigpaths[kind][strand] = {}
            for suf in ['','.uniq']:
                wigpath = dstpre+kind+suf+strand+'.{0}.wig'.format(chrom)
                # delete previous output
                if os.path.exists(wigpath):
                    os.unlink(wigpath)
                wigpaths[kind][strand][suf] = wigpath
                wigs[kind][strand][suf] = N.zeros(chromsize, dtype=float)

    sjs = [] # path: (chr, st, ed, pcode, ucnt, strand, tst, ted, acnt, cse)
    # pcode = a(apos)d(dpos) = a(ed)d(st) if strand=='+' else a(st)d(ed)
    # ucnt = unique read counts
    # acnt = multi-read adjusted all counts (=ucnt+Sum(mcnt(i)/dup(i)))
    # delete previous
    sjbed12 = dstpre+'.{0}.sjpath.bed'.format(chrom)
    if os.path.exists(sjbed12):
        os.unlink(sjbed12)

    def _write_arrays():
        # dump every accumulator to its wiggle file
        for kind in ['.ex','.sj']:
            for strand in ['.p','.n','.u']:
                for suf in ['','.uniq']:
                    cybw.array2wiggle_chr64(wigs[kind][strand][suf], chrom,  wigpaths[kind][strand][suf], 'w')

    def _write_sj(sjs):
        # sjs = [(chr,st,ed,pathcode(name),ureads(sc1),strand,tst,ted,areads(sc2),cse),...]
        sjdf = PD.DataFrame(sjs, columns=GGB.BEDCOLS[:9]+['cse'])
        # collapse records sharing a path code: sum counts, take the widest span
        sjdfgr = sjdf.groupby('name')
        sj = sjdfgr.first()
        sj['sc1'] = sjdfgr['sc1'].sum().astype(int) # ucnt
        sj['sc2'] = sjdfgr['sc2'].sum().astype(int) # jcnt=ucnt+mcnt
        sj['st'] = sjdfgr['st'].min()
        sj['ed'] = sjdfgr['ed'].max()
        sj['#exons'] = sj['cse'].apply(len)+1
        # block starts/ends relative to st, derived from the junction list
        sj['ests'] = [[0]+[z[1]-st for z in cse] for st,cse in sj[['st','cse']].values]
        sj['eeds'] = [[z[0]-st for z in cse]+[ed-st] for st,ed,cse in sj[['st','ed','cse']].values]
        esizes = [[u-v for u,v in zip(x,y)] for x,y in sj[['eeds','ests']].values]
        sj['estarts'] = ['{0},'.format(','.join([str(y) for y in x])) for x in sj['ests']]
        sj['esizes'] = ['{0},'.format(','.join([str(y) for y in x])) for x in esizes]
        sj['name'] = sj.index
        with open(sjbed12, 'w') as f:
            sj[GGB.BEDCOLS].to_csv(f, index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)

    def _append_sj(cse, css, csj, chrom,ureads,areads):
        if (len(cse)>0): # spits out splice rec
            # chr,st,ed,pathcode,ureads,strand,tst,ted,areads
            tst = cse[0][0]
            ted = cse[-1][1]
            if len(css)>0:
                # majority vote over the strands called at each junction
                strand = Counter(css).most_common()[0][0]
            else:
                strand = '.'
            name = pathcode(cse, strand)
            st = int(csj[0][1]) # first segment start
            ed = int(csj[-1][2]) # last segment end
            sjs.append((chrom,st,ed,name,ureads,strand,tst,ted,areads,cse))

    def _add_to_ex_arrays(st,ed,dup,strand):
        kind='.ex'
        strand = STRANDMAP[(strand,stranded)]
        dic = wigs[kind][strand]
        dic[''][st:ed] += 1
        if not dup:
            dic['.uniq'][st:ed] += 1

    def _add_to_sj_arrays(sst,sed,dup,strand):
        kind='.sj'
        s = {'+':'.p','-':'.n','.':'.u'}[strand]
        dic = wigs[kind][s]
        # add to the arrays
        dic[''][sst:sed] += 1
        if not dup:
            dic['.uniq'][sst:sed] += 1
            ureads,areads = 1,1
        else:
            ureads,areads = 0,1
        return ureads,areads

    csj = [] # current collection of spliced reads
    css = [] # current strands
    cse = [] # current (sst,sed)
    ureads,areads = 1,1 # uniq, total reads it's either 1,1 or 0,1
    pmid = None # previous map id common to spliced segments
    # The input is opened only for the duration of the scan so the handle is
    # always released, even when a record fails to parse.
    with open(bedpath,'rb') as fp:
        for line in fp:
            rec = line.strip().split(b'\t')
            # 7 column bed: chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6)
            st,ed = int(rec[1]),int(rec[2])
            # NOTE(review): rec fields are bytes (binary read); confirm that
            # dupids and the STRANDMAP keys use the same type, not visible here.
            dup = rec[3] in dupids
            estrand = rec[5]
            _add_to_ex_arrays(st,ed,dup,estrand)
            # process splice
            if pmid != rec[6]: # new map
                _append_sj(cse, css, csj, chrom, ureads, areads)
                # rebind (not clear): the appended tuple keeps the old cse list
                csj,css,cse = [rec],[],[] # reset running params
            else: # add segments
                csj.append(rec)
                prec = csj[-2] # previous rec
                sst = int(prec[2]) # ed of previous segment
                sed = int(rec[1]) # st of current segment
                cse.append((sst,sed))
                # find strand from the two bases at each end of the gap
                sted = gfc.get(chrom,sst,sst+2)+gfc.get(chrom,sed-2,sed)
                strand = STED2STRAND.get(sted,'.')
                if strand != '.':
                    css.append(strand)
                ureads,areads = _add_to_sj_arrays(sst,sed,dup,strand)
            pmid = rec[6]

    # flush the last read's junctions
    _append_sj(cse, css, csj, chrom, ureads, areads)

    _write_arrays()
    _write_sj(sjs)
Exemplo n.º 17
0
def count_repeats_viz_mp(beddf,
                         rmskvizpath,
                         idcol='_id',
                         np=3,
                         prefix=None,
                         expand=0,
                         col='repnames'):
    """Use rmsk-viz track and check each (unioned) exon overlaps with repeats and report repeat name(s).
    Uses Bedtools and calculates chromosome-wise.

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols, when calculating repeats bp
         for genes, unioned bed should be used (use utils.make_unionex)
        rmskvizpath: path to repeat masker viz BED7 file (created using rmskviz2bed7)
        idcol: colname for unique row id (default _id)
        np: number of CPU to use
        prefix: path prefix for temp file, if not None temp files are kept. (default None)
        expand: how many bases to expand exon region in each side (default 0)
        col: column name to put in overlapping repeat names (if multiple comma separated)

    Returns:
        beddf itself, modified in place with the new column ``col``
        (default repnames).  An empty beddf gets an empty column and no
        files are read or written.

    Side effects:
        Per-chromosome splits of the repeat track are written next to
        rmskvizpath as ``<rmskvizpath>.<chrom>.bed.gz`` when any of them is
        missing; these are kept for reuse.  Temp target/output files are
        removed when prefix is None, also when processing fails.
    """
    # chrom-wise
    chroms = sorted(beddf['chr'].unique())
    if not chroms:
        # Nothing to intersect; without this guard PD.concat([]) below raises.
        beddf[col] = []
        return beddf

    cleanup = prefix is None
    if cleanup:
        prefix = os.path.join(os.path.dirname(rmskvizpath),
                              str(uuid.uuid4()) + '_')

    # check whether rmskviz is already split
    splitrmsk = any(
        not os.path.exists(rmskvizpath + '.{0}.bed.gz'.format(chrom))
        for chrom in chroms)
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args = []
    bfiles = []
    ofiles = []
    try:
        for chrom in chroms:
            bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
            rpath = rmskvizpath + '.{0}.bed.gz'.format(chrom)  # reuse
            opath = prefix + 'out.{0}.bed'.format(chrom)
            # register before writing so a partial file is still cleaned up
            bfiles.append(bpath)
            ofiles.append(opath)
            if expand > 0:
                bchr = beddf[beddf['chr'] == chrom].copy()
                bchr['st'] = bchr['st'] - expand
                bchr['ed'] = bchr['ed'] + expand
                bchr.loc[bchr['st'] < 0, 'st'] = 0  # clamp at chromosome start
            else:
                bchr = beddf[beddf['chr'] == chrom]
            UT.write_pandas(bchr[['chr', 'st', 'ed', idcol]], bpath, '')
            if splitrmsk:
                rchr = rmsk[rmsk['chr'] == chrom]
                UT.write_pandas(rchr[['chr', 'st', 'ed', 'name', 'strand']],
                                rpath, '')
            args.append([bpath, rpath, opath])

        # workers write their results to the opath files; return value unused
        UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

        # gather outputs
        cols = ['name', 'repnames']
        outs = [UT.read_pandas(f, names=cols) for f in ofiles]
        df = PD.concat(outs, ignore_index=True)
        df['name'] = df['name'].astype(str)
        i2rn = UT.df2dict(df, 'name', 'repnames')
        # NOTE(review): raises KeyError if a row id is absent from the worker
        # output; presumably every id is reported - confirm against
        # count_repeats_viz_chr.
        beddf[col] = [i2rn[str(x)] for x in beddf[idcol]]
    finally:
        if cleanup:
            for f in bfiles + ofiles:
                if os.path.exists(f):
                    os.unlink(f)

    return beddf