예제 #1
0
파일: merge2.py 프로젝트: king1212/jGEM
 def __init__(self, modelpre, bwpre, chrom, st, ed, dstpre, tcovth):
     """Initialise the assembler for one region and load its model tables.

     Args:
         modelpre: path prefix of the model files
             ('.paths.withse.bed.gz', '.sjdf.txt.gz', '.exdf.txt.gz')
         bwpre: path prefix handed to A2.LocalAssembler and used for
             the '.sjpath.bed.gz' file
         chrom: chromosome of the region
         st: region start
         ed: region end
         dstpre: destination prefix handed to A2.LocalAssembler
         tcovth: threshold stored on the instance as ``self.tcovth``

     Every table is restricted to rows on ``chrom`` whose start column
     is >= ``st`` and whose end column is <= ``ed``.
     """
     self.modelpre = modelpre
     self.tcovth = tcovth
     A2.LocalAssembler.__init__(
         self, bwpre, chrom, st, ed, dstpre, refcode=None)

     def _inside(table, stcol, edcol):
         # rows fully contained in [st, ed] on this chromosome (copied)
         mask = ((table['chr'] == chrom)
                 & (table[stcol] >= st)
                 & (table[edcol] <= ed))
         return table[mask].copy()

     # model paths (bed12)
     model_paths = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
     self.paths = _inside(model_paths, 'tst', 'ted')

     # junction paths of the sample
     sample_sj = GGB.read_bed(bwpre + '.sjpath.bed.gz')
     self.sjpaths0 = _inside(sample_sj, 'tst', 'ted')

     # junction / exon tables of the model
     all_sjdf = UT.read_pandas(modelpre + '.sjdf.txt.gz', names=A2.SJDFCOLS)
     all_exdf = UT.read_pandas(modelpre + '.exdf.txt.gz', names=A2.EXDFCOLS)
     self.sjdf = _inside(all_sjdf, 'st', 'ed')
     self.exdf = _inside(all_exdf, 'st', 'ed')

     # NOTE(review): set_ad_pos presumably adds acceptor/donor position
     # columns in place -- confirm against A2.
     A2.set_ad_pos(self.sjdf, 'sj')
     A2.set_ad_pos(self.exdf, 'ex')
예제 #2
0
파일: merge2.py 프로젝트: king1212/jGEM
def filter_sj(bwsjpre, statspath, chrom, csize, params):
    """Filter the junction paths of one chromosome and write them as BED12.

    Args:
        bwsjpre: path prefix of the sjpath BED files and of the bigwigs
            opened through A2.SjExBigWigs
        statspath: path to the junction statistics table
        chrom: chromosome to process
        csize: end coordinate used when fetching coverage for ``chrom``
        params: dict providing 'th_detected', 'th_maxcnt',
            'th_maxoverhang', 'th_minedgeexon' and 'th_sjratio2'

    The result is written to
    ``bwsjpre + '.sjpath.<chrom>.filtered.bed.gz'``; nothing is returned.
    """
    # ---- junction statistics -------------------------------------------
    jstats = UT.read_pandas(statspath)
    if 'chr' not in jstats:
        jstats['chr'] = [loc.split(':')[0] for loc in jstats['locus']]
    if '#detected' in jstats:
        jstats.rename(columns={'#detected': 'detected'}, inplace=True)
    jstats = jstats[jstats['chr'] == chrom].copy()
    if 'pc' not in jstats:
        jstats['pc'] = [locus2pc(loc) for loc in jstats['locus']]
    flds = ['detected', 'maxcnt', 'maxoverhang']
    lookup = {}
    for fld in flds:
        lookup[fld] = UT.df2dict(jstats, 'pc', fld)

    # ---- junction paths (per-chromosome file preferred) ----------------
    chrom_bed = bwsjpre + '.sjpath.{0}.bed.gz'.format(chrom)
    dstpath = bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(chrom)
    if not os.path.exists(chrom_bed):
        whole = GGB.read_bed(bwsjpre + '.sjpath.bed.gz')
        sj = whole[whole['chr'] == chrom].copy()
    else:
        sj = GGB.read_bed(chrom_bed)

    # more ','-separated than '|'-separated items => first and last items
    # are attached exons; strip them
    first_name = sj.iloc[0]['name']
    if first_name.count('|') < first_name.count(','):
        sj['name'] = [','.join(nm.split(',')[1:-1]) for nm in sj['name']]

    # keep stranded records only
    stranded = sj['strand'].isin(['+', '-'])
    sj = sj[stranded].copy()

    # a path gets the minimum stat over its junctions (0 when unknown)
    for fld in flds:
        per_pc = lookup[fld]
        sj[fld] = [
            N.min([per_pc.get(pc, 0) for pc in nm.split(',')])
            for nm in sj['name']
        ]
        sj = sj[sj[fld] > params['th_' + fld]].copy()

    # first / last exon sizes (esizes ends with a trailing comma, hence -2)
    sj['eflen'] = [int(sizes.split(',')[0]) for sizes in sj['esizes']]
    sj['ellen'] = [int(sizes.split(',')[-2]) for sizes in sj['esizes']]
    eth = params['th_minedgeexon']
    long_edges = (sj['eflen'] > eth) & (sj['ellen'] > eth)
    sj = sj[long_edges].copy()

    # sjratio2 = sc1 / mean(sj + ex coverage over [tst, ted))
    sjexbw = A2.SjExBigWigs(bwsjpre, mixunstranded=False)
    for strand in ['+', '-']:
        on_strand = sj['strand'] == strand
        with sjexbw:
            sj_cov = sjexbw.bws['sj'][strand].get(chrom, 0, csize)
            ex_cov = sjexbw.bws['ex'][strand].get(chrom, 0, csize)
        total = sj_cov + ex_cov
        ratios = []
        for cnt, beg, end in sj[on_strand][['sc1', 'tst', 'ted']].values:
            ratios.append(cnt / N.mean(total[int(beg):int(end)]))
        sj.loc[on_strand, 'sjratio2'] = ratios
    sj = sj[sj['sjratio2'] > params['th_sjratio2']]
    GGB.write_bed(sj, dstpath, ncols=12)
예제 #3
0
파일: evaluate.py 프로젝트: king1212/jGEM
    def model(self, which, code2=None):
        """Return a model dataframe (junction/exon/chopped intervals).

        The dataframe is cached as an attribute named ``which`` on the
        instance, so later calls return the cached object.

        Args:
            which: one of 'sj', 'ex', 'ci'
            code2: forwarded to ``self.modelpath`` when building file paths

        Returns:
            The requested dataframe.  For 'ci', when the file is missing
            but the 'ex' file exists, the chopped intervals are generated
            with ``UT.chopintervals`` and returned.

        Raises:
            RuntimeError: if the needed file does not exist.
        """
        if hasattr(self, which):  # cached
            return getattr(self, which)

        path = self.modelpath(which, code2)
        if os.path.exists(path):
            df = GGB.read_bed(path) if which == 'ci' else UT.read_pandas(path)
            setattr(self, which, df)
            return df

        # file is missing: only 'ci' can be derived (from 'ex')
        if which != 'ci':
            raise RuntimeError('file {0} does not exist'.format(path))
        expath = self.modelpath('ex', code2)
        if not os.path.exists(expath):
            raise RuntimeError('file {0} does not exist'.format(expath))
        # load 'ex' with the same code2 that was used for the existence
        # check, and return the freshly built table instead of None
        self.ci = UT.chopintervals(self.model('ex', code2), path)
        return self.ci
예제 #4
0
파일: repeats.py 프로젝트: king1212/jGEM
def filter_paths(mdstpre, rdstpre):
    """Copy path files from ``mdstpre`` to ``rdstpre``, keeping known exons.

    A path is kept only when every '|'-separated element of its ``name``
    is the name of an exon on the same chromosome in
    ``rdstpre + '.ex.txt.gz'``.  Both '.paths.withse.bed.gz' and
    '.paths.txt.gz' are processed and written with 12 columns.

    Args:
        mdstpre: prefix of the source path files
        rdstpre: prefix of the exon table and of the output files
    """
    ex = UT.read_pandas(rdstpre + '.ex.txt.gz')

    def _keep_known(paths, ex):
        # evaluate chromosome by chromosome; names are matched per chromosome
        kept = []
        for chrom in paths['chr'].unique():
            chrom_paths = paths[paths['chr'] == chrom]
            known = set(ex[ex['chr'] == chrom]['name'].values)
            mask = [
                all(part in known for part in nm.split('|'))
                for nm in chrom_paths['name']
            ]
            kept.append(chrom_paths[mask])
        return PD.concat(kept, ignore_index=True)

    for suffix in ('.paths.withse.bed.gz', '.paths.txt.gz'):
        src = GGB.read_bed(mdstpre + suffix)
        GGB.write_bed(_keep_known(src, ex), rdstpre + suffix, ncols=12)
예제 #5
0
def test_sjtab2sjbed(sampleinfo, datadir, outdir):
    """Check that GGB.sjtab2sjbed writes a BED matching what it returns.

    Uses the first record of ``sampleinfo``: converts its SJ table, checks
    the output file exists, that the number of junctions equals the number
    of rows in the input table, and that reading the written BED back gives
    the same values as the first seven BED columns of the returned frame.
    """
    rec = sampleinfo.iloc[0]
    sjtab = os.path.join(datadir, 'SJ', rec['sjtab'])
    sjbed = os.path.join(outdir, rec['sjbed'])
    aligned = rec['aligned']
    sj = GGB.sjtab2sjbed(sjtab, sjbed, aligned)
    assert os.path.exists(sjbed)
    SJCOLS = [
        'chr', 'st', 'ed', 'strand2', 'motif', 'annotated', 'ureads', 'mreads',
        'maxoverhang'
    ]
    sji = PD.read_table(sjtab, names=SJCOLS)
    assert len(sj) == len(sji)
    sjo = GGB.read_bed(sjbed)
    # builtin all() over a DataFrame iterates its column labels (always
    # truthy), so reduce the element-wise comparison explicitly instead
    assert (sj[GGB.BEDCOLS[:7]] == sjo).values.all()
예제 #6
0
파일: calccov.py 프로젝트: king1212/jGEM
def calc_glen(ex, cipath):
    """Return a dict mapping each ``_gidx`` to its summed interval length.

    Args:
        ex: exon dataframe with '_id' and '_gidx' columns; when it has no
            'cid' column one is added in place (list of interval ids per
            exon)
        cipath: path to the chopped-interval BED (name: comma separated
            exon ids, sc1: interval id)

    Each (interval, gene) pair is counted once, so an interval shared by
    several exons of the same gene contributes its length a single time.
    """
    ci = GGB.read_bed(cipath)  # 5 col bed, name:eids, sc1:cid
    ci['len'] = ci['ed'] - ci['st']
    ci['cid'] = ci['sc1']
    cid2len = dict(UT.izipcols(ci, ['cid', 'len']))

    if 'cid' not in ex.columns:
        # invert interval -> exon ids into exon id -> [interval ids]
        eid2cids = {}
        for cid, eids in ci[['cid', 'name']].values:
            for eid in eids.split(','):
                eid2cids.setdefault(int(eid), []).append(cid)
        ex['cid'] = [eid2cids[eid] for eid in ex['_id']]

    # unique (interval, gene) pairs
    pairs = {
        (cid, gidx)
        for gidx, cids in UT.izipcols(ex, ['_gidx', 'cid'])
        for cid in cids
    }
    df = PD.DataFrame(list(pairs), columns=['cid', '_gidx'])
    df['len'] = [cid2len[cid] for cid in df['cid']]
    glen = df.groupby('_gidx')['len'].sum()
    return dict(zip(glen.index, glen.values))
예제 #7
0
파일: calccov.py 프로젝트: king1212/jGEM
def calc_cov_mp(bed, bwname, fname, np, which='cov'):
    """Compute a per-row value from a bigwig, chromosome by chromosome.

    Args:
        bed: dataframe with a 'chr' column, or a path readable by
            GGB.read_bed
        bwname: bigwig path handed to the worker
        fname: output path; the result is saved there as TSV
        np: number of processes; 1 runs everything in this process
        which: 'cov' (worker_cov) or 'max' (worker_max); also the name of
            the column appended to the output

    Returns:
        Dataframe with the columns of ``bed`` plus ``which``.

    Raises:
        ValueError: if ``which`` is neither 'cov' nor 'max'.
    """
    # reject unknown modes up front instead of failing later on an
    # unbound ``worker``
    if which == 'cov':
        worker = worker_cov
    elif which == 'max':
        worker = worker_max
    else:
        raise ValueError(
            "unknown which {0!r}, should be 'cov' or 'max'".format(which))

    if UT.isstring(bed):
        bed = GGB.read_bed(bed)
    cols = list(bed.columns) + [which]
    chroms = bed['chr'].unique()
    cdir = os.path.dirname(__file__)
    data = [(bed[bed['chr'] == c].copy(), bwname, c, cdir) for c in chroms]
    recs = []
    if np == 1:
        for arg in data:
            LOG.debug('{1} calculation: processing {0}...'.format(
                arg[-2], which))
            recs += worker(*arg)
    else:
        LOG.debug('{1} calculation: np={0}'.format(np, which))
        # create the pool before the try block: if creation fails there is
        # nothing to close and ``p`` would be unbound in ``finally``
        p = multiprocessing.Pool(np)
        try:
            rslts = p.map(mp_worker, zip(repeat(worker), data))
            for v in rslts:
                recs += v
            LOG.debug('done {1} calculation: np={0}'.format(np, which))
        finally:
            LOG.debug('closing pool')
            p.close()
    LOG.debug('writing rslts...')
    df = PD.DataFrame(recs, columns=cols)
    UT.save_tsv_nidx_whead(df, fname)
    return df
예제 #8
0
파일: merge2.py 프로젝트: king1212/jGEM
def estimatecovs(modelpre, bwpre, dstpre, genome, tcovth=1, np=6):
    """Run ``bundle_estimator`` over batches of regions and merge results.

    Args:
        modelpre: prefix of the model ('.paths.withse.bed.gz' is read here)
        bwpre: bigwig prefix forwarded to each bundle
        dstpre: destination prefix forwarded to each bundle and to
            ``concatenate_bundles``
        genome: genome name used to look up chromosome sizes
        tcovth: threshold forwarded to each bundle (default 1)
        np: number of processes for ``UT.process_mp`` (default 6)

    For each chromosome the path intervals are unioned into contiguous
    regions; regions are grouped in batches of about 1000 and each batch
    becomes one bundle spanning from 100 bases before its first region to
    100 bases after its last, clipped to [0, chromosome size].
    """
    bed = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    chroms = bed['chr'].unique()
    csizedic = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    batch = 1000  # total about 30K => make batch of ~1000
    margin = 100
    bundles, args = [], []
    for chrom in chroms:
        on_chrom = bed[(bed['chr'] == chrom)]
        regions = UT.union_contiguous(
            on_chrom[['chr', 'st', 'ed']], returndf=True)
        last = len(regions) - 1
        for first in range(0, len(regions), batch):
            # NOTE(review): the closing index is inclusive and equals the
            # next batch's first index, so adjacent bundles share one
            # region -- confirm this overlap is intended.
            closing = min(first + batch, last)
            st = max(regions.iloc[first]['st'] - margin, 0)
            ed = min(regions.iloc[closing]['ed'] + margin, csizedic[chrom])
            args.append([modelpre, bwpre, chrom, st, ed, dstpre, tcovth])
            bundles.append((chrom, st, ed))

    UT.process_mp(bundle_estimator, args, np=np, doreduce=False)

    concatenate_bundles(bundles, dstpre)
예제 #9
0
def make_sjexci(path, np):
    """Build junction, exon and chopped-interval tables from a gene model.

    Args:
        path: model file ending in .gtf, .bed (bed12) or .txt (UCSC
            knownGene/refGene download), optionally followed by .gz
        np: number of processes forwarded to the converter

    Returns:
        (sj, ex) dataframes.  They are also written next to the input as
        '<prefix>.sj.txt.gz' and '<prefix>.ex.txt.gz', and
        ``UT.chopintervals`` is called with '<prefix>.ci.txt.gz'.

    Raises:
        ValueError: unknown extension, missing file, or a .txt file whose
            name contains neither 'knownGene' nor 'refGene'.
    """
    bpath = path[:-3] if path[-3:] == '.gz' else path
    ext = bpath[-4:]
    if ext not in ['.gtf', '.bed', '.txt']:
        raise ValueError('unknown filetype {0}, should be either .gtf,.bed (bed12),.txt (ucsc knownGene)'.format(ext))
    pathprefix = bpath[:-4]

    if not os.path.exists(path):
        # report the missing path, not just its extension
        raise ValueError('{0} file does not exist'.format(path))

    if ext == '.gtf':
        df = GGB.read_gtf(path).sort_values(['chr',])
        sj, ex = gtf2exonsj(df, np=np)
    elif ext == '.bed':
        df = GGB.read_bed(path)
        sj, ex = bed2exonsj(df, np=np)
    else:  # '.txt': UCSC download
        if 'knownGene' in path:
            df = GGB.read_ucsc_knownGene(path)
        elif 'refGene' in path:
            df = GGB.read_ucsc_refGene(path)
        else:
            # without this, sj/ex would be unbound further down
            raise ValueError(
                'unknown UCSC table {0}, file name should contain '
                'knownGene or refGene'.format(path))
        sj, ex = kg2exonsj(df, np=np)  # refGene is handled same as kg

    # save
    LOG.info('saving sj to {0}'.format(pathprefix+'.sj.txt.gz'))
    UT.write_pandas(sj, pathprefix+'.sj.txt.gz', 'h')
    LOG.info('saving ex to {0}'.format(pathprefix+'.ex.txt.gz'))
    UT.write_pandas(ex, pathprefix+'.ex.txt.gz', 'h')

    # make ci (called with the ci path; the return value is not needed here)
    UT.chopintervals(ex, pathprefix+'.ci.txt.gz')
    return sj, ex
예제 #10
0
def gtf_from_bed12(modelpre, dstpath=None, source='.'):
    """Write a GTF of exon records from a model's BED12 paths.

    Args:
        modelpre: model prefix; reads '.paths.withse.bed.gz' and
            '.ex.txt.gz', writes '.idmap.txt.gz'
        dstpath: output GTF path; defaults to
            ``modelpre + '.paths.withse.gtf.gz'``
        source: value of the GTF source column (default '.')

    Returns:
        The GTF dataframe (one row per exon).

    Each path gets the gene name of its first exon and a transcript name
    '<gname>.<n>' where n counts paths of that gene starting at 1.
    """
    bedpath = modelpre+'.paths.withse.bed.gz'
    paths = GGB.read_bed(bedpath)
    ex = UT.read_pandas(modelpre+'.ex.txt.gz')
    # key exons by chr:name, there may be same st,ed in different chromosome
    ex['id'] = ex['chr']+':'+ex['name']
    n2gn = UT.df2dict(ex, 'id', 'gname')
    paths['id'] = paths['chr']+':'+paths['name']
    paths['id0'] = paths['chr']+':'+paths['name'].str.split('|').str[0]
    paths['gname'] = [n2gn[x] for x in paths['id0']]

    # number the transcripts within each gene
    g2cnt = {}
    tnames = []
    for gname in paths['gname']:
        i = g2cnt.get(gname, 1)
        tnames.append('{0}.{1}'.format(gname, i))
        g2cnt[gname] = i+1
    paths['tname'] = tnames

    txt = 'gene_id "{0}"; transcript_id "{1}"; exon_number "{2}";'

    def _gen():
        cols = ['chr','st','ed','gname','tname','esizes','estarts','strand']
        for c, s, e, gn, tn, esi, esta, strand in paths[cols].values:
            # both fields end with a trailing comma, drop the empty tail
            esizes = [int(x) for x in esi.split(',')[:-1]]
            estarts = [int(x) for x in esta.split(',')[:-1]]
            for i, (size, offset) in enumerate(zip(esizes, estarts)):
                exst = s+offset
                exed = exst+size
                extra = txt.format(gn, tn, i+1)
                # GTF start is 1-based
                yield (c, source, 'exon', exst+1, exed, '.', strand, '.', extra)

    df = PD.DataFrame(list(_gen()), columns=GGB.GTFCOLS)
    if dstpath is None:
        # previously referenced an undefined name here (NameError)
        dstpath = modelpre+'.paths.withse.gtf.gz'
    GGB.write_gtf(df, dstpath)

    idf = paths[['id','chr','name','tname','gname']]
    UT.write_pandas(idf, modelpre+'.idmap.txt.gz','h')
    return df
예제 #11
0
def test_bed2exonsj(testbed12):
    """Smoke test: convert a BED12 file and print the first ten rows of each table."""
    sj, ex = CV.bed2exonsj(GGB.read_bed(testbed12))
    for table in (sj, ex):
        print(table.iloc[:10])
예제 #12
0
파일: conftest.py 프로젝트: king1212/jGEM
def sj(sjbed):
    """Return the first 5000 rows of the junction BED read from ``sjbed``."""
    return GGB.read_bed(sjbed).iloc[:5000]
예제 #13
0
 def read_bed(self, suffix, category='read'):
     """Read the BED file whose path is ``self.bedname(suffix, category)``."""
     bedpath = self.bedname(suffix, category)
     return GGB.read_bed(bedpath)
예제 #14
0
파일: repeats.py 프로젝트: king1212/jGEM
def count_repeats_viz_mp(beddf,
                         rmskvizpath,
                         idcol='_id',
                         np=3,
                         prefix=None,
                         expand=0,
                         col='repnames'):
    """Annotate each row of ``beddf`` with the names of overlapping repeats.

    Work is done chromosome by chromosome: per-chromosome target and
    repeat files are written and ``count_repeats_viz_chr`` is run on each
    through ``UT.process_mp``.  (The original note says Bedtools is used;
    that happens, if at all, inside the per-chromosome worker.)

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols; when calculating
            repeats bp for genes, unioned bed should be used
            (use utils.make_unionex)
        rmskvizpath: path to repeat masker viz BED7 file (created using
            rmskviz2bed7)
        idcol: colname for unique row id (default _id)
        np: number of CPU to use
        prefix: path prefix for temp files; if not None the temp files are
            kept (default None: a random prefix next to ``rmskvizpath`` is
            used and the temp files are deleted)
        expand: how many bases to expand each region on each side
            (default 0); starts are clipped at 0
        col: column name receiving the repeat names (default repnames)

    Returns:
        ``beddf`` itself, with column ``col`` added.
    """
    cleanup = prefix is None
    if cleanup:
        prefix = os.path.join(os.path.dirname(rmskvizpath),
                              str(uuid.uuid4()) + '_')

    def _rmsk_chr_path(chrom):
        # per-chromosome repeat file, reused across runs
        return rmskvizpath + '.{0}.bed.gz'.format(chrom)

    chroms = sorted(beddf['chr'].unique())

    # if any per-chromosome repeat file is missing, (re)write all of them
    splitrmsk = any(not os.path.exists(_rmsk_chr_path(c)) for c in chroms)
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args, bfiles, ofiles = [], [], []
    for chrom in chroms:
        bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        rpath = _rmsk_chr_path(chrom)
        opath = prefix + 'out.{0}.bed'.format(chrom)

        targets = beddf[beddf['chr'] == chrom]
        if expand > 0:
            targets = targets.copy()
            targets['st'] = targets['st'] - expand
            targets['ed'] = targets['ed'] + expand
            targets.loc[targets['st'] < 0, 'st'] = 0
        UT.write_pandas(targets[['chr', 'st', 'ed', idcol]], bpath, '')
        bfiles.append(bpath)

        if splitrmsk:
            repeats = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(
                repeats[['chr', 'st', 'ed', 'name', 'strand']], rpath, '')

        ofiles.append(opath)
        args.append([bpath, rpath, opath])

    UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

    # gather outputs; ids are compared as strings
    cols = ['name', 'repnames']
    gathered = PD.concat(
        [UT.read_pandas(f, names=cols) for f in ofiles], ignore_index=True)
    gathered['name'] = gathered['name'].astype(str)
    id2names = UT.df2dict(gathered, 'name', 'repnames')
    beddf[col] = [id2names[str(rowid)] for rowid in beddf[idcol]]

    # remove temp files only when the prefix was generated here
    if cleanup:
        for tmp in bfiles + ofiles:
            os.unlink(tmp)

    return beddf