def as2exsj(dstpre, np=7):
    """Build the .ex/.sj/.ci files from the exdf/sjdf/sedf tables under *dstpre*.

    Args:
        dstpre (str): path prefix; '<dstpre>.exdf.txt.gz', '.sjdf.txt.gz',
            '.sedf.txt.gz' and '.paths.txt.gz' are read with the A2 column names.
        np (int): number of processes handed to GP.find_genes4.

    Generates:
        '<dstpre>.ex.txt.gz', '<dstpre>.sj.txt.gz' and '<dstpre>.ci.txt.gz'.

    Returns:
        tuple (sj, ex) of dataframes.
    """
    exon_df = UT.read_pandas(dstpre + '.exdf.txt.gz', names=A2.EXDFCOLS)
    sj = UT.read_pandas(dstpre + '.sjdf.txt.gz', names=A2.SJDFCOLS)
    single_df = UT.read_pandas(dstpre + '.sedf.txt.gz', names=A2.EXDFCOLS)
    # The paths table is read but its content is not used below.
    UT.read_pandas(dstpre + '.paths.txt.gz', names=A2.PATHCOLS)

    # shift junction start by one
    sj['st'] = sj['st'] + 1

    # stack the exdf and sedf tables into a single exon table
    wanted = A2.EXDFCOLS
    ex = PD.concat([exon_df[wanted], single_df[wanted]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components), sets '_gidx';
    # a unique file prefix is needed for parallel processing
    prefix = os.path.abspath(dstpre + str(uuid.uuid4()) + '_')
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)

    # rows whose kind is '3' or '5' get the same value as category
    for end in ('3', '5'):
        ex.loc[ex['kind'] == end, 'cat'] = end

    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return sj, ex
def __init__(self, sjexpre, code, chromdir, rmskviz, outdir, **kw):
    """Load the exon/junction model and make sure derived tables exist.

    Args:
        sjexpre (str): path prefix; '<sjexpre>.ex.txt.gz' and
            '<sjexpre>.sj.txt.gz' are read.
        code (str): joined to *outdir* to form the output prefix.
        chromdir: passed to FA.GenomeFASTAChroms.
        rmskviz: stored on the instance as-is.
        outdir (str): output directory.
        **kw: overrides applied on top of a copy of RMSKPARAMS.

    Side effects:
        If the exon table lacks 'glen' or 'tlen', they are computed (creating
        '<sjexpre>.ci.txt.gz' when missing) and the exon file is rewritten.
        '<sjexpre>.unionex.txt.gz' is created when it does not exist.
    """
    self.sjexpre = sjexpre
    self.prefix = prefix = os.path.join(outdir, code)
    self.fnobj = FN.FileNamesBase(prefix)
    self.chromdir = chromdir
    self.rmskviz = rmskviz
    self.gfc = FA.GenomeFASTAChroms(chromdir)

    # copy so that per-instance overrides do not leak into the shared defaults
    self.params = RMSKPARAMS.copy()
    self.params.update(kw)

    self.ex = UT.read_pandas(sjexpre + '.ex.txt.gz')
    self.sj = UT.read_pandas(sjexpre + '.sj.txt.gz')

    if 'glen' not in self.ex or 'tlen' not in self.ex:
        cipath = sjexpre + '.ci.txt.gz'
        if not os.path.exists(cipath):
            # fixed: this used the undefined name `ex` (NameError)
            ci = UT.chopintervals(self.ex, cipath)
        else:
            ci = UT.read_ci(cipath)
        UT.set_glen_tlen(self.ex, ci, gidx='_gidx')
        UT.write_pandas(self.ex, sjexpre + '.ex.txt.gz', 'h')

    uexpath = sjexpre + '.unionex.txt.gz'
    if os.path.exists(uexpath):
        self.uex = UT.read_pandas(uexpath)
    else:
        LOG.info('making union exons...saving to {0}'.format(uexpath))
        self.uex = UT.make_unionex(self.ex, '_gidx')
        UT.write_pandas(self.uex, uexpath, 'h')
def model(self, which, code2=None):
    """Returns model dataframe (junction/exon/chopped intervals).

    The result is cached as an attribute named *which*, so later calls return
    the cached object (regardless of *code2*).

    Args:
        which: one of 'sj','ex', 'ci'
        code2: forwarded to self.modelpath to locate the file

    Returns:
        the requested dataframe

    Raises:
        RuntimeError: if the file does not exist; for 'ci', if neither the ci
            file nor the ex file it would be built from exists.
    """
    if hasattr(self, which):  # cached
        return getattr(self, which)

    path = self.modelpath(which, code2)
    if os.path.exists(path):  # file exists
        if which == 'ci':
            df = GGB.read_bed(path)
        else:
            df = UT.read_pandas(path)
        setattr(self, which, df)
        return df

    # file does not exist; only ci can be regenerated (from ex)
    if which != 'ci':
        raise RuntimeError('file {0} does not exist'.format(path))

    expath = self.modelpath('ex', code2)
    if not os.path.exists(expath):
        raise RuntimeError('file {0} does not exist'.format(expath))
    # fixed: forward code2 so the ex that is loaded is the one just checked,
    # and return the new ci instead of falling through and returning None
    self.ci = UT.chopintervals(self.model('ex', code2), path)
    return self.ci
def trim_ex(expath, dstpath, dstcipath, length=1000, gidfld='_gidx', np=7):
    """Generate trimmed version of genes for calculating coverage to avoid length bias.

    Args:
        expath (str): path exon tsv
        dstpath (str): path to trimmed exon
        dstcipath (str): path to ci (chopped interval)
        length (pos int): length to trim from 3' end in base pair (default 1000 bp)
        gidfld (str): column name for gene id (default _gidx)
        np (pos int): number of CPU to use

    Generates:
        Two files (dstpath, dstcipath).

    Returns:
        a dataframe containing trimmed exons
    """
    ex = UT.read_pandas(expath)
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']

    if np == 1:
        recs = trim_ex_worker((ex, length, gidfld))
    else:
        # one task per chromosome
        chroms = sorted(ex['chr'].unique())
        data = [(ex[ex['chr'] == c], length, gidfld) for c in chroms]
        recs = []
        # Create the pool before entering `try`: if Pool() itself fails, the
        # cleanup clause must not touch an unbound name and mask the error.
        pool = multiprocessing.Pool(np)
        try:
            for chunk in pool.map(trim_ex_worker, data):
                recs.extend(chunk)
        finally:
            pool.close()

    cols = list(ex.columns.values)
    nex = PD.DataFrame(recs, columns=cols)
    nex['len'] = nex['ed'] - nex['st']
    # edge case: zero-length records are widened to 1 bp.
    # NOTE(review): 'len' is computed before this adjustment, so it stays 0 for
    # those rows -- confirm that this is intended.
    nex.loc[nex['st'] == nex['ed'], 'ed'] = nex['st'] + 1

    UT.save_tsv_nidx_whead(nex, dstpath)
    UT.chopintervals(nex, dstcipath)
    return nex
def ci(self):
    """Return the chopped-interval dataframe, building it when not cached.

    If the file at self.cipath() exists it is read with the columns
    chr/st/ed/name/id. Otherwise the exons from self.sjex() are chopped and
    the result is written to that path.

    Raises:
        RuntimeError: if there is no cached ci and self.gtfpath does not exist.
    """
    target = self.cipath()
    if not os.path.exists(target):
        if not os.path.exists(self.gtfpath):
            raise RuntimeError('file {0} does not exist'.format(self.gtfpath))
        LOG.info('making ci..')
        _, exons = self.sjex()
        return UT.chopintervals(exons, target)

    LOG.info('reading ci({0}) from cache...'.format(target))
    return UT.read_pandas(target, names=['chr', 'st', 'ed', 'name', 'id'])
def make_sjex(gtfpath, dstpre, np=12):
    """Create the sj/ex/ci files from a GTF.

    Args:
        gtfpath: path to a GTF file, or an already loaded GTF dataframe
            (anything for which UT.isstring is false is used as-is).
        dstpre (str): output prefix; '.ex.txt.gz', '.sj.txt.gz' and
            '.ci.txt.gz' are appended.
        np (int): number of processes passed to gtf2exonsj.

    Returns:
        dict with keys 'sj' and 'ex' holding the two dataframes.
    """
    gtf = GGB.read_gtf(gtfpath) if UT.isstring(gtfpath) else gtfpath
    sj, ex = gtf2exonsj(gtf, np=np)

    # report the kind/cat breakdown before 'cat' is overridden below
    print(ex.groupby(['kind', 'cat']).size())
    for end in ('5', '3'):
        ex.loc[ex['kind'] == end, 'cat'] = end

    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    # make ci
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return dict(sj=sj, ex=ex)
def as3exsj(dstpre, minelen=150, np=7):
    """Build the .ex/.sj/.ci files from the exdf/sjdf/sedf tables under *dstpre*.

    Genes whose summed exon length is below *minelen* are dropped.

    Args:
        dstpre (str): path prefix; '<dstpre>.exdf.txt.gz', '.sjdf.txt.gz',
            '.sedf.txt.gz' and '.paths.txt.gz' are read with the A3 column names.
        minelen (int): minimum total exon length (sum of ed-st per '_gidx')
            a gene needs in order to be kept.
        np (int): number of processes handed to GP.find_genes3.

    Generates:
        '<dstpre>.ex.txt.gz', '<dstpre>.sj.txt.gz' and '<dstpre>.ci.txt.gz'.

    Returns:
        tuple (sj2, ex2) of the filtered dataframes.
    """
    ex = UT.read_pandas(dstpre + '.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre + '.sedf.txt.gz', names=A3.EXDFCOLS)
    # The paths table is not used below; the read is kept so that a missing
    # file still fails here as it did before.
    UT.read_pandas(dstpre + '.paths.txt.gz', names=A3.PATHCOLS)

    sj['st'] = sj['st'] + 1
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components), sets '_gidx';
    # don't use exon overlap as connection
    GP.find_genes3(sj, ex, np=np, override=False)

    ex.loc[ex['kind'] == '3', 'cat'] = '3'
    ex.loc[ex['kind'] == '5', 'cat'] = '5'

    # remove genes whose total exon length is smaller than minelen
    ex['len'] = ex['ed'] - ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz < minelen].index.values
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # fixed: the "after" count used to be len(ex) - len(rgidx), i.e. exon
    # count minus gene count; report the real number of remaining exons
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))

    # write
    UT.write_pandas(ex2, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre + '.ci.txt.gz')
    return sj2, ex2
def make_sjexci(path, np):
    """Create the sj/ex/ci files next to an annotation file.

    The file type is taken from the extension (an optional trailing '.gz' is
    ignored): '.gtf', '.bed' (bed12) or '.txt' (UCSC knownGene / refGene
    download, recognised by 'knownGene' or 'refGene' appearing in *path*).

    Args:
        path (str): annotation file path.
        np (int): number of processes passed to the converter.

    Generates:
        '<prefix>.sj.txt.gz', '<prefix>.ex.txt.gz', '<prefix>.ci.txt.gz',
        where <prefix> is *path* without the extension (and without '.gz').

    Returns:
        tuple (sj, ex) of dataframes.

    Raises:
        ValueError: unknown extension, missing file, or a '.txt' file that is
            neither knownGene nor refGene.
    """
    bpath = path[:-3] if path[-3:] == '.gz' else path
    ext = bpath[-4:]
    if ext not in ['.gtf', '.bed', '.txt']:
        raise ValueError('unknown filetype {0}, should be either .gtf,.bed (bed12),.txt (ucsc knownGene)'.format(ext))
    pathprefix = bpath[:-4]
    if not os.path.exists(path):
        raise ValueError('{0} file does not exists'.format(ext))

    if ext == '.gtf':
        df = GGB.read_gtf(path).sort_values(['chr', ])
        sj, ex = gtf2exonsj(df, np=np)
    elif ext == '.bed':
        df = GGB.read_bed(path)
        sj, ex = bed2exonsj(df, np=np)
    else:  # '.txt': UCSC download
        if 'knownGene' in path:
            df = GGB.read_ucsc_knownGene(path)
        elif 'refGene' in path:
            df = GGB.read_ucsc_refGene(path)
        else:
            # fixed: this case used to fall through with sj/ex unbound and
            # fail later with an unrelated UnboundLocalError
            raise ValueError(
                'unknown UCSC table {0}, file name should contain knownGene or refGene'.format(path))
        sj, ex = kg2exonsj(df, np=np)  # refGene is handled the same as knownGene

    # save
    LOG.info('saving sj to {0}'.format(pathprefix + '.sj.txt.gz'))
    UT.write_pandas(sj, pathprefix + '.sj.txt.gz', 'h')
    LOG.info('saving ex to {0}'.format(pathprefix + '.ex.txt.gz'))
    UT.write_pandas(ex, pathprefix + '.ex.txt.gz', 'h')
    # make ci
    UT.chopintervals(ex, pathprefix + '.ci.txt.gz')
    return sj, ex
def calc_cov_ovl_mp(srcname, bwname, dstname, np=1, covciname=None,
                    ciname=None, colname='cov', override=False):
    """Calculate per-exon coverage (from BigWig) via chopped intervals.

    Coverage is first obtained for chopped intervals (ci); an exon's coverage
    is then sum(ci cov * ci len) / sum(ci len) over the ci's listing that exon.
    The exon dataframe with the new column is saved to *dstname*.

    Args:
        srcname: path to exons tsv, or an exon dataframe
        bwname: path to bigwig
        dstname: path for result
        np: number of processors
        covciname: path to covci (coverage for chopped interval dataframe);
            defaults to srcname[:-7] + '.covci.txt.gz' (srcname must be a path)
        ciname: path to ci (chopped interval dataframe);
            defaults to srcname[:-7] + '.ci.txt.gz' (srcname must be a path)
        colname: name for column which will contain calculated coverages
        override: recompute ci and covci even when the files exist

    Returns:
        exon dataframe with column *colname* added
    """
    exons = UT.read_pandas(srcname) if UT.isstring(srcname) else srcname

    # cache file names are derived from the source path when not given
    if covciname is None:
        assert (UT.isstring(srcname))
        covciname = srcname[:-7] + '.covci.txt.gz'
    if ciname is None:
        assert (UT.isstring(srcname))
        ciname = srcname[:-7] + '.ci.txt.gz'

    if not override and os.path.exists(covciname):
        LOG.debug('loading cached covci...')
        covci = UT.read_pandas(covciname)
    else:
        LOG.debug('calculating covci...')
        started = time.time()
        if not override and os.path.exists(ciname):
            ci = UT.read_pandas(ciname, names=['chr', 'st', 'ed', 'name', 'id'])
            ci['name'] = ci['name'].astype(str)
        else:
            ci = UT.chopintervals(exons, ciname)
        covci = calc_cov_mp(ci, bwname, covciname, np)
        LOG.debug(' time: {0:.3f}s'.format(time.time() - started))
    # 'name' is split on ',' below, so it has to be a string
    covci['name'] = covci['name'].astype(str)

    # covci lists exon ids per chopped interval; invert that to
    # exon id => chopped interval ids
    LOG.debug('calculating exon cov...')
    if 'id' not in covci.columns:
        covci['id'] = covci['sc1']
    e2c = {}
    for cid, joined in covci[['id', 'name']].values:
        for token in joined.split(','):
            key = int(token)
            if key not in e2c:
                e2c[key] = []
            e2c[key].append(cid)

    covci['len'] = covci['ed'] - covci['st']
    covci['val'] = covci['cov'] * covci['len']

    # unique (chopped interval, exon) pairs
    pairs = list({(cid, eid) for eid in exons['_id'] for cid in e2c[eid]})
    pairdf = PD.DataFrame(pairs, columns=['cid', 'eid'])
    c2len = dict(covci[['id', 'len']].values)
    c2val = dict(covci[['id', 'val']].values)
    pairdf['val'] = [c2val[c] for c in pairdf['cid']]
    pairdf['len'] = [c2len[c] for c in pairdf['cid']]

    # exon cov = sum(cicov*cilen)/totlen
    per_exon = pairdf.groupby('eid')[['val', 'len']].sum().reset_index()
    per_exon['cov'] = per_exon['val'] / per_exon['len']
    e2cov = dict(per_exon[['eid', 'cov']].values)
    exons[colname] = [e2cov[e] for e in exons['_id']]

    UT.save_tsv_nidx_whead(exons, dstname)
    return exons
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    gcov of a gene is sum(cov*len) / sum(len) over the unique chopped
    intervals (ci) belonging to the gene.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (file names are appended directly)
        override: passed to UT.notstale when checking the covci cache
        np: number of processes passed to calc_cov_mp

    Outputs:
        1. dstprefix+'covci.txt.gz'
        2. dstprefix+'gcov.txt.gz' : DataFrame(col:_gidx,gcov)

    Returns:
        DataFrame with columns _gidx and gcov. The statistics restricted to
        ci's with cov > 0 (gcov2, len2, cids, st2, ed2, glen2) are computed
        but are not part of the saved/returned columns.
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    if not UT.notstale([expath, cipath], covcipath, override):
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath, names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)
    else:
        cc = UT.read_pandas(covcipath)

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'eid' not in cc.columns:
        # 'name' holds comma-joined exon ids
        names = cc['name'].astype(str)
        cc['eid'] = names.apply(lambda s: [int(tok) for tok in s.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']

    # one row per (ci, exon), then attach the gene of each exon
    flat = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    exon2gene = dict(UT.izipcols(ex, ['_id', '_gidx']))
    flat['_gidx'] = [exon2gene[e] for e in flat['eid']]

    # gcov2: only ci's with val > 0, unique per (gene, ci)
    covered = flat[flat['val'] > 0]
    covered = covered.groupby(['_gidx', 'id']).first().reset_index()
    covered_by_gene = covered.groupby('_gidx')
    df2 = covered_by_gene[['len', 'val']].sum()
    df2['gcov2'] = df2['val'] / df2['len']
    df2['cids'] = covered_by_gene['id'].apply(
        lambda ids: ','.join([str(i) for i in ids]))
    df2['gst2'] = covered_by_gene['st'].min()
    df2['ged2'] = covered_by_gene['ed'].max()
    df2['glen2'] = df2['ged2'] - df2['gst2']
    df2 = df2.reset_index()

    # normal gcov: unique combination of (gene, ci)
    unique_ci = flat.groupby(['_gidx', 'id']).first().reset_index()
    by_gene = unique_ci.groupby('_gidx')
    df = by_gene[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']
    df['st'] = by_gene['st'].min()
    df['ed'] = by_gene['ed'].max()
    df['glen'] = df['ed'] - df['st']
    df = df.reset_index()
    gene2chr = dict(UT.izipcols(ex, ['_gidx', 'chr']))
    df['chr'] = [gene2chr[g] for g in df['_gidx']]

    # copy df2 columns over to df, filling genes absent from df2 with a default
    transfers = (
        ('gcov2', 'gcov2', 0),
        ('len', 'len2', 0),
        ('cids', 'cids', ''),
        ('gst2', 'st2', -1),
        ('ged2', 'ed2', -1),
        ('glen2', 'glen2', 0),
    )
    for src, tgt, default in transfers:
        lookup = dict(UT.izipcols(df2, ['_gidx', src]))
        df[tgt] = [lookup.get(g, default) for g in df['_gidx']]

    df = df[['_gidx', 'gcov']]
    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
def calc_ecov(expath, cipath, bwpath, dstprefix, blocksize=100,
              override=False, np=4):
    """Calculate exon coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (file names are appended directly)
        blocksize: passed to calc_ecov_mp
        override: passed to UT.notstale when checking the covci cache
        np: number of processes

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci
        2. dstprefix+'ecov.txt.gz' : DataFrame(cols: eid, pid, ecov)

    Returns:
        DataFrame with columns eid, pid, ecov
    """
    covcipath = dstprefix + 'covci.txt.gz'
    ecovpath = dstprefix + 'ecov.txt.gz'
    ex = UT.read_pandas(expath)

    if not UT.notstale([expath, cipath], covcipath, override):
        # an existing, up to date ci is reused rather than overridden
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath, names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)
    else:
        cc = UT.read_pandas(covcipath)

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'pid' not in cc.columns:
        # 'name' holds comma-joined integer ids
        names = cc['name'].astype(str)
        cc['pid'] = names.apply(lambda s: [int(tok) for tok in s.split(',')])
    # NOTE(review): presumably calc_ecov_mp reads 'name1' -- confirm
    cc['name1'] = cc['pid']

    df = ex[['_id', '_pid']].rename(columns={'_id': 'eid', '_pid': 'pid'})
    pid2cov = calc_ecov_mp(cc, None, np, blocksize)  # pid => cov
    df['ecov'] = [pid2cov[p] for p in df['pid']]

    UT.save_tsv_nidx_whead(df[['eid', 'pid', 'ecov']], ecovpath)
    return df
def filter(self, **kw):
    """Filter genes by repeat overlap and write kept/removed sets.

    A gene (row of self.ugb) is removed when all of these hold:
        rep%  >= th_bp_ovl   (base pair repeat overlap)
        rviz% >  th_ex_ovl   (exon level repeat overlap)
        #junc is not null
        #uexons < th_uexon   (number of union exons)
    Thresholds come from self.params; keyword arguments are merged into
    self.params first (and stay there afterwards).
    NOTE(review): earlier notes described the rviz% test as ">=" and cited
    defaults of 50/50/4; the code uses ">" and the defaults are not visible
    here -- confirm.

    Kept tables are stored as ex2/sj2/uex2/ugb2/gbed2 and removed ones as
    ex3/sj3/uex3/ugb3/gbed3; both sets are written through self.fnobj with
    category 'output' (removed files carry a 'removed.' prefix).
    """
    stats = self.ugb
    pr = self.params
    fn = self.fnobj
    pr.update(kw)

    order = ['chr', 'st', 'ed']

    def _pick(frame, gene_ids, keep):
        # rows whose gene is (keep=True) / is not (keep=False) in gene_ids
        member = frame['_gidx'].isin(gene_ids)
        return frame[member if keep else ~member].sort_values(order)

    repeat_rich = (stats['rep%'] >= pr['th_bp_ovl']) & \
                  (stats['rviz%'] > pr['th_ex_ovl'])
    few_exons = (stats['#junc'].notnull()) & \
                (stats['#uexons'] < pr['th_uexon'])
    drop = repeat_rich & few_exons
    self.ugb2 = ugb2 = stats[~drop]  # filtered
    self.ugb3 = ugb3 = stats[drop]
    gids = ugb2.index.values

    gcovfld = 'gcov_' + pr['datacode'] if pr['datacode'] else 'gcov'

    # filter ex,sj,uex
    self.ex2 = ex2 = _pick(self.ex, gids, True)
    self.sj2 = sj2 = _pick(self.sj, gids, True)
    self.uex2 = uex2 = _pick(self.uex, gids, True)
    self.gbed2 = gbed2 = GGB.unionex2bed12(uex2, name=pr['gname'],
                                           sc2=gcovfld, sc1='tlen')
    gbed2['sc2'] = gbed2['sc2'].astype(int)

    # write out filtered ex,sj,ci,unionex,gbed
    UT.write_pandas(ex2, fn.txtname('ex', category='output'), 'h')
    UT.write_pandas(sj2, fn.txtname('sj', category='output'), 'h')
    UT.chopintervals(ex2, fn.txtname('ci', category='output'))
    GGB.write_bed(ex2, fn.bedname('ex', category='output'))
    GGB.write_bed(sj2, fn.bedname('sj', category='output'))
    UT.write_pandas(uex2, fn.txtname('unionex', category='output'), 'h')
    UT.write_pandas(ugb2, fn.txtname('genes.stats', category='output'), 'h')
    UT.write_pandas(gbed2, fn.bedname('genes', category='output'), '')  # BED12

    # also write filtered out genes
    self.ex3 = ex3 = _pick(self.ex, gids, False)
    self.sj3 = sj3 = _pick(self.sj, gids, False)
    self.uex3 = uex3 = _pick(self.uex, gids, False)
    self.gbed3 = gbed3 = GGB.unionex2bed12(uex3, name=pr['gname'],
                                           sc2=gcovfld, sc1='tlen')
    gbed3['sc2'] = gbed3['sc2'].astype(int)

    UT.write_pandas(ex3, fn.txtname('removed.ex', category='output'), 'h')
    UT.write_pandas(sj3, fn.txtname('removed.sj', category='output'), 'h')
    UT.chopintervals(ex3, fn.txtname('removed.ci', category='output'))
    UT.write_pandas(uex3, fn.txtname('removed.unionex', category='output'), 'h')
    UT.write_pandas(ugb3, fn.txtname('removed.genes.stats', category='output'), 'h')
    UT.write_pandas(gbed3, fn.bedname('removed.genes', category='output'), '')  # BED12