def make_sjex(gtfpath, dstpre, np=12):
    """Derive junction (sj) and exon (ex) tables from a GTF and save them.

    Args:
        gtfpath: either a path string (read with ``GGB.read_gtf``) or an
            already-loaded GTF object, which is used as-is.
        dstpre: prefix for the outputs; ``.ex.txt.gz``, ``.sj.txt.gz`` and
            ``.ci.txt.gz`` are appended to it.
        np: forwarded to ``gtf2exonsj`` as its ``np`` keyword.

    Returns:
        dict with keys ``'sj'`` and ``'ex'`` holding the two tables.
    """
    gtf = GGB.read_gtf(gtfpath) if UT.isstring(gtfpath) else gtfpath
    sj, ex = gtf2exonsj(gtf, np=np)

    # Report the kind/cat breakdown before 'cat' is overwritten below.
    print(ex.groupby(['kind', 'cat']).size())

    # Rows whose kind is '5' or '3' get that same value as their cat.
    for end in ('5', '3'):
        ex.loc[ex['kind'] == end, 'cat'] = end

    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')

    # Make ci: the .ci.txt.gz path is handed to UT.chopintervals
    # (presumably its output file -- confirm against UT).
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')

    return {'sj': sj, 'ex': ex}
def make_sjex(self, np=4):
    """Convert this object's GTF file into sj/ex tables and write them out.

    Output locations come from ``self.sjexpaths()``; the input is
    ``self.gtfpath``.

    Args:
        np: forwarded to ``gtf2exonsj`` as its ``np`` keyword.

    Returns:
        tuple ``(sj, ex)`` as produced by ``gtf2exonsj``.

    Raises:
        RuntimeError: if ``self.gtfpath`` does not exist on disk.
    """
    sj_out, ex_out = self.sjexpaths()

    source = self.gtfpath
    if not os.path.exists(source):
        raise RuntimeError('file {0} does not exist'.format(source))

    LOG.info('making sj,ex...')
    # NOTE: the original author's comment put this read at roughly 1.5 min.
    # A step that extracted a 'cov' attribute column used to follow it and
    # is currently disabled.
    gtf = GGB.read_gtf(source)

    # A fresh uuid keeps the graph prefix distinct on every call.
    graph_prefix = self.fname('graphpre{0}_'.format(uuid.uuid4()))
    sj, ex = gtf2exonsj(gtf, np=np, graphpre=graph_prefix)

    # Save both tables.
    UT.write_pandas(sj, sj_out, 'h')
    UT.write_pandas(ex, ex_out, 'h')
    return sj, ex
def make_sjexci(path, np):
    """Build sj/ex tables from an annotation file and save them beside it.

    The file type is taken from the extension once an optional trailing
    ``.gz`` is stripped: ``.gtf``, ``.bed`` (bed12) or ``.txt`` (a UCSC
    download whose path contains ``knownGene`` or ``refGene``). Outputs go
    to ``<prefix>.sj.txt.gz``, ``<prefix>.ex.txt.gz`` and
    ``<prefix>.ci.txt.gz``, where ``<prefix>`` is ``path`` without the
    extension(s).

    Args:
        path: annotation file path.
        np: forwarded to the ``*2exonsj`` converter as its ``np`` keyword.

    Returns:
        tuple ``(sj, ex)``.

    Raises:
        ValueError: if the extension is not supported, the file does not
            exist, or a ``.txt`` path is neither knownGene nor refGene.
    """
    bpath = path[:-3] if path[-3:] == '.gz' else path
    ext = bpath[-4:]
    if ext not in ['.gtf', '.bed', '.txt']:
        raise ValueError('unknown filetype {0}, should be either .gtf,.bed (bed12),.txt (ucsc knownGene)'.format(ext))
    pathprefix = bpath[:-4]
    if not os.path.exists(path):
        # Name the offending path; the extension alone does not identify it.
        raise ValueError('{0} file {1} does not exist'.format(ext, path))

    if ext == '.gtf':
        df = GGB.read_gtf(path).sort_values(['chr', ])
        sj, ex = gtf2exonsj(df, np=np)
    elif ext == '.bed':
        df = GGB.read_bed(path)
        sj, ex = bed2exonsj(df, np=np)
    else:
        # '.txt': UCSC download; the flavor is recognized from the path.
        if 'knownGene' in path:
            df = GGB.read_ucsc_knownGene(path)
        elif 'refGene' in path:
            df = GGB.read_ucsc_refGene(path)
        else:
            # Fail here instead of reaching the save step with sj/ex unbound.
            raise ValueError(
                'unknown .txt file {0}, path should contain knownGene or refGene'.format(path))
        # refGene goes through the same converter as knownGene.
        sj, ex = kg2exonsj(df, np=np)

    # Save.
    LOG.info('saving sj to {0}'.format(pathprefix + '.sj.txt.gz'))
    UT.write_pandas(sj, pathprefix + '.sj.txt.gz', 'h')
    LOG.info('saving ex to {0}'.format(pathprefix + '.ex.txt.gz'))
    UT.write_pandas(ex, pathprefix + '.ex.txt.gz', 'h')

    # Make ci: the .ci.txt.gz path is handed to UT.chopintervals
    # (presumably its output file -- confirm against UT).
    UT.chopintervals(ex, pathprefix + '.ci.txt.gz')
    return sj, ex
def test_read_gtf(g4gtfpath):
    """GGB.read_gtf on the g4 GTF path must yield exactly 112665 records."""
    expected_rows = 112665
    records = GGB.read_gtf(g4gtfpath)
    assert len(records) == expected_rows
def g4Xkr4gtf(datadir):
    """Read assemblies/gencode.vM4.Xkr4.gtf.gz under datadir with GGB.read_gtf."""
    gtf_file = os.path.join(datadir, 'assemblies', 'gencode.vM4.Xkr4.gtf.gz')
    xkr4 = GGB.read_gtf(gtf_file)
    return xkr4
def g4gtf(g4gtfpath):
    """Load the GTF at g4gtfpath via GGB.read_gtf and return the result."""
    # An earlier version parsed the gzip file directly with PD.read_table and
    # the explicit column names
    # ['chr','src','typ','st','ed','sc1','strand','sc2','extra'];
    # the shared reader is used here instead.
    parsed = GGB.read_gtf(g4gtfpath)
    return parsed