예제 #1
0
def make_sjex(gtfpath, dstpre, np=12):
    """Build sj and ex tables from a GTF and write them under the prefix *dstpre*.

    Args:
        gtfpath: either a path string (read with ``GGB.read_gtf``) or an
            already-loaded GTF table, which is used as is.
        dstpre: output prefix; ``.ex.txt.gz``, ``.sj.txt.gz`` and
            ``.ci.txt.gz`` are appended to it.
        np: forwarded unchanged to ``gtf2exonsj``.

    Returns:
        dict with keys ``'sj'`` and ``'ex'`` holding the two tables.
    """
    gtf = GGB.read_gtf(gtfpath) if UT.isstring(gtfpath) else gtfpath
    sj, ex = gtf2exonsj(gtf, np=np)
    # Report the kind/cat breakdown before 'cat' is overwritten below.
    print(ex.groupby(['kind','cat']).size())
    # Rows whose kind is '5' or '3' get the same value as their 'cat'.
    for end in ('5', '3'):
        ex.loc[ex['kind']==end,'cat'] = end
    UT.write_pandas(ex, dstpre+'.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre+'.sj.txt.gz', 'h')
    # make ci (the returned value is not needed here)
    UT.chopintervals(ex, dstpre+'.ci.txt.gz')
    return dict(sj=sj, ex=ex)
예제 #2
0
 def make_sjex(self, np=4):
     """Convert the GTF at ``self.gtfpath`` into sj/ex tables and save them.

     Output locations come from ``self.sjexpaths()``. ``np`` is forwarded
     unchanged to ``gtf2exonsj``.

     Returns:
         tuple ``(sj, ex)``.

     Raises:
         RuntimeError: if ``self.gtfpath`` does not exist.
     """
     sj_dst, ex_dst = self.sjexpaths()
     gtf_exists = os.path.exists(self.gtfpath)
     if not gtf_exists:
         msg = 'file {0} does not exist'.format(self.gtfpath)
         raise RuntimeError(msg)
     LOG.info('making sj,ex...')
     # The original author noted this read takes roughly 1.5 min.
     gtf = GGB.read_gtf(self.gtfpath)
     # NOTE: extracting a 'cov' attribute column from gtf['extra'] used to
     # happen here and is currently disabled.
     # A fresh uuid keeps the graph prefix unique for every call.
     graph_prefix = self.fname('graphpre{0}_'.format(uuid.uuid4()))
     sj, ex = gtf2exonsj(gtf, np=np, graphpre=graph_prefix)
     # save: sj first, then ex
     for table, dst in ((sj, sj_dst), (ex, ex_dst)):
         UT.write_pandas(table, dst, 'h')
     return sj, ex
예제 #3
0
def make_sjexci(path, np):
    """Create sj, ex and ci files next to an annotation file.

    The annotation format is chosen from the file extension (an optional
    trailing ``.gz`` is ignored for this purpose):

    * ``.gtf`` -- read with ``GGB.read_gtf`` and converted by ``gtf2exonsj``
    * ``.bed`` -- read with ``GGB.read_bed`` and converted by ``bed2exonsj``
    * ``.txt`` -- UCSC download; the path must contain ``knownGene`` or
      ``refGene`` to select the reader, both converted by ``kg2exonsj``

    Outputs are written to ``<prefix>.sj.txt.gz`` and ``<prefix>.ex.txt.gz``
    where ``<prefix>`` is *path* without its extension(s);
    ``UT.chopintervals`` is then called with ``<prefix>.ci.txt.gz``.

    Args:
        path: path to the annotation file.
        np: forwarded unchanged to the converter function.

    Returns:
        tuple ``(sj, ex)``.

    Raises:
        ValueError: if the extension is not supported, the file does not
            exist, or a ``.txt`` path names neither knownGene nor refGene.
    """
    # Strip an optional .gz so the real extension can be inspected.
    bpath = path[:-3] if path.endswith('.gz') else path
    ext = bpath[-4:]
    if ext not in ['.gtf', '.bed', '.txt']:
        raise ValueError('unknown filetype {0}, should be either .gtf,.bed (bed12),.txt (ucsc knownGene)'.format(ext))
    pathprefix = bpath[:-4]

    if not os.path.exists(path):
        # Report the offending path rather than only its extension.
        raise ValueError('{0} file does not exist'.format(path))

    if ext=='.gtf':
        df = GGB.read_gtf(path).sort_values(['chr',])
        sj, ex = gtf2exonsj(df, np=np)
    elif ext=='.bed':
        df = GGB.read_bed(path)
        sj, ex = bed2exonsj(df, np=np)
    else: # '.txt': UCSC download
        if 'knownGene' in path:
            df = GGB.read_ucsc_knownGene(path)
        elif 'refGene' in path:
            df = GGB.read_ucsc_refGene(path)
        else:
            # Without this branch sj/ex would stay unbound and the save
            # step below would fail with an unrelated UnboundLocalError.
            raise ValueError('unknown .txt annotation {0}, path should contain knownGene or refGene'.format(path))
        sj, ex = kg2exonsj(df, np=np) # refGene is handled the same as knownGene

    # save
    LOG.info('saving sj to {0}'.format(pathprefix+'.sj.txt.gz'))
    UT.write_pandas(sj, pathprefix+'.sj.txt.gz', 'h')
    LOG.info('saving ex to {0}'.format(pathprefix+'.ex.txt.gz'))
    UT.write_pandas(ex, pathprefix+'.ex.txt.gz', 'h')

    # make ci (return value unused; presumably written to the given path -- TODO confirm)
    UT.chopintervals(ex, pathprefix+'.ci.txt.gz')
    return sj, ex
예제 #4
0
def test_read_gtf(g4gtfpath):
    """The GTF at ``g4gtfpath`` must load into exactly 112665 records."""
    n_records = len(GGB.read_gtf(g4gtfpath))
    assert n_records == 112665
예제 #5
0
파일: conftest.py 프로젝트: king1212/jGEM
def g4Xkr4gtf(datadir):
    """Return the Gencode vM4 Xkr4 GTF loaded from ``<datadir>/assemblies``."""
    return GGB.read_gtf(
        os.path.join(datadir, 'assemblies', 'gencode.vM4.Xkr4.gtf.gz')
    )
예제 #6
0
파일: conftest.py 프로젝트: king1212/jGEM
def g4gtf(g4gtfpath):
    """Return the GTF at ``g4gtfpath`` as loaded by ``GGB.read_gtf``.

    An earlier version read the gzip-compressed file directly with
    ``PD.read_table`` using the explicit column names
    chr, src, typ, st, ed, sc1, strand, sc2, extra.
    """
    gtf = GGB.read_gtf(g4gtfpath)
    return gtf