Пример #1
0
def test_GenomeFASTAChroms(datadir):
    """Check GenomeFASTAChroms on the 'FASTA' and 'FASTA.gz' fixture directories.

    Both directories are expected to expose chr1 and chr2 and to return the
    same sub-sequences; only the reported file extension differs.
    """
    expected_chroms = {'chr1', 'chr2'}

    # fixture directory 'FASTA' -> extension '.fa'
    plain_dir = os.path.join(datadir, 'FASTA')
    plain = FA.GenomeFASTAChroms(plain_dir, '\n')
    assert set(plain.chromosomes) == expected_chroms
    assert plain.get('chr1', 100, 110) == 'tcccagatga'
    assert plain.get('chr2', 605, 610) == 'ATGTC'
    assert plain.ext == '.fa'

    # fixture directory 'FASTA.gz' -> extension '.fa.gz'
    gz_dir = os.path.join(datadir, 'FASTA.gz')
    gzipped = FA.GenomeFASTAChroms(gz_dir, '\n')
    assert gzipped.ext == '.fa.gz'
    assert set(gzipped.chromosomes) == expected_chroms
    assert gzipped.get('chr1', 100, 110) == 'tcccagatga'
    assert gzipped.get('chr2', 605, 610) == 'ATGTC'
Пример #2
0
def test_count_repeats_mp(datadir):
    """Check RP.count_repeats_mp on union exons built from a small table.

    The inline CSV holds intervals (st, ed) with a name, a second grouping
    column (sc1), chromosome and strand.  Union exons are built once grouped
    by 'name' and once grouped by 'sc1', and the resulting '#repbp' column
    (plus 'seq' when returnseq=True) is compared with the expected values.
    """
    TESTDATA = StringIO("""st,ed,name,sc1,chr,strand
0,10,a,0,chr1,+
5,20,a,1,chr1,-
25,30,a,1,chr1,+
40,45,b,2,chr1,-
45,50,b,2,chr1,+
49,55,c,2,chr1,+
255,260,d,3,chr2,-
260,270,d,4,chr2,+
370,380,e,4,chr2,-
380,390,e,5,chr2,+
	""")
    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
    # read_csv with index_col=False keeps every column as data, as before.
    df = PD.read_csv(TESTDATA, sep=",", index_col=False)
    print(df)

    # group by gene name; the first call's return value is ignored and the
    # assertions read the columns from udf itself
    udf = UT.make_unionex(df, gidx='name')
    chromdir = os.path.join(datadir, 'FASTA')
    gfc = FA.GenomeFASTAChroms(chromdir)
    RP.count_repeats_mp(udf, gfc, returnseq=True)
    print(udf)
    assert list(udf['#repbp']) == [0, 0, 0, 3, 15, 20]
    assert udf.iloc[3]['seq'] == 'TCTgag'

    # group by sc1; here the returned frame is the one that is checked
    udf = UT.make_unionex(df, gidx='sc1')
    udf = RP.count_repeats_mp(udf, gfc, returnseq=False)
    print(udf)
    assert list(udf['#repbp']) == [0, 0, 0, 3, 5, 10, 10, 10]
Пример #3
0
    def __init__(self, sjexpre, code, chromdir, rmskviz, outdir, **kw):
        """Load exons/junctions and make sure the derived tables exist.

        Args:
            sjexpre: path prefix; '<sjexpre>.ex.txt.gz' and
                '<sjexpre>.sj.txt.gz' are read, and '<sjexpre>.ci.txt.gz' /
                '<sjexpre>.unionex.txt.gz' are read or created.
            code: joined with ``outdir`` to form the output prefix.
            chromdir: directory passed to FA.GenomeFASTAChroms.
            rmskviz: stored as-is on the instance.
            outdir: output directory.
            **kw: overrides applied on top of a copy of RMSKPARAMS.

        Side effects:
            If the exon table lacks 'glen' or 'tlen', they are computed and
            the exon file is rewritten.  If the union-exon file is missing it
            is generated and written.
        """
        self.sjexpre = sjexpre
        self.prefix = prefix = os.path.join(outdir, code)
        self.fnobj = FN.FileNamesBase(prefix)
        self.chromdir = chromdir
        self.rmskviz = rmskviz
        self.gfc = FA.GenomeFASTAChroms(chromdir)

        # copy so per-instance overrides never touch the module defaults
        self.params = RMSKPARAMS.copy()
        self.params.update(kw)

        self.ex = UT.read_pandas(sjexpre + '.ex.txt.gz')
        self.sj = UT.read_pandas(sjexpre + '.sj.txt.gz')

        if 'glen' not in self.ex or 'tlen' not in self.ex:
            cipath = sjexpre + '.ci.txt.gz'
            if not os.path.exists(cipath):
                # was `ex` (undefined name -> NameError); the exon table
                # loaded above is what has to be chopped
                ci = UT.chopintervals(self.ex, cipath)
            else:
                ci = UT.read_ci(cipath)
            UT.set_glen_tlen(self.ex, ci, gidx='_gidx')
            UT.write_pandas(self.ex, sjexpre + '.ex.txt.gz', 'h')

        uexpath = sjexpre + '.unionex.txt.gz'
        if os.path.exists(uexpath):
            self.uex = UT.read_pandas(uexpath)
        else:
            LOG.info('making union exons...saving to {0}'.format(uexpath))
            self.uex = UT.make_unionex(self.ex, '_gidx')
            UT.write_pandas(self.uex, uexpath, 'h')
Пример #4
0
    def __init__(self, j2pre, code, chromdir, rmskviz, dstpre, **kw):
        """Store the inputs, build the genome accessor and load the paths table.

        Args:
            j2pre: path prefix; '<j2pre>.paths.txt.gz' is read.
            code: not used in the visible part of this method.
            chromdir: directory passed to FA.GenomeFASTAChroms.
            rmskviz: stored as-is on the instance.
            dstpre: not used in the visible part of this method.
            **kw: overrides applied on top of a copy of RMSKPARAMS.
        """
        self.j2pre = j2pre
        # NOTE(review): `prefix` is not defined anywhere in the visible code,
        # so this line raises NameError unless a module-level `prefix` exists.
        # Presumably it should be derived from `dstpre` and/or `code`
        # (both otherwise unused here) -- confirm the intended value.
        self.fnobj = FN.FileNamesBase(prefix)
        self.chromdir = chromdir
        self.rmskviz = rmskviz
        self.gfc = FA.GenomeFASTAChroms(chromdir)

        # copy so per-instance overrides never touch the module defaults
        self.params = RMSKPARAMS.copy()
        self.params.update(kw)

        # get exons from paths
        self.paths = paths = UT.read_pandas(j2pre + '.paths.txt.gz',
                                            names=A2.PATHCOLS)
Пример #5
0
def _process_mapbed_chr(dstpre, chrom, genome, chromdir, stranded):
    """Turn one chromosome's read BED into coverage wiggles and a junction BED.

    Input:
        '<dstpre>.<chrom>.bed' -- tab separated, 7 columns:
            chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6).
            Consecutive records sharing the same mapid are treated as
            segments of one spliced read.
        '<dstpre>.dupitems.txt.gz' -- its index is used as the set of
            read names that do not count toward the '.uniq' tracks.

    Output:
        '<dstpre><kind><suf><strand>.<chrom>.wig' for kind in ('.ex', '.sj'),
        suf in ('', '.uniq') and strand in ('.p', '.n', '.u'), and
        '<dstpre>.<chrom>.sjpath.bed'.  Existing files with these names are
        deleted first.

    Args:
        dstpre: path prefix for the input and output files.
        chrom: chromosome name.
        genome: passed to UT.chromdf to look up the chromosome size.
        chromdir: directory passed to FA.GenomeFASTAChroms; the sequence is
            used to infer junction strand from the bases at each end.
        stranded: combined with the record strand as the STRANDMAP key to
            choose the exon coverage track.
    """
    bedpath = dstpre+'.{0}.bed'.format(chrom)
    # NOTE(review): fields of `rec` below are bytes (file is read in binary
    # mode); confirm this index holds bytes too, otherwise `rec[3] in dupids`
    # can never match under Python 3.
    dupids = UT.read_pandas(dstpre+'.dupitems.txt.gz', index_col=[0]).index
    gfc = FA.GenomeFASTAChroms(chromdir)
    chromsize = UT.df2dict(UT.chromdf(genome), 'chr', 'size')[chrom]

    # wigs[kind][strand][suf] -> per-base count array
    # wigpaths[kind][strand][suf] -> file the array is written to
    wigs = {}
    wigpaths = {}

    sjs = [] # path: (chr, st, ed, pcode, ucnt, strand, acnt)
    # pcode = a(apos)d(dpos) = a(ed)d(st) if strand=='+' else a(st)d(ed)
    # ucnt = unique read counts
    # acnt = multi-read adjusted all counts (=ucnt+Sum(mcnt(i)/dup(i)))
    sjbed12 = dstpre+'.{0}.sjpath.bed'.format(chrom)

    def _write_arrays():
        # dump every accumulated array to its wiggle file
        for kind in ['.ex','.sj']:
            for strand in ['.p','.n','.u']:
                for suf in ['','.uniq']:
                    cybw.array2wiggle_chr64(wigs[kind][strand][suf], chrom,  wigpaths[kind][strand][suf], 'w')

    def _write_sj(sjs):
        # Collapse records with the same path code into one BED12 line.
        # sjs = [(chr,st,ed,pathcode(name),ureads(sc1),strand,tst,ted,areads(sc2),cse),...]
        sjdf = PD.DataFrame(sjs, columns=GGB.BEDCOLS[:9]+['cse'])
        sjdfgr = sjdf.groupby('name')
        sj = sjdfgr.first()
        sj['sc1'] = sjdfgr['sc1'].sum().astype(int) # ucnt
        sj['sc2'] = sjdfgr['sc2'].sum().astype(int) # jcnt=ucnt+mcnt
        sj['st'] = sjdfgr['st'].min()
        sj['ed'] = sjdfgr['ed'].max()
        sj['#exons'] = sj['cse'].apply(len)+1
        # exon starts/ends relative to st: an exon starts where a junction
        # ends and ends where the next junction starts
        sj['ests'] = [[0]+[z[1]-st for z in cse] for st,cse in sj[['st','cse']].values]
        sj['eeds'] = [[z[0]-st for z in cse]+[ed-st] for st,ed,cse in sj[['st','ed','cse']].values]
        esizes = [[u-v for u,v in zip(x,y)] for x,y in sj[['eeds','ests']].values]
        sj['estarts'] = ['{0},'.format(','.join([str(y) for y in x])) for x in sj['ests']]
        sj['esizes'] = ['{0},'.format(','.join([str(y) for y in x])) for x in esizes]
        sj['name'] = sj.index
        with open(sjbed12, 'w') as f:
            sj[GGB.BEDCOLS].to_csv(f, index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)

    def _append_sj(cse, css, csj, chrom,ureads,areads):
        # Emit one junction-path record for a finished spliced read;
        # reads without any junction (empty cse) are ignored.
        if (len(cse)>0): # spits out splice rec
            # chr,st,ed,pathcode,ureads,strand,tst,ted,areads
            tst = cse[0][0]
            ted = cse[-1][1]
            if len(css)>0:
                # majority vote over the junctions whose strand was resolved
                strand = Counter(css).most_common()[0][0]
            else:
                strand = '.'
            name = pathcode(cse, strand)
            st = int(csj[0][1]) # first segment start
            ed = int(csj[-1][2]) # last segment end
            sjs.append((chrom,st,ed,name,ureads,strand,tst,ted,areads,cse))

    def _add_to_ex_arrays(st,ed,dup,strand):
        # segment coverage; the track depends on (record strand, stranded)
        kind='.ex'
        strand = STRANDMAP[(strand,stranded)]
        dic = wigs[kind][strand]
        dic[''][st:ed] += 1
        if not dup:
            dic['.uniq'][st:ed] += 1

    def _add_to_sj_arrays(sst,sed,dup,strand):
        # junction coverage; the track follows the sequence-inferred strand.
        # Returns (ureads, areads): (1, 1) for a non-dup read, (0, 1) otherwise.
        kind='.sj'
        s = {'+':'.p','-':'.n','.':'.u'}[strand]
        dic = wigs[kind][s]
        # add to the arrays
        dic[''][sst:sed] += 1
        if not dup:
            dic['.uniq'][sst:sed] += 1
            ureads,areads = 1,1
        else:
            ureads,areads = 0,1
        return ureads,areads

    csj = [] # current collection of spliced reads
    css = [] # current strands
    cse = [] # current (sst,sed)
    ureads,areads = 1,1 # uniq, total reads it's either 1,1 or 0,1
    pmid = None # previous map id common to spliced segments

    # The input is opened before any previous output is removed, so a missing
    # BED file fails without touching existing results; the context manager
    # guarantees the handle is closed even if processing raises.
    with open(bedpath,'rb') as fp:
        for kind in ['.ex','.sj']:
            wigs[kind] = {}
            wigpaths[kind] = {}
            for strand in ['.p','.n','.u']:
                wigs[kind][strand] = {}
                wigpaths[kind][strand] = {}
                for suf in ['','.uniq']:
                    wigpath = dstpre+kind+suf+strand+'.{0}.wig'.format(chrom)
                    if os.path.exists(wigpath):
                        os.unlink(wigpath)
                    wigpaths[kind][strand][suf] = wigpath
                    wigs[kind][strand][suf] = N.zeros(chromsize, dtype=float)

        # delete previous
        if os.path.exists(sjbed12):
            os.unlink(sjbed12)

        for line in fp:
            rec = line.strip().split(b'\t')
            # 7 column bed: chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6)
            st,ed = int(rec[1]),int(rec[2])
            dup = rec[3] in dupids
            estrand = rec[5]
            _add_to_ex_arrays(st,ed,dup,estrand)
            # process splice
            if pmid != rec[6]: # new map
                # flush the previous read, then start collecting this one
                _append_sj(cse, css, csj, chrom, ureads, areads)
                csj,css,cse = [rec],[],[] # reset running params
            else: # add segments
                csj.append(rec)
                prec = csj[-2] # previous rec
                sst = int(prec[2]) # ed of previous segment
                sed = int(rec[1]) # st of current segment
                cse.append((sst,sed))
                # find strand from the two bases at each end of the gap
                sted = gfc.get(chrom,sst,sst+2)+gfc.get(chrom,sed-2,sed)
                strand = STED2STRAND.get(sted,'.')
                if strand != '.':
                    css.append(strand)
                ureads,areads = _add_to_sj_arrays(sst,sed,dup,strand)
            pmid = rec[6]

    # flush the last read
    _append_sj(cse, css, csj, chrom, ureads, areads)

    _write_arrays()
    _write_sj(sjs)