def test_GenomeFASTAChroms(datadir):
    """Plain and gzipped FASTA chromosome directories expose identical data.

    For each fixture directory, the reader must report the matching file
    extension, list both chromosomes, and return the expected subsequences
    (lowercase soft-masked bases from chr1, uppercase from chr2).
    """
    for subdir, expected_ext in (('FASTA', '.fa'), ('FASTA.gz', '.fa.gz')):
        chromdir = os.path.join(datadir, subdir)
        ga = FA.GenomeFASTAChroms(chromdir, '\n')
        assert ga.ext == expected_ext
        assert set(ga.chromosomes) == {'chr1', 'chr2'}
        assert ga.get('chr1', 100, 110) == 'tcccagatga'
        assert ga.get('chr2', 605, 610) == 'ATGTC'
def test_count_repeats_mp(datadir):
    """count_repeats_mp tallies repeat-masked bases over union exons.

    Builds a small exon table, unions it by two different grouping keys,
    and checks the '#repbp' counts (and one returned sequence) against the
    repeat annotation of the FASTA fixture genome.
    """
    TESTDATA = StringIO("""st,ed,name,sc1,chr,strand
0,10,a,0,chr1,+
5,20,a,1,chr1,-
25,30,a,1,chr1,+
40,45,b,2,chr1,-
45,50,b,2,chr1,+
49,55,c,2,chr1,+
255,260,d,3,chr2,-
260,270,d,4,chr2,+
370,380,e,4,chr2,-
380,390,e,5,chr2,+
""")
    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
    # read_csv with sep="," is the direct replacement (no index column is
    # taken from the data, matching the old index_col=False behavior).
    df = PD.read_csv(TESTDATA, sep=",")
    print(df)
    udf = UT.make_unionex(df, gidx='name')
    chromdir = os.path.join(datadir, 'FASTA')
    gfc = FA.GenomeFASTAChroms(chromdir)
    # returnseq=True mutates udf in place, adding 'seq' and '#repbp' columns.
    RP.count_repeats_mp(udf, gfc, returnseq=True)
    print(udf)
    assert list(udf['#repbp']) == [0, 0, 0, 3, 15, 20]
    assert udf.iloc[3]['seq'] == 'TCTgag'
    # Re-union by score group; returnseq=False returns the annotated frame.
    udf = UT.make_unionex(df, gidx='sc1')
    udf = RP.count_repeats_mp(udf, gfc, returnseq=False)
    print(udf)
    assert list(udf['#repbp']) == [0, 0, 0, 3, 5, 10, 10, 10]
def __init__(self, sjexpre, code, chromdir, rmskviz, outdir, **kw):
    """Load sj/ex tables and prepare genome access for repeat analysis.

    Args:
        sjexpre: path prefix of the '.ex.txt.gz'/'.sj.txt.gz' input tables.
        code: run identifier; joined with outdir to form the output prefix.
        chromdir: directory of per-chromosome FASTA files.
        rmskviz: RepeatMasker viz track (stored; used elsewhere in the class).
        outdir: output directory for generated files.
        **kw: overrides merged into a copy of RMSKPARAMS.
    """
    self.sjexpre = sjexpre
    self.prefix = prefix = os.path.join(outdir, code)
    self.fnobj = FN.FileNamesBase(prefix)
    self.chromdir = chromdir
    self.rmskviz = rmskviz
    self.gfc = FA.GenomeFASTAChroms(chromdir)
    self.params = RMSKPARAMS.copy()
    self.params.update(kw)
    self.ex = UT.read_pandas(sjexpre + '.ex.txt.gz')
    self.sj = UT.read_pandas(sjexpre + '.sj.txt.gz')
    # Ensure genomic/transcript length columns exist, computing and caching
    # chopped intervals if necessary, then persist the augmented exon table.
    if 'glen' not in self.ex or 'tlen' not in self.ex:
        if not os.path.exists(sjexpre + '.ci.txt.gz'):
            # BUGFIX: was UT.chopintervals(ex, ...) — 'ex' is undefined here;
            # the exon table lives in self.ex.
            ci = UT.chopintervals(self.ex, sjexpre + '.ci.txt.gz')
        else:
            ci = UT.read_ci(sjexpre + '.ci.txt.gz')
        UT.set_glen_tlen(self.ex, ci, gidx='_gidx')
        UT.write_pandas(self.ex, sjexpre + '.ex.txt.gz', 'h')
    # Union exons are expensive to build; cache them next to the inputs.
    uexpath = sjexpre + '.unionex.txt.gz'
    if os.path.exists(uexpath):
        self.uex = UT.read_pandas(uexpath)
    else:
        LOG.info('making union exons...saving to {0}'.format(uexpath))
        self.uex = UT.make_unionex(self.ex, '_gidx')
        UT.write_pandas(self.uex, uexpath, 'h')
def __init__(self, j2pre, code, chromdir, rmskviz, dstpre, **kw):
    """Prepare genome access and path table for repeat analysis of paths.

    Args:
        j2pre: path prefix of the '.paths.txt.gz' input table.
        code: run identifier; joined with dstpre to form the output prefix.
        chromdir: directory of per-chromosome FASTA files.
        rmskviz: RepeatMasker viz track (stored; used elsewhere in the class).
        dstpre: destination directory/prefix for generated files.
        **kw: overrides merged into a copy of RMSKPARAMS.
    """
    self.j2pre = j2pre
    # BUGFIX: 'prefix' was referenced without ever being assigned (NameError).
    # Mirror the sibling __init__, which builds it from the output dir + code.
    self.prefix = prefix = os.path.join(dstpre, code)
    self.fnobj = FN.FileNamesBase(prefix)
    self.chromdir = chromdir
    self.rmskviz = rmskviz
    self.gfc = FA.GenomeFASTAChroms(chromdir)
    self.params = RMSKPARAMS.copy()
    self.params.update(kw)
    # get exons from paths
    self.paths = paths = UT.read_pandas(j2pre + '.paths.txt.gz', names=A2.PATHCOLS)
def _process_mapbed_chr(dstpre, chrom, genome, chromdir, stranded):
    """Convert one chromosome's mapped-read BED into coverage wiggles and splice-junction paths.

    Streams a 7-column BED (chr, st, ed, name, mapq, strand, mapid), accumulating
    per-base exon and junction coverage arrays (all reads and unique-only), and
    collects spliced-read paths written out as a BED12 sjpath file.

    Args:
        dstpre: path prefix for all per-chromosome input/output files.
        chrom: chromosome name being processed.
        genome: genome identifier used to look up the chromosome size.
        chromdir: per-chromosome FASTA directory (used to infer junction strand
            from splice-site dinucleotides).
        stranded: passed through STRANDMAP to resolve exon-array strand keys —
            presumably a library-prep strandedness flag; semantics live in
            STRANDMAP (defined elsewhere).

    Side effects: overwrites 12 wiggle files ({.ex,.sj} x {.p,.n,.u} x {,'.uniq'})
    and one '.sjpath.bed' file for this chromosome.
    """
    # 1st pass: calc dupdic
    bedpath = dstpre+'.{0}.bed'.format(chrom)
    # Index of read names flagged as duplicates (multi-mappers) in an earlier pass.
    dupids = UT.read_pandas(dstpre+'.dupitems.txt.gz', index_col=[0]).index
    # 2nd pass make wiggles
    gfc = FA.GenomeFASTAChroms(chromdir)
    chromsize = UT.df2dict(UT.chromdf(genome), 'chr', 'size')[chrom]
    # mqth MAPQ threshold there are ~6% <10
    # Nested dicts: wigs[kind][strand][suf] -> per-base float coverage array;
    # wigpaths mirrors the structure with output file paths. Stale outputs are
    # removed up front so _write_arrays starts clean.
    wigs = {}
    wigpaths = {}
    for kind in ['.ex','.sj']:
        wigs[kind] = {}
        wigpaths[kind] = {}
        for strand in ['.p','.n','.u']:
            wigs[kind][strand] = {}
            wigpaths[kind][strand] = {}
            for suf in ['','.uniq']:
                wigpath = dstpre+kind+suf+strand+'.{0}.wig'.format(chrom)
                if os.path.exists(wigpath):
                    os.unlink(wigpath)
                wigpaths[kind][strand][suf] = wigpath
                wigs[kind][strand][suf] = N.zeros(chromsize, dtype=float)
    sjs = []  # path: (chr, st, ed, pcode, ucnt, strand, acnt)
    # pcode = a(apos)d(dpos) = a(ed)d(st) if strand=='+' else a(st)d(ed)
    # ucnt = unique read counts
    # acnt = multi-read adjusted all counts (=ucnt+Sum(mcnt(i)/dup(i)))
    # delete previous
    sjbed12 = dstpre+'.{0}.sjpath.bed'.format(chrom)
    if os.path.exists(sjbed12):
        os.unlink(sjbed12)

    def _write_arrays():
        # Flush every coverage array to its wiggle file.
        for kind in ['.ex','.sj']:
            for strand in ['.p','.n','.u']:
                for suf in ['','.uniq']:
                    cybw.array2wiggle_chr64(wigs[kind][strand][suf], chrom, wigpaths[kind][strand][suf], 'w')

    def _write_sj(sjs):
        # sjs = [(chr,st,ed,pathcode(name),ureads(sc1),strand,tst,ted,areads(sc2),cse),...]
        # Aggregate records sharing a path code, then emit BED12 with block
        # starts/sizes derived from the junction (sst,sed) pairs.
        sjdf = PD.DataFrame(sjs, columns=GGB.BEDCOLS[:9]+['cse'])
        sjdfgr = sjdf.groupby('name')
        sj = sjdfgr.first()
        sj['sc1'] = sjdfgr['sc1'].sum().astype(int)  # ucnt
        sj['sc2'] = sjdfgr['sc2'].sum().astype(int)  # jcnt=ucnt+mcnt
        sj['st'] = sjdfgr['st'].min()
        sj['ed'] = sjdfgr['ed'].max()
        sj['#exons'] = sj['cse'].apply(len)+1
        sj['ests'] = [[0]+[z[1]-st for z in cse] for st,cse in sj[['st','cse']].values]
        sj['eeds'] = [[z[0]-st for z in cse]+[ed-st] for st,ed,cse in sj[['st','ed','cse']].values]
        esizes = [[u-v for u,v in zip(x,y)] for x,y in sj[['eeds','ests']].values]
        sj['estarts'] = ['{0},'.format(','.join([str(y) for y in x])) for x in sj['ests']]
        sj['esizes'] = ['{0},'.format(','.join([str(y) for y in x])) for x in esizes]
        sj['name'] = sj.index
        # sj = sj.reset_index()
        with open(sjbed12, 'w') as f:
            sj[GGB.BEDCOLS].to_csv(f, index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)

    def _append_sj(cse, css, csj, chrom, ureads, areads):
        if (len(cse)>0):  # spits out splice rec
            # chr,st,ed,pathcode,ureads,strand,tst,ted,areads
            tst = cse[0][0]
            ted = cse[-1][1]
            # Majority vote over per-junction strand calls; '.' when none.
            if len(css)>0:
                strand = Counter(css).most_common()[0][0]
            else:
                strand = '.'
            name = pathcode(cse, strand)
            st = int(csj[0][1])   # first segment start
            ed = int(csj[-1][2])  # last segment end
            sjs.append((chrom,st,ed,name,ureads,strand,tst,ted,areads,cse))

    def _add_to_ex_arrays(st, ed, dup, strand):
        # Exon coverage: all reads always, '.uniq' only for non-duplicates.
        kind = '.ex'
        strand = STRANDMAP[(strand,stranded)]
        dic = wigs[kind][strand]
        dic[''][st:ed] += 1
        if not dup:
            dic['.uniq'][st:ed] += 1

    def _add_to_sj_arrays(sst, sed, dup, strand):
        # Junction coverage; returns the (ureads, areads) contribution of
        # this read: (1,1) if unique, (0,1) if duplicate.
        kind = '.sj'
        s = {'+':'.p','-':'.n','.':'.u'}[strand]
        dic = wigs[kind][s]
        # add to the arrays
        dic[''][sst:sed] += 1
        if not dup:
            dic['.uniq'][sst:sed] += 1
            ureads,areads = 1,1
        else:
            ureads,areads = 0,1
        return ureads,areads

    # Running state for the current spliced read (segments share a mapid).
    csj = []  # current collection of spliced reads
    css = []  # current strands
    cse = []  # current (sst,sed)
    csn = 0   # current segment number (reset with each map; not otherwise used)
    ureads,areads = 1,1  # uniq, total reads it's either 1,1 or 0,1
    pmid = None  # previous map id common to spliced segments
    # BUGFIX: the input file was opened without ever being closed; 'with'
    # guarantees release even if a parse error raises mid-stream.
    with open(bedpath,'rb') as fp:
        for line in fp:
            rec = line.strip().split(b'\t')
            # 7 column bed: chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6)
            st,ed = int(rec[1]),int(rec[2])
            dup = rec[3] in dupids  # dic[rec[3]]
            estrand = rec[5]
            _add_to_ex_arrays(st,ed,dup,estrand)
            # process splice
            if pmid != rec[6]:  # new map
                _append_sj(cse, css, csj, chrom, ureads, areads)
                csj,css,cse,csn = [rec],[],[],0  # reset running params
            else:  # add segments
                csj.append(rec)
                prec = csj[-2]      # previous rec
                sst = int(prec[2])  # ed of previous segment
                sed = int(rec[1])   # st of current segment
                cse.append((sst,sed))
                # find strand from splice-site dinucleotides (donor+acceptor)
                sted = gfc.get(chrom,sst,sst+2)+gfc.get(chrom,sed-2,sed)
                strand = STED2STRAND.get(sted,'.')
                if strand != '.':
                    css.append(strand)
                ureads,areads = _add_to_sj_arrays(sst,sed,dup,strand)
            pmid = rec[6]
    # Flush the final in-progress spliced read, then write all outputs.
    _append_sj(cse, css, csj, chrom, ureads, areads)
    _write_arrays()
    _write_sj(sjs)