def adderrortoread(readstr, readcigar, readqualitydeteriorationrate, indelrate):
    """
    Add simulated substitution errors and single-base indels to a read.

    input:
        readstr: read sequence, one character per base
        readcigar: cigar of the read; only ever handed to common.cigarsubstr
        readqualitydeteriorationrate: amount added to the substitution rate
            after every base (the rate starts at 0 for the first base)
        indelrate: value handed to common.toss to decide whether an indel
            is placed at a base (presumably a probability - confirm in common)
    output:
        [outstr, outcigarstr]: the modified read and the cigar rebuilt around
        the indels ('1I' after an insertion point, '1D' after a deletion point)

    A substituted A/C/G/T is written as its lower-case complement. Any other
    character (e.g. 'N') is passed through unchanged instead of raising
    KeyError.
    """
    # NOTE(review): a second definition of adderrortoread appears later in
    # this file and will replace this one when the module is loaded.
    switchdict = {'A': 't', 'T': 'a', 'C': 'g', 'G': 'c'}
    outbases = []
    errorrate = 0
    # indellist holds the read positions where the cigar is cut; the flag list
    # runs parallel to it: 1 = insertion, 0 = deletion, -1 = read start/end.
    indellist = [0]
    indelflaglist = [-1]
    for i, base in enumerate(readstr):
        # common.toss(indelrate) is evaluated first on every base; an indel is
        # never placed at base 0 nor directly after the previous cut point.
        if common.toss(indelrate) and i != 0 and indellist[-1] != (i - 1):
            indellist.append(i)
            if common.toss():
                # insertion: repeat the previously emitted base, keep this one
                outbases.append(outbases[-1])
                outbases.append(base)
                indelflaglist.append(1)
            else:
                # deletion: this base is not emitted
                indelflaglist.append(0)
        elif common.toss(errorrate):
            outbases.append(switchdict.get(base, base))
        else:
            outbases.append(base)
        errorrate += readqualitydeteriorationrate
    indellist.append(len(readstr))
    indelflaglist.append(-1)

    # NOTE(review): after a deletion at base i the next segment still starts
    # at i, so the deleted base appears to be covered by both the '1D' and the
    # following segment - confirm against common.cigarsubstr before changing.
    cigarparts = []
    for k in range(1, len(indellist)):
        segstart = indellist[k - 1]
        cigarparts.append(
            common.cigarsubstr(readcigar, segstart, indellist[k] - segstart))
        if indelflaglist[k] == 1:
            cigarparts.append('1I')
        elif indelflaglist[k] == 0:
            cigarparts.append('1D')
    return [''.join(outbases), ''.join(cigarparts)]
def RNAmetasource2source(parameterlist):
    """
    Build the gene -> transcript dictionary used as the simulation source.

    Input:
        parameterlist[0][0]: path of the meta source file
        parameterlist[1][0]: file type, 'gtf' (read with gtffile.gtfFile) or
            'pck' (pickled gene -> transcript dict)
        parameterlist[1][1]: 'no_novel_transcript' switches off the creation
            of novel exon-skipping transcripts; any other value enables it
        parameterlist[1][2]: number of genes to sample when the gene list is
            empty; 0 means all genes
        parameterlist[1][3]: explicit list of genes to use (may be empty)
    Output:
        Gene Dict: Transcript Dict: chromosome, strand, startbase, cigar, txsize
        (the first two fields are copied from the source transcript entry,
        the last three are the values returned by common.exonlist2cigar)
    Raises:
        ValueError: if the file type is neither 'gtf' nor 'pck'
    """
    # NOTE(review): a second definition of RNAmetasource2source appears later
    # in this file and will replace this one when the module is loaded.

    # empirical parameter
    genewithnovelskippedexonpct = 50

    metasourcefilename = parameterlist[0][0]
    filetype = parameterlist[1][0]
    novel_skipped_exon_flg = 0 if parameterlist[1][1] == 'no_novel_transcript' else 1
    numgenes = int(parameterlist[1][2])
    genelist = parameterlist[1][3]

    if filetype == 'gtf':
        metasource = gtffile.gtfFile(metasourcefilename)
        genetranscriptdict = metasource.getgenetranscriptdict()
    elif filetype == 'pck':
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(metasourcefilename) as pckfile:
            genetranscriptdict = cPickle.load(pckfile)
    else:
        raise ValueError('unknown meta source file type: %r' % (filetype,))

    if len(genelist) == 0:
        if numgenes == 0:
            numgenes = len(genetranscriptdict)
        else:
            numgenes = min(numgenes, len(genetranscriptdict))
        # list(...) because random.sample needs a sequence
        choosegenes = random.sample(list(genetranscriptdict), numgenes)
    else:
        choosegenes = genelist[0:]

    outtranscriptdict = {}
    for gene in choosegenes:
        txdict = genetranscriptdict[gene]
        # exon lists that already exist, so a "novel" transcript is really new
        txexonslist = [txdict[tx][2] for tx in txdict]
        if novel_skipped_exon_flg == 1 and common.toss(genewithnovelskippedexonpct / 100.0):
            newtxflg = 0
            trys = 0
            while not newtxflg and trys < 10:
                trys += 1
                tx = random.choice(list(txdict))
                txexons = txdict[tx][2]
                if len(txexons) > 2:
                    # randint is inclusive, so this is an internal exon index;
                    # drop exactly that exon (first and last are kept).
                    skipped = random.randint(1, len(txexons) - 2)
                    newtx = txexons[:skipped] + txexons[skipped + 1:]
                    if newtx not in txexonslist:
                        newtxflg = 1
                        txdict['%s_skip' % tx] = [txdict[tx][0], txdict[tx][1], newtx]
        outtranscriptdict[gene] = {}
        for tx in txdict:
            cigartup = common.exonlist2cigar(txdict[tx][2])
            outtranscriptdict[gene][tx] = [txdict[tx][0], txdict[tx][1],
                                           cigartup[0], cigartup[1], cigartup[2]]
    return outtranscriptdict
def fragment2read(allparameterlist):
    """
    Cut reads out of a transcript, one (or one pair) per fragment.

    input:
        allparameterlist: transcript string, transcript cigar, fragmentlist,
        read extraction parameters, where the parameters look like
            SE,100,300,400 ; single: read size and fragment range
            PE,100,300,400 ; paired: read size and fragment range
            PB,500,1000    ; pacbio: fragment range
        and every fragment is indexed as fragment[0] (start), fragment[1] (end)
    output:
        sorted list of [readstart, readstr, readcigar]; for PE the pairs are
        sorted and then flattened so the two mates of a pair stay adjacent
    """
    # NOTE(review): a second definition of fragment2read appears later in
    # this file and will replace this one when the module is loaded.
    txstr, txcigar, fragmentlist, parameterlist = allparameterlist
    readtype = parameterlist[0]

    def cutread(start, size):
        # cigar first, then sequence, packed as one read record
        cigar = common.cigarsubstr(txcigar, start, size)
        return [start, txstr[start:start + size], cigar]

    readlist = []
    if readtype == 'SE':
        readsize = int(parameterlist[1])
        for fragment in fragmentlist:
            # coin toss: read from the fragment start or ending at its end
            # NOTE(review): fragment[1] - readsize is negative for fragments
            # shorter than the read size - confirm callers never produce them
            start = fragment[0] if common.toss() else fragment[1] - readsize
            readlist.append(cutread(start, readsize))
    if readtype == 'PE':
        readsize = int(parameterlist[1])
        for fragment in fragmentlist:
            mate1 = cutread(fragment[0], readsize)
            mate2 = cutread(fragment[1] - readsize, readsize)
            readlist.append([mate1, mate2])
    if readtype == 'PB':
        for fragment in fragmentlist:
            # the whole fragment is the read
            readlist.append(cutread(fragment[0], fragment[1] - fragment[0]))

    readlist.sort()
    if readtype == 'PE':
        flattened = []
        for mate1, mate2 in readlist:
            flattened.append(mate1)
            flattened.append(mate2)
        readlist = flattened
    return readlist
def fragment2read(allparameterlist):
    """
    Extract reads from a transcript for a list of fragments.

    input:
        allparameterlist = (txstr, txcigar, fragmentlist, parameterlist)
            txstr: transcript string
            txcigar: transcript cigar (handed to common.cigarsubstr)
            fragmentlist: fragments, each indexed as [0]=start and [1]=end
            parameterlist: read extraction parameters, e.g.
                SE,100,300,400 ; single: read size and fragment range
                PE,100,300,400 ; paired: read size and fragment range
                PB,500,1000    ; pacbio: fragment range
    output:
        list of [start, readstr, readcigar] in sorted order; paired reads are
        sorted as pairs first and then written out mate by mate
    """
    txstr, txcigar, fragmentlist, parameterlist = allparameterlist
    mode = parameterlist[0]
    reads = []
    if mode == 'SE':
        size = int(parameterlist[1])
        for frag in fragmentlist:
            # single end: either end of the fragment, chosen by coin toss
            if common.toss():
                begin = frag[0]
            else:
                begin = frag[1] - size
            cigar = common.cigarsubstr(txcigar, begin, size)
            reads.append([begin, txstr[begin:begin + size], cigar])
    elif mode == 'PE':
        size = int(parameterlist[1])
        for frag in fragmentlist:
            # paired end: one read from each end of the fragment
            begin1 = frag[0]
            cigar1 = common.cigarsubstr(txcigar, begin1, size)
            seq1 = txstr[begin1:begin1 + size]
            begin2 = frag[1] - size
            cigar2 = common.cigarsubstr(txcigar, begin2, size)
            seq2 = txstr[begin2:begin2 + size]
            reads.append([[begin1, seq1, cigar1], [begin2, seq2, cigar2]])
    elif mode == 'PB':
        for frag in fragmentlist:
            # pacbio: the read spans the whole fragment
            begin = frag[0]
            size = frag[1] - frag[0]
            cigar = common.cigarsubstr(txcigar, begin, size)
            reads.append([begin, txstr[begin:begin + size], cigar])
    reads.sort()
    if mode == 'PE':
        # flatten the sorted pairs, keeping both mates next to each other
        reads = [mate for first, second in reads for mate in (first, second)]
    return reads
def adderrortoread(readstr, readcigar, readqualitydeteriorationrate, indelrate):
    """
    Simulate sequencing errors on a read and rebuild its cigar.

    input:
        readstr: read sequence, one character per base
        readcigar: cigar of the read; only ever handed to common.cigarsubstr
        readqualitydeteriorationrate: increment applied to the substitution
            rate after each base; the rate is 0 for the first base
        indelrate: value handed to common.toss when deciding whether to put
            an indel at a base (presumably a probability - confirm in common)
    output:
        [outstr, outcigarstr]
            outstr: read with substitutions (lower-case complement of
                A/C/G/T), duplicated bases (insertions) and dropped bases
                (deletions)
            outcigarstr: cigar pieces of the segments between indel points,
                joined by '1I' / '1D'

    Characters other than A/C/G/T are never substituted; they are copied
    unchanged rather than raising KeyError.
    """
    switchdict = {'A': 't', 'T': 'a', 'C': 'g', 'G': 'c'}
    emitted = []
    errorrate = 0
    # cut points of the cigar and, in parallel, what happened at each one:
    # 1 = insertion, 0 = deletion, -1 = start/end sentinel
    indellist = [0]
    indelflaglist = [-1]
    for pos, base in enumerate(readstr):
        # the indel toss happens for every base; indels are skipped at base 0
        # and directly after the previous cut point
        if common.toss(indelrate) and pos != 0 and indellist[-1] != (pos - 1):
            indellist.append(pos)
            if common.toss():
                # insertion: duplicate the last emitted base before this one
                emitted.extend([emitted[-1], base])
                indelflaglist.append(1)
            else:
                # deletion: emit nothing for this base
                indelflaglist.append(0)
        else:
            if common.toss(errorrate):
                base = switchdict.get(base, base)
            emitted.append(base)
        errorrate += readqualitydeteriorationrate
    indellist.append(len(readstr))
    indelflaglist.append(-1)

    # NOTE(review): the segment after a deletion starts at the deleted base
    # itself, so that base seems to be counted by the '1D' and by the next
    # segment - confirm against common.cigarsubstr before changing.
    pieces = []
    for idx in range(1, len(indellist)):
        left, right = indellist[idx - 1], indellist[idx]
        pieces.append(common.cigarsubstr(readcigar, left, right - left))
        flag = indelflaglist[idx]
        if flag == 1:
            pieces.append('1I')
        elif flag == 0:
            pieces.append('1D')
    return [''.join(emitted), ''.join(pieces)]
def getsimulatedtwoexpressions(genetxcountdict, covdistparam, JSDdistparam, numreads, fold_flg=0):
    """
    Computes gene and transcript coverages for two datasets

    input:
        genetxcountdict: count of transcripts for each gene
        covdistparam: mu, sigma, left, right of the truncated normal
            distribution of the gene coverage ratio between the two
            datasets; a drawn value r scales the gene reads of dataset 1
            by 10**r
        JSDdistparam: left, right, mode; target JSD values are drawn from
            uniform(left, right) or triangular(left, right, mode), chosen by
            common.toss
        numreads: total number of reads; handed to
            getsimulatedgenereadcounts and used to cap and rescale dataset 2
        fold_flg: 0 - two independent transcript expression vectors are
            drawn and compared with common.JSD; otherwise the second vector
            is the first with its first two entries swapped and the distance
            is the absolute difference of those two entries
    output:
        genepairexpdict=gene:genereads1,genereads2,txexp1,txexp2,genejsd
        ({} when genetxcountdict is empty)
    """
    # NOTE(review): a second definition of getsimulatedtwoexpressions appears
    # later in this file and will replace this one when the module is loaded.
    numgenes = len(genetxcountdict)
    if numgenes == 0:
        # nothing to simulate; also avoids dividing by zero for the fair share
        return {}
    genereadslist1 = getsimulatedgenereadcounts(numgenes, numreads)

    # truncated normal by rejection: oversample 10x, keep draws inside
    # [left, right]; random.sample raises ValueError if fewer than numgenes
    # draws survive
    covdraws = [random.gauss(covdistparam[0], covdistparam[1])
                for _ in range(10 * numgenes)]
    covinrange = [x for x in covdraws if covdistparam[2] <= x <= covdistparam[3]]
    covratiolist = random.sample(covinrange, numgenes)

    # 1: gene read count <= 5% of total is acceptable
    # 2: gene read count >= 60% of total is not acceptable
    # 3: if between 1 and 2, gene read count < 10 times fair share is acceptable
    genefairshare = numreads * 1.0 / numgenes
    hardcap = numreads * 0.60
    softcap = max(10 * genefairshare, numreads * 0.05)
    cappedreads = []
    for i in range(numgenes):
        scaled = genereadslist1[i] * pow(10, covratiolist[i])
        cappedreads.append(min(min(scaled, hardcap), softcap))
    cappedtotal = sum(cappedreads)
    # rescale so dataset 2 adds up to about numreads, at least 1 read per gene
    genereadslist2 = [max(1, int(x * numreads / cappedtotal)) for x in cappedreads]

    # one target JSD for every gene with more than one transcript
    JSDlist = []
    for gene in genetxcountdict:
        if genetxcountdict[gene] > 1:
            if common.toss():
                JSDlist.append(random.uniform(JSDdistparam[0], JSDdistparam[1]))
            else:
                JSDlist.append(random.triangular(JSDdistparam[0], JSDdistparam[1], JSDdistparam[2]))
    numbins = 50
    JSDbins = np.linspace(0, 1, numbins + 1)
    JSDbinscount = []
    for i in range(numbins):
        JSDbinscount.append(len([x for x in JSDlist if JSDbins[i] < x <= JSDbins[i + 1]]))
    print(JSDbinscount)

    outdict = {}
    for i, gene in enumerate(genetxcountdict):
        txcount = genetxcountdict[gene]
        if txcount == 1:
            outdict[gene] = [genereadslist1[i], genereadslist2[i], [1.0], [1.0], 0.0]
            continue
        # redraw expression vectors until their distance lands in a bin that
        # still has open target slots
        numattempts = 0
        while True:
            numattempts += 1
            v1 = getsimulatedtxexpressionpct(txcount)
            if fold_flg == 0:
                v2 = getsimulatedtxexpressionpct(txcount)
                genejsd = common.JSD(v1, v2)
            else:
                v2 = v1[:]
                v2[0], v2[1] = v1[1], v1[0]
                genejsd = abs(v2[1] - v2[0])
            # clamp so a distance of exactly 1.0 falls into the last bin
            binidx = min(int(genejsd * numbins), numbins - 1)
            if JSDbinscount[binidx] > 0:
                JSDbinscount[binidx] -= 1
                break
            if numattempts > 10000 and binidx > 1:
                # give up matching the target histogram for this gene; the
                # bin count goes negative and shows up in the final print
                JSDbinscount[binidx] -= 1
                break
        outdict[gene] = [genereadslist1[i], genereadslist2[i], v1, v2, genejsd]
    print(JSDbinscount)
    return outdict
def RNAmetasource2source(parameterlist):
    """
    Load the meta source and return the transcripts of the chosen genes.

    Input:
        parameterlist[0][0]: name of the meta source file
        parameterlist[1]: filetype, novel transcript switch, number of genes,
            gene list
            filetype: 'gtf' (parsed by gtffile.gtfFile) or 'pck' (pickled
                gene -> transcript dict)
            novel transcript switch: 'no_novel_transcript' disables novel
                exon-skipping transcripts, anything else enables them
            number of genes: how many genes to sample when the gene list is
                empty, 0 for all
            gene list: genes to use; when non-empty no sampling is done
    Output:
        Gene Dict: Transcript Dict: chromosome, strand, startbase, cigar, txsize
        (fields 0 and 1 of the source transcript entry followed by the three
        values returned by common.exonlist2cigar)
    Raises:
        ValueError: for a filetype other than 'gtf' or 'pck'
    """
    # empirical parameter
    genewithnovelskippedexonpct = 50

    metasourcefilename = parameterlist[0][0]
    filetype = parameterlist[1][0]
    if parameterlist[1][1] == 'no_novel_transcript':
        novel_skipped_exon_flg = 0
    else:
        novel_skipped_exon_flg = 1
    numgenes = int(parameterlist[1][2])
    genelist = parameterlist[1][3]

    if filetype == 'gtf':
        genetranscriptdict = gtffile.gtfFile(metasourcefilename).getgenetranscriptdict()
    elif filetype == 'pck':
        # NOTE: pickle files can run arbitrary code when loaded; use only
        # files from a trusted source.
        with open(metasourcefilename) as handle:
            genetranscriptdict = cPickle.load(handle)
    else:
        raise ValueError('unknown meta source file type: %r' % (filetype,))

    if len(genelist) == 0:
        if numgenes == 0:
            numgenes = len(genetranscriptdict)
        else:
            numgenes = min(numgenes, len(genetranscriptdict))
        # random.sample needs a sequence, so materialise the gene names
        choosegenes = random.sample(list(genetranscriptdict), numgenes)
    else:
        choosegenes = genelist[0:]

    skipprobability = genewithnovelskippedexonpct / 100.0
    outtranscriptdict = {}
    for gene in choosegenes:
        txdict = genetranscriptdict[gene]
        # exon lists already present for this gene
        knownexonlists = [txdict[tx][2] for tx in txdict]
        if novel_skipped_exon_flg == 1 and common.toss(skipprobability):
            # up to 10 attempts to create one transcript with a skipped exon
            for _ in range(10):
                tx = random.choice(list(txdict))
                txexons = txdict[tx][2]
                if len(txexons) <= 2:
                    continue
                # inclusive range 1 .. len-2 = internal exons only; remove
                # exactly the chosen exon
                skipped = random.randint(1, len(txexons) - 2)
                newtx = txexons[:skipped] + txexons[skipped + 1:]
                if newtx not in knownexonlists:
                    txdict['%s_skip' % tx] = [txdict[tx][0], txdict[tx][1], newtx]
                    break
        genetxout = {}
        for tx in txdict:
            cigartup = common.exonlist2cigar(txdict[tx][2])
            genetxout[tx] = [txdict[tx][0], txdict[tx][1],
                             cigartup[0], cigartup[1], cigartup[2]]
        outtranscriptdict[gene] = genetxout
    return outtranscriptdict
def getsimulatedtwoexpressions(genetxcountdict,
                               covdistparam,
                               JSDdistparam,
                               numreads,
                               fold_flg=0):
    """
    Computes gene and transcript coverages for two datasets

    input:
        genetxcountdict: count of transcripts for each gene
        covdistparam: the parameters describing the truncated normal
            distribution - mu,sigma,left,right - of the gene coverage ratio
            between the two datasets (a drawn value r multiplies the reads
            of dataset 1 by 10**r)
        JSDdistparam: the parameters describing the uniform plus triangular
            distribution - left, right, mode - of the JSD of transcript
            expression of a gene between the two datasets
        numreads: total number of reads; handed to
            getsimulatedgenereadcounts and used to cap and rescale dataset 2
        fold_flg: 0 - draw two independent transcript expression vectors and
            compare them with common.JSD; otherwise swap the first two
            entries of the first vector and use their absolute difference
    output:
        genepairexpdict=gene:genereads1,genereads2,txexp1,txexp2,genejsd
        (an empty dict when genetxcountdict is empty)
    """
    numgenes = len(genetxcountdict)
    if numgenes == 0:
        # no genes: return early instead of dividing by zero below
        return {}
    genereadslist1 = getsimulatedgenereadcounts(numgenes, numreads)

    # rejection sampling of the truncated normal from a 10x oversample;
    # random.sample raises ValueError when too few draws fall inside
    # [left, right]
    covratiolisttemp1 = [
        random.gauss(covdistparam[0], covdistparam[1])
        for _ in range(10 * numgenes)
    ]
    covratiolisttemp2 = [
        x for x in covratiolisttemp1
        if covdistparam[2] <= x <= covdistparam[3]
    ]
    covratiolist = random.sample(covratiolisttemp2, numgenes)

    # 1: gene read count <= 5% of total is acceptable
    # 2: gene read count >= 60% of total is not acceptable
    # 3: if between 1 and 2, gene read count < 10 times fair share is acceptable
    genefairshare = numreads * 1.0 / numgenes
    upperlimit = numreads * 0.60
    fairlimit = max(10 * genefairshare, numreads * 0.05)
    genereadslist2temp = [
        min(min(genereadslist1[i] * pow(10, covratiolist[i]), upperlimit),
            fairlimit) for i in range(numgenes)
    ]
    # the normalising total is the same for every gene, compute it once
    temptotal = sum(genereadslist2temp)
    genereadslist2 = [
        max(1, int(x * numreads / temptotal)) for x in genereadslist2temp
    ]

    # target JSD values, one per gene with more than one transcript
    JSDlist = []
    for gene in genetxcountdict:
        if genetxcountdict[gene] > 1:
            if common.toss():
                target = random.uniform(JSDdistparam[0], JSDdistparam[1])
            else:
                target = random.triangular(JSDdistparam[0], JSDdistparam[1],
                                           JSDdistparam[2])
            JSDlist.append(target)
    numbins = 50
    JSDbins = np.linspace(0, 1, numbins + 1)
    JSDbinscount = [
        len([x for x in JSDlist if JSDbins[i] < x <= JSDbins[i + 1]])
        for i in range(numbins)
    ]
    print(JSDbinscount)

    outdict = {}
    for i, gene in enumerate(genetxcountdict):
        numtx = genetxcountdict[gene]
        if numtx == 1:
            outdict[gene] = [
                genereadslist1[i], genereadslist2[i], [1.0], [1.0], 0.0
            ]
            continue
        binned = 0
        numattempts = 0
        # keep drawing until the distance falls into a bin with open slots
        while binned == 0:
            numattempts += 1
            v1 = getsimulatedtxexpressionpct(numtx)
            if fold_flg == 0:
                v2 = getsimulatedtxexpressionpct(numtx)
                genejsd = common.JSD(v1, v2)
            else:
                v2 = v1[:]
                v2[0], v2[1] = v1[1], v1[0]
                genejsd = abs(v2[1] - v2[0])
            # clamp: a distance of exactly 1.0 belongs to the last bin
            binidx = min(int(genejsd * numbins), numbins - 1)
            if JSDbinscount[binidx] > 0:
                binned = 1
                JSDbinscount[binidx] -= 1
            elif numattempts > 10000 and binidx > 1:
                # stop insisting on the target histogram; the count of this
                # bin goes negative and is visible in the final print
                binned = 1
                JSDbinscount[binidx] -= 1
        outdict[gene] = [
            genereadslist1[i], genereadslist2[i], v1, v2, genejsd
        ]
    print(JSDbinscount)
    return outdict