Пример #1
0
def adderrortoread(readstr,readcigar,readqualitydeteriorationrate,indelrate):
    """
    Add simulated substitution and single-base indel errors to a read.

    input:
        readstr: read sequence
        readcigar: cigar of the read, passed to common.cigarsubstr
        readqualitydeteriorationrate: amount added to the substitution error
                      rate after every base (the rate starts at 0)
        indelrate: argument handed to common.toss for the per-base indel draw
                      (presumably a probability -- confirm against common.toss)
    output:
        [outstr,outcigarstr]: read with errors and the cigar rebuilt around
        the indels ('1I' after an insertion point, '1D' after a deletion point)

    Substituted bases are written in lower case (A->t, T->a, C->g, G->c).
    Bases without an entry in the substitution table (e.g. 'N' or lower-case
    soft-masked bases) are copied unchanged instead of raising KeyError.
    """
    switchdict={'A':'t','T':'a','C':'g','G':'c'}
    outchars=[]
    errorrate=0
    # indellist holds the segment boundaries (read positions), indelflaglist the
    # event at each boundary: 1=insertion, 0=deletion, -1=none (sentinels)
    indellist=[0]
    indelflaglist=[-1]
    for i,base in enumerate(readstr):
        # the toss is drawn first for every base, even when the position is not
        # eligible; no indel at position 0 or right after the previous indel
        if common.toss(indelrate) and i!=0 and indellist[-1]!=(i-1):
            indellist.append(i)
            if common.toss():
                # insertion: repeat the last emitted character, then emit the base
                outchars.append(outchars[-1])
                outchars.append(base)
                indelflaglist.append(1)
            else:
                # deletion: the base is not emitted
                # NOTE(review): the next cigar segment still starts at i, so the
                # deleted base appears to be counted in it as well -- confirm
                # against common.cigarsubstr semantics
                indelflaglist.append(0)
        elif common.toss(errorrate):
            outchars.append(switchdict.get(base,base))
        else:
            outchars.append(base)
        errorrate+=readqualitydeteriorationrate
    indellist.append(len(readstr))
    indelflaglist.append(-1)
    cigarparts=[]
    for i in range(1,len(indellist)):
        segstart=indellist[i-1]
        cigarparts.append(common.cigarsubstr(readcigar,segstart,indellist[i]-segstart))
        if indelflaglist[i]==1:
            cigarparts.append('1I')
        elif indelflaglist[i]==0:
            cigarparts.append('1D')

    return [''.join(outchars),''.join(cigarparts)]
Пример #2
0
def RNAmetasource2source(parameterlist):
    """
    Build the gene/transcript source dictionary from an annotation metasource.

    Input:
        parameterlist[0][0]: metasource file name
        parameterlist[1][0]: file type, 'gtf' or 'pck' (pickled gene dict)
        parameterlist[1][1]: 'no_novel_transcript' disables novel transcripts,
                             any other value enables them
        parameterlist[1][2]: number of genes to sample (0 = all genes); only
                             used when the gene list is empty
        parameterlist[1][3]: explicit gene list; used as is when not empty
    Output:
        Gene Dict: Transcript Dict: chromosome, strand, startbase, cigar, txsize
        (first two fields are copied from the metasource entry, the last three
        are the tuple returned by common.exonlist2cigar)
    Raises:
        ValueError if the file type is neither 'gtf' nor 'pck'
    """
    # empirical parameter
    genewithnovelskippedexonpct=50
    metasourcefilename=parameterlist[0][0]
    filetype=parameterlist[1][0]
    if parameterlist[1][1]=='no_novel_transcript':
        novel_skipped_exon_flg=0
    else:
        novel_skipped_exon_flg=1
    numgenes=int(parameterlist[1][2])
    genelist=parameterlist[1][3]

    if filetype=='gtf':
        metasource=gtffile.gtfFile(metasourcefilename)
        genetranscriptdict=metasource.getgenetranscriptdict()
    elif filetype=='pck':
        # NOTE: unpickling executes arbitrary code; only load trusted files
        with open(metasourcefilename) as pckfile:
            genetranscriptdict=cPickle.load(pckfile)
    else:
        raise ValueError('unsupported metasource file type: %r'%(filetype,))

    if len(genelist)==0:
        if numgenes==0:
            numgenes=len(genetranscriptdict)
        else:
            numgenes=min(numgenes,len(genetranscriptdict))
        # random.sample needs a real sequence, not a dict view
        choosegenes=random.sample(list(genetranscriptdict),numgenes)
    else:
        choosegenes=genelist[0:]

    outtranscriptdict={}
    for gene in choosegenes:
        txdict=genetranscriptdict[gene]
        txexonslist=[txdict[tx][2] for tx in txdict]
        if novel_skipped_exon_flg==1 and common.toss(genewithnovelskippedexonpct/100.0):
            # up to 10 attempts to create one transcript with an internal exon removed
            txnames=list(txdict)
            for attempt in range(10):
                tx=random.choice(txnames)
                txexons=txdict[tx][2]
                if len(txexons)<=2:
                    # no internal exon to skip; never reuse a stale candidate
                    continue
                # index of an internal exon: first and last exon are kept
                skipped=random.randint(1,len(txexons)-2)
                newtx=txexons[:skipped]+txexons[skipped+1:]
                if newtx not in txexonslist:
                    txdict['%s_skip'%tx]=[txdict[tx][0],txdict[tx][1],newtx]
                    break
        outtranscriptdict[gene]={}
        for tx in txdict:
            cigartup=common.exonlist2cigar(txdict[tx][2])
            outtranscriptdict[gene][tx]=[txdict[tx][0],txdict[tx][1],cigartup[0],cigartup[1],cigartup[2]]
    return outtranscriptdict
Пример #3
0
def fragment2read(allparameterlist):
    """
    Cut reads out of a transcript for every fragment.

    input:
        allparameterlist: transcript string, transcript cigar, fragmentlist,
        read extraction parameters. Each fragment is indexed as
        (fragment[0], fragment[1]); the first extraction parameter selects
        the mode:
            SE,100,300,400   single end: read size and fragment range
            PE,100,300,400   paired end: read size and fragment range
            PB,500,1000      pacbio: fragment range (whole fragment is the read)
    output:
        list of [readstart, readstr, readcigar], sorted; for PE the two mates
        of a fragment are adjacent (pairs are sorted, then flattened)
    """
    txstr, txcigar, fragmentlist, parameterlist = allparameterlist
    mode = parameterlist[0]

    def extract(start, size):
        # cigar is computed before the sequence slice, as in every mode
        cigar = common.cigarsubstr(txcigar, start, size)
        return [start, txstr[start:start + size], cigar]

    readlist = []
    if mode == 'SE':
        readsize = int(parameterlist[1])
        for fragment in fragmentlist:
            # one read per fragment, taken from either end of the fragment
            start = fragment[0] if common.toss() else fragment[1] - readsize
            readlist.append(extract(start, readsize))
    elif mode == 'PE':
        readsize = int(parameterlist[1])
        for fragment in fragmentlist:
            # one mate from each end of the fragment, kept together as a pair
            readlist.append([extract(fragment[0], readsize),
                             extract(fragment[1] - readsize, readsize)])
    elif mode == 'PB':
        for fragment in fragmentlist:
            readlist.append(extract(fragment[0], fragment[1] - fragment[0]))

    readlist.sort()
    if mode == 'PE':
        readlist = [read for pair in readlist for read in pair]
    return readlist
Пример #4
0
def fragment2read(allparameterlist):
    """
    Extract reads from a transcript, one (SE, PB) or two (PE) per fragment.

    input:
        transcript string, transcript cigar, fragmentlist, read extraction
        parameters, packed in allparameterlist
    output:
        sorted list of [readstart,readstr,readcigar]; in PE mode the sorted
        pairs are flattened so both mates of a fragment stay adjacent
    """
    #read_extract_parameters=SE,100,300,400       ; single:size and fragment range
    #read_extract_parameters=PE,100,300,400       ; paired:size and fragment range
    #read_extract_parameters=PB,500,1000          ; pacbio fragment range
    txstr,txcigar,fragmentlist,parameterlist=allparameterlist
    readtype=parameterlist[0]

    if readtype=='PE':
        matesize=int(parameterlist[1])
        pairs=[]
        for frag in fragmentlist:
            leftstart=frag[0]
            leftcigar=common.cigarsubstr(txcigar,leftstart,matesize)
            leftmate=[leftstart,txstr[leftstart:leftstart+matesize],leftcigar]
            rightstart=frag[1]-matesize
            rightcigar=common.cigarsubstr(txcigar,rightstart,matesize)
            rightmate=[rightstart,txstr[rightstart:rightstart+matesize],rightcigar]
            pairs.append([leftmate,rightmate])
        pairs.sort()
        flatreads=[]
        for pair in pairs:
            flatreads.extend(pair)
        return flatreads

    reads=[]
    if readtype=='SE':
        size=int(parameterlist[1])
        for frag in fragmentlist:
            # read is taken from one of the two fragment ends
            if common.toss():
                begin=frag[0]
            else:
                begin=frag[1]-size
            cig=common.cigarsubstr(txcigar,begin,size)
            reads.append([begin,txstr[begin:begin+size],cig])
    elif readtype=='PB':
        for frag in fragmentlist:
            # the whole fragment is the read
            begin=frag[0]
            size=frag[1]-begin
            cig=common.cigarsubstr(txcigar,begin,size)
            reads.append([begin,txstr[begin:begin+size],cig])
    reads.sort()
    return reads
Пример #5
0
def adderrortoread(readstr, readcigar, readqualitydeteriorationrate,
                   indelrate):
    """
    Add simulated substitution and single-base indel errors to a read.

    input:
        readstr: read sequence
        readcigar: cigar of the read, passed to common.cigarsubstr
        readqualitydeteriorationrate: amount added to the substitution error
                      rate after every base (the rate starts at 0)
        indelrate: argument handed to common.toss for the per-base indel draw
                      (presumably a probability -- confirm against common.toss)
    output:
        [outstr, outcigarstr]: read with errors and the cigar rebuilt around
        the indels ('1I' after an insertion point, '1D' after a deletion point)

    Substituted bases are written in lower case (A->t, T->a, C->g, G->c).
    Bases without an entry in the substitution table (e.g. 'N' or lower-case
    soft-masked bases) are copied unchanged instead of raising KeyError.
    """
    switchdict = {'A': 't', 'T': 'a', 'C': 'g', 'G': 'c'}
    outchars = []
    errorrate = 0
    # indellist holds the segment boundaries (read positions), indelflaglist
    # the event at each boundary: 1=insertion, 0=deletion, -1=none (sentinels)
    indellist = [0]
    indelflaglist = [-1]
    for i, base in enumerate(readstr):
        # the toss is drawn first for every base, even when the position is
        # not eligible; no indel at position 0 or right after the previous one
        if common.toss(indelrate) and i != 0 and indellist[-1] != (i - 1):
            indellist.append(i)
            if common.toss():
                # insertion: repeat the last emitted character, then the base
                outchars.append(outchars[-1])
                outchars.append(base)
                indelflaglist.append(1)
            else:
                # deletion: the base is not emitted
                # NOTE(review): the next cigar segment still starts at i, so
                # the deleted base appears to be counted in it as well --
                # confirm against common.cigarsubstr semantics
                indelflaglist.append(0)
        elif common.toss(errorrate):
            outchars.append(switchdict.get(base, base))
        else:
            outchars.append(base)
        errorrate += readqualitydeteriorationrate
    indellist.append(len(readstr))
    indelflaglist.append(-1)
    cigarparts = []
    for i in range(1, len(indellist)):
        segstart = indellist[i - 1]
        cigarparts.append(
            common.cigarsubstr(readcigar, segstart, indellist[i] - segstart))
        if indelflaglist[i] == 1:
            cigarparts.append('1I')
        elif indelflaglist[i] == 0:
            cigarparts.append('1D')

    return [''.join(outchars), ''.join(cigarparts)]
Пример #6
0
def getsimulatedtwoexpressions(genetxcountdict,covdistparam,JSDdistparam,numreads,fold_flg=0):
    """
    Computes gene and transcript coverages for two datasets
    input:
        genetxcountdict: count of transcripts for each gene
        covdistparam: the parameters describing truncated normal distribution - mu,sigma,left,right
                      of ratio of rpkm of gene between two datasets (used as a power of 10)
        JSDdistparam: the parameters describing uniform plus triangular distribution - left, right, mode
                      of JSD of transcript expression of a gene between two datasets
        numreads: total number of reads, passed to getsimulatedgenereadcounts and
                  used to rescale the read counts of the second dataset
        fold_flg: 0 draws two independent transcript expression vectors per gene;
                  otherwise the second vector is the first with its first two
                  entries swapped and their absolute difference is stored
                  in place of the JSD
    output:
        genepairexpdict=gene:genereads1,genereads2,txexp1,txexp2,genejsd
    side effects:
        prints the JSD bin counts before and after the genes are assigned
    raises:
        ValueError (from random.sample) if fewer than numgenes of the
        10*numgenes coverage ratio draws fall inside [left,right]
    """
    numgenes=len(genetxcountdict)
    genereadslist1=getsimulatedgenereadcounts(numgenes,numreads)
    # truncated normal by rejection: draw 10x more than needed, keep in-range values
    covratiolisttemp1=[random.gauss(covdistparam[0],covdistparam[1]) for i in range(10*numgenes)]
    covratiolisttemp2=[x for x in covratiolisttemp1 if covdistparam[2]<=x<=covdistparam[3]]
    covratiolist=random.sample(covratiolisttemp2,numgenes)
    # 1: gene read count <= 5% of total is acceptable
    # 2: gene read count >= 60% of total is not acceptable
    # 3: if between 1 and 2, gene read count < 10 times fair share is acceptable
    genefairshare=numreads*1.0/numgenes
    genereadslist2temp1=[genereadslist1[i]*pow(10,covratiolist[i]) for i in range(numgenes)]
    genereadslist2temp2=[min(x,numreads*0.60) for x in genereadslist2temp1]
    genereadslist2temp3=[min(x,max(10*genefairshare,numreads*0.05)) for x in genereadslist2temp2]
    # rescale to numreads; the total is computed once instead of once per gene
    cappedtotal=sum(genereadslist2temp3)
    genereadslist2=[max(1,int(x*numreads/cappedtotal)) for x in genereadslist2temp3]

    # target JSD values, one per multi-transcript gene
    JSDlist=[]
    for gene in genetxcountdict:
        if genetxcountdict[gene]>1:
            if common.toss():
                JSDlist.append(random.uniform(JSDdistparam[0],JSDdistparam[1]))
            else:
                JSDlist.append(random.triangular(JSDdistparam[0],JSDdistparam[1],JSDdistparam[2]))

    # histogram of the target values: how many genes each JSD bin may receive
    numbins=50
    JSDbins=np.linspace(0,1,numbins+1)
    JSDbinscount=[]
    for i in range(numbins):
        JSDbinscount.append(len([x for x in JSDlist if JSDbins[i]<x<=JSDbins[i+1]]))

    # single-argument parenthesised form prints the same on Python 2 and 3
    print(JSDbinscount)

    outdict={}
    # same order as the iteration that produced the per-gene read count lists
    for i,gene in enumerate(genetxcountdict):
        if genetxcountdict[gene]==1:
            outdict[gene]=[genereadslist1[i],genereadslist2[i],[1.0],[1.0],0.0]
        else:
            # redraw expression vectors until they land in a bin with capacity left
            binned=0; numattempts=0
            while binned==0:
                numattempts+=1
                if fold_flg==0:
                    v1=getsimulatedtxexpressionpct(genetxcountdict[gene])
                    v2=getsimulatedtxexpressionpct(genetxcountdict[gene])
                    genejsd=common.JSD(v1,v2)
                else:
                    v1=getsimulatedtxexpressionpct(genetxcountdict[gene])
                    v2=v1[:]
                    v2[0],v2[1]=v1[1],v1[0]
                    genejsd=abs(v2[1]-v2[0])
                # a value of exactly 1.0 belongs to the last bin, not past it
                binidx=min(int(genejsd*numbins),numbins-1)

                if JSDbinscount[binidx]>0:
                    binned=1
                    JSDbinscount[binidx]-=1
                elif numattempts>10000 and binidx>1:
                    # give up matching the histogram after many attempts; the
                    # bin goes negative, and only once per gene
                    binned=1
                    JSDbinscount[binidx]-=1

            outdict[gene]=[genereadslist1[i],genereadslist2[i],v1,v2,genejsd]
    print(JSDbinscount)
    return outdict
Пример #7
0
def RNAmetasource2source(parameterlist):
    """
    Build the gene/transcript source dictionary from an annotation metasource.

    Input:
        parameterlist[0][0]: metasource file name
        parameterlist[1][0]: file type, 'gtf' or 'pck' (pickled gene dict)
        parameterlist[1][1]: 'no_novel_transcript' disables novel transcripts,
                             any other value enables them
        parameterlist[1][2]: number of genes to sample (0 = all genes); only
                             used when the gene list is empty
        parameterlist[1][3]: explicit gene list; used as is when not empty
    Output:
        Gene Dict: Transcript Dict: chromosome, strand, startbase, cigar, txsize
        (first two fields are copied from the metasource entry, the last three
        are the tuple returned by common.exonlist2cigar)
    Raises:
        ValueError if the file type is neither 'gtf' nor 'pck'
    """
    # empirical parameter
    genewithnovelskippedexonpct = 50
    metasourcefilename = parameterlist[0][0]
    filetype = parameterlist[1][0]
    if parameterlist[1][1] == 'no_novel_transcript':
        novel_skipped_exon_flg = 0
    else:
        novel_skipped_exon_flg = 1
    numgenes = int(parameterlist[1][2])
    genelist = parameterlist[1][3]

    if filetype == 'gtf':
        metasource = gtffile.gtfFile(metasourcefilename)
        genetranscriptdict = metasource.getgenetranscriptdict()
    elif filetype == 'pck':
        # NOTE: unpickling executes arbitrary code; only load trusted files
        with open(metasourcefilename) as pckfile:
            genetranscriptdict = cPickle.load(pckfile)
    else:
        raise ValueError('unsupported metasource file type: %r' % (filetype,))

    if len(genelist) == 0:
        if numgenes == 0:
            numgenes = len(genetranscriptdict)
        else:
            numgenes = min(numgenes, len(genetranscriptdict))
        # random.sample needs a real sequence, not a dict view
        choosegenes = random.sample(list(genetranscriptdict), numgenes)
    else:
        choosegenes = genelist[0:]

    outtranscriptdict = {}
    for gene in choosegenes:
        txdict = genetranscriptdict[gene]
        txexonslist = [txdict[tx][2] for tx in txdict]
        if novel_skipped_exon_flg == 1 and common.toss(
                genewithnovelskippedexonpct / 100.0):
            # up to 10 attempts to create one transcript with an internal
            # exon removed
            txnames = list(txdict)
            for attempt in range(10):
                tx = random.choice(txnames)
                txexons = txdict[tx][2]
                if len(txexons) <= 2:
                    # no internal exon to skip; never reuse a stale candidate
                    continue
                # index of an internal exon: first and last exon are kept
                skipped = random.randint(1, len(txexons) - 2)
                newtx = txexons[:skipped] + txexons[skipped + 1:]
                if newtx not in txexonslist:
                    txdict['%s_skip' %
                           tx] = [txdict[tx][0], txdict[tx][1], newtx]
                    break
        outtranscriptdict[gene] = {}
        for tx in txdict:
            cigartup = common.exonlist2cigar(txdict[tx][2])
            outtranscriptdict[gene][tx] = [
                txdict[tx][0], txdict[tx][1], cigartup[0], cigartup[1],
                cigartup[2]
            ]
    return outtranscriptdict
Пример #8
0
def getsimulatedtwoexpressions(genetxcountdict,
                               covdistparam,
                               JSDdistparam,
                               numreads,
                               fold_flg=0):
    """
    Computes gene and transcript coverages for two datasets
    input:
        genetxcountdict: count of transcripts for each gene
        covdistparam: the parameters describing truncated normal distribution - mu,sigma,left,right
                      of ratio of rpkm of gene between two datasets (used as a power of 10)
        JSDdistparam: the parameters describing uniform plus triangular distribution - left, right, mode
                      of JSD of transcript expression of a gene between two datasets
        numreads: total number of reads, passed to getsimulatedgenereadcounts and
                  used to rescale the read counts of the second dataset
        fold_flg: 0 draws two independent transcript expression vectors per gene;
                  otherwise the second vector is the first with its first two
                  entries swapped and their absolute difference is stored
                  in place of the JSD
    output:
        genepairexpdict=gene:genereads1,genereads2,txexp1,txexp2,genejsd
    side effects:
        prints the JSD bin counts before and after the genes are assigned
    raises:
        ValueError (from random.sample) if fewer than numgenes of the
        10*numgenes coverage ratio draws fall inside [left,right]
    """
    numgenes = len(genetxcountdict)
    genereadslist1 = getsimulatedgenereadcounts(numgenes, numreads)
    # truncated normal by rejection: draw 10x more than needed, keep the
    # in-range values
    covratiolisttemp1 = [
        random.gauss(covdistparam[0], covdistparam[1])
        for i in range(10 * numgenes)
    ]
    covratiolisttemp2 = [
        x for x in covratiolisttemp1 if covdistparam[2] <= x <= covdistparam[3]
    ]
    covratiolist = random.sample(covratiolisttemp2, numgenes)
    # 1: gene read count <= 5% of total is acceptable
    # 2: gene read count >= 60% of total is not acceptable
    # 3: if between 1 and 2, gene read count < 10 times fair share is acceptable
    genefairshare = numreads * 1.0 / numgenes
    genereadslist2temp1 = [
        genereadslist1[i] * pow(10, covratiolist[i]) for i in range(numgenes)
    ]
    genereadslist2temp2 = [
        min(x, numreads * 0.60) for x in genereadslist2temp1
    ]
    genereadslist2temp3 = [
        min(x, max(10 * genefairshare, numreads * 0.05))
        for x in genereadslist2temp2
    ]
    # rescale to numreads; the total is computed once instead of once per gene
    cappedtotal = sum(genereadslist2temp3)
    genereadslist2 = [
        max(1, int(x * numreads / cappedtotal)) for x in genereadslist2temp3
    ]

    # target JSD values, one per multi-transcript gene
    JSDlist = []
    for gene in genetxcountdict:
        if genetxcountdict[gene] > 1:
            if common.toss():
                JSDlist.append(random.uniform(JSDdistparam[0],
                                              JSDdistparam[1]))
            else:
                JSDlist.append(
                    random.triangular(JSDdistparam[0], JSDdistparam[1],
                                      JSDdistparam[2]))

    # histogram of the target values: how many genes each JSD bin may receive
    numbins = 50
    JSDbins = np.linspace(0, 1, numbins + 1)
    JSDbinscount = []
    for i in range(numbins):
        JSDbinscount.append(
            len([x for x in JSDlist if JSDbins[i] < x <= JSDbins[i + 1]]))

    # single-argument parenthesised form prints the same on Python 2 and 3
    print(JSDbinscount)

    outdict = {}
    # same order as the iteration that produced the per-gene read count lists
    for i, gene in enumerate(genetxcountdict):
        if genetxcountdict[gene] == 1:
            outdict[gene] = [
                genereadslist1[i], genereadslist2[i], [1.0], [1.0], 0.0
            ]
        else:
            # redraw expression vectors until they land in a bin with
            # capacity left
            binned = 0
            numattempts = 0
            while binned == 0:
                numattempts += 1
                if fold_flg == 0:
                    v1 = getsimulatedtxexpressionpct(genetxcountdict[gene])
                    v2 = getsimulatedtxexpressionpct(genetxcountdict[gene])
                    genejsd = common.JSD(v1, v2)
                else:
                    v1 = getsimulatedtxexpressionpct(genetxcountdict[gene])
                    v2 = v1[:]
                    v2[0], v2[1] = v1[1], v1[0]
                    genejsd = abs(v2[1] - v2[0])
                # a value of exactly 1.0 belongs to the last bin, not past it
                binidx = min(int(genejsd * numbins), numbins - 1)

                if JSDbinscount[binidx] > 0:
                    binned = 1
                    JSDbinscount[binidx] -= 1
                elif numattempts > 10000 and binidx > 1:
                    # give up matching the histogram after many attempts; the
                    # bin goes negative, and only once per gene
                    binned = 1
                    JSDbinscount[binidx] -= 1

            outdict[gene] = [
                genereadslist1[i], genereadslist2[i], v1, v2, genejsd
            ]
    print(JSDbinscount)
    return outdict