示例#1
0
 def loglik(self,arr,working):
     """Return the total log-likelihood of ``arr`` under the current parameters.

     For every component ``j`` with ``self.p[0][j] > 0`` the per-sample term
     ``log(p_j) + log(eta_arr*fail_density + (1-eta_arr)*arrlik[:, j])`` is
     accumulated, and the sum over samples is returned.

     arr     -- 2-column intensity array (columns 0 and 1 are read).
     working -- dict of scratch arrays; 'arrfail', 'arrlik' and 'workarray'
                are used.  ``working['arrfail']`` is overwritten with the
                (unscaled) failure density.

     The previous version filled ``working['arrfail']`` but then scaled and
     read ``working['arrfail_lik']``: the density built here was never used
     and ``arrfail_lik`` was multiplied by ``eta_arr`` in place on every
     call.  The failure term is now derived from the density built here and
     ``arrfail_lik`` is left untouched.
     """
     # Failure density: constant, except samples with both intensities < 5
     # which get density 1.
     low_signal = np.logical_and(arr[:,0]<5,arr[:,1]<5)
     working['arrfail'][:] = self.afaildens
     working['arrfail'][low_signal] = 1.
     # Scaled copy, so no working buffer keeps an eta-scaled value.
     fail_term = self.eta_arr*working['arrfail']

     total = np.zeros(len(arr))
     for j in range(3):
         # dmvt's return value is ignored; presumably it writes the density
         # into the working['arrlik'][:, j] view it is handed -- TODO confirm.
         chiamante_statfunc.dmvt(arr,self.mu[j],self.sigma[j],self.df[j],working['arrlik'][:,j],working['workarray'])
         if self.p[0][j]>0:
             total += np.log(self.p[0][j]) + np.log(fail_term+(1-self.eta_arr)*working['arrlik'][:,j])
     return total.sum()
示例#2
0
def chiamante_qc(parameters,loglik,logpp,i,
                 arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration,tolerance,df, hwe_prior,calculate_logpp,C,flip,retry,g_corrected,genotype_likelihoods):
    """Quality-control a fitted three-cluster model and refit when it looks wrong.

    The checks run in this order; the first one that fires returns:

    1. Cluster separation: two z-statistics between the het centroid and the
       larger homozygote cluster are compared with thresholds.  On failure the
       starting values in ``start`` are re-seeded (from sequence data when
       available, otherwise from the median of bright samples) and
       ``chiamante_mainloop`` is re-run.
    2. A homozygote class with expected count < 1: optionally move that
       centroid and re-run.
    3. Centroid ordering: mirror a homozygote centroid and re-run once.
    4. Monomorphic checks: if the site looks monomorphic, use
       ``monomorphic_fit``.

    ``retry`` gates which corrections are still allowed; every re-run passes a
    higher level (3, 4 or 5).  All re-runs use ``seqfaildens*10`` and
    ``C=False``.

    Returns either the result of the nested ``chiamante_mainloop`` call or a
    dict with keys parameters, loglik, logpp, gprobs, gl, array_fail,
    seq_fail, u, niteration (``-1`` for the monomorphic fallback, ``i`` when
    the original fit is kept).

    Side effects: ``start['mu']``, ``parameters['mu']``,
    ``parameters['sigma']`` and the arrays in ``working`` may be modified in
    place.  ``calculate_logpp`` and ``C`` are accepted but not used here.
    """

    def refit(init,niter,retry_level):
        # Every corrective re-run shares these settings; only the starting
        # values, the iteration budget and the retry level differ.
        return chiamante_mainloop(arr,seq,prior,init,seqfaildens=seqfaildens*10,arrfaildens=arrfaildens,popidx=popidx,working=working,df=df,niteration=niter,
                                  hwe_prior=hwe_prior,tolerance=tolerance,C=False,flip=flip,retry=retry_level,g_corrected=g_corrected,genotype_likelihoods=genotype_likelihoods)

    # Identity test: `seq != None` is element-wise (and ambiguous in `if`)
    # when seq is a numpy array.
    doseq = seq is not None
    # Expected genotype counts among samples not flagged as array failures.
    ngeno = [((1-working['arrfail'])*working['new_g'][:,j]).sum() for j in range(3)]
    arr2 = np.power(2,arr)
    theta = 2. * np.arctan2(arr2[:,0],arr2[:,1]) / np.pi
    mu = parameters['mu']
    if not (mu[0][0]>mu[2][0] and mu[2][1]>mu[0][1]):
        # Homozygote centroids are in the wrong order: force the retry path.
        zstat1 = -2
        zstat2 = -2
    elif ngeno[0]>ngeno[2]:
        zstat1 = (parameters['mu'][1][1]-parameters['mu'][0][1])/np.sqrt(parameters['sigma'][0][1,1])
        zstat2 = chiamante_statfunc.mahalanobis(parameters['mu'][1],parameters['mu'][0],parameters['sigma'][0])
    else:
        zstat1 = (parameters['mu'][1][0]-parameters['mu'][2][0])/np.sqrt(parameters['sigma'][2][0,0])
        zstat2 = chiamante_statfunc.mahalanobis(parameters['mu'][1],parameters['mu'][2],parameters['sigma'][2])

    if ngeno[0]>1 or ngeno[2]>1:
        threshold1=1
        threshold2=3
    else:
        threshold1=.5
        threshold2=2
    if genotype_likelihoods:
        gl = working['gl']
    else:
        gl = None

    if retry<4 and (zstat1<threshold1 or zstat2<threshold2):
        if doseq and retry<3:
            # Re-seed centroids from sequence data, but only when both
            # intensity channels correlate with the sequence-derived dosage.
            ii = np.logical_and(arr.max(1)>6,np.logical_not(np.isnan(seq[:,0])))
            dosage = seq[ii,1:].sum(1)
            pval1 = pearsonr(arr[ii,0],dosage)[1]
            pval2 = pearsonr(arr[ii,1],dosage)[1]

            if pval1<.0001 and pval2<.0001:
                for j in range(3):
                    wt = seq[ii,j]
                    start['mu'][j]  = (arr[ii].T*wt).sum(1)/wt.sum()
            return refit(start,niteration,3)

        try:
            newmu =  np.median(arr[arr.max(1)>6],0)
        except Exception:
            # Fall back to all samples if the bright subset cannot be summarised.
            newmu =  np.median(arr,0)

        # 0 when channel 0 is brighter, 2 when channel 1 is brighter.
        muidx = newmu.argmax()*2
        if retry<3:
            nrit=niteration
            retry=3
        else:
            nrit=1
            retry=4
        start['mu'][muidx] = newmu
        expected_mean(muidx,start['mu'],prior,parameters['model'])
        tmpsigma = start['sigma'][muidx]

        if (arr.max(1)>7).sum()>3:
            start['sigma'][muidx] = np.cov(arr[arr.max(1)>7].T)

        try:
            return refit(start,nrit,retry)
        finally:
            # The covariance override is temporary; restore it even if the
            # refit raises.  start['mu'] stays modified, as before.
            start['sigma'][muidx] = tmpsigma

    if ngeno[0]<1 and retry<3:
        ng = sum(ngeno)
        af = ngeno[1]/(ng*2)
        eg = (af**2)*ng
        if eg>1:
            parameters['mu'][0][1] = np.min(arr[arr.max(1)>6,1])
        return refit(parameters,niteration,3)

    if ngeno[2]<1 and retry<3:
        ng = sum(ngeno)
        af = ngeno[1]/(ng*2)
        eg = (af**2)*ng
        if eg>1:
            parameters['mu'][2][0] = np.min(arr[arr.max(1)>6,0])
        return refit(parameters,niteration,3)

    if mu[0][0]<mu[1][0] and retry <4:
        # Replace centroid 0 with centroid 2 with its two coordinates swapped.
        parameters['mu'][0] = parameters['mu'][2][[1,0]]
        return refit(parameters,1,4)

    if mu[2][1]<mu[1][1] and retry<4:
        # Replace centroid 2 with centroid 0 with its two coordinates swapped.
        parameters['mu'][2] = parameters['mu'][0][[1,0]]
        return refit(parameters,1,4)

    # monomorphic checks
    mono = False
    calls = working['new_g'].argmax(1)
    calls[working['new_g'].max(1)<0.9]=3   # 3 = no confident call
    calls[working['arrfail']>0.1]=3
    theta_het = 2.*np.arctan2(2.**parameters['mu'][1][0],2.**parameters['mu'][1][1])/np.pi

    if (calls==0).sum()>0:
        # a genotype-0 call lies below the het centroid in theta
        if theta[calls==0].min() < theta_het:
            mono = True

    if (calls==2).sum()>0:
        # a genotype-2 call lies above the het centroid in theta
        if theta[calls==2].max() > theta_het:
            mono = True

    if not doseq and (parameters['mu'][1].min() < 6 or (parameters['mu'][1].min() < 8 and ngeno[1]<4)):
        # very low het centroid
        mono = True

    if ngeno[0]<1 or ngeno[2]<1:
        if ngeno[0]>ngeno[2]:
            zstat1 = (parameters['mu'][1][0]-parameters['mu'][0][0])
        else:
            zstat1 = (parameters['mu'][1][1]-parameters['mu'][2][1])
        if zstat1 < -3:
            # het centroid far below the homozygote centroid
            mono = True

    if mono: # site looks monomorphic (or very low MAF)
        monofit = monomorphic_fit(prior,start,arr,seq,working,arrfaildens,df=df,niteration=niteration,tol=.1)
        if retry < 5:
            # Seed one final single-iteration run with the monomorphic clusters.
            for j in range(3):
                parameters['mu'][j] = monofit['parameters']['mu'][j]
                parameters['sigma'][j] = monofit['parameters']['sigma'][j]
            return refit(parameters,1,5)
        else:
            parameters = monofit['parameters']
            for j in range(3):
                # NOTE(review): `model` is never assigned in this function.
                # Unless a module-level `model` exists this raises NameError;
                # parameters['model'] or prior['model'] was presumably
                # intended -- confirm before changing.
                if model==4:
                    if df is None: working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],100)
                    else: working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],df[j])
                elif df is None: working['arrlik'][:,j] = chiamante_statfunc.dmvnorm(arr,parameters['mu'][j],parameters['sigma'][j])
                else: chiamante_statfunc.dmvt(arr,parameters['mu'][j],parameters['sigma'][j],df[j],working['arrlik'][:,j],working['workarray'])

            if not doseq:
                chiamante_estep(popidx,monofit['parameters'],hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],gl=gl)
            else:
                chiamante_estep(popidx,monofit['parameters'],hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],
                                doseq=True,seqlik=seq,seqfail_lik=working['seqfail_lik'],seq_missing=working['seq_missing'],seqfail=working['seqfail'],gl=gl)

            return dict(parameters=monofit['parameters'],
                        loglik=loglik[:i],logpp=logpp[:i]
                        ,gprobs=working['new_g'],gl=gl,array_fail=working['arrfail'],seq_fail=working['seqfail']
                        ,u=working['u'],niteration=-1)

    # everything looks fine: return the original fit
    return dict(parameters=parameters,
                loglik=loglik[:i],logpp=logpp[:i]
                ,gprobs=working['new_g'],gl=gl,array_fail=working['arrfail'],seq_fail=working['seqfail']
                ,u=working['u'],niteration=i)
示例#3
0
def monomorphic_fit(prior,parameters,arr,seqlik,working,arrfaildens,df=None,niteration=30,tol=0.01):
    """Fit a single homozygote cluster (plus a failure class) to ``arr`` by EM.

    The cluster index ``j`` is 0 when the 99% quantile of channel 0 exceeds
    that of channel 1, otherwise 2.  Only cluster ``j`` is fitted; the other
    two centroids are filled in by ``expected_mean`` and their covariances
    are set to fixed constants.

    prior       -- dict; 'isigma_mu', 'mu0', 's0', 'v0', 'arrfail_alpha',
                   'arrfail_beta' are read (plus 'seqfail_alpha' and
                   'seqfail_beta' when ``seqlik`` is given).
    parameters  -- dict; only ``parameters['eta_seq']`` is read, and only
                   when ``seqlik`` is given.
    arr         -- 2-column intensity array.
    seqlik      -- sequence genotype likelihoods (NaN in column 0 marks a
                   missing sample) or None.
    working     -- dict of scratch arrays, modified in place.
    arrfaildens -- density assigned to the array-failure class.
    df          -- None, a scalar (expanded to three equal values) or a
                   length-3 sequence of degrees of freedom.
    niteration  -- maximum number of EM iterations.
    tol         -- stop once no posterior in column ``j`` changed by ``tol``
                   or more between successive iterations.

    Returns a dict with keys parameters (p, df, eta_array, mu, sigma, model=3
    and, with sequence data, eta_seq), gprobs, niteration (always -1) and
    arrfail.

    Raises ValueError when ``df`` is a sequence whose length is not 3.
    """
    doseq = seqlik is not None
    nsample = len(arr)
    working['arrfail_lik'][:] = arrfaildens
    # Samples with both intensities below 5 get failure density 1.
    working['arrfail_lik'][np.logical_and(arr[:,0]<5,arr[:,1]<5)] = 1.
    working['u'][:] = 1.

    if df is not None:
        # The old guard compared type(df) with the string 'int', which is
        # never true, so scalars were not expanded and the length check
        # could not run.
        if np.isscalar(df):
            df = [df for idx in range(3)]
        elif len(df)!=3:
            raise ValueError("df is not a list of length 3 or a scalar")

    if mquantiles(arr[:,0],.99) > mquantiles(arr[:,1],.99): j = 0
    else: j = 2

    parm = Parameters(afaildens=arrfaildens,df=df,K=3)
    parm.mu[j] = np.median(arr[arr.max(1)>6],0)
    # Element [1] of chiamante_statfunc.mad's result is used as the initial
    # per-channel variance -- presumably a MAD-based scale; TODO confirm.
    v1 = chiamante_statfunc.mad(arr[:,0])[1]
    v2 = chiamante_statfunc.mad(arr[:,1])[1]
    if v1<.1: v1=1
    if v2<.1: v2=1
    parm.sigma[j][:] = np.diag((v1,v2))
    parm.eta_arr=0.05
    parm.p[0][:] = 0.0
    parm.p[0][j] = 1.0
    workarray =  working['workarray']
    arrfail = working['arrfail']
    g = working['new_g']
    u = working['u']

    for i in range(niteration):
        #E-STEP
        chiamante_statfunc.dmvt(arr,parm.mu[j],parm.sigma[j],parm.df[j],working['arrlik'][:,j],working['workarray'])
        arrfail[:] = parm.eta_arr*working['arrfail_lik']
        g[:,j] = working['arrlik'][:,j]*(1-parm.eta_arr)

        working['workarray'][:,0] = g[:,j]+arrfail #denominator
        g[:,j]/=working['workarray'][:,0]
        arrfail/=working['workarray'][:,0]
        ss =  workarray[:,1:3]
        if parm.df is not None:
            ss[:] = arr[:]
            ss[:] -= parm.mu[j]
            # NOTE(review): this reads the `df` argument while the density
            # call above reads parm.df; if df is None but parm.df is not,
            # df[j] raises TypeError -- confirm which one is intended.
            u[:,j] = (df[j]+2)/(df[j]+(np.dot(ss,la.inv(parm.sigma[j]))*ss).sum(1))


        #M-STEP
        # wtu and residuals are views into workarray and reuse the columns
        # written during the E-step.
        wtu = workarray[:,0]
        residuals = workarray[:,1:3]
        residuals[:] = arr[:]
        success = 1 - arrfail

        wtu[:] = (success*g[:,j]) # weights without u
        ntmp = wtu.sum() # need this denominator
        wtu *= u[:,j]
        nu = wtu.sum()

        isig = la.inv(parm.sigma[j])
        den = la.inv(prior['isigma_mu'][j]+isig*nu)
        parm.mu[j] = np.dot(den,(np.dot(prior['isigma_mu'][j],prior['mu0'][j]) + np.dot(isig,np.dot(wtu,arr))))
        residuals[:] -= parm.mu[j]
        parm.sigma[j] = (np.diag(np.diag(prior['s0'][j]))  +  np.diag((wtu*np.power(residuals.T,2)).sum(1))) / (prior['v0'][j] + 4 + ntmp)
        parm.eta_arr = (prior['arrfail_alpha']+arrfail.sum()-1)/(prior['arrfail_beta']+prior['arrfail_alpha']+nsample-2)

        if i>0:
            working['old_g'][:,j] -= g[:,j]
            if working['old_g'][:,j].max() < tol and working['old_g'][:,j].min() > -tol:
                break
        # Remember this iteration's posteriors, including on iteration 0, so
        # the check at i == 1 compares against real values rather than
        # whatever an earlier fit left in the buffer.
        working['old_g'][:,j]=g[:,j]

    if j==0: g[:,1:] = 0.0
    if j==2: g[:,:2] = 0.0

    expected_mean(j,parm.mu,prior,3)
    # Fixed covariances for the two clusters that were not fitted.
    default_sigma = [np.array([[0.0255294956024038,0.00467599907475694],[0.00467599907475694,0.327148026714643]]),
                     np.array([[0.0322203889783675,0.0167746786021707],[0.0167746786021707,0.0251180753954836]]),
                     np.array([[0.370072130849117,0.00512061278760211],[0.00512061278760211,0.0212080779437183]])]
    for genotype in range(3):
        if genotype!=j:
            parm.sigma[genotype] = default_sigma[genotype]

    if j==2: p = np.array([.95**2,2*.05*.95,.05**2])
    else: p = np.array([.05**2,2*.05*.95,.95**2])

    fitted = dict(p=p,df=df,eta_array=parm.eta_arr,mu=parm.mu,sigma=parm.sigma,model=3)
    if doseq:
        seqfail_lik=working['seqfail_lik']
        seqfail = working['seqfail']
        seqfail[:] = 1.0
        lik2 = working['workarray'][:,1]   # scratch column for the mixture denominator
        seq_not_missing = np.logical_not(np.isnan(seqlik[:,0]))
        nseq = seq_not_missing.sum()
        seqfail[seq_not_missing] = seqfail_lik[seq_not_missing]*parameters['eta_seq']
        lik2[seq_not_missing] = (seqlik[seq_not_missing,j]*(1-parameters['eta_seq'])+seqfail[seq_not_missing])
        seqfail[seq_not_missing]/=(lik2[seq_not_missing])
        fitted['eta_seq'] = (prior['seqfail_alpha']+seqfail[seq_not_missing].sum()-1)/(prior['seqfail_beta']+prior['seqfail_alpha']+nseq-2)
    return dict(parameters=fitted,gprobs=g,niteration=-1,arrfail=arrfail)
示例#4
0
def chiamante_mainloop(arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration=50,tolerance=1e-3,df=None,
                       hwe_prior=True,calculate_logpp=False,C=True,flip=False,retry=0,g_corrected=None,genotype_likelihoods=False):
    """Run the EM loop that fits the three-cluster genotype model to ``arr``.

    arr         -- intensity array; must have exactly 2 columns.
    seq         -- sequence genotype likelihoods (NaN in column 0 marks a
                   missing sample) or None for array-only fitting.
    prior       -- dict of prior settings; 'model' is always read and
                   'raf_alpha' when ``hwe_prior`` is set.
    start       -- dict of starting values: 'mu', 'sigma', 'alpha',
                   'eta_array', 'eta_seq', 'p' (and 'raf' with hwe_prior).
                   These are deep-copied, except 'alpha', which is shared.
    seqfaildens -- accepted but not read in this function.
    arrfaildens -- density assigned to the array-failure class.
    popidx      -- one entry per population.
    working     -- dict of scratch arrays, modified in place.
    niteration  -- maximum number of EM iterations.
    tolerance   -- convergence threshold on the largest posterior change.
    df          -- None, a scalar (expanded to three equal values) or a
                   length-3 sequence; an all-zero df is treated as None.
    C           -- the C implementation no longer exists; a true value
                   (the default) prints "not implemented" and exits on the
                   first iteration.
    flip        -- when true, ``flip_raf_prior(prior)`` is applied before
                   fitting and again before returning.

    ``calculate_logpp``, ``retry`` and ``g_corrected`` only matter for the
    QC call, which is currently disabled.

    Returns a dict with keys parameters, loglik, logpp, gprobs, gl,
    array_fail, seq_fail, u, niteration.  ``loglik`` and ``logpp`` are
    zero-filled; nothing here writes to them.

    Raises ValueError for a wrong column count or an invalid ``df``, and
    SystemExit when ``start['p']`` / ``start['raf']`` have an unexpected type
    or length, or when ``C`` is true.
    """

    if not arr.shape[1] == 2:
        raise ValueError("Array does not have 2 columns.")
    if df is not None:
        # Expand a scalar first: previously len(df) raised TypeError on a
        # scalar before the (never-true) type(df)=='int' expansion further
        # down could run.
        if np.isscalar(df):
            df = [df for idx in range(3)]
        if len(df) != 3:
            raise ValueError("invalid degrees of freedom")
        df = [float(val) for val in df]
        if sum(df)==0: df = None

    if genotype_likelihoods:
        gl = working['gl']
    else:
        gl = None

    model = prior['model']
    nsample = len(arr)
    npop = len(popidx)

    working['arrfail_lik'][:] = arrfaildens

    arr2 = np.power(2,arr)
    r = arr2.sum(1)

    # Samples whose summed 2**intensity is below 36 get failure density 1.
    working['arrfail_lik'][r<36.] = 1.
    working['u'][:] = 1.
    # Identity test: `seq == None` is element-wise on a numpy array.
    if seq is None:
        doseq=False
    else:
        doseq=True
        working['seq_missing'] = np.where(np.isnan(seq[:,0]))[0]
        working['seq_not_missing'] = np.where(np.logical_not(np.isnan(seq[:,0])))[0]
        nseq = len(working['seq_not_missing'])

    if hwe_prior and not len(prior['raf_alpha'])==npop:
        print("Length of raf_alpha not consistent with number of populations")

    if flip: flip_raf_prior(prior)

    logpp = np.zeros(niteration)
    loglik = np.zeros(niteration)

    parameters = dict(mu = deepcopy(start['mu']),
                      sigma = deepcopy(start['sigma']),
                      alpha = [start['alpha'] for idx in range(npop)],
                      eta_array = deepcopy(start['eta_array']),
                      eta_seq = deepcopy(start['eta_seq']),df=df,model=model)

    # `raise SystemExit` below is what the exit()/quit() site helpers did,
    # without relying on names that only the site module injects.
    if hwe_prior:
        if isinstance(start['p'],np.ndarray) and len(start['p'])==3:
            parameters['raf']= [start['raf'] for idx in range(npop)]
            parameters['p'] = [np.array([1./3. for k in range(3)]) for idx in range(npop)]
        elif isinstance(start['p'],list) and len(start['p'])==npop:
            if len(start['raf']) != len(start['p']):
                print("len(start[raf]) != len(start[p])")
                raise SystemExit
            parameters['p'] = deepcopy(start['p'])
            parameters['raf'] = deepcopy(start['raf'])
        else:
            print("Length of genotype frequencies does not match npop")
            raise SystemExit
    else:
        if isinstance(start['p'],np.ndarray):
            parameters['p']=deepcopy(start['p'])
        else:
            print('start[p] dont look right')
            raise SystemExit

    # Explicit counter so the reported iteration count is defined (0) even
    # when niteration is 0.
    niter_done = 0
    for it in range(niteration):
        niter_done = it+1
        for j in range(3):
            if model==4:
                if df is None: working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],100)
                else: working['arrlik'][:,j] = chiamante_statfunc.dt(arr[:,1],parameters['mu'][j],parameters['sigma'][j],df[j])
            elif df is None: working['arrlik'][:,j] = chiamante_statfunc.dmvnorm(arr,parameters['mu'][j],parameters['sigma'][j])
            else: chiamante_statfunc.dmvt(arr,parameters['mu'][j],parameters['sigma'][j],df[j],working['arrlik'][:,j],working['workarray'])

        if C:#there was initially a C version for the EM step but it turned out to be no faster!
            print("not implemented")
            raise SystemExit
        else:
            if not doseq:
                chiamante_estep(popidx,parameters,hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],gl=gl)
                parameters = chiamante_mstep(arr,working['arrfail'],working['new_g'],prior,hwe_prior,model,working['u'],popidx,parameters,working['workarray'])
            else:
                chiamante_estep(popidx,parameters,hwe_prior,working['new_g'],working['u'],arr,working['arrlik'],working['arrfail_lik'],working['arrfail'],working['workarray'],
                                doseq=True,seqlik=seq,seqfail_lik=working['seqfail_lik'],seq_missing=working['seq_missing'],seqfail=working['seqfail'],gl=gl)
                parameters = chiamante_mstep(arr,working['arrfail'],working['new_g'],prior,hwe_prior,model,working['u'],popidx,parameters,working['workarray'], doseq,working['seq_not_missing'],nseq,working['seqfail'])#M-STEP

        if niteration==1: break
        elif it>1 and (abs(working['old_g']-working['new_g'])).max() < tolerance:  break
        else:
            # Swap the two posterior buffers so the next E-step writes into
            # the older one.
            # NOTE(review): this swap also runs on the last iteration when
            # the loop ends without converging, so working['new_g'] (used and
            # returned below) then holds the previous iteration's posteriors
            # -- confirm whether that is intended.
            tmp_g = working['new_g']
            working['new_g'] = working['old_g']
            working['old_g'] = tmp_g

    ngeno = [((1-working['arrfail'])*working['new_g'][:,j]).sum() for j in range(3)]

    # QC of non-monomorphic fits is currently switched off; kept for reference.
    if False:
        if niteration>1 and round(max(ngeno))<nsample:
            return chiamante_qc(parameters,loglik,logpp,niter_done
                                ,arr,seq,prior,start,seqfaildens,arrfaildens,popidx,working,niteration,tolerance,df,hwe_prior,calculate_logpp,C,flip,retry,g_corrected,genotype_likelihoods=genotype_likelihoods)

    if not hwe_prior: parameters['raf'] = parameters['p'][0] + .5*parameters['p'][1]
    if flip: flip_raf_prior(prior)

    return dict(parameters=parameters,
                loglik=loglik[:niter_done],logpp=logpp[:niter_done]
                ,gprobs=working['new_g'],gl=gl,array_fail=working['arrfail'],seq_fail=working['seqfail']
                ,u=working['u'],niteration=niter_done)