Example #1
File: stats.py Project: apaloczy/ap_tools
# Assumed imports: the body below uses np.sqrt and student.isf
# (presumably scipy's Student t distribution).
import numpy as np
from scipy.stats import t as student

def rsig_student(ndof_eff, alpha=0.95):
	"""
	USAGE
	-----
	Rsigt = rsig_student(ndof_eff, alpha=0.95)

	References
	----------
	https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient

	Example
	-------
	TODO
	"""
	ndof = ndof_eff - 2

	## Find the critical value of the t statistic by inverting
	## the survival function (1 - CDF).
	pval = 1 - alpha
	tcrit = student.isf(pval, ndof)

	## Convert the critical value of the t statistic
	## into a critical value of r.
	rcrit_t = tcrit/np.sqrt(ndof + tcrit**2)

	return rcrit_t
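
A minimal usage sketch (with the function and imports above in scope; the value is approximate):

# Critical r at the 95% level for 30 effective degrees of freedom:
# t.isf(0.05, 28) ~ 1.70, so rcrit = 1.70/sqrt(28 + 1.70**2) ~ 0.31.
rcrit = rsig_student(30, alpha=0.95)
print(rcrit)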
Example #2
File: Welch.py Project: beiko-lab/STAMP
	# Assumed context: `import math`, `from scipy.stats import t`, and a
	# `variance(values, mean)` helper (STAMP's sample variance) are in scope.
	def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage):
		note = ''
		
		n1 = len(seqGroup1)
		n2 = len(seqGroup2)
		
		if n1 >= 2 and n2 >= 2:
			# calculate proportions
			propGroup1 = []
			for i in range(n1):
				propGroup1.append(float(seqGroup1[i]) / parentSeqGroup1[i])
				
			propGroup2 = []
			for i in range(n2):
				propGroup2.append(float(seqGroup2[i]) / parentSeqGroup2[i])
			
			# calculate p-value, effect size, and CI
			meanG1 = float(sum(propGroup1)) / n1
			meanG2 = float(sum(propGroup2)) / n2
			dp = meanG1 - meanG2
			
			varG1 = variance(propGroup1, meanG1)
			varG2 = variance(propGroup2, meanG2)
			
			normVarG1 = varG1 / n1
			normVarG2 = varG2 / n2
			unpooledVar = normVarG1 + normVarG2
			sqrtUnpooledVar = math.sqrt(unpooledVar)
			
			
			if unpooledVar != 0:
				# p-value
				T_statistic = (meanG1 - meanG2) / sqrtUnpooledVar
				dof = (unpooledVar*unpooledVar) / ( (normVarG1*normVarG1)/(n1-1) + (normVarG2*normVarG2)/(n2-1) )
				pValue = t.cdf(T_statistic, dof) # left-tail (one-sided) probability; one- and two-sided values are derived on return
				
				# CI
				tCritical = t.isf(0.5 * (1.0-coverage), dof) # the 0.5 factor accounts for the symmetry of the distribution
				lowerCI = dp - tCritical*sqrtUnpooledVar
				upperCI = dp + tCritical*sqrtUnpooledVar
			else:
				if meanG1 != meanG2:
					pValue = 0.0 # the difference (at least according to these samples) must be true as there is no variance
				else:
					pValue = 0.5
					
				lowerCI = dp
				upperCI = dp
				
				note = 'degenerate case: variance of both groups is zero'
		else:
			pValue = 0.5
			lowerCI = 0.0
			upperCI = 0.0
			dp = 0.0
			note = 'degenerate case: both groups must contain at least 2 samples'
	
		return 1.0 - pValue, 2*min(pValue, 1.0 - pValue), lowerCI*100, upperCI*100, dp*100, note
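
For a sanity check outside STAMP, the same Welch statistic and two-sided p-value can be reproduced with scipy (a sketch with made-up proportions, not part of the original class):

from scipy import stats

propGroup1 = [0.10, 0.12, 0.15, 0.11]
propGroup2 = [0.20, 0.18, 0.22]

# Welch's t-test (unequal variances); pvalue here should match the
# two-sided value 2*min(pValue, 1 - pValue) returned by run() above.
t_stat, p_two_sided = stats.ttest_ind(propGroup1, propGroup2, equal_var=False)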
Example #3
File: Ttest.py Project: beiko-lab/STAMP
	# Assumed context: `import math`, `from scipy.stats import t`, and a
	# `variance(values, mean)` helper (STAMP's sample variance) are in scope.
	def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage):
		note = ''
		
		n1 = len(seqGroup1)
		n2 = len(seqGroup2)
		
		try:
			if n1 < 2 or n2 < 2:
				raise Exception('degenerate case: both groups must contain at least 2 samples')
				
			# calculate proportions
			propGroup1 = []
			for i in range(n1):
				propGroup1.append(float(seqGroup1[i]) / parentSeqGroup1[i])
				
			propGroup2 = []
			for i in range(n2):
				propGroup2.append(float(seqGroup2[i]) / parentSeqGroup2[i])
			
			# calculate statistics
			meanG1 = float(sum(propGroup1)) / n1
			meanG2 = float(sum(propGroup2)) / n2
			dp = meanG1 - meanG2
			
			varG1 = variance(propGroup1, meanG1)
			varG2 = variance(propGroup2, meanG2)
			
			dof = n1 + n2 - 2
			pooledVar = ((n1 - 1)*varG1 + (n2 - 1)*varG2) / (n1 + n2 - 2)
			sqrtPooledVar = math.sqrt(pooledVar)
			denom = sqrtPooledVar * math.sqrt(1.0/n1 + 1.0/n2)
				
			# p-value
			T_statistic = (meanG1 - meanG2) / denom
			pValue = t.cdf(T_statistic, dof) # left-tail (one-sided) probability; one- and two-sided values are derived on return
			
			# CI
			tCritical = t.isf(0.5 * (1.0-coverage), dof) # the 0.5 factor accounts for the symmetry of the distribution
			lowerCI = dp - tCritical*denom
			upperCI = dp + tCritical*denom

		except ZeroDivisionError:
			# Reaching here means denom == 0, i.e. both groups have zero
			# variance (a zero parent count would fail earlier, before the
			# means are defined; see the guarded variant further below).
			if meanG1 != meanG2:
				pValue = 0.0 # the difference (at least according to these samples) must be true as there is no variance
			else:
				pValue = 0.5

			lowerCI = dp
			upperCI = dp
			note = 'degenerate case: variance of both groups is zero'
		except Exception as e:
			# ZeroDivisionError is a subclass of Exception, so it must be
			# caught first; binding the exception directly to `note` would
			# also leave `note` undefined after the block in Python 3.
			note = str(e)
			pValue = 0.5
			lowerCI = 0.0
			upperCI = 0.0
			dp = 0.0

		return 1.0 - pValue, 2*min(pValue, 1.0 - pValue), lowerCI*100, upperCI*100, dp*100, note
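
The pooled-variance (Student's) version can be cross-checked the same way; equal_var=True reproduces dof = n1 + n2 - 2 and the pooled variance above (again a sketch with made-up data):

from scipy import stats

propGroup1 = [0.10, 0.12, 0.15, 0.11]
propGroup2 = [0.20, 0.18, 0.22]

# classic two-sample t-test with pooled variance; pvalue is two-sided
t_stat, p_two_sided = stats.ttest_ind(propGroup1, propGroup2, equal_var=True)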
Example #4
import numpy as np
from scipy.stats import t as _ST  # assumed: the code below calls _ST.isf

def weighted_average(w_in, y_in, conf=None, do_std=True):
    """computes weighted average of y with weight w
    over axis x
    
    input
    =====
    y_in: array or list of arrays to average.
          each element of the list is a numpy array
          each of these arrays has the structure (rec,val,....,x)
          where [val==0 is the mean] and [val==1 is the variance]
         

    w_in:    array of weights.
          shape is (nrec,x)
          
    conf:  confidance interval to use.
           (e.g., conf=0.95 for 95% conf. interval)
           if None (default), weighted standard deviation returned
    
    do_std: Bool
            Flag calculation of standard deviation/error.
            Default=True.
    

    output
    ======
    WA:  weighted average.  If y_in is a list, WA is a list.
         WA[i].shape=y_in[i].shape[1:]

    WSTD: weighted stdard deviation/error.  If none calculated, WSTD=None
    

    notes
    =====
    """
    assert isinstance(w_in, np.ndarray)
    assert w_in.ndim == 2

    if type(y_in) is list:
        y_list=y_in
    else:
        y_list=[y_in]

    for y in y_list:
        assert isinstance(y, np.ndarray)
        assert y.shape[0] == w_in.shape[0]
        assert y.shape[-1] == w_in.shape[-1]

    # normalize the weights so they sum to 1 along the record axis
    w = w_in/np.sum(w_in, axis=0)
    
    WA_list = []
    sig_WA_list = None

    if do_std and w.shape[0] > 2:
        Sww = np.sum(w*w, axis=0)
        sig_WA_list = []

    for y in y_list:
        # reshape w for broadcasting against y
        # (not strictly necessary, but prevents odd broadcasts)
        new_shape = [w.shape[0]] + [1]*(y.ndim - 2) + [w.shape[-1]]
        w.shape = new_shape

        WA = np.sum(w*y, axis=0)
        WA_list.append(WA)
        
        
        # weighted standard deviation
        if w.shape[0] > 2 and do_std:
            Sww.shape = new_shape[1:]

            delta = y - WA
            sig_WA = np.sum(w*delta*delta, axis=0)

            # unbiased correction for the weighted variance
            sig_WA = sig_WA/(1. - Sww)

            # mask out bad (negative or non-finite) values
            msk = (sig_WA > 0) & (np.isfinite(sig_WA))
            sig_WA[~msk] = 0.
            sig_WA = np.sqrt(sig_WA)

            sig_WA_list.append(sig_WA)

    # scale to a confidence-interval half-width using the t critical value
    if conf is not None and sig_WA_list is not None:
        sT = _ST.isf((1. - conf)*0.5, w.shape[0] - 1)
        for i in range(len(sig_WA_list)):
            sig_WA_list[i] *= sT

    return WA_list, sig_WA_list
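
A minimal usage sketch with synthetic data (assumes the imports noted above; the asserts only require that y share its first and last axes with w, so a plain (nrec, x) array works):

import numpy as np

rng = np.random.default_rng(0)
w = rng.random((5, 10))        # (nrec, x) weights
y = rng.normal(size=(5, 10))   # (nrec, x) data

WA, WSTD = weighted_average(w, y, conf=0.95)
# WA[0].shape == (10,): weighted mean over the record axis
# WSTD[0]: weighted std scaled by the t critical value for conf=0.95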
Example #5
import numpy as np
from scipy.stats import t as _ST  # assumed: the code below calls _ST.isf

def spliced_ave_var(w_in, y_in, conf=None):
    """computes variance from pieces y with weight w over first axis
    
    input
    =====
    w_in:    array of weights.
          shape is (nrec,x)

    y_in: array or list of arrays of ave/variance
          each array of shape (nrec,val,...,x)
          val=0 => average
          val=1 => variance

          
    conf:  confidance interval to use.
           (e.g., conf=0.95 for 95% conf. interval)
           if None (default), weighted standard deviation returned
    
    do_std: Bool
            Flag calculation of standard deviation/error.
            Default=True.
    

    output
    ======
    Y:  weighted average/variance.  If y_in is a list, Y is a list.
         Y[i].shape=y_in[i].shape[1:]

    WSTD: weighted stdard deviation/error between blocks
    

    notes
    =====
    """
    assert isinstance(w_in, np.ndarray)
    assert w_in.ndim == 2

    if type(y_in) is list:
        y_list=y_in
    else:
        y_list=[y_in]

    for y in y_list:
        assert isinstance(y, np.ndarray)
        assert y.ndim > 2
        assert y.shape[0] == w_in.shape[0]
        assert y.shape[1] == 2
        assert y.shape[-1] == w_in.shape[-1]

    nrec = w_in.shape[0]
    ny = len(y_list)

    # normalize the weights so they sum to 1 along the record axis
    weight = w_in/np.sum(w_in, axis=0)
    

    W1 = np.zeros(weight.shape[1:]) # running sum of weights
    W2 = np.zeros(weight.shape[1:]) # running sum of squared weights

    M_V_1 = [] # spliced mean/variance
    M_V_2 = [] # variance of the mean / variance of the variance

    V1 = [] # running mean of the variance (not returned)

    w_shape = [] # broadcast shape for each y in y_list
    for y in y_list:
        M_V_1.append(np.zeros(y.shape[1:]))
        M_V_2.append(np.zeros(y.shape[1:]))

        V1.append(np.zeros(y.shape[2:]))
        # y's shape less the rec and val axes, broadcastable over x
        new_shape = [1]*(y.ndim - 3) + [y.shape[-1]]
        w_shape.append(new_shape)

        
    # accumulate over records
    for rec in range(nrec):

        w = weight[rec, :]

        W1_last = W1.copy() # copy, so W1_last does not alias W1
        W1 += w
        W2 += w*w

        for iy in range(ny):
            s = w_shape[iy]
            x = y_list[iy][rec, 0, ...]
            v = y_list[iy][rec, 1, ...]

            f0 = (w/W1).reshape(s)
            f1 = (W1_last*w/W1).reshape(s)

            delta = x - M_V_1[iy][0, ...]
            delta_2 = delta*delta

            # splice the mean/variance (incremental pairwise update)
            M_V_1[iy][0, ...] += delta*f0
            M_V_1[iy][1, ...] += v*w + delta_2*f1

            # variance of the mean
            M_V_2[iy][0, ...] += delta_2*f1

            # variance of the variance
            delta = v - V1[iy]
            V1[iy] += delta*f0
            M_V_2[iy][1, ...] += delta*delta*f1
                



    # normalize the spliced variance
    for iy in range(ny):
        M_V_1[iy][1, ...] /= W1.reshape(w_shape[iy])

    # normalize the variance of the mean and variance of the variance
    # (effective-sample-size factor for weighted averages)
    if nrec < 2:
        fac = np.zeros(W1.shape)
    else:
        fac = W1/(W1**2 - W2)
        
    for iy in range(ny):
        s = w_shape[iy]
        M_V_2[iy][0, ...] *= fac.reshape(s)
        M_V_2[iy][1, ...] *= fac.reshape(s)

        # mask out bad values and take sqrt (variance -> std dev)
        if nrec > 1:
            msk = (M_V_2[iy] > 0) & (np.isfinite(M_V_2[iy]))
            M_V_2[iy][~msk] = 0.
            M_V_2[iy] = np.sqrt(M_V_2[iy])
        else:
            M_V_2[iy] = np.zeros(M_V_2[iy].shape)
            

    # scale to a confidence interval (t critical value over sqrt(nrec))
    if conf is not None and nrec > 1:
        sT = _ST.isf((1. - conf)*0.5, nrec - 1)/np.sqrt(nrec)
        for iy in range(ny):
            M_V_2[iy] *= sT

    return M_V_1, M_V_2
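
A quick sketch of splicing per-block (mean, variance) pairs and comparing against a direct computation over the concatenated data (assumes the imports noted above; the (rec, val, x) layout follows the docstring):

import numpy as np

rng = np.random.default_rng(1)
nrec, nblk, nx = 4, 100, 10
data = rng.normal(size=(nrec, nblk, nx))   # 4 blocks of 100 samples each

y = np.empty((nrec, 2, nx))
y[:, 0, :] = data.mean(axis=1)             # per-block mean (val=0)
y[:, 1, :] = data.var(axis=1)              # per-block variance (val=1)
w = np.full((nrec, nx), float(nblk))       # equal block weights

M_V_1, M_V_2 = spliced_ave_var(w, y, conf=0.95)
# M_V_1[0][0] ~ data.reshape(-1, nx).mean(axis=0)
# M_V_1[0][1] ~ data.reshape(-1, nx).var(axis=0)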
Example #6
	# Assumed context: `import math`, `from scipy.stats import t`, and a
	# `variance(values, mean)` helper (STAMP's sample variance) are in scope.
	def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage):
		note = ''
		
		n1 = len(seqGroup1)
		n2 = len(seqGroup2)
		
		try:
			if n1 < 2 or n2 < 2:
				raise Exception('degenerate case: both groups must contain at least 2 samples')
				
			# calculate proportions
			propGroup1 = []
			for i in range(n1):
				if parentSeqGroup1[i] > 0:
					propGroup1.append(float(seqGroup1[i]) / parentSeqGroup1[i])
				else:
					propGroup1.append( 0.0 )
					note = 'degenerate case: parent group had a count of zero'
				
			propGroup2 = []
			for i in range(n2):
				if parentSeqGroup2[i] > 0:
					propGroup2.append(float(seqGroup2[i]) / parentSeqGroup2[i])
				else:
					propGroup2.append( 0.0 )
					note = 'degenerate case: parent group had a count of zero'
			
			# calculate statistics
			meanG1 = float(sum(propGroup1)) / n1
			meanG2 = float(sum(propGroup2)) / n2
			dp = meanG1 - meanG2
			
			varG1 = variance(propGroup1, meanG1)
			varG2 = variance(propGroup2, meanG2)
			
			dof = n1 + n2 - 2
			pooledVar = ((n1 - 1)*varG1 + (n2 - 1)*varG2) / (n1 + n2 - 2)
			sqrtPooledVar = math.sqrt(pooledVar)
			denom = sqrtPooledVar * math.sqrt(1.0/n1 + 1.0/n2)
				
			# p-value
			T_statistic = (meanG1 - meanG2) / denom
			pValue = t.cdf(T_statistic, dof) # left-tail (one-sided) probability; one- and two-sided values are derived on return
			
			# CI
			tCritical = t.isf(0.5 * (1.0-coverage), dof) # the 0.5 factor accounts for the symmetry of the distribution
			lowerCI = dp - tCritical*denom
			upperCI = dp + tCritical*denom

		except ZeroDivisionError:
			# reaching here means denom == 0, i.e. both groups have zero
			# variance (zero parent counts are guarded above)
			if meanG1 != meanG2:
				pValue = 0.0 # the difference (at least according to these samples) must be true as there is no variance
			else:
				pValue = 0.5

			lowerCI = dp
			upperCI = dp
			note = 'degenerate case: variance of both groups is zero'
		except Exception as e:
			# ZeroDivisionError is a subclass of Exception, so it must be
			# caught first; binding the exception directly to `note` would
			# also leave `note` undefined after the block in Python 3.
			note = str(e)
			pValue = 0.5
			lowerCI = 0.0
			upperCI = 0.0
			dp = 0.0

		return 1.0 - pValue, 2*min(pValue, 1.0 - pValue), lowerCI*100, upperCI*100, dp*100, note
Example #7
	# Assumed context: `import math`, `from scipy.stats import t`, and
	# `from numpy import var` (used with ddof=1 below) are in scope.
	def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage):
		note = ''
		
		n1 = len(seqGroup1)
		n2 = len(seqGroup2)
		
		if n1 >= 2 and n2 >= 2:
			# calculate proportions
			propGroup1 = []
			for i in range(n1):
				if parentSeqGroup1[i] > 0:
					propGroup1.append(float(seqGroup1[i]) / parentSeqGroup1[i])
				else:
					propGroup1.append( 0.0 )
					note = 'degenerate case: parent group had a count of zero'
				
			propGroup2 = []
			for i in range(n2):
				if parentSeqGroup2[i] > 0:
					propGroup2.append(float(seqGroup2[i]) / parentSeqGroup2[i])
				else:
					propGroup2.append( 0.0 )
					note = 'degenerate case: parent group had a count of zero'
			
			# calculate p-value, effect size, and CI
			meanG1 = float(sum(propGroup1)) / n1
			meanG2 = float(sum(propGroup2)) / n2
			dp = meanG1 - meanG2
			
			varG1 = var(propGroup1, ddof=1)
			varG2 = var(propGroup2, ddof=1)
			
			normVarG1 = varG1 / n1
			normVarG2 = varG2 / n2
			unpooledVar = normVarG1 + normVarG2
			sqrtUnpooledVar = math.sqrt(unpooledVar)
			
			
			if unpooledVar != 0:
				# p-value
				T_statistic = (meanG1 - meanG2) / sqrtUnpooledVar
				dof = (unpooledVar*unpooledVar) / ( (normVarG1*normVarG1)/(n1-1) + (normVarG2*normVarG2)/(n2-1) )
				pValue = t.cdf(T_statistic, dof) # left-tail (one-sided) probability; one- and two-sided values are derived on return
				
				# CI
				tCritical = t.isf(0.5 * (1.0-coverage), dof) # the 0.5 factor accounts for the symmetry of the distribution
				lowerCI = dp - tCritical*sqrtUnpooledVar
				upperCI = dp + tCritical*sqrtUnpooledVar
			else:
				if meanG1 != meanG2:
					pValue = 0.0 # the difference (at least according to these samples) must be true as there is no variance
				else:
					pValue = 0.5
					
				lowerCI = dp
				upperCI = dp
				
				note = 'degenerate case: variance of both groups is zero'
		else:
			pValue = 0.5
			lowerCI = 0.0
			upperCI = 0.0
			dp = 0.0
			note = 'degenerate case: both groups must contain at least 2 samples'
	
		return 1.0 - pValue, 2*min(pValue, 1.0 - pValue), lowerCI*100, upperCI*100, dp*100, note
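
The substantive differences from the earlier Welch example are the zero-count guard on the parent groups and the use of var(..., ddof=1), i.e. the unbiased sample variance. A quick check of that equivalence, assuming numpy:

import numpy as np

props = [0.10, 0.12, 0.15, 0.11]
mean = sum(props) / len(props)
ss = sum((p - mean)**2 for p in props) / (len(props) - 1)

# numpy's ddof=1 variance matches the hand-rolled sample variance
assert np.isclose(np.var(props, ddof=1), ss)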