Example #1
    def test_mean(self, mu0, return_weights=False):
        """
        Returns -2 x log-likelihood ratio, p-value and weights
        for a hypothesis test of the mean.

        Parameters
        ----------
        mu0 : float
            Mean value to be tested

        return_weights : bool
            If return_weights is True the function returns
            the weights of the observations under the null hypothesis.
            Default is False

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value of mu0
        """
        self.mu0 = mu0
        endog = self.endog
        nobs = self.nobs
        eta_min = (1. - (1. / nobs)) / (self.mu0 - max(endog))
        eta_max = (1. - (1. / nobs)) / (self.mu0 - min(endog))
        eta_star = optimize.brentq(self._find_eta, eta_min, eta_max)
        new_weights = (1. / nobs) * 1. / (1. + eta_star * (endog - self.mu0))
        llr = -2 * np.sum(np.log(nobs * new_weights))
        if return_weights:
            return llr, chi2.sf(llr, 1), new_weights
        else:
            return llr, chi2.sf(llr, 1)
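A minimal usage sketch, not from the original source: assuming this method is exposed through statsmodels' empirical-likelihood DescStat class (the docstring of Example #21 below uses the same entry point), a one-sample mean test looks like:

import numpy as np
import statsmodels.api as sm

data = np.random.standard_normal(500) + 3.0   # sample with true mean 3
el = sm.emplike.DescStat(data)
llr, p_value = el.test_mean(3.0)              # test H0: mean == 3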
Example #3
    def do_TS_GMM(self, utu, utr, b, f):
        # Unrestricted case where intercepts are included
        alpha = b[0:self.N]
        gtu = self.set_g(utu, f)
        nmom = gtu.shape[0]
        d = self.set_d()
        m = int(ceil(1.2 * float(self.T) ** (1.0 / 3)))  # int(floor(self.T**(1.0/4.0)))
        Su = self.set_S(m, gtu)
        SIGMAb = self.get_varb(d, Su)

        Sigmaalp = SIGMAb[0:self.N,0:self.N]
        if rank(Sigmaalp,tol=1e-9)<self.N:
            valu = dot(alpha.T,dot(pinv(Sigmaalp),alpha))
        else:
            valu = dot(alpha.T,solve(Sigmaalp,alpha))
        self.TS_GMM_pval_u = squeeze(chi2.sf(valu,self.N - self.k))

        # Restricted case with no intercept included

        gtr = self.set_g(utr,f)
        Sr = self.set_S(m,gtr)
        gTr = reshape(mean(gtr,axis=1),(nmom,1))
        if rank(Sr,tol=1e-9) < nmom:
            valr = self.T*dot(gTr.T,dot(pinv(Sr),gTr))
        else:
            valr = self.T*dot(gTr.T,solve(Sr,gTr))
        self.TS_GMM_pval_r = squeeze(chi2.sf(valr,self.N-self.k))

        # GJ test

        gTu = reshape(mean(gtu,axis=1),(nmom,1))
        val = self.T*(dot(gTr.T,solve(Su,gTr)) - dot(gTu.T,solve(Su,gTu)))
        self.TS_GMM_pval_3 = squeeze(chi2.sf(val,self.N-self.k))

        return
Example #4
 def calChi2(self, x, y):
     """
     Input
     x: feature values, 1-D array
     y: true labels, 1-D array
     return
     chi2Value, pValue, dfreedom, eptFre
     """
     n = y.shape[0]
     xValues = np.unique(x)
     yValues = np.unique(y)
     # marginal distribution of y
     PyValues = [sum(y==yvalue)/n for yvalue in yValues]
     # build the contingency tables: observed and expected frequencies
     realFre = np.zeros((len(xValues), len(yValues)))
     eptFre = np.copy(realFre)
     for xIdx, xvalue in enumerate(xValues):
         for yIdx,yvalue in enumerate(yValues):
             realFre[xIdx, yIdx] = sum((x==xvalue)&(y==yvalue))
             eptFre[xIdx, yIdx] = sum(x==xvalue)*PyValues[yIdx]
     # chi-square matrix, statistic, degrees of freedom, p-value
     chi2Matrix = np.power((realFre-eptFre), 2)/(eptFre+1.0e-6)
     chi2Value = chi2Matrix.sum()
     dfreedom = (len(xValues)-1)*(len(yValues)-1)
     if dfreedom == 0:
         pValue = chi2.sf(chi2Value, dfreedom+1)
     else:
         pValue = chi2.sf(chi2Value, dfreedom)
     return round(chi2Value,4), round(pValue,4), dfreedom, eptFre
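As a cross-check (not part of the original snippet), SciPy computes the same Pearson statistic directly from the observed contingency table; with correction=False it should agree with calChi2 up to the 1.0e-6 smoothing term added above:

import numpy as np
from scipy.stats import chi2_contingency

observed = np.array([[10, 20, 30],
                     [20, 20, 20]])
stat, p, dof, expected = chi2_contingency(observed, correction=False)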
Example #5
def p_value_calculate(X, y, is_intercept, X_null=None):
    # likelihood-ratio test of the full model against a null model
    print("X.shape={}, y.shape={}".format(X.shape, y.shape))

    lr_model = LogisticRegression(C=1e8, solver='lbfgs', max_iter=1000)
    lr_model.fit(X, y)
    alt_prob = lr_model.predict_proba(X)

    alt_log_likelihood = -log_loss(y, alt_prob, normalize=False)
    if is_intercept:
        # if we want the p-value of beta_0
        null_prob = sum(y) / float(y.shape[0]) * \
                    np.ones(y.shape)
        null_log_likelihood = -log_loss(y, null_prob, normalize=False)
        df = 1
        G = 2 * (alt_log_likelihood - null_log_likelihood)
        p_value = chi2.sf(G, df)
    else:
        # null model fitted on the reduced design X_null
        lr_model.fit(X_null, y)
        null_prob = lr_model.predict_proba(X_null)[:, 1]
        null_log_likelihood = -log_loss(y, null_prob, normalize=False)

        df = X.shape[1] - X_null.shape[1]
        G = 2 * (alt_log_likelihood - null_log_likelihood)
        p_value = chi2.sf(G, df)
    return p_value
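A hypothetical call with synthetic data, where X_null drops one column so the likelihood-ratio test has one degree of freedom (numpy, scikit-learn's LogisticRegression/log_loss and scipy's chi2 are assumed imported by the enclosing module):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X[:, 0] + 0.5 * rng.normal(size=200) > 0).astype(int)
p = p_value_calculate(X, y, is_intercept=False, X_null=X[:, :2])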
Example #6
def g_func(init_par, alpha, delta, plx_obs, mualpha_obs, mudelta_obs, vrad_obs, sigma_obs, sigma_vrad, ccoef, N):
	"""
	Estimate the g_func (exponent of the likelihood), which gives an estimate
	of the membership of each star to the moving group.

	Parameters:
	------------
	init_par - Set of initial values for: 1) All the parallaxes [mas];
				       2) The cluster centroid velocity  [vx_0, vy_0, vz_0] [km/s];
				       3) The cluster velocity dispersion, sigma_v [km/s];
	alpha, delta - Cluster member positions [rad];
	plx_obs, mualpha_obs, mudelta_obs - observed values for parallaxes and proper motions [mas, mas/yr];
	vrad_obs, sigma_vrad - observed radial velocities and their errors [km/s];
	sigma_obs - observed errors for parallaxes and proper motions [mas, mas/yr];
	ccoef - 3-dim array of correlation coefficients from the HIP catalogue;
	N - the number of stars;

	Returns:
	-----------
	p - An array of membership p-values, one per star, obtained from chi2.sf applied to g_i(theta); see eq. 19 in Lindegren+2000;
	"""
	L, g = ilike(init_par, alpha, delta, plx_obs, mualpha_obs, mudelta_obs, vrad_obs, sigma_obs, sigma_vrad, ccoef, N) 	
	p = np.zeros(N)
	for i in range(N):
	    if np.isfinite(vrad_obs[i]):
	        p[i] = chi2.sf(g[i],3)
	    else:
	        p[i] = chi2.sf(g[i],2)
	        
	return  p
Example #7
def asymptotic_p_value(asimov_q, use_median_rather_than_asimov=False):
    if use_median_rather_than_asimov:
        median_q = ncx2.ppf(0.5, df=2, nc=max(0., asimov_q))
        p_value = chi2.sf(median_q, df=2)
    else:
        p_value = chi2.sf(asimov_q, df=2)
    return p_value
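Typical usage, as a sketch; converting the p-value back to a Gaussian significance uses scipy's norm, which the snippet itself does not import:

from scipy.stats import norm

p = asymptotic_p_value(9.0)   # p-value for an Asimov test statistic of 9
z = norm.isf(p)               # corresponding one-sided significance
p_med = asymptotic_p_value(9.0, use_median_rather_than_asimov=True)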
Example #8
def statistics(n, N, scale=1.1):
    
    m = [2, 4, 8, 16, 32, 64]
    chisq_arr = np.zeros(len(m))
    fig6, ax6 = plt.subplots(figsize=(7,5))

    for i in range(len(m)):
        k, avg_kprob, std_kprob = average(n, N, m[i], scale=scale)
        prob_theory = (k-m[i])*np.log10(m[i]) - (k - m[i] + 1)*np.log10((m[i] + 1))
        prob_theory = 10**prob_theory

        chisq1 = chisqg(avg_kprob[:], prob_theory[:], sd=std_kprob[:])
        chisq2 = chisqg(avg_kprob[1:-1], prob_theory[1:-1], sd=std_kprob[1:-1])
        p_value1 = chi2.sf(chisq1, len(avg_kprob[:])-2)
        p_value2 = chi2.sf(chisq2, len(avg_kprob[1:-1])-2)

        chisq_arr[i] = chisq2/(len(avg_kprob[1:-1])-2)
     
        print('m = ', m[i], 'all points, chisq = ', chisq1, 'p value = ', p_value1)
        print('m = ', m[i], 'Sliced points, chisq = ', chisq2, 'p value = ', p_value2)
   
    ax6.plot(m, chisq_arr, 'o')
    ax6.set_xlabel('m')
    ax6.set_ylabel(r'$\chi^2/N_{dof}$')
    fig6.tight_layout()
Example #9
def get_gwas(simulated_data, freq_a1, freq_b1):

    model_a = smf.ols("phenotype ~ snp_a_gen", data=simulated_data).fit()
    model_b = smf.ols("phenotype ~ snp_b_gen", data=simulated_data).fit()

    model = smf.ols('phenotype ~ snp_a_gen + snp_b_gen',
                    data=simulated_data).fit()
    # print(model.summary())

    gwas_dict = {
        "snp_num": [1, 2],
        "freq1": [freq_a1, freq_b1],
        "freq2": [1 - freq_a1, 1 - freq_b1],
        "beta": [model_a.params.snp_a_gen, model_b.params.snp_b_gen],
        "se": [model_a.bse.snp_a_gen, model_b.bse.snp_b_gen],
        "p": [
            chi2.sf((model_a.params.snp_a_gen / model_a.bse.snp_a_gen)**2, 1),
            chi2.sf((model_b.params.snp_b_gen / model_b.bse.snp_b_gen)**2, 1)
        ]
    }

    gwas = pd.DataFrame.from_dict(gwas_dict)
    gwas = gwas[["snp_num", "freq1", "freq2", "beta", "se", "p"]]
    gwas["z_u"] = gwas["beta"] / gwas["se"]
    return gwas
Example #10
def TS(pathway, ppi, stu, purb, dgv=0.4):
    """ T-square.
        For the given pathway, this function creates the corresponding interaction matrix.
        Returns the associated T^2, p-value, and other information.
    """
    # - pathway is a pandas dataframe containing the id of a pathway, its included proteins, and their abundance ratios.
    # - z contains only the abundance ratios of the proteins, to be used later to calculate the T^2 value.
    # - m contains the indexes of the pathway's proteins that can be translated from Uniprot to STRING.
    # - S is the interaction matrix to be built from STRING interaction scores.
    pathway = pathway.sort_values(by='prot_id').reset_index(drop=True)
    z = np.vectorize(float)(pathway['exp'])
    m = np.where(np.isin(stu, pathway))[0]
    S = dgv * np.identity(len(z))
    nrow, ncol = S.shape
    if nrow != 1:
        # Each possible pair of proteins in the pathway will be looked at.
        for i in range(1, nrow):
            for j in range(i):
                # x1 is the Uniprot accession to one protein i in the pathway.
                x1 = pathway.iat[i, 1]
                # x2 is the Uniprot accession to another protein j in the pathway.
                x2 = pathway.iat[j, 1]
                # s1 is the STRING id to protein i, translated using m.
                s1 = stu.iloc[m]['String_id'].to_numpy()[np.where(
                    np.isin(stu.iloc[m]['Uniprot_id'], x1))[0]]
                # s2 is the STRING id to protein j.
                s2 = stu.iloc[m]['String_id'].to_numpy()[np.where(
                    np.isin(stu.iloc[m]['Uniprot_id'], x2))[0]]
                if len(s1) * len(s2) != 0:
                    # If there is one, p will contain the experimental value of interaction between the two proteins.
                    # If there are more, the mean will be used.
                    # Get all protein 1 partners in STRING
                    p = ppi.iloc[np.where(np.isin(ppi['protein1'], s1))[0]]
                    # Check for protein among partners
                    p = p.iloc[np.where(np.isin(p['protein2'],
                                                s2))]['experimental']
                    # p holds interaction score(s) from experimental evidences only
                    if len(p) > 0:
                        # Modify S to include that value at the corresponding index.
                        if z[pathway['prot_id'] == x1] * z[pathway['prot_id']
                                                           == x2] < 0:
                            S[i, j] = -np.mean(p)
                            S[j, i] = -np.mean(p)
                        else:
                            S[i, j] = np.mean(p)
                            S[j, i] = np.mean(p)
    # Transform the S matrix of STRING scores to be positive-definite, and therefore usable for the T^2 method.
    S = nearestPD(S)
    r = np.linalg.matrix_rank(S, tol=1e-10)
    # T2 score matrix effectively used
    T2 = TV(z, S)
    I = dgv * np.identity(len(z))
    T2I = TV(z, I)
    return np.array([
        pathway.iat[0, 0], ','.join(pathway['prot_id']),
        len(pathway), r, T2,
        chi2.sf(T2, r), T2I,
        chi2.sf(T2I, r)
    ],
                    dtype=object)
Example #11
    def fmb_pval(self, alpham, vcvalpha):
        # Fall back to a pseudo-inverse when the covariance matrix is rank-deficient.
        if rank(vcvalpha, tol=1e-9) < self.N:
            self.FMB_JS = dot(alpham.T, dot(pinv(vcvalpha), alpham))
        else:
            self.FMB_JS = dot(alpham.T, solve(vcvalpha, alpham))
        return chi2.sf(self.FMB_JS, self.N - self.k)
Example #12
def LR_test(full, reduced):
    full_ll = list(full.rx2('loglik'))
    reduced_ll = list(reduced.rx2('loglik'))
    assert full_ll[0] == reduced_ll[0]
    if len(full_ll) == 1:
        return 1.
    full_df = len(full.rx2('coefficients')) 
    if len(reduced_ll) == 1:
        return chi2.sf(2*full_ll[1] - 2*full_ll[0], full_df)
    reduced_df = len(reduced.rx2('coefficients'))
    df = max(full_df - reduced_df, 1)
    return chi2.sf(2*full_ll[1] - 2*reduced_ll[1], df)
Example #13
def fishers_method(log10pvals):
    if len(log10pvals) == 1:
        return log10pvals[0]
    signs = set(np.sign(log10pvals))
    df = 2 * len(log10pvals)
    if len(signs) > 1:
        return 0
    elif signs == {-1}:
        chi2sum = sum(-log10pvals) * np.log(10)
        return np.log10(chi2.sf(chi2sum, df))
    elif signs == {1}:
        chi2sum = sum(log10pvals) * np.log(10)
        return -np.log10(chi2.sf(chi2sum, df))
Example #14
 def independence_test(self, A):
     length = len(A)
     if length==1:
         return [0.0,None]
     
     n = 0
     ni = []
     nj = []
     for i in range(length):
         sum = 0
         for ele in A[i]:
             n = n + ele
             sum = sum + ele
         ni.append(sum)
     
     for i in range(len(A[0])):
         sum = 0
         for j in range(length):
             sum = sum + A[j][i]
         nj.append(sum)
         
     T = []
     for i in ni:
         tmp = []
         for j in nj:
             tmp.append(float(i*j)/n)
         T.append(tmp)
     
     c2 = 0
     for i in range(length):
         for j in range(len(A[0])):
             c2 = c2 + (A[i][j]-T[i][j])**2/T[i][j]
     
     p = C.sf(c2, (length-1)*(len(A[0])-1))
     return [round(c2,6),round(p,6)]
Example #15
    def getStats(self,keyData,keyTheory,auto=False,show=False,numbins=-1):

        dataVector = self.datas[keyData]['binned'][:numbins]
        dofs = len(dataVector) - 2
        if auto: dofs -= 1
        chisqNull = self.chisq(keyData)
        chisqTheory = self.chisq(keyData,keyTheory,numbins=numbins)

        stats = {}
        stats['reduced chisquare'] =  chisqTheory / dofs
        stats['pte'] = chi2.sf(chisqTheory,dofs)
        stats['null sig'] = np.sqrt(chisqNull)
        stats['theory sig'] = np.sqrt(chisqNull-chisqTheory)
        

        if show:
            printC("="*len(keyTheory),color='y')
            printC(keyTheory,color='y')
            printC('-'*len(keyTheory),color='y')
            printC("amplitude",color='b')
            bf,err = self.datas[keyTheory]['amp']
            printC('{0:.2f}'.format(bf)+"+-"+'{:04.2f}'.format(err),color='p')
            
            for key,val in stats.items():
                printC(key,color='b')
                printC('{0:.2f}'.format(val),color='p')
            printC("="*len(keyTheory),color='y')

        
        return stats
Example #16
def chisq(obs, exp, cov_input, ddof=None, sidx=None, eidx=None):
    ''' compute chisq 
        
        input
        obs  : observation
        exp  : expected value
        cov_input : covariance
        ddof : degrees of freedom

        output:
        chisq: computed chisq
        p    : p value
    '''
    from scipy.stats import chi2

    diff = obs - exp if not (exp == 0.).all() else obs.copy()
    cov = cov_input.copy()

    if sidx is None: sidx = 0
    if eidx is None: eidx = len(diff)
    diff = diff[sidx:eidx]
    cov = cov[sidx:eidx, sidx:eidx]

    norm = np.mean(np.abs(cov))
    cov /= norm
    diff /= np.sqrt(norm)

    chisq = np.dot(np.linalg.pinv(cov), diff)
    chisq = np.dot(diff.T, chisq)

    if ddof is None: ddof = len(obs)
    p = chi2.sf(chisq, ddof)

    return chisq, p
Example #17
def get_chi2_two(key, targ, null, plot=True):
    """
    """

    ntype = targ.shape[0]

    for i in range(ntype):

        tt = np.array([
            sum(w * p for w, p in enumerate(targ[i, j, :]))
            for j in range(ntype)
        ])
        nn = np.array([
            sum(w * p for w, p in enumerate(null[i, j, :]))
            for j in range(ntype)
        ])

        n_nn = np.sum(nn)
        n_tt = np.sum(tt)
        k1 = np.sqrt(n_nn / n_tt)
        k2 = 1 / k1

        chi2_stat = sum([(k1 * t - k2 * n)**2 / (t + n)
                         for t, n in zip(tt, nn)])

        df = len(tt) - 1
        print(i, tt, nn / np.sum(nn) * np.sum(tt),
              np.array(tt) - np.array(nn) / np.sum(nn) * np.sum(tt))
        #print(key, i, 'chi2', chi2_stat, df)
        p_value = chi2.sf(chi2_stat, df)
        print(key, i, 'p-value', p_value)
Example #18
def chstwo(bins1, bins2, ddof=0, axis=0):
    """
    Chi-square test for difference between two data sets. Return the statistic and the p-value.
    Uses _count() to drop from the chi-square sum any entries for which both values are 0; the degrees of freedom are decremented for each dropped case.

    Comments on relation to NRC's chstwo() and SciPy's chi2.sf():
        -bins1 :: f_obs
        -bins2 :: f_exp
        -ddof :: adjustment to dof ... related to knstrn ... dof = num_obs - 1 - ddof ... see NRC discussion of arguments to chstwo algorithm.
            ... if data sets are of equal integral (perhaps normalized) then knstrn = 1, ddof = 0, dof = num_obs - 1 - 0
            ... if data sets are not of equal integral then knstrn = 0 ... ddof = -1, dof = num_obs - 1 - (-1) = num_obs
            ... could essentially rewrite as dof = num_obs - 1 - ddof --> dof = num_obs - knstrn - ddof, where ddof becomes any adjustment to dof beyond that of knstrn
        -Evaluating the prob from the chi2 distribution:
            ... NRC defines gammq as 1 - P, where P is probability that the observed chi2 for a correct model should be less than a value chi2.
                gammq(0.5*df, 0.5*chi2)
            ... For scipy, we'd have gammq = 1 - scipy.stats.chi2.cdf(x, df, loc=0, scale=1)
                or gammq = scipy.stats.chi2.sf(x, df, loc=0, scale=1) ... sf is survival function (also defined as 1 - cdf, but sf is sometimes more accurate)
    """
    # check inputs
    if len(bins1) != len(bins2):
        raise ValueError('chstwo: len(bins1) != len(bins2)')
    # where bins1[i]=bins2[i]=0, mask entry i
    bins1, bins2 = np.ma.masked_where(condition = [(bins1 == bins2) & (bins1==0), (bins2 == bins1) & (bins2==0)], a = [bins1, bins2])
    # Do the test
    terms = (bins1 - bins2)**2 / (bins1 + bins2) # Terms with division by zero have been masked out. Terms evaluating to zero are kept.
    stat = terms.sum(axis=axis)
    num_obs = _count(terms, axis=axis) # returns number of non-masked terms in stat
    ddof = np.asarray(ddof)
    p = chi2.sf(stat, num_obs - 1 - ddof)
    # print('chi2.sf(stat = %f, dof = %d, ddof = %d)' % (stat, num_obs - 1 - ddof, ddof))
    return stat, p
Example #19
    def test_joint_skew_kurt(self, skew0, kurt0, return_weights=False):
        """
        Returns -2 x log-likelihood and the p-value for the joint
        hypothesis test for skewness and kurtosis

        Parameters
        ----------
        skew0 : float
            Skewness value to be tested
        kurt0 : float
            Kurtosis value to be tested

        return_weights : bool
            If True, function also returns the weights that
            maximize the likelihood ratio. Default is False.

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value  of the joint hypothesis test.
        """
        self.skew0 = skew0
        self.kurt0 = kurt0
        start_nuisance = np.array([self.endog.mean(), self.endog.var()])

        llr = optimize.fmin_powell(self._opt_skew_kurt,
                                   start_nuisance,
                                   full_output=1,
                                   disp=0)[1]
        p_val = chi2.sf(llr, 2)
        if return_weights:
            return llr, p_val, self.new_weights.T
        return llr, p_val
Example #20
 def testConnectednessBetweenTwoUsers(self, currentUser, neighborUser):
     """
     Cluster identification:
     Test whether two user models have the same ground-truth theta
     :param currentUser:
     :param neighborUser:
     :return:
     """
     n = currentUser.update_num
     m = neighborUser.update_num
     if n == 0 and m == 0:
         return False
     # Compute numerator
     theta_combine = np.dot(
         np.linalg.pinv(currentUser.A + neighborUser.A -
                        2 * self.lambda_ * np.identity(n=self.dimension)),
         currentUser.b + neighborUser.b)
     num = np.linalg.norm(
         np.dot(currentUser.X,
                (currentUser.UserThetaNoReg -
                 theta_combine)))**2 + np.linalg.norm(
                     np.dot(
                         neighborUser.X,
                         (neighborUser.UserThetaNoReg - theta_combine)))**2
     XCombinedRank = np.linalg.matrix_rank(
         np.concatenate((currentUser.X, neighborUser.X), axis=0))
     df1 = int(currentUser.rank + neighborUser.rank - XCombinedRank)
     chiSquareStatistic = num / (self.NoiseScale**2)
     p_value = chi2.sf(x=chiSquareStatistic, df=df1)
     if p_value <= self.neighbor_identification_alpha:  # upper bound probability of false alarm
         return False
     else:
         return True
Example #21
    def test_var(self, sig2_0, return_weights=False):
        """
        Returns -2 x log-likelihood ratio and the p-value for the
        hypothesized variance

        Parameters
        ----------
        sig2_0 : float
            Hypothesized variance to be tested

        return_weights : bool
            If True, returns the weights that maximize the
            likelihood of observing sig2_0. Default is False

        Returns
        --------
        test_results : tuple
            The  log-likelihood ratio and the p_value  of sig2_0

        Examples
        --------
        >>> random_numbers = np.random.standard_normal(1000)*100
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> hyp_test = el_analysis.test_var(9500)
        """
        self.sig2_0 = sig2_0
        mu_max = max(self.endog)
        mu_min = min(self.endog)
        llr = optimize.fminbound(self._opt_var, mu_min, mu_max, \
                                 full_output=1)[1]
        p_val = chi2.sf(llr, 1)
        if return_weights:
            return llr, p_val, self.new_weights.T
        else:
            return llr, p_val
Example #22
  def __init__(self,sigLocal,sig0,N0):
    # Convert significance to p-value
    pLocal = norm.sf(sigLocal)
    p0 = norm.sf(sig0)
    
    # Get the test statistic value corresponding to the p-value
    u = chi2.isf(pLocal*2,1)
    u0 = chi2.isf(p0*2,1)
    
    # The main equations
    N = N0 * exp(-(u-u0)/2.)
    pGlobal = N + chi2.sf(u,1)/2.
    
    # Further info
    sigGlobal = norm.isf(pGlobal)
    trialFactor = pGlobal/pLocal

    self.sigGlobal = sigGlobal
    self.sigLocal = sigLocal
    self.sig0 = sig0
    self.pGlobal = pGlobal
    self.pLocal = pLocal
    self.p0 = p0
    self.N0 = N0
    self.N = N
    self.u0 = u0
    self.u = u
    self.trialFactor = trialFactor
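Only the __init__ is shown, so as a usage sketch assume it belongs to a look-elsewhere-effect helper class, here hypothetically named GlobalSignificance; it needs exp from math and norm, chi2 from scipy.stats:

from math import exp
from scipy.stats import norm, chi2

# Hypothetical: 4-sigma local significance, reference level 1 sigma,
# N0 = 8 upcrossings observed at the reference level.
res = GlobalSignificance(sigLocal=4.0, sig0=1.0, N0=8)
print(res.sigGlobal, res.trialFactor)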
Example #23
    def lr_test(self, ll_min, ll_max, p_threshold, delta_params):
        """Performs likelihood ratio test.

        Parameters
        ----------
        ll_min : float
            Log likelihood of model with fewer params.
        ll_max
            Log likelihood of model with more params.
        p_threshold : float
            Threshold of p value to accept model_max as better.
        delta_params : int
            Difference in number of parameters in nested model.

        Returns
        -------
        bool
            True if test passes.

        """
        lr = 2 * (ll_max - ll_min)  # likelihood-ratio statistic; ll_max >= ll_min for nested fits
        p = chi2.sf(lr, delta_params)
        print(ll_min, ll_max, delta_params)
        print("p-value is: " + str(p))
        return p < p_threshold
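For example, comparing nested fits whose maximized log likelihoods differ by 3.2 with two extra parameters gives a statistic of 6.4 and p = chi2.sf(6.4, 2) ≈ 0.04, so the test passes at the 5% level ('model' is a hypothetical instance of the enclosing class):

passes = model.lr_test(ll_min=-120.3, ll_max=-117.1, p_threshold=0.05, delta_params=2)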
Example #24
def logrank(fr_sample1, fr_sample2):
    confidence = 0.95
    Z = norm.ppf((1.00 + confidence) / 2.0)
    fr1 = fr_agg(fr_sample1)
    fr2 = fr_agg(fr_sample2)
    fr1 = fr1.set_index('Time')
    fr2 = fr2.set_index('Time')
    idx = fr1.index.union(fr2.index)

    Y1 = fr1['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    Y2 = fr2['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    h1 = fr1['dE[N]'].reindex(idx, fill_value=0)
    h2 = fr2['dE[N]'].reindex(idx, fill_value=0)
    dN1 = fr1['dN'].reindex(idx, fill_value=0)
    dN2 = fr2['dN'].reindex(idx, fill_value=0)

    w1 = (Y1 * Y2 / (Y1 + Y2)).fillna(0)
    w2 = ((Y1 * Y2) / (Y1 + Y2)**2).fillna(0)
    U_score = (w1 * (h1 - h2)).sum()
    U_var = (w2 * (dN1 + dN2)).sum()

    score = U_score**2 / U_var
    p_value = chi2.sf(score, 1)
    #print U_score, U_var, score, p_value

    return p_value
Example #25
    def _opt_var(self, nuisance_mu, pval=False):
        """
        This is the function to be optimized over a nuisance mean parameter
        to determine the likelihood ratio for the variance

        Parameters
        ----------
        nuisance_mu : float
            Value of a nuisance mean parameter

        Returns
        -------
        llr : float
            Log likelihood of a pre-specified variance holding the nuisance
            parameter constant
        """
        endog = self.endog
        nobs = self.nobs
        sig_data = ((endog - nuisance_mu) ** 2 \
                    - self.sig2_0)
        mu_data = (endog - nuisance_mu)
        est_vect = np.column_stack((mu_data, sig_data))
        eta_star = self._modif_newton(np.array([1. / nobs, 1. / nobs]),
                                      est_vect,
                                      np.ones(nobs) * (1. / nobs))

        denom = 1 + np.dot(eta_star, est_vect.T)
        self.new_weights = 1. / nobs * 1. / denom
        llr = np.sum(np.log(nobs * self.new_weights))
        if pval:  # Used for contour plotting
            return chi2.sf(-2 * llr, 1)
        return -2 * llr
Example #26
    def multinomial_chi2_test(self, config):
        '''Simple multinomial chi2 test - Based on Gardner & Knopoff (1974)'''
        # Config should include 'K' marker - divide catalogue into intervals
        # of K length
        if config['K'] < 1:
            raise ValueError('K must be greater than or equal to 1')
        start_bin = range(self.start_year, self.end_year, config['K'])
        end_bin = range(self.start_year + config['K'] - 1, self.end_year,
                        config['K'])
        number_ints = len(end_bin)
        time_ints = np.column_stack([np.array(start_bin[:number_ints]),
                                     np.array(end_bin)])
        
        ncount = np.zeros(np.shape(time_ints)[0], dtype=int)

        for iloc, time_bin in enumerate(time_ints):
            ncount[iloc] = np.sum(np.logical_and(self.year >= time_bin[0],
                self.year < time_bin[1]))
        #ncount = ncount.astype(float)
        theoretical_rate = self.number_events / float(config['K'])
        c_value, expected_c = self._get_c_value(config['K'], theoretical_rate)
        observed_c = self._get_obs_c(c_value, ncount)
        chi2m = np.sum((observed_c.astype(float) -
            expected_c.astype(float)) ** 2. / expected_c.astype(float))
        if not(config['dof']):
            config['dof'] = float(c_value[-1] - 2)

        p_value = chi2.sf(chi2m, config['dof'])
        return p_value, chi2m, c_value[-1], config['dof']
Example #27
    def test_corr(self, corr0, return_weights=0):
        """
        Returns -2 x log-likelihood ratio and  p-value for the
        correlation coefficient between 2 variables

        Parameters
        ----------
        corr0 : float
            Hypothesized value to be tested

        return_weights : bool
            If true, returns the weights that maximize
            the log-likelihood at the hypothesized value
        """
        nobs = self.nobs
        endog = self.endog
        if endog.shape[1] != 2:
            raise Exception('Correlation matrix not yet implemented')
        nuis0 = np.array([
            endog[:, 0].mean(), endog[:, 0].var(), endog[:, 1].mean(),
            endog[:, 1].var()
        ])

        x0 = np.zeros(5)
        weights0 = np.array([1. / nobs] * int(nobs))
        args = (corr0, endog, nobs, x0, weights0)
        llr = optimize.fmin(self._opt_correl,
                            nuis0,
                            args=args,
                            full_output=1,
                            disp=0)[1]
        p_val = chi2.sf(llr, 1)
        if return_weights:
            return llr, p_val, self.new_weights.T
        return llr, p_val
Example #28
def mcfequal(fr_sample1, fr_sample2, confidence=0.95, robust=False):
    #TODO: drop Y = 0, compare multiple
    Z = norm.ppf((1.00 + confidence) / 2.0)
    fr1 = fr_agg(fr_sample1)
    fr2 = fr_agg(fr_sample2)
    fr1 = fr1.set_index('Time')
    fr2 = fr2.set_index('Time')
    idx = fr1.index.union(fr2.index)

    Y1 = fr1['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    Y2 = fr2['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    h1 = fr1['dE[N]'].reindex(idx, fill_value=0)
    h2 = fr2['dE[N]'].reindex(idx, fill_value=0)

    w = (Y1 * Y2 / (Y1 + Y2)).fillna(0)
    U_score = (w * (h1 - h2)).sum()

    if not robust:
        U_var = var(fr1, w).iloc[-1] + var(
            fr2, w).iloc[-1]  #(w**2 * (h1/Y1 + h2/Y2)).sum()
    else:
        U_var = robust_var(fr_sample1, w).iloc[-1] + robust_var(fr_sample2,
                                                                w).iloc[-1]

    p_value = chi2.sf(U_score**2 / U_var, 1)
    #print U_score, U_var, U_score**2/U_var, p_value
    return p_value
Example #29
def visualize_pruning(
        w_norm,
        n_retained,
        title='Initial model weights vs theoretical for pruning'):
    fig, ax1 = plt.subplots()
    ax1.set_title(title)
    ax1.hist(w_norm,
             density=True,
             bins=200,
             alpha=0.6,
             histtype='stepfilled',
             range=[0, n_retained * 5])
    ax1.axvline(x=n_retained, linewidth=1, color='r')
    ax1.set_ylabel('PDF', color='b')

    ax2 = ax1.twinx()
    ax2.set_ylabel('Survival Function', color='r')

    ax1.set_xlabel('w_norm')

    x = np.linspace(chi2.ppf(0.001, n_retained), chi2.ppf(0.999, n_retained),
                    100)
    ax2.plot(x,
             chi2.sf(x, n_retained),
             'g-',
             lw=1,
             alpha=0.6,
             label='chi2 sf')
    ax1.plot(x,
             chi2.pdf(x, n_retained),
             'r-',
             lw=1,
             alpha=0.6,
             label='chi2 pdf')
Example #30
 def solve(self):
     url='http://112.124.1.3:8060/getData/101.json'
     data=urllib.request.urlopen(url).read()
     babyArr=json.loads(data)['data']
     baby=[]
     num=[]
     for i in babyArr:
         if i[2]<=10 and i[2]>5:
             baby.append(i[2]*4.33)
             num.append(i[5])
         if i[2]<49 and i[2]>25:
             baby.append(i[2])
             num.append(i[5])
     a1,a2,a3,n1,n2,n3=0.0,0.0,0.0,0.0,0.0,0.0
     for i in range(len(baby)):
         if baby[i]<=37:
             a1+=1
             if num[i]==1:
                 n1+=1
         elif baby[i]>=41:
             a3+=1
             if num[i]==1:
                 n3+=1
         else:
             a2+=1
             if num[i]==1:
                 n2+=1
     a=a1+a2+a3
     p1=a1/a
     p2=a2/a
     p3=a3/a
     n=n1+n2+n3
     c=n1**2/(n*p1)+n2**2/(n*p2)+n3**2/(n*p3)-n
     p=chi2.sf(c,2)
     print([c,p])
Example #31
 def llr_pvalue(self):
     """
     p-value of likelihood ratio chi-squared statistic; `-2*(llnull - llf)`
     with degrees of freedom `df_model`
     under H0: all coefficients excluding the constant are zero
     """
     return chi2.sf(self.llr, self.df_model)
Example #32
File: tools.py (Project: miaviles/VerticaPy)
def normaltest(vdf: vDataFrame, column: str):
    """
---------------------------------------------------------------------------
Test whether a sample differs from a normal distribution.

Parameters
----------
vdf: vDataFrame
    input vDataFrame.
column: str
    Input vcolumn to test.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    Z1, Z2 = skewtest(vdf,
                      column)["value"][0], kurtosistest(vdf,
                                                        column)["value"][0]
    Z = Z1**2 + Z2**2
    pvalue = chi2.sf(Z, 2)
    result = tablesample({
        "index": [
            "Statistic",
            "p_value",
        ],
        "value": [Z, pvalue],
    })
    return result
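For an in-memory sample, SciPy's normaltest applies the same omnibus test (squared skewness and kurtosis z-scores summed and referred to a chi-square with 2 degrees of freedom):

import numpy as np
from scipy.stats import normaltest

stat, pvalue = normaltest(np.random.default_rng(0).normal(size=1000))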
Example #33
def get_s2_two_old(key, targ, null, plot=True):
    """
    """

    ntype = targ.shape[0]

    for i in range(ntype):

        tt = np.array([
            sum(w * p for w, p in enumerate(targ[i, j, :]))
            for j in range(ntype)
        ])
        nn = np.array([
            sum(w * p for w, p in enumerate(null[i, j, :]))
            for j in range(ntype)
        ])

        n_nn = np.sum(nn)
        n_tt = np.sum(tt)

        print(i, tt, nn / np.sum(nn) * np.sum(tt),
              np.array(tt) - np.array(nn) / np.sum(nn) * np.sum(tt))

        nn = nn / n_nn
        tt = tt / n_tt

        cb = sum([np.sqrt(t * n) for t, n in zip(tt, nn)])
        s2 = 4 * n_nn * n_tt / (n_nn + n_tt) * np.arccos(cb)**2

        df = len(tt) - 1
        p_value = chi2.sf(s2, df)
        print(key, i, 'p-value', p_value)
Example #34
def logrank_k(*fr_samples):
    confidence = 0.95
    Z = norm.ppf((1.00 + confidence) / 2.0)
    k = len(fr_samples)

    cohorts = range(k)
    fr_aggs = [fr_agg(fr_cohort).set_index('Time') for fr_cohort in fr_samples]

    dY = pd.DataFrame(
        {idx: fr_agg['dY']
         for idx, fr_agg in zip(cohorts, fr_aggs)}).fillna(0)
    dN = pd.DataFrame(
        {idx: fr_agg['dN']
         for idx, fr_agg in zip(cohorts, fr_aggs)}).fillna(0)
    dN_ = dN.sum(axis=1)
    Y = dY.cumsum(axis=0).shift(1, axis=0).fillna(0)
    Y_ = Y.sum(axis=1)

    K = Y.all(axis=1).astype(int)
    Z = np.array([(K * (dN[cohort] - Y[cohort] * (dN_ / Y_))).sum()
                  for cohort in cohorts])
    V = np.array([[
        (K * Y[cohort_i] / Y_ *
         (int(cohort_i == cohort_j) - Y[cohort_j] / Y_) * dN_).sum()
        for cohort_j in cohorts
    ] for cohort_i in cohorts])
    Z = Z[:-1]
    V = V[:-1, :-1]

    score = np.dot(np.dot(Z, inv(V)), Z)
    p_value = chi2.sf(score, k - 1)
    #print Z, V, score, p_value
    return p_value
Example #35
def check_ra_dec_uniform(ra, dec, nside=2, footprint=None):
    pixels = hp.ang2pix(nside, ra, dec, lonlat=True)
    npix = hp.nside2npix(nside) if footprint is None else footprint.size
    pixels, counts = np.unique(pixels, return_counts=True)
    assert pixels.size == npix
    mean = ra.size / npix
    assert chi2.sf(((counts - mean)**2.0 / mean).sum(), df=npix-1) > 1e-5
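A hypothetical smoke test: draw points uniformly on the sphere (dec via an arcsine transform so the density is uniform in solid angle) and confirm the check passes; healpy is assumed imported as hp:

import numpy as np

rng = np.random.default_rng(1)
ra = rng.uniform(0.0, 360.0, 100000)
dec = np.degrees(np.arcsin(rng.uniform(-1.0, 1.0, 100000)))
check_ra_dec_uniform(ra, dec, nside=2)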
Example #36
def Question_2_Chi_Squared():
    '''

Observed
                A           nA
     Bitter     13          11          24
     NBitter    5           7           12
                18          18          36


Expected-0
                A                          nA
     Bitter     24/36 * 18/36 * 36         24
     NBitter                               12
                18          18             36


Expected-1
                A                          nA
     Bitter     12                         24
     NBitter                               12
                18          18             36


Expected-2
                A                          nA
     Bitter     12          12             24
     NBitter    6           6              12
                18          18             36

    '''

    chi2_sum = (13-12)**2/12 + (11-12)**2/12 + (5-6)**2/6 + (7-6)**2/6
    print(chi2.sf(chi2_sum, 1))
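The same 2x2 table gives an identical statistic through scipy.stats.chi2_contingency once Yates' correction is disabled (chi2 = 0.5, dof = 1, p ≈ 0.48):

from scipy.stats import chi2_contingency

stat, p, dof, expected = chi2_contingency([[13, 11], [5, 7]], correction=False)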
Example #37
File: pValue.py (Project: hannahm8/bigG)
def pValue(confidence_levels, statistics="chi2"):

    if statistics is None:
        print("no test statistics specified. choose between ks or chi2")
        exit()
    elif statistics == "ks":
        """
        The KS statistics is relevant for the 
        actual confidence regions
        """
        return kstest(confidence_levels, 'uniform')[1]
    elif statistics == "chi2":
        """
        The Fisher method chi2 statistics acts
        on the p-values, defined as 1-CL
        """

        #compute the fisher combined statistic
        p = -2. * np.sum(np.log(confidence_levels))

        # find out if this is chi-squared distributed with 2*NExp degrees of freedom using the survival function.
        #print "sf: ", chi2.sf(p,2.*len(confidence_levels))
        sfValue = chi2.sf(p, 2. * len(confidence_levels))
        """
        print "the confidence levels are: \n", confidence_levels
        print "this gives a global pvalue of ", p
        print "the survival function for this is ", sfValue
        x = np.arange(0.0, 50, 0.01)
        plt.plot(x, chi2.sf(x,2*len(confidence_levels)))
        plt.axvline(p)
        plt.show()
        """
        return sfValue
Example #39
def plot_ts_vs_chi2(data, ext_list="ext1_ts", ndf_chi2=[1], subplot=[1, 2, 1], **kwargs):
    ax = plt.subplot(subplot[0], subplot[1], subplot[2])
    ext_data = column(data, "%s" % ext_list)
    clean_data = [x for x in ext_data if not math.isnan(x)]  # remove nan from data
    n, bins, patches = plt.hist(
        clean_data, int(math.ceil(max(column(data, "%s" % ext_list)))), density=True, facecolor="green"
    )
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    chi2_vals = []
    colors = ["r", "b", "g"]
    for j in range(0, len(ndf_chi2)):
        chi2_vals.append(chi2.pdf(bincenters, ndf_chi2[j]))
        plt.plot(bincenters, chi2_vals[j], "%s--" % colors[j], linewidth=2.0, label=r"$\chi^2_%i$/2" % ndf_chi2[j])
    legend = ax.legend(loc="upper right", frameon=False)
    plt.ylabel("PDF")
    plt.xlabel("TS$_{%s}$" % ext_list[0:4])
    plt.yscale("log")
    plt.ylim([0.00001, 2.0])

    ax = plt.subplot(subplot[0], subplot[1], subplot[2] + 1)
    n, bins, patches = plt.hist(
        clean_data, int(math.ceil(max(column(data, "%s" % ext_list)))), density=True, facecolor="green", cumulative=-1
    )
    chi2_sfvals = []
    for j in range(0, len(ndf_chi2)):
        chi2_sfvals.append(chi2.sf(bincenters, ndf_chi2[j]))
        plt.plot(bincenters, chi2_sfvals[j], "%s--" % colors[j], linewidth=2.0, label=r"$\chi^2_%i$/2" % ndf_chi2[j])
    legend = ax.legend(loc="upper right", frameon=False)
    plt.ylabel("1-CDF")
    plt.xlabel("TS$_{%s}$" % ext_list[0:4])
    plt.yscale("log")
    plt.ylim([0.00001, 2.0])
Example #43
    def independence_test(self, A):
        if(len(A)==0):
            return [None, None]
        if(len(A)==1):
            return [0.0,None]

        rows=[]
        columns=[]

        #ini rows
        for row in A:
            r_sum=0.0
            for r in row:
                r_sum+=r
            rows.append(r_sum)

        #ini columns
        for i in range(0,len(A[0])):
            c_sum=0.0
            for row in A:
                c_sum+=row[i]
            columns.append(c_sum)

        tot=sum(rows)

        #cal Ma1
        Ma_1=[]
        for i in range(0,len(rows)):
            row=[]
            for j in range(0,len(columns)):
                T=rows[i]*columns[j]/tot
                row.append(T)
            Ma_1.append(row)

        #cal Ma2
        Ma_2=[]
        for i in range(0,len(rows)):
            row=[]
            for j in range(0,len(columns)):
                Z=(A[i][j]-Ma_1[i][j])**2/Ma_1[i][j]
                row.append(Z)
            Ma_2.append(row)

        #cal X_2
        X_2=0.0
        for x in Ma_2:
            for y in x:
                X_2+=y

        #cal c
        print(X_2)
        print(tot)
        c=(X_2/(X_2+tot))**0.5
        print(c)
        #cal p
        p=chi2.sf(x=X_2,df=(len(rows)-1)*(len(columns)-1))

        return [round(X_2,6),round(p,6)]
Example #44
    def var_threshold(self, alpha):
        SS = (self.n1 - 1) * self.S1
        chi20 = SS / self.var0
        n1 = self.n1
        # hypothesis tests
        H1a = chi2.ppf(1 - alpha / 2.0, n1 - 1) < chi20 or chi2.ppf(alpha / 2.0, n1 - 1) > chi20
        H1b = chi2.ppf(alpha / 2.0, n1 - 1) > chi20
        H1c = chi2.ppf(1 - alpha / 2.0, n1 - 1) < chi20
        # p-value
        p1a = np.max(np.array([chi2.sf(chi20, n1 - 1), 1 - chi2.sf(chi20, n1 - 1)]))
        p1b = chi2.sf(chi20, n1 - 1)
        p1c = 1 - chi2.sf(chi20, n1 - 1)

        # confidence intervals: the minimum level of significance
        # alpha for which the null hypothesis is rejected
        c1 = (n1 - 1) * SS / chi2.ppf(1 - alpha / 2.0, n1 - 1)
        c2 = (n1 - 1) * SS / chi2.ppf(alpha / 2.0, n1 - 1)
        return H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)
Example #45
File: regressions.py (Project: bulik/ldsc)
def p_z_norm(est, se):
    '''Convert estimate and se to Z-score and P-value.'''
    try:
        Z = est / se
    except (FloatingPointError, ZeroDivisionError):
        Z = float('inf')

    P = chi2.sf(Z ** 2, 1, loc=0, scale=1)  # 0 if Z=inf
    return P, Z
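Note that chi2.sf(Z**2, 1) is exactly the two-sided normal p-value, so the line above is equivalent to 2 * norm.sf(abs(Z)); a quick check:

import numpy as np
from scipy.stats import chi2, norm

Z = 1.96
assert np.isclose(chi2.sf(Z ** 2, 1), 2 * norm.sf(abs(Z)))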
Example #46
File: g9.py (Project: ababino/efe)
def chi2_from_sample(data):
    histdata = np.histogram(data, 56, range=(-7, 7))
    N = sum(histdata[0])
    y_data = histdata[0]
    yerr2 = y_data * (1 - y_data / N)
    x = histdata[1][:-1] + np.diff(histdata[1]) / 2
    norm025 = norm(0, 2.5)
    y_fit = N * (norm025.cdf(histdata[1][1:]) - norm025.cdf(histdata[1][:-1]))
    chi2_m = sum((y_data - y_fit)**2 / yerr2)
    return chi2_m, chi2.sf(chi2_m, len(x)), len(x)
Example #47
def sf_z2m(ts,m=2):
    """ Return the survival function (chance probability) according to the
        asymptotic calibration for the Z^2_m test.

        args
        ----
        ts      result of the Z^2_m test
    """
    from scipy.stats import chi2
    return chi2.sf(ts,2*m)
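For instance, a two-harmonic (m=2) test statistic of 30 is referred to a chi-square with 2*m = 4 degrees of freedom:

p = sf_z2m(30.0, m=2)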
Example #48
def hotel2(X1, X2):
  """ Computes Hotelling t-squared statistic under two assumptions or variance.

  :param X1 pandas DataFrame with samples from first group
  :param X2 pandas DataFrame with samples from second group
  :return None
  """
  # TODO: Verify Hotelling results
  n1, k = X1.shape
  n2, k2 = X2.shape
  assert(k == k2)

  ybar1 = X1.mean().to_numpy()
  s1 = np.cov(X1, rowvar=False)
  ybar2 = X2.mean(axis=0).to_numpy()
  s2 = np.cov(X2, rowvar=False)

  alpha = 0.05
  diffs = (ybar1 - ybar2).reshape(1, k)

  # TODO: Incorporate a test for equal variances

  # If variances assumed equal, then pool
  if True:
    spool = ((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2)
    t2 = diffs\
      .dot(np.linalg.inv(spool * (1.0 / n1 + 1.0 / n2)))\
      .dot(ybar1 - ybar2)\
      .item(0)
    eff = (n1 + n2 - k - 1) * t2 / (k * (n1 + n2 - 2))
    df1 = k
    df2 = n1 + n2 - k - 1
    p_value = f.sf(eff, df1, df2)
    print('If variances are assumed equal between classes')
    if p_value < alpha:
      print("\t=> Reject the null hypothesis that mean(X1) == mean(X2)")
    else:
      print("\t=> Accept null hypothesis that mean(X1) == mean(X2)")
    print(t2, p_value)

  # If variances not assumed equal, then use modified Hotelling
  if True:
    t2 = diffs\
      .dot(np.linalg.inv(s1 / n1 + s2 / n2))\
      .dot(ybar1 - ybar2)\
      .item(0)
    p_value = chi2.sf(t2, k)
    print('If variances are not assumed equal between classes')
    if p_value < alpha:
      print("\t=> Reject the null hypothesis that mean(X1) == mean(X2)")
    else:
      print("\t=> Accept null hypothesis that mean(X1) == mean(X2)")
    print(t2, p_value)
Example #49
File: _qtl.py (Project: Horta/lim)
    def pvalues(self):
        """Association p-value for candidate markers."""
        self.compute_statistics()

        lml_alts = self.alt_lmls()
        lml_null = self.null_lml()

        lrs = -2 * lml_null + 2 * asarray(lml_alts)

        from scipy.stats import chi2
        chi2 = chi2(df=1)

        return chi2.sf(lrs)
Example #50
def chi_test(alleles, freq_table, est, n=1):
    # Combinations: https://docs.python.org/2/library/itertools.html
    # Calculate Observed and the Expected then calc the chi stat
    allele_pool = ''.join(alleles)
    individuals = Counter(est)
    observed_table = pd.DataFrame()
    expected_table = pd.DataFrame()

    for genotype in combinations_with_replacement(allele_pool, 2):
        genotype_string = genotype[0] + genotype[1]

        # EXPECTED
        if genotype[0] != genotype[1]:
            # These are the heterozygotes; multiply their expected frequencies by 2
            exp_hetero_count = 2 * freq_table[genotype[0]].values * freq_table[genotype[1]].values * n
            expected_table[genotype_string] = exp_hetero_count

        else:
            # These are the homozygotes; square their expected frequencies
            exp_homo_count = (freq_table[genotype[0]].values ** 2) * n
            expected_table[genotype_string] = exp_homo_count

        # OBSERVED
        if genotype[0] == genotype[1]:
            # Homozygote, therefore order doesn't matter
            obs_homo_count = individuals[int(genotype_string)]
            observed_table.loc[0, genotype_string] = obs_homo_count
        else:
            # Hetero therefore order does matter
            # I'm using extended slice syntax to reverse the string [::-1] This makes synonymous genotypes the same
            obs_hetero_count = individuals[int(genotype_string)]
            obs_hetero_count += individuals[int(genotype_string[::-1])]
            observed_table.loc[0, genotype_string] = obs_hetero_count

    # Calculate Chi sq
    chi_table = ((observed_table - expected_table) ** 2) / expected_table
    chi_sq_statistic = chi_table.sum(axis=1).values[0]
    df = len(expected_table.columns) - 2
    p_value = chi2.sf(chi_sq_statistic, df)

    print('\n Expected Numbers')
    print(expected_table)

    print('\n Observed Numbers')
    print(observed_table)

    print('\n CHI^2: {}'.format(chi_sq_statistic))
    print('df: {}, p: {}'.format(df, p_value))

    return chi_sq_statistic, df, p_value
Example #51
File: slm.py (Project: zcpan/BinContModel)
	def fit(self):
		if self.verbose:
			print('rho\tbeta1\tbeta2\tsigma2')
		self.em()
		self.residue1 = self.Y1 - self.Y1.mean() - self.X * self.beta1 
		self.residue2 = self.Y2 - self.Y2.mean() - self.X * self.beta2
		self.sigma1 = np.sqrt(np.dot(self.residue1.T, self.residue1)/(self.n - 2))
		self.sigma2 = np.sqrt(np.dot(self.residue2.T,self.residue2)/(self.n-2))
		self.LL = -self.n * np.log(4*np.pi**2*self.sigma2*self.sigma1*(1-self.rho**2)) - 1/2/(1-self.rho**2)*(np.dot(self.residue1.T,self.residue1)/self.sigma1**2 - 2 * self.rho * np.dot(self.residue1.T,self.residue2) / self.sigma1 / self.sigma2 + np.dot(self.residue2.T,self.residue2) / self.sigma2 ** 2)
		self.pvalue = chi2.sf(2*(self.LL - self.LL_control),2)
		self.beta2_norm = float(self.beta2 / self.sigma2)
		# NB: this overwrites the LRT p-value above with a one-sided normal (Wald-type) p-value
		self.pvalue = 1 - norm.cdf(self.beta2_norm * np.sqrt(self.n))
		if self.verbose:
			print(self.beta2_norm, self.pvalue)
Example #52
    def solve(self):
        page=urllib.request.urlopen('http://112.124.1.3:8060/getData/101.json')
        c=page.read()
        data=json.loads(c)["data"]

        week1=0.0
        week1_2=0.0
        week2=0.0
        week2_2=0.0
        week3=0.0
        week3_2=0.0
        for x in data:
            y=x[2]
            if(y>5 and y<=10):
                y=y*4.33
            if(y>25 and y<=37):
                if(x[5]==1):
                    week1+=1
                else:
                    week1_2+=1
            elif(y>=38 and y<=40):
                if(x[5]==1):
                    week2+=1
                else:
                    week2_2+=1
            elif(y>=41 and y<49):
                if(x[5]==1):
                    week3+=1
                else:
                    week3_2+=1

        week=[week1,week2,week3]
        week_2=[week1_2,week2_2,week3_2]
        
        sum_week=sum(week)
        sum_week_2=sum(week_2)
        tot=sum_week+sum_week_2

        t11=(week1+week1_2)*sum_week/tot
        t12=(week2+week2_2)*sum_week/tot
        t13=(week3+week3_2)*sum_week/tot

        z1=(week1-t11)**2/t11
        z2=(week2-t12)**2/t12
        z3=(week3-t13)**2/t13

        x_2=z1+z2+z3
        p=chi2.sf(x=x_2,df=2)
        return (round(x_2,6),p)
Example #53
    def el_test(self, b0_vals, param_nums, method='nm',
                            stochastic_exog=1, return_weights=0):
        """
        Returns the llr and p-value for a hypothesized parameter value
        for a regression that goes through the origin

        Parameters
        ----------
        b0_vals : 1darray
            The hypothesized value to be tested

        param_nums : 1darray
            Which parameters to test.  Note this uses python
            indexing but the '0' parameter refers to the intercept term,
            which is assumed 0.  Therefore, param_nums should be > 0.

        return_weights : bool
            If true, returns the weights that optimize the likelihood
            ratio at b0_vals.  Default is False

        method : string
            Can either be 'nm' for Nelder-Mead or 'powell' for Powell.  The
            optimization method that optimizes over nuisance parameters.
            Default is 'nm'

        stochastic_exog : bool
            When True, the exogenous variables are assumed to be stochastic.
            When the regressors are nonstochastic, moment conditions are
            placed on the exogenous variables.  Confidence intervals for
            stochastic regressors are at least as large as non-stochastic
            regressors.  Default is True

        Returns
        -------
        res : tuple
            The likelihood ratio and p-value
        """
        b0_vals = np.hstack((0, b0_vals))
        param_nums = np.hstack((0, param_nums))
        test_res = self.model.fit().el_test(b0_vals, param_nums, method=method,
                                  stochastic_exog=stochastic_exog,
                                  return_weights=return_weights)
        llr_test = test_res[0]
        llr_res = llr_test - self.llr
        pval = chi2.sf(llr_res, self.model.exog.shape[1] - 1)
        if return_weights:
            return llr_res, pval, test_res[2]
        else:
            return llr_res, pval
Example #54
def fishers_method(values):
    """ function to combine p values, using Fisher's method
    
    Args:
        values: list of P-values for a gene
    
    Returns:
        combined P-value
    """
    
    values = [ x for x in values if not isnan(x) ]
    
    # use Fisher's combined method to estimate the P value from multiple
    # P-values. The chi square statistic is -2*sum(ln(P-values))
    return chi2.sf(-2 * sum(map(log, values)), 2 * len(values))
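For example, combining three independent p-values (log and isnan from math, and chi2 from scipy.stats, are assumed imported by the enclosing module):

combined = fishers_method([0.01, 0.04, 0.20])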
Example #55
def fishersMethod(x):
    """ function to combine p values, using Fisher's method
    
    Args:
        x: list of P-values for a gene
    
    Returns:
        combined P-value
    """
    
    x = [ val for val in x if not math.isnan(val) ]
    
    if len(x) == 0:
        return numpy.nan
    
    return chi2.sf(-2 * sum(numpy.log(x)), 2 * len(x))
Example #56
def mahalanobis_distance(difference, num_random_features):
    num_samples, _ = shape(difference)
    sigma = cov(transpose(difference))

    try:
        numpy.linalg.inv(sigma)
    except LinAlgError:
        warn('covariance matrix is singular. Pvalue returned is 1.1')
        return 1.1
    mu = mean(difference, 0)

    if num_random_features == 1:
        stat = float(num_samples * mu ** 2) / float(sigma)
    else:
        stat = num_samples * mu.dot(solve(sigma, transpose(mu)))

    return chi2.sf(stat, num_random_features)
Example #57
 def solve(self):
     html = self.getHtml('http://112.124.1.3:8050/getData/101')
     data = json.loads(html)["data"]
     
     T1 = []
     T2 = []
     
     for i in range(len(data)):
         a = data[i][2]
         if ((a<=5) | (a>=49) | ((a>10)&(a<=25))):
             continue
         if ((a<=10)&(a>5)):
             a = 4.33*a 
         if (data[i][5] == 1):
             T1.append(a)
         T2.append(a)
     
     n1 = len(T1)
     n2 = len(T2)
     
     a1=0
     b1=0
     d1=0
     for i in T1:
         if (i < 38):
             a1 = a1 + 1
         elif i>=41:
             d1 = d1 +1
         else:
             b1 = b1 + 1
     a2=0
     b2=0
     d2=0
     for i in T2:
         if (i < 38):
             a2 = a2 + 1
         elif i>=41:
             d2 = d2 +1
         else:
             b2 = b2 + 1
     
     t1 = float(a2)*n1/n2
     t2 = float(b2)*n1/n2
     c2 = float(a1**2)/(n1*float(a2)/n2)+float(b1**2)/(n1*float(b2)/n2)+float(d1**2)/(n1*float(d2)/n2)-n1
     p = C.sf(c2,2)
     return [c2,p]
Example #58
def mahalanobis_distance(difference, num_random_features):
    num_samples, _ = np.shape(difference)
    sigma = np.cov(np.transpose(difference))

    mu = np.mean(difference, 0)

    if num_random_features == 1:
        stat = float(num_samples * mu ** 2) / float(sigma)
    else:
        try:
            linalg.inv(sigma)
        except LinAlgError:
            warnings.warn('covariance matrix is singular. Pvalue returned is 1.1')
            return 1.1
        stat = num_samples * mu.dot(linalg.solve(sigma, np.transpose(mu)))

    return chi2.sf(stat, num_random_features)
Example #59
    def mv_test_mean(self, mu_array, return_weights=False):
        """
        Returns -2 x log likelihood and the p-value
        for a multivariate hypothesis test of the mean

        Parameters
        ----------
        mu_array  : 1d array
            Hypothesized values for the mean.  Must have same number of
            elements as columns in endog

        return_weights : bool
            If True, returns the weights that maximize the
            likelihood of mu_array. Default is False.

        Returns
        -------
        test_results : tuple
            The log-likelihood ratio and p-value for mu_array
        """
        endog = self.endog
        nobs = self.nobs
        if len(mu_array) != endog.shape[1]:
            raise Exception("mu_array must have the same number of "
                            "elements as the columns of the data.")
        mu_array = mu_array.reshape(1, endog.shape[1])
        means = np.ones((endog.shape[0], endog.shape[1]))
        means = mu_array * means
        est_vect = endog - means
        start_vals = 1.0 / nobs * np.ones(endog.shape[1])
        eta_star = self._modif_newton(start_vals, est_vect, np.ones(nobs) * (1.0 / nobs))
        denom = 1 + np.dot(eta_star, est_vect.T)
        self.new_weights = 1 / nobs * 1 / denom
        llr = -2 * np.sum(np.log(nobs * self.new_weights))
        p_val = chi2.sf(llr, mu_array.shape[1])
        if return_weights:
            return llr, p_val, self.new_weights.T
        else:
            return llr, p_val
Example #60
def chi2_calc(flux,fluxerr):
   ''' Chi2 with constant flux model
   
   flux: flux array
   fluxerr: flux error array
   
   return: chi^2 with constant flux (at weighted mean) model
   '''
   we_fix=[]
   for item in fluxerr:
      w_fix=1/((item)**2)
      we_fix.append(w_fix)
   wei_fix=np.array(we_fix)
   dof_fix=len(flux)-1
   wm_fix=np.average(flux,weights=wei_fix)
   un_fix=1/np.sqrt((np.array(we_fix).sum()))
   residual_fix=errf(wm_fix,flux,fluxerr)
   chisquared_fix=residual_fix**2  
   chi_tot_fix=((residual_fix**2).sum())
   null_hyp_fix=chi2.sf(chi_tot_fix,(np.array(flux).shape[0])-1)
   return(chi_tot_fix,dof_fix,wm_fix,un_fix,null_hyp_fix)