Example #1
 def different_stdev_explicite(self, alpha, y1, y2, S1, S2, n1, n2):
     t0 = (y1 - y2) / (np.sqrt(S1**2 / n1 + S2**2 / n2))
     # hypothesis testing2
     df = int((S1**2 / n1 + S2**2 / n2)**2 / ((S1**2 / n1)**2 / (n1 - 1) +
                                              (S2**2 / n2)**2 / (n2 - 1)))
     H1a = t.ppf(1 - alpha / 2., df) < np.abs(t0)
     H1b = t.ppf(1 - alpha, df) < t0
     H1c = t.ppf(alpha, df) > t0
     # p-value
     p1a = t.sf(np.abs(t0), df) * 2
     p1b = t.sf(t0, df)
     p1c = t.cdf(t0, df)
     c1 = y1 - y2 - t.ppf(1 - alpha / 2.,
                          df) * np.sqrt(S1**2 / n1 + S2**2 / n2)
     c2 = y1 - y2 + t.ppf(1 - alpha / 2.,
                          df) * np.sqrt(S1**2 / n1 + S2**2 / n2)
     CI = np.array([c1, c2])
     print('at the level of significance', alpha, ':')
     print('H1 mu1 != mu2 is', H1a)
     print('H1 mu1 > mu2 is', H1b)
     print('H1 mu1 < mu2 is', H1c)
     print('probability of type I error for mu1 != mu2:', p1a)
     print('probability of type I error for mu1 > mu2:', p1b)
     print('probability of type I error for mu1 < mu2:', p1c)
     print('CI (%.1f%%) for mu1 - mu2:' % (100 - 100 * alpha), CI, CI / y1)
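A quick sanity check of this Welch-style statistic against scipy (a sketch; the summary numbers are made up, scipy.stats.ttest_ind_from_stats is the real library routine):

import numpy as np
from scipy.stats import ttest_ind_from_stats

# Assumed summary statistics, for illustration only.
y1, y2, S1, S2, n1, n2 = 10.2, 9.6, 1.1, 1.4, 30, 25
t0 = (y1 - y2) / np.sqrt(S1**2 / n1 + S2**2 / n2)
t_ref, p_ref = ttest_ind_from_stats(y1, S1, n1, y2, S2, n2, equal_var=False)
print(t0, t_ref, p_ref)  # t0 should match scipy's Welch statistic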
Example #2
    def ttest_1samp(self, a, popmean):
        if (len(a) == 0):
            return [None, None]
        if (len(a) == 1):
            return [None, None]

        # calculate the sample mean
        avg = 0.0
        for x in a:
            avg += x
        avg = avg / len(a)

        S = 0.0
        for x in a:
            S += (x - avg)**2
        S = (S / (len(a) - 1))**0.5
        print(S)
        if (S == 0):
            return [None, None]
        tvalue = (avg - popmean) / (S / (len(a)**0.5))
        if (tvalue >= 0):
            p = t.sf(x=tvalue, df=len(a) - 1) * 2
            return [tvalue, p]
        else:
            p = 2 * t.sf(x=-tvalue, df=len(a) - 1)
            return [tvalue, p]
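For reference, scipy ships the same test; a minimal check of the snippet above (toy data, assuming the method is available on an instance):

import numpy as np
from scipy import stats

a = [5.1, 4.9, 5.4, 5.0, 5.3, 4.8]
t_ref, p_ref = stats.ttest_1samp(a, popmean=5.0)
print(t_ref, p_ref)  # should agree with the [tvalue, p] computed above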
Example #3
def p_value_t(x_bar, mu, s, n, how):
    """Compute the p-value when sigma is unknown.

    Hypothesis test on a population mean, computing the p-value from the
    t distribution because sigma is unknown.

    Params
    ------
    x_bar: sample mean
    mu: population mean under H0 (the target value)
    s: sample standard deviation
    n: sample size
    how: type of test, one of ( 'up', 'down', 'double' )

    Return
    ------
    (test statistic, p-value)

    """
    t_dist = t(n - 1)
    t_val = (x_bar - mu) / (s / np.sqrt(n))
    if how == 'up':
        p = t_dist.sf(t_val)
    elif how == 'down':
        p = t_dist.cdf(t_val)
    elif how == 'double':
        p = t_dist.sf(abs(t_val)) * 2
    else:
        raise ValueError("how must be one of 'up', 'down', 'double'")

    return t_val, p
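A usage sketch (toy numbers; assumes numpy as np and scipy.stats.t are imported as in the snippet):

# Upper-tailed test of H0: mu = 100 against H1: mu > 100.
t_val, p = p_value_t(x_bar=103.2, mu=100, s=8.5, n=40, how='up')
print(t_val, p)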
Example #4
    def ttest_1samp(self, a, popmean):
        if len(a) == 0:
            return [None, None]
        if len(a) == 1:
            return [None, None]

        # calculate the sample mean
        avg = 0.0
        for x in a:
            avg += x
        avg = avg / len(a)

        # sample standard deviation
        S = 0.0
        for x in a:
            S += (x - avg)**2
        S = (S / (len(a) - 1))**0.5
        print(S)
        if S == 0:
            return [None, None]
        tvalue = (avg - popmean) / (S / (len(a)**0.5))
        if tvalue >= 0:
            p = t.sf(x=tvalue, df=len(a) - 1) * 2
            return [tvalue, p]
        else:
            p = 2 * t.sf(x=-tvalue, df=len(a) - 1)
            return [tvalue, p]
Example #5
    def regression_analysis(self, key, info):
        '''
        Calculates all the values we will need for simple linear regression 
        analysis, and does the analysis itself.
        '''
        # not the most efficient, but we want to keep these values
        # to calculate standard errors
        info = list(info)

        # calculate sums
        sumx, sumy, sumxx, sumyy, sumxy, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
        for (x, y) in info:
            sumx += x
            sumy += y
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y
            n += 1

        # calculate correlation
        corr = 0
        corr_denom = math.sqrt((n * sumxx - sumx**2) * (n * sumyy - sumy**2))
        if corr_denom < 0.0001:
            yield False, "Could not calculate coefficients"
            return

        corr_num = n * sumxy - sumx * sumy 
        corr = corr_num / corr_denom

        if abs(corr) < 0.0001:
            yield False, "Could not calculate coefficients"
            return

        # calculate regression coefficients
        beta1 = (sumxy - sumx * sumy / n) / (sumxx - sumx**2 / n)
        beta0 = (sumy - beta1 * sumx) / n

        # calculate standard errors
        x_reals = [x for (x, y) in info]
        y_reals = [y for (x, y) in info]
        y_hats = [beta0 + beta1 * x for x in x_reals]
        s_num = sum((y - yhat)**2 for (y, yhat) in zip(y_reals, y_hats))
        s = math.sqrt(s_num / (n - 2))

        se_denom = n * sumxx - sumx**2
        se_beta0 = s * math.sqrt(sumxx / se_denom)
        se_beta1 = s * math.sqrt(n / se_denom)

        # calculate t-values
        t0 = beta0 / se_beta0
        t1 = beta1 / se_beta1

        # calculate 2-sided p-values
        alpha = 0.05
        t_stat = t.ppf(1 - alpha/2, n - 2)
        beta0_p_value = t.sf(abs(t0), n - 2) * 2
        beta1_p_value = t.sf(abs(t1), n - 2) * 2

        # output most important values in a human-readable format
        print("Correlation: {}".format(corr))
        print("Beta 0: {}, p-value: {}".format(beta0, beta0_p_value))
        print("Beta 1: {}, p-value: {}".format(beta1, beta1_p_value))
Example #6
def _correl_pvalue(r, n, k=0, alternative="two-sided"):
    """Compute the p-value of a correlation coefficient.

    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html
    https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Using_the_exact_distribution

    See also scipy.stats._ttest_finish

    Parameters
    ----------
    r : float
        Correlation coefficient.
    n : int
        Sample size
    k : int
        Number of covariates for (semi)-partial correlation.
    alternative : string
        Tail of the test.

    Returns
    -------
    pval : float
        p-value.

    Notes
    -----
    This uses the same approach as :py:func:`scipy.stats.pearsonr` to calculate
    the p-value (i.e. using a beta distribution)
    """
    from scipy.stats import t
    assert alternative in [
        'two-sided', 'greater', 'less'
    ], ("Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
        )

    # Method 1: using a student T distribution
    dof = n - k - 2
    tval = r * np.sqrt(dof / (1 - r**2))
    if alternative == 'less':
        pval = t.cdf(tval, dof)
    elif alternative == 'greater':
        pval = t.sf(tval, dof)
    elif alternative == 'two-sided':
        pval = 2 * t.sf(np.abs(tval), dof)

    # Method 2: beta distribution (similar to scipy.stats.pearsonr, faster)
    # from scipy.special import btdtr
    # ab = (n - k) / 2 - 1
    # pval = 2 * btdtr(ab, ab, 0.5 * (1 - abs(np.float64(r))))
    return pval
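The t-based p-value agrees with scipy.stats.pearsonr in the plain (k=0) case; a quick check on random toy data:

import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
x, y = rng.normal(size=50), rng.normal(size=50)
r, p_ref = pearsonr(x, y)
print(p_ref, _correl_pvalue(r, n=50, k=0, alternative="two-sided"))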
Example #7
def kramers_v(x, y, bias_correction=True):
    """Calculates Cramer's V statistic for categorical-categorical association.

    Taken from https://github.com/shakedzy/dython/blob/master/dython/nominal.py
    Inspired by Shaked Zychlinski.

    This is a symmetric coefficient: V(x,y) = V(y,x)
    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Parameters:
    -----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    bias_correction : Boolean, default = True
        Use bias correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328.

    Returns:
    --------
    (V, p_value) : V is a float in the range [0, 1]; p_value comes from
        the underlying chi-squared test of independence.
    """
    confusion_matrix = crosstab(x, y)
    c2, p_value = chi2_contingency(confusion_matrix)[:2]
    n = confusion_matrix.sum().sum()
    phi2 = c2 / n
    r, k = confusion_matrix.shape
    if bias_correction:
        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
        rcorr = r - ((r - 1) ** 2) / (n - 1)
        kcorr = k - ((k - 1) ** 2) / (n - 1)
        if min((kcorr - 1), (rcorr - 1)) == 0:
            warnings.warn(
                "Unable to calculate Cramer's V using bias correction. Consider using bias_correction=False",
                RuntimeWarning)
            return np.nan
        else:
            V = np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
            # report the chi-squared test p-value; the original t-based
            # expression evaluated to a constant independent of the data
            return V, p_value
    else:
        V = np.sqrt(phi2 / min(k - 1, r - 1))
        return V, p_value
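A usage sketch on toy categorical data (assumes crosstab above is pandas.crosstab and scipy.stats.chi2_contingency is imported as in the snippet):

import pandas as pd

x = pd.Series(["a", "a", "b", "b", "a", "b", "a", "b"])
y = pd.Series(["u", "u", "v", "v", "u", "v", "v", "u"])
V, p = kramers_v(x, y, bias_correction=False)
print(V, p)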
Example #8
 def pearsonr(self, x, y):
     n = len(x)
     if(n == 0):
         return [None, None]
     sum_x1 = 0
     sum_x2 = 0
     for i in x:
         sum_x1 += float(i)
         sum_x2 += float(i)**2
     sum_y1 = 0
     sum_y2 = 0
     for i in y:
         sum_y1 += float(i)
         sum_y2 += float(i)**2
     f1 = 0
     for i in range(n):
         f1 += float(x[i])*float(y[i])
     f1 = f1 * n
     f1 = f1 - sum_x1 * sum_y1
     f21 = (n * sum_x2-sum_x1**2)**0.5
     f22 = (n * sum_y2-sum_y1**2)**0.5
     f2 = f21 * f22
     r = f1 / f2
     r = round(r, 6)
     if(r == 1 or r == -1):
         p = 0
     else:
         T = r * ((n-2)/(1-r**2))**0.5
         p = t.sf(abs(T), (n-2)) * 2
         p = round(p, 6)
         print(p)
     return [r, p]
Example #9
def dunnetts_post_hoc(X0, X, alpha):
    Y = [X0, *X]
    p = len(X)
    N_i = [len(y) for y in Y]
    # s^2 = Sum(Sum((X_ij - |X|)^2))/n
    #n = sum(N_i) - (p+1)
    n = np.sum(N_i) - (p + 1)  # degrees of freedom
    s_num = np.sum([np.sum((np.asarray(x) - np.mean(x))**2) for x in Y])
    s = np.sqrt(s_num / n)

    N = [len(x) for x in X]
    m0 = np.mean(X0)
    N0 = len(X0)
    t_cv = t.ppf(1 - (alpha / 2),
                 n)  # get the 2-tailed critical value from the t-distribution
    CI = []
    P = []
    for x, Ni in zip(X, N):
        mx = np.mean(x)
        A0 = t_cv * s * np.sqrt(1 / Ni + 1 / N0)
        Ai = np.abs(mx - m0)
        Ti = Ai / (s * np.sqrt(1 / Ni + 1 / N0))
        Pi = t.sf(Ti, n)
        P.append(Pi)
        CI.append((Ai - A0, Ai + A0))

    return CI, P
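A usage sketch comparing a control group against two treatments (toy data; numpy and scipy.stats.t assumed imported as in the snippet):

X0 = [10.1, 9.8, 10.3, 10.0]           # control
X = [[11.0, 11.2, 10.9, 11.4],         # treatment 1
     [9.9, 10.2, 10.0, 9.7]]           # treatment 2
CI, P = dunnetts_post_hoc(X0, X, alpha=0.05)
print(CI)  # confidence intervals on |mean_i - mean_0|
print(P)   # one-sided p-values per comparison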
Example #10
def main(feature_set):
    coef_list = []
    for iteration in range(MAX_ITERATIONS):
        print('iteration: %d\r' % (iteration + 1), end='')
        x_train, x_test, y_train, y_test = get_regression_dataset(
            0.6, feature_set=feature_set)
        # x_train, x_test = x_train[feature_set], x_test[feature_set]
        lr = LinearRegression()
        lr.fit(x_train, y_train)
        coef_list.append(lr.coef_)

    coef_list = np.array(coef_list)
    se = np.std(coef_list, 0) / np.sqrt(MAX_ITERATIONS)
    t = np.mean(coef_list, 0) / se
    pvalue = t_table.sf(np.fabs(t), len(t) - 1) * 2
    coef_list = np.mean(coef_list, 0)

    print('\n\n{:25s}   {:s}         {:s}  {:s}     {:s}'.format(
        'Field', 'COEF', 'Standard Error', 't-Statistics', 'P-value'))
    print('================================================================================')
    for values in zip(feature_set, coef_list, se, t, pvalue):
        print('{:25s}   {:3.4f} \t    {:3.4f} \t    {:3.4f} \t  {:3.6f}'.format(
            *values))
    print('\n')
    print_errors(lr,
                 x_train,
                 y_train.values,
                 x_test,
                 y_test.values,
                 msg='Full Features')
Example #11
def t_equal_var(n1, m1, var1, n2, m2, var2):
    temp = ((n1 - 1) * var1 +
            (n2 - 1) * var2) / (n1 + n2 - 2) * (1 / n1 + 1 / n2)
    _t = (m1 - m2) / np.sqrt(temp)
    _v = n1 + n2 - 2
    _p = t.sf(_t, _v)
    return _t, _v, _p
Example #12
def compute_corrected_ttest(differences, df, n_train, n_test):
    """Computes right-tailed paired t-test with corrected variance.

    Parameters
    ----------
    differences : array-like of shape (n_samples, 1)
        Vector containing the differences in the score metrics of two models.
    df : int
        Degrees of freedom.
    n_train : int
        Number of samples in the training set.
    n_test : int
        Number of samples in the testing set.

    Returns
    -------
    t_stat : float
        Variance-corrected t-statistic.
    p_val : float
        Variance-corrected p-value.
    """
    mean = np.mean(differences)
    std = corrected_std(differences, n_train, n_test)
    t_stat = mean / std
    p_val = t.sf(np.abs(t_stat), df)  # right-tailed t-test
    return t_stat, p_val
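corrected_std is not defined in this snippet; in the scikit-learn documentation example this docstring comes from, it applies the Nadeau-Bengio variance correction for cross-validated score differences. A plausible sketch (an assumption about its body, not verbatim library code):

import numpy as np

def corrected_std(differences, n_train, n_test):
    # Inflate the naive variance of the k score differences by
    # (1/k + n_test/n_train) to account for overlapping training sets
    # (Nadeau & Bengio correction; assumed implementation).
    kr = len(differences)
    corrected_var = np.var(differences, ddof=1) * (1 / kr + n_test / n_train)
    return np.sqrt(corrected_var)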
Example #13
def get_correlation_parallel(s1, s2):
    """
    params s1 - series 1
    params s2 - series 2 
    NOTE : series are number 1 to 25 when giving in arguments
    returns the correlation between series
    """
    start = time.time()
    offsets = []  #this will be the arguments to all the parallel jobs
    instances = MAX_ROWS // BATCH_SIZE  # Pool needs an integer count
    mean, std = calculate_mean_std_parallel()
    stripped_mean, stripped_std = calculate_stripped_mean_std_parallel(
        mean, std)
    processes = Pool(processes=instances)
    for i in range(instances):
        offsets.append(
            (s1, s2, mean, std, stripped_mean, stripped_std, i * BATCH_SIZE))
    results = processes.map(get_correlation, offsets)
    processes.close()
    processes.join()
    pearson_corr = 0
    total = 0
    for result in results:
        pearson_corr += result[0] * result[1]
        total += result[1]
    pearson_corr = 1.0 * pearson_corr / total
    t_value = abs(pearson_corr * math.sqrt(1.0 * (total - 2) /
                                           (1 -
                                            (pearson_corr * pearson_corr))))
    p_value = t.sf(t_value, total - 2)
    print("\n ######### CORRELATION BETWEEN SERIES", s1, "AND SERIES", s2,
          "is", pearson_corr, "t value is", t_value, "and p value is",
          p_value, "######### \n")
    end = time.time()
    print("EXECUTION TIME :", end - start, "sec")
    return pearson_corr
Example #14
 def wrapper(*args, **kwargs) -> Tuple[float, float]:
     sample_dist = func(*args, **kwargs)
     estimate = sample_dist.mean()
     std_err_estimate = sample_dist.std()
     n_samples = len(sample_dist)
     return estimate, 2 * t.sf(
         x=abs(estimate), df=n_samples - 2, loc=0, scale=std_err_estimate)
Example #15
 def ttest_1samp(self, a, popmean):
     n = len(a)
     mean = self.mean(a)
     
     t = (mean-popmean)/(self.stan_de(a, mean)/(n**0.5))
     p = 2*T.sf(abs(t),n-1)
     return [round(t,6),round(p,6)]
Example #16
    def pearsonr(self, x, y):
        n = len(x)
        if (n == 0):
            return [None, None]

        sum_x = sum(x)
        sum_y = sum(y)

        sum_xy = 0.0
        sum_x2 = 0.0
        sum_y2 = 0.0
        for xi, yi in zip(x, y):
            sum_xy += xi * yi
            sum_x2 += xi**2
            sum_y2 += yi**2

        z = ((n * sum_x2 - (sum_x)**2) * (n * sum_y2 - (sum_y)**2))**0.5
        if (z == 0):
            return [None, 0]

        r = (n * sum_xy - sum_x * sum_y) / z
        if (abs(r) == 1):
            return [r, 0]

        tvalue = r * ((n - 2) / (1 - r**2))**0.5
        p = 2 * t.sf(x=abs(tvalue), df=n - 2)
        return (round(r, 6), round(p, 6))
Example #17
    def get_local_air_quality_comparison(self, city_str, tolerance=2.0):
        self.city_str = city_str
        token = "fe269bc83b983ff958090f5808afa12eed57f14f"
        req_data = get_request_data(self.base_url + self.city_str +
                                    "/?token=" + token)

        lat, lng = req_data['data']['city']['geo']

        latlngbx = str(lat) + "," + str(lng) + "," + str(
            lat + tolerance) + "," + str(lng + tolerance)
        r = requests.get(
            "https://api.waqi.info/" +
            f"/map/bounds/?latlng={latlngbx}&token={token}").json()
        if len(r['data']) > 0:
            local_df = make_dataframe(r)
            air_quality_comp = {
                'deviation': 'Not found',
                'probability': 'Not found'
            }

            deviation = local_df[local_df['name'].str.contains(
                city_str)]['aqi'].mean() - local_df['aqi'].mean()

            if not np.isnan(deviation):
                air_quality_comp['deviation'] = deviation

            probability = one_samp_t_test(
                local_df[local_df['name'].str.contains(city_str)], deviation)
            probability = t.sf(np.abs(probability), local_df.count() - 1)[0]

            if not np.isnan(probability):
                air_quality_comp['probability'] = probability

            return air_quality_comp
Example #18
 def calculate_t_p_error_stats(self):
     self.rating_dict = {.05:"*",
                    .01:"**",
                    .001: "***"}
     results = self.estimates
     stat_sig_names = ["SE", "t-stat", "p-value"]
     for stat_name in stat_sig_names: 
         results[stat_name] = np.nan
     #generate statistic for each variable
     for var in self.beta_names:
         #SE of coefficient is found in the diagonal of cov_matrix
         results.loc[var, "SE"] = self.cov_matrix[var][var] ** (1/2)
         #tstat = Coeff / SE
         results.loc[var, "t-stat"] = \
             results["Coefficient"][var] / results["SE"][var]
         #p-value is estimated using a table that transforms the t-value
         #in reference to df
         results.loc[var, "p-value"] = np.round(t.sf(np.abs(
             results.loc[var, "t-stat"]), self.degrees_of_freedom + 1) * 2, 5)
     #values for significance will be blank unless p-value < .05
     #pandas does not allow np.nan values or default blank strings to be replaced
     significance = ["" for i in range(len(self.beta_names))]   
     for i in range(len(self.beta_names)):
         var = self.beta_names[i]
         for val in self.rating_dict:
             if results.loc[var]["p-value"] < val:
                 significance[i] = self.rating_dict[val]
                 print(var, self.rating_dict[val])  
     results["significance"] = significance
Example #19
 def calculate_t_p_error_stats(self):
     est = ["SE", "t-stat", "p-value", "p-rating"]
     rating_dict = {.001:"***",
                    .01:"**",
                    .05:"*"}
     for name in est: 
         results = self.estimates
         results[name] = np.nan
         for var in self.beta_names:
             if name == "SE":
                 # SE of coefficient is found in the diagonal of cov_matrix
                 results.loc[var, name] = \
                     self.cov_matrix[var][var] ** (1/2)
             if name == "t-stat":
                 # tstat = Coef / SE
                 results.loc[var, name] = \
                     results.loc[var, "Coefficient"] / results.loc[var, "SE"]
             if name == "p-value":
                 # the p-value is estimated from the location within a
                 # distribution implied by the t-stat
                 results.loc[var, name] = round(t.sf(
                     np.abs(results.loc[var, "t-stat"]),
                     self.degrees_of_freedom + 1) * 2, 5)
             if name == "p-rating":
                 print(name)
                 for val in rating_dict:
                     if results.loc[var, "p-value"] < val:
                         results.loc[var, name] = rating_dict[val]
                         break
                     # if p-value > .05, no break in the for-loop: set ""
                     results.loc[var, name] = ""
Example #20
    def _p_value_raw(self):
        """Returns the raw p values."""
        from scipy.stats import t

        result = [2 * t.sf(a, b) for a, b in zip(np.fabs(self._t_stat_raw), self._df_resid_raw)]

        return np.array(result)
Example #21
    def pearsonr(self, x, y):
        n = len(x)
        if (n == 0):
            return [None, None]
        else:
            sumX = 0
            sumY = 0
            sumX2 = 0
            sumY2 = 0

            sumX = self.getSum(x)
            sumX2 = self.getSum2(x)
            sumY = self.getSum(y)
            sumY2 = self.getSum2(y)

            xy = 0
            for i in range(n):
                xy += float(x[i]) * float(y[i])
            f1 = n * xy - sumX * sumY
            f21 = (n * sumX2 - sumX**2)**0.5
            f22 = (n * sumY2 - sumY**2)**0.5
            f2 = f21 * f22
            if (f2 == 0):
                return [None, None]
            r = f1 / f2
            r = round(r, 6)

            if (r == 1 or r == -1):
                p = 0
            else:
                T = r * ((n - 2) / (1 - r**2))**0.5
                p = t.sf(abs(T), n - 2) * 2
                p = round(p, 6)

            return [r, p]
Example #22
 def pearsonr(self, x, y):
     sx = 0.0
     sy = 0.0
     sxy = 0.0
     sxx = 0.0
     syy = 0.0
     if len(x) == 0 or len(y) == 0:
         return [None, None]
     if len(x) != len(y):
         return [None, None]
     n = len(x)
     for i in range(0, n):
         sx += x[i]
         sy += y[i]
         sxy += x[i] * y[i]
         sxx += x[i]**2
         syy += y[i]**2
     rxy = (n * sxy - sx * sy) / ((n * sxx - sx**2) *
                                  (n * syy - sy**2))**0.5
     v = (1 - rxy**2)
     if v == 0:
         return [round(rxy, 6), 0.000000]
     t = rxy * (((n - 2) / (1 - rxy**2))**0.5)
     p = T.sf(t, n - 2)
     if p > 0.5:
         p = 2 * (1 - p)  # two-sided: double the smaller tail
     else:
         p = 2 * p
     return [round(rxy, 6), round(p, 6)]
Example #23
 def pearsonr(self, x, y):
     n = len(x)
     if n == 0:
         return [None, None]

     sum_x = sum(x)
     sum_y = sum(y)

     sum_xy = 0.0
     sum_x2 = 0.0
     sum_y2 = 0.0
     for xi, yi in zip(x, y):
         sum_xy += xi * yi
         sum_x2 += xi**2
         sum_y2 += yi**2

     z = ((n * sum_x2 - sum_x**2) * (n * sum_y2 - sum_y**2))**0.5
     if z == 0:
         return [None, 0]

     r = (n * sum_xy - sum_x * sum_y) / z
     if abs(r) == 1:
         return [r, 0]

     tvalue = r * ((n - 2) / (1 - r**2))**0.5
     p = 2 * t.sf(x=abs(tvalue), df=n - 2)
     return (round(r, 6), round(p, 6))
Example #24
 def calculate_t_p_error_stats(self):
     results = self.estimates
     stat_sig_names = ["SE", "t-stat", "p-value"]
     # create space in data frame for SE, t, and p
     for stat_name in stat_sig_names:
         results[stat_name] = np.nan
     # generate statistic for each variable
     for var in self.beta_names:
         # the squared SE of each coefficient sits on the diagonal of
         # the cov_matrix
         results.loc[var, "SE"] = self.cov_matrix[var][var] ** (1/2)
         # t-stat = Coef / SE
         results.loc[var, "t-stat"] = \
             results["Coefficient"][var] / results["SE"][var]
         # the p-value is estimated using a table that transforms the
         # t-stat in light of the degrees of freedom;
         # 2 is for a 2-tailed test, 5 rounds to 5 decimal places
         results.loc[var, "p-value"] = np.round(
             t.sf(np.abs(results.loc[var, "t-stat"]),
                  self.degrees_of_freedom + 1) * 2, 5)
     ratings = [.05, .01, .001]
     significance = ["" for name in self.beta_names]
     for i in range(len(self.beta_names)):
         var = self.beta_names[i]
         for rating in ratings:
             if results.loc[var]["p-value"] < rating:
                 significance[i] = significance[i] + "*"
     results["significance"] = significance
Example #25
 def different_stdev(self, alpha):
     t0 = (self.y1 - self.y2) / (np.sqrt(self.S1**2/self.n1 +
                                         self.S2**2/self.n2))
     # hypothesis testing2
     n1, n2, y1, y2, S1, S2 = self.n1, self.n2, self.y1, self.y2, self.S1, self.S2
     df = int((S1**2/n1+S2**2/n2)**2/((S1**2/n1)**2/(n1-1)+(S2**2/n2)**2/(n2-1)))
     H1a = t.ppf(1 - alpha/2., df) < np.abs(t0)
     H1b = t.ppf(1 - alpha, df) < t0
     H1c = t.ppf(alpha, df) > t0
     # p-value
     p1a = t.sf(np.abs(t0), df) * 2
     p1b = t.sf(t0, df)
     p1c = t.cdf(t0, df)
     c1 = y1 - y2 - t.ppf(1 - alpha/2., df) * np.sqrt(S1**2/n1+S2**2/n2)        
     c2 = y1 - y2 + t.ppf(1 - alpha/2., df) * np.sqrt(S1**2/n1+S2**2/n2)
     return H1a, H1b, H1c, p1a, p1b, p1c, (c1,c2)
Example #26
    def calculate_t_p_error_stats(self):
        ratings = [.05, .01, .001]
        results = self.estimates
        stat_sig_names = ["SE", "t-stat", "p-value"]
        # create space in data frame for SE, t, and p
        for stat_name in stat_sig_names:
            results[stat_name] = np.nan
        # generate statistic for each variable
        for var in self.beta_names:
            # the squared SE of each coefficient sits on the diagonal
            # of cov_matrix
            results.loc[var, "SE"] = self.cov_matrix[var][var]**(1 / 2)

            # t-stat = Coef / SE
            results.loc[var, "t-stat"] = \
                results["Coefficient"][var] / results["SE"][var]
            # the p-value is estimated using a table that transforms the
            # t-value in light of the degrees of freedom
            results.loc[var, "p-value"] = np.round(
                t.sf(np.abs(results.loc[var, "t-stat"]),
                     self.degrees_of_freedom + 1) * 2, 5)
        # values for significances will be blank unless p-values < .05
        # pandas does not allow np.nan values or default blank strings to
        # be replaced x-post
        significance = ["" for i in range(len(self.beta_names))]
        for i in range(len(self.beta_names)):
            var = self.beta_names[i]
            for val in ratings:
                if results.loc[var]["p-value"] < val:
                    significance[i] = significance[i] + "*"
        results["significance"] = significance
Example #27
def get_correlation_parallel(s1,s2):
    """
    params s1 - series 1
    params s2 - series 2 
    NOTE : series are number 1 to 25 when giving in arguments
    returns the correlation between series
    """
    start = time.time()
    offsets = [] #this will be the arguments to all the parallel jobs
    instances = MAX_ROWS // BATCH_SIZE  # Pool needs an integer count
    mean,std = calculate_mean_std_parallel()
    stripped_mean,stripped_std = calculate_stripped_mean_std_parallel(mean,std)
    processes = Pool(processes=instances)
    for i in range(instances):
        offsets.append((s1,s2,mean,std,stripped_mean,stripped_std,i*BATCH_SIZE))
    results = processes.map(get_correlation,offsets)
    processes.close()
    processes.join()
    pearson_corr = 0
    total = 0
    for result in results:
        pearson_corr += result[0]*result[1]
        total += result[1]
    pearson_corr = 1.0*pearson_corr / total
    t_value = abs(pearson_corr*math.sqrt( 1.0*(total - 2) / ( 1 - (pearson_corr*pearson_corr))))
    p_value = t.sf(t_value,total-2)
    print("\n ######### CORRELATION BETWEEN SERIES", s1, "AND SERIES", s2,
          "is", pearson_corr, "t value is", t_value, "and p value is",
          p_value, "######### \n")
    end = time.time()
    print("EXECUTION TIME :", end - start, "sec")
    return pearson_corr
Example #28
def lag_linregress_3D(x, y, lagx=0, lagy=0):
    """
    Input: Two xr.Datarrays of any dimensions with the first dim being time. 
    Thus the input data could be a 1D time series, or for example, have three 
    dimensions (time,lat,lon). 
    Datasets can be provided in any order, but note that the regression slope 
    and intercept will be calculated for y with respect to x.
    Output: Covariance, correlation, regression slope and intercept, p-value, 
    and standard error on regression between the two datasets along their 
    aligned time dimension.  
    Lag values can be assigned to either of the data, with lagx shifting x, and
    lagy shifting y, with the specified lag amount. 
    """
    #1. Ensure that the data are properly aligned to each other.
    x, y = xr.align(x, y)

    #2. Add lag information if any, and shift the data accordingly
    if lagx != 0:

        # If x lags y by 1, x must be shifted 1 step backwards.
        # But as the 'zero-th' value is nonexistent, xr assigns it as invalid
        # (nan). Hence it needs to be dropped.
        x = x.shift(time=-lagx).dropna(dim='time')

        # Next important step is to re-align the two datasets so that y adjusts
        # to the changed coordinates of x
        x, y = xr.align(x, y)

    if lagy != 0:
        y = y.shift(time=-lagy).dropna(dim='time')
        x, y = xr.align(x, y)

    #3. Compute data length, mean and standard deviation along time axis:
    n = y.notnull().sum(dim='time')
    xmean = x.mean(axis=0)
    ymean = y.mean(axis=0)
    xstd = x.std(axis=0)
    ystd = y.std(axis=0)

    #4. Compute covariance along time axis
    cov = np.sum((x - xmean) * (y - ymean), axis=0) / (n)

    #5. Compute correlation along time axis
    cor = cov / (xstd * ystd)

    #6. Compute regression slope and intercept:
    slope = cov / (xstd**2)
    intercept = ymean - xmean * slope

    #7. Compute P-value and standard error
    #Compute t-statistics
    tstats = cor * np.sqrt(n - 2) / np.sqrt(1 - cor**2)
    stderr = slope / tstats

    from scipy.stats import t
    pval = t.sf(np.abs(tstats), n - 2) * 2  # two-sided, valid for negative cor
    pval = xr.DataArray(pval, dims=cor.dims, coords=cor.coords)

    return cov, cor, slope, intercept, pval, stderr
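A usage sketch on small synthetic (time, lat, lon) arrays (toy data; xarray, numpy, and the function above assumed available):

import numpy as np
import xarray as xr

rng = np.random.default_rng(0)
x = xr.DataArray(rng.random((20, 3, 4)), dims=("time", "lat", "lon"),
                 coords={"time": np.arange(20)})
y = 0.5 * x + 0.1 * rng.random((20, 3, 4))
cov, cor, slope, intercept, pval, stderr = lag_linregress_3D(x, y)
print(slope.shape, pval.shape)  # (3, 4) maps over lat/lon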
Example #29
def bicor(x, y, c=9):
    """
    Biweight midcorrelation.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    c : float
        Tuning constant for the biweight estimator (default = 9.0).

    Returns
    -------
    r : float
        Correlation coefficient.
    pval : float
        Two-tailed p-value.

    Notes
    -----
    This function will return (np.nan, np.nan) if mad(x) == 0 or mad(y) == 0.

    References
    ----------
    https://en.wikipedia.org/wiki/Biweight_midcorrelation

    https://docs.astropy.org/en/stable/api/astropy.stats.biweight.biweight_midcovariance.html

    Langfelder, P., & Horvath, S. (2012). Fast R Functions for Robust
    Correlations and Hierarchical Clustering. Journal of Statistical Software,
    46(11). https://www.ncbi.nlm.nih.gov/pubmed/23050260
    """
    from scipy.stats import t
    # Calculate median
    nx = x.size
    x_median = np.median(x)
    y_median = np.median(y)
    # Raw median absolute deviation
    x_mad = np.median(np.abs(x - x_median))
    y_mad = np.median(np.abs(y - y_median))
    if x_mad == 0 or y_mad == 0:
        # From Langfelder and Horvath 2012:
        # "Strictly speaking, a call to bicor in R should return a missing
        # value if mad(x) = 0 or mad(y) = 0." This avoids division by zero.
        return np.nan, np.nan
    # Calculate weights
    u = (x - x_median) / (c * x_mad)
    v = (y - y_median) / (c * y_mad)
    w_x = (1 - u**2)**2 * ((1 - np.abs(u)) > 0)
    w_y = (1 - v**2)**2 * ((1 - np.abs(v)) > 0)
    # Normalize x and y by weights
    x_norm = (x - x_median) * w_x
    y_norm = (y - y_median) * w_y
    denom = (np.sqrt((x_norm**2).sum()) * np.sqrt((y_norm**2).sum()))
    # Calculate r, t and two-sided p-value
    r = (x_norm * y_norm).sum() / denom
    tval = r * np.sqrt((nx - 2) / (1 - r**2))
    pval = 2 * t.sf(abs(tval), nx - 2)
    return r, pval
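A usage sketch showing the robustness to a single outlier (toy data; numpy assumed as np):

import numpy as np

rng = np.random.default_rng(1)
x = rng.normal(size=100)
y = x + rng.normal(scale=0.5, size=100)
x[0] = 25.0  # extreme outlier; the biweight downweights it
print(bicor(x, y))  # (r, pval); r stays close to the bulk correlation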
Example #30
def percbend(x, y, beta=0.2):
    """
    Percentage bend correlation (Wilcox 1994).
    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    beta : float
        Bending constant for omega (0 <= beta <= 0.5).
    Returns
    -------
    r : float
        Percentage bend correlation coefficient.
    pval : float
        Two-tailed p-value.
    Notes
    -----
    Code inspired by Matlab code from Cyril Pernet and Guillaume Rousselet.
    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395
    .. [2] Pernet CR, Wilcox R, Rousselet GA. Robust Correlation Analyses:
       False Positive and Power Validation Using a New Open Source Matlab
       Toolbox. Frontiers in Psychology. 2012;3:606.
       doi:10.3389/fpsyg.2012.00606.
    """
    X = np.column_stack((x, y))
    nx = X.shape[0]
    M = np.tile(np.median(X, axis=0), nx).reshape(X.shape)
    W = np.sort(np.abs(X - M), axis=0)
    m = int((1 - beta) * nx)
    omega = W[m - 1, :]
    P = (X - M) / omega
    P[np.isinf(P)] = 0
    P[np.isnan(P)] = 0

    # Loop over columns
    a = np.zeros((2, nx))
    for c in [0, 1]:
        psi = P[:, c]
        i1 = np.where(psi < -1)[0].size
        i2 = np.where(psi > 1)[0].size
        s = X[:, c].copy()
        s[np.where(psi < -1)[0]] = 0
        s[np.where(psi > 1)[0]] = 0
        pbos = (np.sum(s) + omega[c] * (i2 - i1)) / (s.size - i1 - i2)
        a[c] = (X[:, c] - pbos) / omega[c]

    # Bend
    a[a <= -1] = -1
    a[a >= 1] = 1

    # Get r, tval and pval
    a, b = a
    r = (a * b).sum() / np.sqrt((a ** 2).sum() * (b ** 2).sum())
    tval = r * np.sqrt((nx - 2) / (1 - r ** 2))
    pval = 2 * t.sf(abs(tval), nx - 2)
    return r, pval
Example #31
    def _p_values(self):
        """
        Return the model's coefficient/parameter p-values.

        :return: Numpy array
        """
        p_vals = [t.sf(abs(x), self.deg_of_freedom)*2 for x in self.t_statistics]
        return p_vals
Example #32
    def equal_stdev(self, alpha):
        n1, n2, y1, y2 = self.n1, self.n2, self.y1, self.y2
        Sp = np.sqrt( ((n1 - 1)*self.S1**2 +
                       (n2 - 1)*self.S2**2) / (n1 + n2 - 2) )
        t0 = (y1 - y2) / (Sp * np.sqrt(1./n1 + 1./n2))

        # hypothesis testing2
        H1a = t.ppf(1 - alpha/2., n1 + n2 -2) < np.abs(t0)
        H1b = t.ppf(1 - alpha, n1 + n2 -2) < t0
        H1c = t.ppf(alpha, n1 + n2 -2) > t0
        # p-value
        p1a = t.sf(np.abs(t0), n1 + n2 -2) * 2
        p1b = t.sf(t0, n1 + n2 -2)
        p1c = t.cdf(t0, n1 + n2 -2)
        c1 = y1 - y2 - t.ppf(1 - alpha/2., n1 + n2 -2) * Sp * np.sqrt(1./n1+1./n2)        
        c2 = y1 - y2 + t.ppf(1 - alpha/2., n1 + n2 -2) * Sp * np.sqrt(1./n1+1./n2)
        return H1a, H1b, H1c, p1a, p1b, p1c, (c1,c2)
Example #33
 def _compute_pvalue(self):
     """Returns the p-value."""
     if self.test_statistic_name == "t":
         if self.side == "less_than":
             return student_t.cdf(self.test_statistic, self.deg_of_freedom)
         elif self.side == "greater_than":
             return student_t.sf(self.test_statistic, self.deg_of_freedom)
         else:  # side is "not_equal"
             return 2 * student_t.sf(abs(self.test_statistic),
                                     self.deg_of_freedom)
     elif self.test_statistic_name == "z":
         if self.side == "less_than":
             return norm.cdf(self.test_statistic)
         elif self.side == "greater_than":
             return norm.sf(self.test_statistic)
         else:  # side is "not_equal"
             return 2 * norm.sf(abs(self.test_statistic))
Example #34
    def t_test(cat):
        p = cat['avg_x']*cat['cnt_x']+cat['avg_y']*cat['cnt_y']
        p = p/(cat['cnt_x']+cat['cnt_y'])
        p += 1e-8
        z = (cat['avg_x']-cat['avg_y']) / np.sqrt((p*(1-p)*(1/cat['cnt_x']+1/cat['cnt_y'])))
        p_value = t.sf(abs(z), df=cat['cnt_x']+cat['cnt_y']-2)*2

        return p_value
Example #35
def t_tests_on_mean(mu_0, x_var, s, n, alpha, power=None):
    print("Two-Sided t-Test - H_0 : μ = {} vs H_A : μ ≠ {}".format(mu_0, mu_0))
    print("with x_var {}, s {}, n {}, α {} :\n".format(x_var, s, n, alpha))

    t_statistic = (x_var - mu_0) / (s / sqrt(n))
    p_value = t.sf(np.abs(t_statistic), n - 1) * 2
    print("t-statistic : {:.4f}, p-value : 2P(T>=|t|) = {:.3f}".format(
        t_statistic, p_value))
    print("The null hypothesis is {}".format(
        "Accepted" if p_value > alpha else "Rejected"))
    print(
        "========================================================================"
    )

    print("One-Sided t-Test - H_0 : μ <= {} vs H_A : μ > {}".format(
        mu_0, mu_0))
    print("with x_var {}, s {}, n {}, α {} :\n".format(x_var, s, n, alpha))

    p_value = t.sf(t_statistic, n - 1)
    print("t-statistic : {:.4f}, p-value : P(T>=t) = {:.3f}".format(
        t_statistic, p_value))
    print("The null hypothesis is {}".format(
        "Accepted" if p_value > alpha else "Rejected"))
    print(
        "========================================================================"
    )

    print("One-Sided t-Test - H_0 : μ >= {} vs H_A : μ < {}".format(
        mu_0, mu_0))
    print("with x_var {}, s {}, n {}, α {} :\n".format(x_var, s, n, alpha))

    p_value = 1 - t.sf(t_statistic, n - 1)
    print("t-statistic : {:.4f}, p-value : P(T<=t) = {:.3f}".format(
        t_statistic, p_value))
    print("The null hypothesis is {}".format(
        "Accepted" if p_value > alpha else "Rejected"))
    print(
        "========================================================================"
    )

    if power is not None:
        raise NotImplementedError
        print("Power >= {} requires n >= {}".format(power, 1))
        print(
            "========================================================================"
        )
Example #36
def welch(n1, m1, var1, n2, m2, var2, alpha=0.05):
    _t = (m1 - m2) / np.sqrt(var1 / n1 + var2 / n2)
    var1_SE, var2_SE = var1 / n1, var2 / n2

    _v = (var1_SE + var2_SE)**2 / (var1_SE**2 / (n1 + 1) + var2_SE**2 /
                                   (n2 + 1)) - 2
    _p = t.sf(_t, _v)
    return _t, _v, _p
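This helper returns a one-sided p-value; a quick cross-check against scipy's Welch test from summary statistics (toy numbers; note the function uses Welch's original 1947 degrees-of-freedom formula, so _v differs slightly from scipy's Satterthwaite value):

import numpy as np
from scipy.stats import ttest_ind_from_stats

n1, m1, var1, n2, m2, var2 = 30, 5.2, 1.4, 25, 4.8, 2.1
t_ref, p_ref = ttest_ind_from_stats(m1, np.sqrt(var1), n1,
                                    m2, np.sqrt(var2), n2, equal_var=False)
print(welch(n1, m1, var1, n2, m2, var2))
print(t_ref, p_ref / 2)  # halved two-sided ~ one-sided, for positive t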
Example #37
def satterthwaite(n1, m1, var1, n2, m2, var2, alpha=0.05):
    _t = (m1 - m2) / np.sqrt(var1 / n1 + var2 / n2)
    var1_SE, var2_SE = var1 / n1, var2 / n2

    _v = (var1_SE + var2_SE)**2 / (var1_SE**2 / (n1 - 1) + var2_SE**2 /
                                   (n2 - 1))
    _p = t.sf(_t, _v)
    return _t, _v, _p
Example #38
 def pearsonr(self, x, y):
     n = len(x)
     if n == 0:
         return [None, None]
     r = (n * self.proSum(x, y) - self.summary(x) * self.summary(y)) / \
         (((n * self.squareSum(x) - self.summary(x)**2) *
           (n * self.squareSum(y) - self.summary(y)**2))**0.5)
     t = r * (float(n - 2) / (1 - r**2))**0.5
     p = 2 * T.sf(abs(t), n - 2)
     return [round(r, 6), round(p, 6)]
Example #39
def full_glm_results(endog_arr,
                     exog_vars,
                     return_resids=False,
                     only_tvals=False,
                     PCA_whiten=False,
                     ZCA_whiten=False,
                     orthogonalize=True,
                     orthogNear=False,
                     orthog_GramSchmidt=False):
    if np.mean(exog_vars[:, 0]) != 1:
        print(
            "Warning: the intercept is not included as the first column in your exogenous variable array"
        )
    n, num_depv = endog_arr.shape
    k = exog_vars.shape[1]

    if orthogonalize:
        exog_vars = sm.add_constant(orthog_columns(exog_vars[:, 1:]))
    elif orthogNear:
        exog_vars = sm.add_constant(ortho_neareast(exog_vars[:, 1:]))
    elif orthog_GramSchmidt:  # for when order matters AKA type 2 sum of squares
        exog_vars = sm.add_constant(gram_schmidt_orthonorm(exog_vars[:, 1:]))
    else:
        pass

    invXX = np.linalg.inv(np.dot(exog_vars.T, exog_vars))

    DFbetween = k - 1  # aka df model
    DFwithin = n - k  # aka df residuals
    DFtotal = n - 1
    if PCA_whiten:
        endog_arr = PCAwhiten(endog_arr)
    if ZCA_whiten:
        endog_arr = ZCAwhiten(endog_arr)

    a = cy_lin_lstsqr_mat(exog_vars, endog_arr)
    sigma2 = np.sum((endog_arr - np.dot(exog_vars, a))**2, axis=0) / (n - k)
    se = se_of_slope(num_depv, invXX, sigma2, k)

    if only_tvals:
        return a / se
    else:
        resids = endog_arr - np.dot(exog_vars, a)
        RSS = np.sum(resids**2, axis=0)
        TSS = np.sum((endog_arr - np.mean(endog_arr, axis=0))**2, axis=0)
        R2 = 1 - (RSS / TSS)

        std_y = np.sqrt(TSS / DFtotal)
        R2_adj = 1 - ((1 - R2) * DFtotal / (DFwithin))
        Fvalues = ((TSS - RSS) / (DFbetween)) / (RSS / DFwithin)
        Tvalues = a / se
        Pvalues = t.sf(np.abs(Tvalues), DFtotal) * 2
        if return_resids:
            fitted = np.dot(exog_vars, a)
            return (Fvalues, Tvalues, Pvalues, R2, R2_adj, np.array(resids),
                    np.array(fitted))
        else:
            return (Fvalues, Tvalues, Pvalues, R2, R2_adj)
Example #40
    def _overlap_output(self, category_names, overlap_matrix, M_annot, M_tot, print_coefficients):
        '''LD Score regression summary for overlapping categories.'''
        overlap_matrix_prop = np.zeros([self.n_annot,self.n_annot])
        for i in range(self.n_annot):
            overlap_matrix_prop[i, :] = overlap_matrix[i, :] / M_annot

        prop_hsq_overlap = np.dot(
            overlap_matrix_prop, self.prop.T).reshape((1, self.n_annot))
        prop_hsq_overlap_var = np.diag(
            np.dot(np.dot(overlap_matrix_prop, self.prop_cov), overlap_matrix_prop.T))
        prop_hsq_overlap_se = np.sqrt(
            np.maximum(0, prop_hsq_overlap_var)).reshape((1, self.n_annot))
        one_d_convert = lambda x: np.array(x).reshape(np.prod(x.shape))
        prop_M_overlap = M_annot / M_tot
        enrichment = prop_hsq_overlap / prop_M_overlap
        enrichment_se = prop_hsq_overlap_se / prop_M_overlap
        overlap_matrix_diff = np.zeros([self.n_annot,self.n_annot])
        for i in range(self.n_annot):
            if not M_tot == M_annot[0,i]:
                overlap_matrix_diff[i, :] = overlap_matrix[i,:]/M_annot[0,i] - \
                    (M_annot - overlap_matrix[i,:]) / (M_tot-M_annot[0,i])

        diff_est = np.dot(overlap_matrix_diff,self.coef)
        diff_cov = np.dot(np.dot(overlap_matrix_diff,self.coef_cov),overlap_matrix_diff.T)
        diff_se = np.sqrt(np.diag(diff_cov))
        diff_p = [np.nan if diff_se[i]==0 else 2*tdist.sf(abs(diff_est[i]/diff_se[i]),self.n_blocks) \
            for i in range(self.n_annot)]
        
        coef_z = []
        for i in range(self.n_annot):
            if one_d_convert(self.coef)[i]==0 and one_d_convert(self.coef_se)[i]==0:
                coef_z.append(0)
            elif one_d_convert(self.coef_se)[i]==0:
                coef_z.append('NA')
            else:
                coef_z.append(one_d_convert(self.coef)[i] / one_d_convert(self.coef_se)[i])

        df = pd.DataFrame({
            'Category': category_names,
            'Prop._SNPs': one_d_convert(prop_M_overlap),
            'Prop._h2': one_d_convert(prop_hsq_overlap),
            'Prop._h2_std_error': one_d_convert(prop_hsq_overlap_se),
            'Enrichment': one_d_convert(enrichment),
            'Enrichment_std_error': one_d_convert(enrichment_se),
            'Enrichment_p':diff_p,
            'Coefficient': one_d_convert(self.coef),
            'Coefficient_std_error': self.coef_se,
            'Coefficient_z-score': coef_z
        })
        if print_coefficients:
            df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                    'Enrichment','Enrichment_std_error', 'Enrichment_p',
                     'Coefficient', 'Coefficient_std_error','Coefficient_z-score']]
        else:
            df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                    'Enrichment','Enrichment_std_error', 'Enrichment_p']]
        return df
Example #41
    def reducer(self, key, info):
        '''
        Calculates all the values we will need for simple linear regression 
        analysis, and does the analysis itself.
        '''
        # not the most efficient, but we want to keep these values
        # to calculate standard errors
        info = list(info)

        # calculate sums
        sumx, sumy, sumxx, sumyy, sumxy, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
        for (x, y) in info:
            sumx += x
            sumy += y
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y
            n += 1

        # calculate correlation
        corr = 0
        corr_denom = math.sqrt((n * sumxx - sumx**2) * (n * sumyy - sumy**2))
        if corr_denom < 0.0001:
            yield False, "Could not calculate coefficients"
            return

        corr_num = n * sumxy - sumx * sumy 
        corr = corr_num / corr_denom

        if abs(corr) < 0.0001:
            yield False, "Could not calculate coefficients"
            return

        # calculate regression coefficients
        beta1 = (sumxy - sumx * sumy / n) / (sumxx - sumx**2 / n)
        beta0 = (sumy - beta1 * sumx) / n

        # calculate standard errors
        # note: this is the reason why this isn't in a regression class
        x_reals = [x for (x, y) in info]
        y_reals = [y for (x, y) in info]
        y_hats = [beta0 + beta1 * x for x in x_reals]
        s_num = sum((y - yhat)**2 for (y, yhat) in zip(y_reals, y_hats))
        s = math.sqrt(s_num / (n - 2))

        se_denom = n * sumxx - sumx**2
        # se_beta0 = s * math.sqrt(sumxx / se_denom)
        se_beta1 = s * math.sqrt(n / se_denom)

        # calculate t-values
        # t0 = beta0 / se_beta0
        t1 = beta1 / se_beta1

        # calculate 2-sided p-values
        # alpha = 0.05
        # t_stat = t.ppf(1 - alpha/2, n - 2)
        # beta0_p_value = t.sf(abs(t0), n - 2) * 2
        beta1_p_value = t.sf(abs(t1), n - 2) * 2

        yield None, (beta1, beta1_p_value, corr)
Example #42
 def ttest_1samp(self, a, popmean):
     mean, s = 0.0, 0.0
     mean = sum(a) / float(len(a))
     for i in a:
         s += (i - mean)**2
     s /= (len(a) - 1)
     s = s**0.5
     T = (mean - popmean) / (s / (len(a))**0.5)
     P = t.sf(abs(T), len(a) - 1) * 2
     return [round(T, 6), round(P, 6)]
Example #43
def significantly_different_genes(
        rpkm_table,
        experiment_groups,
        intergroups,
        target_p_value=0.05):
    """
    Performs a test that uses the error function to determine if we can reject the hypothesis that
    all the genes are sampled from the same distribution.

    :param rpkm_table: table of the rpkm values
    :param experiment_groups: groups on indexes
    :param intergroups: the groups between which we want to do the comparisons
    :param target_p_value: p_value with which we want to be able to reject the null hypothesis
    """
    groups_means = np.zeros((rpkm_table.shape[0], len(experiment_groups)))
    groups_var = np.zeros((rpkm_table.shape[0], len(experiment_groups)))

    for i, group in enumerate(experiment_groups):
        groups_means[:, i] = np.mean(rpkm_table[:, group], axis=1)
        groups_var[:, i] = np.var(rpkm_table[:, group], axis=1) / \
            estimator_dilatation_table[len(group)] ** 2

    group_comparison = []
    for bi_group in intergroups:
        groups_mean_difference = np.fabs(
            groups_means[
                :,
                bi_group[0]] -
            groups_means[
                :,
                bi_group[1]])
        groups_combined_std = np.sqrt(
            groups_var[
                :,
                bi_group[0]] +
            groups_var[
                :,
                bi_group[1]])
        p_val = t.sf(groups_mean_difference /
                     groups_combined_std, (len(experiment_groups[bi_group[0]]) +
                                           len(experiment_groups[bi_group[1]])) /
                     2)
        sorted_p_vals = np.sort(p_val, axis=0)
        lower_index = np.array(range(0, sorted_p_vals.shape[0])) *\
            target_p_value / sorted_p_vals.shape[0]
        pre_filter_mask = sorted_p_vals <= lower_index
        filter_mask = pre_filter_mask
        if np.any(pre_filter_mask):
            refined_threshold = np.max(sorted_p_vals[pre_filter_mask])
            filter_mask = p_val < refined_threshold
        group_comparison.append((p_val, filter_mask))

    return group_comparison
Example #44
 def pVal(self):
     p = {}
     for name, sab in self.sab.items():
         ssa = self.ssa[name]
         ssb = self.ssb[name]
         dof = self.dof[name]
         r = CorrCurves.calc(sab, ssa, ssb)
         df = dof - 1
         t = r * np.sqrt(df/(1-r**2))
         rawP = tDist.sf(np.abs(t), df)
         p[name] = CorrCurves.bonferroni(rawP)
         
     return p
Example #45
 def pairedTTest(y1, y2):
     y1, y2 = array(y1), array(y2)
     n = len(y1)
     y_diff = y1 - y2
     y_diff_mean, yfcra_sd = mean(y_diff), std(y_diff)
     t = y_diff_mean / (yfcra_sd / sqrt(n))
     p = spt.sf(np.abs(t), n-1)
     y1_mean, y1_std = mean(y1), std(y1)
     y1_y1z = (y1 - y1_mean) / y1_std
     y2_y1z = (y2 - y1_mean) / y1_std
     #assert mean(y1_y1z) == 0.000, "y1 mean not zero, %.5f" % mean(y1_y1z) #will be close enough to zero
     d = mean(y2_y1z)
     return (t, p, d)
Example #46
 def different_stdev_explicite(self, alpha, y1, y2, S1, S2, n1, n2):
     t0 = (y1 - y2) / (np.sqrt(S1 ** 2 / n1 + S2 ** 2 / n2))
     # hypothesis testing2
     df = int((S1**2/n1+S2**2/n2)**2/((S1**2/n1)**2/(n1-1)+(S2**2/n2)**2/(n2-1)))
     H1a = t.ppf(1 - alpha/2., df) < np.abs(t0)
     H1b = t.ppf(1 - alpha, df) < t0
     H1c = t.ppf(alpha, df) > t0
     # p-value
     p1a = t.sf(np.abs(t0), df) * 2
     p1b = t.sf(t0, df)
     p1c = t.cdf(t0, df)
     c1 = y1 - y2 - t.ppf(1 - alpha/2., df) * np.sqrt(S1**2/n1+S2**2/n2)        
     c2 = y1 - y2 + t.ppf(1 - alpha/2., df) * np.sqrt(S1**2/n1+S2**2/n2)
      CI = np.array([c1, c2])
      print('at the level of significance', alpha, ':')
      print('H1 mu1 != mu2 is', H1a)
      print('H1 mu1 > mu2 is', H1b)
      print('H1 mu1 < mu2 is', H1c)
      print('probability of type I error for mu1 != mu2:', p1a)
      print('probability of type I error for mu1 > mu2:', p1b)
      print('probability of type I error for mu1 < mu2:', p1c)
      print('CI (%.1f%%) for mu1 - mu2:' % (100 - 100 * alpha), CI, CI / y1)
Example #47
 def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
              df_num=None):
     if F is not None:
         self.fvalue = F
         self.df_denom = df_denom
         self.df_num = df_num
         self.pvalue = fdist.sf(F, df_num, df_denom)
     else:
         self.tvalue = t
         self.sd = sd
         self.effect = effect
         self.df_denom = df_denom
         self.pvalue = student_t.sf(np.abs(t), df_denom)
Example #48
def show_correlation_coefficient_stats():
    # Get the values from the entries in the window
    try:
        correlation_coefficient = float(enter_coefficient.get())
    except Exception:
        correlation_coefficient = ''
    try:
        number_of_samples = int(enter_number_of_samples.get())
    except Exception:
        number_of_samples = ''
    try:
        tails = int(enter_tails.get())
    except Exception:
        tails = ''
    try:
        correlation_type = enter_correlation_type.get()
    except Exception:
        correlation_type = ''
    try:
        level_of_significance = float(enter_level_of_significance.get())
    except Exception:
        level_of_significance = ''
    # Fix the values
    if tails == '':
        tails = 2
    if correlation_type == '':
        correlation_type = 'pearson'
    if level_of_significance == '':
        level_of_significance = 0.05
    # Return the alarm
    if (correlation_coefficient == '' or number_of_samples == ''):
        messagebox.showwarning(title="Error", message="Missing critical values!")
    else:
        ####### Calculation of the Student's t-distribution
        degrees_of_freedom = number_of_samples-2
        if correlation_type == "pearson":
            t_value = correlation_coefficient * sqrt((number_of_samples-2)/(1-correlation_coefficient**2))
            # Calculate the one-tail p-value
            p_value = t.sf(t_value, degrees_of_freedom)
            if tails == 1:
                messagebox.showinfo(title="Correlation p-value", message="The p-value for a %s correlation coefficient (of %s) computed on %s samples is: %s" %(correlation_type, correlation_coefficient, number_of_samples, p_value))
            # Calculate the two-tail p-value
            elif tails == 2:
                p_value = p_value*2
                messagebox.showinfo(title="Correlation p-value", message="The p-value for a %s correlation coefficient (of %s) computed on %s samples is: %s" %(correlation_type, correlation_coefficient, number_of_samples, p_value))
            ###################### Significance
            if p_value <= level_of_significance:
                messagebox.showinfo(title="Significance", message="The calculated correlation coefficient IS statistically significant at a level of significance of %s" %(level_of_significance))
            else:
                messagebox.showinfo(title="Significance", message="The calculated correlation coefficient is NOT statistically significant at a level of significance of %s" %(level_of_significance))
Example #49
    def _overlap_output(self, category_names, overlap_matrix, M_annot, M_tot, print_coefficients):
        '''LD Score regression summary for overlapping categories.'''
        overlap_matrix_prop = np.zeros([self.n_annot,self.n_annot])
        for i in range(self.n_annot):
            overlap_matrix_prop[i, :] = overlap_matrix[i, :] / M_annot

        prop_hsq_overlap = np.dot(
            overlap_matrix_prop, self.prop.T).reshape((1, self.n_annot))
        prop_hsq_overlap_var = np.diag(
            np.dot(np.dot(overlap_matrix_prop, self.prop_cov), overlap_matrix_prop.T))
        prop_hsq_overlap_se = np.sqrt(
            np.maximum(0, prop_hsq_overlap_var)).reshape((1, self.n_annot))
        one_d_convert = lambda x: np.array(x).reshape(np.prod(x.shape))
        prop_M_overlap = M_annot / M_tot
        enrichment = prop_hsq_overlap / prop_M_overlap
        enrichment_se = prop_hsq_overlap_se / prop_M_overlap
        overlap_matrix_diff = np.zeros([self.n_annot,self.n_annot])
        for i in range(self.n_annot):
            if not M_tot == M_annot[0,i]:
                overlap_matrix_diff[i, :] = overlap_matrix[i,:]/M_annot[0,i] - \
                    (M_annot - overlap_matrix[i,:]) / (M_tot-M_annot[0,i])

        diff_est = np.dot(overlap_matrix_diff,self.coef)
        diff_cov = np.dot(np.dot(overlap_matrix_diff,self.coef_cov),overlap_matrix_diff.T)
        diff_se = np.sqrt(np.diag(diff_cov))
        diff_p = ['NA' if diff_se[i]==0 else 2*tdist.sf(abs(diff_est[i]/diff_se[i]),self.n_blocks) \
            for i in range(self.n_annot)]

        df = pd.DataFrame({
            'Category': category_names,
            'Prop._SNPs': one_d_convert(prop_M_overlap),
            'Prop._h2': one_d_convert(prop_hsq_overlap),
            'Prop._h2_std_error': one_d_convert(prop_hsq_overlap_se),
            'Enrichment': one_d_convert(enrichment),
            'Enrichment_std_error': one_d_convert(enrichment_se),
            'Enrichment_p':diff_p,
            'Coefficient': one_d_convert(self.coef),
            'Coefficient_std_error': self.coef_se,
            'Coefficient_z-score': one_d_convert(self.coef) / one_d_convert(self.coef_se)
        })
        if print_coefficients:
            df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                    'Enrichment','Enrichment_std_error', 'Enrichment_p',
                     'Coefficient', 'Coefficient_std_error','Coefficient_z-score']]
        else:
            df = df[['Category', 'Prop._SNPs', 'Prop._h2', 'Prop._h2_std_error',
                    'Enrichment','Enrichment_std_error', 'Enrichment_p']]
        return df
Example #50
def pers(x, y):
    assert len(x) == len(y)

    x_bar = mean(x)
    y_bar = mean(y)
    s_x = std(x, ddof=1)
    s_y = std(y, ddof=1)
    tmp = 0.0
    for i in range(0, len(x)):
        tmp += (x[i] - x_bar) * (y[i] - y_bar)

    r = tmp / (len(x) - 1) / s_x / s_y
    if abs(r) == 1:  # r = +/-1 would make the t denominator zero
        return [r, 0]
    tt = r * sqrt((len(x) - 2) / (1 - r ** 2))
    p = t.sf(abs(tt), len(x) - 2) * 2
    return [r, p]
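A quick check against scipy (toy lists; pers above relies on mean/std/sqrt and scipy.stats.t being imported in its module):

from scipy.stats import pearsonr

x = [1.0, 2.0, 3.0, 4.0, 5.0]
y = [2.1, 3.9, 6.2, 8.1, 9.8]
print(pers(x, y))      # [r, p]
print(pearsonr(x, y))  # should agree closely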
Example #51
File: t.py Project: ronrest/pyrpy
def pt(x, df=1, loc=0, scale=1, ncp=None, lowertail=True, log=False):
    """
    The cumulative distribution function for the t distribution.
    You provide a value along the t distribution (eg x=3) or array of
    values, and it returns what proportion of values lie below it (the quantile)

    Alternatively, if you select lowertail=False, it returns the proportion of
    values that are above it.

    ARGS:
    ---------------
    :param x (float, array of floats):
        The values along the distribution.
    :param df (float):
        degrees of freedom
    :param loc: array_like, optional
        location parameter (default=0)
    :param scale: float, optional
        scale (default=1)
    :param ncp (float):
        non-centrality parameter delta.
        Currently not implemented.
    :param lowertail (bool):
        are you interested in what proportion of values lie beneath x? or
        above x (False)?
    :param log (boolean):
        Use log scale?

    RETURN:
    ---------------
    :return:
        an array of quantiles() corresponding to the values in x
    """
    if lowertail and not log:
        return t.cdf(x, df=df, loc=loc, scale=scale)
    elif not lowertail and not log:
        return t.sf(x, df=df, loc=loc, scale=scale)
    elif lowertail and log:
        return t.logcdf(x, df=df, loc=loc, scale=scale)
    else:
        return t.logsf(x, df=df, loc=loc, scale=scale)
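Usage mirrors R's pt() (a sketch; values illustrative, assumes the pt function above is in scope):

print(pt(2.0, df=10))                   # P(T <= 2.0), lower tail, as in R
print(pt(2.0, df=10, lowertail=False))  # upper tail, same as t.sf(2.0, 10)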
Example #52
import numpy as np
from scipy.stats import t


def approximate_MH_accept(mu_0, log_lik, X, batch_size, epsilon,
                          theta_prime, theta_t, N):
    """Sequential approximate Metropolis-Hastings test: grow the minibatch
    until a t-test on the mean log-likelihood difference is conclusive."""
    iteration_number = 0

    while True:
        iteration_number += 1
        n = min(iteration_number * batch_size, N)
        sub = np.random.choice(X, n, replace=False)
        sub = log_lik(sub, theta_prime) - log_lik(sub, theta_t)
        l_hat = np.mean(sub)
        l_2_hat = np.mean(sub ** 2)
        # sample standard deviation with Bessel's correction ...
        s_l = np.sqrt((l_2_hat - l_hat ** 2) * n / (n - 1))
        # ... and standard error with a finite-population correction;
        # at n == N the correction is zero, which forces a decision
        s = s_l / np.sqrt(n) * np.sqrt(1 - (n - 1) / (N - 1))
        t_students_var = (l_hat - mu_0) / s
        stat = np.abs(t_students_var)
        delta = t.sf(stat, n - 1)
        if delta < epsilon:
            if l_hat > mu_0:
                return True, n
            return False, n
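A toy invocation under a Gaussian model; the log-likelihood and data below are
illustrative assumptions, not part of the original project:

import numpy as np

def gauss_log_lik(x, theta):
    # per-point Gaussian log-likelihood with unit variance (up to a constant)
    return -0.5 * (x - theta) ** 2

X = np.random.normal(1.0, 1.0, size=2000)
accept, n_used = approximate_MH_accept(
    mu_0=0.0, log_lik=gauss_log_lik, X=X, batch_size=50,
    epsilon=0.05, theta_prime=1.0, theta_t=0.0, N=len(X))
print(accept, n_used)  # typically True here, using only a fraction of X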
Example #53
 def test_pvalue(self):
     assert_almost_equal(self.Ttest.pvalue, student_t.sf(
                     np.abs(self.res1.tvalues), self.res1.model.df_resid)*2,
                     DECIMAL_4)
Example #54
 def pvalues(self):
 #TODO: same for conditional and unconditional?
     df_resid = self.df_resid
     return t.sf(np.abs(self.tvalues), df_resid) * 2
Example #55
 def pvalues(self):
     return t.sf(np.abs(self.tvalues), self.df_resid)*2
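Both methods above implement the standard two-sided p-value, 2*sf(|t|, df). A
quick numeric check of the identity against the cdf form:

import numpy as np
from scipy.stats import t

tvalues, df_resid = np.array([-2.3, 0.4, 1.9]), 25
p = t.sf(np.abs(tvalues), df_resid) * 2
assert np.allclose(p, 2 * (1 - t.cdf(np.abs(tvalues), df_resid)))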
Example #56
import mne
import numpy as np
from scipy.stats import t

# group, seed_src, band and fdr are set earlier in the original script
subjs_fname = "/Users/sudregp/data/meg/good_subjects.txt"
group_fname = "/Users/sudregp/data/meg/%s_subjs.txt" % group
data_dir = "/Users/sudregp/data/results/meg/"
fid = open(subjs_fname, "r")
subjs = [line.rstrip() for line in fid]
fid.close()
fid = open(group_fname, "r")
this_group = [line.rstrip() for line in fid]
fid.close()

# load the pre-computed correlation data
fname = data_dir + "corrs-seed%d-%dto%d-lh.stc" % (seed_src, band[0], band[1])
stc = mne.read_source_estimate(fname)

y = [s in this_group for s in subjs]
y = np.asarray(y).T
X = np.mean(stc.data[:, y], axis=1)

print "Subjects in %s: %d" % (group, np.sum(y))
if fdr > 0:
    n = sum(y)
    # from http://www.danielsoper.com/statcalc3/calc.aspx?id=44
    tstat = X / np.sqrt((1 - X ** 2) / (n - 2))
    # t.sf gives the one-tailed version; use |t| with n - 2 df for the two-sided test
    pval = t.sf(np.abs(tstat), n - 2) * 2
    reject_fdr, pval_fdr = mne.stats.fdr_correction(pval, alpha=fdr, method="indep")
    X[~reject_fdr] = 0

stc2 = mne.SourceEstimate(X[:, None], vertices=stc.vertno, tmin=0, tstep=0, subject="fsaverage")
brain = stc2.plot(hemi="both", fmin=min(X), fmid=(min(X) + (max(X) - min(X)) / 2), fmax=max(X))
Example #57
 def pval(x, standard_error, df=800, tail=2):
     pval = t.sf(np.abs((x-0)/standard_error), df) * tail
     return pval
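For instance, a coefficient of 1.2 with a standard error of 0.5 gives t = 2.4:

print(pval(1.2, 0.5))          # two-tailed p-value with the default df=800
print(pval(1.2, 0.5, tail=1))  # one-tailed version, half the value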
Example #58
    def _p_value_raw(self):
        """Returns the raw p values."""
        from scipy.stats import t

        return 2 * t.sf(np.fabs(self._t_stat_raw),
                        self._df_resid_raw)
Example #59
from numpy import (shape, var, mean, log, exp, sqrt, vstack, ones, zeros,
                   diag, dot, inf, pi)
from numpy.linalg import inv, norm
from scipy.sparse import diags
from scipy.special import fdtrc
from scipy.stats import t


def glm(x, y, w=1.0):

  p,n    = shape(x)                    # number of predictors, sample size
  p     += 1                           # add one for the intercept
  dof    = n - p                       # degrees of freedom

  sig    = var(y)                      # variance of the response
  mu     = (y + mean(y))/2.0           # initial mean estimate
  eta    = log(mu)                     # initial linear predictor
  X      = vstack((ones(n), x)).T      # design matrix with intercept column

  # Newton-Raphson :
  converged = False
  rtol      = 1e-12
  dtol      = 1e-12
  lmbda     = 1.0
  nIter     = 0
  deviance  = 1
  D         = 1
  ahat      = zeros(p)   # initial parameters
  rel_res   = zeros(p)   # initial relative residual
  maxIter   = 100

  rel_a = []
  dev_a = []

  while not converged and nIter < maxIter:
    W       = diags(w*mu**2/sig, 0)         # compute weights
    z       = eta + (y - mu)/mu             # adjusted dependent variable

    WX      = W.dot(X)
    XTWX    = dot(X.T, WX)
    iXTWX   = inv(XTWX)
    Wz      = W.dot(z)

    ahat_n  = dot(iXTWX, dot(X.T, Wz))
    
    eta     = dot(X, ahat_n)               # updated linear predictor
    mu      = exp(eta)                     # fitted mean (inverse log link)

    # calculate residual :
    rel_res  = norm(ahat - ahat_n, inf)
    rel_a.append(rel_res)
    ahat     = ahat_n

    D_n      = sum((y - mu)**2)
    deviance = abs(D_n - D)
    D        = D_n
    dev_a.append(deviance)
    
    if rel_res < rtol or deviance < dtol: converged = True
    nIter +=  1

    string = "Newton iteration %d: d (abs) = %.2e, (tol = %.2e) r (rel) = %.2e (tol = %.2e)"
    print string % (nIter, deviance, dtol, rel_res, rtol)
  
  # calculate statistics :
  varA   = diag(iXTWX)            # variance of alpha hat
  sea    = sqrt(varA)             # vector of standard errors for alpha hat
  t_a    = ahat / sea
  pval   = t.sf(abs(t_a), dof) * 2
  conf   = 0.95                            # 95% confidence interval
  tbonf  = t.ppf(1 - (1-conf)/(2*p), dof)  # Bonferroni-corrected two-sided t-value
  ci     = tbonf*sea                       # confidence-interval half-width for ahat
  resid  = (y - mu)                    # 'working' residual
                                       
  RSS    = sum((y - mu)**2)            # residual sum of squares
  TSS    = sum((y - mean(y))**2)       # total sum of squares
  R2     = (TSS-RSS)/TSS               # R2
  F      = (TSS-RSS)/(p-1) * (n-p)/RSS # F-statistic
  F_p    = fdtrc(p-1, dof, F)          # F-Stat. p-value

  # log-likelihood :
  L      = sum((y*mu - mu**2/2)/(2*sig) - y**2/(2*sig) - 0.5*log(2*pi*sig))
  AIC    = (-2*L + 2*p)/n              # AIC statistic

  # estimated error variance :
  sighat = 1/(n-p) * RSS
                                        
  vara = { 'ahat'  : ahat,              
           'yhat'  : mu,                
           'sea'   : sea,               
           'ci'    : ci,                
           'dof'   : dof,               
           'resid' : resid,             
           'rel_a' : rel_a,
           'dev_a' : dev_a,
           'R2'    : R2,
           'F'     : F,
           'AIC'   : AIC,
           'sighat': sighat}
  return vara
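A minimal smoke test for the routine above, with a single predictor and synthetic
log-linear data (the data and seed are purely illustrative):

import numpy as np

np.random.seed(0)
x = np.random.uniform(0.0, 1.0, size=(1, 200))               # shape (p, n)
y = np.exp(1.0 + 2.0*x[0]) + np.random.normal(0, 0.1, 200)   # strictly positive response
out = glm(x, y)
print(out['ahat'])  # should land close to [1.0, 2.0]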
Example #60
#     (2) Divide each group's variance by the number of observations minus one (n-1 = 9):
fc_syn_ctl_a = fc_syn_ctl_var / 9.0
fc_syn_dms_a = fc_syn_dms_var / 9.0
fc_fmri_ctl_a = fc_fmri_ctl_var / 9.0
fc_fmri_dms_a = fc_fmri_dms_var / 9.0
#     (3) Add results obtained for CTL and DMS in step (2) together:
fc_syn_a = fc_syn_ctl_a + fc_syn_dms_a
fc_fmri_a= fc_fmri_ctl_a+ fc_fmri_dms_a
#     (4) Take the square root the results in step (3):
sqrt_fc_syn_a = np.sqrt(fc_syn_a)
sqrt_fc_fmri_a= np.sqrt(fc_fmri_a)
#     (5) Divide the results of step (1) by the results of step (4) to obtain 't':
fc_syn_t = fc_syn_mean_diff  / sqrt_fc_syn_a
fc_fmri_t= fc_fmri_mean_diff / sqrt_fc_fmri_a
#     (6) Calculate the degrees of freedom (add up number of observations for each group
#         minus number of groups):
dof = 10 + 10 - 2
#     (7) find the p-values for the above 't' and 'degrees of freedom':
fc_syn_p_values  = t.sf(fc_syn_t, dof)
fc_fmri_p_values = t.sf(fc_fmri_t, dof)

print('t-values for synaptic activity correlations: ', fc_syn_t)
print('t-values for fmri time-series correlations: ', fc_fmri_t)
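
The same unpaired two-sample test (equal group sizes, dof = 18) can be cross-checked
with scipy; halving the two-sided p-value recovers the one-tailed values above. A
sketch, assuming the raw per-group samples are available as hypothetical arrays
group_dms and group_ctl:

from scipy.stats import ttest_ind

# group_dms, group_ctl: hypothetical arrays of 10 observations each
tval, p_two_sided = ttest_ind(group_dms, group_ctl)
p_one_tailed = p_two_sided / 2.0  # equals t.sf(tval, 18) when tval > 0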

# convert to Pandas dataframe, using the transpose to convert to a format where the names
# of the modules are the labels for each time-series
fc_mean = pd.DataFrame(np.array([fc_syn_dms_mean, fc_syn_ctl_mean,
                                 fc_fmri_dms_mean, fc_fmri_ctl_mean]),
                      columns=np.array(['V1', 'V4', 'FS', 'D1', 'D2', 'FR', 'LIT']),
                       index=np.array(['DMS-syn', 'CTL-syn', 'DMS-fmri', 'CTL-fmri']))
#fc_std  = pd.DataFrame(np.array([fc_syn_dms_std, fc_syn_ctl_std,
#                                 fc_fmri_dms_std, fc_fmri_ctl_std]),
#                      columns=np.array(['V1', 'V4', 'D1', 'D2', 'FS', 'FR']),
#                       index=np.array(['DMS-syn', 'CTL-syn', 'DMS-fmri', 'CTL-fmri']))