Example No. 1
    def get_univariate_quant_metrics(self,
                                     dataset=TRAIN,
                                     transformed=False,
                                     verbose=True,
                                     thin_model=None,
                                     thin_true=None,
                                     seed=None,
                                     n=None):
        """
        Calculates quantitative metrics for the difference between p(t) and
        p_model(t) and the difference between p(y) and p_model(y)

        :param dataset: dataset subset to evaluate on (train, val, or test)
        :param transformed: If True, use transformed version of data.
            If False, use original (non-transformed) version of data.
        :param thin_model: thinning interval for the model data
        :param thin_true: thinning interval for the real data
        :param seed: seed for sample from generative model
        :return: {
            't_ks_pval': KS p-value with null that t_model and t_true are from the same distribution
            'y_ks_pval': KS p-value with null that y_model and y_true are from the same distribution
            't_es_pval': Epps-Singleton p-value with null that t_model and t_true are from the same distribution
            'y_es_pval': Epps-Singleton p-value with null that y_model and y_true are from the same distribution
            't_wasserstein1_dist': Wasserstein-1 distance between t_true and t_model
            'y_wasserstein1_dist': Wasserstein-1 distance between y_true and y_model
        }
        """
        _, t_model, y_model = to_np_vectors(
            self.sample(seed=seed, untransform=(not transformed)),
            thin_interval=thin_model)

        _, t_true, y_true = self.get_data(transformed=transformed,
                                          dataset=dataset,
                                          verbose=verbose)
        t_true, y_true = to_np_vectors((t_true, y_true),
                                       thin_interval=thin_true)

        # jitter for numerical stability
        t_true = t_true.copy() + np.random.rand(*t_true.shape) * 1e-6
        t_model = t_model.copy() + np.random.rand(*t_model.shape) * 1e-6

        ks_label = "_ks_pval"
        es_label = "_es_pval"
        wasserstein_label = "_wasserstein1_dist"
        metrics = {
            T + ks_label: float(stats.ks_2samp(t_model, t_true).pvalue),
            Y + ks_label: float(stats.ks_2samp(y_model, y_true).pvalue),
            T + es_label: float(stats.epps_singleton_2samp(t_model, t_true).pvalue),
            Y + es_label: float(stats.epps_singleton_2samp(y_model, y_true).pvalue),
            T + wasserstein_label: float(stats.wasserstein_distance(t_model, t_true)),
            Y + wasserstein_label: float(stats.wasserstein_distance(y_model, y_true)),
        }

        return metrics
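A minimal sketch of the two-sample comparisons this method wraps, using scipy.stats directly on synthetic stand-ins for t_model / t_true (the arrays and seed below are illustrative, not part of the class above):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
t_true = rng.normal(0.0, 1.0, size=500)   # stand-in for the observed treatment values
t_model = rng.normal(0.1, 1.0, size=500)  # stand-in for values sampled from the model

print(stats.ks_2samp(t_model, t_true).pvalue)              # KS p-value
print(stats.epps_singleton_2samp(t_model, t_true).pvalue)  # Epps-Singleton p-value
print(stats.wasserstein_distance(t_model, t_true))         # Wasserstein-1 distance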
Example No. 2
import numpy as np
import pandas as pd
from scipy.stats import epps_singleton_2samp, ks_2samp


def es_test(X, y, p=0.05):
    """Select binary features of X whose 0/1 groups differ in the distribution of y."""
    results = {}
    selected_features = []
    for var in list(X):
        y0 = y[X[var] == 0]
        y1 = y[X[var] == 1]
        if (len(y0) > 25) and (len(y1) > 25):
            try:
                test = epps_singleton_2samp(y0, y1)
            except Exception:  # ES can fail (e.g. singular covariance matrix); fall back to KS
                test = ks_2samp(y0, y1)
            pv = test.pvalue
        else:
            pv = 1  # too few observations for the ES test; treat as not significant
        if pv <= p:
            selected_features.append(var)
        results[var] = {'ES-pv': pv, 'Avg-1': np.mean(y1), 'Avg-0': np.mean(y0)}
    df = pd.DataFrame.from_dict(results).T
    return selected_features, df
Example No. 3
import numpy as np
from scipy.stats import epps_singleton_2samp


def es_metric(df1, df2, numerical_columns):
    """Return Epps-Singleton p-values comparing each numerical column of df1 and df2."""
    es_tests = []
    for col in numerical_columns:
        try:
            es_test = epps_singleton_2samp(x=df1.loc[:, col], y=df2.loc[:, col])
        except np.linalg.LinAlgError:  # singular covariance matrix in the ES statistic
            es_test = [None, None]
        es_tests.append(es_test)
    # index 1 of each result is the p-value (or None if the test failed)
    p_values = [x[1] for x in es_tests]
    keys = [f"{col}_p-value" for col in numerical_columns]
    return dict(zip(keys, p_values))
Example No. 4
def es(d1, d2, verbose=False):
    """
    Calculates the Epps-Singleton test statistic on 2 distributions.

    Can be used on continuous or discrete distributions.
    Any binning/bucketing of the distributions/samples should be done before passing them to this
    function.

    Whereas KS relies on the empirical distribution function, ES is based on the empirical characteristic function
    (Epps & Singleton 1986, Goerg & Kaiser 2009).

    Advantages:

    - Unlike the KS, the ES can be used on both continuous & discrete distributions.

    - ES has higher power (vs KS) in many examples.

    Disadvantages:

    - Not recommended for fewer than 25 observations. Instead, use the Anderson-Darling TS. (However, ES can still be
    used for small samples. A correction factor is applied so that the asymptotic TS distribution more closely follows
    the chi-squared distribution, such that p-values can be computed.)


    References:

    - [SciPy documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.epps_singleton_2samp.html)

    Args:
        d1 (np.array or pandas.Series): First sample.

        d2 (np.array or pandas.Series): Second sample.

        verbose (bool): If True, useful interpretation info is printed to stdout.

    Returns:
        float: Epps-Singleton test statistic.
        float: p-value under the null hypothesis that the two distributions are identical.
    """
    d1 = assure_numpy_array(d1)
    d2 = assure_numpy_array(d2)

    es, pvalue = stats.epps_singleton_2samp(d1, d2)

    return es, pvalue
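A minimal usage sketch, assuming two 1-D numpy arrays as samples; the helper above simply wraps scipy.stats.epps_singleton_2samp after coercing its inputs:

import numpy as np

d1 = np.random.default_rng(42).normal(size=200)
d2 = np.random.default_rng(7).normal(loc=0.5, size=200)
statistic, pvalue = es(d1, d2)  # ES statistic and p-value for H0: identical distributions
print(statistic, pvalue)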
Example No. 5
def calc_ks(df, pop, stat, col2, epps=False):
    """Compare two distributions.
    
    ks_1samp;kstest(x, "norm")
    anderson_ksample(x)  # mixed distribution
    epps_singleon_2samp(x, y)  # discrete

    Parameters
    ----------
    df : TYPE
        DESCRIPTION.
    pop : TYPE
        DESCRIPTION.
    stat : TYPE
        DESCRIPTION.
    col2 : TYPE
        DESCRIPTION.
    epps : TYPE, optional
        DESCRIPTION. The default is False.

    Returns
    -------
    None.

    """
    dfpop = df[df["pops"] == pop]
    stats_list = dfpop[stat].unique()
    sig_count = 0
    for i in stats_list:
        obs = dfpop[(dfpop[stat] == i) & (dfpop["df_id"] == "obs")]
        sim = dfpop[(dfpop[stat] == i) & (dfpop["df_id"] == "sim")]
        if epps:
            ts, pval = epps_singleton_2samp(obs[col2], sim[col2])
        else:
            ts, pval = ks_2samp(obs[col2], sim[col2])
        if pval < 0.001:
            print(f"{stat} {i}: {pval} **")
            sig_count += 1
        else:
            print(f"{stat} {i}: {pval}")
    print(f"total perc sig: {sig_count/len(stats_list)}")
Example No. 6
    def test_integration(self,
                         nsamples=int(1e4),
                         nwarmup=int(1e3),
                         alpha=1e-2):

        test_sampler = ou.sample_posterior(self.t, self.vt, self.ome)
        test_samples = [next(test_sampler)
                        for _ in range(nsamples + nwarmup)][nwarmup::]

        null_sampler = sample_posterior(self.t, self.vt, ou.init_thi,
                                        ou.bounds_thi, fix_ou.eval_log_lik,
                                        ou.eval_log_prior, self.ome)
        null_samples = [next(null_sampler)
                        for _ in range(nsamples + nwarmup)][nwarmup::]

        thi_null = np.array(null_samples[::10])
        thi_test = np.array([sample[0] for sample in test_samples][::10])

        for thi_null_, thi_test_ in zip(thi_null.T, thi_test.T):
            self.assertLess(alpha,
                            epps_singleton_2samp(thi_null_, thi_test_)[1])
Example No. 7
    def time_epps_singleton_2samp(self):
        stats.epps_singleton_2samp(self.a, self.b)
Example No. 8
def custom(a, b):
    v, p = stats.epps_singleton_2samp(a, b)
    return p
Example No. 9
def func_es_2samp(lhs, rhs):
    return stats.epps_singleton_2samp(lhs, rhs)[1]
Example No. 10
import scipy.stats as scstats
dataset = shLacz_dataset
KS_test_results = np.array([scstats.ks_2samp(df_columns_to_arrays(df)[1], df_columns_to_arrays(df)[0]) for df in dataset])
KS_vals = KS_test_results[:,0]
p_vals = KS_test_results[:,1]

fig = plot_statistics_bar(KS_vals, None, exp_labels, "shLacz", "KS statistic", p_vals)
#plt.savefig("./figures/figure5D.png")
# %%
p_vals
# %%
sample1 = np.array(fed_fast_dataset[0].iloc[:,0], dtype = float)
sample2 = np.array(fed_fast_dataset[0].iloc[:,1], dtype = float)
ks, p = scstats.ks_2samp(sample1, sample2)
# %%
es, p = scstats.epps_singleton_2samp(sample1, sample2)
# %%
p_vals
# %%
p
# %%
scstats.mstats.spearmanr(sample1, sample2)
# %%
data_ind = 2
sample1, sample2 = shLacz_dataset[data_ind].iloc[:,1], shLacz_dataset[data_ind].iloc[:,0]
sample1, sample2 = np.array(sample1, np.float32), np.array(sample2, np.float32)
scstats.mstats.brunnermunzel(sample1, sample2, alternative = "two-sided", distribution = "normal")
#A_statistic(sample1, sample2)
# %%
output = bootstrap_confidence_interval_2samp(sample1, sample2, A_statistic, B = 20000, CI_algorithm = "BCa")
#%%
Example No. 11
    samples.append([y, i, "binomial"])
for i in range(3 * n_of_samples, 4 * n_of_samples):
    y = geom.rvs(p, size=s_size)
    samples.append([y, i, "geometric"])
for i in range(4 * n_of_samples, 5 * n_of_samples):
    y = poisson.rvs(n, size=s_size)
    samples.append([y, i, "poisson"])
outlier_1 = beta.rvs(1, 10, size=1000)
outlier_2 = chi2.rvs(n, size=1000)
samples.append([outlier_1, 5 * n_of_samples, "beta"])
samples.append([outlier_2, 5 * n_of_samples + 1, "chi_square"])

for i in range(len(samples)):
    for j in range(i, len(samples)):
        ks_test_pvalue = ks_2samp(samples[i][0], samples[j][0])[1]
        epps_singleton_pvalue = epps_singleton_2samp(samples[i][0],
                                                     samples[j][0])[1]

        if ks_test_pvalue > 0.05:
            G.add_edge(i, j, weight=0.01 /
                       (ks_test_pvalue))  #0.01 scaling factor here
        if epps_singleton_pvalue > 0.05:
            H.add_edge(i, j, weight=0.01 /
                       (epps_singleton_pvalue))  #0.01 scaling factor here

# Testing whether two samples are generated by the same underlying distribution is a classical question in statistics. A widely used test is the Kolmogorov-Smirnov (KS) test, which relies on the empirical distribution function. Epps and Singleton introduced a test based on the empirical characteristic function.
#
# One advantage of the ES test over the KS test is that it does not assume a continuous distribution. The authors conclude that the test also has higher power than the KS test in many examples. They recommend the ES test for discrete samples as well as for continuous samples with at least 25 observations each.
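# A small, self-contained illustration of the point above (synthetic Poisson samples,
# not from the original data): the ES test is applicable to discrete data, whereas the
# KS test assumes continuous distributions.
from scipy.stats import poisson, ks_2samp, epps_singleton_2samp

a = poisson.rvs(3, size=200, random_state=0)
b = poisson.rvs(4, size=200, random_state=1)
print(ks_2samp(a, b).pvalue, epps_singleton_2samp(a, b).pvalue)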

# In[2]:

nx.draw(G, with_labels=True, edge_color='#00b4d9')