예제 #1
0
 def expect_assertion_error(*params):
     """Check that ``pg.chi2_independence`` rejects the given arguments.

     All positional ``params`` are forwarded unchanged to
     ``pg.chi2_independence``; the check succeeds only when that call raises
     ``AssertionError``.
     """
     must_raise = pytest.raises(AssertionError)
     with must_raise:
         pg.chi2_independence(*params)
예제 #2
0
    def test_chi2_independence(self):
        """Test function chi2_independence.

        The stages below run in sequence on the same ``data`` frame, which is
        mutated in place between stages, so their order matters:

        1. agreement with ``chi2_contingency`` for every row of the returned
           stats table;
        2. a call on data that contains NaN values;
        3. ``AssertionError`` on invalid arguments;
        4. ``ValueError`` when a column holds only NaN;
        5. the degenerate case where both columns are constant;
        6. ``UserWarning`` on a low count;
        7. fixed reference values for ``df_ind`` (defined outside this
           method), which the comments attribute to R.
        """
        # Setup: draw 30 correlated (x, y) pairs with a fixed seed, then
        # binarise every value of both columns at the 0.5 threshold.
        np.random.seed(42)
        mean, cov = [0.5, 0.5], [(1, .6), (.6, 1)]
        x, y = np.random.multivariate_normal(mean, cov, 30).T
        data = pd.DataFrame({'x': x, 'y': y})
        mask_class_1 = data > 0.5
        data[mask_class_1] = 1
        data[~mask_class_1] = 0

        # Comparing results with SciPy: each row of ``stats`` carries its own
        # ``lambda`` value, which is forwarded to ``chi2_contingency``.
        _, _, stats = pg.chi2_independence(data, x='x', y='y')
        contingency_table = pd.crosstab(data['x'], data['y'])
        for i in stats.index:
            lambda_ = stats.at[i, 'lambda']
            dof = stats.at[i, 'dof']
            chi2 = stats.at[i, 'chi2']
            p = round(stats.at[i, 'p'], 6)
            sp_chi2, sp_p, sp_dof, _ = chi2_contingency(contingency_table,
                                                        lambda_=lambda_)
            # chi2 is compared unrounded against the reference rounded to 3
            # decimals; p is compared at 6 decimals on both sides.
            assert (chi2, p, dof) == (round(sp_chi2, 3), round(sp_p, 6),
                                      sp_dof)

        # Testing resilience to NaN (the call only has to complete)
        mask_nan = np.random.random(data.shape) > 0.8  # ~20% NaN values
        data[mask_nan] = np.nan
        pg.chi2_independence(data, x='x', y='y')

        # Testing validations
        def expect_assertion_error(*params):
            with pytest.raises(AssertionError):
                pg.chi2_independence(*params)
        # NOTE: bare ``x`` and ``y`` below are the NumPy arrays from the
        # setup, passed on purpose where a column name is expected.
        expect_assertion_error(1, 'x', 'y')  # Not a pd.DataFrame
        expect_assertion_error(data, x, 'y')  # Not a string
        expect_assertion_error(data, 'x', y)  # Not a string
        expect_assertion_error(data, 'x', 'z')  # Not a column of data

        # Testing "no data" ValueError: column 'x' is overwritten with NaN
        data['x'] = np.nan
        with pytest.raises(ValueError):
            pg.chi2_independence(data, x='x', y='y')

        # Testing degenerated case (observed == expected): both columns are
        # overwritten with the constant 1.
        data['x'] = 1
        data['y'] = 1
        expected, observed, stats = pg.chi2_independence(data, 'x', 'y')
        assert expected.iloc[0, 0] == observed.iloc[0, 0]
        assert stats.at[0, 'dof'] == 0
        for i in stats.index:
            chi2 = stats.at[i, 'chi2']
            p = stats.at[i, 'p']
            assert (chi2, p) == (0.0, 1.0)

        # Testing warning on low count: change the first-row, first-column
        # cell so that exactly one row differs from the rest.
        data.iloc[0, 0] = 0
        with pytest.warns(UserWarning):
            pg.chi2_independence(data, 'x', 'y')

        # Comparing results with R
        # 2 x 2 contingency table (dof = 1)
        # >>> tbl = table(df$sex, df$target)
        # >>> chisq.test(tbl, correct = TRUE)
        # >>> cramersV(tbl)
        _, _, stats = pg.chi2_independence(df_ind, 'sex', 'target')
        assert stats.at[0, 'chi2'] == 22.717
        assert stats.at[0, 'dof'] == 1
        assert np.allclose(stats.at[0, 'p'], 1.877e-06)
        assert round(stats.at[0, 'cramer'], 2) == 0.27

        # 4 x 2 contingency table
        _, _, stats = pg.chi2_independence(df_ind, 'cp', 'target')
        assert stats.at[0, 'chi2'] == 81.686
        assert stats.at[0, 'dof'] == 3.
        assert stats.at[0, 'p'] < 2.2e-16
        assert round(stats.at[0, 'cramer'], 3) == 0.519
        assert np.allclose(stats.at[0, 'power'], 1.)
    axes[4].axvline(0, linestyle="--", color="grey")
# Share of rows in ``x3`` with a positive CD15 value, per severity group:
# number of positive rows in each group divided by the size of that group.
frac = (x3["CD15(FITC-A)"] > 0).groupby(
    x3["severity_group"]).sum() / x3["severity_group"].value_counts()
# x and y are passed by keyword: seaborn 0.12+ only accepts ``data`` as a
# positional argument, and the keyword form also works on older releases.
sns.barplot(x=frac * 100, y=frac.index, palette=pal, ax=axes[5])
axes[5].set(xlabel="% CD15 positive")
axes[5].set_yticklabels([])
fig.tight_layout()
# NOTE(review): the file name says "CD5" while this panel plots CD15 —
# confirm whether the name is intentional before changing it.
fig.savefig(
    figures_dir / "panels" / "Figure2.CD5_expression_positivity.svg",
    **figkws,
)

from scipy.stats import fisher_exact
import pingouin as pg  # type: ignore

# Chi-square test of severity group against cluster over the whole table.
# The return value is not stored.
pg.chi2_independence(data=x, x="severity_group", y="cluster")

# Cluster against each dummy-coded severity group, one group at a time.
# The return values are not stored.
y = x[["cluster"]].join(pd.get_dummies(x["severity_group"]))
for cat in x["severity_group"].unique():
    pg.chi2_independence(data=y, x="cluster", y=cat)

# Severity group against each dummy-coded cluster; for every cluster keep the
# last row of the third returned object (the stats table).
y = x[["severity_group"]].join(pd.get_dummies(x["cluster"]))
v = dict()
for cat in x["cluster"].unique():
    v[cat] = pg.chi2_independence(data=y, x="severity_group",
                                  y=cat)[2].iloc[-1]

# Dummy-code both variables and select the indicator-column pair for every
# (severity group, cluster) combination.
# NOTE(review): ``y2`` is assigned but not used in the visible code — the loop
# body may continue beyond this excerpt.
y = pd.get_dummies(x[["severity_group", "cluster"]])
for seve in meta["severity_group"].cat.categories:
    for clus in x["cluster"].unique():
        y2 = y[["severity_group_" + seve, "cluster_" + clus]]
예제 #4
0
if choose_analysis == "Chi-square test":
    # Offer every object-dtype column of ``df`` as a categorical variable.
    # The builtin ``object`` replaces ``np.object``, an alias deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    cat_vars = df.select_dtypes(include=object).columns.tolist()

    y_var1 = st.sidebar.selectbox("Choose first categorical variable:",
                                  cat_vars)
    y_var2 = st.sidebar.selectbox("Choose second categorical variable:",
                                  cat_vars)
    # NOTE(review): ``beta_expander`` is a beta-prefixed Streamlit API; newer
    # releases expose it as ``expander`` — confirm against the pinned version.
    expand = st.sidebar.beta_expander("More options")
    yates_correction = expand.checkbox("Use Yates correction?")
    # move_counts = expand.slider("Adjust labels for counts on bar plot", 0.0, 0.5, 0.15, 0.01)

    st.header("Chi-square test of independence:")
    st.markdown("----")
    st.success("Expected and observed frequencies:")

    # The checkbox state decides whether the correction is applied.
    expected, observed, stats = pg.chi2_independence(
        df, x=y_var1, y=y_var2, correction=bool(yates_correction))
    st.subheader("Expected")
    st.write(expected)
    st.subheader("Observed")
    st.write(observed)
    st.subheader("Chi-square test results:")
    # Only the row labelled 0 of the stats table is displayed.
    st.write(stats.loc[[0]])
    st.markdown("----")

    st.success("Frequency bars are generated:")
    st.markdown("## ")
    fig = plt.figure(figsize=(12, 6))
    total = float(len(df))
    ax = sns.countplot(x=y_var1, hue=y_var2, data=df, palette="Set2")
    # Count the distinct values of the first variable, skipping NaN
    # (NaN is the value that does not compare equal to itself).
    numX = len([v for v in df[y_var1].unique() if v == v])
def _append_success_labels(samples, label, good_seed_only, success_var):
    """Append ``label`` to ``success_var`` once for every retained sample.

    ``samples`` is walked as four nested mapping levels (term -> seed cluster
    -> adversarial cluster -> seed sentence), each innermost value being an
    iterable of sample records. For every record, ``sample[19][-1]`` selects
    the list in ``success_var`` that receives ``label``. When
    ``good_seed_only`` equals the string ``'True'``, records whose
    ``sample[20][0]`` is not ``'not_flipped'`` are skipped.

    Raises:
        KeyError: if ``sample[19][-1]`` is not a key of ``success_var``.
    """
    only_good_seeds = good_seed_only == 'True'
    for seed_clusters in samples.values():
        for adv_clusters in seed_clusters.values():
            for seed_sentences in adv_clusters.values():
                for sentence_samples in seed_sentences.values():
                    for sample in sentence_samples:
                        # sample[20] is only inspected when filtering is on.
                        if only_good_seeds and \
                                sample[20][0] != 'not_flipped':
                            continue
                        success_var[sample[19][-1]].append(label)


def test_significance(positive_samples_path, negative_samples_path,
                      good_seed_only):
    """Computes the significance levels / effect size of the generation
    strategy on the success of adversarial samples.

    Args:
        positive_samples_path: path to a JSON file of samples labelled 1.
        negative_samples_path: path to a JSON file of samples labelled 0.
        good_seed_only: when equal to the string ``'True'``, only samples
            whose ``sample[20][0]`` is ``'not_flipped'`` are counted.

    Returns:
        None. The chi^2 statistic, p-value and Cramer's V taken from the
        first row of the stats table are printed to stdout.
    """

    # Read-in tables
    print('Reading-in tables ...')
    with open(positive_samples_path, 'r', encoding='utf8') as psp:
        positive_samples = json.load(psp)
    with open(negative_samples_path, 'r', encoding='utf8') as nsp:
        negative_samples = json.load(nsp)

    # Store success labels per generation strategy
    success_var = {
        'insert_at_homograph': [],
        'replace_at_homograph': [],
        'insert_at_other': [],
        'replace_at_other': [],
    }

    # Collect labels for the Chi^2 test: positives first, then negatives, so
    # each strategy's list holds its 1s before its 0s.
    print('Looking up sample provenance ...')
    _append_success_labels(positive_samples, 1, good_seed_only, success_var)
    _append_success_labels(negative_samples, 0, good_seed_only, success_var)

    # Construct dataframe: one row per sample with its strategy and label
    print('Computing correlations ...')
    success_dict = {'method': [], 'labels': []}
    for method, labels in success_var.items():
        success_dict['method'] += [method] * len(labels)
        success_dict['labels'] += labels
    unrolled_success_dict = pd.DataFrame.from_dict(success_dict)
    # Perform Chi^2 test; only the stats table is needed here
    _, _, stats = chi2_independence(unrolled_success_dict,
                                    x='method',
                                    y='labels')
    first_row = stats.iloc[0]
    chi2 = first_row['chi2']
    p = first_row['p']
    # p-values that are not above 0.00005 are reported as exactly 0.0
    p = p if p > 0.00005 else 0.0
    v = first_row['cramer']

    # Report
    print('Done!')
    print('=' * 20)
    print('CHI^2 STATS:')
    if p > 0.0:
        print('{:.3f}, {:.4f}, {:.4f}'.format(chi2, p, v))
    else:
        print('{:.3f}, {:.1f}, {:.4f}'.format(chi2, p, v))
        cbar_kws=dict(label="Mean intensity\n(Z-score)"),
        **kws,
    )
    grid2.savefig(
        output_dir
        / f"{panel_name}.{label}.cluster_mean_intensity.clustermap.zscore.svg",
        **figkws,
    )
    plt.close(grid2.fig)

    # Association between factors and cluster distribution
    for var in clin_vars[1:]:
        y = a.obs[[var]].join(pd.get_dummies(a.obs["cluster"]))
        v = dict()
        for cat in a.obs["cluster"].unique():
            v[cat] = pg.chi2_independence(data=y, x=var, y=cat)[2].iloc[-1]
        res = pd.DataFrame(v).T

        # same order as clustermap
        res = res.reindex(mean.iloc[grid1.dendrogram_row.reordered_ind].index)

        fig, ax = plt.subplots(1, 1, figsize=(1.430, 0.08))
        cramer = res["cramer"].astype(float)
        points = ax.scatter(
            cramer.index,
            [0] * cramer.shape[0],
            s=6,
            c=cramer,
            cmap="autumn_r",
            marker="s",
            edgecolors="none",