def test_expected_freq():
    """Check ``expected_freq`` on a single cell, a 3-D table and a 2x3 table."""
    # A table with one cell: the expected frequency is the cell itself.
    assert_array_equal(expected_freq([1]), np.array([1.0]))

    # Three-way table with 12 observations whose expected counts are all 1.
    cube = np.array([
        [[2, 0], [0, 2]],
        [[0, 2], [2, 0]],
        [[1, 1], [1, 1]],
    ])
    assert_array_equal(expected_freq(cube), np.ones_like(cube))

    # Ordinary two-way table compared against hand-computed expectations.
    table = np.array([[10, 10, 20], [20, 20, 20]])
    wanted = np.array([[12., 12., 16.], [18., 18., 24.]])
    assert_array_almost_equal(expected_freq(table), wanted)
def check_for_fisher(df, var1, var2):
    """Tell whether the cross-tabulation of two columns calls for Fisher's test.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both variables.
    var1, var2 : hashable
        Column labels in ``df`` that are cross-tabulated.

    Returns
    -------
    bool
        ``False`` when the table of expected frequencies is not 2x2.
        Otherwise ``True`` when any expected frequency is less than or equal
        to ``GLOBAL_EXPECTED_FREQ`` or when ``len(df)`` is less than or equal
        to ``GLOBAL_N_FOR_FISHER``; ``False`` in every other case.
    """
    expected = expected_freq(pd.crosstab(df[var1], df[var2]))

    # Only 2x2 tables are considered at all.
    if expected.shape != (2, 2):
        return False

    # Any small expected cell is enough on its own.
    if (expected <= GLOBAL_EXPECTED_FREQ).any(axis=None):
        return True

    # Otherwise the decision rests on the sample size.
    return bool(len(df) <= GLOBAL_N_FOR_FISHER)
def chi_square_yats(self):
    """Run a power-divergence test on the Yates-corrected observed table.

    The table returned by ``self.get_observed()`` is wrapped in a list before
    conversion, so the array handed to ``expected_freq`` carries one extra
    leading axis of length 1. The continuity correction is applied
    unconditionally, whatever the degrees of freedom.

    Returns
    -------
    The result of ``scipy.stats.power_divergence`` (statistic and p-value),
    computed over the flattened table with ``lambda_=None``.
    """
    counts = np.asarray([self.get_observed()])
    model = expected_freq(counts)

    # Degrees of freedom of the independence model for this table shape.
    dof = model.size - sum(model.shape) + model.ndim - 1

    # Yates: shift every observed count half a unit towards its expectation.
    shift = 0.5 * np.sign(model - counts)
    corrected = counts + shift

    # power_divergence uses k - 1 - ddof degrees of freedom, so pick ddof
    # such that the test ends up with `dof`.
    ddof = corrected.size - 1 - dof
    return power_divergence(corrected, model, ddof=ddof, axis=None,
                            lambda_=None)
def get_expected_values(crosstab):
    """Return ``expected_freq(crosstab)``, the expected frequencies of a table."""
    return expected_freq(crosstab)
def cross_chi2(index, columns):
    """Build three HTML tables describing the association of two variables.

    Parameters
    ----------
    index, columns
        Values passed straight to ``pandas.crosstab`` as ``index`` and
        ``columns``.

    Returns
    -------
    list
        ``[r1, r2, r3]`` where each item is the output of
        ``transform_table_data_to_html`` for, in order: the observed
        cross-tabulation (with totals), the expected-frequency table and the
        chi-square results for six power-divergence statistics.
    """
    # Table including the margin totals; used for display and labels only.
    with_totals = pd.crosstab(index=index, columns=columns, margins=True)
    # The tests must not be fed the total row/column (changed on 8/25).
    counts = pd.crosstab(index=index, columns=columns, margins=False)

    # Order matters: it is also the row order of the results table.
    statistics = (
        "pearson",  # Pearson chi-square
        "log-likelihood",
        "freeman-tukey",
        "mod-log-likelihood",
        "neyman",
        "cressie-read",
    )
    outcomes = [
        chi2_contingency(counts, correction=True, lambda_=kind)
        for kind in statistics
    ]
    # The expected table returned by chi2_contingency is not used here.
    chi_rows = [
        ["{:.4f}".format(stat), "{:.4f}".format(p_value), dof]
        for stat, p_value, dof, _ in outcomes
    ]

    # Relabel the trailing margin entry on both axes.
    row_labels = with_totals.index.tolist()
    row_labels[-1] = '总计'
    col_labels = with_totals.columns.tolist()
    col_labels[-1] = '总计'
    cell_values = with_totals.values.tolist()

    # NOTE(review): sum_data / format_data_col are defined elsewhere;
    # presumably they append totals and format the cells - confirm.
    expected_table = sum_data(pd.DataFrame(expected_freq(counts)))
    expected_cells = format_data_col(expected_table).values.tolist()

    observed_html = transform_table_data_to_html({
        'title': "交叉表",
        'row': row_labels,
        'col': col_labels[0:],
        'data': cell_values
    })
    expected_html = transform_table_data_to_html({
        'title': "期望频数表",
        'row': row_labels,
        'col': col_labels,
        'data': expected_cells
    })
    tests_html = transform_table_data_to_html({
        'title': "卡方检验",
        'row': list(statistics),
        'col': ['值', '显著性', '自由度'],
        'data': chi_rows
    })
    return [observed_html, expected_html, tests_html]
def test_marginal_sums(contingency_table, threshold=5): """ Return True if the expected marginal sums are all above 5, in which case the chi square test of independency is generally considered valid""" expected_frequencies = contingency.expected_freq(contingency_table.values) return np.all(np.greater(expected_frequencies, threshold))
def chi2_independence(data, x, y, correction=True):
    """
    Chi-squared independence tests between two categorical variables.

    The test is computed for six values of :math:`\\lambda`: 1, 2/3, 0,
    -1/2, -1 and -2 (Cressie and Read, 1984).

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the occurrences for the test.
    x, y : string
        The variable names for the Chi-squared test. Must be names of
        columns in ``data``.
    correction : bool
        Whether to apply Yates' correction when the degree of freedom of the
        observed contingency table is 1 (Yates 1934).

    Returns
    -------
    expected : :py:class:`pandas.DataFrame`
        The expected contingency table of frequencies.
    observed : :py:class:`pandas.DataFrame`
        The (corrected or not) observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary, one row per statistic, with the columns:

        * ``'test'``: The statistic name
        * ``'lambda'``: The :math:`\\lambda` value used for the power
          divergence statistic
        * ``'chi2'``: The test statistic
        * ``'dof'``: The degrees of freedom
        * ``'p'``: The p-value of the test
        * ``'cramer'``: The Cramer's V effect size
        * ``'power'``: The statistical power of the test

    Notes
    -----
    From Wikipedia:

    *The chi-squared test is used to determine whether there is a significant
    difference between the expected frequencies and the observed frequencies
    in one or more categories.*

    As application examples, this test can be used to *i*) evaluate the
    quality of a categorical variable in a classification problem or to *ii*)
    check the similarity between two categorical variables. In the first
    example, a good categorical predictor and the class column should present
    high :math:`\\chi^2` and low p-value. In the second example, similar
    categorical variables should present low :math:`\\chi^2` and high
    p-value.

    This function is a wrapper around the
    :py:func:`scipy.stats.power_divergence` function.

    When the degrees of freedom are 0, no statistic is computed: ``chi2`` is
    reported as 0, ``p`` as 1 and both ``cramer`` and ``power`` as NaN.

    .. warning :: As a general guideline for the consistency of this test,
        the observed and the expected contingency tables should not have
        cells with frequencies lower than 5. A warning is issued when they
        do.

    References
    ----------
    .. [1] Cressie, N., & Read, T. R. (1984). Multinomial goodness-of-fit
           tests. Journal of the Royal Statistical Society: Series B
           (Methodological), 46(3), 440-464.

    .. [2] Yates, F. (1934). Contingency Tables Involving Small Numbers and
           the :math:`\\chi^2` Test. Supplement to the Journal of the Royal
           Statistical Society, 1, 217-235.

    Examples
    --------
    Let's see if gender is a good categorical predictor for the presence of
    heart disease.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_independence')
    >>> data['sex'].value_counts(ascending=True)
    0     96
    1    207
    Name: sex, dtype: int64

    If gender is not a good predictor for heart disease, we should expect the
    same 96:207 ratio across the target classes.

    >>> expected, observed, stats = pg.chi2_independence(data, x='sex',
    ...                                                  y='target')
    >>> expected
    target          0           1
    sex
    0       43.722772   52.277228
    1       94.277228  112.722772

    Let's see what the data tells us.

    >>> observed
    target      0     1
    sex
    0        24.5  71.5
    1       113.5  93.5

    The proportion is lower on the class 0 and higher on the class 1. The
    tests should be sensitive to this difference.

    >>> stats.round(3)
                     test  lambda    chi2  dof    p  cramer  power
    0             pearson   1.000  22.717  1.0  0.0   0.274  0.997
    1        cressie-read   0.667  22.931  1.0  0.0   0.275  0.998
    2      log-likelihood   0.000  23.557  1.0  0.0   0.279  0.998
    3       freeman-tukey  -0.500  24.220  1.0  0.0   0.283  0.998
    4  mod-log-likelihood  -1.000  25.071  1.0  0.0   0.288  0.999
    5              neyman  -2.000  27.458  1.0  0.0   0.301  0.999

    Very low p-values indeed. The gender qualifies as a good predictor for
    the presence of heart disease on this dataset.
    """
    # Python code inspired by SciPy's chi2_contingency
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert all(col in data.columns for col in (x, y)), \
        'columns are not in dataframe.'
    assert isinstance(correction, bool), 'correction must be a boolean.'

    observed = pd.crosstab(data[x], data[y])
    if observed.size == 0:
        raise ValueError('No data; observed has size 0.')

    expected = pd.DataFrame(
        expected_freq(observed),
        index=observed.index,
        columns=observed.columns,
    )

    # All count frequencies should be at least 5
    for label, table in (('observed', observed), ('expected', expected)):
        if (table < 5).any(axis=None):
            warnings.warn('Low count on {} frequencies.'.format(label))

    dof = float(expected.size - sum(expected.shape) + expected.ndim - 1)

    if correction and dof == 1:
        # Adjust `observed` according to Yates' correction for continuity.
        observed = observed + 0.5 * np.sign(expected - observed)

    # power_divergence uses k - 1 - ddof degrees of freedom.
    ddof = observed.size - 1 - dof
    n = data.shape[0]

    families = (
        ("pearson", 1.0),
        ("cressie-read", 2 / 3),
        ("log-likelihood", 0.0),
        ("freeman-tukey", -1 / 2),
        ("mod-log-likelihood", -1.0),
        ("neyman", -2.0),
    )
    column_order = ['test', 'lambda', 'chi2', 'dof', 'p', 'cramer', 'power']

    rows = []
    for name, lambda_ in families:
        if dof == 0:
            # Degenerate table: nothing to test.
            chi2, p, cramer, power = 0.0, 1.0, np.nan, np.nan
        else:
            chi2, p = power_divergence(observed, expected, ddof=ddof,
                                       axis=None, lambda_=lambda_)
            dof_cramer = min(expected.shape) - 1
            cramer = np.sqrt(chi2 / (n * dof_cramer))
            power = power_chi2(dof=dof, w=cramer, n=n, alpha=0.05)

        rows.append({
            'test': name,
            'lambda': lambda_,
            'chi2': chi2,
            'dof': dof,
            'p': p,
            'cramer': cramer,
            'power': power
        })

    stats = pd.DataFrame(rows)[column_order]
    return expected, observed, stats
def pointwise_mutual_information(contingency_matrix):
    """Compute the pointwise mutual information of the top-left cell.

    Parameters
    ----------
    contingency_matrix
        Table of observed counts, indexable as ``contingency_matrix[0][0]``
        and accepted by ``expected_freq``.

    Returns
    -------
    dict
        ``{'pmi': value}`` where ``value`` is the base-2 logarithm of the
        observed ``[0][0]`` count divided by its expected frequency.
    """
    expected = expected_freq(contingency_matrix)
    observed_cell = contingency_matrix[0][0]
    expected_cell = expected[0][0]
    return {'pmi': np.log2(observed_cell / expected_cell)}
def meets_cochran(ser):
    """Check two conditions on the expected frequencies of a two-row table.

    The table has ``ser`` as its first row and
    ``cluster_stats2.cluster_sizes - ser`` as its second row.
    NOTE(review): presumably ``ser`` holds per-cluster counts - confirm
    against the caller.

    Returns
    -------
    bool
        True when every expected frequency rounds to at least 1 and more
        than 80% of the expected frequencies are strictly greater than 5.
    """
    table = np.array([ser, cluster_stats2.cluster_sizes - ser])
    expected = expected_freq(table)

    no_cell_below_one = (np.round(expected) >= 1).all()
    share_above_five = (expected > 5).sum() / expected.size
    return no_cell_below_one and share_above_five > 0.8
def has_zero_expected(ser):
    """Tell whether any expected frequency of a two-row table rounds to 0.

    The table has ``ser`` as its first row and
    ``cluster_stats2.cluster_sizes - ser`` as its second row.
    """
    table = np.array([ser, cluster_stats2.cluster_sizes - ser])
    rounded = np.round(expected_freq(table))
    return (rounded == 0).any()
def tabella_di_contingenza(dataframe, colonna_A, colonna_B, ordine_A=False,
                           ordine_B=False, informativo=False,
                           norm_axis=False):
    '''
    Cross-tabulate two columns, optionally annotating every cell.

    dataframe: the table on which the cross tabulation is computed
    colonna_A: header of the column used for the rows
    colonna_B: header of the column used for the columns
    ordine_A: list of labels giving the order of the categories of column A
        (the rows are reindexed with it); False keeps the default order
    ordine_B: list of labels giving the order of the categories of column B
        (the columns are reindexed with it); False keeps the default order
    informativo: True puts counts, expected frequencies and residuals
        (count minus expected) in the same table, each cell being the text
        "count ( expected) ( residual) ( share)"
    norm_axis: which normalised share is appended when informativo is True:
        False for the share of the grand total, "index" for row shares,
        "columns" for column shares; any other value leaves the numeric
        table untouched

    Returns the cross tabulation with margins: numeric, or a table of
    strings when informativo is True and norm_axis is one of the values
    above.
    '''
    def _elementwise(frame, func):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; prefer the new name and fall back on old versions.
        mapper = getattr(frame, "map", None)
        if mapper is None:
            mapper = frame.applymap
        return mapper(func)

    def _bracketed(value):
        # Two-decimal rendering used for every annotation in a cell.
        return "( {:.2f})".format(value)

    # TODO: add a table with residuals and percentages; the row percentages
    # should go here as well.
    crosstab = pd.crosstab(dataframe[colonna_A], dataframe[colonna_B],
                           margins=True)
    # pandas.crosstab `normalize` accepts a boolean or one of
    # {'all', 'index', 'columns'}.

    # `!= False` (rather than a truthiness test) is kept on purpose so that
    # the accepted values stay exactly those of the original interface.
    if ordine_A != False:
        crosstab = crosstab.reindex(ordine_A, axis=0)
    if ordine_B != False:
        crosstab = crosstab.reindex(ordine_B, axis=1)

    if informativo == True:
        # Expected frequencies are computed on the table as displayed,
        # i.e. including the margin row/column that survived the reindexing.
        expected = pd.DataFrame(expected_freq(crosstab),
                                index=crosstab.index,
                                columns=crosstab.columns)

        if norm_axis == False:
            mode = "all"
        elif norm_axis == "index":
            mode = "index"
        elif norm_axis == "columns":
            mode = "columns"
        else:
            mode = None

        if mode is not None:
            # Only the normalised table that is actually shown is computed.
            share = _elementwise(
                pd.crosstab(dataframe[colonna_A], dataframe[colonna_B],
                            margins=True, normalize=mode),
                _bracketed)
            crosstab = (_elementwise(crosstab, str)
                        + " " + _elementwise(expected, _bracketed)
                        + " " + _elementwise(crosstab - expected, _bracketed)
                        + " " + share)
    return crosstab
len(controls) - len(segment[3]) ] ] if method == 'chi': p = chi2_contingency(contingency_table, correction=yates)[1] if yates: method_name = 'Yates chi-squared' else: method_name = 'Chi-squared' elif method == 'fisher': p = fisher_exact(contingency_table)[1] method_name = 'Fisher' elif method == 'g': p = power_divergence( contingency_table[0] + contingency_table[1], f_exp=expected_freq(contingency_table).ravel(), ddof=2, lambda_='log-likelihood')[1] method_name = 'G-test' else: expected_frequency_table = expected_freq(contingency_table) num_large_cells = 0 num_small_cells = 0 for row in expected_frequency_table: for cell in row: if cell >= 5: num_large_cells += 1 elif cell < 1: num_small_cells += 1 break if num_large_cells >= 3 and num_small_cells == 0:
def contingency_table(dataframe, columns_a, columns_b, order_a=False,
                      order_b=False, informative=True, norm_axis=False):
    '''
    Cross-tabulate two columns, optionally annotating every cell.

    dataframe: the table on which the cross tabulation is computed; it is
        not modified
    columns_a: header of the column used for the rows
    columns_b: header of the column used for the columns
    order_a: list of values giving the order of the categories of column A;
        False keeps the values as they are
    order_b: list of values giving the order of the categories of column B;
        False keeps the values as they are
    informative: True puts counts, expected frequencies and residuals
        (count minus expected) in the same table, each cell being the text
        "count ( expected) ( residual) ( share)"
    norm_axis: which normalised share is appended when informative is True:
        False for the share of the grand total, "index" for row shares,
        "columns" for column shares; any other value leaves the numeric
        table untouched

    Returns the cross tabulation with margins (computed with dropna=False):
    numeric, or a table of strings when informative is True and norm_axis
    is one of the values above.
    '''
    def _elementwise(frame, func):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; prefer the new name and fall back on old versions.
        mapper = getattr(frame, "map", None)
        if mapper is None:
            mapper = frame.applymap
        return mapper(func)

    def _bracketed(value):
        # Two-decimal rendering used for every annotation in a cell.
        return "( {:.2f})".format(value)

    def _ordered(column, order):
        # Build the (possibly categorical) variable locally instead of
        # writing it back into the caller's dataframe.
        values = dataframe[column]
        # `!= False` is kept so the accepted values match the original.
        if order != False:
            values = pd.Series(pd.Categorical(values, categories=order),
                               index=values.index, name=column)
        return values

    rows = _ordered(columns_a, order_a)
    cols = _ordered(columns_b, order_b)

    crosstab = pd.crosstab(rows, cols, margins=True, dropna=False)

    if informative == True:
        # Expected frequencies are computed on the table as displayed,
        # margins included.
        expected = pd.DataFrame(expected_freq(crosstab),
                                index=crosstab.index,
                                columns=crosstab.columns)

        if norm_axis == False:
            mode = "all"
        elif norm_axis == "index":
            mode = "index"
        elif norm_axis == "columns":
            mode = "columns"
        else:
            mode = None

        if mode is not None:
            # Only the normalised table that is actually shown is computed.
            share = _elementwise(
                pd.crosstab(rows, cols, margins=True, normalize=mode,
                            dropna=False),
                _bracketed)
            crosstab = (_elementwise(crosstab, str)
                        + " " + _elementwise(expected, _bracketed)
                        + " " + _elementwise(crosstab - expected, _bracketed)
                        + " " + share)
    return crosstab
#Cases without segment Controls without segment [len(cases) - len(segment[2]), len(controls) - len(segment[3])] ] if method == 'chi': p = chi2_contingency(contingency_table, correction=yates)[1] if yates: method_name = 'Yates chi-squared' else: method_name = 'Chi-squared' elif method == 'fisher': p = fisher_exact(contingency_table)[1] method_name = 'Fisher' elif method == 'g': p = power_divergence( contingency_table[0] + contingency_table[1], f_exp=expected_freq(contingency_table).ravel(), ddof=2, lambda_='log-likelihood' )[1] method_name = 'G-test' else: expected_frequency_table = expected_freq(contingency_table) num_large_cells = 0 num_small_cells = 0 for row in expected_frequency_table: for cell in row: if cell >= 5: num_large_cells += 1 elif cell < 1: num_small_cells += 1 break