def prop_test(self, col):
    """
    Chi-Square test of independence on <col>, before and after matching.

    Wraps scipy's stats.chi2_contingency() over the contingency tables
    built by self.prep_prop_test().

    Parameters
    ----------
    col : str
        Name of the (categorical) column on which to run the test

    Returns
    -------
    dict or None
        {'var': <col>,
         'before': <p-value before matching>,
         'after': <p-value after matching>}
        None (with a printed notice) when <col> is continuous or excluded.
    """
    # Guard clause: the test only makes sense for non-excluded discrete columns.
    if uf.is_continuous(col, self.X) or col in self.exclude:
        print("{} is a continuous variable".format(col))
        return None

    before_table = self.prep_prop_test(self.data, col)
    after_table = self.prep_prop_test(self.matched_data, col)
    # chi2_contingency returns (stat, p, dof, expected); index [1] is the p-value.
    return {
        "var": col,
        "before": round(stats.chi2_contingency(before_table)[1], 6),
        "after": round(stats.chi2_contingency(after_table)[1], 6),
    }
def compare_categorical(self, return_table=False):
    """
    Plots the proportional differences of each enumerated discrete column
    for test and control, i.e.
    <prop_test_that_have_x> - <prop_control_that_have_x>.

    Each chart title contains the results from a Chi-Square Test of
    Independence before and after matching. See Matcher.prop_test().

    Parameters
    ----------
    return_table : bool
        Should the function return a table with test results?

    Returns
    -------
    pd.DataFrame (optional)
        Table with the p-values of the Chi-Square contingency test
        for each discrete column before and after matching.
        Empty (columns only) if no discrete columns were found.
    """

    def prep_plot(data, var, colname):
        # Proportional difference of each level of <var> between test (yvar==1)
        # and control (yvar==0), returned as a one-column frame named <colname>.
        t, c = data[data[self.yvar] == 1], data[data[self.yvar] == 0]
        # Any non-id, non-target column works as a dummy for groupby().count().
        dummy = [
            i for i in t.columns
            if i not in (var, "match_id", "record_id", "weight")
        ][0]
        countt = t[[var, dummy]].groupby(var).count() / len(t)
        countc = c[[var, dummy]].groupby(var).count() / len(c)
        ret = (countt - countc).dropna()
        ret.columns = [colname]
        return ret

    title_str = """
Proportional Difference (test-control) for {} Before and After Matching
Chi-Square Test for Independence p-value before | after:
{} | {}
"""
    test_results = []
    for col in self.matched_data.columns:
        if not uf.is_continuous(col, self.X) and col not in self.exclude:
            dbefore = prep_plot(self.data, col, colname="before")
            dafter = prep_plot(self.matched_data, col, colname="after")
            df = dbefore.join(dafter)
            test_results_i = self.prop_test(col)
            test_results.append(test_results_i)

            # plotting
            df.plot.bar(alpha=0.8)
            plt.title(
                title_str.format(col, test_results_i["before"],
                                 test_results_i["after"]))
            # Symmetric y-limits with a small margin; floor keeps tiny
            # differences from producing an unreadably zoomed-in chart.
            lim = max(0.09, abs(df).max().max()) + 0.01
            plt.ylim((-lim, lim))

    if not return_table:
        return None
    if not test_results:
        # BUG FIX: indexing an empty DataFrame with the column list raised
        # KeyError when no discrete columns exist; return an empty frame
        # with the documented columns instead.
        return pd.DataFrame(columns=["var", "before", "after"])
    return pd.DataFrame(test_results)[["var", "before", "after"]]
def compare_continuous(self, save=False, return_table=False):
    """
    Plots the ECDFs for continuous features before and after matching.

    Each chart title contains test results and statistics to summarize how
    similar the two distributions are (we want them to be close after
    matching).

    Tests performed:

    * Kolmogorov-Smirnov Goodness of fit Test (KS-test) — the statistic is
      calculated on 1000 permuted samples of the data, generating an
      empirical p-value. See pymatch.functions.ks_boot(). This is an
      adaptation of the ks.boot() method in the R "Matching" package:
      https://www.rdocumentation.org/packages/Matching/versions/4.9-2/topics/ks.boot
    * Chi-Square Distance — similarly, this distance metric is calculated
      on 1000 permuted samples. See
      pymatch.functions.grouped_permutation_test().

    Other included stats: standardized mean and median differences — how
    many standard deviations away the mean/median are between our groups
    before and after matching, i.e.
    abs(mean(control) - mean(test)) / std(control.union(test)).

    Parameters
    ----------
    save : bool
        NOTE(review): currently unused — presumably intended to trigger a
        plt.savefig() of each chart; confirm intended behavior before wiring
        it up. Kept for interface compatibility.
    return_table : bool
        Should the function return a table with tests and statistics?

    Returns
    -------
    pd.DataFrame (optional)
        Table of before/after statistics if return_table == True.
        Empty (columns only) if no continuous columns were found.
    """
    test_results = []
    for col in self.matched_data.columns:
        if uf.is_continuous(col, self.X) and col not in self.exclude:
            # organize data (b = before matching, a = after matching)
            trb, cob = self.test[col], self.control[col]
            tra = self.matched_data[self.matched_data[self.yvar] == 1][col]
            coa = self.matched_data[self.matched_data[self.yvar] == 0][col]
            xtb, xcb = ECDF(trb), ECDF(cob)
            xta, xca = ECDF(tra), ECDF(coa)

            # before/after stats
            std_diff_med_before, std_diff_mean_before = uf.std_diff(trb, cob)
            std_diff_med_after, std_diff_mean_after = uf.std_diff(tra, coa)
            pb, truthb = uf.grouped_permutation_test(
                uf.chi2_distance, trb, cob)
            pa, trutha = uf.grouped_permutation_test(
                uf.chi2_distance, tra, coa)
            ksb = round(uf.ks_boot(trb, cob, nboots=1000), 6)
            ksa = round(uf.ks_boot(tra, coa, nboots=1000), 6)

            # plotting
            f, (ax1, ax2) = plt.subplots(1, 2,
                                         sharey=True,
                                         sharex=True,
                                         figsize=(12, 5))
            # BUG FIX: the original issued each of these two plot calls
            # twice, drawing the "before" series twice on ax1 and
            # duplicating legend entries.
            ax1.plot(xcb.x, xcb.y, label="Control", color=self.control_color)
            ax1.plot(xtb.x, xtb.y, label="Test", color=self.test_color)
            title_str = """
ECDF for {} {} Matching
KS p-value: {}
Grouped Perm p-value: {}
Std. Median Difference: {}
Std. Mean Difference: {}
"""
            ax1.set_title(
                title_str.format(col, "before", ksb, pb,
                                 std_diff_med_before, std_diff_mean_before))
            ax2.plot(xca.x, xca.y, label="Control")
            ax2.plot(xta.x, xta.y, label="Test")
            ax2.set_title(
                title_str.format(col, "after", ksa, pa,
                                 std_diff_med_after, std_diff_mean_after))
            ax2.legend(loc="lower right")
            # Clip the shared x-axis at the 99th percentile to keep
            # outliers from flattening the ECDF curves.
            plt.xlim((0, np.percentile(xta.x, 99)))

            test_results.append({
                "var": col,
                "ks_before": ksb,
                "ks_after": ksa,
                "grouped_chisqr_before": pb,
                "grouped_chisqr_after": pa,
                "std_median_diff_before": std_diff_med_before,
                "std_median_diff_after": std_diff_med_after,
                "std_mean_diff_before": std_diff_mean_before,
                "std_mean_diff_after": std_diff_mean_after,
            })

    var_order = [
        "var",
        "ks_before",
        "ks_after",
        "grouped_chisqr_before",
        "grouped_chisqr_after",
        "std_median_diff_before",
        "std_median_diff_after",
        "std_mean_diff_before",
        "std_mean_diff_after",
    ]
    if not return_table:
        return None
    if not test_results:
        # BUG FIX: indexing an empty DataFrame with var_order raised
        # KeyError when no continuous columns exist.
        return pd.DataFrame(columns=var_order)
    return pd.DataFrame(test_results)[var_order]