Пример #1
0
    def prop_test(self, col):
        """
        Performs a Chi-Square test of independence on <col>,
        once on the full data and once on the matched data.
        See stats.chi2_contingency()

        Parameters
        ----------
        col : str
            Name of column on which the test should be performed

        Returns
        -------
        dict or None
            {'var': <col>,
             'before': <pvalue before matching>,
             'after': <pvalue after matching>}
            with both p-values rounded to 6 decimals.
            None is returned (after printing a short notice) when <col>
            is listed in self.exclude or is a continuous variable; no
            test is run in either case.

        """
        # Report the real reason a column is skipped: an excluded column
        # is not necessarily continuous, so it gets its own message.
        if col in self.exclude:
            print("{} is an excluded variable".format(col))
            return None
        if uf.is_continuous(col, self.X):
            print("{} is a continuous variable".format(col))
            return None

        def chi2_pvalue(data):
            # prep_prop_test presumably builds the contingency table for
            # <col> (defined elsewhere in the class -- confirm there).
            table = self.prep_prop_test(data, col)
            # Index 1 of the chi2_contingency result is the p-value.
            return round(stats.chi2_contingency(table)[1], 6)

        return {
            "var": col,
            "before": chi2_pvalue(self.data),
            "after": chi2_pvalue(self.matched_data),
        }
Пример #2
0
    def compare_categorical(self, return_table=False):
        """
        Plots the proportional differences of each enumerated
        discrete column for test and control.
        i.e. <prop_test_that_have_x>  - <prop_control_that_have_x>
        Each chart title contains the results from a
        Chi-Square Test of Independence before and after
        matching.
        See pymatch.prop_test()

        Parameters
        ----------
        return_table : bool
            Should the function return a table with
            test results?

        Returns
        -------
        pd.DataFrame or None
            When return_table is True: table with the p-values of the
            Chi-Square contingency test for each discrete column before
            and after matching (columns "var", "before", "after"). The
            table is empty, but still has those columns, when no column
            qualifies. Otherwise None.

        """
        def prep_plot(data, var, colname):
            # Rows with yvar == 1 are the test group, yvar == 0 the control.
            t, c = data[data[self.yvar] == 1], data[data[self.yvar] == 0]
            # dummy var for counting: any column other than <var> and the
            # bookkeeping columns, used only so that count() has a column
            # to tally per level of <var>.
            dummy = [
                i for i in t.columns
                if i not in (var, "match_id", "record_id", "weight")
            ][0]
            # Share of each group falling into every level of <var>.
            countt = t[[var, dummy]].groupby(var).count() / len(t)
            countc = c[[var, dummy]].groupby(var).count() / len(c)
            # Levels missing from either group yield NaN and are dropped.
            ret = (countt - countc).dropna()
            ret.columns = [colname]
            return ret

        title_str = """
        Proportional Difference (test-control) for {} Before and After Matching
        Chi-Square Test for Independence p-value before | after:
        {} | {}
        """
        result_columns = ["var", "before", "after"]
        test_results = []
        for col in self.matched_data.columns:
            # Only discrete, non-excluded columns are compared here.
            if uf.is_continuous(col, self.X) or col in self.exclude:
                continue

            dbefore = prep_plot(self.data, col, colname="before")
            dafter = prep_plot(self.matched_data, col, colname="after")
            df = dbefore.join(dafter)
            test_results_i = self.prop_test(col)
            test_results.append(test_results_i)

            # plotting
            df.plot.bar(alpha=0.8)
            plt.title(
                title_str.format(col, test_results_i["before"],
                                 test_results_i["after"]))
            # Symmetric y-range around zero, at least +/- 0.1 wide.
            lim = max(0.09, abs(df).max().max()) + 0.01
            plt.ylim((-lim, lim))

        if not return_table:
            return None
        # Passing columns= (instead of selecting them afterwards) keeps
        # this from raising KeyError when no column produced a result.
        return pd.DataFrame(test_results, columns=result_columns)
Пример #3
0
    def compare_continuous(self, save=False, return_table=False):
        """
        Plots the ECDFs for continuous features before and
        after matching. Each chart title contains test results
        and statistics to summarize how similar the two distributions
        are (we want them to be close after matching).

        Tests performed:
        Kolmogorov-Smirnov Goodness of fit Test (KS-test)
            This test statistic is calculated on 1000
            permuted samples of the data, generating
            an empirical p-value.  See pymatch.functions.ks_boot()
            This is an adaptation of the ks.boot() method in
            the R "Matching" package
            https://www.rdocumentation.org/packages/Matching/versions/4.9-2/topics/ks.boot
        Chi-Square Distance:
            Similarly this distance metric is calculated on
            permuted samples.
            See pymatch.functions.grouped_permutation_test()

        Other included Stats:
        Standardized mean and median differences
        How many standard deviations away are the mean/median
        between our groups before and after matching
        i.e. abs(mean(control) - mean(test)) / std(control.union(test))

        Parameters
        ----------
        save : bool
            Currently unused; accepted only so existing callers that
            pass it keep working.
        return_table : bool
            Should the function return a table with tests and statistics?

        Returns
        -------
        pd.DataFrame or None
            Table of before/after statistics if return_table == True
            (empty, but with all expected columns, when no continuous
            column qualifies). Otherwise None.

        """
        var_order = [
            "var",
            "ks_before",
            "ks_after",
            "grouped_chisqr_before",
            "grouped_chisqr_after",
            "std_median_diff_before",
            "std_median_diff_after",
            "std_mean_diff_before",
            "std_mean_diff_after",
        ]

        test_results = []
        for col in self.matched_data.columns:
            # Only continuous, non-excluded columns are compared here.
            if not uf.is_continuous(col, self.X) or col in self.exclude:
                continue

            # organize data: "b" = before matching, "a" = after matching
            trb, cob = self.test[col], self.control[col]
            matched = self.matched_data
            # Elementwise comparison on the outcome column, so `== True`
            # / `== False` is intentional here (not `is`).
            tra = matched[matched[self.yvar] == True][col]  # noqa: E712
            coa = matched[matched[self.yvar] == False][col]  # noqa: E712
            xtb, xcb = ECDF(trb), ECDF(cob)
            xta, xca = ECDF(tra), ECDF(coa)

            # before/after stats
            std_diff_med_before, std_diff_mean_before = uf.std_diff(
                trb, cob)
            std_diff_med_after, std_diff_mean_after = uf.std_diff(tra, coa)
            # Only the p-value of the permutation test is reported; the
            # second returned element is not needed.
            pb, _ = uf.grouped_permutation_test(uf.chi2_distance, trb, cob)
            pa, _ = uf.grouped_permutation_test(uf.chi2_distance, tra, coa)
            ksb = round(uf.ks_boot(trb, cob, nboots=1000), 6)
            ksa = round(uf.ks_boot(tra, coa, nboots=1000), 6)

            # plotting: left panel = before matching, right = after
            f, (ax1, ax2) = plt.subplots(1,
                                         2,
                                         sharey=True,
                                         sharex=True,
                                         figsize=(12, 5))
            # Each curve is drawn once per panel.
            ax1.plot(xcb.x,
                     xcb.y,
                     label="Control",
                     color=self.control_color)
            ax1.plot(xtb.x, xtb.y, label="Test", color=self.test_color)

            title_str = """
                ECDF for {} {} Matching
                KS p-value: {}
                Grouped Perm p-value: {}
                Std. Median Difference: {}
                Std. Mean Difference: {}
                """
            ax1.set_title(
                title_str.format(col, "before", ksb, pb,
                                 std_diff_med_before,
                                 std_diff_mean_before))
            # Same colours as the left panel, so the single legend
            # (drawn on the right panel) is valid for both.
            ax2.plot(xca.x,
                     xca.y,
                     label="Control",
                     color=self.control_color)
            ax2.plot(xta.x, xta.y, label="Test", color=self.test_color)
            ax2.set_title(
                title_str.format(col, "after", ksa, pa, std_diff_med_after,
                                 std_diff_mean_after))
            ax2.legend(loc="lower right")
            # Cap the x-axis at the 99th percentile of the matched test
            # ECDF's x values so a long right tail does not flatten the plot.
            plt.xlim((0, np.percentile(xta.x, 99)))

            test_results.append({
                "var": col,
                "ks_before": ksb,
                "ks_after": ksa,
                "grouped_chisqr_before": pb,
                "grouped_chisqr_after": pa,
                "std_median_diff_before": std_diff_med_before,
                "std_median_diff_after": std_diff_med_after,
                "std_mean_diff_before": std_diff_mean_before,
                "std_mean_diff_after": std_diff_mean_after,
            })

        if not return_table:
            return None
        # Passing columns= (instead of selecting them afterwards) keeps
        # this from raising KeyError when no column produced a result.
        return pd.DataFrame(test_results, columns=var_order)