def testShuffledDataframeRelativeToJackknife(self):
    """Relative-to comparison with jackknife SEs ignores input row order.

    Same scenario as testDataframeRelativeToJackknife, but the rows of the
    input frame are shuffled before analysis. Reordering the data must not
    change the results, up to order, so both the output and the expectation
    are sorted by (Y, Z) before they are compared.
    """
    df = pd.DataFrame({
        "X": range(11),
        "Y": np.concatenate((np.zeros(6), np.ones(5))),
        "Z": np.concatenate((np.zeros(3), np.ones(8))),
    })

    metric = metrics.Distribution("X", ["Z"])
    se_method = standard_errors.Jackknife()

    # NOTE(review): the permutation is not seeded in this block, so a
    # failure may not reproduce on rerun unless the RNG is seeded elsewhere.
    shuffled = df.iloc[np.random.permutation(len(df))]
    output = (
        core.Analyze(shuffled)
        .relative_to(comparisons.AbsoluteDifference("Y", 0))
        .with_standard_errors(se_method)
        .calculate(metric)
        .run())
    output = (
        output.reset_index()
        .sort_values(by=["Y", "Z"])
        .set_index(["Y", "Z"]))

    # Share of X per Z bucket is (0.2, 0.8) for Y == 0 and (0.0, 1.0) for
    # Y == 1, hence the -0.2 / +0.2 absolute differences.
    # Built with from_tuples rather than MultiIndex(labels=...): the
    # `labels` keyword was removed in pandas 1.0.
    correct = pd.DataFrame(
        np.array([[-0.2, 0.18100283490],
                  [0.2, 0.18100283490]]),
        columns=[
            "X Distribution Absolute Difference",
            "X Distribution Absolute Difference Jackknife SE"
        ],
        index=pd.MultiIndex.from_tuples(
            [(1., 0.), (1., 1.)], names=["Y", "Z"]))
    correct = (
        correct.reset_index()
        .sort_values(by=["Y", "Z"])
        .set_index(["Y", "Z"]))

    # Sequential assertions keep the original short-circuit order while
    # pointing at the first component that differs.
    self.assertTrue(all(output.index == correct.index))
    self.assertTrue(all(output.columns == correct.columns))
    self.assertTrue(np.all(abs(output.values - correct.values) < 1e-10))
def testDataframeRelativeToJackknife(self):
    """Distribution of X over Z, compared to the Y == 0 rows, with jackknife SEs.

    Rows with Y == 0 serve as the baseline for an absolute-difference
    comparison; the expected frame holds one row per (Y, Z) pair for the
    non-baseline condition, with the difference and its jackknife SE.
    """
    df = pd.DataFrame({
        "X": range(11),
        "Y": np.concatenate((np.zeros(6), np.ones(5))),
        "Z": np.concatenate((np.zeros(3), np.ones(8))),
    })

    metric = metrics.Distribution("X", ["Z"])
    se_method = standard_errors.Jackknife()
    output = (
        core.Analyze(df)
        .relative_to(comparisons.AbsoluteDifference("Y", 0))
        .with_standard_errors(se_method)
        .calculate(metric)
        .run())

    # Share of X per Z bucket is (0.2, 0.8) for Y == 0 and (0.0, 1.0) for
    # Y == 1, hence the -0.2 / +0.2 absolute differences.
    # Built with from_tuples rather than MultiIndex(labels=...): the
    # `labels` keyword was removed in pandas 1.0.
    correct = pd.DataFrame(
        np.array([[-0.2, 0.18100283490],
                  [0.2, 0.18100283490]]),
        columns=[
            "X Distribution Absolute Difference",
            "X Distribution Absolute Difference Jackknife SE"
        ],
        index=pd.MultiIndex.from_tuples(
            [(1., 0.), (1., 1.)], names=["Y", "Z"]))

    # Sequential assertions keep the original short-circuit order while
    # pointing at the first component that differs.
    self.assertTrue(all(output.index == correct.index))
    self.assertTrue(all(output.columns == correct.columns))
    self.assertTrue(np.all(abs(output.values - correct.values) < 1e-10))
def testWeightedDistribution(self):
    """Calling a Distribution metric with weights yields weighted shares of X.

    With the weights below, the weighted X totals per Y value are 12, 7
    and 1 (for Y = 1, 2, 0 respectively) out of a grand total of 20.
    """
    frame = pd.DataFrame({
        "X": [1, 1, 1, 2, 2, 3, 4],
        "Y": [1, 2, 0, 1, 1, 1, 1],
    })
    row_weights = np.array([1, 7, 1, 1, 1, 1, 1])

    distribution = metrics.Distribution("X", ["Y"])
    actual = distribution(frame, row_weights)

    # Rows follow the order in which the Y values appear in the expectation
    # index: 1, 2, 0.
    expected_shares = np.array([12.0, 7.0, 1.0]) / 20.0
    expected = pd.DataFrame(
        expected_shares,
        columns=[""],
        index=pd.Index([1, 2, 0], name="Y"))

    self.assertTrue(actual.equals(expected))
def testTwoDimensionalDistribution(self):
    """Distribution of X over two columns (Y, Z) with unit weights.

    The X total is 14; the (Y, Z) cells (1, 1), (2, 0), (0, 0) and (1, 0)
    hold X sums of 1, 1, 1 and 11 respectively.
    """
    df = pd.DataFrame({
        "X": [1, 1, 1, 2, 2, 3, 4],
        "Y": [1, 2, 0, 1, 1, 1, 1],
        "Z": [1, 0, 0, 0, 0, 0, 0],
    })
    weights = np.array([1, 1, 1, 1, 1, 1, 1])

    metric = metrics.Distribution("X", ["Y", "Z"])
    output = metric(df, weights)

    # Built with from_tuples rather than MultiIndex(labels=...): the
    # `labels` keyword was removed in pandas 1.0. The tuples are the same
    # (Y, Z) pairs, in the same order, that the old levels/labels encoded.
    correct = pd.DataFrame(
        np.array([1 / 14., 1 / 14., 1 / 14., 11 / 14.]),
        columns=[""],
        index=pd.MultiIndex.from_tuples(
            [(1, 1), (2, 0), (0, 0), (1, 0)], names=["Y", "Z"]))

    self.assertTrue(output.equals(correct))
def testSplitDataframe(self):
    """Distribution of X over Z, computed separately for each value of Y.

    For Y == 0 the X totals per Z bucket are 3 and 12 (shares 0.2 / 0.8);
    for Y == 1 every row has Z == 1 (shares 0.0 / 1.0).
    """
    df = pd.DataFrame({
        "X": range(11),
        "Y": np.concatenate((np.zeros(6), np.ones(5))),
        "Z": np.concatenate((np.zeros(3), np.ones(8))),
    })

    metric = metrics.Distribution("X", ["Z"])
    output = core.Analyze(df).split_by(["Y"]).calculate(metric).run()

    # Built with from_tuples rather than MultiIndex(labels=...): the
    # `labels` keyword was removed in pandas 1.0.
    correct = pd.DataFrame(
        np.array([0.2, 0.8, 0.0, 1.0]),
        columns=["X Distribution"],
        index=pd.MultiIndex.from_tuples(
            [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0), (1.0, 1.0)],
            names=["Y", "Z"]))

    # Sequential assertions keep the original short-circuit order while
    # pointing at the first component that differs.
    self.assertTrue(all(output.index == correct.index))
    self.assertTrue(all(output.columns == correct.columns))
    self.assertTrue(np.all(abs(output.values - correct.values) < 1e-10))
def testDataframeJackknife(self):
    """Distribution of X over Z with jackknife standard errors.

    The Jackknife is constructed with "Y" (presumably the column that
    defines the jackknife buckets -- confirm against standard_errors).
    The expected point estimates are the X shares 3/55 and 52/55.
    """
    y_values = np.concatenate((np.zeros(6), np.ones(5)))
    z_values = np.concatenate((np.zeros(3), np.ones(8)))
    frame = pd.DataFrame({"X": range(11), "Y": y_values, "Z": z_values})

    distribution = metrics.Distribution("X", ["Z"])
    jackknife = standard_errors.Jackknife("Y")
    analysis = core.Analyze(frame).with_standard_errors(jackknife)
    actual = analysis.calculate(distribution).run()

    # Expected standard errors, one per Z bucket.
    se_z0 = np.sqrt(((3 / 15. - 0.1)**2 + 0.1**2) / 2.)
    se_z1 = np.sqrt(((12 / 15. - 0.9)**2 + 0.1**2) / 2.)
    expected = pd.DataFrame(
        np.array([[3 / 55., se_z0],
                  [52 / 55., se_z1]]),
        columns=("X Distribution", "X Distribution Jackknife SE"),
        index=pd.Index([0., 1.], name="Z"))

    # Checked in the same order as the original conjunction, so a later
    # check is only evaluated once the earlier ones hold.
    self.assertTrue(all(actual.index == expected.index))
    self.assertTrue(all(actual.columns == expected.columns))
    self.assertTrue(np.all(abs(actual.values - expected.values) < 1e-10))