def test_all_models_have_pval(self):
    """Check that the first p-value column has no NaN entries after compute().

    Uses the first 20 columns of ``self.losses_df`` with a fixed seed.
    """
    first_twenty = self.losses_df.iloc[:, :20]
    mcs = MCS(first_twenty, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()
    first_column = mcs.pvalues.iloc[:, 0]
    assert not np.isnan(first_column).any()
def test_exact_ties(self):
    """Run MCS on losses where columns 10 onward are shifted to a common mean.

    The test makes no assertion on the result; it only requires that
    seeding and compute() finish without raising.
    """
    losses = self.losses_df.iloc[:, :20].copy()
    # Common level taken from the untouched data: median of the column means
    tied_mean = losses.mean().median()
    # Demean the trailing columns, then move them all to the common level
    trailing = losses.iloc[:, 10:]
    losses.iloc[:, 10:] = trailing - trailing.mean()
    losses.iloc[:, 10:] = losses.iloc[:, 10:] + tied_mean
    mcs = MCS(losses, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()
def test_missing_included_max(self):
    """Check the 'max' method partitions all 20 models into included/excluded.

    Column ``j`` of the losses is shifted up by ``5 * j`` before the MCS is
    computed; the included set must be non-empty and the two sets together
    must account for every model.
    """
    n_models = 20
    base = self.losses_df.iloc[:, :n_models].copy()
    # Row vector of offsets 0, 5, 10, ... broadcast over all observations
    offsets = 5 * np.arange(n_models)[None, :]
    shifted = base.values + offsets
    mcs = MCS(shifted, 0.05, reps=200, method="max")
    mcs.seed(23456)
    mcs.compute()
    n_included = len(mcs.included)
    n_excluded = len(mcs.excluded)
    assert n_included > 0
    assert n_included + n_excluded == n_models
def test_str_repr(self):
    """Verify the str, repr and HTML renderings of an MCS instance."""
    mcs = MCS(self.losses, 0.05)

    # Plain string form
    plain = "".join(["MCS(size: 0.05, bootstrap: ", str(mcs.bootstrap), ")"])
    assert_equal(str(mcs), plain)

    # repr is the plain form with the object's id inserted before the ")"
    with_id = "".join([plain[:-1], ", ID: ", hex(id(mcs)), ")"])
    assert_equal(mcs.__repr__(), with_id)

    # HTML form wraps each label in <strong> tags
    html_parts = [
        "<strong>MCS</strong>(",
        "<strong>size</strong>: 0.05, ",
        "<strong>bootstrap</strong>: ",
        str(mcs.bootstrap),
        ", ",
        "<strong>ID</strong>: ",
        hex(id(mcs)),
        ")",
    ]
    assert_equal(mcs._repr_html_(), "".join(html_parts))
def test_str_repr(self):
    """Check that str(), __repr__() and _repr_html_() produce the expected text."""
    mcs = MCS(self.losses, 0.05)

    summary = "MCS(size: 0.05, bootstrap: %s)" % str(mcs.bootstrap)
    assert_equal(str(mcs), summary)

    # Drop the closing ")" and append the object id to obtain the repr
    detailed = "%s, ID: %s)" % (summary[:-1], hex(id(mcs)))
    assert_equal(mcs.__repr__(), detailed)

    markup = (
        "<strong>MCS</strong>(<strong>size</strong>: 0.05, "
        "<strong>bootstrap</strong>: %s, <strong>ID</strong>: %s)"
        % (str(mcs.bootstrap), hex(id(mcs)))
    )
    assert_equal(mcs._repr_html_(), markup)
def test_r_method(self):
    """Compare the default-method MCS p-values with a direct reimplementation.

    The first ``m`` rows of ``mcs.pvalues`` must equal the p-values and
    drop order produced by repeatedly applying ``r_step`` to the models
    that have not yet been removed.
    """

    def r_step(losses, indices):
        """Run one elimination step of the r method by brute force.

        Returns the step's p-value and the position, among the columns of
        ``losses``, of the model to drop.
        """
        n_models = losses.shape[1]
        column_means = losses.mean(0)
        loss_diffs = np.zeros((n_models, n_models))
        variances = np.zeros((n_models, n_models))
        # Unit diagonal so the element-wise division below is well defined
        np.fill_diagonal(variances, 1.0)
        candidates = []
        for i in range(n_models):
            for j in range(i + 1, n_models):
                centered = losses[:, i] - losses[:, j]
                centered = centered - centered.mean()
                loss_diffs[i, j] = column_means[i] - column_means[j]
                loss_diffs[j, i] = column_means[j] - column_means[i]
                # Bootstrapped means of the centered pairwise difference
                boot_means = np.array([centered[idx].mean() for idx in indices])
                pair_variance = (boot_means ** 2).mean()
                variances[i, j] = pair_variance
                variances[j, i] = pair_variance
                candidates.append(np.abs(boot_means) / np.sqrt(variances[i, j]))
        # One row per bootstrap replication, one column per model pair
        candidate_matrix = np.array(candidates).T
        stat_distn = candidate_matrix.max(axis=1)
        std_loss_diffs = loss_diffs / np.sqrt(variances)
        stat = std_loss_diffs.max()
        pval = np.mean(stat <= stat_distn)
        # Row index of the first cell attaining the maximum
        drop_index = np.argwhere(std_loss_diffs == stat).flat[0]
        return pval, drop_index

    losses = self.losses[:, :10]  # Limit size
    mcs = MCS(losses, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()

    m = 5  # Number of direct
    pvals = np.full(m, np.nan)
    indices = np.full(m, np.nan)
    for step in range(m):
        # Entries still NaN correspond to steps that have not run yet
        already_dropped = indices[np.isfinite(indices)]
        include = sorted(set(range(10)).difference(already_dropped))
        pval, drop_index = r_step(losses[:, include], mcs._bootstrap_indices)
        # Reported p-values are the running maximum across steps
        if step == 0:
            pvals[step] = pval
        else:
            pvals[step] = np.max([pvals[step - 1], pval])
        indices[step] = include[drop_index]

    direct = pd.DataFrame(
        pvals, index=np.array(indices, dtype=np.int64), columns=["Pvalue"]
    )
    direct.index.name = "Model index"
    assert_frame_equal(mcs.pvalues.iloc[:m], direct)
def test_errors(self):
    """Exercise constructor validation and the alternative bootstrap names.

    The column slice ``self.losses[:, 1]`` and an unrecognised bootstrap
    name must both raise ValueError, while 'circular' and 'moving block'
    must construct and compute without error.
    """
    with pytest.raises(ValueError):
        MCS(self.losses[:, 1], 0.05)

    for bootstrap_name in ("circular", "moving block"):
        mcs = MCS(
            self.losses,
            0.05,
            reps=100,
            block_size=10,
            method="max",
            bootstrap=bootstrap_name,
        )
        mcs.compute()

    with pytest.raises(ValueError):
        MCS(self.losses, 0.05, bootstrap="unknown")
def test_max_method(self):
    """Compare the 'max'-method MCS p-values with a direct reimplementation.

    The first ``m`` rows of ``mcs.pvalues`` must equal the p-values and
    drop order produced by repeatedly applying ``max_step`` to the models
    that have not yet been removed.
    """

    def max_step(losses, indices):
        """Run one elimination step of the max method by brute force.

        Returns the step's p-value, the position of the model to drop and
        the bootstrap standard deviations used for standardisation.
        """
        n_reps = len(indices)
        n_models = losses.shape[1]
        centered = losses - losses.mean(0)
        boot_stats = np.zeros((n_reps, n_models))
        for rep, idx in enumerate(indices):
            # Per-model bootstrap mean relative to the overall bootstrap mean
            resampled = centered[idx]
            boot_stats[rep] = resampled.mean(0) - resampled.mean()
        std_devs = np.sqrt((boot_stats ** 2).mean(0))
        stat_dist = (boot_stats / std_devs).max(axis=1)
        std_test_stat = (losses.mean(0) - losses.mean()) / std_devs
        test_stat = std_test_stat.max()
        pval = (test_stat < stat_dist).mean()
        drop_index = np.argwhere(std_test_stat == test_stat).squeeze()
        return pval, drop_index, std_devs

    losses = self.losses[:, :10]  # Limit size
    mcs = MCS(losses, 0.05, reps=200, method="max")
    mcs.seed(23456)
    mcs.compute()

    m = 8  # Number of direct
    pvals = np.full(m, np.nan)
    indices = np.full(m, np.nan)
    for step in range(m):
        # Entries still NaN correspond to steps that have not run yet
        already_dropped = indices[np.isfinite(indices)]
        include = sorted(set(range(10)).difference(already_dropped))
        pval, drop_index, _ = max_step(losses[:, include], mcs._bootstrap_indices)
        # Reported p-values are the running maximum across steps
        if step == 0:
            pvals[step] = pval
        else:
            pvals[step] = np.max([pvals[step - 1], pval])
        indices[step] = include[drop_index]

    direct = pd.DataFrame(
        pvals, index=np.array(indices, dtype=np.int64), columns=["Pvalue"]
    )
    direct.index.name = "Model index"
    assert_frame_equal(mcs.pvalues.iloc[:m], direct)
def test_errors(self):
    """Check invalid inputs raise ValueError and valid bootstraps compute.

    Passing the column slice ``self.losses[:, 1]`` or the bootstrap name
    'unknown' must raise; the 'circular' and 'moving block' bootstraps
    must run through compute() without error.
    """
    with pytest.raises(ValueError):
        MCS(self.losses[:, 1], 0.05)

    # Settings shared by both successful constructions
    shared = dict(reps=100, block_size=10, method="max")

    circular = MCS(self.losses, 0.05, bootstrap="circular", **shared)
    circular.compute()

    moving_block = MCS(self.losses, 0.05, bootstrap="moving block", **shared)
    moving_block.compute()

    with pytest.raises(ValueError):
        MCS(self.losses, 0.05, bootstrap="unknown")
def test_r_method(self):
    """Compare the default-method MCS p-values with a direct reimplementation.

    The first ``m`` rows of ``mcs.pvalues`` must equal the p-values and
    drop order obtained by repeatedly applying ``r_step`` to the models
    that have not yet been removed.
    """

    def r_step(losses, indices):
        """Run one elimination step of the r method by brute force.

        Returns the step's p-value and the position, among the columns of
        ``losses``, of the model to drop.
        """
        # A basic but direct implementation of the r method
        k = losses.shape[1]
        b = len(indices)
        mean_diffs = losses.mean(0)
        loss_diffs = np.zeros((k, k))
        variances = np.zeros((k, k))
        bs_diffs = np.zeros(b)
        stat_candidates = []
        for i in range(k):
            for j in range(i, k):
                if i == j:
                    # Unit variance on the diagonal keeps the division
                    # below well defined; the diagonal statistic stays 0
                    variances[i, i] = 1.0
                    loss_diffs[i, j] = 0.0
                    continue
                loss_diffs_vec = losses[:, i] - losses[:, j]
                loss_diffs_vec = loss_diffs_vec - loss_diffs_vec.mean()
                loss_diffs[i, j] = mean_diffs[i] - mean_diffs[j]
                loss_diffs[j, i] = mean_diffs[j] - mean_diffs[i]
                for n in range(b):
                    # Compute bootstrapped versions
                    bs_diffs[n] = loss_diffs_vec[indices[n]].mean()
                variances[j, i] = variances[i, j] = (bs_diffs**2).mean()
                std_diffs = np.abs(bs_diffs) / np.sqrt(variances[i, j])
                stat_candidates.append(std_diffs)
        # One row per bootstrap replication, one column per model pair
        stat_candidates = np.array(stat_candidates).T
        stat_distn = np.max(stat_candidates, 1)
        std_loss_diffs = loss_diffs / np.sqrt(variances)
        stat = np.max(std_loss_diffs)
        pval = np.mean(stat <= stat_distn)
        # Row index of the first cell attaining the maximum
        loc = np.argwhere(std_loss_diffs == stat)
        drop_index = loc.flat[0]
        return pval, drop_index

    losses = self.losses[:, :10]  # Limit size
    mcs = MCS(losses, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()
    m = 5  # Number of direct
    pvals = np.zeros(m) * np.nan
    indices = np.zeros(m) * np.nan
    for i in range(m):
        # Entries still NaN correspond to steps that have not run yet
        removed = list(indices[np.isfinite(indices)])
        include = list(set(list(range(10))).difference(removed))
        include.sort()
        # NOTE(review): attribute name corrected from ``_bootsrap_indices``;
        # presumably these are the index draws recorded by compute() --
        # confirm the spelling against the MCS class in use.
        pval, drop_index = r_step(
            losses[:, np.array(include)], mcs._bootstrap_indices
        )
        # Reported p-values are the running maximum across steps
        pvals[i] = pval if i == 0 else np.max([pvals[i - 1], pval])
        indices[i] = include[drop_index]
    direct = pd.DataFrame(
        pvals, index=np.array(indices, dtype=np.int64), columns=["Pvalue"]
    )
    direct.index.name = "Model index"
    assert_frame_equal(mcs.pvalues.iloc[:m], direct)
def test_max_method(self):
    """Compare the 'max'-method MCS p-values with a direct reimplementation.

    The first ``m`` rows of ``mcs.pvalues`` must equal the p-values and
    drop order obtained by repeatedly applying ``max_step`` to the models
    that have not yet been removed.
    """

    def max_step(losses, indices):
        """Run one elimination step of the max method by brute force.

        Returns the step's p-value, the position of the model to drop and
        the bootstrap standard deviations used for standardisation.
        """
        # A basic but direct implementation of the max method
        k = losses.shape[1]
        b = len(indices)
        loss_errors = losses - losses.mean(0)
        stats = np.zeros((b, k))
        for n in range(b):
            # Compute bootstrapped versions
            bs_loss_errors = loss_errors[indices[n]]
            stats[n] = bs_loss_errors.mean(0) - bs_loss_errors.mean()
        variances = (stats**2).mean(0)
        std_devs = np.sqrt(variances)
        stat_dist = np.max(stats / std_devs, 1)
        test_stat = losses.mean(0) - losses.mean()
        std_test_stat = test_stat / std_devs
        test_stat = np.max(std_test_stat)
        pval = (test_stat < stat_dist).mean()
        # squeeze() leaves a 0-d array that can index a list when the
        # maximum is attained by exactly one model
        drop_index = np.argwhere(std_test_stat == test_stat).squeeze()
        return pval, drop_index, std_devs

    losses = self.losses[:, :10]  # Limit size
    mcs = MCS(losses, 0.05, reps=200, method="max")
    mcs.seed(23456)
    mcs.compute()
    m = 8  # Number of direct
    pvals = np.zeros(m) * np.nan
    indices = np.zeros(m) * np.nan
    for i in range(m):
        # Entries still NaN correspond to steps that have not run yet
        removed = list(indices[np.isfinite(indices)])
        include = list(set(list(range(10))).difference(removed))
        include.sort()
        # NOTE(review): attribute name corrected from ``_bootsrap_indices``;
        # presumably these are the index draws recorded by compute() --
        # confirm the spelling against the MCS class in use.
        # The returned standard deviations are not needed here.
        pval, drop_index, _ = max_step(
            losses[:, np.array(include)], mcs._bootstrap_indices
        )
        # Reported p-values are the running maximum across steps
        pvals[i] = pval if i == 0 else np.max([pvals[i - 1], pval])
        indices[i] = include[drop_index]
    direct = pd.DataFrame(
        pvals, index=np.array(indices, dtype=np.int64), columns=["Pvalue"]
    )
    direct.index.name = "Model index"
    assert_frame_equal(mcs.pvalues.iloc[:m], direct)
def test_mcs_error(self):
    """Reading ``included`` before compute() has run must raise RuntimeError."""
    mcs = MCS(self.losses_df, 0.05, reps=100, block_size=10, method="r")
    with pytest.raises(RuntimeError):
        # The attribute access itself is expected to raise
        _ = mcs.included
def test_output_types(self):
    """Check the types of the MCS results after compute().

    ``included`` and ``excluded`` must be exactly ``list`` and ``pvalues``
    must be a pandas DataFrame.
    """
    mcs = MCS(self.losses_df, 0.05, reps=100, block_size=10, method="r")
    mcs.compute()
    for attribute in ("included", "excluded"):
        assert_equal(type(getattr(mcs, attribute)), list)
    assert isinstance(mcs.pvalues, pd.DataFrame)