def test_all_models_have_pval(self):
    """All retained models should have a non-NaN p-value after ``compute``.

    Uses a bare ``assert`` instead of the deprecated nose-style
    ``assert_true`` helper, for consistency with the rest of the suite.
    """
    losses = self.losses_df.iloc[:, :20]
    mcs = MCS(losses, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()
    # First column of ``pvalues`` holds the p-value assigned to each model
    nan_locs = np.isnan(mcs.pvalues.iloc[:, 0])
    assert not nan_locs.any()
def test_all_models_have_pval(self):
    """Every model kept by the MCS must receive a finite p-value."""
    first_twenty = self.losses_df.iloc[:, :20]
    mcs = MCS(first_twenty, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()
    # ``pvalues`` stores one p-value per model in its first column
    computed = mcs.pvalues.iloc[:, 0]
    assert not np.isnan(computed).any()
def test_exact_ties(self):
    """Smoke test: ``compute`` must run when several models tie exactly."""
    losses = self.losses_df.iloc[:, :20].copy()
    tied_mean = losses.mean().median()
    # Force the last ten columns to share exactly the same sample mean:
    # demean each column, then shift all of them to the common target
    tail = losses.iloc[:, 10:]
    losses.iloc[:, 10:] = tail - tail.mean() + tied_mean
    mcs = MCS(losses, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()
def test_r_method(self):
    """Check MCS 'R'-method p-values against a direct reference loop."""

    def r_step(losses, indices):
        # A basic but direct implementation of the r method: one
        # elimination step returning (p-value, index of model to drop)
        t, k = losses.shape
        b = len(indices)
        mean_diffs = losses.mean(0)
        loss_diffs = np.zeros((k, k))
        variances = np.zeros((k, k))
        bs_diffs = np.zeros(b)
        stat_candidates = []
        for i in range(k):
            for j in range(i, k):
                if i == j:
                    # Diagonal: zero mean difference; unit variance keeps
                    # the later elementwise division well defined
                    variances[i, i] = 1.0
                    loss_diffs[i, j] = 0.0
                    continue
                loss_diffs_vec = losses[:, i] - losses[:, j]
                loss_diffs_vec = loss_diffs_vec - loss_diffs_vec.mean()
                loss_diffs[i, j] = mean_diffs[i] - mean_diffs[j]
                loss_diffs[j, i] = mean_diffs[j] - mean_diffs[i]
                for n in range(b):
                    # Compute bootstrapped versions
                    bs_diffs[n] = loss_diffs_vec[indices[n]].mean()
                variances[j, i] = variances[i, j] = (bs_diffs ** 2).mean()
                stat_candidates.append(
                    np.abs(bs_diffs) / np.sqrt(variances[i, j]))
        # Null distribution: per-replication max of standardized pairwise
        # statistics across all model pairs
        stat_candidates = np.array(stat_candidates).T
        stat_distn = np.max(stat_candidates, 1)
        std_loss_diffs = loss_diffs / np.sqrt(variances)
        stat = np.max(std_loss_diffs)
        pval = np.mean(stat <= stat_distn)
        # Ties broken by the first (row-major) location of the max
        loc = np.argwhere(std_loss_diffs == stat)
        drop_index = loc.flat[0]
        return pval, drop_index

    losses = self.losses[:, :10]  # Limit size
    mcs = MCS(losses, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()
    m = 5  # Number of direct elimination steps to replicate
    pvals = np.zeros(m) * np.nan
    indices = np.zeros(m) * np.nan
    for i in range(m):
        removed = list(indices[np.isfinite(indices)])
        include = list(set(list(range(10))).difference(removed))
        include.sort()
        # NOTE(review): "_bootsrap_indices" spelling must match the MCS
        # implementation's (misspelled) attribute — do not "fix" here
        pval, drop_index = r_step(losses[:, np.array(include)],
                                  mcs._bootsrap_indices)
        # MCS p-values are monotone non-decreasing across steps
        pvals[i] = pval if i == 0 else np.max([pvals[i - 1], pval])
        indices[i] = include[drop_index]
    direct = pd.DataFrame(pvals,
                          index=np.array(indices, dtype=np.int64),
                          columns=['Pvalue'])
    direct.index.name = 'Model index'
    assert_frame_equal(mcs.pvalues.iloc[:m], direct)
def test_r_method(self):
    """Replicate the MCS 'R' statistic with a brute-force double loop."""

    def r_step(losses, indices):
        # Direct implementation of one elimination step of the R method;
        # returns (p-value, index of the model to eliminate)
        nobs, nmodels = losses.shape
        avg_losses = losses.mean(0)
        diff_means = np.zeros((nmodels, nmodels))
        diff_vars = np.zeros((nmodels, nmodels))
        candidates = []
        for i in range(nmodels):
            for j in range(i, nmodels):
                if i == j:
                    # Unit variance on the diagonal keeps the later
                    # elementwise division well defined
                    diff_vars[i, i] = 1.0
                    diff_means[i, j] = 0.0
                    continue
                demeaned = losses[:, i] - losses[:, j]
                demeaned = demeaned - demeaned.mean()
                diff_means[i, j] = avg_losses[i] - avg_losses[j]
                diff_means[j, i] = avg_losses[j] - avg_losses[i]
                # Bootstrap replications of the pairwise mean difference
                boot = np.array([demeaned[idx].mean() for idx in indices])
                diff_vars[j, i] = diff_vars[i, j] = (boot ** 2).mean()
                candidates.append(np.abs(boot) / np.sqrt(diff_vars[i, j]))
        # Null distribution: per-replication max over all pairs
        candidates = np.array(candidates).T
        null_dist = np.max(candidates, 1)
        std_diffs = diff_means / np.sqrt(diff_vars)
        stat = np.max(std_diffs)
        pval = np.mean(stat <= null_dist)
        # First (row-major) location of the maximum breaks ties
        drop_index = np.argwhere(std_diffs == stat).flat[0]
        return pval, drop_index

    losses = self.losses[:, :10]  # Limit size
    mcs = MCS(losses, 0.05, reps=200)
    mcs.seed(23456)
    mcs.compute()
    m = 5  # Number of elimination steps replicated directly
    pvals = np.nan * np.zeros(m)
    indices = np.nan * np.zeros(m)
    for step in range(m):
        eliminated = list(indices[np.isfinite(indices)])
        remaining = sorted(set(range(10)).difference(eliminated))
        # Attribute name intentionally matches MCS's spelling
        pval, drop_index = r_step(losses[:, np.array(remaining)],
                                  mcs._bootsrap_indices)
        # Enforce monotone non-decreasing p-values across steps
        if step == 0:
            pvals[step] = pval
        else:
            pvals[step] = np.max([pvals[step - 1], pval])
        indices[step] = remaining[drop_index]
    direct = pd.DataFrame(pvals,
                          index=np.array(indices, dtype=np.int64),
                          columns=['Pvalue'])
    direct.index.name = 'Model index'
    assert_frame_equal(mcs.pvalues.iloc[:m], direct)
def test_max_method(self):
    """Check MCS 'max'-method p-values against a direct reference loop."""

    def max_step(losses, indices):
        # A basic but direct implementation of the max method: one
        # elimination step returning (p-value, drop index, std devs)
        t, k = losses.shape
        b = len(indices)
        loss_errors = losses - losses.mean(0)
        stats = np.zeros((b, k))
        for n in range(b):
            # Compute bootstrapped versions: each model's deviation from
            # the cross-model mean within the resampled data
            bs_loss_errors = loss_errors[indices[n]]
            stats[n] = bs_loss_errors.mean(0) - bs_loss_errors.mean()
        variances = (stats**2).mean(0)
        std_devs = np.sqrt(variances)
        # Null distribution: per-replication max standardized deviation
        stat_dist = np.max(stats / std_devs, 1)
        test_stat = (losses.mean(0) - losses.mean())
        std_test_stat = test_stat / std_devs
        test_stat = np.max(std_test_stat)
        # Strict inequality here, unlike the R method's ``<=``
        pval = (test_stat < stat_dist).mean()
        drop_index = np.argwhere(std_test_stat == test_stat).squeeze()
        return pval, drop_index, std_devs

    losses = self.losses[:, :10]  # Limit size
    mcs = MCS(losses, 0.05, reps=200, method='max')
    mcs.seed(23456)
    mcs.compute()
    m = 8  # Number of direct elimination steps to replicate
    pvals = np.zeros(m) * np.nan
    indices = np.zeros(m) * np.nan
    for i in range(m):
        removed = list(indices[np.isfinite(indices)])
        include = list(set(list(range(10))).difference(removed))
        include.sort()
        # NOTE(review): "_bootsrap_indices" spelling must match the MCS
        # implementation's (misspelled) attribute — do not "fix" here
        pval, drop_index, std_devs = max_step(losses[:, np.array(include)],
                                              mcs._bootsrap_indices)
        # MCS p-values are monotone non-decreasing across steps
        pvals[i] = pval if i == 0 else np.max([pvals[i - 1], pval])
        indices[i] = include[drop_index]
    direct = pd.DataFrame(pvals,
                          index=np.array(indices, dtype=np.int64),
                          columns=['Pvalue'])
    direct.index.name = 'Model index'
    assert_frame_equal(mcs.pvalues.iloc[:m], direct)
def test_max_method(self):
    """Replicate MCS 'max'-method p-values with a straightforward loop."""

    def max_step(losses, indices):
        # Brute-force version of one elimination step of the max method;
        # returns (p-value, index to drop, bootstrap standard deviations)
        nobs, nmodels = losses.shape
        nreps = len(indices)
        demeaned = losses - losses.mean(0)
        boot_stats = np.zeros((nreps, nmodels))
        for rep, idx in enumerate(indices):
            # Bootstrapped deviation of each model from the common mean
            resampled = demeaned[idx]
            boot_stats[rep] = resampled.mean(0) - resampled.mean()
        std_devs = np.sqrt((boot_stats ** 2).mean(0))
        # Null distribution: per-replication max standardized deviation
        null_dist = np.max(boot_stats / std_devs, 1)
        standardized = (losses.mean(0) - losses.mean()) / std_devs
        observed = np.max(standardized)
        # Strict inequality here, unlike the R method's ``<=``
        pval = (observed < null_dist).mean()
        worst = np.argwhere(standardized == observed).squeeze()
        return pval, worst, std_devs

    losses = self.losses[:, :10]  # Limit size
    mcs = MCS(losses, 0.05, reps=200, method='max')
    mcs.seed(23456)
    mcs.compute()
    m = 8  # Number of elimination steps replicated directly
    pvals = np.nan * np.zeros(m)
    indices = np.nan * np.zeros(m)
    for step in range(m):
        eliminated = list(indices[np.isfinite(indices)])
        remaining = sorted(set(range(10)).difference(eliminated))
        # Attribute name intentionally matches MCS's spelling
        pval, drop_index, std_devs = max_step(
            losses[:, np.array(remaining)], mcs._bootsrap_indices)
        # Enforce monotone non-decreasing p-values across steps
        if step == 0:
            pvals[step] = pval
        else:
            pvals[step] = np.max([pvals[step - 1], pval])
        indices[step] = remaining[drop_index]
    direct = pd.DataFrame(pvals,
                          index=np.array(indices, dtype=np.int64),
                          columns=['Pvalue'])
    direct.index.name = 'Model index'
    assert_frame_equal(mcs.pvalues.iloc[:m], direct)