def test_seed(self):
    """Verify that seeding makes sampling reproducible."""
    M, N, corr_coef, prev = 20, 1000, 0, 0.3
    auc = np.linspace(0.5, 0.9, M)
    seed = 341324
    R0, y0 = sample.data_set(auc, corr_coef, prev, N)
    R1, y1 = sample.data_set(auc, corr_coef, prev, N, seed=seed)
    R2, y2 = sample.data_set(auc, corr_coef, prev, N, seed=seed)
    R3, y3 = sample.data_set(auc, corr_coef, prev, N, seed=seed - 10)
    # identical seeds must reproduce identical data
    self.assertTrue((R1 == R2).all())
    self.assertTrue((y1 == y2).all())
    # different (or absent) seeds should produce mostly different samples
    self.assertTrue(np.sum(R0 != R1) > 0.5 * N)
    self.assertTrue(np.sum(R0 != R3) > 0.5 * N)
    self.assertTrue(np.sum(R1 != R3) > 0.5 * N)
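# For reference, the sampler interface exercised throughout this suite, as
# implied by its calls and assertions (this sketch is an inference from the
# tests, not the documented API):
#
#     R, y = sample.data_set(auc, corr_coef, prevalence, N, seed=None)
#
# where R is an (M, N) ndarray whose rows are rank permutations of 1..N,
# y is an (N,) ndarray of binary labels with int(N * prevalence) positives,
# and seed accepts an int or a numpy Generator for reproducibility.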
def setUp(self):
    self.N, self.prevalence = 500, 0.3
    self.r, self.y = sample.data_set(np.array([0.9]), 0, self.prevalence,
                                     self.N)
    self.r = self.r.squeeze()
    self.cl = cl.FD()
def test_1d(self):
    """Validate 1-d sampling.

    Test 1: R, y dimensions are as expected
    Test 2: R is a rank array
    Test 3: y is an array of binary labels
    Test 4: positive class prevalence is recovered
    """
    N = 1000
    prevalence = 0.3
    RTRUE = np.arange(1, N + 1)
    YTRUE = [0, 1]
    R, y = sample.data_set(0.7, 0, prevalence, N)
    # Test 1
    self.assertEqual(y.ndim, 1)
    self.assertEqual(R.ndim, 2)
    self.assertEqual(R.shape, (1, N))
    self.assertEqual(y.size, N)
    # Test 2
    self.assertEqual(np.setdiff1d(RTRUE, R[0, :]).size, 0)
    self.assertEqual(np.setdiff1d(R[0, :], RTRUE).size, 0)
    # Test 3
    self.assertEqual(np.setdiff1d(y, YTRUE).size, 0)
    self.assertEqual(np.setdiff1d(YTRUE, y).size, 0)
    # Test 4
    self.assertEqual(np.sum(y), int(N * prevalence))
def test_output(self):
    """Validate that output satisfies expected generic properties.

    Test 1: R, y dimensions are as expected
    Test 2: y is an (N,) ndarray of binary values
    Test 3: prevalence is recovered
    Test 4: R is an (M, N) ndarray of rank values
    """
    N_samples = np.array([100, 1000])
    prevalence = np.linspace(0.1, 0.9, 9)
    M_methods = np.arange(5, 10)
    corr_coeffs = np.linspace(0, 0.9, 10)

    # used to validate binary label output by set comparison
    YTRUE = [0, 1]

    for m in M_methods:
        auc = np.linspace(0.5, 0.9, m)
        for n in N_samples:
            # used to validate rank output by set comparison
            RTRUE = np.arange(1, n + 1)
            for prev in prevalence:
                for corr_coef in corr_coeffs:
                    R, y = sample.data_set(auc, corr_coef, prev, n)
                    # Test 1
                    self.assertEqual(R.ndim, 2)
                    self.assertEqual(y.ndim, 1)
                    self.assertEqual(R.shape, (m, n))
                    self.assertEqual(R.shape[1], y.size)
                    # Test 2
                    self.assertEqual(np.setdiff1d(y, YTRUE).size, 0)
                    self.assertEqual(np.setdiff1d(YTRUE, y).size, 0)
                    # Test 3
                    self.assertEqual(np.sum(y), int(n * prev))
                    # Test 4
                    for i in range(m):
                        self.assertEqual(np.setdiff1d(R[i, :], RTRUE).size, 0)
                        self.assertEqual(np.setdiff1d(RTRUE, R[i, :]).size, 0)
def _simulate_and_fit(self, auc=None, prevalence=None):
    """Generate simulation data and fit the FD model.

    Generate simulation data using either the default parameters stored
    as TestFDfit class attributes or those provided as arguments, then
    fit the data with the FD instance bound to self.cl. Tests are
    conducted by accessing the trained FD instance.
    """
    if auc is None:
        auc = np.array([self.auc])
    elif not isinstance(auc, np.ndarray):
        auc = np.array([auc])
    if prevalence is None:
        prevalence = self.prevalence
    r, y = sample.data_set(auc, self.corr_coef, prevalence, self.N)
    r = r.squeeze()
    self.cl.fit(r, y)
def main():
    if not os.path.exists(DIRECTORY):
        os.mkdir(DIRECTORY)

    auc = np.zeros(M)

    # seed the pseudo-random number generator
    rng = np.random.default_rng(SEED)

    for rho in TRUE_RHO:
        emp_rho_p = utils.MatrixContainer(NREPS, M)  # positive class correlations
        emp_rho_n = utils.MatrixContainer(NREPS, M)  # negative class correlations
        emp_auc = utils.VectorContainer(NREPS, M)

        for n in range(NREPS):
            R, y = sample.data_set(TRUE_AUC, rho, PREVALENCE, N, seed=rng)
            for i in range(M):
                auc[i] = stats.rank_2_auc(R[i, :], y)
            emp_auc.append(auc)
            emp_rho_p.append(np.corrcoef(R[:, y == 1]))
            emp_rho_n.append(np.corrcoef(R[:, y == 0]))

        rho_string = "100xrho_{}".format(int(100 * rho))
        utils.hist(np.hstack([emp_rho_p.vals, emp_rho_n.vals]),
                   (rho, 1),
                   (r"$C_{ij}$ = " + str(rho) + r" for $i \neq j$",
                    r"$C_{ii}$ = 1"),
                   "Conditional correlation coefficients",
                   "{}/corr_{}.pdf".format(DIRECTORY, rho_string),
                   bins=NBINS)
        utils.hist(emp_auc.vals,
                   TRUE_AUC,
                   [r"AUC = " + str(a) for a in TRUE_AUC],
                   "AUC",
                   "{}/auc_{}.pdf".format(DIRECTORY, rho_string),
                   bins=NBINS)
    return 0
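# A conventional entry-point guard; this is an assumption about how the
# script is invoked, since the original excerpt ends at main():
if __name__ == "__main__":
    raise SystemExit(main())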
def _update_sim_data(self):
    """Update simulation data."""
    self.R, self.y = sample.data_set(self.auc, self.corr_coef,
                                     self.prevalence, self.N)
def main():
    # if the directory for figures does not exist, make it
    if not path.exists(path.join(path.dirname(__file__), DIRECTORY)):
        mkdir(path.join(path.dirname(__file__), DIRECTORY))

    # initialize dictionaries for per-classifier items

    # classifier class instances for training and evaluating performance
    classifiers = {"FiDEL": cls.FDensemble(),
                   "WOC": cls.Woc(),
                   "Best Ind": cls.BestInd()}

    # keep per-classifier performance statistics
    clstats = {"FiDEL": utils.StatsTable(CORRELATIONS.size, M.size),
               "WOC": utils.StatsTable(CORRELATIONS.size, M.size),
               "Best Ind": utils.StatsTable(CORRELATIONS.size, M.size)}
    corr_stats = utils.StatsTable(CORRELATIONS.size, M.size)

    # keep per-classifier data for plotting
    clplots = {"FiDEL": utils.DataForPlots("FiDEL", COLORS["FiDEL"]),
               "WOC": utils.DataForPlots("WOC", COLORS["WOC"]),
               "Best Ind": utils.DataForPlots("Best Ind.", COLORS["Best Ind"])}

    for i, corr in enumerate(CORRELATIONS):
        for j, m in enumerate(M):
            base_classifiers_auc = utils.get_auc(m, AUC_LIMITS)

            # initialize the data structure that keeps per-classifier
            # replicate performance data and computes statistics
            clreps = {"FiDEL": utils.ReplicateData(NREPS),
                      "WOC": utils.ReplicateData(NREPS),
                      "Best Ind": utils.ReplicateData(NREPS)}

            # initialize the data structure for storing correlation values
            corr_reps = utils.ReplicateData(NREPS)

            # generate NREPS synthetic data sets to train classifiers and
            # evaluate their performance
            for nrep in range(NREPS):
                # simulate data
                R, y = sample.data_set(base_classifiers_auc, corr,
                                       PREVALENCE, N)

                C = utils.compute_mean_cond_corr_matrix(R, y)
                C_upper_tri_vals = utils.extra_upper_diagonal(C)
                corr_reps.append(np.mean(C_upper_tri_vals))

                # train each classifier and evaluate its performance
                for key, cl in classifiers.items():
                    try:
                        cl.fit(R, y)
                    except NotImplementedError:
                        # The WOC classifier does not require any training;
                        # consequently, a call to Woc.fit() raises a
                        # NotImplementedError. It is safe to ignore this
                        # exception for the WOC classifier alone.
                        if key != "WOC":
                            raise
                    auc = stats.rank_2_auc(cl.compute_ranks(R), y)
                    clreps[key].append(auc)

            # update conditional correlation value statistics
            corr_stats.update(corr_reps, (i, j))

            # for each classifier, compute the mean and SEM of AUC over
            # replicate experiments and store them
            for key, wstats in clstats.items():
                wstats.update(clreps[key], (i, j))

    # With the data collected, make plots.

    # Given different conditional correlations, plot AUC vs. M base classifiers
    for i, corr in enumerate(CORRELATIONS):
        # store plot data for each classifier
        for key, wstats in clstats.items():
            clplots[key].update_data(x=M, y=wstats.mean[i, :],
                                     yerr=wstats.sem[i, :])
        savename = path.join(DIRECTORY,
                             "auc_100xcorr_{}.pdf".format(int(100 * corr)))
        ylabel = "AUC(M, {} = {:0.2f})".format(r"$\hat{r}$",
                                               np.mean(corr_stats.mean[i, :]))
        plot_errorbars(clplots,
                       xlabel="Number of Base Classifiers (M)",
                       ylabel=ylabel,
                       savename=savename)

    # Given different numbers of base classifiers, plot AUC vs. conditional
    # correlation
    for j, m in enumerate(M):
        # store plot data for each classifier
        for key, wstats in clstats.items():
            clplots[key].update_data(x=corr_stats.mean[:, j],
                                     xerr=corr_stats.sem[:, j],
                                     y=wstats.mean[:, j],
                                     yerr=wstats.sem[:, j])
        savename = path.join(DIRECTORY, "auc_m_{}.pdf".format(m))
        plot_errorbars(clplots,
                       xlabel=r"Class Conditioned Correlation ($\hat{r}$)",
                       ylabel="AUC({}, M = {})".format(r"$\hat{r}$", m),
                       savename=savename)
    return 0
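# As with the script above, an entry-point guard is assumed here so that
# main()'s return value becomes the process exit code:
if __name__ == "__main__":
    raise SystemExit(main())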
def test_inputs(self):
    """Verify that invalid inputs raise exceptions.

    Test 1: auc values outside [0, 1]
    Test 2: corr_coef outside [0, 1)
    Test 3: prevalence outside (0, 1) or of a type other than float
    Test 4: N must be an int greater than 1
    """
    M, N, prevalence, corr_coef = 5, 1000, 0.3, 0.1
    auc = np.linspace(0.6, 0.9, M)

    # Test 1
    with self.assertRaises(ValueError):
        sample.data_set(1.2, corr_coef, prevalence, N)
    with self.assertRaises(ValueError):
        sample.data_set(-0.2, corr_coef, prevalence, N)
    with self.assertRaises(ValueError):
        sample.data_set(np.linspace(0, 1.1, M), corr_coef, prevalence, N)
    with self.assertRaises(ValueError):
        sample.data_set(np.linspace(-0.23, 0.8, M), corr_coef, prevalence, N)

    # Test 2
    with self.assertRaises(ValueError):
        sample.data_set(auc, 1.2, prevalence, N)
    with self.assertRaises(ValueError):
        sample.data_set(auc, 1, prevalence, N)
    with self.assertRaises(ValueError):
        sample.data_set(auc, -0.2, prevalence, N)

    # Test 3
    with self.assertRaises(ValueError):
        sample.data_set(auc, corr_coef, 0, N)
    with self.assertRaises(ValueError):
        sample.data_set(auc, corr_coef, 1, N)
    with self.assertRaises(TypeError):
        sample.data_set(auc, corr_coef, (0.5, 0.5), N)
    with self.assertRaises(TypeError):
        sample.data_set(auc, corr_coef, [0.5, 0.5], N)
    with self.assertRaises(ValueError):
        sample.data_set(auc, corr_coef, np.array([0.5, 0.5]), N)
    # note: set(0.4, 0.5) would raise TypeError before data_set is called,
    # so set literals are used to actually exercise the type check
    with self.assertRaises(TypeError):
        sample.data_set(auc, corr_coef, {0.4, 0.5}, N)
    with self.assertRaises(TypeError):
        sample.data_set(auc, corr_coef, {0.4}, N)

    # Test 4
    with self.assertRaises(ValueError):
        sample.data_set(auc, corr_coef, 0.4, 1)
    with self.assertRaises(ValueError):
        sample.data_set(auc, corr_coef, 0.4, -41)
    with self.assertRaises(TypeError):
        sample.data_set(auc, corr_coef, 0.4, 34.2)
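# A minimal sketch of the N-sample checks that Test 4 above implies; the
# real checks live inside sample.data_set, and the helper name here is
# illustrative only:
def _check_n_samples(N):
    # floats are rejected outright (assumed basis for the TypeError case)
    if not isinstance(N, (int, np.integer)):
        raise TypeError("N must be an int, got {}".format(type(N)))
    # N must exceed 1 (assumed basis for the ValueError cases)
    if N <= 1:
        raise ValueError("N must be greater than 1, got {}".format(N))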