예제 #1
0
    def test_seed(self):
        """Check that the ``seed`` argument makes sampling reproducible.

        Two calls sharing a seed must return identical ranks and labels.
        An unseeded call, and a call with a different seed, are each expected
        to differ from the seeded result (and from each other) in more than
        ``0.5 * N`` rank entries.
        """
        n_methods, n_samples, corr_coef, prev = 20, 1000, 0, 0.3
        auc = np.linspace(0.5, 0.9, n_methods)
        seed = 341324
        min_mismatches = 0.5 * n_samples

        # labels of the unseeded / other-seed draws are not inspected
        ranks_unseeded, _ = sample.data_set(auc, corr_coef, prev, n_samples)
        ranks_a, labels_a = sample.data_set(auc, corr_coef, prev, n_samples,
                                            seed=seed)
        ranks_b, labels_b = sample.data_set(auc, corr_coef, prev, n_samples,
                                            seed=seed)
        ranks_other, _ = sample.data_set(auc, corr_coef, prev, n_samples,
                                         seed=seed - 10)

        # same seed -> identical output
        self.assertTrue((ranks_a == ranks_b).all())
        self.assertTrue((labels_a == labels_b).all())

        # different (or no) seed -> mostly different ranks
        self.assertGreater(np.count_nonzero(ranks_unseeded != ranks_a),
                           min_mismatches)
        self.assertGreater(np.count_nonzero(ranks_unseeded != ranks_other),
                           min_mismatches)
        self.assertGreater(np.count_nonzero(ranks_a != ranks_other),
                           min_mismatches)
예제 #2
0
    def setUp(self):
        """Simulate data for one base classifier and create the FD instance.

        Sets ``self.N`` and ``self.prevalence``, stores the squeezed rank
        array as ``self.r`` and the labels as ``self.y``, and binds a new
        ``cl.FD`` object to ``self.cl``.
        """
        self.N = 500
        self.prevalence = 0.3

        ranks, labels = sample.data_set(np.array([0.9]), 0,
                                        self.prevalence, self.N)

        # keep the ranks with length-one axes removed
        self.r = ranks.squeeze()
        self.y = labels

        self.cl = cl.FD()
예제 #3
0
    def test_1d(self):
        """Validate sampling when a single scalar AUC is supplied.

        Test 1: R and y have the expected dimensions
        Test 2: the only row of R holds exactly the ranks 1..N
        Test 3: y holds exactly the binary labels 0 and 1
        Test 4: the number of positive labels equals int(N * prevalence)
        """
        n_samples = 1000
        prevalence = 0.3
        expected_ranks = np.arange(1, n_samples + 1)
        expected_labels = [0, 1]

        R, y = sample.data_set(0.7, 0, prevalence, n_samples)

        # Test 1
        self.assertEqual(y.ndim, 1)
        self.assertEqual(R.ndim, 2)
        self.assertEqual(R.shape, (1, n_samples))
        self.assertEqual(y.size, n_samples)

        # Test 2: set differences in both directions must be empty
        rank_pairs = ((expected_ranks, R[0, :]), (R[0, :], expected_ranks))
        for lhs, rhs in rank_pairs:
            self.assertEqual(np.setdiff1d(lhs, rhs).size, 0)

        # Test 3: set differences in both directions must be empty
        label_pairs = ((y, expected_labels), (expected_labels, y))
        for lhs, rhs in label_pairs:
            self.assertEqual(np.setdiff1d(lhs, rhs).size, 0)

        # Test 4
        n_positive = np.sum(y)
        self.assertEqual(n_positive, int(n_samples * prevalence))
예제 #4
0
    def test_output(self):
        """Validate generic output properties over a grid of parameters.

        For every combination of number of methods, sample size, prevalence
        and correlation coefficient:

        Test 1: R is 2-d with shape (M, N) and y is 1-d with N entries
        Test 2: y contains exactly the binary values 0 and 1
        Test 3: the number of positive labels equals int(N * prevalence)
        Test 4: every row of R contains exactly the ranks 1..N
        """
        sample_sizes = np.array([100, 1000])
        prevalences = np.linspace(0.1, 0.9, 9)
        method_counts = np.arange(5, 10)
        correlations = np.linspace(0, 0.9, 10)

        # reference label set for the two-way set comparison
        binary_labels = [0, 1]

        for m in method_counts:
            auc = np.linspace(0.5, 0.9, m)

            for n in sample_sizes:
                # reference rank set for the two-way set comparison
                all_ranks = np.arange(1, n + 1)

                for prev in prevalences:
                    for corr_coef in correlations:
                        R, y = sample.data_set(auc, corr_coef, prev, n)

                        # Test 1
                        self.assertEqual(R.ndim, 2)
                        self.assertEqual(y.ndim, 1)
                        self.assertEqual(R.shape, (m, n))
                        self.assertEqual(R.shape[1], y.size)

                        # Test 2
                        extra_labels = np.setdiff1d(y, binary_labels)
                        self.assertEqual(extra_labels.size, 0)
                        missing_labels = np.setdiff1d(binary_labels, y)
                        self.assertEqual(missing_labels.size, 0)

                        # Test 3
                        self.assertEqual(np.sum(y), int(n * prev))

                        # Test 4
                        for i in range(m):
                            row = R[i, :]
                            extra_ranks = np.setdiff1d(row, all_ranks)
                            self.assertEqual(extra_ranks.size, 0)
                            missing_ranks = np.setdiff1d(all_ranks, row)
                            self.assertEqual(missing_ranks.size, 0)
예제 #5
0
    def _simulate_and_fit(self, auc=None, prevalence=None):
        """Simulate a data set and fit ``self.cl`` to it.

        Args:
            auc: AUC used for the simulation.  ``None`` selects ``self.auc``.
                Anything that is not already an ``np.ndarray`` is wrapped in
                a one-element array before sampling.
            prevalence: positive class prevalence.  ``None`` selects
                ``self.prevalence``.

        The simulated ranks are squeezed and passed, together with the
        labels, to ``self.cl.fit``.  Nothing is returned; callers inspect
        the fitted ``self.cl`` afterwards.
        """
        if isinstance(auc, np.ndarray):
            auc_array = auc
        else:
            # None falls back to the instance default; both the default and
            # any other non-array value are wrapped in a one-element array
            auc_array = np.array([self.auc if auc is None else auc])

        prev = self.prevalence if prevalence is None else prevalence

        ranks, labels = sample.data_set(auc_array, self.corr_coef,
                                        prev, self.N)

        self.cl.fit(ranks.squeeze(), labels)
예제 #6
0
def main():
    """Simulate rank data for each correlation and save summary histograms.

    For every value in ``TRUE_RHO`` the function draws ``NREPS`` data sets
    with ``sample.data_set``, records the empirical AUC of each of the ``M``
    rows of ranks and the correlation matrices of the ranks within the
    positive (``y == 1``) and negative (``y == 0``) class, and then writes
    two histograms into ``DIRECTORY`` via ``utils.hist``.

    Returns:
        int: always 0.
    """
    # exist_ok avoids the check-then-create race and also creates any
    # missing parent directories
    os.makedirs(DIRECTORY, exist_ok=True)

    # one generator, seeded once, is handed to every simulation call
    rng = np.random.default_rng(SEED)

    for rho in TRUE_RHO:
        emp_rho_p = utils.MatrixContainer(NREPS, M)  # positive class corrs
        emp_rho_n = utils.MatrixContainer(NREPS, M)  # negative class corrs
        emp_auc = utils.VectorContainer(NREPS, M)

        for _ in range(NREPS):
            R, y = sample.data_set(TRUE_AUC, rho, PREVALENCE, N, seed=rng)

            # Allocate a fresh array per replicate: if the container keeps a
            # reference instead of copying, reusing one buffer would make
            # every stored replicate alias the last one.
            auc = np.zeros(M)
            for i in range(M):
                auc[i] = stats.rank_2_auc(R[i, :], y)

            emp_auc.append(auc)
            emp_rho_p.append(np.corrcoef(R[:, y == 1]))
            emp_rho_n.append(np.corrcoef(R[:, y == 0]))

        rho_string = "100xrho_{}".format(int(100 * rho))

        # note the space before "for": without it the label reads "0.5for"
        corr_labels = (r"$C_{ij}$ = " + str(rho) + r" for $i \neq j$",
                       r"$C_{ii}$ = 1")
        utils.hist(
            np.hstack([emp_rho_p.vals, emp_rho_n.vals]), (rho, 1),
            corr_labels,
            "Conditional correlation coefficients",
            "{}/corr_{}.pdf".format(DIRECTORY, rho_string),
            bins=NBINS)

        auc_labels = [r"AUC = " + str(true_auc) for true_auc in TRUE_AUC]
        utils.hist(emp_auc.vals,
                   TRUE_AUC, auc_labels,
                   "AUC",
                   "{}/auc_{}.pdf".format(DIRECTORY, rho_string),
                   bins=NBINS)

    return 0
예제 #7
0
 def _update_sim_data(self):
     """Regenerate ``self.R`` and ``self.y`` from the current settings.

     Calls ``sample.data_set`` with the instance's ``auc``, ``corr_coef``,
     ``prevalence`` and ``N`` attributes, in that order.
     """
     sim_args = (self.auc, self.corr_coef, self.prevalence, self.N)
     self.R, self.y = sample.data_set(*sim_args)
예제 #8
0
def main():
    """Benchmark ensemble classifiers on simulated data and save AUC plots.

    For every conditional correlation in ``CORRELATIONS`` and every number
    of base classifiers in ``M``, ``NREPS`` data sets are simulated.  Each
    classifier ("FiDEL", "WOC", "Best Ind") is fitted to every data set and
    scored with ``stats.rank_2_auc``; the mean upper-triangular value of the
    class conditioned correlation matrix is recorded as well.  Two families
    of error-bar plots are then written to the figure directory:
    AUC vs. M for each correlation, and AUC vs. measured correlation for
    each M.

    Returns:
        int: always 0.
    """
    # Resolve the figure directory once, relative to this script, so that
    # the directory that gets created is the same one the plots are saved
    # to regardless of the current working directory.
    fig_dir = path.join(path.dirname(__file__), DIRECTORY)
    if not path.exists(fig_dir):
        mkdir(fig_dir)

    # classifier class instances for training and evaluating performance
    classifiers = {"FiDEL": cls.FDensemble(),
                   "WOC": cls.Woc(),
                   "Best Ind": cls.BestInd()}

    # keep per classifier performance statistics
    clstats = {key: utils.StatsTable(CORRELATIONS.size, M.size)
               for key in classifiers}

    corr_stats = utils.StatsTable(CORRELATIONS.size, M.size)

    # keep per classifier data for plotting
    clplots = {"FiDEL": utils.DataForPlots("FiDEL", COLORS["FiDEL"]),
               "WOC": utils.DataForPlots("WOC", COLORS["WOC"]),
               "Best Ind": utils.DataForPlots("Best Ind.", COLORS["Best Ind"])}

    for i, corr in enumerate(CORRELATIONS):

        for j, m in enumerate(M):

            base_classifiers_auc = utils.get_auc(m, AUC_LIMITS)

            # initialize data structure which keeps per classifier
            # replicate performance data and computes statistics
            clreps = {key: utils.ReplicateData(NREPS) for key in classifiers}

            # initialize data structure for storing correlation values
            corr_reps = utils.ReplicateData(NREPS)

            # Generate NREPS synthetic data sets to train and evaluate
            # classifier performance
            for _ in range(NREPS):

                # simulate data
                R, y = sample.data_set(base_classifiers_auc, corr,
                                       PREVALENCE, N)

                C = utils.compute_mean_cond_corr_matrix(R, y)
                C_upper_tri_vals = utils.extra_upper_diagonal(C)
                corr_reps.append(np.mean(C_upper_tri_vals))

                # train classifier and evaluate performance
                for key, cl in classifiers.items():
                    try:
                        cl.fit(R, y)
                    except NotImplementedError:
                        # WOC classifier does not require any training and
                        # consequently a call to Woc.fit() results in a
                        # NotImplementedError.  Safe to ignore this
                        # exception for the WOC classifier alone.
                        if key != "WOC":
                            raise

                    auc = stats.rank_2_auc(cl.compute_ranks(R), y)
                    clreps[key].append(auc)

            # Update conditional correlation value statistics
            corr_stats.update(corr_reps, (i, j))

            # For each classifier, compute mean and sem of auc
            # over replicate experiments and store
            for key, wstats in clstats.items():
                wstats.update(clreps[key], (i, j))

    # With data collected, make plots

    # Given different conditional correlations, plot AUC vs. M base classifiers
    for i, corr in enumerate(CORRELATIONS):

        # Store classifier plot data for each classifier
        for key, wstats in clstats.items():
            clplots[key].update_data(x=M,
                                     y=wstats.mean[i, :],
                                     yerr=wstats.sem[i, :])

        savename = path.join(fig_dir,
                             "auc_100xcorr_{}.pdf".format(int(100*corr)))
        ylabel = "AUC(M, {} = {:0.2f})".format(r"$\hat{r}$",
                                               np.mean(corr_stats.mean[i, :]))

        plot_errorbars(clplots,
                       xlabel="Number of Base Classifiers (M)",
                       ylabel=ylabel,
                       savename=savename)

    # Given different number of base classifiers, plot AUC vs. conditional
    # correlation
    for j, m in enumerate(M):

        # Store classifier plot data for each classifier
        for key, wstats in clstats.items():
            clplots[key].update_data(x=corr_stats.mean[:, j],
                                     xerr=corr_stats.sem[:, j],
                                     y=wstats.mean[:, j],
                                     yerr=wstats.sem[:, j])

        savename = path.join(fig_dir, "auc_m_{}.pdf".format(m))

        plot_errorbars(clplots,
                       xlabel=r"Class Conditioned Correlation ($\hat{r}$)",
                       ylabel="AUC({}, M = {})".format(r"$\hat{r}$", m),
                       savename=savename)

    return 0
예제 #9
0
    def test_inputs(self):
        """Verify that invalid inputs raise exceptions.

        Test 1: auc value outside [0, 1]
        Test 2: corr_coef outside [0, 1)
        Test 3: prevalence outside (0, 1) or a type other than float
        Test 4: N samples must be greater than 1 and cannot be a float

        Each invalid value runs in its own ``subTest`` so that a failure
        names the offending input and the remaining cases still execute.
        """
        M, N, prevalence, corr_coef = 5, 1000, 0.3, 0.1
        auc = np.linspace(0.6, 0.9, M)

        # Test 1
        bad_aucs = (1.2,
                    -0.2,
                    np.linspace(0, 1.1, M),
                    np.linspace(-0.23, 0.8, M))
        for bad_auc in bad_aucs:
            with self.subTest(auc=bad_auc):
                with self.assertRaises(ValueError):
                    sample.data_set(bad_auc, corr_coef, prevalence, N)

        # Test 2
        for bad_corr in (1.2, 1, -0.2):
            with self.subTest(corr_coef=bad_corr):
                with self.assertRaises(ValueError):
                    sample.data_set(auc, bad_corr, prevalence, N)

        # Test 3
        # The sets are written as literals: set(0.4, 0.5) raises TypeError
        # by itself (set() takes at most one argument), which would satisfy
        # assertRaises without sample.data_set ever being called.
        bad_prevalences = ((0, ValueError),
                           (1, ValueError),
                           ((0.5, 0.5), TypeError),
                           ([0.5, 0.5], TypeError),
                           (np.array([0.5, 0.5]), ValueError),
                           ({0.4, 0.5}, TypeError),
                           ({0.4}, TypeError))
        for bad_prev, expected_error in bad_prevalences:
            with self.subTest(prevalence=bad_prev):
                with self.assertRaises(expected_error):
                    sample.data_set(auc, corr_coef, bad_prev, N)

        # Test 4
        bad_sample_counts = ((1, ValueError),
                             (-41, ValueError),
                             (34.2, TypeError))
        for bad_n, expected_error in bad_sample_counts:
            with self.subTest(N=bad_n):
                with self.assertRaises(expected_error):
                    sample.data_set(auc, corr_coef, 0.4, bad_n)