Example #1
def job_nfsicJ10_med(paired_source, tr, te, r, n_permute=None):
    """
    NFSIC in which the test locations are randomized, and the Gaussian widths
    are set with the median heuristic. Use the full sample; no training/test split.
    J=10
    """
    J = 10
    pdata = tr + te
    with util.ContextTimer() as t:
        #V, W = it.GaussNFSIC.init_locs_2randn(pdata, J, seed=r+2)
        # May overfit and increase type-I errors?
        #V, W = it.GaussNFSIC.init_locs_joint_randn(pdata, J, seed=r+2)
        with util.NumpySeedContext(seed=r + 92):
            dx = pdata.dx()
            dy = pdata.dy()
            V = np.random.randn(J, dx)
            W = np.random.randn(J, dy)
        k, l = kl_kgauss_median(pdata)

        nfsic_med = it.NFSIC(k,
                             l,
                             V,
                             W,
                             alpha=alpha,
                             reg='auto',
                             n_permute=n_permute,
                             seed=r + 3)
        nfsic_med_result = nfsic_med.perform_test(pdata)
    return {
        'indtest': nfsic_med,
        'test_result': nfsic_med_result,
        'time_secs': t.secs
    }
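
Note: the helper kl_kgauss_median is used throughout these examples but is not shown. Below is a minimal sketch consistent with how it is called (a pair of median-heuristic Gaussian kernels for X and Y); the fsic.kernel.KGauss name and module layout are assumptions and the real helper may differ.

import fsic.util as util
import fsic.kernel as kernel   # assumed module; KGauss(sigma2) is a Gaussian kernel

def kl_kgauss_median(pdata):
    """Return Gaussian kernels for X and Y with widths set by the median heuristic.
    A sketch only, not necessarily the helper used in the experiment scripts."""
    X, Y = pdata.xy()
    medx = util.meddistance(X, subsample=1000)
    medy = util.meddistance(Y, subsample=1000)
    k = kernel.KGauss(medx**2)
    l = kernel.KGauss(medy**2)
    return k, l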
Example #2
def job_rdcperm_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test with permutations.
    """
    pdata = tr + te
    n_permute = 500
    # n_features=10 from Lopez-Paz et al., 2013 paper.
    with util.ContextTimer() as t:
        # get the median distances
        X, Y = pdata.xy()
        # copula transform to both X and Y
        cop_map = fea.MarginalCDFMap()
        Xcdf = cop_map.gen_features(X)
        Ycdf = cop_map.gen_features(Y)

        medx = util.meddistance(Xcdf, subsample=1000)
        medy = util.meddistance(Ycdf, subsample=1000)
        sigmax2 = medx**2
        sigmay2 = medy**2

        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 19)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 220)
        rdcperm = it.RDCPerm(fmx,
                             fmy,
                             n_permute=n_permute,
                             alpha=alpha,
                             seed=r + 100)
        rdcperm_result = rdcperm.perform_test(pdata)
    return {
        'indtest': rdcperm,
        'test_result': rdcperm_result,
        'time_secs': t.secs
    }
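
Note: the "copula transform" replaces each coordinate by its empirical marginal CDF value (a rank transform into (0, 1]). A minimal numpy sketch of the idea follows; fea.MarginalCDFMap may implement it differently.

import numpy as np
from scipy.stats import rankdata

def empirical_copula_transform(X):
    """Map each column of X (n x d) to its empirical marginal CDF values."""
    n = X.shape[0]
    return np.column_stack([rankdata(X[:, j]) / n for j in range(X.shape[1])])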
Example #3
def job_fhsic_med(paired_source, tr, te, r):
    """
    HSIC with random Fourier features. Simulate the null distribution
    with the spectra of the empirical cross-covariance operators.
    - Gaussian kernels.
    - No parameter selection procedure. Use the median heuristic for both 
    X and Y.
    - Use full sample for testing. 
    """

    n_simulate = 2000
    # random features
    n_features = 10
    # use full sample for testing. Merge training and test sets
    pdata = tr + te
    with util.ContextTimer() as t:
        X, Y = pdata.xy()
        medx = util.meddistance(X, subsample=1000)
        medy = util.meddistance(Y, subsample=1000)
        sigmax2 = medx**2
        sigmay2 = medy**2

        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 1)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 2)
        ffhsic = it.FiniteFeatureHSIC(fmx,
                                      fmy,
                                      n_simulate=n_simulate,
                                      alpha=alpha,
                                      seed=r + 89)
        ffhsic_result = ffhsic.perform_test(pdata)
    return {
        'indtest': ffhsic,
        'test_result': ffhsic_result,
        'time_secs': t.secs
    }
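
Note: RFFKGauss presumably builds random Fourier features for a Gaussian kernel of width sigma2 (Rahimi & Recht, 2007). A minimal sketch of such a feature map is given below; the exact scaling used by fea.RFFKGauss may differ.

import numpy as np

def rff_gauss_features(X, sigma2, n_features, seed=0):
    """Random Fourier features z(x) with z(x).dot(z(y)) approximately
    exp(-||x - y||^2 / (2 * sigma2))."""
    rng = np.random.RandomState(seed)
    d = X.shape[1]
    W = rng.randn(d, n_features) / np.sqrt(sigma2)  # frequencies ~ N(0, I / sigma2)
    b = rng.uniform(0, 2 * np.pi, n_features)       # random phases
    return np.sqrt(2.0 / n_features) * np.cos(X.dot(W) + b)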
Example #4
def job_nyhsic_med(paired_source, tr, te, r):
    """
    HSIC with the Nystrom approximation. Simulate the null distribution
    with the spectra of the empirical cross-covariance operators.
    - Gaussian kernels.
    - No parameter selection procedure. Use the median heuristic for both 
    X and Y.
    - Use full sample for testing. 
    """

    n_simulate = 2000
    # number of inducing points for the Nystrom approximation
    n_features = 10
    # use full sample for testing. Merge training and test sets
    pdata = tr + te
    with util.ContextTimer() as t:
        X, Y = pdata.xy()
        k, l = kl_kgauss_median(pdata)
        # randomly choose the inducing points from X, Y
        induce_x = util.subsample_rows(X, n_features, seed=r + 2)
        induce_y = util.subsample_rows(Y, n_features, seed=r + 3)

        nyhsic = it.NystromHSIC(k,
                                l,
                                induce_x,
                                induce_y,
                                n_simulate=n_simulate,
                                alpha=alpha,
                                seed=r + 89)
        nyhsic_result = nyhsic.perform_test(pdata)
    return {
        'indtest': nyhsic,
        'test_result': nyhsic_result,
        'time_secs': t.secs
    }
Example #5
    def compute(self):

        # Randomly wait a few seconds so that multiple processes accessing the
        # same Theano function do not run into a lock problem. It is unclear
        # whether this actually helps. Sleep time in seconds.
        time.sleep(np.random.rand() * 3)

        paired_source = self.paired_source
        r = self.rep
        n = self.n
        job_func = self.job_func

        pdata = paired_source.sample(n, seed=r)
        with util.ContextTimer() as t:
            logger.info("computing. %s. prob=%s, r=%d, n=%d" %
                        (job_func.__name__, pdata.label, r, n))
            tr, te = pdata.split_tr_te(tr_proportion=tr_proportion,
                                       seed=r + 21)
            prob_label = self.prob_label

            job_result = job_func(paired_source, tr, te, r)

            # create a SingleResult instance
            result = SingleResult(job_result)
            # submit the result to my own aggregator
            self.aggregator.submit_result(result)
            func_name = job_func.__name__
        logger.info("done. ex1: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                    (func_name, pdata.label, r, n, t.secs))

        # save result
        fname = '%s-%s-r%d_n%d_a%.3f_trp%.2f.p' \
            %(prob_label, func_name,  r, n, alpha, tr_proportion)
        glo.ex_save_result(ex, job_result, prob_label, fname)
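
Note: the job_* functions all share the (paired_source, tr, te, r) signature, so a standalone run mirroring the compute() flow above might look roughly like the sketch below. The PairedSource construction and the module-level alpha and tr_proportion values are assumptions.

# hypothetical standalone run of one job function (sketch only)
alpha = 0.05          # assumed module-level significance level
tr_proportion = 0.5   # assumed module-level training proportion
n, r = 800, 0
# paired_source: any PairedSource instance (construction omitted here)
pdata = paired_source.sample(n, seed=r)
tr, te = pdata.split_tr_te(tr_proportion=tr_proportion, seed=r + 21)
out = job_nfsicJ10_med(paired_source, tr, te, r)
print(out['test_result'])  # dict with e.g. 'h0_rejected', 'pvalue', 'test_stat'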
Example #6
def job_qhsic_med(paired_source, tr, te, r):
    """
    Quadratic-time HSIC using the permutation test.
    - Gaussian kernels.
    - No parameter selection procedure. Use the median heuristic for both 
    X and Y.
    - Use full sample for testing. 
    """
    # use full sample for testing. Merge training and test sets

    pdata = tr + te
    n_permute = 500

    if pdata.sample_size() >= 5000:
        # Give up: the quadratic-time test is too expensive at this sample size.
        # Return a placeholder non-rejection result instead.
        k, l = kl_kgauss_median(pdata)
        qhsic = it.QuadHSIC(k, l, n_permute, alpha=alpha, seed=r + 1)
        fake_result = {
            'alpha': alpha,
            'pvalue': 1,
            'test_stat': -1,
            'h0_rejected': False,
            'time_secs': 0,
            'n_permute': n_permute
        }
        return {'indtest': qhsic, 'test_result': fake_result, 'time_secs': 0}

    # Actually do the test
    with util.ContextTimer() as t:
        k, l = kl_kgauss_median(pdata)
        qhsic = it.QuadHSIC(k, l, n_permute, alpha=alpha, seed=r + 1)
        qhsic_result = qhsic.perform_test(pdata)
    return {'indtest': qhsic, 'test_result': qhsic_result, 'time_secs': t.secs}
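
Note: the >= 5000 cutoff above exists because the quadratic-time test needs full n x n Gram matrices. A minimal numpy sketch of the biased HSIC statistic with a permutation p-value follows; this illustrates the idea and is not it.QuadHSIC's implementation.

import numpy as np

def gauss_gram(X, sigma2):
    """Gaussian Gram matrix exp(-||x_i - x_j||^2 / (2 * sigma2))."""
    sq = np.sum(X**2, axis=1)
    D2 = sq[:, None] + sq[None, :] - 2 * X.dot(X.T)
    return np.exp(-D2 / (2.0 * sigma2))

def hsic_perm_pvalue(X, Y, sigma2x, sigma2y, n_permute=500, seed=0):
    """Biased HSIC estimate and a permutation p-value."""
    n = X.shape[0]
    K = gauss_gram(X, sigma2x)
    L = gauss_gram(Y, sigma2y)
    H = np.eye(n) - np.ones((n, n)) / n
    Kc = H.dot(K).dot(H)
    stat = np.sum(Kc * L) / n**2  # (1/n^2) * trace(K H L H)
    rng = np.random.RandomState(seed)
    null = np.empty(n_permute)
    for i in range(n_permute):
        perm = rng.permutation(n)           # permute the Y sample
        null[i] = np.sum(Kc * L[np.ix_(perm, perm)]) / n**2
    return stat, np.mean(null >= stat)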
Example #7
def job_nfsicJ3_opt(paired_source, tr, te, r, J=3):
    """NFSIC with test locations optimzied.  """
    with util.ContextTimer() as t:
        nfsic_opt_options = {
            'n_test_locs': J,
            'max_iter': 200,
            'V_step': 1,
            'W_step': 1,
            'gwidthx_step': 1,
            'gwidthy_step': 1,
            'batch_proportion': 1.0,
            'tol_fun': 1e-4,
            'step_pow': 0.5,
            'seed': r + 2,
            'reg': 1e-6
        }
        op_V, op_W, op_gwx, op_gwy, info = it.GaussNFSIC.optimize_locs_widths(
            tr, alpha, **nfsic_opt_options)
        nfsic_opt = it.GaussNFSIC(op_gwx,
                                  op_gwy,
                                  op_V,
                                  op_W,
                                  alpha,
                                  reg='auto',
                                  seed=r + 3)
        nfsic_opt_result = nfsic_opt.perform_test(te)
    return {
        'indtest': nfsic_opt,
        'test_result': nfsic_opt_result,
        'time_secs': t.secs
    }
Example #8
def job_rdcperm_nc_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test with permutations.
    No copula transformation. The median heuristic is applied directly to the data.
    """
    pdata = tr + te
    n_permute = 500
    # n_features=10 from Lopez-Paz et al., 2013 paper.
    with util.ContextTimer() as t:
        # get the median distances
        X, Y = pdata.xy()

        medx = util.meddistance(X, subsample=1000)
        medy = util.meddistance(Y, subsample=1000)
        sigmax2 = medx**2
        sigmay2 = medy**2

        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 19)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 220)
        rdcperm = it.RDCPerm(fmx,
                             fmy,
                             n_permute=n_permute,
                             alpha=alpha,
                             seed=r + 100,
                             use_copula=False)
        rdcperm_result = rdcperm.perform_test(pdata)
    return {
        'indtest': rdcperm,
        'test_result': rdcperm_result,
        'time_secs': t.secs
    }
Example #9
def job_rdc_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test.
    - Gaussian width = median heuristic on the copula-transformed data 
    - 10 random features each for X and Y
    - Use full dataset for testing
    """
    pdata = tr + te
    # n_features=10 from Lopez-Paz et al., 2013 paper.
    with util.ContextTimer() as t:
        # get the median distances
        X, Y = pdata.xy()
        # copula transform to both X and Y
        cop_map = fea.MarginalCDFMap()
        Xcdf = cop_map.gen_features(X)
        Ycdf = cop_map.gen_features(Y)

        medx = util.meddistance(Xcdf, subsample=1000)
        medy = util.meddistance(Ycdf, subsample=1000)
        sigmax2 = medx**2
        sigmay2 = medy**2

        fmx = fea.RFFKGauss(sigmax2, n_features=n_features, seed=r + 19)
        fmy = fea.RFFKGauss(sigmay2, n_features=n_features, seed=r + 220)
        rdc = it.RDC(fmx, fmy, alpha=alpha)
        rdc_result = rdc.perform_test(pdata)
    return {'indtest': rdc, 'test_result': rdc_result, 'time_secs': t.secs}
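
Note: RDC (Lopez-Paz et al., 2013) takes the largest canonical correlation between the two random-feature maps of the copula-transformed data. A minimal numpy sketch of that statistic is shown below; it.RDC may compute it, and its null distribution, differently.

import numpy as np

def rdc_statistic(Zx, Zy, reg=1e-8):
    """Largest canonical correlation between feature matrices Zx, Zy (both n x D)."""
    Zx = Zx - Zx.mean(axis=0)
    Zy = Zy - Zy.mean(axis=0)
    n = Zx.shape[0]
    Cxx = Zx.T.dot(Zx) / n + reg * np.eye(Zx.shape[1])
    Cyy = Zy.T.dot(Zy) / n + reg * np.eye(Zy.shape[1])
    Cxy = Zx.T.dot(Zy) / n
    # Eigenvalues of Cxx^-1 Cxy Cyy^-1 Cyx are the squared canonical correlations.
    M = np.linalg.solve(Cxx, Cxy).dot(np.linalg.solve(Cyy, Cxy.T))
    return np.sqrt(max(np.max(np.linalg.eigvals(M).real), 0.0))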
Example #10
def job_nfsic_grid(paired_source, tr, te, r):
    """
    NFSIC where the test locations are randomized, and the Gaussian widths 
    are optimized by a grid search.
    """
    # randomize the test locations by fitting Gaussians to the data
    with util.ContextTimer() as t:
        V, W = it.GaussNFSIC.init_locs_2randn(tr, J, seed=r + 2)
        xtr, ytr = tr.xy()
        n_gwidth_cand = 30
        gwidthx_factors = 2.0**np.linspace(-4, 4, n_gwidth_cand)
        gwidthy_factors = gwidthx_factors
        #gwidthy_factors = 2.0**np.linspace(-3, 4, 40)
        medx = util.meddistance(xtr, 1000)
        medy = util.meddistance(ytr, 1000)
        list_gwidthx = (medx**2) * gwidthx_factors
        list_gwidthy = (medy**2) * gwidthy_factors

        bestij, lambs = it.GaussNFSIC.grid_search_gwidth(
            tr, V, W, list_gwidthx, list_gwidthy)
        # These are width^2
        best_widthx = list_gwidthx[bestij[0]]
        best_widthy = list_gwidthy[bestij[1]]

        # perform test
        nfsic_grid = it.GaussNFSIC(best_widthx, best_widthy, V, W, alpha)
        nfsic_grid_result = nfsic_grid.perform_test(te)
    return {
        'indtest': nfsic_grid,
        'test_result': nfsic_grid_result,
        'time_secs': t.secs
    }
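
Note: grid_search_gwidth above presumably evaluates the NFSIC objective on the training data for every (gwidthx, gwidthy) pair and returns the index of the best pair. A generic sketch of such a 2-D grid search follows; score_fn is a placeholder for whatever criterion is maximized.

import numpy as np

def grid_search_2d(list_gwidthx, list_gwidthy, score_fn):
    """Return the (i, j) index pair maximizing score_fn(gwx, gwy), plus all scores."""
    scores = np.zeros((len(list_gwidthx), len(list_gwidthy)))
    for i, gwx in enumerate(list_gwidthx):
        for j, gwy in enumerate(list_gwidthy):
            scores[i, j] = score_fn(gwx, gwy)
    best_ij = np.unravel_index(np.argmax(scores), scores.shape)
    return best_ij, scores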
Example #11
def job_nfsicJ10_stoopt(paired_source, tr, te, r, n_permute=None):
    """
    NFSIC with J=10 test locations. Test locations and Gaussian widths are
    optimized on the training set; the optimized widths are then clamped to a
    range around the median-heuristic widths.
    """
    J = 10
    k, l = kl_kgauss_median(tr)
    medx2 = k.sigma2
    medy2 = l.sigma2

    fac_min = 5e-2
    fac_max = 5e3

    with util.ContextTimer() as t:

        nfsic_opt_options = {
            'n_test_locs': J,
            'max_iter': 100,
            'V_step': 1,
            'W_step': 1,
            'gwidthx_step': 1,
            'gwidthy_step': 1,
            'batch_proportion': 1,
            'tol_fun': 1e-4,
            'step_pow': 0.5,
            'seed': r + 2,
            'reg': 1e-6,
            'gwidthx_lb': medx2 * 1e-3,
            'gwidthx_ub': medx2 * 1e3,
            'gwidthy_lb': medy2 * 1e-3,
            'gwidthy_ub': medy2 * 1e3
        }
        op_V, op_W, op_gwx, op_gwy, info = it.GaussNFSIC.optimize_locs_widths(
            tr, alpha, **nfsic_opt_options)

        # Clamp the optimized widths to [max(fac_min * med2, 1e-5), fac_max * med2]
        # so that they do not become too extreme.
        #last_gwx = info['gwidthxs'][-1]
        #last_gwy = info['gwidthys'][-1]
        #op_gwx = last_gwx
        #op_gwy = last_gwy
        op_gwx = max(fac_min * medx2, 1e-5, min(fac_max * medx2, op_gwx))
        op_gwy = max(fac_min * medy2, 1e-5, min(fac_max * medy2, op_gwy))

        nfsic_opt = it.GaussNFSIC(op_gwx,
                                  op_gwy,
                                  op_V,
                                  op_W,
                                  alpha=alpha,
                                  reg='auto',
                                  n_permute=n_permute,
                                  seed=r + 3)
        nfsic_opt_result = nfsic_opt.perform_test(te)
    return {
        'indtest': nfsic_opt,
        'test_result': nfsic_opt_result,
        'time_secs': t.secs
    }
Example #12
def job_qhsic_med(paired_source, tr, te, r):
    """
    Quadratic-time HSIC using the permutation test.
    - Gaussian kernels.
    - No parameter selection procedure. Use the median heuristic for both 
    X and Y.
    - Use full sample for testing. 
    """
    # use full sample for testing. Merge training and test sets
    pdata = tr + te
    n_permute = 300
    with util.ContextTimer() as t:
        k, l = kl_kgauss_median(pdata)
        qhsic = it.QuadHSIC(k, l, n_permute, alpha=alpha, seed=r + 1)
        qhsic_result = qhsic.perform_test(pdata)
    return {'indtest': qhsic, 'test_result': qhsic_result, 'time_secs': t.secs}
Example #13
def job_nfsic_med(paired_source, tr, te, r):
    """
    NFSIC in which the test locations are randomized, and the Gaussian widths
    are set with the median heuristic. Use the full sample; no training/test split.
    """
    pdata = tr + te
    with util.ContextTimer() as t:
        V, W = it.GaussNFSIC.init_locs_2randn(pdata, J, seed=r + 2)
        k, l = kl_kgauss_median(pdata)
        nfsic_med = it.NFSIC(k, l, V, W, alpha=alpha, reg='auto')
        nfsic_med_result = nfsic_med.perform_test(pdata)
    return {
        #'indtest': nfsic_med,
        'test_result': nfsic_med_result,
        'time_secs': t.secs
    }
Example #14
    def compute(self):

        # Randomly wait a few seconds so that multiple processes accessing the
        # same Theano function do not run into a lock problem. It is unclear
        # whether this actually helps. Sleep time in seconds.
        time.sleep(np.random.rand() * 2)

        # Load the data and construct a PairedSource here. The data can be big,
        # so each computing node loads it by itself inside this job function
        # (no data is passed between nodes).
        folder_path = self.folder_path
        prob_label = self.prob_label
        paired_source, _, is_h0 = exglo.get_problem_pickle(
            folder_path, prob_label + '.n0')

        n = self.n
        r = self.rep
        job_func = self.job_func

        pdata = paired_source.sample(n, seed=r)
        with util.ContextTimer() as t:
            logger.info("computing. %s. prob=%s, r=%d, n=%d" %
                        (job_func.__name__, pdata.label, r, n))
            tr, te = pdata.split_tr_te(tr_proportion=tr_proportion,
                                       seed=r + 21)
            prob_label = self.prob_label

            job_result = job_func(paired_source, tr, te, r)

            # create a SingleResult instance
            result = SingleResult(job_result)
            # submit the result to my own aggregator
            self.aggregator.submit_result(result)
            func_name = job_func.__name__
        logger.info("done. ex1: %s, prob=%s, r=%d, n=%d. Took: %.3g s " %
                    (func_name, pdata.label, r, n, t.secs))

        # save result
        fname = '%s-%s-r%d_n%d_a%.3f_trp%.2f.p' \
            %(prob_label, func_name, r, n, alpha, tr_proportion)
        glo.ex_save_result(ex, job_result, prob_label, fname)
Example #15
def job_nfsicJ10_cperm_stoopt(paired_source, tr, te, r):
    """
    - Copula transform the data
    - Use permutations to simulate from the null distribution.
    """
    n_permute = 500

    with util.ContextTimer() as t:
        # copula transform to both X and Y
        cop_map = fea.MarginalCDFMap()
        xtr, ytr = tr.xy()
        xte, yte = te.xy()

        xtr = cop_map.gen_features(xtr)
        ytr = cop_map.gen_features(ytr)
        xte = cop_map.gen_features(xte)
        yte = cop_map.gen_features(yte)

        tr = data.PairedData(xtr, ytr)
        te = data.PairedData(xte, yte)

        to_return = job_nfsicJ10_stoopt(paired_source, tr, te, r, n_permute)
    to_return['time_secs'] = t.secs
    return to_return
Example #16
def job_nfsicJ3_perm_stoopt(paired_source, tr, te, r):
    """
    Use permutations to simulate from the null distribution.
    """
    n_permute = 500
    J = 3
    with util.ContextTimer() as t:
        nfsic_opt_options = {
            'n_test_locs': J,
            'max_iter': 300,
            'V_step': 1,
            'W_step': 1,
            'gwidthx_step': 1,
            'gwidthy_step': 1,
            'batch_proportion': 0.7,
            'tol_fun': 1e-4,
            'step_pow': 0.5,
            'seed': r + 2,
            'reg': 1e-6
        }
        op_V, op_W, op_gwx, op_gwy, info = it.GaussNFSIC.optimize_locs_widths(
            tr, alpha, **nfsic_opt_options)
        nfsic_opt = it.GaussNFSIC(op_gwx,
                                  op_gwy,
                                  op_V,
                                  op_W,
                                  alpha,
                                  reg='auto',
                                  n_permute=n_permute,
                                  seed=r + 3)
        nfsic_opt_result = nfsic_opt.perform_test(te)
    return {
        'indtest': nfsic_opt,
        'test_result': nfsic_opt_result,
        'time_secs': t.secs
    }