def job_fhsic_med(paired_source, tr, te, r):
    """
    Finite-feature HSIC test built on random Fourier features.

    The null distribution is simulated with the spectrums of the empirical
    cross covariance operators.
    - Gaussian kernels.
    - No parameter selection. The median heuristic gives the width for
      both X and Y.
    - The full sample (tr + te) is used for testing.

    paired_source is not used in the body. r offsets every random seed.
    alpha is not a parameter; it is read from the enclosing module.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    # number of random features for each of X and Y
    n_feat = 10
    # forwarded to FiniteFeatureHSIC as n_simulate
    n_null_draws = 2000
    # nothing is tuned on tr, so training and test sets are merged
    full_data = tr + te

    with util.ContextTimer() as timer:
        X, Y = full_data.xy()
        # squared median distances are used as the Gaussian widths
        width2_x = util.meddistance(X, subsample=1000)**2
        width2_y = util.meddistance(Y, subsample=1000)**2

        feat_x = fea.RFFKGauss(width2_x, n_features=n_feat, seed=r + 1)
        feat_y = fea.RFFKGauss(width2_y, n_features=n_feat, seed=r + 2)
        hsic_test = it.FiniteFeatureHSIC(
            feat_x, feat_y, n_simulate=n_null_draws, alpha=alpha,
            seed=r + 89)
        outcome = hsic_test.perform_test(full_data)

    # timer.secs is read after the with-block has finished
    return {
        'indtest': hsic_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_rdcperm_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test with permutations.

    - Gaussian widths come from the median heuristic applied to the
      copula-transformed (marginal CDF) X and Y.
    - n_features random features for each of X and Y. The default of 10
      follows the Lopez-Paz et al., 2013 paper.
    - The full sample (tr + te) is used for testing.

    paired_source is not used in the body. r offsets every random seed.
    alpha is read from the enclosing module.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    # forwarded to RDCPerm as n_permute
    n_perm = 500
    full_data = tr + te

    with util.ContextTimer() as timer:
        X, Y = full_data.xy()
        # copula transform of both X and Y; widths are computed on the result
        cdf_map = fea.MarginalCDFMap()
        X_cdf = cdf_map.gen_features(X)
        Y_cdf = cdf_map.gen_features(Y)
        # squared median distances of the transformed samples
        width2_x = util.meddistance(X_cdf, subsample=1000)**2
        width2_y = util.meddistance(Y_cdf, subsample=1000)**2

        feat_x = fea.RFFKGauss(width2_x, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width2_y, n_features=n_features, seed=r + 220)
        perm_test = it.RDCPerm(
            feat_x, feat_y, n_permute=n_perm, alpha=alpha, seed=r + 100)
        outcome = perm_test.perform_test(full_data)

    # timer.secs is read after the with-block has finished
    return {
        'indtest': perm_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_rdcperm_nc_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test with permutations, with no
    copula transformation.

    - Gaussian widths come from the median heuristic on the raw data.
    - RDCPerm is constructed with use_copula=False.
    - n_features random features for each of X and Y. The default of 10
      follows the Lopez-Paz et al., 2013 paper.
    - The full sample (tr + te) is used for testing.

    paired_source is not used in the body. r offsets every random seed.
    alpha is read from the enclosing module.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    # forwarded to RDCPerm as n_permute
    n_perm = 500
    full_data = tr + te

    with util.ContextTimer() as timer:
        X, Y = full_data.xy()
        # squared median distances of the untransformed samples
        width2_x = util.meddistance(X, subsample=1000)**2
        width2_y = util.meddistance(Y, subsample=1000)**2

        feat_x = fea.RFFKGauss(width2_x, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width2_y, n_features=n_features, seed=r + 220)
        perm_test = it.RDCPerm(
            feat_x, feat_y, n_permute=n_perm, alpha=alpha, seed=r + 100,
            use_copula=False)
        outcome = perm_test.perform_test(full_data)

    # timer.secs is read after the with-block has finished
    return {
        'indtest': perm_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_rdc_med(paired_source, tr, te, r, n_features=10):
    """
    The Randomized Dependence Coefficient test.

    - Gaussian width = median heuristic on the copula-transformed data.
    - n_features random features for each of X and Y. The default of 10
      follows the Lopez-Paz et al., 2013 paper.
    - The full dataset (tr + te) is used for testing.

    paired_source is not used in the body. r offsets the seeds of the two
    feature maps. alpha is read from the enclosing module.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    full_data = tr + te

    with util.ContextTimer() as timer:
        X, Y = full_data.xy()
        # copula transform of both X and Y; widths are computed on the result
        cdf_map = fea.MarginalCDFMap()
        X_cdf = cdf_map.gen_features(X)
        Y_cdf = cdf_map.gen_features(Y)
        # squared median distances of the transformed samples
        width2_x = util.meddistance(X_cdf, subsample=1000)**2
        width2_y = util.meddistance(Y_cdf, subsample=1000)**2

        feat_x = fea.RFFKGauss(width2_x, n_features=n_features, seed=r + 19)
        feat_y = fea.RFFKGauss(width2_y, n_features=n_features, seed=r + 220)
        rdc_test = it.RDC(feat_x, feat_y, alpha=alpha)
        outcome = rdc_test.perform_test(full_data)

    # timer.secs is read after the with-block has finished
    return {
        'indtest': rdc_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def job_nfsic_grid(paired_source, tr, te, r):
    """
    NFSIC where the test locations are randomized, and the Gaussian widths
    are optimized by a grid search.

    The locations and the widths are chosen on tr; the test itself is
    performed on te.

    paired_source is not used in the body. r offsets the seed used to draw
    the test locations. J and alpha are read from the enclosing module.

    Return a dictionary with keys 'indtest', 'test_result', 'time_secs'.
    """
    with util.ContextTimer() as timer:
        # randomize the test locations by fitting Gaussians to the data
        V, W = it.GaussNFSIC.init_locs_2randn(tr, J, seed=r + 2)
        x_train, y_train = tr.xy()

        # candidate scalings 2^-4, ..., 2^4, shared by X and Y
        n_cand = 30
        scale_factors = 2.0**np.linspace(-4, 4, n_cand)

        med_x = util.meddistance(x_train, 1000)
        med_y = util.meddistance(y_train, 1000)
        # candidate squared widths: squared median distance times each factor
        cand_width2_x = np.hstack(((med_x**2) * scale_factors))
        cand_width2_y = np.hstack(((med_y**2) * scale_factors))

        # the second value returned by the grid search is not needed here
        best_ij, _ = it.GaussNFSIC.grid_search_gwidth(
            tr, V, W, cand_width2_x, cand_width2_y)
        # These are width^2
        chosen_width2_x = cand_width2_x[best_ij[0]]
        chosen_width2_y = cand_width2_y[best_ij[1]]

        # perform the test on the held-out set
        grid_test = it.GaussNFSIC(
            chosen_width2_x, chosen_width2_y, V, W, alpha)
        outcome = grid_test.perform_test(te)

    # timer.secs is read after the with-block has finished
    return {
        'indtest': grid_test,
        'test_result': outcome,
        'time_secs': timer.secs,
    }
def kl_kgauss_median(pdata):
    """
    Get two Gaussian kernels constructed with the median heuristic.

    pdata must provide xy(), which returns the X sample and the Y sample.
    For each sample, util.meddistance (with subsample=1000) is squared and
    passed to kernel.KGauss.

    Return (k, l): the kernel built from X and the kernel built from Y.
    """
    xtr, ytr = pdata.xy()
    # squared median distances are the arguments given to KGauss
    medx2 = util.meddistance(xtr, subsample=1000)**2
    medy2 = util.meddistance(ytr, subsample=1000)**2
    k = kernel.KGauss(medx2)
    l = kernel.KGauss(medy2)
    return k, l
def kl_median(pdata):
    """
    Get two Gaussian kernels constructed with the median heuristic.

    pdata must provide xy(), which returns the X sample and the Y sample.
    For each sample, util.meddistance (called with its default arguments)
    is squared and passed to kernel.KGauss. No test locations are
    generated here.

    Return (k, l): the kernel built from X and the kernel built from Y.
    """
    xtr, ytr = pdata.xy()
    # squared median distances are the arguments given to KGauss
    medx2 = util.meddistance(xtr)**2
    medy2 = util.meddistance(ytr)**2
    k = kernel.KGauss(medx2)
    l = kernel.KGauss(medy2)
    return k, l
def setUp(self):
    """
    Prepare the fixture: a GaussNFSIC test (alpha=0.01) and the paired
    data from get_pdata_mean it is configured on.

    Sets self.gnfsic and self.pdata_mean.
    """
    sample_size = 300
    dim_x = 2
    n_locs = 2

    pdata_mean = get_pdata_mean(sample_size, dim_x)
    X, Y = pdata_mean.xy()
    # squared median distances are given to GaussNFSIC as the two widths
    width2_x = util.meddistance(X)**2
    width2_y = util.meddistance(Y)**2

    # V, W (presumably the test locations) are standard normal draws.
    # W has a single column, which assumes Y is one-dimensional -- TODO confirm
    V = np.random.randn(n_locs, dim_x)
    W = np.random.randn(n_locs, 1)

    self.gnfsic = it.GaussNFSIC(width2_x, width2_y, V, W, alpha=0.01)
    self.pdata_mean = pdata_mean
def kl_median(pdata):
    """
    Get two Gaussian kernels constructed with the median heuristic.

    pdata must provide xy(), which returns the X sample and the Y sample.
    For each sample, util.meddistance (called with its default arguments)
    is squared and passed to kernel.KGauss. No test locations are
    generated here.

    Return (k, l): the kernel built from X and the kernel built from Y.
    """
    xtr, ytr = pdata.xy()
    # squared median distances are the arguments given to KGauss
    medx2 = util.meddistance(xtr)**2
    medy2 = util.meddistance(ytr)**2
    k = kernel.KGauss(medx2)
    l = kernel.KGauss(medy2)
    return k, l
def setUp(self):
    """
    Prepare the fixture: a GaussNFSIC test (alpha=0.01) and the paired
    data from get_pdata_mean it is configured on.

    Sets self.gnfsic and self.pdata_mean.
    """
    sample_size = 300
    dim_x = 2
    n_locs = 2

    pdata_mean = get_pdata_mean(sample_size, dim_x)
    X, Y = pdata_mean.xy()
    # squared median distances are given to GaussNFSIC as the two widths
    width2_x = util.meddistance(X)**2
    width2_y = util.meddistance(Y)**2

    # V, W (presumably the test locations) are standard normal draws.
    # W has a single column, which assumes Y is one-dimensional -- TODO confirm
    V = np.random.randn(n_locs, dim_x)
    W = np.random.randn(n_locs, 1)

    self.gnfsic = it.GaussNFSIC(width2_x, width2_y, V, W, alpha=0.01)
    self.pdata_mean = pdata_mean
def setUp(self):
    """
    Prepare the fixture: an NFSIC test (alpha=0.01) with two Gaussian
    kernels, and the paired data from get_pdata_mean used to set the
    kernel widths.

    Sets self.nfsic and self.pdata_mean.
    """
    sample_size = 300
    dim_x = 2
    n_locs = 2

    pdata_mean = get_pdata_mean(sample_size, dim_x)
    X, Y = pdata_mean.xy()
    # squared median distances are the arguments given to KGauss
    width2_x = util.meddistance(X)**2
    width2_y = util.meddistance(Y)**2
    kern_x = kernel.KGauss(width2_x)
    kern_y = kernel.KGauss(width2_y)

    # V, W (presumably the test locations) are standard normal draws.
    # W has a single column, which assumes Y is one-dimensional -- TODO confirm
    V = np.random.randn(n_locs, dim_x)
    W = np.random.randn(n_locs, 1)

    self.nfsic = it.NFSIC(kern_x, kern_y, V, W, alpha=0.01)
    self.pdata_mean = pdata_mean
def setUp(self):
    """
    Prepare the fixture: an NFSIC test (alpha=0.01) with two Gaussian
    kernels, and the paired data from get_pdata_mean used to set the
    kernel widths.

    Sets self.nfsic and self.pdata_mean.
    """
    sample_size = 300
    dim_x = 2
    n_locs = 2

    pdata_mean = get_pdata_mean(sample_size, dim_x)
    X, Y = pdata_mean.xy()
    # squared median distances are the arguments given to KGauss
    width2_x = util.meddistance(X)**2
    width2_y = util.meddistance(Y)**2
    kern_x = kernel.KGauss(width2_x)
    kern_y = kernel.KGauss(width2_y)

    # V, W (presumably the test locations) are standard normal draws.
    # W has a single column, which assumes Y is one-dimensional -- TODO confirm
    V = np.random.randn(n_locs, dim_x)
    W = np.random.randn(n_locs, 1)

    self.nfsic = it.NFSIC(kern_x, kern_y, V, W, alpha=0.01)
    self.pdata_mean = pdata_mean
def test_nfsic(self):
    """
    it.nfsic on random data should give a statistic whose imaginary part
    is (almost) zero and whose value is greater than zero.
    """
    sample_size = 50
    dim_x, dim_y = 3, 1
    n_locs = 3

    X = np.random.randn(sample_size, dim_x)
    # Y is a standard normal sample shifted by 1
    Y = np.random.randn(sample_size, dim_y) + 1

    # Gaussian kernels parameterized by the squared median distances
    width2_x = util.meddistance(X)**2
    width2_y = util.meddistance(Y)**2
    kern_x = kernel.KGauss(width2_x)
    kern_y = kernel.KGauss(width2_y)

    V = np.random.randn(n_locs, dim_x)
    W = np.random.randn(n_locs, dim_y)

    # only the first returned value (the statistic) is checked
    stat, _mean, _cov = it.nfsic(X, Y, kern_x, kern_y, V, W, reg=0)

    self.assertAlmostEqual(np.imag(stat), 0)
    self.assertGreater(stat, 0)
def test_nfsic(self):
    """
    it.nfsic on random data should give a statistic whose imaginary part
    is (almost) zero and whose value is greater than zero.
    """
    sample_size = 50
    dim_x, dim_y = 3, 1
    n_locs = 3

    X = np.random.randn(sample_size, dim_x)
    # Y is a standard normal sample shifted by 1
    Y = np.random.randn(sample_size, dim_y) + 1

    # Gaussian kernels parameterized by the squared median distances
    width2_x = util.meddistance(X)**2
    width2_y = util.meddistance(Y)**2
    kern_x = kernel.KGauss(width2_x)
    kern_y = kernel.KGauss(width2_y)

    V = np.random.randn(n_locs, dim_x)
    W = np.random.randn(n_locs, dim_y)

    # only the first returned value (the statistic) is checked
    stat, _mean, _cov = it.nfsic(X, Y, kern_x, kern_y, V, W, reg=0)

    self.assertAlmostEqual(np.imag(stat), 0)
    self.assertGreater(stat, 0)