def job_mmd_med(p, data_source, tr, te, r): """ MMD test of Gretton et al., 2012 used as a goodness-of-fit test. Require the ability to sample from p i.e., the UnnormalizedDensity p has to be able to return a non-None from get_datasource() """ # full data data = tr + te X = data.data() with util.ContextTimer() as t: # median heuristic pds = p.get_datasource() datY = pds.sample(data.sample_size(), seed=r + 294) Y = datY.data() XY = np.vstack((X, Y)) # If p, q differ very little, the median may be very small, rejecting H0 # when it should not? medx = util.meddistance(X, subsample=1000) medy = util.meddistance(Y, subsample=1000) medxy = util.meddistance(XY, subsample=1000) med_avg = (medx + medy + medxy) / 3.0 k = kernel.KGauss(med_avg**2) mmd_test = mgof.QuadMMDGof(p, k, n_permute=400, alpha=alpha, seed=r) mmd_result = mmd_test.perform_test(data) return {'test_result': mmd_result, 'time_secs': t.secs}
def job_mmd_opt(p, data_source, tr, te, r): """ MMD test of Gretton et al., 2012 used as a goodness-of-fit test. Require the ability to sample from p i.e., the UnnormalizedDensity p has to be able to return a non-None from get_datasource() With optimization. Gaussian kernel. """ data = tr + te X = data.data() with util.ContextTimer() as t: # median heuristic pds = p.get_datasource() datY = pds.sample(data.sample_size(), seed=r+294) Y = datY.data() XY = np.vstack((X, Y)) med = util.meddistance(XY, subsample=1000) # Construct a list of kernels to try based on multiples of the median # heuristic #list_gwidth = np.hstack( (np.linspace(20, 40, 10), (med**2) # *(2.0**np.linspace(-2, 2, 20) ) ) ) list_gwidth = (med**2)*(2.0**np.linspace(-4, 4, 30) ) list_gwidth.sort() candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth] mmd_opt = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r) mmd_result = mmd_opt.perform_test(data, candidate_kernels=candidate_kernels, tr_proportion=tr_proportion, reg=1e-3) return { 'test_result': mmd_result, 'time_secs': t.secs}
def job_mmd_dgauss_opt(p, data_source, tr, te, r): """ MMD test of Gretton et al., 2012 used as a goodness-of-fit test. Require the ability to sample from p i.e., the UnnormalizedDensity p has to be able to return a non-None from get_datasource() With optimization. Diagonal Gaussian kernel where there is one Gaussian width for each dimension. """ data = tr + te X = data.data() d = X.shape[1] with util.ContextTimer() as t: # median heuristic pds = p.get_datasource() datY = pds.sample(data.sample_size(), seed=r+294) Y = datY.data() XY = np.vstack((X, Y)) # Get the median heuristic for each dimension meds = np.zeros(d) for i in range(d): medi = util.meddistance(XY[:, [i]], subsample=1000) meds[i] = medi # Construct a list of kernels to try based on multiples of the median # heuristic med_factors = 2.0**np.linspace(-4, 4, 20) candidate_kernels = [] for i in range(len(med_factors)): ki = kernel.KDiagGauss( (meds**2)*med_factors[i] ) candidate_kernels.append(ki) mmd_opt = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=r+56) mmd_result = mmd_opt.perform_test(data, candidate_kernels=candidate_kernels, tr_proportion=tr_proportion, reg=1e-3) return { 'test_result': mmd_result, 'time_secs': t.secs}