def sample(self, n, seed=3):
    with util.NumpySeedContext(seed=seed):
        X = np.log(self.nonhom_linear(size=n))
        if len(X.shape) == 1:
            # This can happen if d=1
            X = X[:, np.newaxis]
        return Data(X)
def gaussbern_rbm_probs(stds_perturb_B, dx=50, dh=10, n=sample_size):
    """
    Get a sequence of Gaussian-Bernoulli RBM problems. We follow the
    parameter settings as described in section 6 of Liu et al., 2016.

    - stds_perturb_B: a list of Gaussian noise standard deviations for
      perturbing B.
    - dx: observed dimension
    - dh: latent dimension
    """
    probs = []
    for i, std in enumerate(stds_perturb_B):
        with util.NumpySeedContext(seed=i + 1000):
            B = np.random.randint(0, 2, (dx, dh)) * 2 - 1.0
            b = np.random.randn(dx)
            c = np.random.randn(dh)
            p = density.GaussBernRBM(B, b, c)

            if std <= 1e-8:
                B_perturb = B
            else:
                B_perturb = B + np.random.randn(dx, dh) * std
            gb_rbm = data.DSGaussBernRBM(B_perturb, b, c, burnin=2000)

            probs.append((std, p, gb_rbm))
    return probs
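# Usage sketch for gaussbern_rbm_probs (the std values below are illustrative
# only; assumes the same module-level imports used above: np, util, density, data).
# Each returned tuple pairs a perturbation std with the RBM density p and a
# data source that samples from the perturbed RBM.
example_stds = [0.0, 0.02, 0.04]
rbm_probs = gaussbern_rbm_probs(example_stds, dx=50, dh=10, n=1000)
for std, p, ds in rbm_probs:
    dat = ds.sample(200, seed=5)       # 200 draws from the perturbed RBM
    print(std, dat.data().shape)       # expect (200, 50)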
def sample(self, n, seed=29):
    pmix = self.pmix
    means = self.means
    variances = self.variances
    k, d = self.means.shape
    sam_list = []
    with util.NumpySeedContext(seed=seed):
        # counts for each mixture component
        counts = np.random.multinomial(n, pmix, size=1)
        # counts is a 2d array
        counts = counts[0]
        # For each component, draw from its corresponding mixture component.
        for i, nc in enumerate(counts):
            # construct the component
            # https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.multivariate_normal.html
            cov = variances[i]
            mnorm = stats.multivariate_normal(means[i], cov)
            # Sample from the ith component
            sam_i = mnorm.rvs(size=nc)
            sam_list.append(sam_i)
        sample = np.vstack(sam_list)
        assert sample.shape[0] == n
        np.random.shuffle(sample)
    return Data(sample)
def sample(self, n, seed=2):
    with util.NumpySeedContext(seed=seed):
        d = len(self.mean)
        mean = self.mean
        variance = self.variance
        X = np.random.randn(n, d) * np.sqrt(variance) + mean
        return Data(X)
def sample(self, n, seed=872):
    """
    Rejection sampling.
    """
    d = len(self.freqs)
    sigma2 = self.sigma2
    freqs = self.freqs
    with util.NumpySeedContext(seed=seed):
        # rejection sampling
        sam = np.zeros((n, d))
        # sample block_size*d at a time.
        block_size = 500
        from_ind = 0
        while from_ind < n:
            # The proposal q is N(0, sigma2*I)
            X = np.random.randn(block_size, d) * np.sqrt(sigma2)
            q_un = np.exp(-np.sum(X**2, 1) / (2.0 * sigma2))
            # unnormalized density p
            p_un = q_un * (1 + np.prod(np.cos(X * freqs), 1))
            # The cosine product lies in [-1, 1], so p_un <= 2*q_un and
            # c = 2 is a valid envelope constant for this proposal.
            c = 2.0
            I = stats.uniform.rvs(size=block_size) < p_un / (c * q_un)

            # accept
            accepted_count = np.sum(I)
            to_take = min(n - from_ind, accepted_count)
            end_ind = from_ind + to_take

            AX = X[I, :]
            X_take = AX[:to_take, :]
            sam[from_ind:end_ind, :] = X_take
            from_ind = end_ind
    return Data(sam)
def sample(self, n, seed=3):
    with util.NumpySeedContext(seed=seed):
        X = np.log(1 / self.inh2d(lamb_bar=n) - 1)
        if len(X.shape) == 1:
            # This can happen if d=1
            X = X[:, np.newaxis]
        return Data(X)
def sample(self, n, seed=3):
    with util.NumpySeedContext(seed=seed):
        X = stats.gamma.rvs(self.alpha, size=n, scale=1.0 / self.beta)
        if len(X.shape) == 1:
            # This can happen if d=1
            X = X[:, np.newaxis]
        return Data(X)
def simulate_null_dist(eigs, J, n_simulate=2000, seed=7):
    """
    Simulate the null distribution using the spectrum of the covariance
    matrix of the U-statistic. The simulated statistic is n*FSSD^2 where
    FSSD is an unbiased estimator.

    - eigs: a numpy array of estimated eigenvalues of the covariance matrix.
      eigs is of length d*J, where d is the input dimension and J is the
      number of test locations.

    Return a numpy array of simulated statistics.
    """
    # integer division so that d*J can index the random draws below
    d = len(eigs) // J
    assert d > 0
    # draw at most d x J x block_size values at a time
    block_size = max(20, int(1000.0 / (d * J)))
    fssds = np.zeros(n_simulate)
    from_ind = 0
    with util.NumpySeedContext(seed=seed):
        while from_ind < n_simulate:
            to_draw = min(block_size, n_simulate - from_ind)
            # draw chi^2 random variables.
            chi2 = np.random.randn(d * J, to_draw)**2
            # an array of length to_draw
            sim_fssds = eigs.dot(chi2 - 1.0)
            # store
            end_ind = from_ind + to_draw
            fssds[from_ind:end_ind] = sim_fssds
            from_ind = end_ind
    return fssds
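# Usage sketch for simulate_null_dist. The eigenvalues and the observed
# statistic below are made-up placeholders; in the FSSD test they would come
# from the estimated covariance of the U-statistic and from the data.
fake_eigs = np.abs(np.random.randn(3 * 5))    # pretend d=3 dimensions, J=5 locations
sim_stats = simulate_null_dist(fake_eigs, J=5, n_simulate=2000, seed=7)
observed = 4.2                                # hypothetical observed n*FSSD^2
pvalue = np.mean(sim_stats > observed)        # right-tail p-value under the simulated null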
def sample(self, n, seed=3):
    with util.NumpySeedContext(seed=seed):
        X_gmm, llh = self.gmm_sample(N=n)
        X = X_gmm
        if len(X.shape) == 1:
            # This can happen if d=1
            X = X[:, np.newaxis]
        return Data(X)
def sample(self, n, seed=3):
    with util.NumpySeedContext(seed=seed):
        mvn = stats.multivariate_normal(self.mean, self.cov)
        X = mvn.rvs(size=n)
        if len(X.shape) == 1:
            # This can happen if d=1
            X = X[:, np.newaxis]
        return Data(X)
def test_grad_log(self):
    n = 8
    with util.NumpySeedContext(seed=17):
        for d in [4, 1]:
            variance = 1.2
            mean = np.random.randn(d) + 1
            X = np.random.rand(n, d) - 2

            isonorm = density.IsotropicNormal(mean, variance)
            grad_log = isonorm.grad_log(X)
            my_grad_log = -(X - mean) / variance

            # check correctness
            np.testing.assert_almost_equal(grad_log, my_grad_log)
def test_log_den(self):
    n = 7
    with util.NumpySeedContext(seed=16):
        for d in [3, 1]:
            variance = 1.1
            mean = np.random.randn(d)
            X = np.random.rand(n, d) + 1

            isonorm = density.IsotropicNormal(mean, variance)
            log_dens = isonorm.log_den(X)
            my_log_dens = -np.sum((X - mean)**2, 1) / (2.0 * variance)

            # check correctness
            np.testing.assert_almost_equal(log_dens, my_log_dens)
def test_multivariate_normal_density(self):
    for i in range(4):
        with util.NumpySeedContext(seed=i + 8):
            d = i + 2
            cov = stats.wishart(df=10 + d, scale=np.eye(d)).rvs(size=1)
            mean = np.random.randn(d)
            X = np.random.randn(11, d)

            den_estimate = density.GaussianMixture.multivariate_normal_density(
                mean, cov, X)
            mnorm = stats.multivariate_normal(mean=mean, cov=cov)
            den_truth = mnorm.pdf(X)

            np.testing.assert_almost_equal(den_estimate, den_truth)
def test_basic(self):
    """
    Nothing special. Just test basic things.
    """
    # sample
    n = 10
    d = 3
    with util.NumpySeedContext(seed=29):
        X = np.random.randn(n, d) * 3
        k = kernel.KGauss(sigma2=1)
        K = k.eval(X, X)

        self.assertEqual(K.shape, (n, n))
        self.assertTrue(np.all(K >= 0 - 1e-6))
        self.assertTrue(np.all(K <= 1 + 1e-6), 'K not bounded by 1')
def test_pair_gradX_Y(self):
    # sample
    n = 11
    d = 3
    with util.NumpySeedContext(seed=20):
        X = np.random.randn(n, d) * 4
        Y = np.random.randn(n, d) * 2
        k = kernel.KGauss(sigma2=2.1)

        # n x d
        pair_grad = k.pair_gradX_Y(X, Y)
        loop_grad = np.zeros((n, d))
        for i in range(n):
            for j in range(d):
                loop_grad[i, j] = k.gradX_Y(X[[i], :], Y[[i], :], j)

        testing.assert_almost_equal(pair_grad, loop_grad)
def test_gradX_y(self):
    n = 10
    with util.NumpySeedContext(seed=10):
        for d in [1, 3]:
            y = np.random.randn(d) * 2
            X = np.random.rand(n, d) * 3
            sigma2 = 1.3
            k = kernel.KGauss(sigma2=sigma2)

            # n x d
            G = k.gradX_y(X, y)
            # check correctness
            K = k.eval(X, y[np.newaxis, :])
            myG = -K / sigma2 * (X - y)

            self.assertEqual(G.shape, myG.shape)
            testing.assert_almost_equal(G, myG)
def med_heuristic(models, ref, subsample=1000, seed=100):
    # subsample first
    n = ref.shape[0]
    assert subsample > 0
    sub_models = []
    with util.NumpySeedContext(seed=seed):
        ind = np.random.choice(n, min(subsample, n), replace=False)
        for i in range(len(models)):
            sub_models.append(models[i][ind, :])
        sub_ref = ref[ind, :]

    # median distance between each model sample and the reference sample
    med_mz = np.zeros(len(sub_models))
    for i, model in enumerate(sub_models):
        sq_pdist_mz = util.dist_matrix(model, sub_ref)**2
        med_mz[i] = np.median(sq_pdist_mz)**0.5
    sigma2 = 0.5 * np.mean(med_mz)**2
    return sigma2
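# Usage sketch for med_heuristic (arrays are synthetic placeholders; assumes
# util and kernel as imported above). The returned sigma2 is half the squared
# mean of the per-model median model-to-reference distances and can be used as
# a Gaussian kernel bandwidth.
ref_sample = np.random.randn(2000, 5)
model_samples = [np.random.randn(2000, 5) + 0.5,
                 np.random.randn(2000, 5) * 1.2]
sig2 = med_heuristic(model_samples, ref_sample, subsample=1000, seed=100)
k = kernel.KGauss(sigma2=sig2)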
def perform_test(self, dat, return_simulated_stats=False, return_ustat_gram=False):
    """
    dat: an instance of Data
    """
    with util.ContextTimer() as t:
        alpha = self.alpha
        n_simulate = self.n_simulate
        X = dat.data()
        n = X.shape[0]

        _, H = self.compute_stat(dat, return_ustat_gram=True)
        test_stat = n * np.mean(H)
        # bootstrapping
        sim_stats = np.zeros(n_simulate)
        with util.NumpySeedContext(seed=self.seed):
            for i in range(n_simulate):
                W = self.bootstrapper(n)
                # n * [ (1/n^2) * \sum_i \sum_j h(x_i, x_j) w_i w_j ]
                boot_stat = W.dot(H.dot(old_div(W, float(n))))
                # This is a bootstrap version of n*V_n
                sim_stats[i] = boot_stat

        # approximate p-value with the permutations
        pvalue = np.mean(sim_stats > test_stat)

    results = {
        'alpha': self.alpha,
        'pvalue': pvalue,
        'test_stat': test_stat,
        'h0_rejected': pvalue < alpha,
        'n_simulate': n_simulate,
        'time_secs': t.secs,
        'H_mu': H.mean(),
        'H_sigma': H.std(),
    }
    if return_simulated_stats:
        results['sim_stats'] = sim_stats
    if return_ustat_gram:
        results['H'] = H
    return results
def training_model(model, data, SAVE_DIR):
    held_out, train_set = data[:HELD_OUT], data[HELD_OUT:]
    x = tf.placeholder(tf.float32, [None, DIM], name="subsample")
    loss = model.loss(x)
    ploss = model.ploss(x, BETA)
    opt = tf.train.AdamOptimizer(LEARNING_RATE).minimize(ploss)
    saver = tf.train.Saver(tf.trainable_variables(model.name))

    minLoss = 1e10
    noChange = 0
    with tf.Session() as sess:
        print(" Training ")
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        tf.get_default_graph().finalize()
        for i in range(N_EPOCHS):
            with util.NumpySeedContext(i):
                train_set = np.random.permutation(train_set)
            for j in range(int(TRAIN_SIZE / BATCH_SIZE)):
                subsample = train_set[j * BATCH_SIZE:(j + 1) * BATCH_SIZE, :]
                _, = sess.run([opt], feed_dict={"subsample:0": subsample})

            ## Early stopping
            val, pval = sess.run([loss, ploss],
                                 feed_dict={"subsample:0": held_out})
            print(val, pval)
            minLoss = np.min([minLoss, val])
            if np.allclose(minLoss, val):
                print("{0} loss at epoch: {1}".format(minLoss, i))
                noChange = noChange + 1
                if not os.path.isdir(SAVE_DIR + "/{0}".format(i)):
                    os.mkdir(SAVE_DIR + "/{0}".format(i))
                saver.save(sess,
                           SAVE_DIR + "/{0}/model".format(i),
                           write_meta_graph=False)
            else:
                noChange = 0
            if noChange > GIVE_UP:
                break

        print(" END and SAVE {0}".format(i))
        if not os.path.isdir(SAVE_DIR + "/end"):
            os.mkdir(SAVE_DIR + "/end")
        saver.save(sess, SAVE_DIR + "/end/model", write_meta_graph=False)
def test_gradXY_sum(self):
    n = 11
    with util.NumpySeedContext(seed=12):
        for d in [3, 1]:
            X = np.random.randn(n, d)
            sigma2 = 1.4
            k = kernel.KGauss(sigma2=sigma2)

            # n x n
            myG = np.zeros((n, n))
            K = k.eval(X, X)
            for i in range(n):
                for j in range(n):
                    diffi2 = np.sum((X[i, :] - X[j, :])**2)
                    #myG[i, j] = -diffi2*K[i, j]/(sigma2**2) + d*K[i, j]/sigma2
                    myG[i, j] = K[i, j] / sigma2 * (d - diffi2 / sigma2)

            # check correctness
            G = k.gradXY_sum(X, X)

            self.assertEqual(G.shape, myG.shape)
            testing.assert_almost_equal(G, myG)
def sample(self, n, seed=29):
    pmix = self.pmix
    means = self.means
    variances = self.variances
    k, d = self.means.shape
    sam_list = []
    with util.NumpySeedContext(seed=seed):
        # counts for each mixture component
        counts = np.random.multinomial(n, pmix, size=1)
        # counts is a 2d array
        counts = counts[0]
        # For each component, draw from its corresponding mixture component.
        for i, nc in enumerate(counts):
            # Sample from the ith component
            sam_i = np.random.randn(nc, d) * np.sqrt(variances[i]) + means[i]
            sam_list.append(sam_i)
        sample = np.vstack(sam_list)
        assert sample.shape[0] == n
        np.random.shuffle(sample)
    return Data(sample)
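# Usage sketch for the Gaussian-mixture sampler above. It is written as a
# method, so the wrapper class and its constructor below are assumptions
# (hypothetical names), not a confirmed API; the parameter values are illustrative.
means = np.array([[0.0, 0.0], [3.0, 3.0]])    # k=2 components in d=2
variances = np.array([1.0, 0.5])              # one isotropic variance per component
pmix = [0.3, 0.7]                             # mixing proportions
# ds = data.DSIsoGaussianMixture(means, variances, pmix)   # hypothetical constructor
# dat = ds.sample(1000, seed=29)                           # shuffled (1000, 2) sample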
def gaussbern_rbm_tuple(var, dx=50, dh=10, n=sample_size):
    """
    Get a tuple of Gaussian-Bernoulli RBM problems. We follow the
    parameter settings as described in section 6 of Liu et al., 2016.

    - var: Gaussian noise variance for perturbing B.
    - dx: observed dimension
    - dh: latent dimension

    Return p (a density) and a DataSource for the perturbed RBM.
    """
    with util.NumpySeedContext(seed=1000):
        B = np.random.randint(0, 2, (dx, dh)) * 2 - 1.0
        b = np.random.randn(dx)
        c = np.random.randn(dh)
        p = density.GaussBernRBM(B, b, c)

        B_perturb = B + np.random.randn(dx, dh) * np.sqrt(var)
        gb_rbm = data.DSGaussBernRBM(B_perturb, b, c, burnin=50)
    return p, gb_rbm
def gbrbm_perturb(var_perturb_B, dx=50, dh=10):
    """
    Get a Gaussian-Bernoulli RBM problem where the first entry of the B
    matrix (the matrix linking the latent and the observation) is perturbed.

    - var_perturb_B: Gaussian noise variance for perturbing B.
    - dx: observed dimension
    - dh: latent dimension

    Return p (density), data source
    """
    with util.NumpySeedContext(seed=10):
        B = np.random.randint(0, 2, (dx, dh)) * 2 - 1.0
        b = np.random.randn(dx)
        c = np.random.randn(dh)
        p = density.GaussBernRBM(B, b, c)

        B_perturb = np.copy(B)
        if var_perturb_B > 1e-7:
            B_perturb[0, 0] = B_perturb[0, 0] + \
                np.random.randn(1) * np.sqrt(var_perturb_B)
        ds = data.DSGaussBernRBM(B_perturb, b, c, burnin=2000)

    return p, ds
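# Usage sketch: build one perturbed-RBM problem and draw data from it. The
# var_perturb_B value is illustrative; grad_log is assumed to be available on
# the RBM density, as it is on density.IsotropicNormal above.
p, ds = gbrbm_perturb(var_perturb_B=0.1, dx=50, dh=10)
dat = ds.sample(500, seed=8)      # blocked Gibbs sampling, 2000 burn-in steps
X = dat.data()                    # (500, 50) observed sample
score = p.grad_log(X)             # score function of the unperturbed model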
def sample(self, n, seed=3, return_latent=False):
    """
    Sample by blocked Gibbs sampling
    """
    B = self.B
    b = self.b
    c = self.c
    dh = len(c)
    dx = len(b)

    # Initialize the state of the Markov chain
    with util.NumpySeedContext(seed=seed):
        X = np.random.randn(n, dx)
        # randint(1, 2, ...) always draws 1, so H starts at all +1;
        # the burn-in sweeps below mix the chain away from this state.
        H = np.random.randint(1, 2, (n, dh)) * 2 - 1.0

        # burn-in
        for t in range(self.burnin):
            X, H = self._blocked_gibbs_next(X, H)
        # sampling
        X, H = self._blocked_gibbs_next(X, H)
    if return_latent:
        return Data(X), H
    else:
        return Data(X)
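# Usage sketch for the Gibbs sampler above, given a DSGaussBernRBM-style data
# source `ds` such as the one returned by gbrbm_perturb; values are illustrative.
dat, H = ds.sample(300, seed=11, return_latent=True)
X = dat.data()    # observed Gaussian variables, shape (300, dx)
# H holds the +/-1 latent states from the final Gibbs sweep, shape (300, dh)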
def sample(self, n, seed=4):
    with util.NumpySeedContext(seed=seed):
        X = np.random.laplace(loc=self.loc, scale=self.scale, size=(n, self.d))
        return Data(X)
def sample(self, n, seed=5):
    with util.NumpySeedContext(seed=seed):
        X = stats.t.rvs(df=self.df, size=n)
        X = X[:, np.newaxis]
        return Data(X)