def test_pca_compare_var(): # create some random data num_samples = 10000 dim = 10 batch_size = 100 num_components = 3 # generate some data mean = np.random.random(dim) cov_factor = np.random.random((dim, dim)) cov = np.dot(cov_factor, cov_factor.T) samples = be.float_tensor( np.random.multivariate_normal(mean, cov, size=num_samples)) samples_train, samples_validate = batch.split_tensor(samples, 0.9) data = batch.Batch({ 'train': batch.InMemoryTable(samples_train, batch_size), 'validate': batch.InMemoryTable(samples_validate, batch_size) }) # find the principal directions pca_sgd = factorization.PCA.from_batch(data, num_components, epochs=10, grad_steps_per_minibatch=1, stepsize=0.01) pca_svd = factorization.PCA.from_svd(samples_train, num_components) assert be.norm(pca_sgd.var - pca_svd.var) / be.norm(pca_sgd.var) < 1e-1
def test_gaussian_1D_1mode_train(): # create some example data num = 10000 mu = 3 sigma = 1 samples = be.randn((num, 1)) * sigma + mu # set up the reader to get minibatches batch_size = 100 samples_train, samples_validate = batch.split_tensor(samples, 0.9) data = batch.Batch({ 'train': batch.InMemoryTable(samples_train, batch_size), 'validate': batch.InMemoryTable(samples_validate, batch_size) }) # parameters learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=0.1) mc_steps = 1 num_epochs = 10 num_sample_steps = 100 # set up the model and initialize the parameters vis_layer = layers.GaussianLayer(1) hid_layer = layers.OneHotLayer(1) rbm = BoltzmannMachine([vis_layer, hid_layer]) rbm.initialize(data, method='hinton') # modify the parameters to shift the initialized model from the data # this forces it to train rbm.layers[0].params = layers.ParamsGaussian( rbm.layers[0].params.loc - 3, rbm.layers[0].params.log_var - 1) # set up the optimizer and the fit method opt = optimizers.ADAM(stepsize=learning_rate) cd = fit.SGD(rbm, data) # fit the model print('training with persistent contrastive divergence') cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps) # sample data from the trained model model_state = \ samplers.SequentialMC.generate_fantasy_state(rbm, num, num_sample_steps) pts_trained = model_state[0] percent_error = 10 mu_trained = be.mean(pts_trained) assert numpy.abs(mu_trained / mu - 1) < (percent_error / 100) sigma_trained = numpy.sqrt(be.var(pts_trained)) assert numpy.abs(sigma_trained / sigma - 1) < (percent_error / 100)
def test_pca_save_read_num_components(): # create some random data num_samples = 10000 dim = 10 batch_size = 100 num_components = 3 num_components_save = 2 # generate some data mean = np.random.random(dim) cov_factor = np.random.random((dim, dim)) cov = np.dot(cov_factor, cov_factor.T) samples = be.float_tensor( np.random.multivariate_normal(mean, cov, size=num_samples)) samples_train, samples_validate = batch.split_tensor(samples, 0.9) data = batch.Batch({ 'train': batch.InMemoryTable(samples_train, batch_size), 'validate': batch.InMemoryTable(samples_validate, batch_size) }) # find the principal directions pca = factorization.PCA.from_batch(data, num_components, epochs=10, grad_steps_per_minibatch=1, stepsize=0.01) # save it pca_file = tempfile.NamedTemporaryFile() store = pd.HDFStore(pca_file.name, mode="w") pca.save(store, num_components_save=num_components_save) # read it pca_read = factorization.PCA.from_saved(store) store.close() # check it assert be.allclose(pca.W[:, :num_components_save], pca_read.W) assert be.allclose(pca.var[:num_components_save], pca_read.var) assert pca.stepsize == pca_read.stepsize assert pca_read.num_components == num_components_save
def test_rbm(paysage_path=None): num_hidden_units = 50 batch_size = 50 num_epochs = 1 learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1) mc_steps = 1 if not paysage_path: paysage_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5') if not os.path.exists(filepath): raise IOError( "{} does not exist. run mnist/download_mnist.py to fetch from the web" .format(filepath)) shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist', 'shuffled_mnist.h5') # shuffle the data if not os.path.exists(shuffled_filepath): shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0) shuffler.shuffle() # set a seed for the random number generator be.set_seed() import pandas samples = pre.binarize_color( be.float_tensor( pandas.read_hdf(shuffled_filepath, key='train/images').values[:10000])) samples_train, samples_validate = batch.split_tensor(samples, 0.95) data = batch.Batch({ 'train': batch.InMemoryTable(samples_train, batch_size), 'validate': batch.InMemoryTable(samples_validate, batch_size) }) # set up the model and initialize the parameters vis_layer = layers.BernoulliLayer(data.ncols) hid_layer = layers.BernoulliLayer(num_hidden_units) rbm = BoltzmannMachine([vis_layer, hid_layer]) rbm.initialize(data) # obtain initial estimate of the reconstruction error perf = ProgressMonitor() untrained_performance = perf.epoch_update(data, rbm, store=True, show=False) # set up the optimizer and the fit method opt = optimizers.RMSProp(stepsize=learning_rate) cd = fit.SGD(rbm, data) # fit the model print('training with contrastive divergence') cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps) # obtain an estimate of the reconstruction error after 1 epoch trained_performance = cd.monitor.memory[-1] assert (trained_performance['ReconstructionError'] < untrained_performance['ReconstructionError']), \ "Reconstruction error did not decrease" # close the HDF5 store data.close()
def test_tap_machine(paysage_path=None): num_hidden_units = 10 batch_size = 100 num_epochs = 5 learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=1.0) if not paysage_path: paysage_path = os.path.dirname( os.path.dirname(os.path.abspath(__file__))) filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5') if not os.path.exists(filepath): raise IOError( "{} does not exist. run mnist/download_mnist.py to fetch from the web" .format(filepath)) shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist', 'shuffled_mnist.h5') # shuffle the data if not os.path.exists(shuffled_filepath): shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0) shuffler.shuffle() # set a seed for the random number generator be.set_seed() # set up the reader to get minibatches samples = pre.binarize_color( be.float_tensor( pandas.read_hdf(shuffled_filepath, key='train/images').as_matrix()[:10000])) samples_train, samples_validate = batch.split_tensor(samples, 0.95) data = batch.Batch({ 'train': batch.InMemoryTable(samples_train, batch_size), 'validate': batch.InMemoryTable(samples_validate, batch_size) }) # set up the model and initialize the parameters vis_layer = layers.BernoulliLayer(data.ncols) hid_layer = layers.BernoulliLayer(num_hidden_units) rbm = BoltzmannMachine([vis_layer, hid_layer]) rbm.initialize(data) # obtain initial estimate of the reconstruction error perf = ProgressMonitor(generator_metrics = \ [ReconstructionError(), TAPLogLikelihood(10), TAPFreeEnergy(10)]) untrained_performance = perf.epoch_update(data, rbm, store=True, show=False) # set up the optimizer and the fit method opt = optimizers.Gradient(stepsize=learning_rate, tolerance=1e-5) tap = fit.TAP(True, 0.1, 0.01, 25, True, 0.5, 0.001, 0.0) solver = fit.SGD(rbm, data) solver.monitor.generator_metrics.append(TAPLogLikelihood(10)) solver.monitor.generator_metrics.append(TAPFreeEnergy(10)) # fit the model print('training with stochastic gradient ascent') solver.train(opt, num_epochs, method=tap.tap_update) # obtain an estimate of the reconstruction error after 1 epoch trained_performance = solver.monitor.memory[-1] assert (trained_performance['TAPLogLikelihood'] > untrained_performance['TAPLogLikelihood']), \ "TAP log-likelihood did not increase" assert (trained_performance['ReconstructionError'] < untrained_performance['ReconstructionError']), \ "Reconstruction error did not decrease" # close the HDF5 store data.close()