def test_pca_compare_var():
    """Compare the variances found by SGD-based PCA with the SVD-based ones.

    Random correlated Gaussian samples are generated, a PCA is fitted once
    with ``factorization.PCA.from_batch`` and once with
    ``factorization.PCA.from_svd``, and the relative difference of the two
    ``var`` attributes (normalised by the SGD result) must be below 0.1.
    """
    # problem sizes
    n_draws = 10000
    n_features = 10
    minibatch = 100
    n_keep = 3

    # random mean and a covariance built as factor . factor^T
    center = np.random.random(n_features)
    factor = np.random.random((n_features, n_features))
    covariance = np.dot(factor, factor.T)
    draws = np.random.multivariate_normal(center, covariance, size=n_draws)
    samples = be.float_tensor(draws)

    # split with fraction 0.9 and expose both parts through a Batch
    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    tables = {
        'train': batch.InMemoryTable(samples_train, minibatch),
        'validate': batch.InMemoryTable(samples_validate, minibatch),
    }
    data = batch.Batch(tables)

    # fit by stochastic gradient steps over the batch ...
    pca_sgd = factorization.PCA.from_batch(
        data,
        n_keep,
        epochs=10,
        grad_steps_per_minibatch=1,
        stepsize=0.01,
    )
    # ... and directly from the training tensor via SVD
    pca_svd = factorization.PCA.from_svd(samples_train, n_keep)

    # relative disagreement between the two variance estimates
    gap = be.norm(pca_sgd.var - pca_svd.var)
    scale = be.norm(pca_sgd.var)
    assert gap / scale < 1e-1
def test_gaussian_1D_1mode_train():
    """Train a Gaussian/one-hot Boltzmann machine on 1-D Gaussian samples.

    The samples are drawn as ``be.randn * sigma + mu``. After training with
    ``fit.pcd``, fantasy particles are generated from the model and their
    mean and standard deviation must each be within 10 percent of the
    values used to generate the data.
    """
    # synthetic data: one Gaussian mode
    num = 10000
    mu = 3
    sigma = 1
    samples = be.randn((num, 1)) * sigma + mu

    # minibatch reader over a train/validate split (fraction 0.9)
    batch_size = 100
    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    tables = {
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size),
    }
    data = batch.Batch(tables)

    # training hyperparameters
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=0.1)
    mc_steps = 1
    num_epochs = 10
    num_sample_steps = 100

    # build the two-layer model and initialize it from the data
    rbm = BoltzmannMachine([layers.GaussianLayer(1), layers.OneHotLayer(1)])
    rbm.initialize(data, method='hinton')

    # move the initialized visible parameters away from the data so that
    # training actually has something to do
    initial = rbm.layers[0].params
    rbm.layers[0].params = layers.ParamsGaussian(
        initial.loc - 3,
        initial.log_var - 1,
    )

    # optimizer and fitting driver
    opt = optimizers.ADAM(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    print('training with persistent contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # draw fantasy particles from the trained model; index 0 is taken as
    # the sampled points
    model_state = samplers.SequentialMC.generate_fantasy_state(
        rbm, num, num_sample_steps)
    pts_trained = model_state[0]

    # allowed relative deviation
    percent_error = 10
    tolerance = percent_error / 100

    mu_trained = be.mean(pts_trained)
    assert numpy.abs(mu_trained / mu - 1) < tolerance

    sigma_trained = numpy.sqrt(be.var(pts_trained))
    assert numpy.abs(sigma_trained / sigma - 1) < tolerance
def test_in_memory_table_batch():
    """Check that ``batch.InMemoryTable`` yields the tensor in ordered chunks.

    ``get()`` is called until it raises ``StopIteration``; each returned
    chunk must match the corresponding row slice of the source tensor, and
    the number of chunks must equal ``num_rows // batch_size``.
    """
    # source data
    num_rows = 10000
    num_cols = 10
    tensor = be.rand((num_rows, num_cols))

    # wrap it in an InMemoryTable
    batch_size = 1000
    expected_batches = num_rows // batch_size
    data = batch.InMemoryTable(tensor, batch_size)

    seen = 0
    while True:
        try:
            chunk = data.get()
        except StopIteration:
            # the table is exhausted
            break

        # the chunk must be the next contiguous block of rows
        start = seen * batch_size
        stop = (seen + 1) * batch_size
        assert be.allclose(chunk, tensor[start:stop])
        seen += 1

    assert seen == expected_batches
def test_grbm_reload():
    """Save a Bernoulli/Gaussian model to HDF5, reload it, and compare.

    The original and reloaded models must produce the same visible output
    from one deterministic iteration on the same state, and the ``mean``
    and ``var`` moments of both layers must agree.
    """
    visible = layers.BernoulliLayer(num_vis, center=True)
    hidden = layers.GaussianLayer(num_hid, center=True)

    # build the model and initialize it from random training data
    grbm = BoltzmannMachine([visible, hidden])
    train_table = batch.InMemoryTable(
        be.randn((10 * num_samples, num_vis)), num_samples)
    data = batch.Batch({'train': train_table})
    grbm.initialize(data)

    with tempfile.NamedTemporaryFile() as tmp:
        # write the model out
        store = pandas.HDFStore(tmp.name, mode='w')
        grbm.save(store)
        store.close()

        # read it back from the same path
        store = pandas.HDFStore(tmp.name, mode='r')
        grbm_reload = BoltzmannMachine.from_saved(store)
        store.close()

    # both models must map the same state to the same visible output
    vis_data = visible.random((num_samples, num_vis))
    data_state = State.from_visible(vis_data, grbm)
    vis_orig = grbm.deterministic_iteration(1, data_state)[0]
    vis_reload = grbm_reload.deterministic_iteration(1, data_state)[0]
    assert be.allclose(vis_orig, vis_reload)

    # layer moments must survive the round trip
    for idx in (0, 1):
        saved = grbm.layers[idx].moments
        restored = grbm_reload.layers[idx].moments
        assert be.allclose(saved.mean, restored.mean)
        assert be.allclose(saved.var, restored.var)
def test_pca_save_read_num_components():
    """Save a PCA with fewer components than were fitted and read it back.

    A PCA with ``num_components`` components is fitted with
    ``factorization.PCA.from_batch`` and saved with
    ``num_components_save`` components. The model read back with
    ``factorization.PCA.from_saved`` must hold the leading
    ``num_components_save`` columns of ``W`` and entries of ``var``, the
    same ``stepsize``, and report ``num_components == num_components_save``.
    """
    # create some random data
    num_samples = 10000
    dim = 10
    batch_size = 100
    num_components = 3
    num_components_save = 2

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))
    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch({
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size),
    })

    # find the principal directions
    pca = factorization.PCA.from_batch(data, num_components, epochs=10,
                                       grad_steps_per_minibatch=1,
                                       stepsize=0.01)

    # Both the temporary file and the HDF store are held in context
    # managers so they are released even when save/read/asserts fail.
    with tempfile.NamedTemporaryFile() as pca_file:
        with pd.HDFStore(pca_file.name, mode="w") as store:
            # save it
            pca.save(store, num_components_save=num_components_save)
            # read it back through the same, still open, store handle
            pca_read = factorization.PCA.from_saved(store)

        # check it
        assert be.allclose(pca.W[:, :num_components_save], pca_read.W)
        assert be.allclose(pca.var[:num_components_save], pca_read.var)
        assert pca.stepsize == pca_read.stepsize
        assert pca_read.num_components == num_components_save
def test_grbm_save():
    """Smoke test: an initialized Bernoulli/Gaussian model can be saved.

    The model is initialized from random data and written to an HDF5 store
    backed by a temporary file; the test passes if nothing raises.
    """
    visible = layers.BernoulliLayer(num_vis, center=True)
    hidden = layers.GaussianLayer(num_hid, center=True)
    grbm = BoltzmannMachine([visible, hidden])

    # random training data, used only to initialize the model
    train_table = batch.InMemoryTable(
        be.randn((10 * num_samples, num_vis)), num_samples)
    data = batch.Batch({'train': train_table})
    grbm.initialize(data)

    with tempfile.NamedTemporaryFile() as tmp:
        store = pandas.HDFStore(tmp.name, mode='w')
        grbm.save(store)
        store.close()
def test_in_memory_batch():
    """Check that ``batch.Batch`` serves 'train' and 'validate' in order.

    Both tables wrap the same tensor. ``get`` is called for each key until
    ``StopIteration`` is raised; every returned chunk must match the
    corresponding row slice of the tensor, the number of chunks must equal
    ``num_rows // batch_size``, and the generators are reset at the end.
    """
    # source data
    num_rows = 10000
    num_cols = 10
    tensor = be.rand((num_rows, num_cols))

    batch_size = 1000
    expected_batches = num_rows // batch_size

    tables = {
        'train': batch.InMemoryTable(tensor, batch_size),
        'validate': batch.InMemoryTable(tensor, batch_size),
    }
    with batch.Batch(tables) as data:
        n_seen = 0
        exhausted = False
        while not exhausted:
            try:
                train_chunk = data.get("train")
                validate_chunk = data.get("validate")
            except StopIteration:
                exhausted = True
            else:
                # both keys must return the next contiguous block of rows
                lo = n_seen * batch_size
                hi = (n_seen + 1) * batch_size
                assert be.allclose(train_chunk, tensor[lo:hi])
                assert be.allclose(validate_chunk, tensor[lo:hi])
                n_seen += 1

        # one full pass must have produced every batch
        assert n_seen == expected_batches
        data.reset_generator("all")
def test_rbm(paysage_path=None):
    """Train a Bernoulli-Bernoulli model on MNIST for one epoch.

    The reconstruction error recorded by the trainer's monitor after
    training must be lower than the one measured before training.

    Args:
        paysage_path: root directory containing ``examples/mnist``. When
            falsy, the grandparent directory of this file is used.

    Raises:
        IOError: if ``examples/mnist/mnist.h5`` does not exist.
    """
    num_hidden_units = 50
    batch_size = 50
    num_epochs = 1
    learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1)
    mc_steps = 1

    # locate the MNIST files relative to the package root
    paysage_path = paysage_path or os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    mnist_dir = os.path.join(paysage_path, 'examples', 'mnist')
    filepath = os.path.join(mnist_dir, 'mnist.h5')
    shuffled_filepath = os.path.join(mnist_dir, 'shuffled_mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    # create the shuffled copy on first use
    if not os.path.exists(shuffled_filepath):
        batch.DataShuffler(filepath, shuffled_filepath, complevel=0).shuffle()

    # set a seed for the random number generator
    be.set_seed()

    import pandas

    # first 10000 training images, binarized
    images = pandas.read_hdf(shuffled_filepath, key='train/images')
    samples = pre.binarize_color(be.float_tensor(images.values[:10000]))

    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    tables = {
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size),
    }
    data = batch.Batch(tables)

    # two Bernoulli layers, initialized from the data
    rbm = BoltzmannMachine([
        layers.BernoulliLayer(data.ncols),
        layers.BernoulliLayer(num_hidden_units),
    ])
    rbm.initialize(data)

    # baseline metrics before any training
    perf = ProgressMonitor()
    untrained_performance = perf.epoch_update(data, rbm, store=True,
                                              show=False)

    # optimizer and fitting driver
    opt = optimizers.RMSProp(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    print('training with contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # most recent entry stored by the trainer's monitor
    trained_performance = cd.monitor.memory[-1]

    error_after = trained_performance['ReconstructionError']
    error_before = untrained_performance['ReconstructionError']
    assert error_after < error_before, \
        "Reconstruction error did not decrease"

    # close the batch
    data.close()
def test_tap_machine(paysage_path=None):
    """Train a Bernoulli-Bernoulli model on MNIST with the TAP update.

    After ``num_epochs`` epochs the 'TAPLogLikelihood' metric must have
    increased and the 'ReconstructionError' metric must have decreased
    relative to the values measured before training.

    Args:
        paysage_path: root directory containing ``examples/mnist``. When
            falsy, the grandparent directory of this file is used.

    Raises:
        IOError: if ``examples/mnist/mnist.h5`` does not exist.
    """
    num_hidden_units = 10
    batch_size = 100
    num_epochs = 5
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=1.0)

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath,
                                      complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    # NOTE: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
    # in 1.0; .values returns the same underlying array.
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath,
                            key='train/images').values[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch({
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size),
    })

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimates of the monitored metrics
    perf = ProgressMonitor(
        generator_metrics=[ReconstructionError(),
                           TAPLogLikelihood(10),
                           TAPFreeEnergy(10)])
    untrained_performance = perf.epoch_update(data, rbm, store=True,
                                              show=False)

    # set up the optimizer and the fit method
    opt = optimizers.Gradient(stepsize=learning_rate, tolerance=1e-5)
    # NOTE(review): positional hyperparameters; their meaning is defined by
    # the fit.TAP signature, which is not visible here -- confirm there.
    tap = fit.TAP(True, 0.1, 0.01, 25, True, 0.5, 0.001, 0.0)
    solver = fit.SGD(rbm, data)
    # track the TAP metrics during training so they can be compared below
    solver.monitor.generator_metrics.append(TAPLogLikelihood(10))
    solver.monitor.generator_metrics.append(TAPFreeEnergy(10))

    # fit the model
    print('training with stochastic gradient ascent')
    solver.train(opt, num_epochs, method=tap.tap_update)

    # most recent metrics stored by the solver's monitor after training
    trained_performance = solver.monitor.memory[-1]

    assert (trained_performance['TAPLogLikelihood'] >
            untrained_performance['TAPLogLikelihood']), \
        "TAP log-likelihood did not increase"
    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
        "Reconstruction error did not decrease"

    # close the batch
    data.close()