Example #1
def test_mean():
    # create some random data
    s = be.rand((100000,))

    # reference result
    ref_mean = be.mean(s)

    # do the online calculation
    mv = math_utils.MeanVarianceCalculator()
    for i in range(10):
        mv.update(s[i * 10000:(i + 1) * 10000])

    assert be.allclose(be.float_tensor(np.array([ref_mean])),
                       be.float_tensor(np.array([mv.mean])))
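The online calculation above can be understood through Welford-style streaming updates, which merge per-chunk statistics into running totals. A minimal plain-NumPy sketch of the idea (illustrative only; math_utils.MeanVarianceCalculator may track the moments differently):

import numpy as np

class StreamingMeanVar:
    """Welford/Chan-style streaming mean and variance (illustrative sketch)."""
    def __init__(self):
        self.num = 0
        self.mean = 0.0
        self.m2 = 0.0  # running sum of squared deviations from the mean

    def update(self, chunk):
        chunk = np.asarray(chunk, dtype=float)
        n, m = self.num, len(chunk)
        delta = chunk.mean() - self.mean
        # merge the chunk's statistics into the running totals
        self.mean += delta * m / (n + m)
        self.m2 += ((chunk - chunk.mean()) ** 2).sum() + delta ** 2 * n * m / (n + m)
        self.num = n + m

    @property
    def var(self):
        return self.m2 / self.num  # population variance of the data seen so far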
Example #2
def test_pca_compare_var():
    # set the problem parameters
    num_samples = 10000
    dim = 10
    batch_size = 100
    num_components = 3

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch({
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size)
    })

    # find the principal directions
    pca_sgd = factorization.PCA.from_batch(data,
                                           num_components,
                                           epochs=10,
                                           grad_steps_per_minibatch=1,
                                           stepsize=0.01)
    pca_svd = factorization.PCA.from_svd(samples_train, num_components)

    assert be.norm(pca_sgd.var - pca_svd.var) / be.norm(pca_sgd.var) < 1e-1
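PCA.from_batch fits the principal directions with stochastic gradient steps on minibatches. The flavor of such an update can be sketched with an Oja-style rule in plain NumPy (a schematic under that assumption; the actual from_batch update, stepsize schedule, and variance estimate may differ):

import numpy as np

def oja_step(W, minibatch, stepsize=0.01):
    # center the minibatch and project it onto the current directions
    X = minibatch - minibatch.mean(axis=0)
    Y = X @ W
    # Oja-style step toward the leading principal subspace
    W = W + stepsize * (X.T @ Y - W @ (Y.T @ Y)) / len(X)
    # re-orthonormalize so the columns stay well conditioned
    W, _ = np.linalg.qr(W)
    return W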
Example #3
def test_pca_svd_save_read():
    # set the problem parameters
    num_samples = 10000
    dim = 10
    num_components = 3

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    # find the principal directions
    pca = factorization.PCA.from_svd(samples, num_components)

    # save it
    pca_file = tempfile.NamedTemporaryFile()
    store = pd.HDFStore(pca_file.name, mode="w")
    pca.save(store)

    # read it
    pca_read = factorization.PCA.from_saved(store)
    store.close()

    # check it
    assert be.allclose(pca.W, pca_read.W)
    assert be.allclose(pca.var, pca_read.var)
    assert pca.stepsize == pca_read.stepsize
    assert pca.num_components == pca_read.num_components
Example #4
def test_mean():
    # create some random data
    num = 100000
    num_steps = 10
    stepsize = num // num_steps
    s = be.rand((num,))

    # reference result
    ref_mean = be.mean(s)

    # do the online calculation
    mv = math_utils.MeanCalculator()
    for i in range(num_steps):
        mv.update(s[i*stepsize:(i+1)*stepsize])

    assert be.allclose(be.float_tensor(np.array([ref_mean])),
                       be.float_tensor(np.array([mv.mean])))
Example #5
    @classmethod
    def from_dataframe(cls, df):
        """
        Create a MeanVarianceArrayCalculator from a DataFrame config.

        Args:
            df (DataFrame): the parameters, stored as a DataFrame.

        Returns:
            MeanVarianceArrayCalculator

        """
        mvac = cls()
        mvac.num = (df["num"].astype(int))[0] # constant column
        mvac.mean = be.float_tensor(df["mean"].astype(float))
        mvac.var = be.float_tensor(df["var"].astype(float))
        mvac.square = be.float_tensor(df["square"].astype(float))
        return mvac
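A hypothetical round trip, assuming the DataFrame carries the same "num", "mean", "var", and "square" columns that from_dataframe reads, and that the class lives in paysage's math_utils module as the other examples suggest:

import numpy as np
import pandas as pd
from paysage import math_utils

# "num" is a constant column; the other columns hold one entry per array element
df = pd.DataFrame({
    "num": 1000,
    "mean": np.zeros(10),
    "var": np.ones(10),
    "square": np.ones(10),
})
mvac = math_utils.MeanVarianceArrayCalculator.from_dataframe(df)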
Example #6
def create_batch(batch_size, train_fraction=0.95, transform=be.do_nothing):
    """
    Create a Batch reader.

    Args:
        batch_size (int): the number of samples per minibatch.
        train_fraction (float): the training data fraction.
        transform (callable): the transform function.

    Returns:
        data (Batch): a batcher.

    """
    samples = be.float_tensor(pandas.read_hdf(
                default_paths(), key='train/images').values)
    return batch.in_memory_batch(samples, batch_size, train_fraction, transform)
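A hypothetical usage sketch, mirroring the data.get(...) and data.close() calls that appear in the training examples below (the exact Batch interface is assumed):

# build the batcher, draw one training minibatch, then release the store
data = create_batch(batch_size=100)
minibatch = data.get('train')  # assumed to return the next training minibatch
data.close()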
Example #7
def test_pca_svd():
    # set the problem parameters
    num_samples = 10000
    dim = 10
    num_components = 3

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    # find the principal directions
    pca = factorization.PCA.from_svd(samples, num_components)

    assert be.shape(pca.W) == (dim, num_components)
    assert be.shape(pca.var) == (num_components, )
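The asserted shapes follow from the standard SVD route to PCA, sketched here in plain NumPy (illustrative; factorization.PCA.from_svd may differ in details such as sign conventions):

import numpy as np

def pca_svd_sketch(samples, num_components):
    # center the data so the SVD recovers directions of maximal variance
    centered = samples - samples.mean(axis=0)
    _, s, vt = np.linalg.svd(centered, full_matrices=False)
    W = vt[:num_components].T                           # (dim, num_components)
    var = s[:num_components] ** 2 / (len(samples) - 1)  # per-component variance
    return W, var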
Example #8
def test_pca_save_read_num_components():
    # set the problem parameters
    num_samples = 10000
    dim = 10
    batch_size = 100
    num_components = 3
    num_components_save = 2

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch({
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size)
    })

    # find the principal directions
    pca = factorization.PCA.from_batch(data,
                                       num_components,
                                       epochs=10,
                                       grad_steps_per_minibatch=1,
                                       stepsize=0.01)

    # save it
    pca_file = tempfile.NamedTemporaryFile()
    store = pd.HDFStore(pca_file.name, mode="w")
    pca.save(store, num_components_save=num_components_save)

    # read it
    pca_read = factorization.PCA.from_saved(store)
    store.close()

    # check it
    assert be.allclose(pca.W[:, :num_components_save], pca_read.W)
    assert be.allclose(pca.var[:num_components_save], pca_read.var)
    assert pca.stepsize == pca_read.stepsize
    assert pca_read.num_components == num_components_save
Example #9
def test_rbm(paysage_path=None):

    num_hidden_units = 50
    batch_size = 50
    num_epochs = 1
    learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1)
    mc_steps = 1

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    import pandas
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath,
                            key='train/images').values[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch({
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size)
    })

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = ProgressMonitor()
    untrained_performance = perf.epoch_update(data,
                                              rbm,
                                              store=True,
                                              show=False)

    # set up the optimizer and the fit method
    opt = optimizers.RMSProp(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    # fit the model
    print('training with contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = cd.monitor.memory[-1]

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()
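The fit.pcd method used above is persistent contrastive divergence. The core contrastive-divergence step can be sketched for a Bernoulli-Bernoulli RBM in plain NumPy (a schematic of plain CD-1; the persistent variant keeps the negative-phase chains alive between updates, and paysage's implementation differs in many details):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def cd1_update(W, a, b, v0, lr=0.01, rng=np.random.default_rng(0)):
    # positive phase: hidden activations driven by the data
    ph0 = sigmoid(v0 @ W + b)
    h0 = (rng.random(ph0.shape) < ph0).astype(float)
    # negative phase: one step of block Gibbs sampling
    pv1 = sigmoid(h0 @ W.T + a)
    ph1 = sigmoid(pv1 @ W + b)
    # approximate gradient ascent on the log-likelihood
    W += lr * (v0.T @ ph0 - pv1.T @ ph1) / len(v0)
    a += lr * (v0 - pv1).mean(axis=0)
    b += lr * (ph0 - ph1).mean(axis=0)
    return W, a, b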
Example #10
def test_binarize_color():
    result_pre = [
        pre.binarize_color(pre.scale(tensor, 1 / 255)) for tensor in tensors
    ]
    result_ref = [be.float_tensor(be.tround(tensor)) for tensor in tensors]
    assert compare_lists(result_pre, result_ref)
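The reference result implies that pre.scale divides by its second argument (mapping [0, 1] data onto the 8-bit range) and that binarize_color thresholds 8-bit intensities at mid-gray. A minimal sketch under those assumptions:

import numpy as np

def binarize_color_sketch(tensor):
    # assumed behavior: map 8-bit grayscale to {0, 1} by thresholding at mid-gray
    return (np.asarray(tensor) > 127.5).astype(float)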
Example #11
def run(paysage_path=None, num_epochs=10, show_plot=False):
    num_hidden_units = 256
    batch_size = 100
    learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1)
    mc_steps = 1

    (_, _, shuffled_filepath) = util.default_paths(paysage_path)

    # set up the reader to get minibatches
    import pandas
    data = batch.InMemoryBatch(
        pre.binarize_color(be.float_tensor(
            pandas.read_hdf(shuffled_filepath, key='train/images').values)),
        batch_size,
        train_fraction=0.95)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = model.Model([vis_layer, hid_layer])
    rbm.weights[0].add_penalty({'matrix': pen.l2_penalty(0.001)})
    rbm.initialize(data, method='glorot_normal')

    metrics = [
        'ReconstructionError', 'EnergyDistance', 'EnergyGap', 'EnergyZscore',
        'HeatCapacity', 'WeightSparsity', 'WeightSquare'
    ]
    perf = fit.ProgressMonitor(data, metrics=metrics)

    # set up the optimizer and the fit method
    opt = optimizers.ADAM(stepsize=learning_rate)

    sampler = fit.DrivenSequentialMC.from_batch(rbm, data)

    cd = fit.SGD(rbm,
                 data,
                 opt,
                 num_epochs,
                 sampler,
                 method=fit.pcd,
                 mcsteps=mc_steps,
                 monitor=perf)

    # fit the model
    print('training with contrastive divergence')
    cd.train()

    # evaluate the model
    util.show_metrics(rbm, perf)
    valid = data.get('validate')
    util.show_reconstructions(rbm,
                              valid,
                              fit,
                              show_plot,
                              n_recon=10,
                              vertical=False,
                              num_to_avg=10)
    util.show_fantasy_particles(rbm, valid, fit, show_plot, n_fantasy=25)
    util.show_weights(rbm, show_plot, n_weights=25)

    # close the HDF5 store
    data.close()
    print("Done")
Example #12
def test_tap_machine(paysage_path=None):
    num_hidden_units = 10
    batch_size = 100
    num_epochs = 5
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=1.0)

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath,
                            key='train/images').values[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch({
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size)
    })

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = ProgressMonitor(generator_metrics=[
        ReconstructionError(), TAPLogLikelihood(10), TAPFreeEnergy(10)])
    untrained_performance = perf.epoch_update(data,
                                              rbm,
                                              store=True,
                                              show=False)

    # set up the optimizer and the fit method
    opt = optimizers.Gradient(stepsize=learning_rate, tolerance=1e-5)
    tap = fit.TAP(True, 0.1, 0.01, 25, True, 0.5, 0.001, 0.0)
    solver = fit.SGD(rbm, data)
    solver.monitor.generator_metrics.append(TAPLogLikelihood(10))
    solver.monitor.generator_metrics.append(TAPFreeEnergy(10))

    # fit the model
    print('training with stochastic gradient ascent')
    solver.train(opt, num_epochs, method=tap.tap_update)

    # obtain an estimate of the performance metrics after training
    trained_performance = solver.monitor.memory[-1]

    assert (trained_performance['TAPLogLikelihood'] >
            untrained_performance['TAPLogLikelihood']), \
    "TAP log-likelihood did not increase"
    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()