Python DataShuffler示例，paysage.batch.DataShuffler Python示例

示例#1

0

显示文件

文件： example_util.py 项目： jdonald/paysage

def default_shuffled_filepath(paysage_path, filepath):
    shuffled_filepath = os.path.join(paysage_path, 'mnist',
                                     'shuffled_mnist.h5')
    if not os.path.exists(shuffled_filepath):
        print("Shuffled file does not exist, creating a shuffled dataset.")
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()
    return shuffled_filepath

示例#2

0

显示文件

def test_shuffle():
    # create temporary files
    file_original = tempfile.NamedTemporaryFile()
    file_shuffle = tempfile.NamedTemporaryFile()

    # create data
    num_rows = 10000
    num_cols_A = 100
    num_cols_B = 1
    df_A = pd.DataFrame(
        np.arange(num_rows * num_cols_A).reshape(num_rows, num_cols_A),
        columns=['col_{}'.format(i) for i in np.arange(num_cols_A)],
        index=['ix_{}'.format(i) for i in np.arange(num_rows)])
    df_B = pd.DataFrame(
        np.arange(num_rows * num_cols_B).reshape(num_rows, num_cols_B),
        columns=['col_{}'.format(i) for i in np.arange(num_cols_B)],
        index=['ix_{}'.format(i) for i in np.arange(num_rows)])

    # save it
    store = pd.HDFStore(file_original.name, mode='w', format='table')
    store.append("A", df_A)
    store.append("B", df_B)
    store.close()

    # shuffle it, with an artificially low memory limit
    shuffler = batch.DataShuffler(file_original.name,
                                  file_shuffle.name,
                                  allowed_mem=0.001)
    shuffler.shuffle()

    # read the shuffled data
    df_As = pd.read_hdf(file_shuffle.name, "A")
    df_Bs = pd.read_hdf(file_shuffle.name, "B")

    # check the two shuffles are consistent
    assert (df_As.index == df_Bs.index).all()
    assert (df_As['col_0'] // num_cols_A == df_Bs['col_0'] // num_cols_B).all()

    # check that the shuffles preserve the index
    ix_A_orig = sorted(list(df_A.index))
    ix_A_shuffled = sorted(list(df_As.index))
    assert ix_A_orig == ix_A_shuffled

    # check a couple of statistics
    vals_B = df_B['col_0'].values
    vals_Bs = df_Bs['col_0'].values
    # the number of fixed points tends to a Poisson distribution with e.v. = 1
    assert (vals_B == vals_Bs).sum() < 5
    # the difference between values (using the natural numbers as values)
    # is a triangular distribution centered at 0. Can check the variance.
    diff_dist_std = (vals_B - vals_Bs).std()
    assert np.abs(diff_dist_std / (num_rows / np.sqrt(6)) - 1) < 0.05

示例#3

0

显示文件

文件： mnist_util.py 项目： shevisjohnson/paysage

def default_paths(file = "shuffled"):
    files = {"shuffled": {"input": "mnist.h5", "output": "shuffled_mnist.h5"},
            }
    file_path = os.path.abspath(__file__)
    mnist_path = os.path.join(os.path.dirname(file_path), files[file]["input"])
    if not os.path.exists(mnist_path):
        raise IOError("{} does not exist. run download_mnist.py to fetch from the web"
                      .format(mnist_path))
    shuffled_path = os.path.join(os.path.dirname(file_path), files[file]["output"])
    if not os.path.exists(shuffled_path):
        print("Shuffled file does not exist, creating a shuffled dataset.")
        shuffler = batch.DataShuffler(mnist_path, shuffled_path, complevel=0)
        shuffler.shuffle()
    return shuffled_path

示例#4

0

显示文件

def test_rbm(paysage_path=None):

    num_hidden_units = 50
    batch_size = 50
    num_epochs = 1
    learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1)
    mc_steps = 1

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    import pandas
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath,
                            key='train/images').values[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch({
        'train':
        batch.InMemoryTable(samples_train, batch_size),
        'validate':
        batch.InMemoryTable(samples_validate, batch_size)
    })

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = ProgressMonitor()
    untrained_performance = perf.epoch_update(data,
                                              rbm,
                                              store=True,
                                              show=False)

    # set up the optimizer and the fit method
    opt = optimizers.RMSProp(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    # fit the model
    print('training with contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = cd.monitor.memory[-1]

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()

示例#5

0

显示文件

def test_rbm(paysage_path=None):

    num_hidden_units = 50
    batch_size = 50
    num_epochs = 1
    learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1)
    mc_steps = 1

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    data = batch.HDFBatch(shuffled_filepath,
                          'train/images',
                          batch_size,
                          transform=pre.binarize_color,
                          train_fraction=0.99)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = model.Model([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = fit.ProgressMonitor(data, metrics=['ReconstructionError'])
    untrained_performance = perf.check_progress(rbm)

    # set up the optimizer and the fit method
    opt = optimizers.RMSProp(stepsize=learning_rate)

    sampler = fit.DrivenSequentialMC.from_batch(rbm, data)

    cd = fit.SGD(rbm,
                 data,
                 opt,
                 num_epochs,
                 sampler,
                 method=fit.pcd,
                 mcsteps=mc_steps,
                 monitor=perf)

    # fit the model
    print('training with contrastive divergence')
    cd.train()

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = perf.check_progress(rbm)

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()

示例#6

0

显示文件

文件： test_tap_machine.py 项目： jdonald/paysage

def test_tap_machine(paysage_path=None):
    num_hidden_units = 10
    batch_size = 50
    num_epochs = 1
    learning_rate = 0.01

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    data = batch.Batch(shuffled_filepath,
                       'train/images',
                       batch_size,
                       transform=batch.binarize_color,
                       train_fraction=0.1)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = tap_machine.TAP_rbm([vis_layer, hid_layer],
                              tolerance_EMF=1e-2,
                              max_iters_EMF=50)
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = fit.ProgressMonitor(data, metrics=['ReconstructionError'])
    untrained_performance = perf.check_progress(rbm)

    # set up the optimizer and the fit method
    opt = optimizers.Gradient(stepsize=learning_rate,
                              scheduler=optimizers.PowerLawDecay(0.1),
                              tolerance=1e-3,
                              ascent=True)

    solver = fit.SGD(rbm, data, opt, num_epochs, method=fit.tap, monitor=perf)

    # fit the model
    print('training with stochastic gradient ascent')
    solver.train()

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = perf.check_progress(rbm)

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()

示例#7

0

显示文件

def test_tap_machine(paysage_path=None):
    num_hidden_units = 10
    batch_size = 100
    num_epochs = 5
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=1.0)

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath,
                            key='train/images').as_matrix()[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch({
        'train':
        batch.InMemoryTable(samples_train, batch_size),
        'validate':
        batch.InMemoryTable(samples_validate, batch_size)
    })

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = ProgressMonitor(generator_metrics = \
            [ReconstructionError(), TAPLogLikelihood(10), TAPFreeEnergy(10)])
    untrained_performance = perf.epoch_update(data,
                                              rbm,
                                              store=True,
                                              show=False)

    # set up the optimizer and the fit method
    opt = optimizers.Gradient(stepsize=learning_rate, tolerance=1e-5)
    tap = fit.TAP(True, 0.1, 0.01, 25, True, 0.5, 0.001, 0.0)
    solver = fit.SGD(rbm, data)
    solver.monitor.generator_metrics.append(TAPLogLikelihood(10))
    solver.monitor.generator_metrics.append(TAPFreeEnergy(10))

    # fit the model
    print('training with stochastic gradient ascent')
    solver.train(opt, num_epochs, method=tap.tap_update)

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = solver.monitor.memory[-1]

    assert (trained_performance['TAPLogLikelihood'] >
            untrained_performance['TAPLogLikelihood']), \
    "TAP log-likelihood did not increase"
    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()

示例#8

0

显示文件

文件： test_rbm.py 项目： austinvhuang/paysage

def test_rbm(paysage_path=None):
    """TODO : this is just a placeholder, need to clean up & simplifiy setup. Also
    need to figure how to deal with consistent random seeding throughout the
    codebase to obtain deterministic checkable results.
    """
    num_hidden_units = 50
    batch_size = 50
    num_epochs = 1
    learning_rate = 0.01
    mc_steps = 1

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    data = batch.Batch(shuffled_filepath,
                       'train/images',
                       batch_size,
                       transform=batch.binarize_color,
                       train_fraction=0.99)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = hidden.Model([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = fit.ProgressMonitor(0, data, metrics=[M.ReconstructionError()])
    untrained_performance = perf.check_progress(rbm, 0)

    # set up the optimizer and the fit method
    opt = optimizers.RMSProp(rbm,
                             stepsize=learning_rate,
                             scheduler=optimizers.PowerLawDecay(0.1))

    sampler = fit.DrivenSequentialMC.from_batch(rbm, data, method='stochastic')

    cd = fit.PCD(rbm,
                 data,
                 opt,
                 sampler,
                 num_epochs,
                 mcsteps=mc_steps,
                 skip=200,
                 metrics=[M.ReconstructionError()])

    # fit the model
    print('training with contrastive divergence')
    cd.train()

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = perf.check_progress(rbm, 0)

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()