Example No. 1
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

synk.fork()
s_init = np.ones(3, dtype='float32')
x = T.matrix('x')
s = theano.shared(s_init, name='s')
s_old = s
f = synk.function([x], updates={s: T.sum(x * s, axis=0)})
synk.distribute()
x_dat = np.array([[1., 1, 1],
                  [2, 2, 2],
                  [3, 3, 3],
                  [4, 4, 4]]).astype('float32')
print("\ns initial:\n", s.get_value())
f.as_theano(x_dat)
print("\ns after Theano call:\n", s.get_value())
s.set_value(s_init)
f(x_dat)
print("\nlocal s after reset and Synkhronos call:\n", s.get_value())
gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s:\n", gathered_s)
synk.reduce(s, op="sum")
print("\nlocal s after in-place reduce:\n", s.get_value())
gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after reduce:\n", gathered_s)
s.set_value(s_init)
synk.broadcast(s)
f(x_dat)
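One natural continuation (not part of the original) is to gather once more after the broadcast and the final call, to inspect the per-GPU results; this sketch uses only calls already shown above:

print("\nlocal s after broadcast and Synkhronos call:\n", s.get_value())
print("\ngathered s after broadcast and Synkhronos call:\n", synk.gather(s, nd_up=1))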
Example No. 2
import time

import numpy as np
import theano
import theano.tensor as T
import lasagne
import synkhronos as synk

# (Helper functions such as load_dataset and the build_* model constructors are
# defined elsewhere in the full script.)


def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Fork workers and initialize the GPUs before building any variables.
    synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss,
                                                params,
                                                learning_rate=0.01,
                                                momentum=0.9)
    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)
    train_fn = synk.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    val_fn = synk.function([input_var, target_var], [test_loss, test_acc])

    # Send all functions and variables to workers (in the future, automatic)
    synk.distribute()

    # Write data into input shared memory (also applies to val_fn--same vars).
    X_train_synk, y_train_synk = train_fn.build_inputs(X_train, y_train)
    X_val_synk, y_val_synk = train_fn.build_inputs(X_val, y_val)
    X_test_synk, y_test_synk = train_fn.build_inputs(X_test, y_test)

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatch_indices(len(y_train), 500,
                                               shuffle=True):
            train_err += train_fn(X_train_synk, y_train_synk, batch=batch)
            synk.all_reduce(params)  # average parameter values across GPUs
            train_batches += 1
        mid_time = time.time()

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatch_indices(len(y_val), 500, shuffle=False):
            err, acc = val_fn(X_val_synk,
                              y_val_synk,
                              batch=batch,
                              num_slices=1)
            val_err += err
            val_acc += acc
            val_batches += 1
        end_time = time.time()

        val_fn_time = end_time - mid_time
        train_fn_time = mid_time - start_time

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("Train function time: {:.3f}s".format(train_fn_time))
        print("Validation function time: {:.3f}s".format(val_fn_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc /
                                                          val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatch_indices(len(y_test), 500, shuffle=False):
        err, acc = val_fn(X_test_synk, y_test_synk, batch=batch)
        test_err += err
        test_acc += acc
        test_batches += 1

    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # Optionally, you could now dump the network weights to a file like this:
    np.savez('model.npz', *lasagne.layers.get_all_param_values(network))

    # And load them again later on like this:
    with np.load('model.npz') as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(network, param_values)
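The helper iterate_minibatch_indices used in the loops above is not included in this snippet. A minimal hypothetical version that yields index arrays compatible with the batch= argument might look like this (an assumption, not the original implementation):

def iterate_minibatch_indices(data_length, batch_size, shuffle=False):
    # Yield one mini-batch of indices at a time; any remainder is dropped.
    indices = np.arange(data_length)
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, data_length - batch_size + 1, batch_size):
        yield indices[start:start + batch_size]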
Example No. 3
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

n_gpus = synk.fork()

# Make data-parallel computation with Theano shared variable (exists on GPU).
dtype = theano.config.floatX
s_x = theano.shared(np.ones([100, 4], dtype=dtype), name='s_x')
s_y = theano.shared(np.zeros([4, 5], dtype=dtype), name='s_y')
s_unused = theano.shared(np.zeros([5, 5], dtype=dtype))  # (see note at bottom)
z = T.mean(s_x.dot(s_y), axis=0)

f = synk.function(inputs=[], sliceable_shareds=[s_x], outputs=z)
synk.distribute()  # (shared variable data sent to workers with function)

# Inspect values of Theano shared variables--separate copy on each GPU.
print("\nLengths of s_x on each GPU: ", synk.get_lengths(s_x))
print("Shapes of s_x on each GPU: ", synk.get_shapes(s_x))

x_dat = np.random.randn(8 * n_gpus, 4).astype(dtype)
y_dat = np.random.randn(4, 5).astype(dtype)

# Manipulate values of Theano shared variables across all GPUs.
synk.scatter(s_x, x_dat)
synk.broadcast(s_y, y_dat)  # (without data arg, operates on existing var data)

print("\nData scattered to s_x and broadcast to s_y...")
print("\nShapes of s_x on each GPU: ", synk.get_shapes(s_x))
gathered_x = synk.gather(s_x, nd_up=0)
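A small check one might append (this assumes gather with nd_up=0 concatenates the per-GPU slices in rank order, so a scatter followed immediately by a gather round-trips the data):

print("\nShape of gathered s_x: ", gathered_x.shape)  # expected: (8 * n_gpus, 4)
assert np.allclose(gathered_x, x_dat)  # scatter then gather should recover x_dat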
Example No. 4
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk
from timeit import default_timer as timer

# (build_mlp, build_cnn, and make_data are helpers from the full script; a
# hypothetical make_data is sketched after this example.)


def main():

    B_SIZE = 10000
    MID = B_SIZE // 2

    synk.fork()
    import lasagne

    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    network = build_mlp(input_var)
    # network = build_cnn(input_var)
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(network, trainable=True)

    grads = theano.grad(loss, wrt=params)
    flat_grad = T.concatenate(list(map(T.flatten, grads)))

    f_loss = synk.function([input_var, target_var],
                           loss,
                           collect_modes=[None],
                           reduce_ops="sum")
    f_grad = synk.function([input_var, target_var],
                           flat_grad,
                           collect_modes=[None])

    synk.distribute()

    x_data, y_data = make_data([1, 28, 28], B_SIZE)

    loss_1 = f_loss(x_data, y_data)
    grad_1 = f_grad(x_data, y_data)

    x_shmem, y_shmem = f_loss.get_input_shmems()
    x_dat_sh = x_shmem[:B_SIZE]
    y_dat_sh = y_shmem[:B_SIZE]
    x_data_1 = x_data[:MID]
    x_data_2 = x_data[MID:]
    y_data_1 = y_data[:MID]
    y_data_2 = y_data[MID:]

    ITERS = 10
    t0 = timer()
    for _ in range(ITERS):
        loss_i = f_loss.as_theano(x_data_1, y_data_1)
        loss_j = f_loss.as_theano(x_data_2, y_data_2)
    loss_time = timer() - t0
    print("theano loss_time: ", loss_time)

    t0 = timer()
    for _ in range(ITERS):
        grad_i = f_grad.as_theano(x_data_1, y_data_1)
        grad_j = f_grad.as_theano(x_data_2, y_data_2)
    grad_time = timer() - t0
    print("theano grad_time: ", grad_time)

    t0 = timer()
    for _ in range(ITERS):
        loss_i = f_loss(x_dat_sh, y_dat_sh)
    loss_time = timer() - t0
    print("synk shmem loss_time: ", loss_time)

    t0 = timer()
    for _ in range(ITERS):
        grad_i = f_grad(x_dat_sh, y_dat_sh)
    grad_time = timer() - t0
    print("synk shmem grad_time: ", grad_time)

    t0 = timer()
    for _ in range(ITERS):
        loss_i = f_loss(x_data, y_data)
    loss_time = timer() - t0
    print("synk new input loss_time: ", loss_time)

    t0 = timer()
    for _ in range(ITERS):
        grad_i = f_grad(x_data, y_data)
    grad_time = timer() - t0
    print("synk new input grad_time: ", grad_time)
Example No. 5
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

synk.fork()  # processes forked, GPUs initialized

# Build simple data-parallel computations (parallel across rows of "x")
x = T.matrix('x')
y = T.matrix('y')
z_avg = T.mean(x.dot(y), axis=0)
z_sum = T.sum(x.dot(y), axis=0)
z_max = T.max(x.dot(y), axis=0)

# Build Synk function. NOTES:
# 1. the bcast_inputs entry "y" has its full value broadcast to all workers
# 2. outputs have different reduce operations (default is "avg")
f = synk.function(inputs=[x],
                  bcast_inputs=[y],
                  outputs=[z_avg, (z_sum, "sum"), (z_max, "max")])
synk.distribute()  # worker GPUs receive all synk functions, prepare to execute

# Generate random data and compute results
x_dat = 0.01 * np.random.randn(1000, 10).astype(theano.config.floatX)
y_dat = np.random.randn(10, 5).astype(theano.config.floatX)

# For comparison, run on only master GPU, as if standard Theano built by:
# f = theano.function(inputs=[x, y], outputs=[z_avg, z_sum, z_max])
r_avg, r_sum, r_max = f.as_theano(x_dat, y_dat)

# Prepare for computation: move data into OS-shared memory (this is one way)
x_dat_synk, y_dat_synk = f.build_inputs(x_dat, y_dat)

# Compute result using multiple GPUs, reduce to master
r_avg_synk, r_sum_synk, r_max_synk = f(x_dat_synk, y_dat_synk)
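The single-GPU and multi-GPU results should agree; a hedged comparison (the "avg" reduction reproduces the full mean exactly only when the rows of x_dat split evenly across workers):

assert np.allclose(r_avg, r_avg_synk, atol=1e-5)
assert np.allclose(r_sum, r_sum_synk, atol=1e-4)
assert np.allclose(r_max, r_max_synk)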
Example No. 6
import time

import theano
import theano.tensor as T
import lasagne
import synkhronos as synk

import updates  # local helper module (not shown; a sketch appears after this example)


def main(model='mlp', batch_size=500, num_epochs=10):

    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
    y_train = y_train.astype("int32")  # (cast avoids a downstream dtype error with uint8 targets)
    y_val = y_val.astype("int32")

    # Fork worker processes and initialize the GPUs before building variables.
    n_gpu = synk.fork()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    network = build_network(model, input_var)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)

    grad_updates, param_updates, grad_shared = updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)
    # updates = lasagne.updates.nesterov_momentum(
    #         loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Make GPU variables to hold the data.
    s_input_train = theano.shared(X_train[:len(X_train) // n_gpu])
    s_target_train = theano.shared(y_train[:len(y_train) // n_gpu])
    s_input_val = theano.shared(X_val[:len(X_val) // n_gpu])
    s_target_val = theano.shared(y_val[:len(y_val) // n_gpu])

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_grad_fn = synk.function(
        inputs=[],
        outputs=loss,
        givens=[(input_var, s_input_train), (target_var, s_target_train)],
        sliceable_shareds=[s_input_train, s_target_train],
        updates=grad_updates)
    train_update_fn = synk.function([], updates=param_updates)
    # train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = synk.function(inputs=[],
                           givens=[(input_var, s_input_val),
                                   (target_var, s_target_val)],
                           sliceable_shareds=[s_input_val, s_target_val],
                           outputs=[test_loss, test_acc])
    # val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Don't bother to put test data on GPU ahead of time.
    test_fn = synk.function([input_var, target_var],
                            outputs=[test_loss, test_acc])

    # After building all functions, give them to workers.
    synk.distribute()

    # Put data into OS shared memory for worker access.
    X_test, y_test = test_fn.build_inputs(X_test, y_test)

    print("Scattering data to GPUs.")
    scatter_vars = [s_input_train, s_target_train, s_input_val, s_target_val]
    scatter_vals = [X_train, y_train, X_val, y_val]
    synk.scatter(scatter_vars, scatter_vals)
    train_worker_len = min(synk.get_lengths(s_target_train))
    worker_batch_size = batch_size // n_gpu

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        # for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
        for batch in iterate_minibatch_indices(train_worker_len,
                                               worker_batch_size,
                                               shuffle=True):
            train_err += train_grad_fn(batch_s=batch)
            synk.all_reduce(grad_shared)  # (averages gradients across GPUs)
            train_update_fn()
            train_batches += 1

        # And a full pass over the validation data:
        # val_err = 0
        # val_acc = 0
        # val_batches = 0
        # for batch in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
        #     inputs, targets = batch
        #     err, acc = val_fn(inputs, targets)
        #     val_err += err
        #     val_acc += acc
        #     val_batches += 1
        val_err, val_acc = val_fn(num_slices=4)

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(float(val_err)))
        print("  validation accuracy:\t\t{:.2f} %".format(
            float(val_acc) * 100))

    # After training, we compute and print the test error:
    # test_err = 0
    # test_acc = 0
    # test_batches = 0
    # for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
    #     inputs, targets = batch
    #     err, acc = val_fn(inputs, targets)
    #     test_err += err
    #     test_acc += acc
    #     test_batches += 1
    test_err, test_acc = test_fn(X_test, y_test, num_slices=4)
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(float(test_err)))
    print("  test accuracy:\t\t{:.2f} %".format(float(test_acc) * 100))