def make_train_function(loss, params, x, y, update_rule, *args, **kwargs):
    """Build a closure that performs one training step per call.

    ``update_rule(loss, params, *args, **kwargs)`` must return the 3-tuple
    ``(grad_updates, param_updates, grad_shared)``.  Two Synkhronos functions
    are compiled from it: one taking ``[x, y]`` that outputs ``loss`` and
    applies ``grad_updates``, and one without inputs that applies
    ``param_updates``.

    Returns:
        ``train_minibatch(x_data, y_data, batch=None)``, which runs the first
        function, all-reduces ``grad_shared`` with ``op="avg"``, runs the
        parameter update, and returns the loss from the first function.
    """
    rule_outputs = update_rule(loss, params, *args, **kwargs)
    grad_updates, param_updates, grad_shared = rule_outputs

    compute_grads = synk.function(
        inputs=[x, y],
        outputs=loss,  # (assumed to be an average)
        updates=grad_updates,
    )
    apply_param_updates = synk.function(inputs=[], updates=param_updates)

    def train_minibatch(x_data, y_data, batch=None):
        # Original note marks the data arguments as "synk_data".
        minibatch_loss = compute_grads(x_data, y_data, batch=batch)
        # Averaging here relies on the loss itself being an average.
        synk.all_reduce(grad_shared, op="avg")
        apply_param_updates()
        return minibatch_loss

    return train_minibatch
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

# Demo: update a shared variable through a Synkhronos function, then apply
# gather / reduce / broadcast to that shared variable.

synk.fork()

s_init = np.ones(3, dtype='float32')
x = T.matrix('x')
s = theano.shared(s_init, name='s')
s_old = s  # second reference to the same shared variable (unused below)
f = synk.function([x], updates={s: T.sum(x * s, axis=0)})
synk.distribute()

# Four rows, each holding its 1-based row number three times.
x_dat = np.array([[row] * 3 for row in (1., 2., 3., 4.)], dtype='float32')

print("\ns initial:\n", s.get_value())

# Run through the plain-Theano path first.
f.as_theano(x_dat)
print("\ns after Theano call:\n", s.get_value())

# Reset the local value, then run through the Synkhronos path.
s.set_value(s_init)
f(x_dat)
print("\nlocal s after reset and Synkhronos call:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s:\n", gathered_s)

synk.reduce(s, op="sum")
print("\nlocal s after in-place reduce:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after reduce:\n", gathered_s)

# Reset again, broadcast the shared variable, and run once more.
s.set_value(s_init)
synk.broadcast(s)
f(x_dat)
import synkhronos as synk
import numpy as np
import theano

# Setup intended to be run interactively (e.g. in IPython).

synk.fork()

s = theano.shared(np.ones((5, 5), dtype='float32'), name="shared_var")
s2 = theano.shared(np.ones((4, 4), dtype='float32'), name="shared_var_2")
f = synk.function([], [s.dot(s), s2.dot(s2)])
synk.distribute()

# Things to try by hand afterwards:
# print(f())
# print(synk.get_value(1, s))
# d = 2 * np.ones([5, 5], dtype='float32')
# synk.set_value(1, s, d)

# Sample arrays holding 0, 1, 2, ... in row-major order.
d55 = np.arange(5 * 5, dtype='float32').reshape(5, 5)
d64 = np.arange(6 * 4, dtype='float32').reshape(6, 4)

# (run interactive in iPython for setup)
def main(model='mlp', num_epochs=500):
    """Train a Lasagne network on the loaded dataset using Synkhronos.

    Args:
        model: ``'mlp'``, ``'cnn'``, or
            ``'custom_mlp:DEPTH,WIDTH,DROP_IN,DROP_HID'``.  Any other value
            prints a message and returns early.
        num_epochs: number of passes over the training data.
    """
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Workers are forked (and the GPU initialized) before any variable is built.
    synk.fork()

    # Symbolic inputs and targets.
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Select the network architecture from the `model` string.
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model == 'cnn':
        network = build_cnn(input_var)
    elif model.startswith('custom_mlp:'):
        spec = model.partition(':')[2]
        depth, width, drop_in, drop_hid = spec.split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    else:
        print("Unrecognized model type %r." % model)
        return

    # Training objective: mean categorical cross-entropy.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(
        prediction, target_var).mean()

    # SGD with Nesterov momentum over all trainable parameters.
    params = lasagne.layers.get_all_params(network, trainable=True)
    train_updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Evaluation objective: same loss, but with a deterministic forward pass
    # (dropout disabled), plus classification accuracy.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var).mean()
    correct = T.eq(T.argmax(test_prediction, axis=1), target_var)
    test_acc = T.mean(correct, dtype=theano.config.floatX)

    # Synkhronos counterparts of the usual theano.function calls.
    train_fn = synk.function([input_var, target_var], loss,
                             updates=train_updates)
    val_fn = synk.function([input_var, target_var], [test_loss, test_acc])

    # Send all functions and variables to the workers.
    synk.distribute()

    # Write the data into input shared memory; the same inputs serve val_fn
    # because it was built from the same variables.
    X_train_synk, y_train_synk = train_fn.build_inputs(X_train, y_train)
    X_val_synk, y_val_synk = train_fn.build_inputs(X_val, y_val)
    X_test_synk, y_test_synk = train_fn.build_inputs(X_test, y_test)

    print("Starting training...")
    for epoch in range(num_epochs):
        # Full pass over the training data.
        train_loss_sum = 0
        n_train = 0
        start_time = time.time()
        train_batches = iterate_minibatch_indices(len(y_train), 500,
                                                  shuffle=True)
        for batch in train_batches:
            train_loss_sum += train_fn(X_train_synk, y_train_synk, batch=batch)
            synk.all_reduce(params)
            n_train += 1
        mid_time = time.time()

        # Full pass over the validation data.
        val_loss_sum = 0
        val_acc_sum = 0
        n_val = 0
        val_batches = iterate_minibatch_indices(len(y_val), 500,
                                                shuffle=False)
        for batch in val_batches:
            err, acc = val_fn(X_val_synk, y_val_synk, batch=batch,
                              num_slices=1)
            val_loss_sum += err
            val_acc_sum += acc
            n_val += 1
        end_time = time.time()

        train_fn_time = mid_time - start_time
        val_fn_time = end_time - mid_time

        # Per-epoch report.
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("Train function time: {:.3f}s".format(train_fn_time))
        print("Validation function time: {:.3f}s".format(val_fn_time))
        print(" training loss:\t\t{:.6f}".format(train_loss_sum / n_train))
        print(" validation loss:\t\t{:.6f}".format(val_loss_sum / n_val))
        print(" validation accuracy:\t\t{:.2f} %".format(
            val_acc_sum / n_val * 100))

    # Final evaluation on the test set.
    test_loss_sum = 0
    test_acc_sum = 0
    n_test = 0
    for batch in iterate_minibatch_indices(len(y_test), 500, shuffle=False):
        err, acc = val_fn(X_test_synk, y_test_synk, batch=batch)
        test_loss_sum += err
        test_acc_sum += acc
        n_test += 1
    print("Final results:")
    print(" test loss:\t\t\t{:.6f}".format(test_loss_sum / n_test))
    print(" test accuracy:\t\t{:.2f} %".format(test_acc_sum / n_test * 100))

    # Dump the network weights to a file ...
    np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    # ... and load them back into the network.
    # NOTE(review): the weights are reloaded right after being saved --
    # confirm this round trip is intended rather than illustrative.
    with np.load('model.npz') as archive:
        param_values = [archive['arr_%d' % i]
                        for i in range(len(archive.files))]
    lasagne.layers.set_all_param_values(network, param_values)
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

n_gpus = synk.fork()

# Data-parallel computation over Theano shared variables (which live on GPU).
dtype = theano.config.floatX
s_x = theano.shared(np.ones((100, 4), dtype=dtype), name='s_x')
s_y = theano.shared(np.zeros((4, 5), dtype=dtype), name='s_y')
s_unused = theano.shared(np.zeros((5, 5), dtype=dtype))  # (see note at bottom)

z = T.mean(s_x.dot(s_y), axis=0)
f = synk.function(inputs=[], sliceable_shareds=[s_x], outputs=z)

# Shared-variable data goes to the workers along with the function.
synk.distribute()

# Each GPU holds its own copy of a shared variable; inspect them.
print("\nLengths of s_x on each GPU: ", synk.get_lengths(s_x))
print("Shapes of s_x on each GPU: ", synk.get_shapes(s_x))

x_dat = np.random.randn(8 * n_gpus, 4).astype(dtype)
y_dat = np.random.randn(4, 5).astype(dtype)

# Change shared-variable values across all GPUs.
synk.scatter(s_x, x_dat)
# (without a data argument, broadcast operates on the existing variable data)
synk.broadcast(s_y, y_dat)

print("\nData scattered to s_x and broadcast to s_y...")
print("\nShapes of s_x on each GPU: ", synk.get_shapes(s_x))
# Choose the GPU setup: Synkhronos workers, or plain Theano on "cuda".
if RUN_BOTH:
    import synkhronos as synk
    synk.fork()
else:
    import theano.gpuarray
    theano.gpuarray.use("cuda")

# Symbolic variables and the shared state used by the update functions.
x = T.matrix('x')
y = T.matrix('y')
v = T.vector('v')
s = theano.shared(np.ones((1, 5), dtype='float32'), name='s')

z = T.sum(x.dot(y), axis=0)

# Synkhronos versions exist only when both implementations are compared.
if RUN_BOTH:
    f_synk = synk.function([x, y], z, broadcast_inputs=[y])
    g_synk = synk.function([v], updates={s: s + v}, broadcast_inputs=[v])
    synk.distribute()

# Plain Theano versions are always built.
f_theano = theano.function([x, y], z)
g_theano = theano.function([v], updates={s: s + v})

# Constant test data; x is additionally split into a 400-row and a 600-row part.
x_dat = 0.01 * np.ones((1000, 10), dtype='float32')
x_dat1 = x_dat[:400]
x_dat2 = x_dat[400:]
y_dat = np.ones((10, 5), dtype='float32')

r_theano = f_theano(x_dat, y_dat)
print("result of f_theano: ", r_theano)

r_t_1 = f_theano(x_dat1, y_dat)
r_t_2 = f_theano(x_dat2, y_dat)
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

# Check that a Synkhronos function matches the equivalent Theano function.

synk.fork()

x = T.matrix('x')
y = theano.shared(np.random.randn(10, 20).astype('float32'))
z = T.mean(x.dot(y), axis=0)

f_th = theano.function([x], z)  # reference implementation, for comparison only
f = synk.function([x], z)
synk.distribute()

x_dat = np.random.randn(100, 10).astype('float32')

# Evaluate three ways: plain Theano, Synkhronos, and Synkhronos' as_theano path.
r_th = f_th(x_dat)
r = f(x_dat)
r_as_th = f.as_theano(x_dat)

assert np.allclose(r, r_th)
assert np.allclose(r_as_th, r_th)
print("All assertions passed.")
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

# Check a Synkhronos function with one regular input and one bcast input
# against the equivalent Theano function.

synk.fork()

x = T.matrix('x')
y = T.vector('y')
z = T.mean(x.dot(y), axis=0)

f_th = theano.function(inputs=[x, y], outputs=z)
f = synk.function(inputs=[x], bcast_inputs=[y], outputs=z)
synk.distribute()

x_dat = np.random.randn(100, 10).astype('float32')
y_dat = np.random.randn(10).astype('float32')

# Wrap the arrays as Synkhronos data objects for the synk call.
x_synk = synk.data(x_dat)
y_synk = synk.data(y_dat)

r_th = f_th(x_dat, y_dat)
r = f(x_synk, y_synk)

assert np.allclose(r, r_th)
print("All assertions passed.")
import theano
import theano.tensor as T
import numpy as np
import synkhronos as synk

n_gpu = synk.fork()

# x = T.matrix('x')
floatX = theano.config.floatX
x_dat = np.random.randn(100, 10).astype(floatX)
y_dat = np.random.randn(10, 5).astype(floatX)

# Both operands are shared variables; only x is declared sliceable.
x = theano.shared(x_dat, 'x_gpu')
y = theano.shared(y_dat, 'y_gpu')
z = T.mean(x.dot(y), axis=0)

f = synk.function(inputs=[], outputs=z, sliceable_shareds=[x])
synk.distribute()

# Scatter a data set of 100 rows per GPU into x, then evaluate.
full_x_dat = np.random.randn(n_gpu * 100, 10).astype(floatX)
synk.scatter(x, full_x_dat)

r = f()
def _time_calls(label, func, arg_sets, iters):
    """Time `iters` rounds of calling `func` on each tuple in `arg_sets`.

    Prints `label` followed by the total elapsed time reported by `timer`.
    """
    t0 = timer()
    for _ in range(iters):
        for call_args in arg_sets:
            func(*call_args)
    print(label, timer() - t0)


def main():
    """Time loss and gradient evaluation through Theano and Synkhronos.

    Builds an MLP loss and its flattened gradient as Synkhronos functions,
    then times `ITERS` evaluations of each in three ways:

    * `as_theano`, called on the two halves of the batch in turn;
    * the Synkhronos call on slices of the functions' input shared memory;
    * the Synkhronos call on the freshly made input arrays.
    """
    B_SIZE = 10000
    MID = B_SIZE // 2
    ITERS = 10

    synk.fork()
    import lasagne  # kept after the fork, as in the original statement order

    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    network = build_mlp(input_var)
    # network = build_cnn(input_var)

    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(network, trainable=True)
    grads = theano.grad(loss, wrt=params)
    flat_grad = T.concatenate(list(map(T.flatten, grads)))

    f_loss = synk.function([input_var, target_var], loss,
                           collect_modes=[None], reduce_ops="sum")
    f_grad = synk.function([input_var, target_var], flat_grad,
                           collect_modes=[None])

    synk.distribute()

    x_data, y_data = make_data([1, 28, 28], B_SIZE)

    # One untimed call of each function; the results are not used.
    # NOTE(review): presumably a warm-up that also fills the input shared
    # memory read below -- confirm.
    f_loss(x_data, y_data)
    f_grad(x_data, y_data)

    # Slices of the functions' input shared memory covering the batch.
    x_shmem, y_shmem = f_loss.get_input_shmems()
    shmem_batch = (x_shmem[:B_SIZE], y_shmem[:B_SIZE])

    # The two halves of the batch, evaluated one after the other via Theano.
    halves = [(x_data[:MID], y_data[:MID]), (x_data[MID:], y_data[MID:])]
    full_batch = (x_data, y_data)

    _time_calls("theano loss_time: ", f_loss.as_theano, halves, ITERS)
    _time_calls("theano grad_time: ", f_grad.as_theano, halves, ITERS)

    _time_calls("synk shmem loss_time: ", f_loss, [shmem_batch], ITERS)
    _time_calls("synk shmem grad_time: ", f_grad, [shmem_batch], ITERS)

    _time_calls("synk new input loss_time: ", f_loss, [full_batch], ITERS)
    _time_calls("synk new input grad_time: ", f_grad, [full_batch], ITERS)
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

n_gpus = synk.fork()

DAT = 200  # (data length on each GPU)
floatX = theano.config.floatX

# Build simple data-parallel computations with shared variables.
s_x = theano.shared(np.empty((DAT, 10), dtype=floatX))
s_y = theano.shared(np.empty((10, 5), dtype=floatX))
z = T.sum(s_x.dot(s_y), axis=0)
f = synk.function(inputs=[], outputs=(z, "sum"), sliceable_shareds=[s_x])
synk.distribute()

# Fill the shared variables: x is scattered, y is broadcast.
x_dat = 0.01 * np.random.randn(DAT * n_gpus, 10).astype(floatX)
y_dat = np.random.randn(10, 5).astype(floatX)
synk.scatter(s_x, x_dat)
synk.broadcast(s_y, y_dat)

# An assortment of data subsets to compute on.  A single slice or a single
# list is applied within each GPU (AFTER the data is scattered); alternatively
# a list of slices or lists can be given, one entry per GPU.
slice_1 = slice(100, 200)
list_2 = np.random.randint(low=0, high=DAT, size=100)
slices_3 = [slice(i, i + 100) for i in range(n_gpus)]
import synkhronos as synk

synk.fork()  # processes forked, GPUs initialized

# Simple data-parallel computations (parallel across the rows of "x").
x = T.matrix('x')
y = T.matrix('y')
z_avg = T.mean(x.dot(y), axis=0)
z_sum = T.sum(x.dot(y), axis=0)
z_max = T.max(x.dot(y), axis=0)

# Build the Synk function.  NOTES:
# 1. bcast_input "y" has its full value broadcast to all workers
# 2. each output can carry its own reduce operation (default is "avg")
f = synk.function(
    inputs=[x],
    bcast_inputs=[y],
    outputs=[z_avg, (z_sum, "sum"), (z_max, "max")],
)

synk.distribute()  # worker GPUs receive all synk functions, prepare to execute

# Random data to compute on.
x_dat = 0.01 * np.random.randn(1000, 10).astype(theano.config.floatX)
y_dat = np.random.randn(10, 5).astype(theano.config.floatX)

# For comparison, run on the master GPU only, as if built with standard Theano:
# f = theano.function(inputs=[x, y], outputs=[z_avg, z_sum, z_max])
r_avg, r_sum, r_max = f.as_theano(x_dat, y_dat)

# Prepare for computation: move data into OS-shared memory (this is one way).
x_dat_synk, y_dat_synk = f.build_inputs(x_dat, y_dat)

# Compute result using multiple GPUs, reduce to master
def main(model='mlp', batch_size=500, num_epochs=10):
    """Train a Lasagne network with training/validation data held on the GPUs.

    Training and validation data are scattered into Theano shared variables
    ahead of time; test data is passed as ordinary function inputs.  Each
    training step runs the gradient function on an index batch, all-reduces
    the gradient shared variables, then runs the parameter update function.

    Args:
        model: model specification handed to ``build_network``.
        batch_size: overall minibatch size; each GPU gets
            ``batch_size // n_gpu`` indices per step.
        num_epochs: number of passes over the training data.
    """
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
    # (some downstream type error on uint8)
    y_train = y_train.astype("int32")
    y_val = y_val.astype("int32")

    # Fork worker processes and initialize the GPU before building variables.
    n_gpu = synk.fork()

    # Symbolic inputs, targets and the network.
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    network = build_network(model, input_var)

    # Training objective: mean categorical cross-entropy.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(
        prediction, target_var).mean()

    # Nesterov momentum split into a gradient step and a parameter step, plus
    # the shared variables that carry the gradients between the two.
    params = lasagne.layers.get_all_params(network, trainable=True)
    grad_updates, param_updates, grad_shared = updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Evaluation objective: deterministic forward pass (dropout disabled),
    # plus classification accuracy.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var).mean()
    correct = T.eq(T.argmax(test_prediction, axis=1), target_var)
    test_acc = T.mean(correct, dtype=theano.config.floatX)

    # GPU variables to hold the data, initialized with the first 1/n_gpu of
    # each array.
    s_input_train = theano.shared(X_train[:len(X_train) // n_gpu])
    s_target_train = theano.shared(y_train[:len(y_train) // n_gpu])
    s_input_val = theano.shared(X_val[:len(X_val) // n_gpu])
    s_target_val = theano.shared(y_val[:len(y_val) // n_gpu])

    # Gradient step: reads its data from the training shared variables.
    train_grad_fn = synk.function(
        inputs=[],
        outputs=loss,
        givens=[(input_var, s_input_train), (target_var, s_target_train)],
        sliceable_shareds=[s_input_train, s_target_train],
        updates=grad_updates,
    )
    # Parameter step.
    train_update_fn = synk.function([], updates=param_updates)

    # Validation loss and accuracy, read from the validation shared variables.
    val_fn = synk.function(
        inputs=[],
        givens=[(input_var, s_input_val), (target_var, s_target_val)],
        sliceable_shareds=[s_input_val, s_target_val],
        outputs=[test_loss, test_acc],
    )

    # Test data is not put on the GPU ahead of time; it is passed as inputs.
    test_fn = synk.function([input_var, target_var],
                            outputs=[test_loss, test_acc])

    # After building all functions, give them to the workers.
    synk.distribute()

    # Put the test data into OS shared memory for worker access.
    X_test_synk, y_test_synk = test_fn.build_inputs(X_test, y_test)

    print("Scattering data to GPUs.")
    scatter_vars = [s_input_train, s_target_train, s_input_val, s_target_val]
    scatter_vals = [X_train, y_train, X_val, y_val]
    synk.scatter(scatter_vars, scatter_vals)

    # Batch indices are drawn per worker, over the shortest worker's data.
    train_worker_len = min(synk.get_lengths(s_target_train))
    worker_batch_size = batch_size // n_gpu

    print("Starting training...")
    for epoch in range(num_epochs):
        # Full pass over the training data.
        train_loss_sum = 0
        n_train = 0
        start_time = time.time()
        index_batches = iterate_minibatch_indices(
            train_worker_len, worker_batch_size, shuffle=True)
        for batch in index_batches:
            train_loss_sum += train_grad_fn(batch_s=batch)
            synk.all_reduce(grad_shared)  # (averages)
            train_update_fn()
            n_train += 1

        # Full pass over the validation data in a single call.
        val_err, val_acc = val_fn(num_slices=4)

        # Per-epoch report.
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(train_loss_sum / n_train))
        print(" validation loss:\t\t{:.6f}".format(float(val_err)))
        print(" validation accuracy:\t\t{:.2f} %".format(
            float(val_acc) * 100))

    # Final evaluation on the test set in a single call.
    final_err, final_acc = test_fn(X_test_synk, y_test_synk, num_slices=4)
    print("Final results:")
    print(" test loss:\t\t\t{:.6f}".format(float(final_err)))
    print(" test accuracy:\t\t{:.2f} %".format(float(final_acc) * 100))
import theano
import synkhronos as synk
import numpy as np

# Demo: a function whose only output is a sliceable shared variable collected
# with "gather", called with various num_slices / batch_s settings.

synk.fork()

s = theano.shared(np.zeros((100, 2), dtype='float32'), name='shared_var')
# s = theano.shared(np.array(list(range(100 * 2)), dtype='float32').reshape(100, 2))
f = synk.function([], outputs=(s, "gather"), sliceable_shareds=[s])
synk.distribute()

# Row i of d holds the value i in both columns.
d = np.ones((200, 2), dtype='float32')
d *= np.arange(len(d), dtype='float32')[:, np.newaxis]

sd = synk.data(value=d)
synk.scatter(s, sd)

print(f())
print("\n")

print(f(num_slices=3))
print("\n")

print(f(batch_s=[0, 1, 2, 3, 4, 5]))
print("\n")

print(f(batch_s=[0, 1, 2, 3, 4, 5], num_slices=2))
print("\n")

# Unordered indices, including a repeated one.
print(f(batch_s=[49, 23, 1, 7, 23]))
print("\n")
""" import numpy as np import theano import theano.tensor as T import synkhronos as synk synk.fork() # Build simple data-parallel computations (parallel across rows of "x") x = T.matrix('x') y = T.matrix('y') w = T.matrix('w') z = T.sum((x + w).dot(y), axis=0) f = synk.function(inputs=[x, w], bcast_inputs=[y], outputs=(z, "sum")) synk.distribute() x_dat = 0.01 * np.random.randn(1000, 10).astype(theano.config.floatX) y_dat = np.random.randn(10, 5).astype(theano.config.floatX) w_dat = 0.01 * np.random.randn(100, 10).astype(theano.config.floatX) # Build assortment of subsets of the data to compute on. # (Can be int, slice, or list (e.g. list for random shuffle)) max_idx_0 = 100 slice_1 = slice(100, 200) # must specify start and stop (for now) list_2 = np.random.randint(low=0, high=999, size=100) r_theano_0 = f.as_theano(x_dat[:max_idx_0], w_dat, y_dat) r_theano_1 = f.as_theano(x_dat[slice_1], w_dat, y_dat) r_theano_2 = f.as_theano(x_dat[list_2], w_dat, y_dat)
import numpy as np
import theano
import theano.tensor as T
import synkhronos as synk

# Demo: update a shared variable through a Synkhronos function fed with a
# synk data object, then gather / reduce / broadcast that shared variable.

synk.fork(2)

s_init = np.ones(2, dtype='float32')
x = T.matrix('x')
s = theano.shared(s_init, name='s')
f = synk.function([x], updates=[(s, T.sum(x * s, axis=0))])
synk.distribute()

# Four rows, each holding its 1-based row number twice.
x_rows = np.array([[row] * 2 for row in (1, 2, 3, 4)]).astype('float32')
x_dat = synk.data(x_rows)

print("\ns initial:\n", s.get_value())

# The plain-Theano path takes the array held in the data object's `.data`.
f.as_theano(x_dat.data)
print("\ns after Theano call:\n", s.get_value())

# Reset the local value, then run through the Synkhronos path.
s.set_value(s_init)
f(x_dat)
print("\nlocal s after reset and Synkhronos call:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s:\n", gathered_s)

synk.reduce(s, op="sum")
print("\nlocal s after in-place reduce:\n", s.get_value())

gathered_s = synk.gather(s, nd_up=1)
print("\ngathered s after reduce:\n", gathered_s)

# Broadcast the initial value to s.
synk.broadcast(s, s_init)