Training loop using Multiverso shared variables: each worker trains only the mini-batches assigned to it, synchronizes the shared variables after every batch, and the master worker computes the accuracy at the end of each epoch.

```python
# MULTIVERSO: all the workers will synchronize at the place you call barrier
mv.barrier()

# train model
batch_size = 50
for i in range(50):
    for start in range(0, len(x_train), batch_size):
        # every process only trains the batches assigned to itself
        if start // batch_size % workers_num != worker_id:
            continue
        x_batch = x_train[start:start + batch_size]
        t_batch = t_train[start:start + batch_size]
        cost = train(x_batch, t_batch)
        # MULTIVERSO: sync values with multiverso after every batch
        sharedvar.sync_all_mv_shared_vars()

    # MULTIVERSO: all the workers will synchronize at the place you call barrier
    mv.barrier()  # barrier every epoch

    # the master worker will calculate the accuracy
    if mv.is_master_worker():
        predictions_test = predict(x_test)
        accuracy = np.mean(predictions_test == labels_test)
        print("epoch %d - accuracy: %.4f" % (i + 1, accuracy))

# MULTIVERSO: You must call shutdown at the end of the file
mv.shutdown()
```
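This loop assumes that Multiverso has already been initialized and that the model's parameters were created as Multiverso-backed shared variables, so that `sharedvar.sync_all_mv_shared_vars()` has something to synchronize. Below is a minimal sketch of that setup; the parameter names `W` and `b` are illustrative, and it assumes `sharedvar.mv_shared` mirrors the signature of `theano.shared` and that the binding is imported as `multiverso as mv`, matching how the examples reference the API.

```python
import numpy as np
import multiverso as mv
from multiverso.theano_ext import sharedvar

# MULTIVERSO: initialize before any other multiverso call
mv.init()
worker_id = mv.worker_id()      # distinct id for every worker process
workers_num = mv.workers_num()  # total number of worker processes

# Hypothetical model parameters: wrapping them with sharedvar.mv_shared
# (instead of theano.shared) is what lets sync_all_mv_shared_vars()
# push their deltas to, and pull fresh values from, the parameter server.
W = sharedvar.mv_shared(value=np.zeros((784, 10), dtype='float32'), name='W')
b = sharedvar.mv_shared(value=np.zeros((10,), dtype='float32'), name='b')
```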
CIFAR-10 deep residual network example with Lasagne: the network parameters are synchronized through an `MVNetParamManager`, each worker trains on its own slice of the data, and only the master worker runs validation and saves the model.

```python
def main(batch_size=128, lr=0.1, sync=False, n=5, num_epochs=82, model=None):
    # Check if the CIFAR-10 data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print("CIFAR-10 dataset can not be found. Please download the dataset "
              "from 'https://www.cs.toronto.edu/~kriz/cifar.html'.")
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var, n)
    print("number of parameters in model: %d"
          % lasagne.layers.count_params(network, trainable=True))

    # MULTIVERSO: MVNetParamManager is a parameter manager which can
    # synchronize the parameters of a Lasagne network with multiverso.
    mvnpm = param_manager.MVNetParamManager(network)

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training:
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(
            loss, params, learning_rate=sh_lr, momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    if model is None:
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # divide the data among the different worker processes
            examples_per_worker = X_train.shape[0] // workers_num
            start_index = worker_id * examples_per_worker
            train_indices = np.arange(start_index,
                                      start_index + examples_per_worker)
            # shuffle training data
            np.random.shuffle(train_indices)
            rand_X_train = X_train[train_indices, :, :, :]
            rand_Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(rand_X_train, rand_Y_train,
                                             batch_size, shuffle=True,
                                             augment=True):
                train_batches += 1
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                # MULTIVERSO: when you want to commit all the deltas of the
                # parameters managed by MVNetParamManager and fetch the latest
                # parameters from the parameter server, call this function to
                # synchronize the values
                mvnpm.sync_all_param()

            # And a full pass over the validation data:
            # MULTIVERSO: all the workers will synchronize at the place you call barrier
            mv.barrier()
            if mv.is_master_worker():
                val_err = 0
                val_acc = 0
                val_batches = 0
                for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
                    inputs, targets = batch
                    err, acc = val_fn(inputs, targets)
                    val_err += err
                    val_acc += acc
                    val_batches += 1

                # Then we print the results for this epoch:
                print("Epoch {} of {} took {:.3f}s".format(
                    epoch + 1, num_epochs, time.time() - start_time))
                print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
                print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
                print("  validation accuracy:\t\t{:.2f} %".format(
                    val_acc / val_batches * 100))

            # adjust learning rate as in the paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                # TODO: because ASGD and multiple GPUs are used, the learning
                # rate schedule should be reconsidered
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()

        if mv.is_master_worker():
            # MULTIVERSO: update the parameters before saving the model
            mvnpm.sync_all_param()
            # dump the network weights to a file:
            np.savez('cifar10_deep_residual_model.npz',
                     *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from the model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    if mv.is_master_worker():
        # Calculate the test error of the model:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(
            test_acc / test_batches * 100))

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
```
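The example relies on an `iterate_minibatches` helper that is not reproduced here. Below is a minimal sketch of what such a helper might look like, assuming the inputs are NumPy arrays in `(N, 3, 32, 32)` layout and using a simple pad-crop-and-flip augmentation; the 4-pixel padding and the augmentation details are assumptions for illustration, not the original implementation.

```python
import numpy as np

def iterate_minibatches(inputs, targets, batchsize, shuffle=False, augment=False):
    """Yield (inputs, targets) mini-batches, optionally shuffled and augmented."""
    assert len(inputs) == len(targets)
    indices = np.arange(len(inputs))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(inputs) - batchsize + 1, batchsize):
        excerpt = indices[start:start + batchsize]
        batch = inputs[excerpt]
        if augment:
            # pad each 32x32 image by 4 pixels, then take a random 32x32 crop
            # and a random horizontal flip (a common CIFAR-10 augmentation)
            padded = np.pad(batch, ((0, 0), (0, 0), (4, 4), (4, 4)),
                            mode='constant')
            cropped = np.empty_like(batch)
            for i in range(len(batch)):
                ox, oy = np.random.randint(0, 9, size=2)  # crop offsets in [0, 8]
                img = padded[i, :, oy:oy + 32, ox:ox + 32]
                if np.random.rand() < 0.5:
                    img = img[:, :, ::-1]  # horizontal flip
                cropped[i] = img
            batch = cropped
        yield batch, targets[excerpt]
```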
MNIST logistic regression example (based on the Theano tutorial): mini-batches are distributed round-robin across workers, the Multiverso shared variables are synchronized after each local update, and only the master worker reports results and saves the model.

```python
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before calling other multiverso APIs
    mv.init()
    # MULTIVERSO: every process has a distinct worker id
    worker_id = mv.worker_id()
    # MULTIVERSO: mv.workers_num will return the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a minibatch)
    x = T.matrix('x')   # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')

    validation_frequency = n_train_batches

    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: we distribute the batches to different workers.
            # A worker will only train the batches that belong to itself
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: when you want to commit all the deltas of the
                # parameters produced by mv_shared and fetch the latest
                # parameters from the parameter server, call this function
                # to synchronize the values
                sharedvar.sync_all_mv_shared_vars()

            iter = (epoch - 1) * n_train_batches + minibatch_index

            # MULTIVERSO: only the master worker will evaluate the model
            if mv.is_master_worker() and (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on the validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch,
                       minibatch_index + 1,
                       n_train_batches,
                       validation_loss * 100.))

    # MULTIVERSO: all the workers will synchronize at the place you call barrier
    mv.barrier()

    # MULTIVERSO: You should make sure only one process will output the result.
    # Otherwise the results will be output repeatedly
    if mv.is_master_worker():
        end_time = timeit.default_timer()

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)

        print(('Optimization complete with validation score of %f %%, '
               'with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code ran for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % (end_time - start_time)), file=sys.stderr)

        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
```
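The two training scripts above split work across workers in different ways: the Lasagne example gives each worker a contiguous slice of the training set per epoch, while the logistic-regression example assigns mini-batches round-robin by index. The short, self-contained illustration below contrasts the two schemes; the sizes (`workers_num`, `n_examples`, `n_train_batches`) are made-up values for demonstration only.

```python
import numpy as np

workers_num = 4
n_examples = 50000       # e.g. CIFAR-10 training-set size
n_train_batches = 100    # e.g. number of MNIST mini-batches per epoch

for worker_id in range(workers_num):
    # Scheme 1 (Lasagne example): a contiguous block of examples per worker
    examples_per_worker = n_examples // workers_num
    start_index = worker_id * examples_per_worker
    block = np.arange(start_index, start_index + examples_per_worker)

    # Scheme 2 (logistic-regression example): round-robin over batch indices
    my_batches = [i for i in range(n_train_batches)
                  if i % workers_num == worker_id]

    print("worker %d: examples %d..%d, %d batches (first few: %s)"
          % (worker_id, block[0], block[-1], len(my_batches), my_batches[:3]))
```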