Example #1
# MULTIVERSO: all the workers will synchronize at the place you call barrier
mv.barrier()

# train model
batch_size = 50

for i in range(50):
    for start in range(0, len(x_train), batch_size):
        # each process trains only the batches assigned to it
        if start // batch_size % workers_num != worker_id:
            continue
        x_batch = x_train[start:start + batch_size]
        t_batch = t_train[start:start + batch_size]
        cost = train(x_batch, t_batch)

        # MULTIVERSO: sync value with multiverso after every batch
        sharedvar.sync_all_mv_shared_vars()

    # MULTIVERSO: all the workers will synchronize at the place you call barrier
    mv.barrier()  # barrier every epoch

    # the master worker computes the accuracy
    if mv.is_master_worker():
        predictions_test = predict(x_test)
        accuracy = np.mean(predictions_test == labels_test)

        print "epoch %d - accuracy: %.4f" % (i + 1, accuracy)

# MULTIVERSO: You must call shutdown at the end of the file
mv.shutdown()
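
The snippet above assumes that Multiverso has already been initialized and that `train` and `predict` are Theano functions whose parameters are backed by Multiverso shared variables. A minimal sketch of that setup, following the initialization pattern shown in Example #7; the `build_model` helper is hypothetical, and the import path of `sharedvar` is assumed to match the Multiverso Theano binding used in these examples:

import multiverso as mv
from multiverso.theano_ext import sharedvar

# MULTIVERSO: initialize before any other Multiverso API is used
mv.init()
worker_id = mv.worker_id()      # distinct id for this process
workers_num = mv.workers_num()  # total number of workers

# hypothetical helper: compiles the `train` and `predict` Theano functions,
# creating the model parameters through the sharedvar module so that
# sharedvar.sync_all_mv_shared_vars() can synchronize them
train, predict = build_model()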
Example #2
def tearDownModule():
    mv.shutdown()
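
The `tearDownModule` above pairs with a `setUpModule` that does the one-time initialization. A minimal sketch, assuming `mv.init()` is the only setup these tests need:

import multiverso as mv

def setUpModule():
    # MULTIVERSO: initialize once before any test in this module runs
    mv.init()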
Example #3
def main(batch_size=128, lr=0.1, sync=False, n=5, num_epochs=82, model=None):
    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print(
            "CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'."
        )
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var, n)
    print("number of parameters in model: %d" %
          lasagne.layers.count_params(network, trainable=True))

    # MULTIVERSO: LasagneParamManager is a parameter manager that synchronizes
    # the parameters of the Lasagne network with Multiverso.
    lpm = param_manager.LasagneParamManager(network)

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(
            prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(loss,
                                           params,
                                           learning_rate=sh_lr,
                                           momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var],
                                   loss,
                                   updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    if model is None:
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # divide the data across the different processes
            examples_per_worker = X_train.shape[0] // workers_num
            start_index = worker_id * examples_per_worker
            train_indices = np.arange(start_index,
                                      start_index + examples_per_worker)
            # shuffle training data
            np.random.shuffle(train_indices)
            rand_X_train = X_train[train_indices, :, :, :]
            rand_Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(rand_X_train,
                                             rand_Y_train,
                                             batch_size,
                                             shuffle=True,
                                             augment=True):
                train_batches += 1
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                # MULTIVERSO: call this function to commit all the parameter
                # deltas managed by LasagneParamManager and to pull the latest
                # parameters from the parameter server
                lpm.sync_all_param()

            # And a full pass over the validation data:
            # MULTIVERSO: all the workers will synchronize at the place you call barrier
            mv.barrier()
            if mv.is_master_worker():
                val_err = 0
                val_acc = 0
                val_batches = 0
                for batch in iterate_minibatches(X_test,
                                                 Y_test,
                                                 500,
                                                 shuffle=False):
                    inputs, targets = batch
                    err, acc = val_fn(inputs, targets)
                    val_err += err
                    val_acc += acc
                    val_batches += 1

                # Then we print the results for this epoch:
                print("Epoch {} of {} took {:.3f}s".format(
                    epoch + 1, num_epochs,
                    time.time() - start_time))
                print("  training loss:\t\t{:.6f}".format(train_err /
                                                          train_batches))
                print("  validation loss:\t\t{:.6f}".format(val_err /
                                                            val_batches))
                print("  validation accuracy:\t\t{:.2f} %".format(
                    val_acc / val_batches * 100))

            # adjust learning rate as in paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                # TODO: since ASGD and multiple GPUs are used, the learning-rate
                # schedule should be reconsidered
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()
        if mv.is_master_worker():
            # MULTIVERSO: synchronize the parameters before saving the model
            lpm.sync_all_param()
            # dump the network weights to a file:
            np.savez('cifar10_deep_residual_model.npz',
                     *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    if mv.is_master_worker():
        # Calculate validation error of model:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches *
                                                    100))

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
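
For reference, the per-worker sharding arithmetic used in the epoch loop above can be isolated into a small helper. This is only an illustration of the partitioning logic, not part of the original example: each worker takes an equal-sized contiguous slice of the training set and shuffles only within that slice.

import numpy as np

def worker_shard(num_examples, workers_num, worker_id, rng=np.random):
    # equal-sized contiguous slice per worker (any remainder is dropped)
    examples_per_worker = num_examples // workers_num
    start_index = worker_id * examples_per_worker
    indices = np.arange(start_index, start_index + examples_per_worker)
    rng.shuffle(indices)  # shuffle only within this worker's slice
    return indices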
Example #4
def main(batch_size=128, lr=0.1, sync=False, n=5, num_epochs=82, model=None):
    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print("CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'.")
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var, n)
    print("number of parameters in model: %d" % lasagne.layers.count_params(network, trainable=True))

    # MULTIVERSO: MVNetParamManager is a parameter manager that synchronizes
    # the parameters of the Lasagne network with Multiverso.
    mvnpm = param_manager.MVNetParamManager(network)

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(
                loss, params, learning_rate=sh_lr, momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    if model is None:
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # divide the data across the different processes
            examples_per_worker = X_train.shape[0] // workers_num
            start_index = worker_id * examples_per_worker
            train_indices = np.arange(start_index, start_index + examples_per_worker)
            # shuffle training data
            np.random.shuffle(train_indices)
            rand_X_train = X_train[train_indices, :, :, :]
            rand_Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(rand_X_train, rand_Y_train, batch_size, shuffle=True, augment=True):
                train_batches += 1
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                # MULTIVERSO: call this function to commit all the parameter
                # deltas managed by MVNetParamManager and to pull the latest
                # parameters from the parameter server
                mvnpm.sync_all_param()

            # And a full pass over the validation data:
            # MULTIVERSO: all the workers will synchronize at the place you call barrier
            mv.barrier()
            if mv.is_master_worker():
                val_err = 0
                val_acc = 0
                val_batches = 0
                for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
                    inputs, targets = batch
                    err, acc = val_fn(inputs, targets)
                    val_err += err
                    val_acc += acc
                    val_batches += 1

                # Then we print the results for this epoch:
                print("Epoch {} of {} took {:.3f}s".format(
                    epoch + 1, num_epochs, time.time() - start_time))
                print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
                print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
                print("  validation accuracy:\t\t{:.2f} %".format(
                    val_acc / val_batches * 100))

            # adjust learning rate as in paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                # TODO: since ASGD and multiple GPUs are used, the learning-rate
                # schedule should be reconsidered
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()
        if mv.is_master_worker():
            # MULTIVERSO: synchronize the parameters before saving the model
            mvnpm.sync_all_param()
            # dump the network weights to a file:
            np.savez('cifar10_deep_residual_model.npz', *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    if mv.is_master_worker():
        # Calculate validation error of model:
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(
            test_acc / test_batches * 100))

    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
Example #5
mv.barrier()


# train model
batch_size = 50

for i in range(50):
    for start in range(0, len(x_train), batch_size):
        # each process trains only the batches assigned to it
        if start // batch_size % workers_num != worker_id:
            continue
        x_batch = x_train[start:start + batch_size]
        t_batch = t_train[start:start + batch_size]
        cost = train(x_batch, t_batch)

        # MULTIVERSO: sync value with multiverso after every batch
        sharedvar.sync_all_mv_shared_vars()

    # MULTIVERSO: all the workers will synchronize at the place you call barrier
    mv.barrier()  # barrier every epoch

    # the master worker computes the accuracy
    if mv.is_master_worker():
        predictions_test = predict(x_test)
        accuracy = np.mean(predictions_test == labels_test)

        print "epoch %d - accuracy: %.4f" % (i + 1, accuracy)

# MULTIVERSO: You must call shutdown at the end of the file
mv.shutdown()
Example #6
def tearDownModule():
    mv.shutdown()
Example #7
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before calling any other Multiverso API
    mv.init()
    # MULTIVERSO: every process has a distinct worker id
    worker_id = mv.worker_id()

    # MULTIVERSO: mv.workers_num will return the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model according to the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    validation_frequency = n_train_batches
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: the batches are distributed across the workers;
            # a worker only trains the batches that belong to it
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: call this function to commit all the parameter
                # deltas produced by mv_shared and to pull the latest
                # parameters from the parameter server
                sharedvar.sync_all_mv_shared_vars()

            iter = (epoch - 1) * n_train_batches + minibatch_index

            # MULTIVERSO: only the master worker reports the validation error
            if mv.is_master_worker() and (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_loss * 100.))
        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()

    # MULTIVERSO: make sure that only one process outputs the result;
    # otherwise the results will be printed repeatedly
    if mv.is_master_worker():
        end_time = timeit.default_timer()

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)

        print(('Optimization complete with validation score of %f %%,'
               ' with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code ran for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % ((end_time - start_time))),
              file=sys.stderr)

        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)
    # MULTIVERSO: You must call shutdown at the end of the file
    mv.shutdown()
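
As a follow-up, the model pickled by the master worker can be reloaded for prediction. A minimal sketch, assuming the tutorial's LogisticRegression class keeps its symbolic input and y_pred as attributes (adjust if your class stores them differently):

import pickle
import theano

def load_and_predict(x_values, model_path='model.pkl'):
    # load the classifier saved by the master worker
    with open(model_path, 'rb') as f:
        classifier = pickle.load(f)
    # compile a prediction function from the pickled symbolic graph
    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.y_pred)
    return predict_model(x_values)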