Example #1
 def train(self, train_set, valid_set=None, test_set=None, train_size=None):
     """
     Train the model in a multi-GPU environment.
     """
     from platoon.channel import Worker
     from platoon.param_sync import EASGD
     server_port = self._port
     param_map = self.create_param_map()
     # Initialize the worker
     worker = Worker(control_port=server_port)
     if self.config.learning_rate:
         worker.send_req({'init_schedule': self._schedule_params})
     self.sync_hyperparams(worker.send_req('sync_hyperparams')['sync_hyperparams'])
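     # Fetch the EASGD moving rate (alpha) from the controller and register the
     # local parameters so they can be synchronized with the central copy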
     easgd_alpha = worker.send_req('get_easgd_alpha')
     worker.init_shared_params(param_map.values(), param_sync_rule=EASGD(easgd_alpha))
     worker.copy_to_local()
     worker.send_req({
         "set_names": None,
         "training_names": self.training_names,
         "evaluation_names": self.evaluation_names
     })
     # Load all training batches; this can consume a lot of memory
     self.logger.info("started process {}".format(os.getpid()))
     self.logger.info("(proc {}) load training data".format(os.getpid()))
     train_batches = list(train_set)
     network_callback = bool(self.network.training_callbacks)
     trainer_callback = bool(self._iter_callbacks)
     while True:
         resp = worker.send_req('next')
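         # The controller replies with either a plain command string ('stop',
         # 'wait', 'get_num_batches') or a dict keyed by the requested action
         # ('eval', 'valid', 'train', 'sync_hyperparams')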
         if resp == 'stop':
             break
         elif resp == 'wait':
             time.sleep(1)
         elif resp == 'get_num_batches':
             worker.send_req({'get_num_batches_done': len(train_batches)})
         elif 'eval' in resp:
             self.best_cost = resp['best_valid_cost']
             worker.copy_to_local()
             valid_costs = None
             test_costs = None
             if valid_set:
                 self._run_valid(self.epoch, valid_set)
                 self.fix_costs()
                 valid_costs = self.last_run_costs
             if test_set:
                 self._run_test(self.epoch, test_set)
                 self.fix_costs()
                 test_costs = self.last_run_costs
             worker.send_req({
                 "eval_done": None,
                 "valid_costs": valid_costs,
                 "test_costs": test_costs,
                 "auto_save": self.config.auto_save
             })
         elif 'valid' in resp:
             self.best_cost = resp['best_valid_cost']
             worker.copy_to_local()
             if valid_set:
                 self._run_valid(self.epoch, valid_set, dry_run=True)
                 self.fix_costs()
             worker.send_req({
                 "valid_done": None,
                 "valid_costs": self.last_run_costs,
                 "auto_save": self.config.auto_save
             })
         elif 'train' in resp:
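             # The controller assigns a list of batch indices for this worker to process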
             batch_ids = resp['train']
             batch_costs = [[] for _ in self.training_names]
             for batch_id in batch_ids:
                 x = train_batches[batch_id]
                 cost_x = self.learn(*x)
                 for i, cost in enumerate(cost_x):
                     batch_costs[i].append(cost)
                 self.last_cost = cost_x[0]
             if network_callback:
                 self.network.training_callback()
             if trainer_callback:
                 for func in self._iter_callbacks:
                     func(self)
             worker.sync_params(synchronous=True)
             worker.send_req({'train_done': None, 'costs': [float(np.mean(c)) for c in batch_costs]})
         elif 'sync_hyperparams' in resp:
             self.sync_hyperparams(resp['sync_hyperparams'])
     worker.close()
     return []
Example #2
def train_lstm(
    dim_proj=1024,  # word embedding dimension and number of LSTM hidden units.

    # This value is suggested as being good in the EASGD paper, but
    # you may want to tune this
    train_len=10,  # Train for this many minibatches when requested

    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (it probably needs momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed; must be 'lstm'.
    saveto='lstm_model.npz',  # The best model will be saved there
    maxlen=100,  # Sequences longer than this are ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset='imdb',

    # Parameter for extra option
    noise_std=0.,
    use_dropout=True,  # if False, slightly faster but worse test error.
                       # This frequently needs a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test examples.
    mode='client',
):

    worker = channel.Worker(cport=5567)

    # Model options
    model_options = locals().copy()
    print "model options", model_options

    load_data, prepare_data = get_dataset('imdb')

    print 'Loading data'
    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep examples of
        # random sizes, so we select a random subset of them.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    print 'Building model'
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if mode == 'init':
        if reload_model:
            load_params('lstm_model.npz', params)

    # This creates Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    worker.init_shared_params('DLTlstm', tparams.values(),
                              param_sync_rule=EASGD(0.5),
                              cleanup=(mode == 'init'))
    print "Params init done"

    if mode == 'test':
        import pdb
        pdb.set_trace()

    # use_noise is for dropout
    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print 'Optimization'

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    def train_iter():
        while True:
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
            for _, train_index in kf:
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]
                x, mask, y = prepare_data(x, y)
                yield x, mask, y

    train_it = train_iter()

    best_p = None

    while True:
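        # Ask the controller what to do next ('train', 'valid', 'stop', ...)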
        step = worker.send_req('next')
        print step

        if step == 'train':
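        # Run train_len local minibatch updates, report back, then sync with the central (EASGD) parameters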
            use_noise.set_value(numpy_floatX(1.))
            for i in xrange(train_len):
                x, mask, y = next(train_it)
                cost = f_grad_shared(x, mask, y)
                f_update(lrate)
            print 'Train cost:', cost
            step = worker.send_req(dict(done=train_len))

            print "Syncing with global params"
            worker.sync_params(synchronous=True)

        """
        if step.startswith('save '):
            _, saveto = step.split(' ', 1)
            print 'Saving...',
            # TODO: fix saving so that it works.
            numpy.savez(saveto, history_errs=history_errs, **s.params)
            pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
            print 'Done'
        """

        if step == 'valid':
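        # Evaluate on the validation and test sets and report the errors;
        # the controller replies 'best' when this is the best model so far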
            use_noise.set_value(numpy_floatX(0.))
            valid_err = pred_error(f_pred, prepare_data, valid,
                                   kf_valid)
            test_err = pred_error(f_pred, prepare_data, test, kf_test)
            res = worker.send_req(dict(test_err=float(test_err),
                                       valid_err=float(valid_err)))

            if res == 'best':
                best_p = unzip(tparams)

            print ('Valid ', valid_err,
                   'Test ', test_err)

        if step == 'stop':
            break

    return
Example #3
def train(worker, model_options, data_options,
          patience,  # early stopping patience
          max_epochs,
          finish_after,  # finish after this many updates
          decay_c,  # L2 regularization penalty
          alpha_c,  # alignment regularization
          clip_c,  # gradient clipping threshold
          lrate,  # learning rate
          optimizer,
          saveto,
          valid_freq,
          train_len,
          valid_sync,
          save_freq,   # save the parameters after every save_freq updates
          sample_freq,   # generate some samples after every sample_freq updates
          control_port,
          batch_port,
          log_port,
          reload_):

    LOGGER.info('Connecting to data socket ({}) and loading validation data'
                .format(batch_port))
    worker.init_mb_sock(batch_port)
    _, _, valid_stream = load_data(**data_options)

    LOGGER.info('Building model')
    params = init_params(model_options)
    # reload parameters
    experiment_id = worker.send_req('experiment_id')
    model_filename = '{}.model.npz'.format(experiment_id)
    saveto_filename = '{}.npz'.format(saveto)
    if reload_ and os.path.exists(saveto_filename):
        LOGGER.info('Loading parameters from {}'.format(saveto_filename))
        params = load_params(saveto_filename, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)
    alpha = worker.send_req('alpha')
    worker.init_shared_params(tparams.values(), param_sync_rule=EASGD(alpha))

    # use_noise is for dropout
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    LOGGER.info('Building sampler')
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    LOGGER.info('Building f_log_probs')
    f_log_probs = theano.function(inps, cost, profile=False)

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in six.iteritems(tparams):
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((tensor.cast(
            y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
            opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # Not used?
    # after all regularizers - compile the computational graph for cost
    # LOGGER.info('Building f_cost')
    # f_cost = theano.function(inps, cost, profile=False)

    LOGGER.info('Computing gradient')
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(
                g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    LOGGER.info('Building optimizers')
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    LOGGER.info('Optimization')

    log = RemoteLogger(port=log_port)
    train_start = time.clock()
    best_p = None

    # Make sure the worker starts training with the most recent params
    worker.copy_to_local()

    uidx = 0
    while True:
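        # The controller decides whether this worker should train, validate, or stop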
        step = worker.send_req('next')
        LOGGER.debug('Received command: {}'.format(step))
        if step == 'train':
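            # Receive minibatches from the batch socket and run train_len local updates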
            use_noise.set_value(1.)
            for i in xrange(train_len):
                x, x_mask, y, y_mask = worker.recv_mb()

                uidx += 1
                log_entry = {'iteration': uidx}

                # compute cost, grads and copy grads to shared variables
                update_start = time.clock()
                cost = f_grad_shared(x, x_mask, y, y_mask)
                f_update(lrate)

                log_entry['cost'] = float(cost)
                log_entry['average_source_length'] = \
                    float(x_mask.sum(0).mean())
                log_entry['average_target_length'] = \
                    float(y_mask.sum(0).mean())
                log_entry['update_time'] = time.clock() - update_start
                log_entry['train_time'] = time.clock() - train_start
                log_entry['time'] = time.time()
                log.log(log_entry)

            step = worker.send_req({'done': train_len})
            LOGGER.debug("Syncing with global params")
            worker.sync_params(synchronous=True)

        if step == 'valid':
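            # Optionally pull the central parameters first, then compute and report the validation cost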
            if valid_sync:
                worker.copy_to_local()
            use_noise.set_value(0.)
            valid_errs = pred_probs(f_log_probs, model_options, valid_stream)
            valid_err = float(valid_errs.mean())
            res = worker.send_req({'valid_err': valid_err})
            log.log({'validation_cost': valid_err,
                     'train_time': time.clock() - train_start,
                     'time': time.time()})

            if res == 'best' and saveto:
                best_p = unzip(tparams)
                save_params(best_p, model_filename, saveto_filename)

            if valid_sync:
                worker.copy_to_local()

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()
Example #4
from platoon.channel import Worker
from platoon.param_sync import EASGD
import numpy as np
import theano
import theano.tensor as T
# `get_config` and `model` are assumed to be provided by the surrounding project

config = get_config()

myModel = model.ModelAPI(config)

print "model initialized"

mb_size = 128

worker = Worker(control_port=4222)
device = theano.config.device

platoon_sync_rule = EASGD(0.3)
nb_minibatches_before_sync = 10  # 10 from EASGD paper

params = myModel.nnet.parameters

for param in params:
    print param.get_value().dtype

worker.init_shared_params(params, param_sync_rule=platoon_sync_rule)

step = worker.send_req('next')

print "training started"

for i in range(0, 100000):
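    # Draw a random minibatch of indices (574000 is presumably the training-set size)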
    indices = np.random.choice(range(574000), mb_size, replace=False)