Python Worker示例，platoon.channel.Worker Python示例

示例#1

0

显示文件

class BatchedPixelSum(object):
    """Worker that accumulates a column-wise sum of the minibatches it receives.

    The running sum lives in a Theano shared variable that is registered
    with the platoon ``Worker`` as a shared parameter using the ``SUMSync``
    rule.
    """

    def __init__(self, control_port, batch_port):
        """Create the worker and compile the accumulation function.

        Args:
            control_port: forwarded to ``Worker`` as ``control_port``.
            batch_port: forwarded to ``Worker`` as ``data_port``.
        """
        self._worker = Worker(control_port=control_port, data_port=batch_port)

        # The accumulator's shape is whatever the controller answers here.
        data_shape = self._worker.send_req('get_data_shape')

        self._computed_sum = theano.shared(
            value=np.zeros(data_shape, dtype=theano.config.floatX),
            name='sum', borrow=True)

        self._worker.init_shared_params(params=[self._computed_sum],
                                        param_sync_rule=SUMSync())

        # Named `batch` rather than `input` so the builtin is not shadowed.
        batch = T.matrix(dtype=theano.config.floatX)
        batch_sum = T.sum(batch, axis=0, dtype=theano.config.floatX)

        updates = OrderedDict()
        updates[self._computed_sum] = (self._computed_sum + batch_sum)

        # Calling this adds the batch's axis-0 sum to the accumulator in place.
        self._update_sum = theano.function(name='learn',
                                           inputs=[batch],
                                           updates=updates)

    def get_sum(self):
        """Process controller commands until 'stop' and return the result.

        On every 'train' command a fixed number of minibatches is received
        and accumulated, 'done' is reported to the controller and the shared
        parameters are synchronised.

        Returns:
            The first entry of the worker's ``shared_params``.
        """
        # Imported once per call instead of on every 'train' iteration.
        import time

        nb_batches_before_sync = 10

        while True:
            step = self._worker.send_req('next')
            print("# Command received: {}".format(step))

            if step == 'train':
                print("# Training", end=' ')
                # TODO: Having a fix number of MB before sync can cause
                # problems
                for _ in range(nb_batches_before_sync):
                    data = np.asarray(self._worker.recv_mb())
                    print(".", end=' ')
                    self._update_sum(data)
                print("Done")
                time.sleep(1)
                # The reply to 'done' replaces `step`, so a 'stop' answered
                # here is honoured by the check below.
                step = self._worker.send_req('done',
                                             dict(num_batches=nb_batches_before_sync))

                print("Syncing with global params.")
                self._worker.sync_params(synchronous=True)

            if step == 'stop':
                break

        print("All computation done.")
        return self._worker.shared_params[0]  # Return global params

示例#2

0

显示文件

文件： batched_pixel_sum_worker.py 项目： mila-udem/platoon

class BatchedPixelSum(object):
    """Worker that accumulates a column-wise sum of the minibatches it receives.

    The running sum lives in a Theano shared variable that is registered
    with the platoon ``Worker`` as a shared parameter using the ``SUMSync``
    rule.
    """

    def __init__(self, control_port, batch_port):
        """Create the worker and compile the accumulation function.

        Args:
            control_port: forwarded to ``Worker`` as ``control_port``.
            batch_port: forwarded to ``Worker`` as ``port``.
        """
        self._worker = Worker(control_port=control_port, port=batch_port)

        # The accumulator's shape is whatever the controller answers here.
        data_shape = self._worker.send_req('get_data_shape')

        self._computed_sum = theano.shared(
            value=np.zeros(data_shape, dtype=theano.config.floatX),
            name='sum', borrow=True)

        self._worker.init_shared_params(params=[self._computed_sum],
                                        param_sync_rule=SUMSync())

        # Named `batch` rather than `input` so the builtin is not shadowed.
        batch = T.matrix(dtype=theano.config.floatX)
        batch_sum = T.sum(batch, axis=0, dtype=theano.config.floatX)

        updates = OrderedDict()
        updates[self._computed_sum] = (self._computed_sum + batch_sum)

        # Calling this adds the batch's axis-0 sum to the accumulator in place.
        self._update_sum = theano.function(name='learn',
                                           inputs=[batch],
                                           updates=updates)

    def get_sum(self):
        """Process controller commands until 'stop' and return the result.

        On every 'train' command a fixed number of minibatches is received
        and accumulated, 'done' is reported to the controller and the shared
        parameters are synchronised.

        Returns:
            The first entry of the worker's ``shared_params``.
        """
        # Imported once per call instead of on every 'train' iteration.
        import time

        nb_batches_before_sync = 10

        while True:
            step = self._worker.send_req('next')
            print("# Command received: {}".format(step))

            if step == 'train':
                print("# Training", end=' ')
                # TODO: Having a fix number of MB before sync can cause
                # problems
                for _ in range(nb_batches_before_sync):
                    data = np.asarray(self._worker.recv_mb())
                    print(".", end=' ')
                    self._update_sum(data)
                print("Done")
                time.sleep(1)
                # The reply to 'done' replaces `step`, so a 'stop' answered
                # here is honoured by the check below.
                step = self._worker.send_req('done',
                                             dict(num_batches=nb_batches_before_sync))

                print("Syncing with global params.")
                self._worker.sync_params(synchronous=True)

            if step == 'stop':
                break

        print("All computation done.")
        return self._worker.shared_params[0]  # Return global params

示例#3

0

显示文件

文件： batched_pixel_sum_worker.py 项目： o869k/platoon

    def __init__(self, control_port, batch_port):
        """Create the platoon worker and compile the sum-accumulation function.

        Args:
            control_port: forwarded to ``Worker`` as ``control_port``.
            batch_port: forwarded to ``Worker`` as ``port``.
        """
        self._worker = Worker(control_port=control_port, port=batch_port)

        # The controller's answer fixes the shape of the accumulator.
        shape = self._worker.send_req('get_data_shape')
        initial_total = np.zeros(shape, dtype=theano.config.floatX)
        self._computed_sum = theano.shared(value=initial_total,
                                           name='sum',
                                           borrow=True)

        # Register the accumulator as the only shared parameter.
        self._worker.init_shared_params(
            params=[self._computed_sum],
            param_sync_rule=SUMSync(),
        )

        minibatch = T.matrix(dtype=theano.config.floatX)
        minibatch_total = T.sum(minibatch, axis=0,
                                dtype=theano.config.floatX)

        # Each call adds the minibatch's axis-0 sum to the accumulator.
        accumulate = OrderedDict(
            [(self._computed_sum, self._computed_sum + minibatch_total)])

        self._update_sum = theano.function(
            name='learn',
            inputs=[minibatch],
            updates=accumulate,
        )

示例#4

0

显示文件

文件： batched_pixel_sum_worker.py 项目： carriepl/platoon

    def __init__(self, control_port, batch_port):
        """Create the platoon worker and compile the sum-accumulation function.

        Args:
            control_port: forwarded to ``Worker`` as ``control_port``.
            batch_port: forwarded to ``Worker`` as ``port``.
        """
        self._worker = Worker(control_port=control_port, port=batch_port)

        # The controller's answer fixes the shape of the accumulator.
        shape = self._worker.send_req('get_data_shape')
        initial_total = np.zeros(shape, dtype=theano.config.floatX)
        self._computed_sum = theano.shared(value=initial_total,
                                           name='sum',
                                           borrow=True)

        # Register the accumulator as the only shared parameter.
        self._worker.init_shared_params(
            params=[self._computed_sum],
            param_sync_rule=SUMSync(),
        )

        minibatch = T.matrix(dtype=theano.config.floatX)
        minibatch_total = T.sum(minibatch, axis=0,
                                dtype=theano.config.floatX)

        # Each call adds the minibatch's axis-0 sum to the accumulator.
        accumulate = OrderedDict(
            [(self._computed_sum, self._computed_sum + minibatch_total)])

        self._update_sum = theano.function(
            name='learn',
            inputs=[minibatch],
            updates=accumulate,
        )

示例#5

0

显示文件

文件： worker.py 项目： yochju/deepy

 def train(self, train_set, valid_set=None, test_set=None, train_size=None):
     """
     Train the model in multi-GPU environment.

     Creates a platoon ``Worker`` on ``self._port``, registers the values of
     ``self.create_param_map()`` as shared parameters under an EASGD rule,
     then repeatedly asks the controller for the next command and acts on
     the reply until it is ``'stop'``.

     Args:
         train_set: iterable of training batches; fully materialised with
             ``list()``. Each batch is unpacked into ``self.learn(*batch)``.
         valid_set: optional; when truthy it is passed to ``self._run_valid``.
         test_set: optional; when truthy it is passed to ``self._run_test``.
         train_size: not used by this method.

     Returns:
         An empty list.
     """
     from platoon.channel import Worker
     from platoon.param_sync import EASGD
     server_port = self._port
     param_map = self.create_param_map()
     # Initialize the worker
     worker = Worker(control_port=server_port)
     if self.config.learning_rate:
         worker.send_req({'init_schedule': self._schedule_params})
     # Adopt the hyper-parameters returned by the controller.
     self.sync_hyperparams(worker.send_req('sync_hyperparams')['sync_hyperparams'])
     easgd_alpha = worker.send_req('get_easgd_alpha')
     worker.init_shared_params(param_map.values(), param_sync_rule=EASGD(easgd_alpha))
     worker.copy_to_local()
     worker.send_req({
         "set_names": None,
         "training_names": self.training_names,
         "evaluation_names": self.evaluation_names
     })
     # Load all training batches, consume vast memory here
     # NOTE(review): `os`, `time` and `np` are not imported inside this
     # method; presumably they are module-level imports -- confirm.
     self.logger.info("started process {}".format(os.getpid()))
     self.logger.info("(proc {}) load training data".format(os.getpid()))
     train_batches = list(train_set)
     # Evaluated once: callbacks registered after this point are not seen
     # by the two flags below.
     network_callback = bool(self.network.training_callbacks)
     trainer_callback = bool(self._iter_callbacks)
     while True:
         resp = worker.send_req('next')
         if resp == 'stop':
             break
         elif resp == 'wait':
             time.sleep(1)
         elif resp == 'get_num_batches':
             worker.send_req({'get_num_batches_done': len(train_batches)})
         # NOTE(review): the branches below index `resp` by key, so it is
         # presumably a dict at this point; on a str, `in` would be a
         # substring test -- confirm against the controller.
         elif 'eval' in resp:
             self.best_cost = resp['best_valid_cost']
             worker.copy_to_local()
             valid_costs = None
             test_costs = None
             if valid_set:
                 self._run_valid(self.epoch, valid_set)
                 self.fix_costs()
                 valid_costs = self.last_run_costs
             if test_set:
                 self._run_test(self.epoch, test_set)
                 self.fix_costs()
                 test_costs = self.last_run_costs
             worker.send_req({
                 "eval_done": None,
                 "valid_costs": valid_costs,
                 "test_costs": test_costs,
                 "auto_save": self.config.auto_save
             })
         elif 'valid' in resp:
             self.best_cost = resp['best_valid_cost']
             worker.copy_to_local()
             if valid_set:
                 self._run_valid(self.epoch, valid_set, dry_run=True)
                 self.fix_costs()
             # NOTE(review): `self.last_run_costs` is sent even when
             # `valid_set` is falsy, i.e. whatever value it held before.
             worker.send_req({
                 "valid_done": None,
                 "valid_costs": self.last_run_costs,
                 "auto_save": self.config.auto_save
             })
         elif 'train' in resp:
             # The controller supplies indices into `train_batches`.
             batch_ids = resp['train']
             # One list of per-batch costs for each entry of training_names.
             batch_costs = [[] for _ in self.training_names]
             for batch_id in batch_ids:
                 x = train_batches[batch_id]
                 cost_x = self.learn(*x)
                 for i, cost in enumerate(cost_x):
                     batch_costs[i].append(cost)
                 self.last_cost = cost_x[0]
             if network_callback:
                 self.network.training_callback()
             if trainer_callback:
                 for func in self._iter_callbacks:
                     func(self)
             worker.sync_params(synchronous=True)
             # Report the mean of each cost over the batches just processed.
             worker.send_req({'train_done': None, 'costs': [float(np.mean(c)) for c in batch_costs]})
         elif 'sync_hyperparams' in resp:
             self.sync_hyperparams(resp['sync_hyperparams'])
     worker.close()
     return []

示例#6

0

显示文件

文件： vae_worker.py 项目： alexmlamb/vaeNewLoss

import sys
sys.setrecursionlimit(990000)

from platoon.channel import Worker
from platoon.param_sync import EASGD



# Force single-precision floats for every Theano computation in this script.
theano.config.floatX = 'float32'


if __name__ == "__main__":

    # The parenthesised single-argument form prints the same text under
    # Python 2 and Python 3.
    print("running worker")

    worker = Worker(control_port=4222)
    device = theano.config.device

    config = get_config()
    config["layer_weighting"] = {'y': 1.0}

    # Pick the data source named by the configuration.
    if config['dataset'] == "imagenet":
        data = ImageNetData(config)
    elif config['dataset'] == "svhn":
        data = SvhnData(config)
    elif config['dataset'] == 'cifar':
        data = CifarData(config, "train")
    elif config['dataset'] == 'stl':
        data = StlData(config)
    else:
        # Name the offending value instead of raising a bare Exception();
        # ValueError is still caught by any `except Exception` handler.
        raise ValueError("unsupported dataset: %r" % (config['dataset'],))

示例#7

0

显示文件

            # trick : each worker can do their valid without talking to the controller
            # even if they finish before another worker, they will wait in the next
            # epoch at the calling of all_reduce when they need to sync again
            use_noise.set_value(numpy_floatX(0.))
            valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
            test_err = pred_error(f_pred, prepare_data, test, kf_test)

            # they do need to send the result to the controller
            res = worker.send_req('pred_errors', dict(test_err=float(test_err),
                                  valid_err=float(valid_err), epoch=epoch))

            if res == 'best':
                # should save the param at best
                pass

            if res == 'stop':
                break
        epoch += 1

    # Release all shared resources.
    worker.close()


if __name__ == '__main__':
    # See function train for all possible parameters and their definitions.
    parser = Worker.default_parser()
    args = parser.parse_args()

    # `worker` is bound at module level here and is not passed to
    # train_lstm; presumably train_lstm reads it as a global -- confirm.
    worker = Worker(**Worker.default_arguments(args))
    train_lstm(test_size=500)

示例#8

0

显示文件

文件： lstm_worker.py 项目： tsirif/platoon

    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
    if saveto:
        numpy.savez(saveto, train_err=train_err,
                    valid_err=valid_err, test_err=test_err,
                    history_errs=history_errs, **best_p)
    print 'The code run for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' %
                          (end_time - start_time))
    return train_err, valid_err, test_err
"""

if __name__ == '__main__':
    # See function train for all possible parameters and their definitions.
    parser = Worker.default_parser()
    # Two extra boolean switches on top of the Worker's default parser;
    # both default to False and are forwarded to train_lstm below.
    parser.add_argument('--valid_sync', dest='valid_sync', action='store_true', default=False)
    parser.add_argument('--param-sync-api', action='store_true', default=False)
    args = parser.parse_args()

    # `worker` is bound at module level here and is not passed to
    # train_lstm; presumably train_lstm reads it as a global -- confirm.
    worker = Worker(**Worker.default_arguments(args))
    # Set the random number generators' seeds for consistency
    # Each worker **MUST** be seeded with a different number, so that
    # they do not draw the same minibatches!
    SEED = 123
    numpy.random.seed(SEED + worker.global_rank)

    train_lstm(valid_sync=args.valid_sync, test_size=500,
               param_sync_api=args.param_sync_api)

示例#9

0

显示文件

文件： lstm_worker.py 项目： happyyang/platoon

from theano import config
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import os

sys.path.append(os.path.dirname(__file__))
import imdb

sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from platoon.channel import Worker
from platoon.param_sync import EASGD

# Maps a dataset name to its (load_data, prepare_data) pair of functions.
datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}

# The Worker is created at import time with a hard-coded control port.
worker = Worker(control_port=5567)
# Set the random number generators' seeds for consistency
# Each worker **MUST** be seeded with a different number, so that
# they do not draw the same minibatches!
SEED = 123
numpy.random.seed(SEED + worker.global_rank)


def numpy_floatX(data):
    """Return *data* as a NumPy array whose dtype is ``config.floatX``."""
    target_dtype = config.floatX
    converted = numpy.asarray(data, dtype=target_dtype)
    return converted


def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """
    Used to shuffle the dataset at each iteration.
    """

示例#10

0

显示文件

文件： nmt_worker.py 项目： 5l1v3r1/nmt

                worker.copy_to_local()
            use_noise.set_value(0.)
            valid_errs = pred_probs(f_log_probs, model_options, valid_stream)
            valid_err = float(valid_errs.mean())
            res = worker.send_req({'valid_err': valid_err})
            log.log({'validation_cost': valid_err,
                     'train_time': time.clock() - train_start,
                     'time': time.time()})

            if res == 'best' and saveto:
                best_p = unzip(tparams)
                save_params(best_p, model_filename, saveto_filename)

            if valid_sync:
                worker.copy_to_local()

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()


if __name__ == "__main__":
    # The first command-line argument, converted to int, is the only
    # argument given to Worker.
    LOGGER.info('Connecting to worker ({})'.format(sys.argv[1]))
    worker = Worker(int(sys.argv[1]))
    LOGGER.info('Retrieving configuration')
    # The reply is indexed below with the keys 'model', 'data', 'training',
    # 'management' and 'multi'.
    config = worker.send_req('config')
    # The last three sections are combined by `merge` and passed to train
    # as keyword arguments.
    train(worker, config['model'], config['data'],
          **merge(config['training'], config['management'], config['multi']))

示例#11

0

显示文件

文件： lstm_worker.py 项目： DingKe/platoon

def train_lstm(
    dim_proj=1024,  # word embedding dimension and LSTM number of hidden units.

    # This value is suggested as being good in the EASGD paper, but
    # you may want to tune this
    train_len=10,  # Train for this many minibatches when requested

    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommended (probably need momentum and decaying learning rate).
    encoder='lstm',  # TODO: can be removed must be lstm.
    saveto='lstm_model.npz',  # The best model will be saved there
    maxlen=100,  # Sequences longer than this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset='imdb',

    # Parameter for extra option
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worse test error
                       # This frequently needs a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test examples.
    valid_sync=False,
):
    """Run one platoon worker that trains and validates the LSTM model.

    The function creates its own ``Worker``, builds the model, registers
    the Theano parameters as shared parameters under ``EASGD(0.5)`` and
    then serves controller commands ('train', 'valid', 'stop') in a loop.
    It returns nothing.

    Review notes, all read directly from the body below:
      * ``dataset`` is not consulted; ``get_dataset('imdb')`` is hard-coded.
      * ``reload_model`` is only tested for truth; the file loaded is
        always ``'lstm_model.npz'``.
      * ``best_p`` is assigned on a 'best' reply but never written out.
      * ``f_cost`` and ``f_grad`` are compiled but not called here.
      * ``xrange`` is used next to function-style ``print`` calls --
        presumably Python 2 with ``print_function``; confirm.
    """

    worker = Worker(control_port=5567)

    # Model options
    # Snapshot of every local defined so far: all the arguments plus
    # `worker`, which was created just above.
    model_options = locals().copy()
    print("model options", model_options)

    load_data, prepare_data = get_dataset('imdb')

    print('Loading data')
    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep random
        # size example.  So we must select a random selection of the
        # examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    # One more than the largest value found in train[1].
    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    print('Building model')
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    worker.init_shared_params(tparams.values(), param_sync_rule=EASGD(0.5))
    print("Params init done")

    # use_noise is for dropout
    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        # From here on `decay_c` names a shared variable, not the float
        # argument; model_options still holds the original float.
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print('Optimization')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    def train_iter():
        # Endless generator of prepared (x, mask, y) minibatches; a new
        # shuffled index split is drawn each time the previous one runs out.
        while True:
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
            for _, train_index in kf:
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]
                x, mask, y = prepare_data(x, y)
                yield x, mask, y

    train_it = train_iter()

    best_p = None

    # Making sure that the worker start training with the most recent params
    worker.copy_to_local()

    while True:
        step = worker.send_req('next')
        print(step)

        if step == 'train':
            use_noise.set_value(numpy_floatX(1.))
            for i in xrange(train_len):
                x, mask, y = next(train_it)
                cost = f_grad_shared(x, mask, y)
                f_update(lrate)
            print('Train cost:', cost)
            # The reply replaces `step`, so a 'valid' or 'stop' answered
            # here is handled by the checks further down in this iteration.
            step = worker.send_req(dict(done=train_len))

            print("Syncing with global params")
            worker.sync_params(synchronous=True)

        """
        if step.startswith('save '):
            _, saveto = step.split(' ', 1)
            print 'Saving...',
            # TODO fix that shit so that saving works.
            numpy.savez(saveto, history_errs=history_errs, **s.params)
            pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
            print 'Done'
        """

        if step == 'valid':
            if valid_sync:
                worker.copy_to_local()
            use_noise.set_value(numpy_floatX(0.))
            valid_err = pred_error(f_pred, prepare_data, valid,
                                   kf_valid)
            test_err = pred_error(f_pred, prepare_data, test, kf_test)
            res = worker.send_req(dict(test_err=float(test_err),
                                       valid_err=float(valid_err)))

            if res == 'best':
                best_p = unzip(tparams)

            print(('Valid ', valid_err,
                   'Test ', test_err))
            if valid_sync:
                worker.copy_to_local()

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()

示例#12

0

显示文件

文件： worker.py 项目： KentChun33333/deepy

 def train(self, train_set, valid_set=None, test_set=None, train_size=None):
     """
     Train the model in multi-GPU environment.

     Creates a platoon ``Worker`` on ``self._port``, registers the values of
     ``self.create_param_map()`` as shared parameters under an EASGD rule,
     then repeatedly asks the controller for the next command and acts on
     the reply until it is ``'stop'``.

     Args:
         train_set: iterable of training batches; fully materialised with
             ``list()``. Each batch is unpacked into ``self.learn(*batch)``.
         valid_set: optional; when truthy it is passed to ``self._run_valid``.
         test_set: optional; when truthy it is passed to ``self._run_test``.
         train_size: not used by this method.

     Returns:
         An empty list.
     """
     # NOTE(review): `Worker`, `EASGD`, `os`, `time` and `np` are not
     # imported inside this method; presumably they are module-level
     # imports -- confirm.
     server_port = self._port
     param_map = self.create_param_map()
     # Initialize the worker
     worker = Worker(control_port=server_port)
     if self.config.learning_rate:
         worker.send_req({'init_schedule': self._schedule_params})
     # Adopt the hyper-parameters returned by the controller.
     self.sync_hyperparams(worker.send_req('sync_hyperparams')['sync_hyperparams'])
     easgd_alpha = worker.send_req('get_easgd_alpha')
     worker.init_shared_params(param_map.values(), param_sync_rule=EASGD(easgd_alpha))
     worker.copy_to_local()
     worker.send_req({
         "set_names": None,
         "training_names": self.training_names,
         "evaluation_names": self.evaluation_names
     })
     # Load all training batches, consume vast memory here
     self.logger.info("started process {}".format(os.getpid()))
     self.logger.info("(proc {}) load training data".format(os.getpid()))
     train_batches = list(train_set)
     # Evaluated once: callbacks registered after this point are not seen
     # by the two flags below.
     network_callback = bool(self.network.training_callbacks)
     trainer_callback = bool(self._iter_callbacks)
     while True:
         resp = worker.send_req('next')
         if resp == 'stop':
             break
         elif resp == 'wait':
             time.sleep(1)
         elif resp == 'get_num_batches':
             worker.send_req({'get_num_batches_done': len(train_batches)})
         # NOTE(review): the branches below index `resp` by key, so it is
         # presumably a dict at this point; on a str, `in` would be a
         # substring test -- confirm against the controller.
         elif 'eval' in resp:
             self.best_cost = resp['best_valid_cost']
             worker.copy_to_local()
             valid_costs = None
             test_costs = None
             if valid_set:
                 self._run_valid(self.epoch, valid_set)
                 self.fix_costs()
                 valid_costs = self.last_run_costs
             if test_set:
                 self._run_test(self.epoch, test_set)
                 self.fix_costs()
                 test_costs = self.last_run_costs
             worker.send_req({
                 "eval_done": None,
                 "valid_costs": valid_costs,
                 "test_costs": test_costs,
                 "auto_save": self.config.auto_save
             })
         elif 'valid' in resp:
             self.best_cost = resp['best_valid_cost']
             worker.copy_to_local()
             if valid_set:
                 self._run_valid(self.epoch, valid_set, dry_run=True)
                 self.fix_costs()
             # NOTE(review): `self.last_run_costs` is sent even when
             # `valid_set` is falsy, i.e. whatever value it held before.
             worker.send_req({
                 "valid_done": None,
                 "valid_costs": self.last_run_costs,
                 "auto_save": self.config.auto_save
             })
         elif 'train' in resp:
             # The controller supplies indices into `train_batches`.
             batch_ids = resp['train']
             # One list of per-batch costs for each entry of training_names.
             batch_costs = [[] for _ in self.training_names]
             for batch_id in batch_ids:
                 x = train_batches[batch_id]
                 cost_x = self.learn(*x)
                 for i, cost in enumerate(cost_x):
                     batch_costs[i].append(cost)
                 self.last_cost = cost_x[0]
             if network_callback:
                 self.network.training_callback()
             if trainer_callback:
                 for func in self._iter_callbacks:
                     func(self)
             worker.sync_params(synchronous=True)
             # Report the mean of each cost over the batches just processed.
             worker.send_req({'train_done': None, 'costs': [float(np.mean(c)) for c in batch_costs]})
         elif 'sync_hyperparams' in resp:
             self.sync_hyperparams(resp['sync_hyperparams'])
     worker.close()
     return []

示例#13

0

显示文件

def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of GRU units
          encoder='gru',
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 weight decay penalty
          lrate=0.01,
          n_words=100000,  # vocabulary size
          maxlen=100,  # maximum length of the description
          batch_size=16,
          valid_batch_size=16,
          max_grad_norm=5,
          nlayers=1,
          data_path=None,
          use_dropout=False,
          platoon=False,
	  name=""):

    # Model options
    model_options = locals().copy()

    print 'Loading data'

    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _ = raw_data
    pprint.pprint(model_options)
    print 'Building model'
    params = init_params(model_options)

    # create shared variables for parameters
    tparams = init_tparams(params)

    if platoon:
        print "PLATOON: Init ...",
        from platoon.channel import Worker
        from platoon.param_sync import ASGD
        worker = Worker(control_port=5567)
        print "DONE"

        print "PLATOON: Initializing shared params ...",
        worker.init_shared_params(tparams.values(), param_sync_rule=ASGD())
        print "DONE"
	worker.send_req({"type": name})

    # build the symbolic computational graph
    trng, use_noise, \
        x, \
        opt_ret, \
        cost, ups = \
        build_model(tparams, model_options)
    inps = [x]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, updates=ups)
    print 'Done'

    # before any regularizer - will be used to compute ppl
    print 'Building f_cost...',
    cost_sum = cost.sum()
    f_cost = theano.function(inps, cost_sum, updates=ups)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = sgd(lr, tparams, grads, inps, cost, max_grad_norm)
    print 'Done'

    print 'Optimization'

    history_errs = []
    history_ppls = []
    wpss = []

    best_p = None

    # Training loop
    uidx = 0
    estop = False
    bad_counter = 0
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            tlen = 0
            start_time = time.time()
            for x, y in reader.ptb_iterator(train_data, batch_size, maxlen):
                if platoon:
                    #print "PLATOON: Copying data from master ...",
                    worker.copy_to_local()
                    #print "DONE"

                n_samples += len(x)
                uidx += 1
                use_noise.set_value(1.)
                tlen += (x.shape[0] * x.shape[1])
                # pad batch and create mask
                if x is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()

                # compute cost, grads and copy grads to shared variables
                cost = f_grad_shared(x)

                # do the update on parameters
                f_update(lrate)

                ud = time.time() - ud_start

                if platoon:
                    #print "PLATOON: Syncing with master ...",
                    worker.sync_params(synchronous=True)
                    #print "DONE"

                # check for bad numbers
                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1.

                # verbose
                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

                # finish after this many updates
                if uidx >= finish_after:
                    print 'Finishing after %d iterations!' % uidx
                    estop = True
                    break
            current_time = time.time()
            wps = int(tlen // (current_time - start_time))
            print "Current wps", wps
            wpss.append(wps)
            print 'Seen %d samples' % n_samples
            if platoon:
                print "PLATOON: Sending wps to controller ...",
                worker.send_req({'wps': wps, 'epoch': eidx})
                print "DONE"

        print "Avg wps, ", numpy.mean(wpss)
        print "Std avgs,", numpy.std(wpss)

        use_noise.set_value(0.)
    finally:
        if platoon:
            print "PLATOON: Closing worker ...",
            worker.send_req('done')
            worker.close()
            print "DONE"
    return 0

示例#14

0

显示文件

文件： isgd_worker.py 项目： alexmlamb/ImportanceSamplingSGD

import numpy as np
from config import get_config
from platoon.channel import Worker
from platoon.param_sync import EASGD
import theano
import theano.tensor as T

# Everything below runs at import time; this file is a flat script.
config = get_config()

# NOTE(review): `model` is not imported in the visible part of this file --
# confirm where it comes from.
myModel = model.ModelAPI(config)

print "model initialized"

mb_size = 128

# The control port is hard-coded.
worker = Worker(control_port=4222)
device = theano.config.device

platoon_sync_rule = EASGD(0.3)
nb_minibatches_before_sync = 10  # 10 from EASGD paper

params = myModel.nnet.parameters

# Print the dtype of each parameter's current value.
for param in params:
    print param.get_value().dtype

worker.init_shared_params(params, param_sync_rule=platoon_sync_rule)

# Ask the controller for the first command.
step = worker.send_req('next')

print "training started"

示例#15

0

显示文件

import numpy as np
from config import get_config
from platoon.channel import Worker
from platoon.param_sync import EASGD
import theano
import theano.tensor as T

# Everything below runs at import time; this file is a flat script.
config = get_config()

# NOTE(review): `model` is not imported in the visible part of this file --
# confirm where it comes from.
myModel = model.ModelAPI(config)

print "model initialized"

mb_size = 128

# The control port is hard-coded.
worker = Worker(control_port=4222)
device = theano.config.device

platoon_sync_rule = EASGD(0.3)
nb_minibatches_before_sync = 10  # 10 from EASGD paper

params = myModel.nnet.parameters

# Print the dtype of each parameter's current value.
for param in params:
    print param.get_value().dtype

worker.init_shared_params(params, param_sync_rule=platoon_sync_rule)

# Ask the controller for the first command.
step = worker.send_req('next')

print "training started"

示例#16

0

显示文件

文件： convnet_worker.py 项目： hma02/platoon

def train_convnet(

    queue_dict,
    valid_sync=False,
    verbose = False
	
	):
    
    gpuid = int(queue_dict['device'][-1])
    from lib.train_funcs import set_cpu_affi
    set_cpu_affi(gpuid)

    worker = Worker(control_port=5567)

    # Load Model options
    model_options = locals().copy()
    
    import yaml
    with open('config.yaml', 'r') as f:
        training_config = yaml.load(f)   
    name=training_config['name']
    
    with open(name+'.yaml', 'r') as f:
        model_config = yaml.load(f)
    model_options = dict(model_options.items()+training_config.items()+model_config.items()+queue_dict.items())
                                         
    
    print "model options", model_options

    print 'Loading data'
    
    from lib.train_funcs import unpack_configs,proc_configs, get_rand3d, adjust_learning_rate
    proc_configs(model_options)
    train_len = model_options['avg_freq'] # Train for this many minibatches when requested
                                         
    (flag_para_load, flag_top_5,
            train_filenames, val_filenames, train_labels, val_labels, img_mean) = \
            unpack_configs(model_options, ext_data='.hkl', ext_label='.npy')
    
    #train_filenames = train_filenames[:8]
    
    #val_filenames = val_filenames[:4]
    print 'Building model'
    
    # shared_x should be created after driver initialization and before drv.mem_get_ipc_handle() is called, otherwise memhandle will be invalid
    drv = drv_init(queue_dict) 
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    tparams, model, drp = init_params(model_options)

    if model_options['resume_train']:
        load_epoch=model_options['load_epoch']
        load_model(load_epoch, layers, learning_rate, vels, \
        								path=model_options['load_path'])    

    worker.init_shared_params(tparams, param_sync_rule=EASGD(1.0/model_options['size'])) # Using alpha = 1/N

    print "Params init done"
    
    from lib.googlenet import get_shared_x_y,compile_model,compile_val
    shared_x_list, shared_y = get_shared_x_y(model_options)
    
    train_model, get_vel, descent_vel, params, vels,vels2, learning_rate = \
	 					compile_model(model, model_options,shared_x_list,shared_y)
                        
    val_model = compile_val(model, model_options,shared_x_list,shared_y)

    print 'Optimization'
                    
    # parallel data loading
    
    
    para_load_init(queue_dict, drv, shared_x_list[0],img_mean)
    
    para_train_it = p_iter(model_options, shared_y, train_filenames, \
                                    train_labels, train_model, 'train')
    para_val_it = p_iter(model_options, shared_y, val_filenames, \
                                            val_labels, val_model, 'val')

    best_p = None
    
    def print_time(amount, train_time_list,comm_time_list,wait_time_list):
        train,comm,wait = sum(train_time_list), sum(comm_time_list), sum (wait_time_list)
        print 'time per %d images: %.2f (train %.2f comm %.2f wait %.2f)' % \
                     (amount, train+comm+wait, train,comm,wait)
        return train+comm+wait, train,comm,wait

    count=0
    start_time = None
    
    import time
    inforec_list = []
    train_error_list = []
    val_error_list = []
    all_time_list = []
    epoch_time_list = []
    lr_list = []
    epoch=0
    step_idx = 0
    
    train_time_list = []
    wait_time_list = []
    comm_time_list = []
    
    while True:
        
        req_time= time.time()
        
        step = worker.send_req('next')
        
        #print step

        req_time = time.time() - req_time
        
        if step == 'train':
            
            if start_time==None:
                start_time = time.time()
 
            for i in xrange(train_len): # sync with server every train_len iter

                train_time, wait_time, cost, error, _ = next(para_train_it)  
                train_time_list.append(train_time)
                wait_time_list.append(wait_time)
                
                count+=1
                if (count) % (5120/model_options['file_batch_size']) ==0:
                    print ''
			        
                    print '%d %.4f %.4f'% (count, cost, error)
                    train_error_list.append([count, cost, error])
                    t_all,t_train,t_comm,t_wait = print_time(5120, train_time_list, comm_time_list, wait_time_list)
                    all_time_list.append([count,t_all,t_train,t_comm,t_wait])
                    train_time_list = []
                    wait_time_list =[]
                    comm_time_list = []
            
            comm_time = time.time()
            
            step = worker.send_req(dict(done=train_len))

            if verbose: print "Syncing"
            worker.sync_params(synchronous=True)
            
            comm_time_list.append(time.time() - comm_time + req_time)


        """
        if step.startswith('save '):
            _, saveto = step.split(' ', 1)
            print 'Saving...',
            # TODO fix that shit so that saving works.
            numpy.savez(saveto, history_errs=history_errs, **s.params)
            pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
            print 'Done'
        """

        if step == 'valid':
            
            if valid_sync:
                worker.copy_to_local()
                
            drp.SetDropoutOff()
            
            cost_list = []
            error_list = []
            error_top_5_list = []
            
            for i in xrange(len(val_filenames)):
            
                _, _, cost,error,error_top_5= next(para_val_it) 
          
                cost_list.append(cost)
                error_list.append(error)
                error_top_5_list.append(error_top_5)     
                
                print '.',
            print ''

            validation_loss = np.mean(cost_list)
            validation_error = np.mean(error_list)
            validation_error_top5 = np.mean(error_top_5_list)
            
            print 'validation cost:%.4f' % validation_loss
            print 'validation error:%.4f' % validation_error
            print 'validation top_5_error:%.4f' % validation_error_top5
            val_error_list.append([count, validation_loss, \
                        validation_error, validation_error_top5])

            drp.SetDropoutOn()

            res = worker.send_req(dict(test_err=float(validation_error),
                                       valid_err=float(validation_error)))

            if res == 'best':
                best_p = unzip(tparams)

            if valid_sync:
                worker.copy_to_local()
                
                
            # get total iterations processed by all workers
            uidx = worker.send_req('uidx')
            
            uepoch = int(uidx/len(train_filenames)) 

            if model.name=='alexnet':
                
                if model_options['lr_policy'] == 'step':
                    
                    if uepoch >=20 and uepoch < 40 and step_idx==0:

                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 1
                        
                    elif uepoch >=40 and uepoch < 60 and step_idx==1:
                        
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 2
                        
                    elif uepoch >=60 and uepoch < 70 and step_idx==2:
                        
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 3
                    else:
                        pass


                if model_options['lr_policy'] == 'auto':
                    if uepoch>5 and (val_error_list[-3][2] - val_error_list[-1][2] <
                                        model_options['lr_adapt_threshold']):
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                       
                        
                lr = learning_rate.get_value()
                lr = np.float32(lr)
                          
            elif model.name=='googlenet':

                    # Poly lr policy according to
	                # https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet
                    max_iter = len(train_filenames)*240
                    lr = learning_rate.get_value() * \
	                    pow( (1. -  1.* uepoch*len(train_filenames) / max_iter), 0.5 )
                    lr = np.float32(lr)
                    learning_rate.set_value(lr)

            else:
                raise NotImplementedError
                
            print 'Learning rate now:', lr
				
            lr_list.append(lr)
	            
            if start_time!=None:
                epoch_time_list.append([count , time.time()-start_time])
                epoch = int(count/len(train_filenames) )
                print 'epoch %d time %.2fh, global epoch is %d' % (epoch, epoch_time_list[-1][1]/3600.0, uepoch)
                
                inforec_list = [train_error_list,
                                val_error_list,
                                all_time_list,
                                epoch_time_list,
                                lr_list
                                ]
                
                import pickle
                filepath = '../run/inforec/inforec_%s.pkl' % queue_dict['device']
                with open(filepath, 'wb') as f:
                    pickle.dump(inforec_list, f, protocol=pickle.HIGHEST_PROTOCOL)
            
            start_time=None

        if step == 'stop':
            break

    # Release all shared ressources.
    worker.close()

示例#17

0

显示文件

def train_lstm(
    dim_proj=1024,  # word embeding dimension and LSTM number of hidden units.

    # This value is suggested as being good in the EASGD paper, but
    # you may want to tune this
    train_len=10,  # Train for this many minibatches when requested
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
    encoder='lstm',  # TODO: can be removed must be lstm.
    saveto='lstm_model.npz',  # The best model will be saved there
    maxlen=100,  # Sequence longer then this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset='imdb',

    # Parameter for extra option
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worst test error
    # This frequently need a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test example.
    valid_sync=False,
):
    """Run one Platoon worker that trains the LSTM classifier.

    The worker connects to the controller on control port 5567, loads the
    dataset, builds the model and registers its parameters with an
    ``EASGD(0.5)`` sync rule.  It then loops on the controller's reply to
    ``'next'``: ``'train'`` runs ``train_len`` minibatch updates, reports
    ``dict(done=train_len)`` and synchronises parameters; ``'valid'``
    computes validation and test error and reports both; ``'stop'`` ends
    the loop.  The worker is closed on every exit path.

    All keyword arguments are snapshotted into ``model_options`` (via
    ``locals()``) and passed to ``init_params`` and ``build_model``; see the
    inline comments in the signature for their meaning.

    ``dataset`` is forwarded to ``get_dataset``.  ``reload_model``, when
    truthy, triggers ``load_params`` before the shared variables are
    created: a path value is used as the file to load, while the legacy
    flag values ``True``/``1`` load ``'lstm_model.npz'``.

    Returns
    -------
    None
    """

    worker = Worker(control_port=5567)

    try:
        # Model options.  Snapshot taken before any other local is bound.
        model_options = locals().copy()
        print("model options", model_options)

        # Honour the `dataset` argument instead of a hard-coded name.
        load_data, prepare_data = get_dataset(dataset)

        print('Loading data')
        train, valid, test = load_data(n_words=n_words,
                                       valid_portion=0.05,
                                       maxlen=maxlen)
        if test_size > 0:
            # The test set is sorted by size, but we want to keep random
            # size example.  So we must select a random selection of the
            # examples.
            idx = numpy.arange(len(test[0]))
            numpy.random.shuffle(idx)
            idx = idx[:test_size]
            test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

        ydim = numpy.max(train[1]) + 1

        model_options['ydim'] = ydim

        print('Building model')
        # This create the initial parameters as numpy ndarrays.
        # Dict name (string) -> numpy ndarray
        params = init_params(model_options)

        if reload_model:
            # The parameter is documented as a path; flag-style callers
            # (True / 1) keep getting the historical default file.
            if isinstance(reload_model, int):
                model_path = 'lstm_model.npz'
            else:
                model_path = reload_model
            load_params(model_path, params)

        # This creates Theano Shared Variable from the parameters.
        # Dict name (string) -> Theano Tensor Shared Variable
        # params and tparams have different copy of the weights.
        tparams = init_tparams(params)

        worker.init_shared_params(tparams.values(), param_sync_rule=EASGD(0.5))
        print("Params init done")

        # use_noise is for dropout
        (use_noise, x, mask, y, f_pred_prob, f_pred,
         cost) = build_model(tparams, model_options)

        if decay_c > 0.:
            decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
            weight_decay = 0.
            weight_decay += (tparams['U']**2).sum()
            weight_decay *= decay_c
            cost += weight_decay

        # f_cost and f_grad are compiled but not called anywhere below.
        f_cost = theano.function([x, mask, y], cost, name='f_cost')

        grads = tensor.grad(cost, wrt=tparams.values())
        f_grad = theano.function([x, mask, y], grads, name='f_grad')

        lr = tensor.scalar(name='lr')
        f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

        print('Optimization')

        kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
        kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

        def train_iter():
            # Endless stream of prepared minibatches, reshuffled each pass.
            while True:
                kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
                for _, train_index in kf:
                    y = [train[1][t] for t in train_index]
                    x = [train[0][t] for t in train_index]
                    x, mask, y = prepare_data(x, y)
                    yield x, mask, y

        train_it = train_iter()

        best_p = None

        # Making sure that the worker start training with the most recent params
        worker.copy_to_local()

        while True:
            step = worker.send_req('next')
            print(step)

            if step == 'train':
                use_noise.set_value(numpy_floatX(1.))
                # range() exists on both Python 2 and 3; train_len is small.
                for i in range(train_len):
                    x, mask, y = next(train_it)
                    cost = f_grad_shared(x, mask, y)
                    f_update(lrate)
                print('Train cost:', cost)
                # The reply may already be the next instruction; it is
                # handled by the checks below.
                step = worker.send_req(dict(done=train_len))

                print("Syncing with global params")
                worker.sync_params(synchronous=True)

            # TODO: handling of a 'save <path>' instruction is not implemented.

            if step == 'valid':
                if valid_sync:
                    worker.copy_to_local()
                use_noise.set_value(numpy_floatX(0.))
                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
                test_err = pred_error(f_pred, prepare_data, test, kf_test)
                res = worker.send_req(
                    dict(test_err=float(test_err), valid_err=float(valid_err)))

                if res == 'best':
                    best_p = unzip(tparams)

                print(('Valid ', valid_err, 'Test ', test_err))
                if valid_sync:
                    worker.copy_to_local()

            if step == 'stop':
                break
    finally:
        # Release all shared resources, even when setup or training failed.
        worker.close()