from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

from platoon.channel import Worker


class BatchedPixelSum(object):
    def __init__(self, control_port, batch_port):
        self._worker = Worker(control_port=control_port, data_port=batch_port)

        data_shape = self._worker.send_req('get_data_shape')

        self._computed_sum = theano.shared(
            value=np.zeros(data_shape, dtype=theano.config.floatX),
            name='sum', borrow=True)
        self._worker.init_shared_params(params=[self._computed_sum],
                                        param_sync_rule=SUMSync())

        input = T.matrix(dtype=theano.config.floatX)
        batch_sum = T.sum(input, axis=0, dtype=theano.config.floatX)

        updates = OrderedDict()
        updates[self._computed_sum] = (self._computed_sum + batch_sum)

        self._update_sum = theano.function(name='learn',
                                           inputs=[input],
                                           updates=updates)

    def get_sum(self):
        nb_batches_before_sync = 10

        while True:
            step = self._worker.send_req('next')
            print("# Command received: {}".format(step))
            if step == 'train':
                print("# Training", end=' ')
                # TODO: having a fixed number of minibatches before each
                # sync can cause problems.
                for i in range(nb_batches_before_sync):
                    data = np.asarray(self._worker.recv_mb())
                    print(".", end=' ')
                    self._update_sum(data)
                print("Done")
                import time
                time.sleep(1)
                step = self._worker.send_req(
                    'done', dict(num_batches=nb_batches_before_sync))

                print("Syncing with global params.")
                self._worker.sync_params(synchronous=True)

            if step == 'stop':
                break

        print("All computation done.")
        return self._worker.shared_params[0]  # Return global params
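# NOTE: SUMSync is not one of Platoon's built-in sync rules (those are EASGD
# and ASGD), and its definition is not shown in this snippet. Below is a
# minimal sketch of what it could look like, assuming Platoon's ParamSyncRule
# interface in which update_params(local_params, global_params) receives the
# worker's Theano shared variables and the master's shared-memory arrays.
from platoon.param_sync import ParamSyncRule


class SUMSync(ParamSyncRule):
    """Hypothetical sync rule for the pixel-sum example: fold each worker's
    partial sum into the global copy, then reset the local accumulator so
    no batch is counted twice."""

    def update_params(self, local_params, global_params):
        for local, glob in zip(local_params, global_params):
            glob += local.get_value()             # accumulate into the master
            local.set_value(np.zeros_like(glob))  # restart the local sum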
def train(self, train_set, valid_set=None, test_set=None, train_size=None):
    """
    Train the model in a multi-GPU environment.
    """
    from platoon.channel import Worker
    from platoon.param_sync import EASGD
    server_port = self._port
    param_map = self.create_param_map()
    # Initialize the worker
    worker = Worker(control_port=server_port)
    if self.config.learning_rate:
        worker.send_req({'init_schedule': self._schedule_params})
    self.sync_hyperparams(worker.send_req('sync_hyperparams')['sync_hyperparams'])
    easgd_alpha = worker.send_req('get_easgd_alpha')
    worker.init_shared_params(param_map.values(), param_sync_rule=EASGD(easgd_alpha))
    worker.copy_to_local()
    worker.send_req({
        "set_names": None,
        "training_names": self.training_names,
        "evaluation_names": self.evaluation_names
    })
    # Load all training batches; this can consume a lot of memory.
    self.logger.info("started process {}".format(os.getpid()))
    self.logger.info("(proc {}) load training data".format(os.getpid()))
    train_batches = list(train_set)
    network_callback = bool(self.network.training_callbacks)
    trainer_callback = bool(self._iter_callbacks)
    while True:
        resp = worker.send_req('next')
        if resp == 'stop':
            break
        elif resp == 'wait':
            time.sleep(1)
        elif resp == 'get_num_batches':
            worker.send_req({'get_num_batches_done': len(train_batches)})
        elif 'eval' in resp:
            self.best_cost = resp['best_valid_cost']
            worker.copy_to_local()
            valid_costs = None
            test_costs = None
            if valid_set:
                self._run_valid(self.epoch, valid_set)
                self.fix_costs()
                valid_costs = self.last_run_costs
            if test_set:
                self._run_test(self.epoch, test_set)
                self.fix_costs()
                test_costs = self.last_run_costs
            worker.send_req({
                "eval_done": None,
                "valid_costs": valid_costs,
                "test_costs": test_costs,
                "auto_save": self.config.auto_save
            })
        elif 'valid' in resp:
            self.best_cost = resp['best_valid_cost']
            worker.copy_to_local()
            if valid_set:
                self._run_valid(self.epoch, valid_set, dry_run=True)
                self.fix_costs()
            worker.send_req({
                "valid_done": None,
                "valid_costs": self.last_run_costs,
                "auto_save": self.config.auto_save
            })
        elif 'train' in resp:
            batch_ids = resp['train']
            batch_costs = [[] for _ in self.training_names]
            for batch_id in batch_ids:
                x = train_batches[batch_id]
                cost_x = self.learn(*x)
                for i, cost in enumerate(cost_x):
                    batch_costs[i].append(cost)
                self.last_cost = cost_x[0]
            if network_callback:
                self.network.training_callback()
            if trainer_callback:
                for func in self._iter_callbacks:
                    func(self)
            worker.sync_params(synchronous=True)
            worker.send_req({'train_done': None,
                             'costs': [float(np.mean(c)) for c in batch_costs]})
        elif 'sync_hyperparams' in resp:
            self.sync_hyperparams(resp['sync_hyperparams'])
    worker.close()
    return []
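# For context: the string and dict requests above ('next', 'get_easgd_alpha',
# {'train_done': ...}, ...) are answered by a separate controller process.
# Below is a hypothetical counterpart for the 'train' part of the protocol,
# assuming Platoon's Controller base class, whose handle_control() return
# value becomes the worker's send_req() response (the hook's exact signature
# varies across Platoon versions).
from platoon.channel import Controller


class TrainerController(Controller):
    """Hypothetical controller answering part of the protocol used by
    train() above; a real one would also schedule 'eval'/'valid' phases,
    track patience, and broadcast hyperparameters."""

    def __init__(self, control_port, max_steps, easgd_alpha, batches_per_step):
        Controller.__init__(self, control_port)
        self.max_steps = max_steps
        self.easgd_alpha = easgd_alpha
        self.batches_per_step = batches_per_step
        self.step = 0

    def handle_control(self, req, worker_id):
        if req == 'get_easgd_alpha':
            return self.easgd_alpha
        if req == 'next':
            if self.step >= self.max_steps:
                return 'stop'
            first = self.step * self.batches_per_step
            self.step += 1
            # The worker's `'train' in resp` test looks for this dict key.
            return {'train': list(range(first, first + self.batches_per_step))}
        if isinstance(req, dict) and 'train_done' in req:
            return 'ok'
        return 'wait'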
import sys
sys.setrecursionlimit(990000)

import theano

from platoon.channel import Worker
from platoon.param_sync import EASGD

theano.config.floatX = 'float32'

if __name__ == "__main__":
    print "running worker"
    worker = Worker(control_port=4222)
    device = theano.config.device

    config = get_config()
    config["layer_weighting"] = {'y': 1.0}

    if config['dataset'] == "imagenet":
        data = ImageNetData(config)
    elif config['dataset'] == "svhn":
        data = SvhnData(config)
    elif config['dataset'] == 'cifar':
        data = CifarData(config, "train")
    elif config['dataset'] == 'stl':
        data = StlData(config)
    else:
        raise Exception("unknown dataset: {}".format(config['dataset']))
        # Trick: each worker can run its validation without talking to the
        # controller. Even if one worker finishes before another, it will
        # wait in the next epoch at the call to all_reduce when it needs to
        # sync again.
        use_noise.set_value(numpy_floatX(0.))
        valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
        test_err = pred_error(f_pred, prepare_data, test, kf_test)

        # The workers do need to send their results to the controller.
        res = worker.send_req('pred_errors',
                              dict(test_err=float(test_err),
                                   valid_err=float(valid_err),
                                   epoch=epoch))

        if res == 'best':
            # TODO: should save the params at the best validation error
            pass

        if res == 'stop':
            break
        epoch += 1

    # Release all shared resources.
    worker.close()


if __name__ == '__main__':
    # See the train function for all possible parameters and their
    # definitions.
    parser = Worker.default_parser()
    args = parser.parse_args()
    worker = Worker(**Worker.default_arguments(args))
    train_lstm(test_size=500)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    if saveto:
        numpy.savez(saveto, train_err=train_err,
                    valid_err=valid_err, test_err=test_err,
                    history_errs=history_errs, **best_p)
    print 'The code run for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
    """  # closes a block of code disabled above


if __name__ == '__main__':
    # See the train function for all possible parameters and their
    # definitions.
    parser = Worker.default_parser()
    parser.add_argument('--valid_sync', dest='valid_sync',
                        action='store_true', default=False)
    parser.add_argument('--param-sync-api', action='store_true', default=False)
    args = parser.parse_args()
    worker = Worker(**Worker.default_arguments(args))

    # Set the random number generators' seeds for consistency.
    # Each worker **MUST** be seeded with a different number, so that
    # they do not draw the same minibatches!
    SEED = 123
    numpy.random.seed(SEED + worker.global_rank)

    train_lstm(valid_sync=args.valid_sync, test_size=500,
               param_sync_api=args.param_sync_api)
import os
import sys

import numpy
from theano import config
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

sys.path.append(os.path.dirname(__file__))
import imdb

sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from platoon.channel import Worker
from platoon.param_sync import EASGD

datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}

worker = Worker(control_port=5567)

# Set the random number generators' seeds for consistency.
# Each worker **MUST** be seeded with a different number, so that
# they do not draw the same minibatches!
SEED = 123
numpy.random.seed(SEED + worker.global_rank)


def numpy_floatX(data):
    return numpy.asarray(data, dtype=config.floatX)


def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """
    Used to shuffle the dataset at each iteration.
    """
    # Body restored from the standard Theano LSTM tutorial implementation,
    # which this example is based on.
    idx_list = numpy.arange(n, dtype="int32")

    if shuffle:
        numpy.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for i in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size

    if minibatch_start != n:
        # Make a minibatch out of what is left.
        minibatches.append(idx_list[minibatch_start:])

    return zip(range(len(minibatches)), minibatches)
            worker.copy_to_local()
            use_noise.set_value(0.)
            valid_errs = pred_probs(f_log_probs, model_options, valid_stream)
            valid_err = float(valid_errs.mean())

            res = worker.send_req({'valid_err': valid_err})
            log.log({'validation_cost': valid_err,
                     'train_time': time.clock() - train_start,
                     'time': time.time()})

            if res == 'best' and saveto:
                best_p = unzip(tparams)
                save_params(best_p, model_filename, saveto_filename)

            if valid_sync:
                worker.copy_to_local()

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()


if __name__ == "__main__":
    LOGGER.info('Connecting to controller ({})'.format(sys.argv[1]))
    worker = Worker(int(sys.argv[1]))
    LOGGER.info('Retrieving configuration')
    config = worker.send_req('config')
    train(worker, config['model'], config['data'],
          **merge(config['training'], config['management'],
                  config['multi']))
def train_lstm(
    dim_proj=1024,  # word embedding dimension and LSTM number of hidden
                    # units. This value is suggested as being good in the
                    # EASGD paper, but you may want to tune it.
    train_len=10,  # Train for this many minibatches when requested
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very
                         # hard to use and not recommended (probably needs
                         # momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed, must be lstm.
    saveto='lstm_model.npz',  # The best model will be saved there
    maxlen=100,  # Sequences longer than this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for the validation/test sets.
    dataset='imdb',
    # Parameters for extra options
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worse test error.
                       # This frequently needs a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test examples.
    valid_sync=False,
):
    worker = Worker(control_port=5567)

    # Model options
    model_options = locals().copy()
    print("model options", model_options)

    load_data, prepare_data = get_dataset('imdb')

    print('Loading data')
    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep examples of
        # random sizes, so we must select a random subset.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = numpy.max(train[1]) + 1
    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano tensor shared variable.
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)
    worker.init_shared_params(tparams.values(), param_sync_rule=EASGD(0.5))
    print("Params init done")

    # use_noise is for dropout
    (use_noise, x, mask, y,
     f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    print('Optimization')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    def train_iter():
        while True:
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
            for _, train_index in kf:
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]
                x, mask, y = prepare_data(x, y)
                yield x, mask, y

    train_it = train_iter()
    best_p = None

    # Make sure the worker starts training with the most recent params.
    worker.copy_to_local()

    while True:
        step = worker.send_req('next')
        print(step)

        if step == 'train':
            use_noise.set_value(numpy_floatX(1.))
            for i in xrange(train_len):
                x, mask, y = next(train_it)
                cost = f_grad_shared(x, mask, y)
                f_update(lrate)
            print('Train cost:', cost)

            step = worker.send_req(dict(done=train_len))

            print("Syncing with global params")
            worker.sync_params(synchronous=True)

        """
        if step.startswith('save '):
            _, saveto = step.split(' ', 1)
            print 'Saving...',
            # TODO: fix saving so that it works.
            numpy.savez(saveto, history_errs=history_errs, **s.params)
            pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
            print 'Done'
        """

        if step == 'valid':
            if valid_sync:
                worker.copy_to_local()
            use_noise.set_value(numpy_floatX(0.))
            valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
            test_err = pred_error(f_pred, prepare_data, test, kf_test)

            res = worker.send_req(dict(test_err=float(test_err),
                                       valid_err=float(valid_err)))

            if res == 'best':
                best_p = unzip(tparams)

            print(('Valid ', valid_err, 'Test ', test_err))
            if valid_sync:
                worker.copy_to_local()

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()
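# For reference, the EASGD(0.5) rule used above implements the elastic
# averaging update from the EASGD paper. On each sync_params() call, every
# parameter is updated roughly as below (a standalone sketch of the math,
# not Platoon's actual code):
def easgd_update(local_values, central_values, alpha=0.5):
    """Elastic averaging step: the worker's params and the central params
    are pulled toward each other by a fraction alpha of their difference.

        x_i      <- x_i      - alpha * (x_i - x_center)
        x_center <- x_center + alpha * (x_i - x_center)
    """
    for i, (loc, cen) in enumerate(zip(local_values, central_values)):
        diff = alpha * (loc - cen)
        local_values[i] = loc - diff
        central_values[i] = cen + diff
    return local_values, central_values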
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of GRU units
          encoder='gru',
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 weight decay penalty
          lrate=0.01,
          n_words=100000,  # vocabulary size
          maxlen=100,  # maximum length of the description
          batch_size=16,
          valid_batch_size=16,
          max_grad_norm=5,
          nlayers=1,
          data_path=None,
          use_dropout=False,
          platoon=False,
          name=""):

    # Model options
    model_options = locals().copy()

    print 'Loading data'
    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _ = raw_data

    pprint.pprint(model_options)

    print 'Building model'
    params = init_params(model_options)

    # create shared variables for parameters
    tparams = init_tparams(params)

    if platoon:
        print "PLATOON: Init ...",
        from platoon.channel import Worker
        from platoon.param_sync import ASGD
        worker = Worker(control_port=5567)
        print "DONE"

        print "PLATOON: Initializing shared params ...",
        worker.init_shared_params(tparams.values(), param_sync_rule=ASGD())
        print "DONE"

        worker.send_req({"type": name})

    # build the symbolic computational graph
    trng, use_noise, x, opt_ret, cost, ups = build_model(tparams, model_options)

    inps = [x]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, updates=ups)
    print 'Done'

    # before any regularizer - will be used to compute ppl
    print 'Building f_cost...',
    cost_sum = cost.sum()
    f_cost = theano.function(inps, cost_sum, updates=ups)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer; the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = sgd(lr, tparams, grads, inps, cost,
                                  max_grad_norm)
    print 'Done'

    print 'Optimization'

    history_errs = []
    history_ppls = []
    wpss = []
    best_p = None

    # Training loop
    uidx = 0
    estop = False
    bad_counter = 0
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            tlen = 0
            start_time = time.time()

            for x, y in reader.ptb_iterator(train_data, batch_size, maxlen):
                if platoon:
                    # print "PLATOON: Copying data from master ...",
                    worker.copy_to_local()
                    # print "DONE"

                n_samples += len(x)
                uidx += 1
                use_noise.set_value(1.)
                tlen += (x.shape[0] * x.shape[1])

                # pad batch and create mask
                if x is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()

                # compute cost and grads, and copy grads to shared variables
                cost = f_grad_shared(x)

                # do the update on parameters
                f_update(lrate)

                ud = time.time() - ud_start

                if platoon:
                    # print "PLATOON: Syncing with master ...",
                    worker.sync_params(synchronous=True)
                    # print "DONE"

                # check for bad numbers
                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1.

                # verbose
                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, \
                        'UD ', ud

                # finish after this many updates
                if uidx >= finish_after:
                    print 'Finishing after %d iterations!' % uidx
                    estop = True
                    break

            current_time = time.time()
            wps = int(tlen // (current_time - start_time))
            print "Current wps", wps
            wpss.append(wps)

            print 'Seen %d samples' % n_samples

            if platoon:
                print "PLATOON: Sending wps to controller ...",
                worker.send_req({'wps': wps, 'epoch': eidx})
                print "DONE"

        print "Avg wps, ", numpy.mean(wpss)
        print "Std wps,", numpy.std(wpss)
        use_noise.set_value(0.)
    finally:
        if platoon:
            print "PLATOON: Closing worker ...",
            worker.send_req('done')
            worker.close()
            print "DONE"

    return 0
import numpy as np

import model
from config import get_config
from platoon.channel import Worker
from platoon.param_sync import EASGD

import theano
import theano.tensor as T

config = get_config()

myModel = model.ModelAPI(config)
print "model initialized"

mb_size = 128

worker = Worker(control_port=4222)
device = theano.config.device

platoon_sync_rule = EASGD(0.3)
nb_minibatches_before_sync = 10  # 10 from the EASGD paper

params = myModel.nnet.parameters
for param in params:
    print param.get_value().dtype

worker.init_shared_params(params, param_sync_rule=platoon_sync_rule)

step = worker.send_req('next')
print "training started"
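# The snippet above ends right after the first 'next' request. Below is a
# sketch of the minibatch loop that would typically follow, patterned on the
# other workers in this collection (get_minibatch and myModel.train_on_batch
# are assumed helpers, not part of the original snippet):
while True:
    if step == 'train':
        for i in xrange(nb_minibatches_before_sync):
            x, y = get_minibatch(mb_size)  # hypothetical data helper
            myModel.train_on_batch(x, y)   # assumed model API
        step = worker.send_req(dict(done=nb_minibatches_before_sync))
        # Fold the local updates into the central params (EASGD rule above).
        worker.sync_params(synchronous=True)
    elif step == 'stop':
        break
    else:
        step = worker.send_req('next')

# Release all shared resources.
worker.close()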
def train_convnet(queue_dict, valid_sync=False, verbose=False):

    gpuid = int(queue_dict['device'][-1])
    from lib.train_funcs import set_cpu_affi
    set_cpu_affi(gpuid)

    worker = Worker(control_port=5567)

    # Load model options
    model_options = locals().copy()

    import yaml
    with open('config.yaml', 'r') as f:
        training_config = yaml.load(f)
    name = training_config['name']
    with open(name + '.yaml', 'r') as f:
        model_config = yaml.load(f)
    model_options = dict(model_options.items() + training_config.items() +
                         model_config.items() + queue_dict.items())

    print "model options", model_options

    print 'Loading data'
    from lib.train_funcs import (unpack_configs, proc_configs,
                                 get_rand3d, adjust_learning_rate)
    proc_configs(model_options)
    # Train for this many minibatches when requested.
    train_len = model_options['avg_freq']
    (flag_para_load, flag_top_5, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = \
        unpack_configs(model_options, ext_data='.hkl', ext_label='.npy')
    # train_filenames = train_filenames[:8]
    # val_filenames = val_filenames[:4]

    print 'Building model'
    # shared_x should be created after driver initialization and before
    # drv.mem_get_ipc_handle() is called, otherwise memhandle will be
    # invalid.
    drv = drv_init(queue_dict)

    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    tparams, model, drp = init_params(model_options)

    if model_options['resume_train']:
        load_epoch = model_options['load_epoch']
        load_model(load_epoch, layers, learning_rate, vels,
                   path=model_options['load_path'])

    # Using alpha = 1/N, where N is the number of workers.
    worker.init_shared_params(
        tparams, param_sync_rule=EASGD(1.0 / model_options['size']))
    print "Params init done"

    from lib.googlenet import get_shared_x_y, compile_model, compile_val

    shared_x_list, shared_y = get_shared_x_y(model_options)

    train_model, get_vel, descent_vel, params, vels, vels2, learning_rate = \
        compile_model(model, model_options, shared_x_list, shared_y)

    val_model = compile_val(model, model_options, shared_x_list, shared_y)

    print 'Optimization'

    # parallel data loading
    para_load_init(queue_dict, drv, shared_x_list[0], img_mean)

    para_train_it = p_iter(model_options, shared_y, train_filenames,
                           train_labels, train_model, 'train')
    para_val_it = p_iter(model_options, shared_y, val_filenames,
                         val_labels, val_model, 'val')

    best_p = None

    def print_time(amount, train_time_list, comm_time_list, wait_time_list):
        train, comm, wait = (sum(train_time_list), sum(comm_time_list),
                             sum(wait_time_list))
        print 'time per %d images: %.2f (train %.2f comm %.2f wait %.2f)' % \
            (amount, train + comm + wait, train, comm, wait)
        return train + comm + wait, train, comm, wait

    count = 0
    start_time = None
    import time

    inforec_list = []
    train_error_list = []
    val_error_list = []
    all_time_list = []
    epoch_time_list = []
    lr_list = []
    epoch = 0
    step_idx = 0

    train_time_list = []
    wait_time_list = []
    comm_time_list = []

    while True:
        req_time = time.time()
        step = worker.send_req('next')
        req_time = time.time() - req_time

        if step == 'train':
            if start_time is None:
                start_time = time.time()

            # Sync with the server every train_len iterations.
            for i in xrange(train_len):
                train_time, wait_time, cost, error, _ = next(para_train_it)
                train_time_list.append(train_time)
                wait_time_list.append(wait_time)
                count += 1

                if count % (5120 / model_options['file_batch_size']) == 0:
                    print ''
                    print '%d %.4f %.4f' % (count, cost, error)
                    train_error_list.append([count, cost, error])
                    t_all, t_train, t_comm, t_wait = print_time(
                        5120, train_time_list, comm_time_list, wait_time_list)
                    all_time_list.append([count, t_all, t_train,
                                          t_comm, t_wait])
                    train_time_list = []
                    wait_time_list = []
                    comm_time_list = []

            comm_time = time.time()
            step = worker.send_req(dict(done=train_len))
            if verbose:
                print "Syncing"
            worker.sync_params(synchronous=True)
            comm_time_list.append(time.time() - comm_time + req_time)

        """
        if step.startswith('save '):
            _, saveto = step.split(' ', 1)
            print 'Saving...',
            # TODO: fix saving so that it works.
            numpy.savez(saveto, history_errs=history_errs, **s.params)
            pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
            print 'Done'
        """

        if step == 'valid':
            if valid_sync:
                worker.copy_to_local()

            drp.SetDropoutOff()

            cost_list = []
            error_list = []
            error_top_5_list = []
            for i in xrange(len(val_filenames)):
                _, _, cost, error, error_top_5 = next(para_val_it)
                cost_list.append(cost)
                error_list.append(error)
                error_top_5_list.append(error_top_5)
                print '.',
            print ''

            validation_loss = np.mean(cost_list)
            validation_error = np.mean(error_list)
            validation_error_top5 = np.mean(error_top_5_list)
            print 'validation cost:%.4f' % validation_loss
            print 'validation error:%.4f' % validation_error
            print 'validation top_5_error:%.4f' % validation_error_top5
            val_error_list.append([count, validation_loss,
                                   validation_error, validation_error_top5])

            drp.SetDropoutOn()

            res = worker.send_req(dict(test_err=float(validation_error),
                                       valid_err=float(validation_error)))

            if res == 'best':
                best_p = unzip(tparams)

            if valid_sync:
                worker.copy_to_local()

            # Get the total number of iterations processed by all workers.
            uidx = worker.send_req('uidx')
            uepoch = int(uidx / len(train_filenames))

            if model.name == 'alexnet':
                if model_options['lr_policy'] == 'step':
                    if uepoch >= 20 and uepoch < 40 and step_idx == 0:
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 1
                    elif uepoch >= 40 and uepoch < 60 and step_idx == 1:
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 2
                    elif uepoch >= 60 and uepoch < 70 and step_idx == 2:
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 3
                    else:
                        pass

                if model_options['lr_policy'] == 'auto':
                    if uepoch > 5 and (
                            val_error_list[-3][2] - val_error_list[-1][2] <
                            model_options['lr_adapt_threshold']):
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))

                lr = learning_rate.get_value()
                lr = np.float32(lr)

            elif model.name == 'googlenet':
                # Poly lr policy according to
                # https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet
                max_iter = len(train_filenames) * 240
                lr = learning_rate.get_value() * \
                    pow((1. - 1. * uepoch * len(train_filenames) / max_iter),
                        0.5)
                lr = np.float32(lr)
                learning_rate.set_value(lr)
            else:
                raise NotImplementedError

            print 'Learning rate now:', lr
            lr_list.append(lr)

            if start_time is not None:
                epoch_time_list.append([count, time.time() - start_time])
                epoch = int(count / len(train_filenames))
                print 'epoch %d time %.2fh, global epoch is %d' % (
                    epoch, epoch_time_list[-1][1] / 3600.0, uepoch)

            inforec_list = [train_error_list,
                            val_error_list,
                            all_time_list,
                            epoch_time_list,
                            lr_list]

            import pickle
            filepath = '../run/inforec/inforec_%s.pkl' % queue_dict['device']
            with open(filepath, 'wb') as f:
                pickle.dump(inforec_list, f, protocol=pickle.HIGHEST_PROTOCOL)

            start_time = None

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()