def create_model(self):
    hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 32]
    first_input = C.ops.splice(self._input, self._target)
    first_input_size = first_input.shape
    first_input = C.ops.reshape(
        first_input, (first_input_size[0], 1, first_input_size[1]))
    model = C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                                   reduction_rank=1,
                                   activation=C.ops.tanh)(first_input)
    print(model)
    for h in hidden_layers:
        input_new = C.ops.splice(model, first_input, axis=0)
        model = C.layers.Convolution2D((1, 3), num_filters=h, pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh)(input_new)
        print(model)
    ######
    #model = C.ops.splice(model, self._target)
    # Dense layers
    direction = C.layers.Sequential([
        C.layers.Dense(720, activation=C.ops.relu),
        C.layers.Dense(360, activation=None)
    ])(model)
    velocity = C.layers.Sequential([
        C.layers.Dense(128, activation=C.ops.relu),
        C.layers.Dense(64, activation=None),
        C.layers.Dense(1, activation=None)
    ])(model)
    model = C.ops.splice(direction, velocity)
    if self._load_model:
        model = C.load_model(self._file_name)
        direction = model[0:360]
        velocity = model[360]
    C.logging.log_number_of_parameters(model)
    print(model)
    #loss = C.squared_error(direction, self._output) + C.squared_error(velocity, self._output_velocity)
    #error = C.squared_error(direction, self._output) + C.squared_error(velocity, self._output_velocity)
    loss = C.cross_entropy_with_softmax(
        direction, self._output) + C.squared_error(velocity,
                                                   self._output_velocity)
    error = C.classification_error(direction, self._output) + C.squared_error(
        velocity, self._output_velocity)
    learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
    progress_printer = C.logging.ProgressPrinter(tag='Training')
    trainer = C.Trainer(model, (loss, error), learner, progress_printer)
    return model, loss, learner, trainer
def train():
    model = Model()
    z, loss, acc = model.model()
    progress_writers = [
        C.logging.ProgressPrinter(num_epochs=max_epochs,
                                  freq=log_freq,
                                  tag='Training',
                                  log_to_file='log/log_' + version)
    ]
    lr = C.learning_parameter_schedule(learning_rate,
                                       minibatch_size=None,
                                       epoch_size=None)
    learner = C.adadelta(z.parameters, lr)
    trainer = C.Trainer(z, (loss, acc), learner, progress_writers)
    mb_source, input_map = deserialize(loss, train_data, model)
    mb_valid, valid_map = deserialize(loss, valid_data, model)
    try:
        trainer.restore_from_checkpoint('../model/' + version)
    except Exception:
        print('No checkpoint.')
    for epoch in range(max_epochs):
        # train
        num_seq = 0
        with tqdm(total=epoch_size, ncols=79) as progress_bar:
            while True:
                data = mb_source.next_minibatch(minibatch_size,
                                                input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                progress_bar.update(trainer.previous_minibatch_sample_count)
                if num_seq >= epoch_size:
                    break
            trainer.summarize_training_progress()
            trainer.save_checkpoint('../model/' + version + '/' + str(epoch))
        # validation
        num_seq = 0
        with tqdm(total=num_validation, ncols=79) as valid_progress_bar:
            while True:
                data = mb_valid.next_minibatch(minibatch_size,
                                               input_map=valid_map)
                if not data:
                    break
                trainer.test_minibatch(data)
                num_seq += len(data)
                valid_progress_bar.update(len(data))
                if num_seq >= num_validation:
                    break
            trainer.summarize_test_progress()
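# The deserialize() helper used above is not shown in this snippet. A
# typical CNTK implementation wraps a CTFDeserializer in a MinibatchSource
# and maps its streams onto the model's argument variables. The sketch
# below is an assumption for illustration only: the name
# deserialize_sketch, the stream fields 'x'/'y', and the dimensions are
# not from the original code.
import cntk as C

def deserialize_sketch(path, input_dim, label_dim, x, y):
    mb_source = C.io.MinibatchSource(
        C.io.CTFDeserializer(
            path,
            C.io.StreamDefs(
                features=C.io.StreamDef(field='x', shape=input_dim),
                labels=C.io.StreamDef(field='y', shape=label_dim))),
        randomize=True)
    # map the reader streams to the network's input variables
    input_map = {x: mb_source.streams.features,
                 y: mb_source.streams.labels}
    return mb_source, input_map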
def create_model(self):
    hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8]
    #first_input = C.ops.reshape(
    #    C.ops.splice(self._input, self._target),
    #    (1, self._input_size[0] * 2, self._input_size[1]))
    #print(first_input)
    first_input = C.ops.reshape(
        self._input, (1, self._input_size[0], self._input_size[1]))
    model = C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                                   reduction_rank=1,
                                   activation=C.ops.tanh)(first_input)
    for h in hidden_layers:
        input_new = C.ops.splice(model, first_input)
        model = C.layers.Convolution2D((1, 3), num_filters=h, pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh)(input_new)
    model = C.ops.splice(model, self._target)
    ######
    # Dense layers
    direction = C.layers.Sequential([
        C.layers.Dense(256, activation=C.ops.relu),
        C.layers.Dense(128, activation=C.ops.relu),
        C.layers.Dense(64, activation=C.ops.relu),
        C.layers.Dense(32, activation=None),
    ])(model)
    velocity = C.layers.Sequential([
        C.layers.Dense(128, activation=C.ops.relu),
        C.layers.Dense(64, activation=None),
        C.layers.Dense(1, activation=None)
    ])(model)
    model = C.ops.splice(direction, velocity)
    print(model)
    loss = C.cross_entropy_with_softmax(
        direction, self._output) + C.squared_error(velocity,
                                                   self._output_velocity)
    error = C.classification_error(direction, self._output) + C.squared_error(
        velocity, self._output_velocity)
    learner = C.adadelta(model.parameters)
    progress_printer = C.logging.ProgressPrinter(tag='Training', freq=50)
    trainer = C.Trainer(model, (loss, error), learner, progress_printer)
    return model, loss, learner, trainer
def create_model(self):
    hidden_layers = self._hidden_layers
    with C.layers.default_options(init=C.layers.glorot_uniform(),
                                  activation=C.ops.relu):
        h = self._input
        for i in range(self._num_hidden_layers):
            h = C.layers.Dense(hidden_layers[i])(h)
        model = C.layers.Dense(self._output_size, activation=None)(h)
    loss = C.reduce_mean(C.square(model - self._output), axis=0)
    meas = C.reduce_mean(C.square(model - self._output), axis=0)
    learner = C.adadelta(model.parameters, self._lr_schedule)
    trainer = C.Trainer(model, (loss, meas), learner)
    return model, loss, learner, trainer
def create_model(self):
    hidden_layers = self._hidden_layers
    with cntk.layers.default_options(init=cntk.layers.glorot_uniform(),
                                     activation=cntk.ops.relu):
        h = self._input
        for i in range(self._num_hidden_layers):
            h = cntk.layers.Dense(hidden_layers[i],
                                  activation=cntk.ops.relu)(h)
        model = cntk.layers.Dense(self._output_size, activation=None)(h)
    loss = cntk.reduce_mean(cntk.square(model - self._output), axis=0)
    meas = cntk.reduce_mean(cntk.square(model - self._output), axis=0)
    learner = cntk.adadelta(model.parameters, self._lr_schedule,
                            l2_regularization_weight=0.01)
    trainer = cntk.Trainer(model, (loss, meas), learner)
    return model, loss, learner, trainer
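# Both Dense-stack variants above follow the same pattern: build the
# network, define a squared-error criterion, and hand the parameters to
# adadelta. A minimal self-contained sketch of that pattern with made-up
# sizes and synthetic data; only the C.adadelta usage mirrors the
# snippets above.
import numpy as np
import cntk as C

x = C.input_variable(4)
y = C.input_variable(2)
with C.layers.default_options(init=C.layers.glorot_uniform(),
                              activation=C.ops.relu):
    h = C.layers.Dense(8)(x)
    model = C.layers.Dense(2, activation=None)(h)
loss = C.reduce_mean(C.square(model - y), axis=0)
learner = C.adadelta(model.parameters, C.learning_parameter_schedule(1.0))
trainer = C.Trainer(model, (loss, loss), learner)
# one training step on random data
trainer.train_minibatch({x: np.random.rand(16, 4).astype(np.float32),
                         y: np.random.rand(16, 2).astype(np.float32)})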
def create_model(self):
    hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8]
    first_input = C.ops.reshape(
        C.ops.splice(self._input, self._target),
        (1, self._input_size[0] * 2, self._input_size[1]))
    print(first_input)
    model = C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                                   reduction_rank=1,
                                   activation=C.ops.tanh)(first_input)
    print(model)
    for h in hidden_layers:
        input_new = C.ops.splice(model, first_input)
        model = C.layers.Convolution2D((1, 3), num_filters=h, pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh)(input_new)
        print(model)
    ######
    # Dense layers
    direction = C.layers.Sequential([
        C.layers.Dense(256, activation=C.ops.relu),
        C.layers.Dense(128, activation=C.ops.relu),
        C.layers.Dense(64, activation=C.ops.relu),
        C.layers.Dense(32, activation=None),
    ])(model)
    velocity = C.layers.Sequential([
        C.layers.Dense(128, activation=C.ops.relu),
        C.layers.Dense(64, activation=None),
        C.layers.Dense(1, activation=None)
    ])(model)
    model = C.ops.splice(direction, velocity)
    if self._load_model:
        model = C.load_model('dnns/GRP_f.dnn')
        direction = model[0:32]
        velocity = model[32]
    print(model)
    loss = C.cross_entropy_with_softmax(direction, self._output) \
        + C.squared_error(velocity, self._output_velocity)
    error = C.classification_error(direction, self._output) \
        + C.squared_error(velocity, self._output_velocity)
    learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
    progress_printer = C.logging.ProgressPrinter(tag='Training')
    trainer = C.Trainer(model, (loss, error), learner, progress_printer)
    return model, loss, learner, trainer
def create_model(self):
    hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1]
    first_input = C.ops.reshape(
        C.ops.splice(self._input, self._target),
        (2, self._input_size[0], self._input_size[1]))
    print(first_input)
    model = C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                                   reduction_rank=1,
                                   activation=C.ops.tanh)(first_input)
    #print(model)
    for h in hidden_layers:
        input_new = C.ops.splice(model, first_input, axis=0)
        #print(input_new)
        model = C.layers.Convolution2D((1, 3), num_filters=h, pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh)(input_new)
        #print(model)
    model = C.ops.reshape(model, (1, 360))
    #print(model)
    #C.logging.log_number_of_parameters(model)
    if self._load_model:
        model = C.load_model('dnns/feature_predicter_GRP.dnn')
        #print(model)
    err = C.ops.reshape(C.ops.minus(model, self._output),
                        (self._output_size))
    sq_err = C.ops.square(err)
    mse = C.ops.reduce_mean(sq_err)
    rmse_loss = C.ops.sqrt(mse)
    rmse_eval = rmse_loss
    learner = C.adadelta(model.parameters)
    progress_printer = C.logging.ProgressPrinter(tag='Training')
    trainer = C.Trainer(model, (rmse_loss, rmse_eval), learner,
                        progress_printer)
    return model, rmse_loss, learner, trainer
def create_model(self):
    hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8]
    first_input = C.ops.splice(self._input, self._target)
    i_size = first_input.shape
    first_input = C.ops.reshape(first_input, (i_size[0], 1, i_size[1]))
    model = C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                                   reduction_rank=1,
                                   activation=C.ops.tanh)(first_input)
    print(model)
    for i, h in enumerate(hidden_layers):
        input_new = C.ops.splice(model, first_input, axis=0)
        model = C.layers.Convolution2D((1, 3), num_filters=h, pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh,
                                       name='c_{}'.format(h))(input_new)
        print(model)
    model = C.layers.BatchNormalization()(model)
    model = C.layers.Dropout(0.1)(model)
    model = C.ops.splice(model, self._target)
    direction = C.layers.Sequential([
        C.layers.Recurrence(C.layers.LSTM(720)),
        C.layers.Dense(360, activation=None)
    ])(model)
    velocity = C.layers.Sequential([
        C.layers.Recurrence(C.layers.LSTM(128)),
        C.layers.Dense(64, activation=C.ops.tanh),
        C.layers.Dense(1, activation=None)
    ])(model)
    model = C.ops.splice(direction, velocity)
    if self._load_model:
        model = C.load_model(self._file_name)
        direction = model[0:360]
        velocity = model[360]
    C.logging.log_number_of_parameters(model)
    print(model)
    #loss = C.cross_entropy_with_softmax(direction, self._output) + C.squared_error(velocity, self._output_velocity) + C.ops.relu(1 - C.ops.log(C.reduce_min(self._input) + 1 - self._safe_dist))
    loss = C.cross_entropy_with_softmax(direction, self._output) \
        + C.squared_error(velocity, self._output_velocity)
    error = C.classification_error(direction, self._output) \
        + C.squared_error(velocity, self._output_velocity)
    learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
    progress_printer = C.logging.ProgressPrinter(tag='Training', freq=20)
    trainer = C.Trainer(model, (loss, error), learner, progress_printer)
    return model, loss, learner, trainer
def create_model(self):
    hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 32]
    first_input = C.ops.splice(self._input, self._target)
    first_input_size = first_input.shape
    first_input = C.ops.reshape(
        first_input, (first_input_size[0], 1, first_input_size[1]))
    model = C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                                   reduction_rank=1,
                                   activation=C.ops.tanh)(first_input)
    for i, h in enumerate(hidden_layers):
        #if i > 0:
        #    input_new = C.ops.splice(model, input_new, axis=0)
        #else:
        input_new = C.ops.splice(model, first_input, axis=0)
        model1 = C.layers.Convolution2D((1, 3), num_filters=int(0.5 * h),
                                        pad=True, reduction_rank=1,
                                        activation=C.ops.tanh)(input_new)
        model2 = C.layers.Convolution2D((1, 5), num_filters=int(0.25 * h),
                                        pad=True, reduction_rank=1,
                                        activation=C.ops.tanh)(input_new)
        model3 = C.layers.Convolution2D((1, 1), num_filters=int(0.25 * h),
                                        pad=True, reduction_rank=1,
                                        activation=C.ops.tanh)(input_new)
        model = C.ops.splice(model1, model2, model3, axis=0)
        print(model)
    """
    model = C.layers.Sequential([
        # Convolution layers
        C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                               reduction_rank=1, activation=C.ops.tanh,
                               name='conv_a'),
        C.layers.Convolution2D((1, 3), num_filters=16, pad=True,
                               reduction_rank=1, activation=C.ops.tanh,
                               name='conv2_a'),
        C.layers.Convolution2D((1, 3), num_filters=32, pad=False,
                               reduction_rank=1, activation=C.ops.tanh,
                               name='conv3_a'),
        ######
        # Dense layers
        C.layers.Dense(1024, activation=C.ops.relu, name='dense5_a'),
        # Recurrence
        #C.layers.Recurrence(C.layers.LSTM(2048, init=C.glorot_uniform()), name='lstm_a'),
        C.layers.Dense(1024, activation=None)
    ])(model)
    ######
    """
    # Prediction
    direction = C.layers.Sequential([
        C.layers.Dense(720, activation=C.ops.relu, name='dense6_a'),
        C.layers.Dense(360, activation=None, name='dense7_a')
    ])(model)
    velocity = C.layers.Sequential([
        C.layers.Dense(128, activation=C.ops.relu),
        C.layers.Dense(64, activation=None),
        C.layers.Dense(1, activation=None)
    ])(model)
    model = C.ops.splice(direction, velocity)
    #model = velocity
    if self._load_model:
        model = C.load_model(self._file_name)
        direction = model[0:360]
        velocity = model[360]
    C.logging.log_number_of_parameters(model)
    print(model)
    #loss = C.squared_error(direction, self._output) + C.squared_error(velocity, self._output_velocity)
    #error = C.classification_error(direction, self._output) + C.squared_error(velocity, self._output_velocity)
    loss = C.cross_entropy_with_softmax(
        direction, self._output) + C.squared_error(velocity,
                                                   self._output_velocity)
    error = C.classification_error(direction, self._output) + C.squared_error(
        velocity, self._output_velocity)
    learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
    progress_printer = C.logging.ProgressPrinter(tag='Training')
    trainer = C.Trainer(model, (loss, error), learner, progress_printer)
    return model, loss, learner, trainer
def train(data_path, model_path, log_file, config_file,
          restore=False, profiling=False, gen_heartbeat=False):
    training_config = importlib.import_module(config_file).training_config
    # config for using multiple GPUs
    if training_config['multi_gpu']:
        gpu_pad = training_config['gpu_pad']
        gpu_cnt = training_config['gpu_cnt']
        my_rank = C.Communicator.rank()
        my_gpu_id = (my_rank + gpu_pad) % gpu_cnt
        print("rank = " + str(my_rank) + ", using gpu " + str(my_gpu_id) +
              " of " + str(gpu_cnt))
        C.try_set_default_device(C.gpu(my_gpu_id))
    else:
        C.try_set_default_device(C.gpu(0))
    # outputs while training
    normal_log = os.path.join(data_path, training_config['logdir'], log_file)
    # tensorboard files' dir
    tensorboard_logdir = os.path.join(data_path, training_config['logdir'],
                                      log_file)
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']
    progress_writers = [
        C.logging.ProgressPrinter(num_epochs=max_epochs,
                                  freq=log_freq,
                                  tag='Training',
                                  log_to_file=normal_log,
                                  rank=C.Communicator.rank(),
                                  gen_heartbeat=gen_heartbeat)
    ]
    # add tensorboard writer for visualization
    tensorboard_writer = C.logging.TensorBoardProgressWriter(
        freq=10, log_dir=tensorboard_logdir, rank=C.Communicator.rank(),
        model=z)
    progress_writers.append(tensorboard_writer)
    lr = C.learning_parameter_schedule(training_config['lr'],
                                       minibatch_size=None, epoch_size=None)
    ema = {}
    dummies_info = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype,
                           name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, p)))
        dummies_info[dummies[-1].output] = (p.name, p.shape)
    dummy = C.combine(dummies)
    learner = C.adadelta(z.parameters, lr)
    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)
    trainer = C.Trainer(z, (loss, None), learner, progress_writers)
    if profiling:
        C.debugging.start_profiler(sync_gpu=True)
    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()
    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')
    epoch_stat = {
        'best_val_err': 100,
        'best_since': 0,
        'val_since': 0,
        'record_num': 0
    }
    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        # after restore always re-evaluate
        epoch_stat['best_val_err'] = validate_model(
            os.path.join(data_path, training_config['val_data']),
            model, polymath, config_file)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1
        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(
                os.path.join(data_path, training_config['val_data']),
                model, polymath, config_file)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                os.system("ls -la >> log.log")
                os.system("ls -la ./Models >> log.log")
                save_flag = True
                fail_cnt = 0
                while save_flag:
                    if fail_cnt > 100:
                        print("ERROR: failed to save models")
                        break
                    try:
                        trainer.save_checkpoint(model_file)
                        epoch_stat['record_num'] += 1
                        record_file = os.path.join(
                            model_path,
                            str(epoch_stat['record_num']) + '-' + model_name)
                        trainer.save_checkpoint(record_file)
                        save_flag = False
                    except Exception:
                        fail_cnt = fail_cnt + 1
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False
        if profiling:
            C.debugging.enable_profiler()
        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file,
                                                 polymath)
        minibatch_size = training_config['minibatch_size']  # number of samples
        epoch_size = training_config['epoch_size']
        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config[
                        'distributed_after']:
                    data = mb_source.next_minibatch(
                        minibatch_size * C.Communicator.num_workers(),
                        input_map=input_map,
                        num_data_partitions=C.Communicator.num_workers(),
                        partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size,
                                                    input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                # print_para_info(dummy, dummies_info)
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")
        minibatch_seqs = training_config['minibatch_seqs']  # number of sequences
        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs,
                                           C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count %
                        C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break
    if profiling:
        C.debugging.stop_profiler()
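# The ema/dummies bookkeeping above keeps a shadow copy of every parameter
# through C.assign nodes; the assignments only fire when the combined
# dummy node is evaluated. A stripped-down sketch of the pattern
# (make_ema_updater is a hypothetical name; this snippet above copies the
# raw parameter, while the decayed 0.999/0.001 update appears in a later
# snippet):
import cntk as C

def make_ema_updater(parameters, decay=0.999):
    ema = {}
    dummies = []
    for p in parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype,
                           name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        # reduce_sum gives each assign node a scalar output to combine
        dummies.append(C.reduce_sum(
            C.assign(ema_p, decay * ema_p + (1 - decay) * p)))
    # evaluate the returned node once per minibatch to refresh the EMA
    return ema, C.combine(dummies)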
    ((0.2, 0), [0.2, 0.2, 0.2, 0.2], 0),
    (([0.2, 0.4], 0, 5), [0.2] * 5 + [0.4] * 20, 0),
    (([(3, 0.2), (2, 0.4), (1, 0.8)], 0, 5),
     [0.2] * 15 + [0.4] * 10 + [0.8] * 20, 0),
]

MOMENTUM_SCHEDULE_PARAMS = [
    ((0.2, ), [0.2]),
    ((0.2, ), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4), (1, 0.8)], 5),
     [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params),
    lambda params: C.adagrad(
        params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params,
                          lr=learning_rate_schedule(1, UnitType.minibatch),
                          momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params,
                               lr=learning_rate_schedule(1, UnitType.minibatch),
                               momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params,
                              lr=learning_rate_schedule(1, UnitType.minibatch),
                              momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params,
                             lr=learning_rate_schedule(1, UnitType.minibatch),
                             gamma=0.1,
                             inc=3.0,
def test_learner_init():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))
    res = i * w

    # test new API: learning_parameter_schedule

    # explicitly specify the reference minibatch size; the learning rate is a plain number:
    learner = sgd(res.parameters, lr=0.1, minibatch_size=25)
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == 25  # the learner's reference minibatch
    # with the learning rate given as a number, the schedule takes its
    # reference minibatch size from the learner:
    assert learner._learning_rate_schedule.minibatch_size == 25
    assert learner.learning_rate() == 0.1

    # no explicit reference minibatch size:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1))
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20),
                  minibatch_size=25)
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == 25  # the learner's reference minibatch
    # the schedule's own reference minibatch size takes precedence:
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20))
    assert learner.is_compatible_mode() == False
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    # no explicit reference minibatch size:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1))
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    # minibatch_size=C.learners.IGNORE turns on compatible mode:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    mysgd = C.sgd(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert mysgd.minibatch_size == 32
    assert mysgd._learning_rate_schedule.minibatch_size == 32
    assert mysgd.learning_rate() == 0.4

    mymomentum = C.momentum_sgd(parameters=res.parameters, lr=0.4,
                                momentum=0.9, minibatch_size=32)
    assert mymomentum.minibatch_size == 32
    assert mymomentum._learning_rate_schedule.minibatch_size == 32
    assert mymomentum.learning_rate() == 0.4

    myadadelta = C.adadelta(parameters=res.parameters, lr=0.4,
                            minibatch_size=32)
    assert myadadelta.minibatch_size == 32
    assert myadadelta._learning_rate_schedule.minibatch_size == 32
    assert myadadelta.learning_rate() == 0.4

    myadam = C.adam(parameters=res.parameters, lr=0.4, momentum=0.9,
                    variance_momentum=0.9, minibatch_size=32)
    assert myadam.minibatch_size == 32
    assert myadam._learning_rate_schedule.minibatch_size == 32
    assert myadam.learning_rate() == 0.4

    myadagrad = C.adagrad(parameters=res.parameters, lr=0.4,
                          minibatch_size=32)
    assert myadagrad.minibatch_size == 32
    assert myadagrad._learning_rate_schedule.minibatch_size == 32
    assert myadagrad.learning_rate() == 0.4

    myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=0.4,
                              momentum=0.9, variance_momentum=0.9,
                              minibatch_size=32)
    assert myfsadagrad.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule.minibatch_size == 32
    assert myfsadagrad.learning_rate() == 0.4

    mynesterov = C.nesterov(parameters=res.parameters, lr=0.4, momentum=0.9,
                            minibatch_size=32)
    assert mynesterov.minibatch_size == 32
    assert mynesterov._learning_rate_schedule.minibatch_size == 32
    assert mynesterov.learning_rate() == 0.4

    myrmsprop = C.rmsprop(parameters=res.parameters, lr=0.4, gamma=0.5,
                          inc=1.2, dec=0.7, max=10, min=1e-8,
                          minibatch_size=32)
    assert myrmsprop.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule.minibatch_size == 32
    assert myrmsprop.learning_rate() == 0.4

    mysgd = C.sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001],
                  minibatch_size=32, epoch_size=512)
    assert mysgd.minibatch_size == 32
    assert mysgd._learning_rate_schedule.minibatch_size == 32
    assert mysgd._learning_rate_schedule[0] == 0.4
    assert mysgd._learning_rate_schedule[512] == 0.1
    assert mysgd._learning_rate_schedule[512 * 2] == 0.001

    mymomentum = C.momentum_sgd(parameters=res.parameters,
                                lr=[0.4, 0.1, 0.001], momentum=[0.9],
                                minibatch_size=32, epoch_size=512)
    assert mymomentum.minibatch_size == 32
    assert mymomentum._learning_rate_schedule.minibatch_size == 32
    assert mymomentum._learning_rate_schedule[0] == 0.4
    assert mymomentum._learning_rate_schedule[512] == 0.1
    assert mymomentum._learning_rate_schedule[512 * 2] == 0.001

    myadadelta = C.adadelta(parameters=res.parameters, lr=[0.4, 0.1, 0.001],
                            minibatch_size=32, epoch_size=512)
    assert myadadelta.minibatch_size == 32
    assert myadadelta._learning_rate_schedule.minibatch_size == 32
    assert myadadelta._learning_rate_schedule[0] == 0.4
    assert myadadelta._learning_rate_schedule[512] == 0.1
    assert myadadelta._learning_rate_schedule[512 * 2] == 0.001

    myadam = C.adam(parameters=res.parameters, lr=[0.4, 0.1, 0.001],
                    momentum=[0.9, 0.1, 0.001], variance_momentum=[0.9],
                    minibatch_size=32, epoch_size=512)
    assert myadam.minibatch_size == 32
    assert myadam._learning_rate_schedule.minibatch_size == 32
    assert myadam._learning_rate_schedule[0] == 0.4
    assert myadam._learning_rate_schedule[512] == 0.1
    assert myadam._learning_rate_schedule[512 * 2] == 0.001

    myadagrad = C.adagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001],
                          minibatch_size=32, epoch_size=512)
    assert myadagrad.minibatch_size == 32
    assert myadagrad._learning_rate_schedule.minibatch_size == 32
    assert myadagrad._learning_rate_schedule[0] == 0.4
    assert myadagrad._learning_rate_schedule[512] == 0.1
    assert myadagrad._learning_rate_schedule[512 * 2] == 0.001

    myfsadagrad = C.fsadagrad(parameters=res.parameters,
                              lr=[0.4, 0.1, 0.001], momentum=[0.9],
                              variance_momentum=[0.9],
                              minibatch_size=32, epoch_size=512)
    assert myfsadagrad.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule[0] == 0.4
    assert myfsadagrad._learning_rate_schedule[512] == 0.1
    assert myfsadagrad._learning_rate_schedule[512 * 2] == 0.001

    mynesterov = C.nesterov(parameters=res.parameters, lr=[0.4, 0.1, 0.001],
                            momentum=[0.9], minibatch_size=32,
                            epoch_size=512)
    assert mynesterov.minibatch_size == 32
    assert mynesterov._learning_rate_schedule.minibatch_size == 32
    assert mynesterov._learning_rate_schedule[0] == 0.4
    assert mynesterov._learning_rate_schedule[512] == 0.1
    assert mynesterov._learning_rate_schedule[512 * 2] == 0.001

    myrmsprop = C.rmsprop(parameters=res.parameters, lr=[0.4, 0.1, 0.001],
                          gamma=0.5, inc=1.2, dec=0.7, max=10, min=1e-8,
                          minibatch_size=32, epoch_size=512)
    assert myrmsprop.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule[0] == 0.4
    assert myrmsprop._learning_rate_schedule[512] == 0.1
    assert myrmsprop._learning_rate_schedule[512 * 2] == 0.001

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    momentum = C.momentum_schedule(0.999, minibatch_size=1)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum,
                   unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size=1)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum)
    C.nesterov(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum,
               unit_gain=unit_gain_value)

    lr_per_sample = learning_parameter_schedule([0.1] * 3 + [0.2] * 2 + [0.3],
                                                minibatch_size=1)
    C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True)

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_parameter_schedule([(3, 0.1), (2, 0.2), (1, 0.3)],
                                                minibatch_size=1)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum)
    C.fsadagrad(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum,
                unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size=1,
                                                epoch_size=100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.adadelta(res.parameters, lr_per_sample)
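# The schedule asserts in the test above hinge on how epoch_size indexes
# a schedule: entry i applies to sample counts in
# [i * epoch_size, (i + 1) * epoch_size). A compact restatement with the
# same numbers the test uses:
import cntk as C

lr = C.learning_parameter_schedule([0.4, 0.1, 0.001],
                                   minibatch_size=32, epoch_size=512)
assert lr[0] == 0.4          # samples 0..511
assert lr[512] == 0.1        # samples 512..1023
assert lr[512 * 2] == 0.001  # samples 1024 onwards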
    # not specifying reference mb sizes
    ((0.2, 0), [0.2], 0),
    ((0.2, 0), [0.2, 0.2, 0.2, 0.2], 0),
    (([0.2, 0.4], 0, 5), [0.2] * 5 + [0.4] * 20, 0),
    (([(3, 0.2), (2, 0.4), (1, 0.8)], 0, 5),
     [0.2] * 15 + [0.4] * 10 + [0.8] * 20, 0),
]

MOMENTUM_SCHEDULE_PARAMS = [
    ((0.2,), [0.2]),
    ((0.2,), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4), (1, 0.8)], 5),
     [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params),
    lambda params: C.adagrad(
        params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params,
                          lr=learning_rate_schedule(1, UnitType.minibatch),
                          momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params,
                               lr=learning_rate_schedule(1, UnitType.minibatch),
                               momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params,
                              lr=learning_rate_schedule(1, UnitType.minibatch),
                              momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params,
                             lr=learning_rate_schedule(1, UnitType.minibatch),
                             gamma=0.1, inc=3.0, dec=0.1, max=np.inf,
                             min=1e-8),
    lambda params: C.sgd(params,
                         lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.momentum_sgd(params,
                                  lr=learning_rate_schedule(1, UnitType.minibatch),
                                  momentum=C.momentum_schedule(0.9))
]


@pytest.mark.parametrize("params, expectation, minibatch_size",
                         LR_SCHEDULE_PARAMS_LEGACY)
def test_learning_rate_schedule(params, expectation, minibatch_size):
    l = learning_rate_schedule(*params)
    assert l.minibatch_size == minibatch_size
    assert [l[i] for i in range(len(expectation))] == expectation


@pytest.mark.parametrize("params, expectation, minibatch_size",
                         LR_SCHEDULE_PARAMS)
def test_learner_init():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))
    res = i * w

    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.sample))
    assert learner.learning_rate() == 0.1

    learner.reset_learning_rate(
        learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant,
                   unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant,
                   unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    lr_per_sample = learning_rate_schedule([0.1, 0.2], UnitType.sample)
    C.nesterov(res.parameters, lr=lr_per_sample,
               momentum=momentum_time_constant)
    C.nesterov(res.parameters, lr_per_sample, momentum_time_constant,
               unit_gain_value)
    C.nesterov(res.parameters, lr=lr_per_sample,
               momentum=momentum_time_constant, unit_gain=unit_gain_value)

    lr_per_sample = learning_rate_schedule([0.1] * 3 + [0.2] * 2 + [0.3],
                                           UnitType.sample)
    C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True)

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_rate_schedule([(3, 0.1), (2, 0.2), (1, 0.3)],
                                           UnitType.sample)
    C.fsadagrad(res.parameters, lr=lr_per_sample,
                momentum=momentum_time_constant)
    C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant,
                unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample,
                momentum=momentum_time_constant, unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.1] * 5
    lr_per_sample = learning_rate_schedule([0.1, 0.2], UnitType.sample, 100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.set_default_use_mean_gradient_value(False)
    use_mean_gradient_value = C.default_use_mean_gradient_value()
    assert not use_mean_gradient_value
    C.adadelta(res.parameters, lr_per_sample)

    C.set_default_use_mean_gradient_value(True)
    use_mean_gradient_value = C.default_use_mean_gradient_value()
    assert use_mean_gradient_value
    C.adadelta(res.parameters, lr_per_sample)
    ((0.2, UnitType.sample), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], UnitType.sample, 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4), (1, 0.8)], UnitType.sample, 5),
     [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

MOMENTUM_SCHEDULE_PARAMS = [
    ((0.2, ), [0.2]),
    ((0.2, ), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4), (1, 0.8)], 5),
     [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params),
    lambda params: C.adagrad(
        params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params,
                          lr=learning_rate_schedule(1, UnitType.minibatch),
                          momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params,
                               lr=learning_rate_schedule(1, UnitType.minibatch),
                               momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params,
                              lr=learning_rate_schedule(1, UnitType.minibatch),
                              momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params,
                             lr=learning_rate_schedule(1, UnitType.minibatch),
                             gamma=0.1,
                             inc=3.0,
def create_learner(paras, lr):
    import cntk as C
    # adadelta with rho=0.95 and epsilon=1e-6 (positional), plus L2
    # regularization and per-sample gradient clipping
    return C.adadelta(paras, lr, 0.95, 1e-6,
                      l2_regularization_weight=1.0,
                      gaussian_noise_injection_std_dev=0,
                      gradient_clipping_threshold_per_sample=3.0)
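# create_learner only builds the adadelta learner; presumably the caller
# wires it into a Trainer. A toy usage sketch (the model, loss, and
# schedule here are assumptions, not part of the original):
import cntk as C

x = C.input_variable(2)
y = C.input_variable(1)
z = C.layers.Dense(1)(x)
loss = C.squared_error(z, y)
learner = create_learner(z.parameters, C.learning_parameter_schedule(1.0))
trainer = C.Trainer(z, (loss, loss), learner)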
def test_learner_init_legacy():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))
    res = i * w

    # for backcompatibility test
    # this will be deprecated in a future version
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.sample))
    # the deprecated per-sample schedule should not use compatible mode
    assert learner._learning_rate_schedule.minibatch_size == 1
    assert learner.learning_rate() == 0.1

    # for backcompatibility test
    # the UnitType provides the per-minibatch instruction for the learner
    # this will be deprecated in a future version
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.minibatch))
    assert learner.is_compatible_mode() == False
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == 0

    # for backcompatibility test: on reset_learning_rate the learner does
    # not take the reference minibatch size from the schedule; the user
    # must set it explicitly. This will be deprecated in a future version.
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(
        learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    # for backcompatibility test
    # this will be deprecated in a future version
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.sample),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    # the learner's reference minibatch size is still 0
    assert learner.minibatch_size == C.learners.IGNORE

    # this will be deprecated in a future version: a logically invalid
    # combination, but it was the only way to use mean gradient and set a
    # learning rate in the past.
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.sample),
                  use_mean_gradient=True)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    # test the override in the new version
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    # the learner's reference minibatch size is still 0
    assert learner.minibatch_size == C.learners.IGNORE

    # for backcompatibility test
    # the UnitType provides the per-minibatch instruction for the learner
    # this will be deprecated in a future version
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.minibatch),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE

    # for backcompatibility test: on reset_learning_rate the learner does
    # not take the reference minibatch size from the schedule; the user
    # must set it explicitly. This will be deprecated in a future version.
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(
        learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    # back-compatible API test
    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant,
                   unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant,
                   unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_rate_schedule([(3, 0.1), (2, 0.2), (1, 0.3)],
                                           unit=UnitType.sample)
    C.fsadagrad(res.parameters, lr=lr_per_sample,
                momentum=momentum_time_constant)
    C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant,
                unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample,
                momentum=momentum_time_constant, unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_rate_schedule([0.1, 0.2], unit=UnitType.sample,
                                           epoch_size=100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.adadelta(res.parameters, lr_per_sample, use_mean_gradient=True)
def create_model(self):
    model1i = C.layers.Sequential([
        # Convolution layers
        C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                               reduction_rank=0, activation=C.ops.tanh,
                               name='conv_f'),
        C.layers.Convolution2D((1, 3), num_filters=16, pad=True,
                               reduction_rank=1, activation=C.ops.tanh,
                               name='conv2_f'),
        C.layers.Convolution2D((1, 3), num_filters=32, strides=(1, 3),
                               pad=False, reduction_rank=1,
                               activation=C.ops.tanh, name='conv3_f'),
        ######
        # Dense layers
        C.layers.Dense(32, activation=C.ops.relu, name='dense1_f'),
        #C.layers.Dense(32, activation=C.ops.relu, name='dense1_f'),
        #C.layers.Dense(16, activation=C.ops.relu, name='dense1_f')
    ])(self._input)
    ### target
    model1t = C.layers.Sequential([
        #C.layers.Dense(16, activation=C.ops.relu, name='dense2_f'),
        C.layers.Dense(32, activation=C.ops.relu, name='dense3_f')
    ])(self._target)
    ### concatenate both processed target and observations
    inputs = C.ops.splice(model1i, model1t)
    ### Use input to predict next hidden state, and generate
    ### next observation
    model1 = C.layers.Sequential([
        C.layers.Dense(64, activation=C.ops.relu),
        ######
        # Recurrence
        C.layers.Recurrence(C.layers.LSTM(2048, init=C.glorot_uniform()),
                            name='lstm_f'),
        ######
        # Prediction
        #C.layers.Dense(16, activation=C.ops.relu, name='predict'),
        ######
        # Decoder layers
        C.layers.Dense(256, activation=C.ops.relu, name='dense4_f'),
        #C.layers.Dense(64, activation=C.ops.relu, name='dense2'),
        C.layers.Dense(114, activation=C.ops.relu, name='dense5_f')
    ])(inputs)
    ######
    # Reshape output
    model2 = C.ops.reshape(model1, (1, 1, 114))
    model3 = C.layers.Sequential([
        ######
        # Deconvolution layers
        C.layers.ConvolutionTranspose((1, 7), num_filters=8, strides=(1, 1),
                                      pad=False, bias=False,
                                      init=C.glorot_uniform(1),
                                      name='deconv1'),
        C.layers.ConvolutionTranspose((1, 3), num_filters=4, strides=(1, 3),
                                      pad=False, bias=False,
                                      init=C.glorot_uniform(1),
                                      name='deconv2'),
        C.layers.ConvolutionTranspose((1, 3), num_filters=1, pad=True,
                                      name='deconv3')
    ])(model2)
    model = C.ops.reshape(model3, (1, 360))
    if self._load_model:
        model = C.load_model('dnns/feature_predicter_ours.dnn')
    err = C.ops.reshape(C.ops.minus(model, self._output),
                        (self._output_size))
    sq_err = C.ops.square(err)
    mse = C.ops.reduce_mean(sq_err)
    rmse_loss = C.ops.sqrt(mse)
    rmse_eval = rmse_loss
    learner = C.adadelta(model.parameters)
    progress_printer = C.logging.ProgressPrinter(tag='Training')
    trainer = C.Trainer(model, (rmse_loss, rmse_eval), learner,
                        progress_printer)
    return model, rmse_loss, learner, trainer
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    minibatch_size = training_config['minibatch_size']
    max_epochs = training_config['max_epochs']
    epoch_size = training_config['epoch_size']
    log_freq = training_config['log_freq']

    progress_writers = [
        C.logging.ProgressPrinter(num_epochs=max_epochs,
                                  freq=log_freq,
                                  tag='Training',
                                  log_to_file=log_file,
                                  rank=C.Communicator.rank(),
                                  gen_heartbeat=gen_heartbeat)
    ]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner, num_quantization_bits=1)

    trainer = C.Trainer(z, (loss, None), learner, progress_writers)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {'best_val_err': 100, 'best_since': 0, 'val_since': 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        epoch_stat['best_val_err'] = validate_model(
            os.path.join(data_path, training_config['val_data']), model, polymath)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1
        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(
                os.path.join(data_path, training_config['val_data']), model, polymath)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                trainer.save_checkpoint(model_file)
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False
        if profiling:
            C.debugging.enable_profiler()
        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)
        for epoch in range(max_epochs):
            num_seq = 0
            with tqdm(total=epoch_size, ncols=32, smoothing=0.1) as progress_bar:
                while True:
                    if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                        data = mb_source.next_minibatch(
                            minibatch_size * C.Communicator.num_workers(),
                            input_map=input_map,
                            num_data_partitions=C.Communicator.num_workers(),
                            partition_index=C.Communicator.rank())
                    else:
                        data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                    trainer.train_minibatch(data)
                    num_seq += trainer.previous_minibatch_sample_count
                    dummy.eval()
                    if num_seq >= epoch_size:
                        break
                    else:
                        progress_bar.update(trainer.previous_minibatch_sample_count)
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")
        minibatch_seqs = training_config['minibatch_seqs']  # number of sequences
        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
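# The ema/dummies block in train() implements an exponential moving average
# of the weights entirely inside the graph. A minimal sketch of just that
# trick, on an assumed toy model (the Dense layer is illustrative):
import cntk as C

x = C.input_variable(2)
z = C.layers.Dense(2)(x)

ema = {}
updates = []
for p in z.parameters:
    # one constant per parameter holds the running average
    ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
    ema[p.uid] = ema_p
    # reduce_sum collapses each assign to a scalar so all updates combine
    updates.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
dummy = C.combine(updates)

dummy.eval()  # one EMA step; train() calls this after every train_minibatch
# For validation, post_epoch_work swaps the ema values into the parameters
# and restores the originals afterwards.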
#input = np.array(input_map)
#input = C.input_variable(1000)
#label = C.input_variable(num_output_classes)
z = create_model(input)
loss = C.cross_entropy_with_softmax(z, label)
label_error = C.squared_error(z, label)

# Instantiate the trainer object to drive the model training
learning_rate = 0.8
lr_schedule = C.learning_parameter_schedule(learning_rate)
learner = C.adadelta(z.parameters, lr_schedule)
trainer = C.Trainer(z, (loss, label_error), [learner])

# Define a utility function to compute the moving average sum.
# A more efficient implementation is possible with np.cumsum() function
def moving_average(a, w=5):
    if len(a) < w:
        return a[:]  # Need to send a copy of the array
    return [val if idx < w else sum(a[(idx - w):idx]) / w
            for idx, val in enumerate(a)]

# Defines a utility that prints the training progress
def print_training_progress(trainer, mb, frequency, verbose=1):
    loss = "NA"
    error = "NA"
    # report the trainer's minibatch averages every `frequency` minibatches
    if mb % frequency == 0:
        loss = trainer.previous_minibatch_loss_average
        error = trainer.previous_minibatch_evaluation_average
        if verbose:
            print("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.4f}".format(mb, loss, error))
    return mb, loss, error
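# A hedged usage sketch of the two utilities above: `get_minibatch` is a
# hypothetical data helper, and `input`/`label` are the (elsewhere-defined)
# variables fed to create_model and the loss.
plotdata = {"loss": []}
training_progress_output_freq = 50
for mb in range(1000):
    features, labels = get_minibatch()  # hypothetical helper
    trainer.train_minibatch({input: features, label: labels})
    _, cur_loss, _ = print_training_progress(trainer, mb,
                                             training_progress_output_freq,
                                             verbose=1)
    if cur_loss != "NA":
        plotdata["loss"].append(cur_loss)
# smooth the recorded losses for plotting
smoothed_loss = moving_average(plotdata["loss"], w=5)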
def test_learner_init_legacy():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))
    res = i * w

    # for backcompatibility test
    # this will be deprecated in future version
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample))
    assert learner._learning_rate_schedule.minibatch_size == 1  # the deprecated per sample schedule should not use compatible mode
    assert learner.learning_rate() == 0.1

    # for backcompatibility test
    # this will be deprecated in future version
    # The UnitType will provide per minibatch instruction for the learner
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.minibatch))
    assert learner.is_compatible_mode() == False
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == 0

    # for backcompatibility test: in reset_learning_rate the learner won't receive
    # the reference minibatch size from the schedule; the user needs to specify it explicitly
    # this will be deprecated in future version
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    # for backcompatibility test
    # this will be deprecated in future version
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size is still 0

    # this will be deprecated in future version: this is a logically invalid combination,
    # but it was the only way to use mean gradient and set the learning rate in the past
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample),
                  use_mean_gradient=True)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    # test the override in the new version
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size is still 0

    # for backcompatibility test
    # this will be deprecated in future version
    # The UnitType will provide per minibatch instruction for the learner
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.minibatch),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE

    # for backcompatibility test: same reset_learning_rate caveat as above
    # this will be deprecated in future version
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    # back compatible API test
    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_rate_schedule([(3, 0.1), (2, 0.2), (1, 0.3)], unit=UnitType.sample)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant)
    C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant,
                unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_rate_schedule([0.1, 0.2], unit=UnitType.sample, epoch_size=100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.adadelta(res.parameters, lr_per_sample, use_mean_gradient=True)
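# For contrast with the legacy UnitType calls tested above, a sketch of the
# current-style equivalents; the concrete rates are illustrative assumptions.
import cntk as C

i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
w = C.parameter(shape=(1,))
res = i * w

# per-sample schedule: minibatch_size=1 replaces UnitType.sample
lr_per_sample = C.learning_parameter_schedule(0.1, minibatch_size=1)
# per-minibatch schedule: minibatch_size=C.learners.IGNORE replaces UnitType.minibatch
lr_per_mb = C.learning_parameter_schedule(0.1, minibatch_size=C.learners.IGNORE)

momentum = C.momentum_schedule(0.99)
C.momentum_sgd(res.parameters, lr_per_sample, momentum)
C.sgd(res.parameters, lr_per_mb)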
def create_model(self):
    modeli = C.layers.Sequential([
        # Convolution layers
        C.layers.Convolution2D((1, 3), num_filters=8, pad=True,
                               reduction_rank=0, activation=C.ops.tanh, name='conv_a'),
        C.layers.Convolution2D((1, 3), num_filters=16, pad=True,
                               reduction_rank=1, activation=C.ops.tanh, name='conv2_a'),
        C.layers.Convolution2D((1, 3), num_filters=32, pad=False,
                               reduction_rank=1, activation=C.ops.tanh, name='conv3_a'),
        ######
        # Dense layers
        #C.layers.Dense(128, activation=C.ops.relu, name='dense1_a'),
        #C.layers.Dense(64, activation=C.ops.relu, name='dense2_a'),
        C.layers.Dense(361, activation=C.ops.relu, name='dense3_a')
    ])(self._input)

    ### target
    modelt = C.layers.Sequential([
        C.layers.Dense(360, activation=C.ops.relu, name='dense4_a')
    ])(self._target)

    ### concatenate both processed target and observations
    inputs = C.ops.splice(modeli, modelt)

    ### Use input to predict next hidden state, and generate
    ### next observation
    model = C.layers.Sequential([
        ######
        C.layers.Dense(720, activation=C.ops.relu, name='dense5_a'),
        # Recurrence
        C.layers.Recurrence(C.layers.LSTM(2048, init=C.glorot_uniform()), name='lstm_a'),
        C.layers.Dense(1024, activation=None)
    ])(inputs)

    ######
    # Prediction
    direction = C.layers.Sequential([
        C.layers.Dense(720, activation=None, name='dense6_a'),
        C.layers.Dense(360, activation=C.ops.softmax, name='dense7_a')
    ])(model)
    velocity = C.layers.Sequential([
        C.layers.Dense(128, activation=C.ops.relu),
        C.layers.Dense(64, activation=None),
        C.layers.Dense(1, activation=None)
    ])(model)
    model = C.ops.splice(direction, velocity)

    if self._load_model:
        model = C.load_model('dnns/action_predicter_f.dnn')
        direction = model[0:360]
        velocity = model[360]

    print(model)

    loss = C.squared_error(direction, self._output) + C.squared_error(
        velocity, self._output_velocity)
    error = C.classification_error(direction, self._output) + C.squared_error(
        velocity, self._output_velocity)

    learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
    progress_printer = C.logging.ProgressPrinter(tag='Training')
    trainer = C.Trainer(model, (loss, error), learner, progress_printer)
    return model, loss, learner, trainer
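# A hedged inference sketch for the spliced output above: `obs` and `goal`
# are hypothetical sequence inputs standing in for self._input and
# self._target, and the argument ordering is an assumption. Each step of the
# 361-wide output holds a 360-way direction distribution plus one velocity
# scalar, matching the slicing used in the self._load_model branch.
import numpy as np

out = np.asarray(model.eval({model.arguments[0]: obs,
                             model.arguments[1]: goal}))
direction_probs = out[..., :360]  # softmax head ('dense7_a')
velocity_pred = out[..., 360]     # scalar regression head
best_heading = direction_probs.argmax(axis=-1)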
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    progress_writers = [
        C.logging.ProgressPrinter(num_epochs=max_epochs,
                                  freq=log_freq,
                                  tag='Training',
                                  log_to_file=log_file,
                                  rank=C.Communicator.rank(),
                                  gen_heartbeat=gen_heartbeat)
    ]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir='log', model=z)
    trainer = C.Trainer(z, (loss, None), learner, tensorboard_writer)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {'best_val_err': 100, 'best_since': 0, 'val_since': 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        # after restore always re-evaluate
        epoch_stat['best_val_err'] = validate_model(
            os.path.join(data_path, training_config['val_data']), model, polymath)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1
        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(
                os.path.join(data_path, training_config['val_data']), model, polymath)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                trainer.save_checkpoint(model_file)
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False
        if profiling:
            C.debugging.enable_profiler()
        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)
        minibatch_size = training_config['minibatch_size']  # number of samples
        epoch_size = training_config['epoch_size']
        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(
                        minibatch_size * C.Communicator.num_workers(),
                        input_map=input_map,
                        num_data_partitions=C.Communicator.num_workers(),
                        partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                dummy.eval()
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")
        minibatch_seqs = training_config['minibatch_seqs']  # number of sequences
        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
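# A hedged sketch of driving this train() in a distributed run; the paths and
# the mpiexec command are assumptions. Each worker should call
# Communicator.finalize() once when training ends.
if __name__ == '__main__':
    train(data_path='data', model_path='model',
          log_file='log/train.log', config_file='config',
          restore=True, profiling=False, gen_heartbeat=False)
    C.Communicator.finalize()
# launched e.g. as: mpiexec -n 4 python train.py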