def test_shared_variable_modifier():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, parameters=[W], step_rule=step_rule)
    main_loop = MainLoop(
        model=None, data_stream=dataset.get_example_stream(),
        algorithm=sgd,
        extensions=[
            FinishAfter(after_n_epochs=1),
            SharedVariableModifier(
                step_rule.learning_rate,
                lambda n: numpy.cast[theano.config.floatX](10. / n))])

    main_loop.run()

    assert_allclose(step_rule.learning_rate.get_value(),
                    numpy.cast[theano.config.floatX](10. / n_batches))
def test_shared_variable_modifier_two_params():
    weights = numpy.array([-1, 1], dtype=floatX)
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = ContainerDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, params=[W], step_rule=step_rule)
    modifier = SharedVariableModifier(
        step_rule.learning_rate,
        lambda _, val: numpy.cast[floatX](val * 0.2))
    main_loop = MainLoop(
        model=None, data_stream=dataset.get_default_stream(),
        algorithm=sgd,
        extensions=[FinishAfter(after_n_epochs=1), modifier])

    main_loop.run()

    new_value = step_rule.learning_rate.get_value()
    assert_allclose(new_value, 0.001 * 0.2 ** n_batches, atol=1e-5)
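Taken together, the two tests above exercise both callback signatures that
SharedVariableModifier accepts: a one-argument function of the iteration count,
and a two-argument function of the iteration count plus the variable's current
value. A minimal sketch of the two forms, assuming the usual Blocks layout
(SharedVariableModifier in blocks.extensions.training, shared_floatx in
blocks.utils); the inverse-time schedule and the 0.95 factor are illustrative,
not taken from the tests:

import numpy
import theano
from blocks.extensions.training import SharedVariableModifier
from blocks.utils import shared_floatx

lr = shared_floatx(0.1, name='learning_rate')

# One-argument form: the callback receives only the number of iterations done.
inverse_time_decay = SharedVariableModifier(
    lr, lambda n: numpy.cast[theano.config.floatX](0.1 / (1. + n)))

# Two-argument form: the callback also receives the variable's current value.
multiplicative_decay = SharedVariableModifier(
    lr, lambda n, value: numpy.cast[theano.config.floatX](0.95 * value))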
def decay_learning_rate(self, learning_rate_decay):
    """Decay the learning rate after each epoch.

    :param learning_rate_decay: multiplicative decay coefficient applied to
        the learning rate at the end of every epoch.
    """
    if learning_rate_decay not in (0, 1):
        learning_rate = self.step_rules[0].learning_rate
        self.extensions.append(
            SharedVariableModifier(
                learning_rate,
                lambda n, lr: numpy.cast[theano.config.floatX](
                    learning_rate_decay * lr),
                after_epoch=True, after_batch=False))
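With this per-epoch multiplicative schedule, the learning rate after k completed
epochs is simply the initial value times learning_rate_decay**k. A short,
self-contained check of that arithmetic; the 0.001 starting value, 0.97 decay,
and 10 epochs are illustrative, not from the method above:

import numpy
import theano

initial_lr = 0.001          # illustrative starting learning rate
learning_rate_decay = 0.97  # illustrative per-epoch decay coefficient
n_epochs = 10

expected_lr = numpy.cast[theano.config.floatX](
    initial_lr * learning_rate_decay ** n_epochs)
print(expected_lr)  # value the shared learning rate should hold after 10 epochs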
step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
              StepClipping(step_clipping)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))

# Extensions
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]

dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                   before_first_epoch=True,
                                   data_stream=dev_stream, prefix="dev")
train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                       after_batch=True,
                                       before_first_epoch=True, prefix='tra')

extensions = [dev_monitor, train_monitor, Timing(),
              Printing(after_batch=True),
              FinishAfter(after_n_epochs=nepochs),
              saveload.Load(load_path),
              saveload.Checkpoint(last_path),
              ] + track_best('dev_cost', save_path)

if learning_rate_decay not in (0, 1):
    extensions.append(
        SharedVariableModifier(step_rules[0].learning_rate,
                               lambda n, lr: numpy.cast[theano.config.floatX](
                                   learning_rate_decay * lr),
                               after_epoch=True, after_batch=False))

print('number of parameters in the model: ' +
      str(tensor.sum([p.size for p in cg.parameters]).eval()))

# Finally build the main loop and train the model
main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                     model=Model(cost), extensions=extensions)
main_loop.run()
def train(args, trial=11, no_valid=False):
    # Creating unique strings to save for experiments.
    data_valid = "data/" + args.data_name + "_trial_" + str(trial) + \
        "_valid_size_" + str(args.train_size) + \
        "_transitions_" + str(args.transitions)
    data_test = data_valid.replace("_valid_size", "_test_size")

    # If we want validation set to match modData of test set
    if modDataValid == 1:
        data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_")
        data_test = data_test.replace("_trial_", "_" + modData + "_trial_")

    # By default, it is m0
    data_train = "data/" + args.data_name + "_trial_" + str(trial) + \
        "_train_size_" + str(args.train_size) + \
        "_transitions_" + str(args.transitions)

    subStr = "rnn_type_" + args.rnn_type + "_trial_" + str(trial) + \
        "_hiddenSize_" + str(args.hidden_size) + \
        "_numLayers_" + str(args.num_layers) + \
        "_dropout_" + str(args.dropout) + \
        "_train_size_" + str(args.train_size) + \
        "_transitions_" + str(args.transitions) + \
        "_novalid_" + str(args.no_valid)

    if modData == "m1":
        data_train = data_train.replace("_trial_", "_m1_trial_")
        subStr = subStr.replace("_trial_", "_m1_trial_")
    elif modData == "m3":
        data_train = data_train.replace("_trial_", "_m3_trial_")
        subStr = subStr.replace("_trial_", "_m3_trial_")
        data_valid = "data/" + args.data_name + "_m3_trial_" + str(trial) + \
            "_valid_size_" + str(args.train_size) + \
            "_transitions_" + str(args.transitions)
        data_test = "data/" + args.data_name + "_m3_trial_" + str(trial) + \
            "_test_size_" + str(args.train_size) + \
            "_transitions_" + str(args.transitions)

    print("on test: " + subStr)

    # Perform folder prefixing
    prefix_path = models_folder + args.data_name + "/" + subStr + \
        "_tgrad_" + str(args.truncate_gradient) + \
        "_boost_" + bStr(args.boosting)

    load_path2 = prefix + load_path
    save_path2 = prefix + save_path
    last_path2 = prefix + last_path

    plots_output2 = plots_output + args.data_name + "/" + subStr + \
        "_tgrad_" + str(args.truncate_gradient) + \
        "_boost_" + bStr(args.boosting)

    # obtain vocabulary size
    ix_to_char, char_to_ix, vocab_size = get_metadata(
        data_test.replace("_test", ""))
    print("vocab_size: " + str(vocab_size))

    # Get train, valid, test streams
    sharedDataTrain, train_stream = get_stream_inGPU(data_train,
                                                     sharedName='sharedData')
    train_streamCopy = copy.deepcopy(train_stream)
    sharedDataValid, dev_stream = get_stream_inGPU(data_valid,
                                                   sharedName='sharedData')
    valid_streamCopy = copy.deepcopy(dev_stream)
    sharedDataTest, test_stream = get_stream_inGPU(data_test,
                                                   sharedName='sharedData')
    test_streamCopy = copy.deepcopy(test_stream)

    # Create dummy sums
    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {'sharedMRRSUM': sharedMRRSUM,
                     'sharedTOTSUM': sharedTOTSUM}

    # Initialize batches
    batch_index_From = T.scalar('int_stream_From', dtype='int32')
    batch_index_To = T.scalar('int_stream_To', dtype='int32')

    # Index theano variables
    x = sharedDataTrain['x'][:, batch_index_From:batch_index_To]
    x.name = 'x'
    x_mask = sharedDataTrain['x_mask'][:, batch_index_From:batch_index_To]
    x_mask.name = 'x_mask'
    x_mask_o = sharedDataTrain['x_mask_o'][:, batch_index_From:batch_index_To]
    x_mask_o.name = 'x_mask_o'
    x_mask_o_mask = sharedDataTrain[
        'x_mask_o_mask'][:, batch_index_From:batch_index_To]
    x_mask_o_mask.name = 'x_mask_o_mask'
    y = sharedDataTrain['y'][:, batch_index_From:batch_index_To]
    y.name = 'y'
    y_mask = sharedDataTrain['y_mask'][:, batch_index_From:batch_index_To]
    y_mask.name = 'y_mask'
    y_mask_o = sharedDataTrain['y_mask_o'][:, batch_index_From:batch_index_To]
    y_mask_o.name = 'y_mask_o'
    y_mask_o_mask = sharedDataTrain[
        'y_mask_o_mask'][:, batch_index_From:batch_index_To]
    y_mask_o_mask.name = 'y_mask_o_mask'
    lens = sharedDataTrain['lens'][:, batch_index_From:batch_index_To]
    lens.name = 'lens'

    # Generate temp shared vars
    tempSharedData = {}
    tempSharedData[theano.config.floatX] = [
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX)),
        shared(np.array([[0], [0]], dtype=theano.config.floatX))]
    tempSharedData['uint8'] = [
        shared(np.array([[0], [0]], dtype='uint8')),
        shared(np.array([[0], [0]], dtype='uint8')),
        shared(np.array([[0], [0]], dtype='uint8'))]

    # Final mask is due to the generated mask and the input mask
    x_mask_final = x_mask * x_mask_o * x_mask_o_mask
    y_mask_final = y_mask * y_mask_o * y_mask_o_mask

    # Build neural network
    linear_output, cost = nn_fprop(
        x, x_mask_final, y, y_mask_final, lens, vocab_size, hidden_size,
        num_layers, rnn_type, boosting=boosting,
        scan_kwargs={'truncate_gradient': truncate_gradient})

    # Keep a constant in gpu memory
    constant1 = shared(np.float32(1.0))
    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    # Validation calculations
    fRR = function(inputs=[theano.In(batch_index_From, borrow=True),
                           theano.In(batch_index_To, borrow=True)],
                   updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                            (sharedTOTSUM, sharedTOTSUM + ymasksum)])

    # COST
    cg = ComputationGraph(cost)

    if dropout > 0:
        # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
        inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(
            cg.variables)
        cg = apply_dropout(cg, inputs, dropout)
        cost = cg.outputs[0]

    # Learning algorithm
    step_rules = [RMSProp(learning_rate=rmsPropLearnRate,
                          decay_rate=decay_rate),
                  StepClipping(step_clipping)]
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=CompositeRule(step_rules))

    # Extensions
    # This is for tracking our best result
    trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs,
                           nepochs, maxIterations, epsilon, tempSharedData)

    if onlyPlots:
        prefixes = ["train_cross", "valid_cross", "test_cross"]
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm]
        # this is faster
        train_monitor = myTrainingDataMonitoring(
            variables=monitored_vars, prefix=prefixes[0], after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        # train_monitor = DataStreamMonitoringPlot(
        #     variables=[cost], data_stream=train_streamCopy,
        #     prefix=prefixes[0], sharedDataTrain=sharedDataTrain,
        #     sharedDataActualTest=sharedDataTrain, after_batch=True,
        #     saveEveryXIteration=saveEveryXIteration)
        valid_monitor = DataStreamMonitoringPlot(
            variables=[cost], data_stream=valid_streamCopy,
            prefix=prefixes[1], sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataValid, after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        test_monitor = DataStreamMonitoringPlot(
            variables=[cost], data_stream=test_streamCopy,
            prefix=prefixes[2], sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataTest, after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]]
        plot = Plot('Live Plotting', saveFolder=plots_output2,
                    channels=['train_cross_cost', 'valid_cross_cost',
                              'test_cross_cost'],
                    numProcesses=numProcesses,
                    saveEveryXIteration=saveEveryXIteration, after_batch=True)
        extensions = [train_monitor, valid_monitor, test_monitor, plot,
                      Printing(), ProgressBar()] + trackbest
    else:
        dev_monitor = myDataStreamMonitoring(
            after_epoch=True, before_epoch=False, data_stream=dev_stream,
            prefix="valid", fRR=fRR, sharedVars=sharedSUMVARs,
            sharedDataTrain=sharedDataTrain, sharedDataValid=sharedDataValid)
        extensions = [dev_monitor, Printing(), ProgressBar()] + trackbest

    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX](
                                       learning_rate_decay * lr),
                                   after_epoch=True, after_batch=False))

    print('number of parameters in the model: ' +
          str(T.sum([p.size for p in cg.parameters]).eval()))

    # Finally build the main loop and train the model
    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                         model=Model(cost), extensions=extensions)
    main_loop.run()
# plot = Plot('Plotting example', channels=[['cost']], after_batch=True,
#             open_browser=True)
extensions = [set_train_flag, test_monitor, train_monitor, Timing(),
              Printing(after_epoch=True),
              FinishAfter(after_n_epochs=nepochs),
              saveload.Load(load_path),
              saveload.Checkpoint(last_path, every_n_epochs=10000),
              ] + track_best('test_cost', save_path)
# + track_best('train_cost', last_path)

if learning_rate_decay not in (0, 1):
    extensions.append(
        SharedVariableModifier(step_rules[0].learning_rate,
                               lambda n, lr: np.cast[theano.config.floatX](
                                   learning_rate_decay * lr),
                               after_epoch=False,
                               every_n_epochs=lr_decay_every_n_epochs,
                               after_batch=False))

print('number of parameters in the model: ' +
      str(T.sum([p.size for p in cg.parameters]).eval()))

# Finally build the main loop and train the model
main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                     model=Model(cost), extensions=extensions)
main_loop.run()
        # DEBUG this triggers an error on my machine
        # apply dropout to all the input variables
        inputs = VariableFilter(roles=[INPUT])(cg_nodropout.variables)
        # dropconnect
        # inputs = VariableFilter(roles=[PARAMETER])(cg_nodropout.variables)
        cg = apply_dropout(cg_nodropout, inputs, args.dropout_rate)
    else:
        cg = cg_nodropout
    step_compute = RMSProp(learning_rate=args.lr, max_scaling=1e10)
    algorithm = GradientDescent(
        step_rule=CompositeRule([RemoveNotFinite(), step_compute]),
        parameters=cg.parameters, cost=cost)
    extension_list = []
    extension_list.append(
        SharedVariableModifier(step_compute.learning_rate,
                               extensions.decay_learning_rate,
                               after_batch=False,
                               every_n_batches=batches_per_epoch))
    extension_list.append(FinishAfter(after_n_epochs=100001))

    ## logging of test set performance
    extension_list.append(extensions.LogLikelihood(
        dpm, test_stream, scl,
        every_n_batches=args.ext_every_n * batches_per_epoch,
        before_training=False))

    ## set up logging
    extension_list.extend([Timing(), Printing()])
    model_dir = util.create_log_dir(args, dpm.name + '_' + args.dataset)
    model_save_name = os.path.join(model_dir, 'model.pkl')
    extension_list.append(
        Checkpoint(model_save_name,
                   every_n_batches=args.ext_every_n * batches_per_epoch,
                   save_separately=['log']))

    # generate plots
    extension_list.append(extensions.PlotMonitors(model_dir,
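In the snippet above the callback is `extensions.decay_learning_rate` from the
project's own `extensions` module, whose body is not shown. A hypothetical
stand-in with the (iteration count, current value) signature that
SharedVariableModifier passes to two-argument callbacks might look like the
following; the 0.99 factor is illustrative, not the project's actual schedule:

import numpy
import theano

def decay_learning_rate(n, current_value):
    """Hypothetical stand-in: scale the learning rate by 0.99 each time the
    modifier fires (above, once per epoch via every_n_batches)."""
    return numpy.cast[theano.config.floatX](0.99 * current_value)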
def pretrain_rnn(train, rnnrbm, test=None, epochs=1000, bokeh=True):
    lr = theano.shared(float32(0.1))

    probs, _, _, _ = rnnrbm.rnn_pretrain_pred(x, x_mask)
    cost = NegativeLogLikelihood().apply(y, probs, y_mask)

    error_rate = MismulitclassificationRate().apply(y, probs, y_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(y, probs, y_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'final_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([RemoveNotFinite(), StepClipping(30.0),
                               Adam(learning_rate=lr), StepClipping(6.0),
                               RemoveNotFinite()])

    algorithm = GradientDescent(step_rule=step_rule, cost=cost,
                                params=cg.parameters)
    extensions = [
        SharedVariableModifier(
            parameter=lr,
            function=lambda n, v: float32(0.7 * v) if n % 700 == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [cost, error_rate, mistake_rate, ],
            # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  # + params,
            prefix="train", after_epoch=False, every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()]

    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test, updates=cg.updates,
                                 prefix="test", after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot('Pretrain RNN',
                 channels=[['train_error on note as a whole',
                            'train_single error within note',
                            'test_error on note as a whole',
                            'test_single error within note'],
                           ['train_rbm_cost'],
                           # ['train_total_gradient_norm'],
                           ]))

    main_loop = MainLoop(algorithm=algorithm, data_stream=train,
                         model=model, extensions=extensions)
    return main_loop
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True,
                 load_path=None):
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)

    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([RemoveNotFinite(), StepClipping(30.0),
                               Adam(learning_rate=lr), StepClipping(6.0),
                               RemoveNotFinite()])  # Scale(0.01)

    gradients = dict(equizip(cg.parameters,
                             T.grad(cost, cg.parameters,
                                    consider_constant=[v_sample])))

    algorithm = GradientDescent(step_rule=step_rule, gradients=gradients,
                                cost=cost, params=cg.parameters)
    algorithm.add_updates(cg.updates)
    extensions = [
        SharedVariableModifier(
            parameter=cdk,
            function=lambda n, v: rnnrbm_cdk[n] if rnnrbm_cdk.get(n) else v),
        SharedVariableModifier(
            parameter=lr,
            function=lambda n, v: float32(0.78 * v)
            if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [cost, error_rate, mistake_rate, ],
            # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  # + params,
            prefix="train", after_epoch=False, every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()]

    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test, updates=cg.updates,
                                 prefix="test", after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot('Training RNN-RBM',
                 channels=[['train_error on note as a whole',
                            'train_single error within note',
                            'test_error on note as a whole',
                            'test_single error within note'],
                           ['train_final_cost'],
                           # ['train_total_gradient_norm'],
                           ]))

    main_loop = MainLoop(algorithm=algorithm, data_stream=train,
                         model=model, extensions=extensions)
    return main_loop
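The `rnnrbm_cdk` lookup used by the first modifier above is defined elsewhere
in the source; from its use as `rnnrbm_cdk[n] if rnnrbm_cdk.get(n) else v` it
is evidently a dict mapping iteration counts to new CD-k values. A hypothetical
example of such a schedule, with illustrative numbers that are not the
author's:

# Hypothetical CD-k schedule: raise the number of Gibbs steps as training
# progresses. Keys are iteration counts, values are the new k.
rnnrbm_cdk = {
    2000: 15,
    10000: 25,
}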
def run():
    # Load Model
    net_size = 256  # Hard-code instead of loading model (takes too long to set up network)
    # net = vaegan.VAEGAN()
    # network_saver = saver.NetworkSaver('vaegan/models/', net=net)
    # network_saver.load()

    # DATA
    train_stream = get_stream(hdf5_file, 'train', batch_size)  # TODO jonathan ?
    test_stream = get_stream(hdf5_file, 'test', batch_size)  # TODO jonathan ?

    # MODEL
    x = T.TensorType('floatX', [False] * 3)('features')
    y = T.tensor3('targets', dtype='floatX')
    train_flag = [theano.shared(0)]
    x = x.swapaxes(0, 1)
    y = y.swapaxes(0, 1)

    # More Config
    out_size = len(output_columns) - 1  # code_mode=RL-MDN
    latent_size = net_size
    in_size = latent_size + len(input_columns)

    # NN fprop
    y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size,
                                  num_recurrent_layers, train_flag)

    # COST
    cg = ComputationGraph(cost)
    extra_updates = []

    # RMS Prop training optimizer
    step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
                  StepClipping(step_clipping)]
    parameters_to_update = cg.parameters
    algorithm = GradientDescent(cost=cg.outputs[0],
                                parameters=parameters_to_update,
                                step_rule=CompositeRule(step_rules))
    algorithm.add_updates(extra_updates)  # TODO jonathan what is this, is this needed?

    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [cost, step_rules[0].learning_rate, gradient_norm,
                      step_norm]

    test_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                        before_first_epoch=True,
                                        data_stream=test_stream, prefix="test")
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           prefix='train')
    set_train_flag = SetTrainFlag(after_epoch=True, before_epoch=True,
                                  flag=train_flag)

    # plot = Plot('Plotting example', channels=[['cost']], after_batch=True,
    #             open_browser=True)
    extensions = [set_train_flag, test_monitor, train_monitor, Timing(),
                  Printing(after_epoch=True),
                  FinishAfter(after_n_epochs=nepochs),
                  saveload.Load(load_path),
                  saveload.Checkpoint(last_path, every_n_epochs=10000),
                  ] + track_best('test_cost', save_path)
    # + track_best('train_cost', last_path)

    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(step_rules[0].learning_rate,
                                   lambda n, lr: np.cast[theano.config.floatX](
                                       learning_rate_decay * lr),
                                   after_epoch=False,
                                   every_n_epochs=lr_decay_every_n_epochs,
                                   after_batch=False))

    print('number of parameters in the model: ' +
          str(T.sum([p.size for p in cg.parameters]).eval()))

    # Finally build the main loop and train the model
    mainLoop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                        model=Model(cost), extensions=extensions)
    mainLoop.run()