def create_main_loop(save_path): model, bn_model, bn_updates = create_models() ali, = bn_model.top_bricks discriminator_loss, generator_loss = bn_model.outputs step_rule = Adam(learning_rate=LEARNING_RATE, beta1=BETA1) algorithm = ali_algorithm(discriminator_loss, ali.discriminator_parameters, step_rule, generator_loss, ali.generator_parameters, step_rule) algorithm.add_updates(bn_updates) streams = create_celeba_data_streams(BATCH_SIZE, MONITORING_BATCH_SIZE) main_loop_stream, train_monitor_stream, valid_monitor_stream = streams bn_monitored_variables = ( [v for v in bn_model.auxiliary_variables if 'norm' not in v.name] + bn_model.outputs) monitored_variables = ( [v for v in model.auxiliary_variables if 'norm' not in v.name] + model.outputs) extensions = [ Timing(), FinishAfter(after_n_epochs=NUM_EPOCHS), DataStreamMonitoring( bn_monitored_variables, train_monitor_stream, prefix="train", updates=bn_updates), DataStreamMonitoring( monitored_variables, valid_monitor_stream, prefix="valid"), Checkpoint(save_path, after_epoch=True, after_training=True, use_cpickle=True), ProgressBar(), Printing(), ] main_loop = MainLoop(model=bn_model, data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) return main_loop
def create_main_loop(): model, bn_model, bn_updates = create_models() ali, = bn_model.top_bricks discriminator_loss, generator_loss = bn_model.outputs step_rule = Adam(learning_rate=LEARNING_RATE, beta1=BETA1) algorithm = ali_algorithm(discriminator_loss, ali.discriminator_parameters, step_rule, generator_loss, ali.generator_parameters, step_rule) algorithm.add_updates(bn_updates) streams = create_gaussian_mixture_data_streams( batch_size=BATCH_SIZE, monitoring_batch_size=MONITORING_BATCH_SIZE, means=MEANS, variances=VARIANCES, priors=PRIORS) main_loop_stream, train_monitor_stream, valid_monitor_stream = streams bn_monitored_variables = ([ v for v in bn_model.auxiliary_variables if 'norm' not in v.name ] + bn_model.outputs) monitored_variables = ( [v for v in model.auxiliary_variables if 'norm' not in v.name] + model.outputs) extensions = [ Timing(), FinishAfter(after_n_epochs=NUM_EPOCHS), DataStreamMonitoring(bn_monitored_variables, train_monitor_stream, prefix="train", updates=bn_updates), DataStreamMonitoring(monitored_variables, valid_monitor_stream, prefix="valid"), Checkpoint(os.path.join(self._work_dir, "main_loop.tar"), after_epoch=True, after_training=True, use_cpickle=True), ProgressBar(), Printing(), #ModelLogger(folder=self._work_dir, after_epoch=True), GraphLogger(num_modes=1, num_samples=2500, dimension=2, r=0, std=1, folder=self._work_dir, after_epoch=True, after_training=True), MetricLogger(means=MEANS, variances=VARIANCES, folder=self._work_dir, after_epoch=True) ] main_loop = MainLoop(model=bn_model, data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) return main_loop
def create_main_loop(save_path): model, bn_model, bn_updates = create_models() ali, = bn_model.top_bricks discriminator_loss, generator_loss = bn_model.outputs step_rule = Adam(learning_rate=LEARNING_RATE, beta1=BETA1) algorithm = ali_algorithm(discriminator_loss, ali.discriminator_parameters, step_rule, generator_loss, ali.generator_parameters, step_rule) algorithm.add_updates(bn_updates) streams = create_cifar10_data_streams(BATCH_SIZE, MONITORING_BATCH_SIZE) main_loop_stream, train_monitor_stream, valid_monitor_stream = streams for d in main_loop_stream.get_epoch_iterator(as_dict=True): print(d.keys) print(d['features'].shape, d['features'].dtype) break main_loop_stream = ShapesDataset(num_examples=600, img_size=32, min_diameter=3, seed=1234).create_stream(batch_size=BATCH_SIZE, is_train=True) for d in main_loop_stream.get_epoch_iterator(as_dict=True): print(d.keys) print(d['features'].shape, d['features'].dtype) break train_monitor_stream = ShapesDataset(num_examples=100, img_size=32, min_diameter=3, seed=1234).create_stream(batch_size=BATCH_SIZE, is_train=False) valid_monitor_stream = ShapesDataset(num_examples=100, img_size=32, min_diameter=3, seed=5678).create_stream(batch_size=BATCH_SIZE, is_train=False) bn_monitored_variables = ( [v for v in bn_model.auxiliary_variables if 'norm' not in v.name] + bn_model.outputs) monitored_variables = ( [v for v in model.auxiliary_variables if 'norm' not in v.name] + model.outputs) extensions = [ Timing(), FinishAfter(after_n_epochs=NUM_EPOCHS), DataStreamMonitoring( bn_monitored_variables, train_monitor_stream, prefix="train", updates=bn_updates), DataStreamMonitoring( monitored_variables, valid_monitor_stream, prefix="valid"), Checkpoint(save_path, after_epoch=True, after_training=True, use_cpickle=True), ProgressBar(), Printing(), ] main_loop = MainLoop(model=bn_model, data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) return main_loop
def train_model(cost, train_stream, valid_stream, valid_freq, valid_rare, load_location=None, save_location=None): cost.name = 'nll' perplexity = 2**(cost / tensor.log(2)) perplexity.name = 'ppl' # Define the model model = Model(cost) # Load the parameters from a dumped model if load_location is not None: logger.info('Loading parameters...') model.set_param_values(load_parameter_values(load_location)) cg = ComputationGraph(cost) algorithm = GradientDescent(cost=cost, step_rule=Scale(learning_rate=0.01), params=cg.parameters) main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=[ DataStreamMonitoring([cost, perplexity], valid_stream, prefix='valid_all', every_n_batches=5000), # Overfitting of rare words occurs between 3000 and 4000 iterations DataStreamMonitoring([cost, perplexity], valid_rare, prefix='valid_rare', every_n_batches=500), DataStreamMonitoring([cost, perplexity], valid_freq, prefix='valid_frequent', every_n_batches=5000), Printing(every_n_batches=500) ]) main_loop.run() # Save the main loop if save_location is not None: logger.info('Saving the main loop...') dump_manager = MainLoopDumpManager(save_location) dump_manager.dump(main_loop) logger.info('Saved')
def run(): streams = create_celeba_streams(training_batch_size=100, monitoring_batch_size=500, include_targets=True) main_loop_stream = streams[0] train_monitor_stream = streams[1] valid_monitor_stream = streams[2] cg, bn_dropout_cg = create_training_computation_graphs() # Compute parameter updates for the batch normalization population # statistics. They are updated following an exponential moving average. pop_updates = get_batch_normalization_updates(bn_dropout_cg) decay_rate = 0.05 extra_updates = [(p, m * decay_rate + p * (1 - decay_rate)) for p, m in pop_updates] # Prepare algorithm step_rule = Adam() algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0], parameters=bn_dropout_cg.parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) # Prepare monitoring cost = bn_dropout_cg.outputs[0] cost.name = 'cost' train_monitoring = DataStreamMonitoring( [cost], train_monitor_stream, prefix="train", before_first_epoch=False, after_epoch=False, after_training=True, updates=extra_updates) cost, accuracy = cg.outputs cost.name = 'cost' accuracy.name = 'accuracy' monitored_quantities = [cost, accuracy] valid_monitoring = DataStreamMonitoring( monitored_quantities, valid_monitor_stream, prefix="valid", before_first_epoch=False, after_epoch=False, every_n_epochs=5) # Prepare checkpoint checkpoint = Checkpoint( 'celeba_classifier.zip', every_n_epochs=5, use_cpickle=True) extensions = [Timing(), FinishAfter(after_n_epochs=50), train_monitoring, valid_monitoring, checkpoint, Printing(), ProgressBar()] main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def main(save_to, num_batches, continue_=False): mlp = MLP([Tanh(), Identity()], [1, 10, 1], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), seed=1) mlp.initialize() x = tensor.vector('numbers') y = tensor.vector('roots') cost = SquaredError().apply(y[:, None], mlp.apply(x[:, None])) cost.name = "cost" main_loop = MainLoop( GradientDescent(cost=cost, params=ComputationGraph(cost).parameters, step_rule=Scale(learning_rate=0.001)), get_data_stream(range(100)), model=Model(cost), extensions=([LoadFromDump(save_to)] if continue_ else []) + [ Timing(), FinishAfter(after_n_batches=num_batches), DataStreamMonitoring( [cost], get_data_stream(range(100, 200)), prefix="test"), TrainingDataMonitoring([cost], after_epoch=True), Dump(save_to), Printing() ]) main_loop.run() return main_loop
def prepare_opti(cost, test): model = Model(cost) algorithm = GradientDescent( cost=cost, parameters=model.parameters, step_rule=RMSProp(), on_unused_sources='ignore' ) extensions = [ FinishAfter(after_n_epochs=nb_epoch), FinishIfNoImprovementAfter(notification_name='test_cross_entropy', epochs=patience), TrainingDataMonitoring( [algorithm.cost], prefix="train", after_epoch=True), DataStreamMonitoring( [algorithm.cost], test_stream, prefix="test"), Printing(), ProgressBar(), #Checkpoint(path, after_epoch=True) ] if resume: print "Restoring from previous breakpoint" extensions.extend([ Load(path) ]) return model, algorithm, extensions
def main(save_to, num_epochs): mlp = MLP([Tanh(), Softmax()], [784, 100, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') probs = mlp.apply(tensor.flatten(x, outdim=2)) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) error_rate = MisclassificationRate().apply(y.flatten(), probs) cg = ComputationGraph([cost]) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum() cost.name = 'final_cost' mnist_train = MNIST(("train", )) mnist_test = MNIST(("test", )) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], Flatten(DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), which_sources=('features', )), prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), Checkpoint(save_to), Printing() ] if BLOCKS_EXTRAS_AVAILABLE: extensions.append( Plot('MNIST example', channels=[[ 'test_final_cost', 'test_misclassificationrate_apply_error_rate' ], ['train_total_gradient_norm']])) main_loop = MainLoop(algorithm, Flatten(DataStream.default_stream( mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), which_sources=('features', )), model=Model(cost), extensions=extensions) main_loop.run()
def train_base_model(self, train_data, test_data, input_dim): x = T.matrix('features') y = T.matrix('targets') mlp, cost, mis_cost = self.create_base_model(x, y, input_dim) cg = ComputationGraph([cost]) inputs = VariableFilter(roles=[INPUT])(cg.variables) cg = apply_dropout(cg, inputs, 0.2) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam(learning_rate=0.001)) data_stream = train_data data_stream_test = test_data monitor = DataStreamMonitoring(variables=[mis_cost], data_stream=data_stream_test, prefix="test") plot_ext = Plot('F1-measure', channels=[['test_MisclassificationRate']], after_batch=True) main_loop = MainLoop(data_stream=data_stream, algorithm=algorithm, extensions=[ monitor, FinishAfter(after_n_epochs=50), Printing(), plot_ext ]) main_loop.run() return mlp
def prepare_opti(cost, test, *args): model = Model(cost) logger.info("Model created") algorithm = GradientDescent(cost=cost, parameters=model.parameters, step_rule=Adam(learning_rate=0.0015), on_unused_sources='ignore') to_monitor = [algorithm.cost] if args: to_monitor.extend(args) extensions = [ FinishAfter(after_n_epochs=nb_epoch), FinishIfNoImprovementAfter(notification_name='loglikelihood_nat', epochs=patience), TrainingDataMonitoring(to_monitor, prefix="train", after_epoch=True), DataStreamMonitoring(to_monitor, test_stream, prefix="test"), Printing(), ProgressBar(), ApplyMask(before_first_epoch=True, after_batch=True), Checkpoint(check, every_n_epochs=save_every), SaveModel(name=path + '/' + 'pixelcnn_{}'.format(dataset), every_n_epochs=save_every), GenerateSamples(every_n_epochs=save_every), #Checkpoint(path+'/'+'exp.log', save_separately=['log'],every_n_epochs=save_every), ] if resume: logger.info("Restoring from previous checkpoint") extensions = [Load(path + '/' + check)] return model, algorithm, extensions
def add_data_stream_monitor(data_stream, prefix): if not use_own_validation: extensions.append( DataStreamMonitoring(variables=theano_vars_to_monitor, data_stream=data_stream, prefix=prefix, before_epoch=False, **self.args.evaluate_every_n))
def run(model_name): running_on_laptop = socket.gethostname() == 'yop' X = tensor.tensor4('image_features', dtype='float32') T = tensor.matrix('targets', dtype='float32') image_border_size = 100 if running_on_laptop: host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss', 'valid_loss_test'], ['valid_error']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('train2') ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def train(cost, error_rate, batch_size=100, num_epochs=150): # Setting Loggesetr timestr = time.strftime("%Y_%m_%d_at_%H_%M") save_path = 'results/memory_' + timestr log_path = os.path.join(save_path, 'log.txt') os.makedirs(save_path) fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) # Training blocks_model = Model(cost) all_params = blocks_model.parameters print "Number of found parameters:" + str(len(all_params)) print all_params training_algorithm = GradientDescent(cost=cost, parameters=all_params, step_rule=Adam(learning_rate=0.001)) # training_algorithm = GradientDescent( # cost=cost, params=all_params, # step_rule=Scale(learning_rate=model.default_lr)) monitored_variables = [cost, error_rate] # the rest is for validation # train_data_stream, valid_data_stream = get_mnist_streams( # 50000, batch_size) train_data_stream, valid_data_stream = get_mnist_video_streams(batch_size) train_monitoring = TrainingDataMonitoring(variables=monitored_variables, prefix="train", after_epoch=True) valid_monitoring = DataStreamMonitoring(variables=monitored_variables, data_stream=valid_data_stream, prefix="valid", after_epoch=True) main_loop = MainLoop( algorithm=training_algorithm, data_stream=train_data_stream, model=blocks_model, extensions=[ train_monitoring, valid_monitoring, FinishAfter(after_n_epochs=num_epochs), SaveParams('valid_misclassificationrate_apply_error_rate', blocks_model, save_path), SaveLog(save_path, after_epoch=True), ProgressBar(), Printing() ]) main_loop.run()
def __init__(self, worker, experiment, config): # Data dataset = CIFAR10('train', flatten=False) test_dataset = CIFAR10('test', flatten=False) batch_size = 128 scheme = ShuffledScheme(dataset.num_examples, batch_size) datastream = DataStream(dataset, iteration_scheme=scheme) test_scheme = ShuffledScheme(test_dataset.num_examples, batch_size) test_stream = DataStream(test_dataset, iteration_scheme=test_scheme) # Model m = ModelHelper(config) def score_func(mainloop): scores = mainloop.log.to_dataframe()["test_accur"].values return np.mean(np.sort(scores)[-4:-1]) # Algorithm cg = ComputationGraph([m.cost]) algorithm = GradientDescent(cost=m.cost, params=cg.parameters, step_rule=AdaM()) #job_name = os.path.basename(worker.running_job) job_name = os.path.basename(".") update_path = (os.path.join(os.path.join(worker.path, "updates"), job_name)) if not os.path.exists(update_path): os.mkdir(update_path) self.main_loop = MainLoop( algorithm, datastream, model=Model(m.cost), extensions=[ Timing(), TrainingDataMonitoring([m.cost, m.accur], prefix="train", after_epoch=True), DataStreamMonitoring([m.cost, m.accur], test_stream, prefix="test"), FinishAfter(after_n_epochs=1), LogToFile(os.path.join(update_path, "log.csv")), Printing(), EpochProgress(dataset.num_examples // batch_size + 1) #, DistributeUpdate(worker, every_n_epochs=1) #, DistributeWhetlabFinish(worker, experiment, score_func) #, Plot('cifar10', #channels=[['train_cost', 'test_cost'], ['train_accur', 'test_accur']]) ])
def monitor_stream(self, stream, prefix, **kwargs): variables = self.monitored_vars + self.get_quantitites_vars() if stream == self.train_stream: monitor = TrainingDataMonitoring(variables, prefix=prefix, **kwargs) else: monitor = DataStreamMonitoring(variables, data_stream=stream, prefix=prefix, **kwargs) self.extensions.insert(0, monitor)
def main(save_to, num_epochs): mlp = MLP([Tanh(), Softmax()], [784, 100, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') probs = mlp.apply(x) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) error_rate = MisclassificationRate().apply(y.flatten(), probs) cg = ComputationGraph([cost]) W1, W2 = VariableFilter(roles=[WEIGHTS])(cg.variables) cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum() cost.name = 'final_cost' mnist_train = MNIST("train") mnist_test = MNIST("test") algorithm = GradientDescent(cost=cost, step_rule=SteepestDescent(learning_rate=0.1)) main_loop = MainLoop( mlp, DataStream(mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, 50)), algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], DataStream(mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_every_epoch=True), SerializeMainLoop(save_to), Plot('MNIST example', channels=[[ 'test_final_cost', 'test_misclassificationrate_apply_error_rate' ], ['train_total_gradient_norm']]), Printing() ]) main_loop.run()
def main(save_to, num_epochs, batch_size): mlp = MLP([Tanh(), Tanh(), Tanh(), Softmax()], [3072, 4096, 1024, 512, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tt.tensor4('features', dtype='float32') y = tt.vector('label', dtype='int32') probs = mlp.apply(x.reshape((-1, 3072))) cost = CategoricalCrossEntropy().apply(y, probs) error_rate = MisclassificationRate().apply(y, probs) cg = ComputationGraph([cost]) ws = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + .00005 * sum(([(w**2).sum() for w in ws])) cost.name = 'final_cost' train_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10', is_train=True) valid_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10', is_train=False) train_stream = train_dataset.get_stream(batch_size) valid_stream = valid_dataset.get_stream(batch_size) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam(learning_rate=0.001)) extensions = [ Timing(), LogExtension('/home/belohlavek/ALI/mlp.log'), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], valid_stream, prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), Checkpoint(save_to), Printing() ] main_loop = MainLoop(algorithm, train_stream, model=Model(cost), extensions=extensions) main_loop.run()
def prepare_opti(cost, test): model = Model(cost) algorithm = GradientDescent(cost=cost, parameters=model.parameters, step_rule=Adam(), on_unused_sources='ignore') extensions = [ FinishAfter(after_n_epochs=nb_epoch), FinishIfNoImprovementAfter(notification_name='test_vae_cost', epochs=patience), TrainingDataMonitoring([algorithm.cost], after_epoch=True), DataStreamMonitoring([algorithm.cost], test, prefix="test"), Printing(), ProgressBar(), #SaveModel(name='vae', after_n_epochs=save_every) ] return model, algorithm, extensions
def train_model(cost, error_rate, train_stream, load_location=None, save_location=None): cost.name = "Cross_entropy" error_rate.name = 'Error_rate' # Define the model model = Model(cost) # Load the parameters from a dumped model if load_location is not None: logger.info('Loading parameters...') model.set_param_values(load_parameter_values(load_location)) cg = ComputationGraph(cost) step_rule = Momentum(learning_rate=0.1, momentum=0.9) algorithm = GradientDescent(cost=cost, step_rule=step_rule, params=cg.parameters) main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=[ # DataStreamMonitoring([cost], test_stream, prefix='test', # after_epoch=False, every_n_epochs=10), DataStreamMonitoring([cost], train_stream, prefix='train', after_epoch=True), Printing(after_epoch=True) ]) main_loop.run() # Save the main loop if save_location is not None: logger.info('Saving the main loop...') dump_manager = MainLoopDumpManager(save_location) dump_manager.dump(main_loop) logger.info('Saved')
def build_extensions_list(self): epochs = self.config['epochs'] save_freq = self.config['save_freq'] original_save = self.config['original_save'] if self.config.has_key('original_save') \ else False tracked_train = self.model.linksholder.filter_and_broadcast_request( 'train', 'TrackingLink') tracked_valid = self.model.linksholder.filter_and_broadcast_request( 'valid', 'TrackingLink') extensions = [ Timing(), Experiment(self.filemanager.exp_name, self.filemanager.local_path, self.filemanager.network_path, self.filemanager.full_dump), FinishAfter(after_n_epochs=epochs), TrainingDataMonitoring(tracked_train, prefix="train", after_epoch=True), DataStreamMonitoring(tracked_valid, self.streams['valid'], prefix="valid", after_epoch=True), #Ipdb(after_batch=True), SaveExperiment(self.model.parameters, original_save=original_save, every_n_epochs=save_freq), ] extensions += [ #LoadExperiment( # parameters, # full_load=False), ProgressBar(), Printing(), ] self.extensions = extensions
def train(self, data_file, output_data_file, n_epochs=0): training_data = dataset.T_H5PYDataset(data_file, which_sets=('train',)) test_data = dataset.T_H5PYDataset(data_file, which_sets=('test',)) session = Session(root_url='http://localhost:5006') if self.MainLoop is None: step_rules = [RMSProp(learning_rate=0.2, decay_rate=0.95), StepClipping(1)] algorithm = GradientDescent(cost=self.Cost, parameters=self.ComputationGraph.parameters, step_rule=CompositeRule(step_rules), on_unused_sources='ignore') train_stream = DataStream.default_stream( training_data, iteration_scheme=SequentialScheme( training_data.num_examples, batch_size=100)) test_stream = DataStream.default_stream( test_data, iteration_scheme=SequentialScheme( test_data.num_examples, batch_size=100)) self.MainLoop = MainLoop( model=Model(self.Cost), data_stream=train_stream, algorithm=algorithm, extensions=[ FinishAfter(after_n_epochs=n_epochs), Printing(), Checkpoint(output_data_file, every_n_epochs=50), TrainingDataMonitoring([self.Cost], after_batch=True, prefix='train'), DataStreamMonitoring([self.Cost], after_batch=True, data_stream=test_stream, prefix='test'), Plot(output_data_file, channels=[['train_cost', 'test_cost']]) ]) self.MainLoop.run()
TrainingDataMonitoring([v for l in m.monitor_vars for v in l], prefix='train', after_epoch=True) ] if config.save_freq is not None and dump_path is not None: extensions += [ SaveLoadParams(path=dump_path, model=model, before_training=False, after_training=True, after_epoch=True) ] if valid_stream is not None and config.valid_freq != -1: extensions += [ DataStreamMonitoring([v for l in m.monitor_vars_valid for v in l], valid_stream, prefix='valid'), ] if plot_avail: plot_channels = [ ['train_' + v.name for v in lt] + ['valid_' + v.name for v in lv] for lt, lv in zip(m.monitor_vars, m.monitor_vars_valid) ] extensions += [ Plot( document='deepmind_qa_' + model_name, channels=plot_channels, # server_url='http://localhost:5006/', # If you need, change this every_n_batches=config.print_freq) ] extensions += [
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter, enc_dim, dec_dim, z_dim, oldmodel): image_size, data_train, data_valid, data_test = datasets.get_data(dataset) train_stream = Flatten( DataStream(data_train, iteration_scheme=SequentialScheme(data_train.num_examples, batch_size))) valid_stream = Flatten( DataStream(data_valid, iteration_scheme=SequentialScheme(data_valid.num_examples, batch_size))) test_stream = Flatten( DataStream(data_test, iteration_scheme=SequentialScheme(data_test.num_examples, batch_size))) if name is None: name = dataset img_height, img_width = image_size x_dim = img_height * img_width rnninits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } if attention != "": read_N, write_N = attention.split(',') read_N = int(read_N) write_N = int(write_N) read_dim = 2 * read_N**2 reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim, width=img_width, height=img_height, N=read_N, **inits) writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim, width=img_width, height=img_height, N=write_N, **inits) attention_tag = "r%d-w%d" % (read_N, write_N) else: read_dim = 2 * x_dim reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits) writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits) attention_tag = "full" #---------------------------------------------------------------------- # Learning rate def lr_tag(value): """ Convert a float into a short tag-usable string representation. E.g.: 0.1 -> 11 0.01 -> 12 0.001 -> 13 0.005 -> 53 """ exp = np.floor(np.log10(value)) leading = ("%e" % value)[0] return "%s%d" % (leading, -exp) lr_str = lr_tag(learning_rate) subdir = time.strftime("%Y%m%d-%H%M%S") + "-" + name longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % ( dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str) pickle_file = subdir + "/" + longname + ".pkl" print("\nRunning experiment %s" % longname) print(" dataset: %s" % dataset) print(" subdirectory: %s" % subdir) print(" learning rate: %g" % learning_rate) print(" attention: %s" % attention) print(" n_iterations: %d" % n_iter) print(" encoder dimension: %d" % enc_dim) print(" z dimension: %d" % z_dim) print(" decoder dimension: %d" % dec_dim) print(" batch size: %d" % batch_size) print(" epochs: %d" % epochs) print() #---------------------------------------------------------------------- encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits) decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits) encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim], name="MLP_enc", **inits) decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim], name="MLP_dec", **inits) q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits) draw = DrawModel(n_iter, reader=reader, encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn, sampler=q_sampler, decoder_mlp=decoder_mlp, decoder_rnn=decoder_rnn, writer=writer) draw.initialize() #------------------------------------------------------------------------ x = tensor.matrix('features') #x_recons = 1. + x x_recons, kl_terms = draw.reconstruct(x) #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100) #x_recons = x_recons[-1,:,:] #samples = draw.sample(100) #x_recons = samples[-1, :, :] #x_recons = samples[-1, :, :] recons_term = BinaryCrossEntropy().apply(x, x_recons) recons_term.name = "recons_term" cost = recons_term + kl_terms.sum(axis=0).mean() cost.name = "nll_bound" #------------------------------------------------------------ cg = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(cg.variables) algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ]) #step_rule=RMSProp(learning_rate), #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95) ) #algorithm.add_updates(scan_updates) #------------------------------------------------------------------------ # Setup monitors monitors = [cost] for t in range(n_iter): kl_term_t = kl_terms[t, :].mean() kl_term_t.name = "kl_term_%d" % t #x_recons_t = T.nnet.sigmoid(c[t,:,:]) #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t) #recons_term_t = recons_term_t.mean() #recons_term_t.name = "recons_term_%d" % t monitors += [kl_term_t] train_monitors = monitors[:] train_monitors += [aggregation.mean(algorithm.total_gradient_norm)] train_monitors += [aggregation.mean(algorithm.total_step_norm)] # Live plotting... plot_channels = [ ["train_nll_bound", "test_nll_bound"], ["train_kl_term_%d" % t for t in range(n_iter)], #["train_recons_term_%d" % t for t in range(n_iter)], ["train_total_gradient_norm", "train_total_step_norm"] ] #------------------------------------------------------------ if not os.path.exists(subdir): os.makedirs(subdir) main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=epochs), TrainingDataMonitoring(train_monitors, prefix="train", after_epoch=True), # DataStreamMonitoring( # monitors, # valid_stream, ## updates=scan_updates, # prefix="valid"), DataStreamMonitoring( monitors, test_stream, # updates=scan_updates, prefix="test"), Checkpoint(name, before_training=False, after_epoch=True, save_separately=['log', 'model']), #Checkpoint(image_size=image_size, save_subdir=subdir, path=pickle_file, before_training=False, after_epoch=True, save_separately=['log', 'model']), Plot(name, channels=plot_channels), ProgressBar(), Printing() ]) if oldmodel is not None: print("Initializing parameters with old model %s" % oldmodel) with open(oldmodel, "rb") as f: oldmodel = pickle.load(f) main_loop.model.set_param_values(oldmodel.get_param_values()) del oldmodel main_loop.run()
def main(save_to, num_epochs, regularization=0.0003, subset=None, num_batches=None, histogram=None, resume=False): batch_size = 500 output_size = 10 convnet = create_lenet_5() layers = convnet.layers x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cost = (CategoricalCrossEntropy().apply(y.flatten(), probs) .copy(name='cost')) components = (ComponentwiseCrossEntropy().apply(y.flatten(), probs) .copy(name='components')) error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) confusion = (ConfusionMatrix().apply(y.flatten(), probs) .copy(name='confusion')) confusion.tag.aggregation_scheme = Sum(confusion) cg = ComputationGraph([cost, error_rate, components]) # Apply regularization to the cost weights = VariableFilter(roles=[WEIGHT])(cg.variables) l2_norm = sum([(W ** 2).sum() for W in weights]) l2_norm.name = 'l2_norm' cost = cost + regularization * l2_norm cost.name = 'cost_with_regularization' if subset: start = 30000 - subset // 2 mnist_train = MNIST(("train",), subset=slice(start, start+subset)) else: mnist_train = MNIST(("train",)) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme( mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test",)) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme( mnist_test.num_examples, batch_size)) # Train with simple SGD algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=AdaDelta(decay_rate=0.99)) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), DataStreamMonitoring( [cost, error_rate, confusion], mnist_test_stream, prefix="test"), TrainingDataMonitoring( [cost, error_rate, l2_norm, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing()] if histogram: attribution = AttributionExtension( components=components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(save_to, True, True)) model = Model(cost) main_loop = MainLoop( algorithm, mnist_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph( batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter( applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter( applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append( rename(gain_matrix.min(), 'min_gain')) primary_observables.append( rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")( cost_cg)[-1] attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! # here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = (train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model(regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise') ) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1]] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')) .add_condition(["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains( labels, cg, recognizer.generator.readout.emitter, data)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] extensions.append(Patience(**patience_conf)) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
predict = convnet.apply(x) cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), predict) #Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ########### ALGORITHM of training############# algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam(learning_rate=0.0005)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), Checkpoint("catsVsDogs256.pkl"), ProgressBar(), Printing() ] #Adding a live plot with the bokeh server extensions.append( Plot('CatsVsDogs_256', channels=[['train_error_rate', 'valid_error_rate'],
n_batches = 200 algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule( [StepClipping(10.0), Adam(lr)])) algorithm.add_updates(extra_updates) train_monitor = TrainingDataMonitoring(variables=monitoring_variables + data_monitoring, every_n_batches=n_batches, prefix="train") valid_monitor = DataStreamMonitoring(monitoring_variables, valid_stream, every_n_batches=n_batches, prefix="valid") extensions = [ ProgressBar(), Timing(every_n_batches=n_batches), train_monitor, valid_monitor, TrackTheBest('valid_nll', every_n_batches=n_batches), Plot(save_dir + "progress/" + experiment_name + ".png", [['train_nll', 'valid_nll'], ['valid_learning_rate']], every_n_batches=n_batches, email=False), Checkpoint(save_dir + "pkl/best_" + experiment_name + ".pkl", use_cpickle=True).add_condition( ['after_batch'], predicate=OnLogRecord('valid_nll_best_so_far')), Printing(every_n_batches=n_batches),
def main(save_to, model, train, test, num_epochs, input_size = (150,150), learning_rate=0.01, batch_size=50, num_batches=None, flatten_stream=False): """ save_to : where to save trained model model : model given in input must be already initialised (works with convnet and mlp) input_size : the shape of the reshaped image in input (before flattening is applied if flatten_stream is True) """ if flatten_stream : x = tensor.matrix('image_features') else : x = tensor.tensor4('image_features') y = tensor.lmatrix('targets') #Data augmentation #insert data augmentation here #Generating stream train_stream = DataStream.default_stream( train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size) ) test_stream = DataStream.default_stream( test, iteration_scheme=ShuffledScheme(test.num_examples, batch_size) ) #Reshaping procedure #Add a crop option in scikitresize so that the image is not deformed #Resize to desired square shape train_stream = ScikitResize(train_stream, input_size, which_sources=('image_features',)) test_stream = ScikitResize(test_stream, input_size, which_sources=('image_features',)) #Flattening the stream if flatten_stream is True: train_stream = Flatten(train_stream, which_sources=('image_features',)) test_stream = Flatten(test_stream, which_sources=('image_features',)) # Apply input to model probs = model.apply(x) #Defining cost and various indices to watch #print(probs) #cost = SquaredError().apply(y.flatten(),probs) cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost') error_rate = MisclassificationRate().apply(y.flatten(), probs).copy( name='error_rate') #Building Computation Graph cg = ComputationGraph([cost, error_rate]) # Train with simple SGD algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=learning_rate)) #Defining extensions extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), TrainingDataMonitoring([cost, error_rate,aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=5), DataStreamMonitoring([cost, error_rate],test_stream,prefix="test", every_n_batches=25), Checkpoint(save_to), ProgressBar(), Printing(every_n_batches=5)] # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. model = Model(cost) main_loop = MainLoop( algorithm, train_stream, model=model, extensions=extensions) main_loop.run()
iteration_scheme=SequentialScheme( data_val.num_examples, batch_size=bs)) data_stream_test = DataStream.default_stream(data_test, iteration_scheme=SequentialScheme( data_test.num_examples, batch_size=bs)) monitor = TrainingDataMonitoring(variables=[cost], prefix='train', every_n_batches=500, after_epoch=True) monitor_val = DataStreamMonitoring(variables=[cost, error], data_stream=data_stream_val, prefix='validation', after_epoch=True) monitor_test = DataStreamMonitoring(variables=[error], data_stream=data_stream_test, prefix='test', after_epoch=True) learning_rate = 0.00008 n_epochs = 100 algorithm = GradientDescent(cost=cost, parameters=cg.parameters, on_unused_sources='ignore', step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate),
def main(mode, save_path, steps, num_batches, load_params): chars = (list(string.ascii_uppercase) + list(range(10)) + [' ', '.', ',', '\'', '"', '!', '?', '<UNK>']) char_to_ind = {char: i for i, char in enumerate(chars)} ind_to_char = {v: k for k, v in char_to_ind.iteritems()} train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'], char_to_ind, bos_token=None, eos_token=None, level='character') valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'], char_to_ind, bos_token=None, eos_token=None, level='character') vocab_size = len(char_to_ind) logger.info('Dictionary size: {}'.format(vocab_size)) if mode == 'continue': continue_training(save_path) return elif mode == "sample": main_loop = load(open(save_path, "rb")) generator = main_loop.model.get_top_bricks()[-1] sample = ComputationGraph(generator.generate( n_steps=steps, batch_size=1, iterate=True)).get_theano_function() states, outputs, costs = [data[:, 0] for data in sample()] print("".join([ind_to_char[s] for s in outputs])) numpy.set_printoptions(precision=3, suppress=True) print("Generation cost:\n{}".format(costs.sum())) freqs = numpy.bincount(outputs).astype(floatX) freqs /= freqs.sum() trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX) for a, b in zip(outputs, outputs[1:]): trans_freqs[a, b] += 1 trans_freqs /= trans_freqs.sum(axis=1)[:, None] return # Experiment configuration batch_size = 20 dim = 650 feedback_dim = 650 valid_stream = valid_dataset.get_example_stream() valid_stream = Batch(valid_stream, iteration_scheme=ConstantScheme(batch_size)) valid_stream = Padding(valid_stream) valid_stream = Mapping(valid_stream, _transpose) # Build the bricks and initialize them transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh()) generator = SequenceGenerator( Readout(readout_dim=vocab_size, source_names=transition.apply.states, emitter=SoftmaxEmitter(name="emitter"), feedback_brick=LookupFeedback( vocab_size, feedback_dim, name='feedback'), name="readout"), transition, weights_init=Uniform(std=0.04), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() transition.push_initialization_config() generator.initialize() # Build the cost computation graph. features = tensor.lmatrix('features') features_mask = tensor.matrix('features_mask') cost_matrix = generator.cost_matrix( features, mask=features_mask) batch_cost = cost_matrix.sum() cost = aggregation.mean( batch_cost, features.shape[1]) cost.name = "sequence_log_likelihood" char_cost = aggregation.mean( batch_cost, features_mask.sum()) char_cost.name = 'character_log_likelihood' ppl = 2 ** (cost / numpy.log(2)) ppl.name = 'ppl' bits_per_char = char_cost / tensor.log(2) bits_per_char.name = 'bits_per_char' length = features.shape[0] length.name = 'length' model = Model(batch_cost) if load_params: params = load_parameter_values(save_path) model.set_parameter_values(params) if mode == "train": # Give an idea of what's going on. logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in Selector(generator).get_parameters().items()], width=120)) train_stream = train_dataset.get_example_stream() train_stream = Mapping(train_stream, _truncate) train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(batch_size)) train_stream = Padding(train_stream) train_stream = Mapping(train_stream, _transpose) parameters = model.get_parameter_dict() maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values()) algorithm = GradientDescent( cost=batch_cost, parameters=parameters.values(), step_rule=CompositeRule([StepClipping(1000.), AdaDelta(epsilon=1e-8) #, Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects) ])) ft = features[:6, 0] ft.name = 'feature_example' observables = [cost, ppl, char_cost, length, bits_per_char] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) track_the_best_bpc = TrackTheBest('valid_bits_per_char') root_path, extension = os.path.splitext(save_path) this_step_monitoring = TrainingDataMonitoring( observables + [ft], prefix="this_step", after_batch=True) average_monitoring = TrainingDataMonitoring( observables + [algorithm.total_step_norm, algorithm.total_gradient_norm], prefix="average", every_n_batches=10) valid_monitoring = DataStreamMonitoring( observables, prefix="valid", every_n_batches=1500, before_training=False, data_stream=valid_stream) main_loop = MainLoop( algorithm=algorithm, data_stream=train_stream, model=model, extensions=[ this_step_monitoring, average_monitoring, valid_monitoring, track_the_best_bpc, Checkpoint(save_path, ), Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_bpc.notification_name), (root_path + "_best" + extension,)), Timing(after_batch=True), Printing(every_n_batches=10), Plot(root_path, [[average_monitoring.record_name(cost), valid_monitoring.record_name(cost)], [average_monitoring.record_name(algorithm.total_step_norm)], [average_monitoring.record_name(algorithm.total_gradient_norm)], [average_monitoring.record_name(ppl), valid_monitoring.record_name(ppl)], [average_monitoring.record_name(char_cost), valid_monitoring.record_name(char_cost)], [average_monitoring.record_name(bits_per_char), valid_monitoring.record_name(bits_per_char)]], every_n_batches=10) ]) main_loop.run() elif mode == 'evaluate': with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f: raw_words = [line.split()[1:-1] for line in f.readlines()] words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>'] for c in w] for w in raw_words] max_word_length = max([len(w) for w in words]) initial_states = tensor.matrix('init_states') cost_matrix_step = generator.cost_matrix(features, mask=features_mask, states=initial_states) cg = ComputationGraph(cost_matrix_step) states = cg.auxiliary_variables[-2] compute_cost = theano.function([features, features_mask, initial_states], [cost_matrix_step.sum(axis=0), states]) cost_matrix = generator.cost_matrix(features, mask=features_mask) initial_cg = ComputationGraph(cost_matrix) initial_states = initial_cg.auxiliary_variables[-2] total_word_cost = 0 num_words = 0 examples = numpy.zeros((max_word_length + 1, len(words)), dtype='int64') all_masks = numpy.zeros((max_word_length + 1, len(words)), dtype=floatX) for i, word in enumerate(words): examples[:len(word), i] = word all_masks[:len(word), i] = 1. single_space = numpy.array([char_to_ind[' ']])[:, None] for batch in valid_stream.get_epoch_iterator(): for example, mask in equizip(batch[0].T, batch[1].T): example = example[:(mask.sum())] spc_inds = list(numpy.where(example == char_to_ind[" "])[0]) state = generator.transition.transition.initial_states_.get_value()[None, :] for i, j in equizip([-1] + spc_inds, spc_inds + [-1]): word = example[(i+1):j, None] word_cost, states = compute_cost( word, numpy.ones_like(word, dtype=floatX), state) state = states[-1] costs = numpy.exp(-compute_cost( examples, all_masks, numpy.tile(state, [examples.shape[1], 1]))[0]) _, space_states = compute_cost( single_space, numpy.ones_like(single_space, dtype=floatX), state) state = space_states[-1] word_prob = numpy.exp(-word_cost) total_word_cost += word_cost + numpy.log(numpy.sum(costs)) num_words += 1 print(word_prob) print(numpy.sum(costs)) print("Average cost", total_word_cost / num_words) print("PPL", numpy.exp(total_word_cost / num_words)) print("Word-level perplexity") print(total_word_cost / num_words) else: assert False
def train(config, save_path, bokeh_name, params, bokeh_server, test_tag, use_load_ext, load_log, fast_start, validation_epochs, validation_batches, per_epochs, per_batches): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) # Build the main brick and initialize all parameters. recognizer = SpeechRecognizer( data.recordings_source, data.labels_source, data.eos_label, data.num_features, data.num_labels, name="recognizer", data_prepend_eos=data.prepend_eos, character_map=data.character_map, **config["net"]) for brick_path, attribute_dict in sorted( config['initialization'].items(), key=lambda (k, v): -k.count('/')): for attribute, value in attribute_dict.items(): brick, = Selector(recognizer).select(brick_path).bricks setattr(brick, attribute, value) brick.push_initialization_config() recognizer.initialize() # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) if params: logger.info("Load parameters from " + params) recognizer.load_params(params) if test_tag: tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__ __stream = data.get_stream("train") __data = next(__stream.get_epoch_iterator(as_dict=True)) recognizer.recordings.tag.test_value = __data[data.recordings_source] recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask'] recognizer.labels.tag.test_value = __data[data.labels_source] recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask'] theano.config.compute_test_value = 'warn' batch_cost = recognizer.get_cost_graph().sum() batch_size = named_copy(recognizer.recordings.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output, = VariableFilter( applications=[r.bottom.apply], name="output")( cost_cg) attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = named_copy(r.recordings.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = named_copy(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = named_copy(attended.shape[0], "max_attended_length") max_num_phonemes = named_copy(r.labels.shape[0], "max_num_phonemes") min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") mean_attended = named_copy(abs(attended).mean(), "mean_attended") mean_bottom_output = named_copy(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask), "weights_penalty") weights_entropy = named_copy(entropy(weights, r.labels_mask), "weights_entropy") mask_density = named_copy(r.labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config['regularization'] regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) regularized_cost = regularized_cg.outputs[0] regularized_weights_penalty = regularized_cg.outputs[1] # Model is weird class, we spend lots of time arguing with Bart # what it should be. However it can already nice things, e.g. # one extract all the parameters from the computation graphs # and give them hierahical names. This help to notice when a # because of some bug a parameter is not in the computation # graph. model = SpeechModel(regularized_cost) params = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, params[key].get_value().shape) for key in sorted(params.keys())], width=120)) # Define the training algorithm. train_conf = config['training'] clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False): logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] algorithm = GradientDescent( cost=regularized_cost + reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2, parameters=params.values(), step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)])) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. observables = regularized_cg.outputs observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in params.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(named_copy(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(named_copy(aggregation.mean( var, recognizer.labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( [observables[0], algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes([cost, weights_entropy, weights_penalty]), data.get_stream("valid"), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=validation_epochs, every_n_batches=validation_batches, after_training=False) extensions.append(validation) recognizer.init_beam_search(10) per = PhonemeErrorRate(recognizer, data.get_dataset("valid")) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=per_epochs, every_n_batches=per_batches, after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_likelihood = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_likelihood, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter(data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']) .add_condition(["after_batch"], _gradient_norm_is_none), # Live plotting: requires launching `bokeh-server` # and allows to see what happens online. Plot(bokeh_name if bokeh_name else os.path.basename(save_path), [# Plot 1: training and validation costs [average_monitoring.record_name(regularized_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]], every_n_batches=10, server_url=bokeh_server), Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_likelihood.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar(), Printing(every_n_batches=1, attribute_filter=PrintingFilterList() )] # Save the config into the status log = TrainingLog() log.status['_config'] = repr(config) main_loop = MainLoop( model=model, log=log, algorithm=algorithm, data_stream=data.get_stream("train"), extensions=extensions) main_loop.run()
mnist_train = MNIST("train") train_stream = Flatten( DataStream.default_stream(dataset=mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 128)), ) # load testing data mnist_test = MNIST("test") test_stream = Flatten( DataStream.default_stream(dataset=mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 1024)), ) # train the model from blocks.model import Model main_loop = MainLoop(model=Model(cost), data_stream=train_stream, algorithm=GradientDescent( cost=cost, params=ComputationGraph(cost).parameters, step_rule=Scale(learning_rate=0.1)), extensions=[ FinishAfter(after_n_epochs=5), DataStreamMonitoring(variables=[cost, error_rate], data_stream=test_stream, prefix="test"), Printing() ]) main_loop.run()
n_batches = 139 * 16 algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule( [StepClipping(10.0), Adam(lr)])) train_monitor = TrainingDataMonitoring(variables=[cost], every_n_batches=n_batches, prefix="train") valid_monitor = DataStreamMonitoring( [cost], valid_stream, after_epoch=True, #before_first_epoch = False, prefix="valid") extensions = extensions = [ Timing(every_n_batches=n_batches), train_monitor, valid_monitor, TrackTheBest('valid_nll', after_epoch=True), Plot(save_dir + experiment_name + ".png", [['train_nll', 'valid_nll']], every_n_batches=4 * n_batches, email=True), Checkpoint(save_dir + experiment_name + ".pkl", use_cpickle=True, every_n_batches=n_batches * 8, after_epoch=True), Checkpoint(save_dir + "best_" + experiment_name + ".pkl", after_epoch=True,
inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables) cg = apply_dropout(cg, inputs, dropout) cost = cg.outputs[0] # Learning algorithm step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate), StepClipping(step_clipping)] algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule(step_rules)) # Extensions gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) monitored_vars = [cost, gradient_norm, step_norm] dev_monitor = DataStreamMonitoring(variables=[cost], after_batch=True, before_first_epoch=True, data_stream=dev_stream, prefix="dev") train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True, before_first_epoch=True, prefix='train') plotter = Plot('RNN char-level prediction', channels=[[train_monitor.record_name(cost)], [dev_monitor.record_name(cost)]], server_url="http://bart4.iro.umontreal.ca:5006", after_batch=True) if start_from_checkpoint == True: extensions = [dev_monitor, train_monitor, Timing(), Printing(after_batch=True), FinishAfter(after_n_epochs=num_epochs), saveload.Load(last_path, load_log=True), plotter, saveload.Checkpoint(last_path, save_separately=['log']),