def test_gradient_clipping(self):
    """
    Create a known gradient and check whether it is being clipped
    correctly.
    """
    mlp = MLP(layers=[Linear(dim=1, irange=0, layer_name='linear')],
              nvis=1)
    W, b = mlp.layers[0].get_params()
    W.set_value([[10]])

    X = mlp.get_input_space().make_theano_batch()
    y = mlp.get_output_space().make_theano_batch()

    cost = Default()
    gradients, _ = cost.get_gradients(mlp, (X, y))

    clipped_cost = GradientClipping(20, Default())
    clipped_gradients, _ = clipped_cost.get_gradients(mlp, (X, y))

    # The MLP defines f(x) = (x W)^2, with df/dW = 2 W x^2
    f = function([X, y],
                 [gradients[W].sum(), clipped_gradients[W].sum()],
                 allow_input_downcast=True)

    # df/dW = df/db = 20 for W = 10, x = 1, so the norm is 20 * sqrt(2)
    # and the gradients should be clipped to 20 / sqrt(2)
    np.testing.assert_allclose(f([[1]], [[0]]),
                               [20, 20 / np.sqrt(2)])
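# Worked check of the arithmetic the test above relies on (illustrative
# numpy sketch, not pylearn2 API): norm-based clipping rescales every
# gradient by clip_norm / ||g|| whenever the joint norm ||g|| exceeds
# clip_norm. Here dcost/dW = dcost/db = 20, so ||g|| = 20 * sqrt(2) > 20.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    scale = min(1.0, clip_norm / norm)
    return [g * scale for g in grads]

clipped = clip_by_global_norm([np.array(20.0), np.array(20.0)], 20.0)
assert np.allclose(clipped, 20 / np.sqrt(2))  # = 14.142..., as the test expects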
def __init__(self, layers, batch_size=None, input_space=None,
             input_source='features', target_source='targets',
             nvis=None, seed=None, layer_name=None,
             monitor_targets=True, dataset_adaptor=VectorDataset(),
             trainer=SGDTrainer(cost=Default()), **kwargs):
    # NB: `dataset_adaptor` and `trainer` use mutable default instances,
    # which Python evaluates once and shares across all constructions
    # that do not pass their own.
    # Keep the constructor arguments around so the wrapped model can be
    # built from this dict later.
    self.configs = {
        'layers': layers,
        'batch_size': batch_size,
        'input_space': input_space,
        'input_source': input_source,
        'target_source': target_source,
        'nvis': nvis,
        'seed': seed,
        'layer_name': layer_name,
        'monitor_targets': monitor_targets,
        # 'kwargs': kwargs,
    }
    self.dataset_adaptor = dataset_adaptor
    self.trainer = trainer
def set_training_criteria(self, learning_rate=0.05, cost=Default(),
                          batch_size=10, max_epochs=10):
    self.training_alg = SGD(learning_rate=learning_rate,
                            cost=cost,
                            batch_size=batch_size,
                            monitoring_dataset=self.datasets,
                            termination_criterion=EpochCounter(max_epochs))
def test_gradient(self):
    """
    Test that the gradient of the default cost can be calculated
    through the recurrent layers.
    """
    rnn = RNN(input_space=SequenceSpace(VectorSpace(dim=1)),
              layers=[Recurrent(dim=2, layer_name='recurrent',
                                irange=0, nonlinearity=lambda x: x),
                      Linear(dim=1, layer_name='linear', irange=0)])

    X_data, X_mask = rnn.get_input_space().make_theano_batch()
    y_data, y_mask = rnn.get_output_space().make_theano_batch()

    default_cost = Default()
    cost = default_cost.expr(rnn, ((X_data, X_mask), (y_data, y_mask)))

    tensor.grad(cost, rnn.get_params(), disconnected_inputs='ignore')
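# A possible follow-up to the test above (a sketch continuing with the names
# defined there, assuming the pylearn2 RNN sandbox layout of SequenceSpace
# batches as (time, batch, dim) with a (time, batch) mask): compile the
# symbolic gradients and evaluate them on a dummy one-step sequence.
from theano import function
import numpy as np

grads = tensor.grad(cost, rnn.get_params(), disconnected_inputs='ignore')
f = function([X_data, X_mask, y_data, y_mask], grads,
             allow_input_downcast=True)
f(np.zeros((1, 1, 1)), np.ones((1, 1)),
  np.zeros((1, 1, 1)), np.ones((1, 1)))  # all-zero grads here, since irange=0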
# Imports used by this snippet (pylearn2 module paths):
from pylearn2.costs.mlp import Default
from pylearn2.termination_criteria import MonitorBased
from pylearn2.train import Train
from pylearn2.training_algorithms.sgd import SGD


def get_layer_trainer_logistic(layer, trainset, validset):
    # SGD configuration: stop once valid misclassification has not
    # improved for 10 epochs.
    config = {'learning_rate': 0.1,
              'cost': Default(),
              'batch_size': 150,
              'monitoring_dataset': validset,
              'termination_criterion': MonitorBased(
                  channel_name='y_misclass', N=10, prop_decrease=0),
              'update_callbacks': None}

    train_algo = SGD(**config)
    model = layer
    return Train(model=model,
                 dataset=trainset,
                 algorithm=train_algo,
                 extensions=None)
# Imports used by this snippet (pylearn2 module paths);
# MAX_EPOCHS_SUPERVISED is a module-level constant defined elsewhere
# in the original script:
from pylearn2.costs.mlp import Default
from pylearn2.termination_criteria import EpochCounter
from pylearn2.train import Train
from pylearn2.training_algorithms.sgd import SGD


def get_layer_trainer_logistic(layer, trainset):
    # SGD configuration: fixed number of epochs, monitored on the
    # training set itself.
    config = {'learning_rate': 0.1,
              'cost': Default(),
              'batch_size': 10,
              'monitoring_batches': 10,
              'monitoring_dataset': trainset,
              'termination_criterion': EpochCounter(
                  max_epochs=MAX_EPOCHS_SUPERVISED),
              'update_callbacks': None}

    train_algo = SGD(**config)
    model = layer
    return Train(model=model,
                 dataset=trainset,
                 algorithm=train_algo,
                 extensions=None)
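# Typical driver for helpers like the two above (hypothetical `softmax_layer`
# and `trainset` objects): greedy layerwise scripts build one Train object
# per layer and run their main loops in order.
layer_trainers = [get_layer_trainer_logistic(softmax_layer, trainset)]
for t in layer_trainers:
    t.main_loop()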
# Imports used by this snippet (pylearn2 module paths); create_dataset()
# is a helper defined elsewhere in the test module:
import numpy as np
from pylearn2.costs.cost import SumOfCosts
from pylearn2.costs.mlp import Default, L1WeightDecay
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from pylearn2.models.mlp import MLP, Sigmoid
from pylearn2.termination_criteria import And, EpochCounter, MonitorBased
from pylearn2.train import Train
from pylearn2.training_algorithms.sgd import SGD, ExponentialDecay


def test_correctness():
    """
    Test that the cost function works with float64.
    """
    x_train, y_train, x_valid, y_valid = create_dataset()

    trainset = DenseDesignMatrix(X=np.array(x_train), y=y_train)
    validset = DenseDesignMatrix(X=np.array(x_valid), y=y_valid)

    n_inputs = trainset.X.shape[1]
    n_outputs = 1
    n_hidden = 10

    # Glorot-style initial weight scales for sigmoid layers.
    hidden_istdev = 4 * (6 / float(n_inputs + n_hidden)) ** 0.5
    output_istdev = 4 * (6 / float(n_hidden + n_outputs)) ** 0.5

    model = MLP(layers=[Sigmoid(dim=n_hidden, layer_name='hidden',
                                istdev=hidden_istdev),
                        Sigmoid(dim=n_outputs, layer_name='output',
                                istdev=output_istdev)],
                nvis=n_inputs, seed=[2013, 9, 16])

    termination_criterion = And([EpochCounter(max_epochs=1),
                                 MonitorBased(prop_decrease=1e-7, N=2)])

    # Weighted sum of the default (supervised) cost and an L1 penalty.
    cost = SumOfCosts([(0.99, Default()),
                       (0.01, L1WeightDecay({}))])

    algo = SGD(1e-1,
               update_callbacks=[ExponentialDecay(decay_factor=1.00001,
                                                  min_lr=1e-10)],
               cost=cost,
               monitoring_dataset=validset,
               termination_criterion=termination_criterion,
               monitor_iteration_mode='even_shuffled_sequential',
               batch_size=2)

    train = Train(model=model, dataset=trainset, algorithm=algo)
    train.main_loop()
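# How SumOfCosts combines its terms (illustrative sketch, not pylearn2
# internals): given (coefficient, cost) pairs it returns the weighted sum of
# the individual cost expressions. Plain-Python analogue:
def sum_of_costs(pairs):
    return sum(coeff * term for coeff, term in pairs)

assert sum_of_costs([(0.99, 2.0), (0.01, 5.0)]) == 0.99 * 2.0 + 0.01 * 5.0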
def get_default_cost(self):
    return Default()
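# get_default_cost() is the hook pylearn2 training algorithms fall back on:
# constructing SGD with no cost makes its setup() call
# model.get_default_cost(), which for this model yields Default(). Minimal
# sketch (hypothetical `model` and `dataset` objects; imports as in the
# other snippets):
algo = SGD(learning_rate=0.1, batch_size=10,
           monitoring_dataset=dataset,
           termination_criterion=EpochCounter(max_epochs=5))
algo.setup(model, dataset)   # cost is None, so Default() is picked up here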
# Imports used by this snippet (pylearn2 module paths):
from pylearn2.costs.mlp import Default
from pylearn2.models.mlp import MLP, Softmax
from pylearn2.termination_criteria import MonitorBased
from pylearn2.train import Train
from pylearn2.training_algorithms.sgd import SGD


def get_layer_MLP(layers, trainset, validset):
    # processor = Standardize()

    # trainset = BlackBoxDataset(which_set='train',
    #                            start=0,
    #                            stop=900,
    #                            preprocessor=Standardize(),
    #                            fit_preprocessor=True,
    #                            fit_test_preprocessor=True)
    #
    # validset = BlackBoxDataset(which_set='train',
    #                            start=900,
    #                            stop=1000,
    #                            preprocessor=Standardize(),
    #                            fit_preprocessor=True,
    #                            fit_test_preprocessor=False)

    dropCfg = {'input_include_probs': {'h0': .8},
               'input_scales': {'h0': 1.}}

    config = {'learning_rate': .1,
              'init_momentum': .5,
              'cost': Default(),  # or Dropout(**dropCfg)
              'monitoring_dataset': {'train': trainset,
                                     'valid': validset},
              'termination_criterion': MonitorBased(
                  channel_name='valid_y_misclass', N=10, prop_decrease=0),
              'update_callbacks': None}

    # configCfg0 = {'layer_name': 'h0',
    #               'dim': 1875,
    #               'irange': .05,
    #               # Rather than using weight decay, we constrain the
    #               # norms of the weight vectors
    #               'max_col_norm': 1.}
    #
    # configCfg1 = {'layer_name': 'h1',
    #               'dim': 1875,
    #               'irange': .05,
    #               # Rather than using weight decay, we constrain the
    #               # norms of the weight vectors
    #               'max_col_norm': 1.}

    sftmaxCfg = {'layer_name': 'y',
                 'init_bias_target_marginals': trainset,
                 # Initialize the weights to all 0s
                 'irange': .0,
                 'n_classes': 9}

    layers.append(Softmax(**sftmaxCfg))

    train_algo = SGD(**config)
    model = MLP(batch_size=10, layers=layers, nvis=1875)
    return Train(model=model,
                 dataset=trainset,
                 algorithm=train_algo,
                 extensions=None,
                 # extensions=[LinearDecayOverEpoch(start=5, saturate=100,
                 #                                  decay_factor=.01)],
                 save_path="best_dbn_model.pkl",
                 save_freq=100)
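# To train the same network with dropout instead of the plain default cost,
# swap the cost entry in `config` above; Dropout lives in
# pylearn2.costs.mlp.dropout and takes the per-layer probabilities directly:
from pylearn2.costs.mlp.dropout import Dropout

config['cost'] = Dropout(input_include_probs={'h0': .8},
                         input_scales={'h0': 1.})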
# Imports used by this snippet (pylearn2 module paths):
from pylearn2.costs.cost import SumOfCosts
from pylearn2.costs.mlp import Default, WeightDecay
from pylearn2.datasets.hdf5 import HDF5Dataset
from pylearn2.models import mlp
from pylearn2.monitor import push_monitor
from pylearn2.space import Conv2DSpace
from pylearn2.termination_criteria import EpochCounter
from pylearn2.train import Train
from pylearn2.train_extensions import best_params
from pylearn2.training_algorithms import learning_rule, sgd
from pylearn2.utils import serial


def supervisedLayerwisePRL(trainset, testset):
    '''
    The supervised layerwise training as used in the PRL paper.

    Parameters
    ----------
    trainset : str
        Path to an hdf5 file created through h5py.
    testset : str
        Path to an hdf5 file created through h5py.
    '''
    batch_size = 100

    # Both train and test h5py files are expected to contain a 'topo_view'
    # and a 'y' dataset, where 'topo_view' holds the data in the 'b01c'
    # format used by pylearn2 and 'y' holds the one-hot encoded labels.
    trn = HDF5Dataset(filename=trainset, topo_view='topo_view', y='y',
                      load_all=False)
    tst = HDF5Dataset(filename=testset, topo_view='topo_view', y='y',
                      load_all=False)
    # NOTE: the monitoring dicts below also reference `vld`, a validation
    # HDF5Dataset that this snippet never constructs; it is assumed to be
    # built the same way as `tst`.

    # The 1st convolution and pooling layers are added below.
    h1 = mlp.ConvRectifiedLinear(layer_name='h1',
                                 output_channels=64,
                                 irange=0.05,
                                 kernel_shape=[4, 4],
                                 pool_shape=[4, 4],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    fc = mlp.RectifiedLinear(layer_name='fc', dim=1500, irange=0.05)
    output = mlp.Softmax(layer_name='y',
                         n_classes=171,
                         irange=.005,
                         max_col_norm=1.9365)

    layers = [h1, fc, output]
    mdl = mlp.MLP(layers,
                  input_space=Conv2DSpace(shape=(70, 70), num_channels=1))

    trainer = sgd.SGD(
        learning_rate=0.002,
        batch_size=batch_size,
        learning_rule=learning_rule.RMSProp(),
        # One weight-decay coefficient per layer: h1, fc, y.
        cost=SumOfCosts(costs=[Default(),
                               WeightDecay(coeffs=[0.0005, 0.0005, 0.0005])]),
        train_iteration_mode='shuffled_sequential',
        monitor_iteration_mode='sequential',
        termination_criterion=EpochCounter(max_epochs=15),
        monitoring_dataset={'test': tst, 'valid': vld})

    watcher = best_params.MonitorBasedSaveBest(
        channel_name='valid_y_misclass',
        save_path='./Saved Models/conv_supervised_layerwise_best1.pkl')
    decay = sgd.LinearDecayOverEpoch(start=8, saturate=15, decay_factor=0.1)

    experiment = Train(dataset=trn,
                       model=mdl,
                       algorithm=trainer,
                       extensions=[watcher, decay])
    experiment.main_loop()

    del mdl
    mdl = serial.load('./Saved Models/conv_supervised_layerwise_best1.pkl')
    mdl = push_monitor(mdl, 'k')

    # The 2nd convolution and pooling layers are added below.
    h2 = mlp.ConvRectifiedLinear(layer_name='h2',
                                 output_channels=64,
                                 irange=0.05,
                                 kernel_shape=[4, 4],
                                 pool_shape=[4, 4],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    fc = mlp.RectifiedLinear(layer_name='fc', dim=1500, irange=0.05)
    output = mlp.Softmax(layer_name='y',
                         n_classes=171,
                         irange=.005,
                         max_col_norm=1.9365)

    # Drop the softmax and fully connected layers, then grow the network.
    del mdl.layers[-1]
    mdl.layer_names.remove('y')
    del mdl.layers[-1]
    mdl.layer_names.remove('fc')
    mdl.add_layers([h2, fc, output])

    trainer = sgd.SGD(
        learning_rate=0.002,
        batch_size=batch_size,
        learning_rule=learning_rule.RMSProp(),
        # One coefficient per layer: h1, h2, fc, y.
        cost=SumOfCosts(costs=[Default(),
                               WeightDecay(coeffs=[0.0005, 0.0005,
                                                   0.0005, 0.0005])]),
        train_iteration_mode='shuffled_sequential',
        monitor_iteration_mode='sequential',
        termination_criterion=EpochCounter(max_epochs=15),
        monitoring_dataset={'test': tst, 'valid': vld})

    watcher = best_params.MonitorBasedSaveBest(
        channel_name='valid_y_misclass',
        save_path='./Saved Models/conv_supervised_layerwise_best2.pkl')
    decay = sgd.LinearDecayOverEpoch(start=8, saturate=15, decay_factor=0.1)

    experiment = Train(dataset=trn,
                       model=mdl,
                       algorithm=trainer,
                       extensions=[watcher, decay])
    experiment.main_loop()

    del mdl
    mdl = serial.load('./Saved Models/conv_supervised_layerwise_best2.pkl')
    mdl = push_monitor(mdl, 'l')

    # The 3rd convolution and pooling layers are added below.
    h3 = mlp.ConvRectifiedLinear(layer_name='h3',
                                 output_channels=64,
                                 irange=0.05,
                                 kernel_shape=[4, 4],
                                 pool_shape=[4, 4],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    output = mlp.Softmax(layer_name='y',
                         n_classes=171,
                         irange=.005,
                         max_col_norm=1.9365)

    del mdl.layers[-1]
    mdl.layer_names.remove('y')
    del mdl.layers[-1]
    mdl.layer_names.remove('fc')
    mdl.add_layers([h3, output])

    trainer = sgd.SGD(
        learning_rate=.002,
        batch_size=batch_size,
        learning_rule=learning_rule.RMSProp(),
        # One coefficient per layer: h1, h2, h3, y.
        cost=SumOfCosts(costs=[Default(),
                               WeightDecay(coeffs=[0.0005, 0.0005,
                                                   0.0005, 0.0005])]),
        train_iteration_mode='shuffled_sequential',
        monitor_iteration_mode='sequential',
        termination_criterion=EpochCounter(max_epochs=15),
        monitoring_dataset={'test': tst, 'valid': vld})

    watcher = best_params.MonitorBasedSaveBest(
        channel_name='valid_y_misclass',
        save_path='./Saved Models/conv_supervised_layerwise_best3.pkl')
    decay = sgd.LinearDecayOverEpoch(start=8, saturate=15, decay_factor=0.1)

    experiment = Train(dataset=trn,
                       model=mdl,
                       algorithm=trainer,
                       extensions=[watcher, decay])
    experiment.main_loop()
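# Why push_monitor appears between the stages above: a pylearn2 Monitor
# cannot be reused across Train runs, so after reloading the best model the
# existing monitor is stashed on the model under the given attribute name
# and model.monitor is cleared, letting the next Train attach a fresh one.
# Sketch (hypothetical `mdl` loaded from disk):
mdl = push_monitor(mdl, 'k')   # the old monitor is now available as mdl.k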