def run_algorithm():
    """
    Train SGD once with the iteration mode held in the free name `mode`.

    `learning_rate`, `cost`, `batch_size`, `mode`, `termination_criterion`,
    `dataset` and `model` are not defined here; they are resolved from the
    surrounding scope.

    Returns
    -------
    bool
        True if `mode` is one of the unsupported modes (training is then
        required to have raised ValueError), False otherwise.
    """
    unsupported_modes = ['random_slice', 'random_uniform']

    sgd_options = dict(batch_size=batch_size,
                       train_iteration_mode=mode,
                       monitoring_dataset=None,
                       termination_criterion=termination_criterion,
                       update_callbacks=None,
                       init_momentum=None,
                       set_batch_size=False)
    algorithm = SGD(learning_rate, cost, **sgd_options)
    algorithm.setup(dataset=dataset, model=model)

    try:
        algorithm.train(dataset)
    except ValueError:
        # A ValueError is only acceptable for an unsupported mode
        print(mode)
        assert mode in unsupported_modes
        rejected = True
    else:
        rejected = False

    if mode not in unsupported_modes:
        return False
    # An unsupported mode must not train silently
    assert rejected
    return True
def test_sgd_unspec_num_mon_batch():
    """
    Check that, when no number of monitoring batches is specified, SGD
    configures the monitor to run on all the data.
    """
    num_examples = 25
    seen = [False] * num_examples

    # Single-column design matrix whose entries are the example indices,
    # so that the tracker can tell which examples it was shown.
    design = np.zeros((num_examples, 1))
    design[:, 0] = np.arange(num_examples)
    dataset = DenseDesignMatrix(X=design)

    model = SoftmaxModel(1)
    learning_rate = 1e-3
    batch_size = 5
    cost = DummyCost()

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=batch_size,
                    monitoring_batches=None,
                    monitoring_dataset=dataset,
                    termination_criterion=None,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)
    algorithm.setup(dataset=dataset, model=model)

    monitor = Monitor.get_monitor(model)
    symbolic_input = T.matrix()

    def tracker(*data):
        # Registered as a channel prereq; marks every example it receives
        batch, = data
        assert batch.shape[1] == 1
        for value in batch[:, 0]:
            seen[int(value)] = True

    input_specs = (model.get_input_space(), model.get_input_source())
    monitor.add_channel(name='tracker',
                        ipt=symbolic_input,
                        val=0.,
                        prereqs=[tracker],
                        data_specs=input_specs)
    monitor()

    if not all(seen):
        print(seen)
        assert False
def test_sgd_unspec_num_mon_batch():
    """
    Leaving `monitoring_batches` unspecified must make SGD set up the
    monitor so that it runs on all of the monitoring data.
    """
    m = 25
    visited = [False] * m

    # Column 0 of example i holds the value i
    features = np.zeros((m, 1))
    features[:, 0] = np.arange(m)
    dataset = DenseDesignMatrix(X=features)
    model = SoftmaxModel(1)

    learning_rate = 1e-3
    cost = DummyCost()
    algorithm = SGD(learning_rate, cost,
                    batch_size=5,
                    monitoring_batches=None,
                    monitoring_dataset=dataset,
                    termination_criterion=None,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)
    algorithm.setup(dataset=dataset, model=model)

    monitor = Monitor.get_monitor(model)
    ipt = T.matrix()

    def tracker(*data):
        # Record each example index found in the monitored batch
        batch, = data
        assert batch.shape[1] == 1
        for row in xrange(batch.shape[0]):
            visited[int(batch[row, 0])] = True

    monitor.add_channel(
        name='tracker',
        ipt=ipt,
        val=0.,
        prereqs=[tracker],
        data_specs=(model.get_input_space(), model.get_input_source()))
    monitor()

    every_example_seen = False not in visited
    if not every_example_seen:
        print(visited)
        assert False
def test_rmsprop():
    """
    Compare learning_rule.RMSProp against a hand-written RMSProp step on a
    dummy model that has one learning rate scaler per parameter.
    """
    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    learning_rate = .001
    decay = 0.90
    max_scaling = 1e5

    # A cost other than SumOfParams is included so that data is actually
    # queried from the training set and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=RMSProp(decay),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Running average of squared gradients, one array per parameter
    state = dict((param, {'g2': np.zeros(param.get_value().shape)})
                 for param in model.get_params())

    def rmsprop_manual(model, state):
        # The parameter value stands in for its gradient in this reference
        epsilon = 1. / max_scaling
        expected = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
            dx_t = - scale * learning_rate / rms_g_t * param_val
            expected.append(param_val + dx_t)
        return expected

    manual = rmsprop_manual(model, state)
    sgd.train(dataset=dataset)
    for manual_param, sgd_param in izip(manual, model.get_params()):
        assert np.allclose(manual_param, sgd_param.get_value())
def test_sgd_sequential():
    """
    Check that requesting train_iteration_mode = 'sequential' works.
    """
    dim = 1
    # The dataset size is derived from a batch size of 3, while training
    # itself uses batches of 5 (as in the code being restyled).
    m = 5 * 3
    batch_size = 5
    learning_rate = 1e-3

    dataset = ArangeDataset(m)
    model = SoftmaxModel(dim)
    visited = [False] * m

    def visit(X):
        # Each batch must be consecutive integers, start right after the
        # previously visited example, and contain no repeats.
        assert X.shape[1] == 1
        assert np.all(X[1:] == X[0:-1]+1)
        first = int(X[0, 0])
        if first > 0:
            assert visited[first - 1]
        for index in xrange(first, first + batch_size):
            assert not visited[index]
            visited[index] = 1

    data_specs = (model.get_input_space(), model.get_input_source())
    cost = CallbackCost(visit, data_specs)

    # Needed so the test actually stops running at some point
    termination_criterion = EpochCounter(5)

    sgd_options = dict(batch_size=batch_size,
                       train_iteration_mode='sequential',
                       monitoring_dataset=None,
                       termination_criterion=termination_criterion,
                       update_callbacks=None,
                       init_momentum=None,
                       set_batch_size=False)
    algorithm = SGD(learning_rate, cost, **sgd_options)
    algorithm.setup(dataset=dataset, model=model)
    algorithm.train(dataset)

    assert all(visited)
def test_lr_scalers():
    """
    Check that SGD respects Model.get_lr_scalers.
    """
    # A cost other than SumOfParams is included so that data is actually
    # queried from the training set and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def __call__(self, X):
            # Present only so that DummyCost works
            return X

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(.0),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def expected_after_step(values):
        # One scaled learning-rate step subtracted from every parameter
        return [value - learning_rate * scale
                for value, scale in zip(values, scales)]

    manual = [param.get_value() for param in model.get_params()]

    # Verify two consecutive training calls
    for _ in range(2):
        manual = expected_after_step(manual)
        sgd.train(dataset=dataset)
        for manual_param, sgd_param in zip(manual, model.get_params()):
            assert np.allclose(manual_param, sgd_param.get_value())
def test_adadelta():
    """
    Compare learning_rule.AdaDelta against a hand-written AdaDelta step on a
    dummy model that has one learning rate scaler per parameter.

    Reference: "AdaDelta: An Adaptive Learning Rate Method",
    Matthew D. Zeiler.

    `shapes`, `scales` and `learning_rate` are not defined here; they are
    resolved from the surrounding scope.
    """
    # A cost other than SumOfParams is included so that data is actually
    # queried from the training set and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Per-parameter running averages of squared gradients and updates
    state = {}
    for param in model.get_params():
        zeros_shape = param.get_value().shape
        state[param] = {'g2': np.zeros(zeros_shape),
                        'dx2': np.zeros(zeros_shape)}

    def adadelta_manual(model, state):
        # The parameter value stands in for its gradient in this reference
        expected = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t**2
            expected.append(param_val + dx_t)
        return expected

    # Verify two consecutive updates; the reference step is always computed
    # from the parameter values before SGD modifies them.
    for _ in range(2):
        manual = adadelta_manual(model, state)
        sgd.train(dataset=dataset)
        for manual_param, sgd_param in izip(manual, model.get_params()):
            assert np.allclose(manual_param, sgd_param.get_value())
def test_train_ae():
    """
    Build a two-layer GSN, train it with SGD through a Train object and
    save it to "gsn_ae_example.pkl".

    `ds`, `WALKBACK`, `LEARNING_RATE`, `MOMENTUM`, `MAX_EPOCHS`,
    `BATCHES_PER_EPOCH` and `BATCH_SIZE` are resolved from the surrounding
    scope.
    """
    GC = GaussianCorruptor
    num_visible = ds.X.shape[1]

    gsn = GSN.new(
        layer_sizes=[num_visible, 1000],
        activation_funcs=["sigmoid", "tanh"],
        pre_corruptors=[None, GC(1.0)],
        post_corruptors=[SaltPepperCorruptor(0.5), GC(1.0)],
        layer_samplers=[BinomialSampler(), None],
        tied=False
    )

    # Divide the MBCE by the number of input columns instead of using it raw
    _mbce = MeanBinaryCrossEntropy()

    def reconstruction_cost(a, b):
        return _mbce.cost(a, b) / ds.X.shape[1]

    c = GSNCost([(0, 1.0, reconstruction_cost)], walkback=WALKBACK)

    alg = SGD(
        LEARNING_RATE,
        init_momentum=MOMENTUM,
        cost=c,
        termination_criterion=EpochCounter(MAX_EPOCHS),
        batches_per_iter=BATCHES_PER_EPOCH,
        batch_size=BATCH_SIZE,
        monitoring_dataset=ds,
        monitoring_batches=10
    )

    trainer = Train(ds, gsn, algorithm=alg,
                    save_path="gsn_ae_example.pkl", save_freq=5)
    trainer.main_loop()
    print("done training")
def get_ae_pretrainer(layer, data, batch_size, epochs=30):
    """
    Build a Train object that fits `layer` on `data` with momentum SGD.

    Parameters
    ----------
    layer : object
        Passed to Train as the model.
    data : object
        Used both as training dataset and as monitoring dataset.
    batch_size : int
        Passed to SGD as `batch_size` and also as `monitoring_batches`.
    epochs : int, optional
        Argument given to the EpochCounter termination criterion.

    Returns
    -------
    Train
    """
    init_lr = 0.05

    # Cost for a DenoisingAutoencoder. Alternatives noted by the author:
    # for ContractiveAutoencoder:
    #   cost.SumOfCosts(costs=[[1., MeanSquaredReconstructionError()],
    #       [0.5, cost.MethodCost(method='contraction_penalty')]])
    # for HigherOrderContractiveAutoencoder:
    #   cost.SumOfCosts(costs=[[1., MeanSquaredReconstructionError()],
    #       [0.5, cost.MethodCost(method='contraction_penalty')],
    #       [0.5, cost.MethodCost(method='higher_order_penalty')]])
    learning_rule = Momentum(init_momentum=0.5)
    reconstruction_cost = MeanSquaredReconstructionError()
    stop_after = EpochCounter(epochs)

    train_algo = SGD(batch_size=batch_size,
                     learning_rate=init_lr,
                     learning_rule=learning_rule,
                     monitoring_batches=batch_size,
                     monitoring_dataset=data,
                     cost=reconstruction_cost,
                     termination_criterion=stop_after)

    schedule = [
        MomentumAdjustor(final_momentum=0.9, start=0, saturate=25),
        LinearDecayOverEpoch(start=1, saturate=25, decay_factor=.02),
    ]
    return Train(model=layer, algorithm=train_algo, dataset=data,
                 extensions=schedule)
def test_execution_order():
    """
    Ensure save is called directly after monitoring by checking parameter
    values in `on_monitor` and `on_save`.
    """
    output_layer = Softmax(layer_name='y', n_classes=2, irange=0.)
    model = MLP(layers=[output_layer], nvis=3)

    # Inputs are drawn before targets, from the global numpy RNG
    inputs = np.random.normal(size=(6, 3))
    targets = np.random.normal(size=(6, 2))
    dataset = DenseDesignMatrix(X=inputs, y=targets)

    epoch_counter = EpochCounter(max_epochs=1)
    algorithm = SGD(batch_size=2, learning_rate=0.1,
                    termination_criterion=epoch_counter)
    extension = ParamMonitor()

    train = Train(dataset=dataset,
                  model=model,
                  algorithm=algorithm,
                  extensions=[extension],
                  save_freq=1,
                  save_path="save.pkl")

    # Replace the real save with a mock bound to this Train instance
    train.save = MethodType(only_run_extensions, train)
    train.main_loop()
def get_train_sgd(self, config_id): row = self.db.executeSQL( """ SELECT learning_rate,batch_size,init_momentum, train_iteration_mode,cost_array,term_array FROM hps3.train_sgd WHERE config_id = %s """, (config_id, ), self.db.FETCH_ONE) if not row or row is None: raise HPSData("No stochasticGradientDescent for config_id="\ +str(config_id)) (learning_rate, batch_size, init_momentum, train_iteration_mode, cost_array, term_array) = row # cost cost = self.get_costs(cost_array) num_train_batch = (self.ntrain / self.batch_size) print "num training batches:", num_train_batch termination_criterion \ = self.get_terminations(config_id, term_array) return SGD(learning_rate=learning_rate, cost=cost, batch_size=batch_size, batches_per_iter=num_train_batch, monitoring_dataset=self.monitoring_dataset, termination_criterion=termination_criterion, init_momentum=init_momentum, train_iteration_mode=train_iteration_mode)
def train_model():
    """
    Load the simulation data, stack one autoencoder per entry of
    `get_structure()` into a DeepComposedAutoencoder and train it with SGD,
    saving to 'training2.pkl'.
    """
    global ninput, noutput

    simdata = SimulationData(
        sim_path="../../javaDataCenter/generarDadesV1/CA_SDN_topo1/")
    simdata.load_data()
    simdata.preprocessor()
    dataset = simdata.get_matrix()

    # One autoencoder layer per item returned by get_structure()
    layers = [get_autoencoder(pair) for pair in get_structure()]
    model = DeepComposedAutoencoder(layers)

    training_alg = SGD(
        learning_rate=1e-3,
        cost=MeanSquaredReconstructionError(),
        batch_size=1296,
        monitoring_dataset=dataset,
        termination_criterion=EpochCounter(max_epochs=50),
    )
    extensions = [MonitorBasedLRAdjuster()]

    experiment = Train(
        dataset=dataset,
        model=model,
        algorithm=training_alg,
        save_path='training2.pkl',
        save_freq=10,
        allow_overwrite=True,
        extensions=extensions,
    )
    experiment.main_loop()
def get_trainer(model, trainset, validset, epochs=20, batch_size=200):
    """
    Build a Train object that fits `model` on `trainset` using SGD with a
    Dropout cost.

    Parameters
    ----------
    model : object
        Passed to Train as the model.
    trainset : object
        Training dataset.
    validset : object or None
        Monitoring dataset; when None, `monitoring_batches` is None too,
        otherwise 20 monitoring batches are requested.
    epochs : int, optional
        Argument of the EpochCounter termination criterion; the momentum
        adjustor saturates at int(epochs*0.8).
    batch_size : int, optional
        SGD batch size.

    Returns
    -------
    Train
    """
    if validset is None:
        monitoring_batches = None
    else:
        monitoring_batches = 20

    # Inclusion probability per layer; each scale is its reciprocal
    include_probs = {'h0': 0.8, 'h1': 0.8, 'h2': 0.8, 'h3': 0.8, 'y': 0.5}
    input_scales = dict((name, 1. / prob)
                        for name, prob in include_probs.items())
    dropout_cost = Dropout(input_include_probs=include_probs,
                           input_scales=input_scales,
                           default_input_include_prob=0.5,
                           default_input_scale=1. / 0.5)

    stop_after = EpochCounter(epochs)
    lr_decay = ExponentialDecay(decay_factor=1.0001, min_lr=0.001)

    train_algo = SGD(batch_size=batch_size,
                     init_momentum=0.5,
                     learning_rate=0.1,
                     monitoring_batches=monitoring_batches,
                     monitoring_dataset=validset,
                     cost=dropout_cost,
                     termination_criterion=stop_after,
                     update_callbacks=lr_decay)

    momentum_schedule = MomentumAdjustor(final_momentum=0.9,
                                         start=0,
                                         saturate=int(epochs*0.8))
    return Train(model=model,
                 algorithm=train_algo,
                 dataset=trainset,
                 save_freq=0,
                 save_path='epoch',
                 extensions=[momentum_schedule])
def get_train_sgd(self): cost = MethodCost('cost_from_X') #cost = self.get_costs() num_train_batch = (self.ntrain/self.batch_size) print "num training batches:", num_train_batch termination_criterion = self.get_terminations() monitoring_dataset = {} for dataset_id in self.state.monitoring_dataset: if dataset_id == 'test' and self.test_ddm is not None: monitoring_dataset['test'] = self.test_ddm elif dataset_id == 'valid' and self.valid_ddm is not None: monitoring_dataset['valid'] = self.valid_ddm else: monitoring_dataset = None return SGD( learning_rate=self.state.learning_rate, batch_size=self.state.batch_size, cost=cost, batches_per_iter=num_train_batch, monitoring_dataset=monitoring_dataset, termination_criterion=termination_criterion, init_momentum=self.state.init_momentum, train_iteration_mode=self.state.train_iteration_mode)
def __init__(self, runner, model_params, resume=False, resume_data=None,
             s3_data=None, **kwargs):
    """
    Prepare `self.train_obj`, a Train object that fits a model on the
    runner's training inputs with SGD and a mean squared reconstruction
    error cost.

    Parameters
    ----------
    runner : object
        Provides `dp.train_set_x`; also handed to the stat reporter.
    model_params : dict
        Read for 'maxnum_iter', 'learning_rate' -> 'init', 'batch_size'
        and 'save_freq'. When resuming, it is replaced by the value
        returned from `self.resume_model`.
    resume : bool, optional
        Resume a model from `resume_data` instead of creating a new one.
    resume_data : object, optional
        Forwarded to `self.resume_model` and to the stat reporter.
    s3_data : object, optional
        Not used in this method.
    **kwargs
        Not used in this method.
    """
    dataset = create_dense_design_matrix(x=runner.dp.train_set_x)

    if not resume:
        model = self.new_model(model_params, dataset=dataset)
    else:
        model, model_params = self.resume_model(model_params, resume_data)

    termination_criterion = MaxEpochNumber(model_params['maxnum_iter'])

    initial_lr = model_params['learning_rate']['init']
    reconstruction_cost = MeanSquaredReconstructionError()
    algorithm = SGD(learning_rate=initial_lr,
                    monitoring_dataset=dataset,
                    cost=reconstruction_cost,
                    termination_criterion=termination_criterion,
                    batch_size=model_params['batch_size'])

    reporter = AutoEncoderStatReporter(runner,
                                       resume=resume,
                                       resume_data=resume_data,
                                       save_freq=model_params['save_freq'])

    self.train_obj = Train(dataset=dataset,
                           model=model,
                           algorithm=algorithm,
                           extensions=[reporter])
def test_multiple_inputs():
    """
    Build a VectorSpacesDataset with two inputs (features0 and features1)
    plus targets, and train an MLP that takes both inputs for 1 epoch.
    """
    floatX = theano.config.floatX

    composite = CompositeLayer(
        'composite',
        [Linear(10, 'h0', 0.1), Linear(10, 'h1', 0.1)],
        {0: [1], 1: [0]}
    )
    mlp = MLP(
        layers=[FlattenerLayer(composite), Softmax(5, 'softmax', 0.1)],
        input_space=CompositeSpace([VectorSpace(15), VectorSpace(20)]),
        input_source=('features0', 'features1')
    )

    # The dataset stores features1 (20 columns) before features0 (15
    # columns), i.e. in the opposite order to the MLP's input_source.
    features1 = np.random.rand(20, 20).astype(floatX)
    features0 = np.random.rand(20, 15).astype(floatX)
    targets = np.random.rand(20, 5).astype(floatX)
    data_space = CompositeSpace(
        [VectorSpace(20), VectorSpace(15), VectorSpace(5)])
    data_sources = ('features1', 'features0', 'targets')
    dataset = VectorSpacesDataset((features1, features0, targets),
                                  (data_space, data_sources))

    train = Train(dataset, mlp, SGD(0.1, batch_size=5))
    train.algorithm.termination_criterion = EpochCounter(1)
    train.main_loop()
def train_example(dataset=None):
    """
    Train a GaussianBinaryRBM on `dataset` with SGD and an SMD cost.

    Parameters
    ----------
    dataset : object, optional
        Used both as training dataset and as monitoring dataset.
    """
    rbm_options = dict(nvis=1296,
                       nhid=61,
                       irange=0.5,
                       energy_function_class=grbm_type_1(),
                       learn_sigma=True,
                       init_sigma=.4,
                       init_bias_hid=2.,
                       mean_vis=False,
                       sigma_lr_scale=1e-3)
    model = GaussianBinaryRBM(**rbm_options)

    cost = SMD(corruptor=GaussianCorruptor(stdev=0.4))

    stop_criterion = MonitorBased(prop_decrease=0.01, N=1)
    algorithm = SGD(learning_rate=.1,
                    batch_size=5,
                    monitoring_batches=20,
                    monitoring_dataset=dataset,
                    cost=cost,
                    termination_criterion=stop_criterion)

    train = Train(dataset=dataset,
                  model=model,
                  save_path="./experiment/training.pkl",
                  save_freq=10,
                  algorithm=algorithm,
                  extensions=[])
    train.main_loop()
def test_sgd_sup():
    """
    Check that the sgd algorithm can run on a supervised cost.

    This does not test for correctness at all, only that the algorithm
    runs without dying.
    """
    dim = 3
    rng = np.random.RandomState([25, 9, 2012])

    def random_one_hot_dataset(num_examples):
        # Random features with a random one-hot target per example
        X = rng.randn(num_examples, dim)
        idx = rng.randint(0, dim, (num_examples, ))
        Y = np.zeros((num_examples, dim))
        Y[np.arange(num_examples), idx] = 1
        return DenseDesignMatrix(X=X, y=Y)

    dataset = random_one_hot_dataset(10)

    # Including a monitoring dataset lets us test that
    # the monitor works with supervised data
    monitoring_dataset = random_one_hot_dataset(15)

    model = SoftmaxModel(dim)
    learning_rate = 1e-3
    batch_size = 5
    cost = SupervisedDummyCost()

    # Needed so the test actually stops running at some point
    termination_criterion = EpochCounter(5)

    algorithm = SGD(learning_rate, cost,
                    batch_size=batch_size,
                    monitoring_batches=3,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    train = Train(dataset, model, algorithm,
                  save_path=None, save_freq=0, extensions=None)
    train.main_loop()
def train_with_monitoring_datasets(train_dataset,
                                   monitoring_datasets,
                                   model_force_batch_size,
                                   train_iteration_mode,
                                   monitor_iteration_mode):
    """
    Train a SoftmaxModel for two epochs with the given iteration modes.

    `dim`, `learning_rate` and `batch_size` are resolved from the
    surrounding scope.

    Parameters
    ----------
    train_dataset : object
        Dataset handed to Train.
    monitoring_datasets : object
        Passed to SGD as `monitoring_dataset`.
    model_force_batch_size : object
        When truthy, assigned to `model.force_batch_size`.
    train_iteration_mode : object
        Passed to SGD as `train_iteration_mode`.
    monitor_iteration_mode : object
        Passed to SGD as `monitor_iteration_mode`.
    """
    model = SoftmaxModel(dim)
    if model_force_batch_size:
        model.force_batch_size = model_force_batch_size

    cost = DummyCost()
    sgd_options = dict(batch_size=batch_size,
                       train_iteration_mode=train_iteration_mode,
                       monitor_iteration_mode=monitor_iteration_mode,
                       monitoring_dataset=monitoring_datasets,
                       termination_criterion=EpochCounter(2))
    algorithm = SGD(learning_rate, cost, **sgd_options)

    train = Train(train_dataset, model, algorithm,
                  save_path=None, save_freq=0, extensions=None)
    train.main_loop()
def get_layer_trainer_sgd_autoencoder(layer, trainset, batch_size=10,
                                      learning_rate=0.1, max_epochs=100,
                                      name=''):
    """
    Build a Train object that fits `layer` on `trainset` with momentum SGD
    and a mean squared reconstruction error cost.

    Parameters
    ----------
    layer : object
        Passed to Train as the model.
    trainset : object
        Used both as training dataset and as monitoring dataset.
    batch_size : int, optional
        SGD batch size.
    learning_rate : float, optional
        SGD learning rate.
    max_epochs : int, optional
        Limit given to the EpochCounter termination criterion.
    name : str, optional
        Passed to LoggingCallback.

    Returns
    -------
    Train
    """
    # SGD configuration (AdaDelta() was noted as an alternative rule)
    momentum_rule = Momentum(init_momentum=0.5)
    reconstruction_cost = MeanSquaredReconstructionError()
    stop_after = EpochCounter(max_epochs=max_epochs)
    train_algo = SGD(learning_rate=learning_rate,
                     learning_rule=momentum_rule,
                     cost=reconstruction_cost,
                     batch_size=batch_size,
                     monitoring_dataset=trainset,
                     termination_criterion=stop_after,
                     update_callbacks=None)

    log_callback = LoggingCallback(name)
    extensions = [
        log_callback,
        OneOverEpoch(start=1, half_life=5),
        MomentumAdjustor(final_momentum=0.7, start=10, saturate=100),
    ]
    return Train(model=layer,
                 algorithm=train_algo,
                 extensions=extensions,
                 dataset=trainset)
def test_pylearn2_trainin():
    """
    Wrap an MLP and a SquaredError cost with BlocksModel / BlocksCost and
    run Pylearn2's Train loop on random data with a time budget of 3.
    """
    num_features = 784

    # Construct the model
    mlp = MLP(activations=[Sigmoid(), Sigmoid()],
              dims=[num_features, 100, num_features],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(0.01))
    mlp.initialize()

    block_cost = BlocksCost(SquaredError())
    input_spec = (VectorSpace(dim=num_features), 'features')
    block_model = BlocksModel(mlp, input_spec)

    # Load the data: training set first, then validation set, from one RNG
    rng = numpy.random.RandomState(14)
    train_dataset = random_dense_design_matrix(rng, 1024, num_features, 10)
    valid_dataset = random_dense_design_matrix(rng, 1024, num_features, 10)

    # Silence Pylearn2's logger
    logging.getLogger(pylearn2.__name__).setLevel(logging.ERROR)

    # Training algorithm
    sgd = SGD(learning_rate=0.01,
              cost=block_cost,
              batch_size=128,
              monitoring_dataset=valid_dataset)
    train = Train(train_dataset, block_model, algorithm=sgd)
    train.main_loop(time_budget=3)
def model1():
    """
    Train a two-layer MLP (NoisyRELU hidden layer + Softmax output) on
    MNIST with momentum SGD and save it to './mlp_model1.pkl'.

    Training stops according to a MonitorBased criterion on the
    'valid_y_misclass' channel.
    """
    # train set X has dim (60,000, 784), y has dim (60,000, 10)
    train_set = MNIST(which_set='train', one_hot=True)

    # test set X has dim (10,000, 784), y has dim (10,000, 10)
    # NOTE(review): the "validation" set is the MNIST test split -- confirm
    # that monitoring/early stopping on the test data is intended.
    valid_set = MNIST(which_set='test', one_hot=True)
    # Not used in the visible part of this function
    test_set = MNIST(which_set='test', one_hot=True)

    # ----- Create the MLP model -----
    h1_layer = NoisyRELU(layer_name='h1',
                         sparse_init=15,
                         noise_factor=5,
                         dim=1000,
                         desired_active_rate=0.2,
                         bias_factor=20,
                         max_col_norm=1)
    y_layer = Softmax(layer_name='y',
                      n_classes=10,
                      irange=0.,
                      max_col_norm=1)
    mlp = MLP(batch_size=200,
              input_space=VectorSpace(dim=train_set.X.shape[1]),
              layers=[h1_layer, y_layer])

    # ----- Create the SGD algorithm -----
    sgd = SGD(init_momentum=0.1,
              learning_rate=0.01,
              monitoring_dataset={'valid': valid_set},
              cost=MethodCost('cost_from_X'),
              termination_criterion=MonitorBased(
                  channel_name='valid_y_misclass',
                  prop_decrease=0.001,
                  N=50))

    # ----- Extensions -----
    ext = [MomentumAdjustor(start=1, saturate=10, final_momentum=0.9)]

    # ----- Create the training object and run the training -----
    save_path = './mlp_model1.pkl'
    train_obj = Train(dataset=train_set,
                      model=mlp,
                      algorithm=sgd,
                      extensions=ext,
                      save_path=save_path,
                      save_freq=0)
    train_obj.main_loop()
def create_algorithm(self, data, save_best_path=None):
    """
    Set up `self.algorithm` (SGD with batch size 1) together with the cost,
    the save-best extension and, optionally, the plot manager.

    Parameters
    ----------
    data : object
        Forwarded to `self.set_dataset`.
    save_best_path : object, optional
        `save_path` of the MonitorBasedSaveBest stored in `self.mbsb`.
    """
    self.set_dataset(data)
    self.create_adjustors()

    # Epoch limit, optionally combined with validation-based stopping
    term = EpochCounter(max_epochs=self.max_epochs)
    if self.valid_stop:
        cost_crit = MonitorBased(channel_name='valid_objective',
                                 prop_decrease=.0, N=3)
        term = And(criteria=[cost_crit, term])

    # Regularisation coefficients, shaped as (layers, A_weight_decay)
    if self.reg_factors:
        rf = self.reg_factors
        num_hidden = len(self.tagger.hdims)
        num_input_sublayers = len(self.tagger.layers[0].layers)
        per_layer = [[rf] * num_input_sublayers] + ([rf] * num_hidden) + [rf]
        coeffs = (per_layer, rf)
    else:
        coeffs = None

    cost = SeqTaggerCost(coeffs, self.dropout)
    self.cost = cost

    self.mbsb = MonitorBasedSaveBest(channel_name='valid_objective',
                                     save_path=save_best_path)

    # Monitor on a copy so that self.dataset itself keeps its 'train' entry
    mon_dataset = dict(self.dataset)
    if not self.monitor_train:
        del mon_dataset['train']

    if self.use_momentum:
        _learning_rule = self.momentum_rule
    else:
        _learning_rule = None

    self.algorithm = SGD(batch_size=1,
                         learning_rate=self.lr,
                         termination_criterion=term,
                         monitoring_dataset=mon_dataset,
                         cost=cost,
                         learning_rule=_learning_rule)
    self.algorithm.setup(self, self.dataset['train'])

    if not self.plot_monitor:
        return

    channel_names = ["valid_objective", "test_objective"]
    if self.monitor_train:
        channel_names.append("train_objective")
    plots = Plots(channel_names=channel_names, save_path=self.plot_monitor)
    self.pm = PlotManager([plots], freq=1)
    self.pm.setup(self, None, self.algorithm)
def test_adagrad():
    """
    Compare learning_rule.AdaGrad against a hand-written AdaGrad step on a
    dummy model that has one learning rate scaler per parameter.

    Reference: "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.

    `shapes`, `scales` and `learning_rate` are resolved from the
    surrounding scope.
    """
    # A cost other than SumOfParams is included so that data is actually
    # queried from the training set and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Accumulated sum of squared gradients, one array per parameter
    state = dict((param, {'sg2': np.zeros(param.get_value().shape)})
                 for param in model.get_params())

    def adagrad_manual(model, state):
        # The parameter value stands in for its gradient in this reference
        expected = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            pstate['sg2'] += param_val**2
            dx_t = -(scale * learning_rate / np.sqrt(pstate['sg2'])
                     * param_val)
            expected.append(param_val + dx_t)
        return expected

    # Verify two consecutive updates
    for _ in range(2):
        manual = adagrad_manual(model, state)
        sgd.train(dataset=dataset)
        for manual_param, sgd_param in izip(manual, model.get_params()):
            assert np.allclose(manual_param, sgd_param.get_value())
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using
    momentum.
    """
    cost = SumOfParams()

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1, ), (9, ), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            # Initialise the Model base class, as the ModelWithScalers of
            # test_lr_scalers does.
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              init_momentum=momentum,
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    # First step: the increment is the plain scaled gradient step
    inc = [-learning_rate * scale for scale in scales]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    # Second step: gradient step plus momentum times the previous increment
    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in zip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
def test_adadelta():
    """
    Check learning_rule.AdaDelta against a manual AdaDelta implementation,
    using a dummy model with a learning rate scaler for each parameter.

    Reference: "AdaDelta: An Adaptive Learning Rate Method",
    Matthew D. Zeiler.

    `shapes`, `scales` and `learning_rate` come from the surrounding scope.
    """
    # A cost other than SumOfParams is included so that data is actually
    # queried from the training set and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Running averages: 'g2' for squared gradients, 'dx2' for squared updates
    state = dict(
        (param, {'g2': np.zeros(param.get_value().shape),
                 'dx2': np.zeros(param.get_value().shape)})
        for param in model.get_params())

    def adadelta_manual(model, state):
        # The parameter value stands in for its gradient in this reference
        new_values = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            new_values.append(param_val + dx_t)
        return new_values

    def params_match(expected):
        return all(np.allclose(manual_param, sgd_param.get_value())
                   for manual_param, sgd_param
                   in izip(expected, model.get_params()))

    # First update
    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert params_match(manual)

    # Second update
    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert params_match(manual)
def create_algorithm(self):
    """
    Store in `self.algorithm` an SGD instance (batch size 100, learning
    rate .01) monitored on `self.alg_datasets`.

    The termination criterion is an And of a MonitorBased criterion on the
    `self.optimize_for` channel and an EpochCounter limited to
    `self.max_epochs`.
    """
    stop_criteria = [
        MonitorBased(channel_name=self.optimize_for,
                     prop_decrease=0., N=10),
        EpochCounter(max_epochs=self.max_epochs),
    ]
    term = And(criteria=stop_criteria)

    self.algorithm = SGD(
        batch_size=100,
        learning_rate=.01,
        monitoring_dataset=self.alg_datasets,
        termination_criterion=term,
    )
def test_train_supervised():
    """
    Train a supervised GSN.

    `ds`, `rescaled_softmax`, `WALKBACK`, `LEARNING_RATE`, `MOMENTUM`,
    `MAX_EPOCHS`, `BATCHES_PER_EPOCH` and `BATCH_SIZE` are resolved from the
    surrounding scope.
    """
    # Initialize the GSN
    layer_sizes = [ds.X.shape[1], 1000, ds.y.shape[1]]
    pre_corruptors = [GaussianCorruptor(0.5)] * 3
    post_corruptors = [SaltPepperCorruptor(.3), None,
                       SmoothOneHotCorruptor(.5)]
    layer_samplers = [BinomialSampler(), None, MultinomialSampler()]
    gsn = GSN.new(layer_sizes=layer_sizes,
                  activation_funcs=["sigmoid", "tanh", rescaled_softmax],
                  pre_corruptors=pre_corruptors,
                  post_corruptors=post_corruptors,
                  layer_samplers=layer_samplers,
                  tied=False)

    # Each cost is divided by the number of columns it is computed over
    _rcost = MeanBinaryCrossEntropy()

    def reconstruction_cost(a, b):
        return _rcost.cost(a, b) / ds.X.shape[1]

    _ccost = MeanBinaryCrossEntropy()

    def classification_cost(a, b):
        return _ccost.cost(a, b) / ds.y.shape[1]

    # Combine the costs into one GSNCost: reconstruction on layer 0 with
    # weight 1.0, classification on layer 2 with weight 2.0
    weighted_costs = [
        (0, 1.0, reconstruction_cost),
        (2, 2.0, classification_cost),
    ]
    c = GSNCost(weighted_costs, walkback=WALKBACK, mode="supervised")

    alg = SGD(LEARNING_RATE,
              init_momentum=MOMENTUM,
              cost=c,
              termination_criterion=EpochCounter(MAX_EPOCHS),
              batches_per_iter=BATCHES_PER_EPOCH,
              batch_size=BATCH_SIZE,
              monitoring_dataset=ds,
              monitoring_batches=10)

    trainer = Train(ds, gsn,
                    algorithm=alg,
                    save_path="gsn_sup_example.pkl",
                    save_freq=10,
                    extensions=[MonitorBasedLRAdjuster()])
    trainer.main_loop()
    print("done training")
def testing_multiple_datasets_with_specified_dataset_in_monitor_based_lr():
    """
    Tests that the class MonitorBasedLRAdjuster in sgd.py can properly use
    the specified dataset_name in the constructor when multiple datasets
    exist.
    """
    dim = 3
    m = 10

    # Plain decimal seed values: leading-zero literals such as 06 are a
    # syntax error on Python 3 and denote the same numbers on Python 2.
    rng = np.random.RandomState([6, 2, 2014])

    X = rng.randn(m, dim)
    Y = rng.randn(m, dim)

    learning_rate = 1e-2
    batch_size = 5

    # We need to include this so the test actually stops running at some point
    epoch_num = 1

    # including a monitoring datasets lets us test that
    # the monitor works with supervised data
    monitoring_train = DenseDesignMatrix(X=X)
    monitoring_test = DenseDesignMatrix(X=Y)

    cost = DummyCost()

    model = SoftmaxModel(dim)

    dataset = DenseDesignMatrix(X=X)

    termination_criterion = EpochCounter(epoch_num)

    monitoring_dataset = {'train': monitoring_train, 'test': monitoring_test}

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=batch_size,
                    monitoring_batches=2,
                    monitoring_dataset=monitoring_dataset,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    # First key in the dict's iteration order. Unlike keys()[0], this also
    # works where keys() returns a non-indexable view.
    dataset_name = next(iter(monitoring_dataset))
    monitor_lr = MonitorBasedLRAdjuster(dataset_name=dataset_name)

    train = Train(dataset,
                  model,
                  algorithm,
                  save_path=None,
                  save_freq=0,
                  extensions=[monitor_lr])

    train.main_loop()
def test_lr_scalers():
    """
    Verify that SGD honours the scalers returned by Model.get_lr_scalers.
    """
    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    learning_rate = .001

    # A cost other than SumOfParams is included so that data is actually
    # queried from the training set and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def __call__(self, X):
            # Defined only so that DummyCost can be evaluated
            return X

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(.0),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def params_match(expected):
        return all(np.allclose(manual_param, sgd_param.get_value())
                   for manual_param, sgd_param
                   in zip(expected, model.get_params()))

    # First update
    start = [param.get_value() for param in model.get_params()]
    manual = [param - learning_rate * scale
              for param, scale in zip(start, scales)]
    sgd.train(dataset=dataset)
    assert params_match(manual)

    # Second update, continuing from the first expected values
    manual = [param - learning_rate * scale
              for param, scale in zip(manual, scales)]
    sgd.train(dataset=dataset)
    assert params_match(manual)
def set_training_criteria(self, learning_rate=0.05, cost=Default(),
                          batch_size=10, max_epochs=10):
    """
    Store in `self.training_alg` an SGD instance monitored on
    `self.datasets`.

    Parameters
    ----------
    learning_rate : float, optional
        SGD learning rate.
    cost : object, optional
        Cost passed to SGD; the default instance is created once, when the
        function is defined.
    batch_size : int, optional
        SGD batch size.
    max_epochs : int, optional
        Argument of the EpochCounter termination criterion.
    """
    stop_after = EpochCounter(max_epochs)
    sgd_options = dict(learning_rate=learning_rate,
                       cost=cost,
                       batch_size=batch_size,
                       monitoring_dataset=self.datasets,
                       termination_criterion=stop_after)
    self.training_alg = SGD(**sgd_options)
def set_training_criteria(self,
                          learning_rate=0.05,
                          cost=MeanSquaredReconstructionError(),
                          batch_size=10,
                          max_epochs=10):
    """
    Configure `self.training_alg` as an SGD instance that monitors
    `self.datasets` and stops after `max_epochs` epochs are counted.

    Parameters
    ----------
    learning_rate : float, optional
        SGD learning rate.
    cost : object, optional
        Cost passed to SGD; the default instance is created once, when the
        function is defined.
    batch_size : int, optional
        SGD batch size.
    max_epochs : int, optional
        Argument of the EpochCounter termination criterion.
    """
    termination_criterion = EpochCounter(max_epochs)
    self.training_alg = SGD(
        learning_rate=learning_rate,
        cost=cost,
        batch_size=batch_size,
        monitoring_dataset=self.datasets,
        termination_criterion=termination_criterion,
    )
def test_nesterov_momentum():
    """
    Compare learning_rule.Momentum with nesterov_momentum=True against a
    hand-written momentum update on a dummy model that has one learning
    rate scaler per parameter.

    `shapes`, `scales` and `learning_rate` are resolved from the
    surrounding scope.
    """
    # A cost other than SumOfParams is included so that data is actually
    # queried from the training set and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def lookahead_updates(velocity):
        # Gradient step plus momentum times the current velocity
        return [-learning_rate * scale + v * momentum
                for scale, v in izip(scales, velocity)]

    def check_against_sgd(expected):
        for manual_param, sgd_param in izip(expected, model.get_params()):
            assert np.allclose(manual_param, sgd_param.get_value())

    manual = [param.get_value() for param in model.get_params()]

    # First step: the velocity is the plain scaled gradient step
    vel = [-learning_rate * scale for scale in scales]
    manual = [param + update
              for param, update in izip(manual, lookahead_updates(vel))]
    sgd.train(dataset=dataset)
    check_against_sgd(manual)

    # Second step: the velocity accumulates the previous one
    vel = [-learning_rate * scale + i * momentum
           for scale, i in izip(scales, vel)]
    manual = [param + update
              for param, update in izip(manual, lookahead_updates(vel))]
    sgd.train(dataset=dataset)
    check_against_sgd(manual)
def prepare_adagrad_test(dataset_type='arange', model_type='random'):
    """
    Factor out common code for AdaGrad tests.

    Relies on the module-level ``shapes``, ``scales`` and ``learning_rate``.

    Parameters
    ----------
    dataset_type : string, optional
        Can use either `arange` to use an ArangeDataset instance or
        `zeros` to create an all-zeros DenseDesignMatrix.
    model_type : string, optional
        How to initialize the model; `random` will initialize parameters
        to random values, `zeros` to zero.

    Returns
    -------
    tuple
        ``(cost, model, dataset, sgd, state)`` where ``sgd`` has already been
        set up on ``model``/``dataset`` and ``state`` maps every model
        parameter to ``{'sg2': zeros}`` with the parameter's shape.

    Raises
    ------
    ValueError
        If `dataset_type` is neither `arange` nor `zeros`.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales, init_type=model_type)

    if dataset_type == 'arange':
        dataset = ArangeDataset(1)
    elif dataset_type == 'zeros':
        X = np.zeros((1, 1))
        X[:, 0] = np.arange(1)
        dataset = DenseDesignMatrix(X)
    else:
        # Interpolate the value into the message; passing it as a second
        # exception argument would leave the "%s" placeholder unformatted.
        raise ValueError('Unknown value for dataset_type: %s' % dataset_type)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # One accumulator of squared gradients per parameter, starting at zero.
    state = {}
    for param in model.get_params():
        state[param] = {'sg2': np.zeros(param.get_value().shape)}

    return (cost, model, dataset, sgd, state)
def __init__(self, runner, model_params, resume=False, resume_data=None,
             s3_data=None, **kwargs):
    """Assemble datasets, model, SGD algorithm and ``Train`` object.

    Parameters
    ----------
    runner : object
        Supplies ``runner.model['out_nonlin']`` and the data provider
        ``runner.dp`` (train/test sets and ``uniq_outputs_num``).
    model_params : dict
        Reads ``learning_rate`` (``init``, ``decay_factor``), ``momentum``
        (``init``), ``maxnum_iter`` and ``save_freq``; also forwarded to the
        model/update construction helpers.
    resume : bool, optional
        When true the model is restored through ``resume_model`` and the
        initial learning rate is divided by
        ``decay_factor ** batches_seen``.
    resume_data : optional
        Forwarded to ``resume_model``, ``construct_update`` and
        ``MLPStatReporter``.
    s3_data, **kwargs
        Accepted but not used in this method.
    """
    self.model_params = model_params
    self.out_nonlin = runner.model['out_nonlin']

    # A linear-gaussian output gets neither an output count nor a cost.
    linear_gaussian = self.out_nonlin == 'LINEARGAUSSIAN'
    outputs_num = None if linear_gaussian else runner.dp.uniq_outputs_num
    cost = None if linear_gaussian else self.get_cost_fn()

    dataset = self.construct_datasets(
        runner.dp.train_set_x, runner.dp.train_set_y, outputs_num)
    valid_dataset = self.construct_datasets(
        runner.dp.test_set_x, runner.dp.test_set_y, outputs_num)

    if resume:
        model = self.resume_model(model_params, resume_data)
    else:
        model = self.new_model(model_params, dataset=dataset)

    lr_settings = model_params['learning_rate']
    lr_init = lr_settings['init']
    if resume:
        batches_seen = model.monitor.get_batches_seen()
        lr_init = lr_init / (lr_settings['decay_factor'] ** batches_seen)

    batches_per_iter = get_batches_per_iter(model_params, dataset)
    termination_criterion = MaxEpochNumber(model_params['maxnum_iter'])
    update_callbacks, extensions = construct_update(
        model_params, resume, resume_data)

    monitoring = {'valid': valid_dataset, 'train': dataset}
    algorithm = SGD(
        learning_rate=lr_init,
        init_momentum=model_params['momentum']['init'],
        monitoring_dataset=monitoring,
        cost=cost,
        termination_criterion=termination_criterion,
        update_callbacks=update_callbacks,
        batches_per_iter=batches_per_iter)

    self.train_obj = Train(dataset=dataset, model=model,
                           algorithm=algorithm, extensions=extensions)

    # The reporter is appended after the Train object has been built.
    reporter = MLPStatReporter(model, runner, resume=resume,
                               resume_data=resume_data,
                               save_freq=model_params['save_freq'])
    self.train_obj.extensions.append(reporter)
def model2():
    """Build a two-hidden-layer rectifier MLP for MNIST and run its training.

    The network is trained with SGD using a summed cost
    (``cost_from_X`` plus weight decay) and stops according to a
    ``MonitorBased`` criterion on ``valid_y_misclass``.  Note that the
    monitoring channel named ``valid`` is fed the *training* set.
    """
    # Per the original notes: train X is (60,000, 784), y is (60,000, 10);
    # test X is (10,000, 784), y is (10,000, 10).
    train_set = MNIST(which_set='train', one_hot=True)
    test_set = MNIST(which_set='test', one_hot=True)

    # =====<Create the MLP Model>=====
    first_hidden = RectifiedLinear(layer_name='h1', dim=1000, irange=0.5)
    second_hidden = RectifiedLinear(layer_name='h2', dim=1000,
                                    sparse_init=15, max_col_norm=1)
    output = Softmax(layer_name='y', n_classes=train_set.y.shape[1],
                     irange=0.5)
    network = MLP(batch_size=100,
                  input_space=VectorSpace(dim=train_set.X.shape[1]),
                  layers=[first_hidden, second_hidden, output])

    # =====<Create the SGD algorithm>=====
    monitored = {'valid': train_set, 'test': test_set}
    total_cost = SumOfCosts(costs=[
        MethodCost('cost_from_X'),
        WeightDecay(coeffs=[0.00005, 0.00005, 0.00005]),
    ])
    stop_rule = MonitorBased(channel_name='valid_y_misclass',
                             prop_decrease=0.0001, N=5)
    sgd = SGD(batch_size=100,
              init_momentum=0.1,
              learning_rate=0.01,
              monitoring_dataset=monitored,
              cost=total_cost,
              termination_criterion=stop_rule)

    # =====<Extensions>=====
    extensions = [MomentumAdjustor(start=1, saturate=10, final_momentum=0.99)]

    # =====<Create Training Object>=====
    Train(dataset=train_set,
          model=network,
          algorithm=sgd,
          extensions=extensions,
          save_path='./mlp_model2.pkl',
          save_freq=0).main_loop()
def run_algorithm():
    """Train once with the enclosing ``mode`` and report whether it failed.

    Uses ``learning_rate``, ``cost``, ``mode``, ``termination_criterion``,
    ``dataset`` and ``model`` from the enclosing scope.

    Returns
    -------
    bool
        True when ``mode`` is one of the unsupported iteration modes and
        training raised ``ValueError``; False when a supported mode trained
        without error.  Any other combination fails an assertion.
    """
    expect_failure = mode in ('random_slice', 'random_uniform')

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=5,
                    train_iteration_mode=mode,
                    monitoring_dataset=None,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)
    algorithm.setup(dataset=dataset, model=model)

    try:
        algorithm.train(dataset)
    except ValueError:
        # Only the unsupported modes are allowed to raise.
        print(mode)
        assert expect_failure
        return True

    # Training succeeded: that is only acceptable for a supported mode.
    assert not expect_failure
    return False
def test_adagrad():
    """
    Check learning_rule.AdaGrad against a hand-written AdaGrad update, on a
    dummy model that has one learning rate scaler per parameter.

    Relies on the module-level ``shapes``, ``scales`` and ``learning_rate``.

    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """
    # The zero-weighted DummyCost is added so that data is actually queried
    # from the training set and the expected number of updates is applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost, learning_rate=learning_rate,
              learning_rule=AdaGrad(), batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    # Per-parameter accumulator of squared values, starting at zero.
    state = dict((param, {'sg2': np.zeros(param.get_value().shape)})
                 for param in model.get_params())

    def adagrad_manual(model, state):
        expected = []
        for scale, param in izip(scales, model.get_params()):
            accumulators = state[param]
            value = param.get_value()
            # AdaGrad step: accumulate the square, then scale the update by
            # the inverse root of the accumulator.
            accumulators['sg2'] += value ** 2
            step = - (scale * learning_rate
                      / np.sqrt(accumulators['sg2']) * value)
            expected.append(value + step)
        return expected

    def check_against(expected):
        for want, shared in izip(expected, model.get_params()):
            assert np.allclose(want, shared.get_value())

    # Two consecutive updates, each compared with the manual computation.
    for _ in (0, 1):
        manual = adagrad_manual(model, state)
        sgd.train(dataset=dataset)
        check_against(manual)
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using momentum.
    """
    cost = SumOfParams()
    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    learning_rate = .001

    class ModelWithScalers(Model):
        """Zero-initialised parameters, one learning rate scaler each."""

        def __init__(self):
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def get_lr_scalers(self):
            return dict((param, scale)
                        for param, scale in zip(self._params, scales))

    model = ModelWithScalers()
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost, learning_rate=learning_rate,
              init_momentum=momentum, batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def check_against(expected):
        for want, shared in zip(expected, model.get_params()):
            assert np.allclose(want, shared.get_value())

    manual = [p.get_value() for p in model.get_params()]

    # First update: plain scaled step (there is no velocity yet).
    inc = [-learning_rate * scale for _, scale in zip(manual, scales)]
    manual = [value + step for value, step in zip(manual, inc)]
    sgd.train(dataset=dataset)
    check_against(manual)

    # Second update: scaled step plus momentum times the previous increment.
    manual = [value - learning_rate * scale + step * momentum
              for value, scale, step in zip(manual, scales, inc)]
    sgd.train(dataset=dataset)
    check_against(manual)
def test_nesterov_momentum():
    """
    Check learning_rule.Momentum with ``nesterov_momentum=True`` against a
    hand-written Nesterov momentum update, on a dummy model that has one
    learning rate scaler per parameter.

    Relies on the module-level ``shapes``, ``scales`` and ``learning_rate``.
    """
    # The zero-weighted DummyCost is added to SumOfParams so that data is
    # actually queried from the training set and the expected number of
    # updates is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5
    rule = Momentum(momentum, nesterov_momentum=True)
    sgd = SGD(cost=cost, learning_rate=learning_rate, learning_rule=rule,
              batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def momentum_step(velocities):
        # -lr * scale + momentum * v, evaluated per parameter.
        return [-learning_rate * scale + v * momentum
                for scale, v in izip(scales, velocities)]

    def check_against(expected):
        for want, shared in izip(expected, model.get_params()):
            assert np.allclose(want, shared.get_value())

    manual = [p.get_value() for p in model.get_params()]

    # First update: the velocity starts from the plain scaled gradient step.
    vel = [-learning_rate * scale for scale in scales]
    manual = [value + delta
              for value, delta in izip(manual, momentum_step(vel))]
    sgd.train(dataset=dataset)
    check_against(manual)

    # Second update: advance the velocity, then apply the look-ahead step.
    vel = momentum_step(vel)
    manual = [value + delta
              for value, delta in izip(manual, momentum_step(vel))]
    sgd.train(dataset=dataset)
    check_against(manual)
def test_momentum():
    """
    Check learning_rule.Momentum against a hand-written SGD-with-momentum
    update, on a dummy model that has one learning rate scaler per
    parameter.
    """
    # The zero-weighted DummyCost is added to SumOfParams so that data is
    # actually queried from the training set and the expected number of
    # updates is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost, learning_rate=learning_rate,
              learning_rule=Momentum(momentum), batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def check_against(expected):
        for want, shared in izip(expected, model.get_params()):
            assert np.allclose(want, shared.get_value())

    manual = [p.get_value() for p in model.get_params()]

    # First update: plain scaled step (there is no velocity yet).
    inc = [-learning_rate * scale for scale in scales]
    manual = [value + step for value, step in izip(manual, inc)]
    sgd.train(dataset=dataset)
    check_against(manual)

    # Second update: scaled step plus momentum times the previous increment.
    manual = [value - learning_rate * scale + step * momentum
              for value, scale, step in izip(manual, scales, inc)]
    sgd.train(dataset=dataset)
    check_against(manual)
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using momentum.
    """
    # The zero-weighted DummyCost is added to SumOfParams so that data is
    # actually queried from the training set and the expected number of
    # updates is applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost, learning_rate=learning_rate,
              init_momentum=momentum, batch_size=1)
    sgd.setup(model=model, dataset=dataset)

    def check_against(expected):
        for want, shared in zip(expected, model.get_params()):
            assert np.allclose(want, shared.get_value())

    manual = [p.get_value() for p in model.get_params()]

    # First update: plain scaled step (there is no velocity yet).
    inc = [-learning_rate * scale for _, scale in zip(manual, scales)]
    manual = [value + step for value, step in zip(manual, inc)]
    sgd.train(dataset=dataset)
    check_against(manual)

    # Second update: scaled step plus momentum times the previous increment.
    manual = [value - learning_rate * scale + step * momentum
              for value, scale, step in zip(manual, scales, inc)]
    sgd.train(dataset=dataset)
    check_against(manual)
def create_algorithm(self, data, save_best_path=None):
    """Prepare cost, monitoring and the SGD algorithm for training on `data`.

    Parameters
    ----------
    data : dict
        Datasets keyed by split name; the ``'train'`` entry is used for
        setup, and the whole mapping (optionally without ``'train'``) is
        monitored.
    save_best_path : str, optional
        Passed to ``MonitorBasedSaveBest`` as ``save_path``.

    Sets ``self.cost``, ``self.mbsb``, ``self.algorithm`` and, when
    ``self.plot_monitor`` is truthy, ``self.pm``.
    """
    self.set_dataset(data)
    self.create_adjustors()

    # Stop after max_epochs; with valid_stop the epoch counter is combined
    # (via And) with a monitor-based criterion on the validation objective.
    stop_rule = EpochCounter(max_epochs=self.max_epochs)
    if self.valid_stop:
        plateau = MonitorBased(channel_name='valid_objective',
                               prop_decrease=.0, N=3)
        stop_rule = And(criteria=[plateau, stop_rule])

    # Regularisation coefficients are shaped as (layers, A_weight_decay).
    if self.reg_factors:
        factor = self.reg_factors
        hidden_count = len(self.tagger.hdims)
        input_sublayers = len(self.tagger.layers[0].layers)
        layer_coeffs = ([[factor] * input_sublayers]
                        + [factor] * hidden_count
                        + [factor])
        coeffs = (layer_coeffs, factor)
    else:
        coeffs = None

    cost = SeqTaggerCost(coeffs, self.dropout)
    self.cost = cost

    self.mbsb = MonitorBasedSaveBest(channel_name='valid_objective',
                                     save_path=save_best_path)

    # Monitor a copy of the dataset mapping so 'train' can be dropped
    # without touching self.dataset.
    mon_dataset = dict(self.dataset)
    if not self.monitor_train:
        mon_dataset.pop('train')

    if self.use_momentum:
        _learning_rule = self.momentum_rule
    else:
        _learning_rule = None

    self.algorithm = SGD(batch_size=1,
                         learning_rate=self.lr,
                         termination_criterion=stop_rule,
                         monitoring_dataset=mon_dataset,
                         cost=cost,
                         learning_rule=_learning_rule)
    self.algorithm.setup(self, self.dataset['train'])

    if self.plot_monitor:
        channels = ["valid_objective", "test_objective"]
        if self.monitor_train:
            channels.append("train_objective")
        plots = Plots(channel_names=channels, save_path=self.plot_monitor)
        self.pm = PlotManager([plots], freq=1)
        self.pm.setup(self, None, self.algorithm)
def create_algorithm(mlp, train_set):
    """Create an SGD algorithm with a fixed RNG and set it up on `mlp`.

    Parameters
    ----------
    mlp : object
        Model handed to ``SGD.setup``.
    train_set : object
        Dataset handed to ``SGD.setup``.

    Returns
    -------
    SGD
        The algorithm (batch size 20, learning rate 0.1), already set up.
    """
    import zlib

    # The seed must be identical on every run.  The built-in hash() of a
    # string does not guarantee that: Python 3 salts it per process unless
    # PYTHONHASHSEED is set.  CRC32 of the same tag is stable, and masking
    # keeps it inside the 32-bit range accepted as a RandomState seed.
    seed = zlib.crc32(b'tobipuma') & 0xffffffff

    algorithm = SGD(batch_size=20, learning_rate=0.1)
    # Replace the RNG before setup() so the results are always the same.
    algorithm.rng = RandomState(seed)
    algorithm.setup(mlp, train_set)
    return algorithm
def runDeepLearning2():
    """Load the emotion dataset, build a conv + MLP + softmax net and train it.

    The data directory is hard-coded.  Training runs SGD epoch by epoch,
    reporting to the model's monitor after each one, until the algorithm's
    termination criterion (``EpochCounter(400)``) says to stop.
    """
    # Load the training data (split into training and validation parts)
    # and the test data.
    myDataset = Dataset("/home/Stephen/Desktop/Bird/DLearn/Data/Emotion_small/")
    preprocess = 0
    datasets = myDataset.loadTrain(preprocessFLAG=preprocess, flipFLAG=3)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    dataset_test = myDataset.loadTest(preprocess)
    # test_set_y_array carries the ground truth separately: per the original
    # author, test_set_y becomes a TensorVariable after T.cast and its value
    # could not be read back.  Only the training split is used below.
    test_set_x, test_set_y, test_set_y_array = dataset_test[0]

    # conv layer
    layer0 = ConvRectifiedLinear(
        layer_name="h2",
        output_channels=64,
        irange=0.05,
        kernel_shape=[8, 8],
        pool_shape=[4, 4],
        pool_stride=[2, 2],
        max_kernel_norm=0.9,
    )
    # mlp
    layer2 = RectifiedLinear(layer_name="h1", dim=1000, sparse_init=15)
    # softmax
    layer3 = Softmax(max_col_norm=1.9365, layer_name="y", n_classes=7,
                     istdev=0.05)

    ds = Dataset2(train_set_x, train_set_y)
    layers = [layer0, layer2, layer3]
    ann = mlp.MLP(layers, nvis=3)

    t_algo = SGD(learning_rate=1e-1,
                 batch_size=500,
                 termination_criterion=EpochCounter(400))
    t_algo.setup(ann, ds)

    # Drive the loop with the algorithm created above.  The loop used to
    # reference ``trainer``, a name this function never defines.
    while True:
        t_algo.train(dataset=ds)
        ann.monitor.report_epoch()
        ann.monitor()
        if not t_algo.continue_learning(ann):
            break
) """ # softmax layer3 = Softmax( max_col_norm = 1.9365, layer_name = 'y', n_classes = 7, istdev = .05 ) layers = [layer0, layer1, layer3] #layers = [layer0, layer2, layer3] ann = MLP(layers, input_space=ishape) t_algo = SGD(learning_rate = 1e-1, batch_size = 100, batches_per_iter = 1, termination_criterion=EpochCounter(2) ) ds = DataPylearn2([train_set_x,train_set_y],[48,48,1],7) t_algo.setup(ann, ds) while True: t_algo.train(dataset=ds) ann.monitor.report_epoch() ann.monitor() if not t_algo.continue_learning(ann): break # test: https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/scripts/icml_2013_wrepl/emotions/make_submission.py ds2 = DataPylearn2([test_set_x,test_set_y],[48,48,1],-1)
from pylearn2.termination_criteria import EpochCounter
import theano
import numpy as np

# Synthetic regression problem: the target is the product of the two input
# features plus a little gaussian noise.
n = 200
p = 2
X = np.random.normal(0, 1, (n, p))
y = (X[:, 0] * X[:, 1] + np.random.normal(0, .1, n)).reshape((n, 1))

ds = DenseDesignMatrix(X=X, y=y)

# One sigmoid hidden layer followed by a linear output unit.
hidden_layer = Sigmoid(layer_name='hidden', dim=10, irange=.1, init_bias=1.)
output_layer = Linear(dim=1, layer_name='y', irange=.1)
trainer = SGD(learning_rate=.05, batch_size=10,
              termination_criterion=EpochCounter(200))
layers = [hidden_layer, output_layer]
ann = MLP(layers, nvis=2)
trainer.setup(ann, ds)

# Train one epoch at a time and report to the monitor after each, until
# the termination criterion says to stop.
keep_training = True
while keep_training:
    trainer.train(dataset=ds)
    ann.monitor.report_epoch()
    ann.monitor()
    keep_training = bool(trainer.continue_learning(ann))

# Forward-propagate the training inputs through the trained network.
inputs = X
y_est = ann.fprop(theano.shared(inputs, name='inputs')).eval()

print(y_est.shape)
class SequenceTaggerNetwork(Model):
    """Sequence tagger: a window-based tagger network plus the matrix ``A``.

    ``A`` has shape ``(n_classes + 2, n_classes)``.  Its first two rows are
    used as start and end scores and the remaining rows as transition scores
    when decoding with ``viterbi`` (see ``prepare_tagging``/``tag_sen``).
    """

    def __init__(self, dataset, w2i, t2i, featurizer,
                 edim=None, hdims=None, fedim=None,
                 max_epochs=100, use_momentum=False, lr=.01, lr_lin_decay=None,
                 lr_scale=False, lr_monitor_decay=False,
                 valid_stop=False, reg_factors=None, dropout=False,
                 dropout_params=None, embedding_init=None,
                 embedded_model=None, monitor_train=True, plot_monitor=None,
                 num=False):
        """Store the configuration, build the tagger and initialise ``A``.

        Sizes (``vocab_size``, ``window_size``, ``total_feats``,
        ``feat_num``, ``n_classes``) are read from `dataset`.  `edim`,
        `hdims` and `fedim` default to 50, ``[100]`` and 5.  When
        `embedding_init` is given, the embedding weights are loaded from it.
        `embedded_model` is accepted but not used here.
        """
        super(SequenceTaggerNetwork, self).__init__()
        self.vocab_size = dataset.vocab_size
        self.window_size = dataset.window_size
        self.total_feats = dataset.total_feats
        self.feat_num = dataset.feat_num
        self.n_classes = dataset.n_classes
        self.max_epochs = max_epochs
        if edim is None:
            edim = 50
        if hdims is None:
            hdims = [100]
        if fedim is None:
            fedim = 5
        self.edim = edim
        self.fedim = fedim
        self.hdims = hdims

        self.w2i = w2i
        self.t2i = t2i
        self.featurizer = featurizer

        self._create_tagger()

        A_value = numpy.random.uniform(low=-.1, high=.1,
                                       size=(self.n_classes + 2,
                                             self.n_classes))
        self.A = sharedX(A_value, name='A')
        self.use_momentum = use_momentum
        self.lr = lr
        self.lr_lin_decay = lr_lin_decay
        self.lr_monitor_decay = lr_monitor_decay
        self.lr_scale = lr_scale
        self.valid_stop = valid_stop
        self.reg_factors = reg_factors
        self.close_cache = {}
        self.dropout_params = dropout_params
        # Supplying dropout_params switches dropout on implicitly.
        self.dropout = dropout or self.dropout_params is not None
        self.monitor_train = monitor_train
        self.num = num
        self.plot_monitor = plot_monitor
        if embedding_init is not None:
            self.set_embedding_weights(embedding_init)

    def _create_tagger(self):
        """Instantiate the underlying ``WordTaggerNetwork``."""
        self.tagger = WordTaggerNetwork(
            self.vocab_size, self.window_size, self.total_feats,
            self.feat_num, self.hdims, self.edim, self.fedim, self.n_classes)

    def _create_data_specs(self, dataset):
        """Derive input/output spaces and sources from ``dataset.data_specs``.

        The last component/source is the target; everything before it is
        input.
        """
        self.input_space = CompositeSpace([
            dataset.data_specs[0].components[i]
            for i in xrange(len(dataset.data_specs[0].components) - 1)])
        self.output_space = dataset.data_specs[0].components[-1]

        self.input_source = dataset.data_specs[1][:-1]
        self.target_source = dataset.data_specs[1][-1]

    def __getstate__(self):
        """Return the explicit subset of attributes that gets pickled."""
        d = {}
        d['vocab_size'] = self.vocab_size
        d['window_size'] = self.window_size
        d['feat_num'] = self.feat_num
        d['total_feats'] = self.total_feats
        d['n_classes'] = self.n_classes
        d['input_space'] = self.input_space
        d['output_space'] = self.output_space
        d['input_source'] = self.input_source
        d['target_source'] = self.target_source
        d['A'] = self.A
        d['tagger'] = self.tagger
        d['w2i'] = self.w2i
        d['t2i'] = self.t2i
        d['featurizer'] = self.featurizer
        d['max_epochs'] = self.max_epochs
        d['use_momentum'] = self.use_momentum
        d['lr'] = self.lr
        d['lr_lin_decay'] = self.lr_lin_decay
        d['lr_monitor_decay'] = self.lr_monitor_decay
        d['lr_scale'] = self.lr_scale
        d['valid_stop'] = self.valid_stop
        d['reg_factors'] = self.reg_factors
        d['dropout'] = self.dropout
        d['dropout_params'] = self.dropout_params
        d['monitor_train'] = self.monitor_train
        d['num'] = self.num
        d['plot_monitor'] = self.plot_monitor
        return d

    def fprop(self, data):
        """Return ``A`` stacked on top of the tagger's output for `data`."""
        tagger_out = self.tagger.fprop(data)
        probs = T.concatenate([self.A, tagger_out])
        return probs

    def dropout_fprop(self, data, default_input_include_prob=0.5,
                      input_include_probs=None, default_input_scale=2.0,
                      input_scales=None, per_example=True):
        """Dropout variant of ``fprop``.

        When neither dict is given, the ``'input'`` layer is kept with
        probability 1 and scale 1.  If ``self.dropout_params`` has one entry
        per tagger layer after the first, its last entry configures
        ``'tagger_out'`` and the preceding entries configure ``'h0'``,
        ``'h1'``, ... (scale is the inverse of the include probability).
        Caller-supplied dicts are updated in place.
        """
        if input_scales is None:
            input_scales = {'input': 1.0}
        if input_include_probs is None:
            input_include_probs = {'input': 1.0}
        if self.dropout_params is not None:
            if len(self.dropout_params) == len(self.tagger.layers) - 1:
                input_include_probs['tagger_out'] = self.dropout_params[-1]
                input_scales['tagger_out'] = 1.0/self.dropout_params[-1]
                for i, p in enumerate(self.dropout_params[:-1]):
                    input_include_probs['h{0}'.format(i)] = p
                    input_scales['h{0}'.format(i)] = 1.0/p
        tagger_out = self.tagger.dropout_fprop(
            data, default_input_include_prob, input_include_probs,
            default_input_scale, input_scales, per_example)
        probs = T.concatenate([self.A, tagger_out])
        return probs

    @functools.wraps(Model.get_lr_scalers)
    def get_lr_scalers(self):
        # No scaling unless lr_scale is enabled; otherwise the tagger's
        # scalers are used and A is scaled by 1 / n_classes.
        if not self.lr_scale:
            return {}
        d = self.tagger.get_lr_scalers()
        d[self.A] = 1. / self.n_classes
        return d

    @functools.wraps(Model.get_params)
    def get_params(self):
        return self.tagger.get_params() + [self.A]

    def create_adjustors(self):
        """Create the momentum rule/adjustor and, if configured, an LR adjustor.

        ``learning_rate_adjustor`` is only set when ``lr_monitor_decay`` or
        ``lr_lin_decay`` is truthy (the former takes precedence).
        """
        initial_momentum = .5
        final_momentum = .99
        start = 1
        saturate = self.max_epochs
        self.momentum_adjustor = learning_rule.MomentumAdjustor(
            final_momentum, start, saturate)
        self.momentum_rule = learning_rule.Momentum(initial_momentum,
                                                    nesterov_momentum=True)

        if self.lr_monitor_decay:
            self.learning_rate_adjustor = MonitorBasedLRAdjuster(
                high_trigger=1., shrink_amt=0.9,
                low_trigger=.95, grow_amt=1.1, channel_name='train_objective')
        elif self.lr_lin_decay:
            self.learning_rate_adjustor = LinearDecayOverEpoch(
                start, saturate, self.lr_lin_decay)

    def compute_used_inputs(self):
        """Record which word/feature indices never occur in the training set.

        The result is stored in ``self.notseen`` as sorted index arrays under
        the keys ``'words'`` and ``'feats'``.
        """
        seen = {'words': set(), 'feats': set()}
        for sen_w in self.dataset['train'].X1:
            seen['words'] |= reduce(
                lambda x, y: set(x) | set(y),
                sen_w,
                set())
        for sen_f in self.dataset['train'].X2:
            seen['feats'] |= reduce(
                lambda x, y: set(x) | set(y),
                sen_f,
                set())
        words = set(xrange(len(self.w2i)))
        feats = set(xrange(self.total_feats))
        self.notseen = {
            'words': numpy.array(sorted(words - seen['words'])),
            'feats': numpy.array(sorted(feats - seen['feats']))
        }

    def set_dataset(self, data):
        """Attach `data` (dict of splits) and refresh the derived state."""
        self._create_data_specs(data['train'])
        self.dataset = data
        self.compute_used_inputs()
        self.tagger.notseen = self.notseen

    def create_algorithm(self, data, save_best_path=None):
        """Prepare cost, monitoring and the SGD algorithm for `data`.

        Sets ``self.cost``, ``self.mbsb``, ``self.algorithm`` and, when
        ``self.plot_monitor`` is truthy, ``self.pm``.
        """
        self.set_dataset(data)
        self.create_adjustors()
        term = EpochCounter(max_epochs=self.max_epochs)
        if self.valid_stop:
            cost_crit = MonitorBased(channel_name='valid_objective',
                                     prop_decrease=.0, N=3)
            term = And(criteria=[cost_crit, term])

        # (layers, A_weight_decay)
        coeffs = None
        if self.reg_factors:
            rf = self.reg_factors
            lhdims = len(self.tagger.hdims)
            l_inputlayer = len(self.tagger.layers[0].layers)
            coeffs = ([[rf] * l_inputlayer] + ([rf] * lhdims) + [rf], rf)
        cost = SeqTaggerCost(coeffs, self.dropout)
        self.cost = cost

        self.mbsb = MonitorBasedSaveBest(channel_name='valid_objective',
                                         save_path=save_best_path)
        # Copy so that dropping 'train' does not affect self.dataset.
        mon_dataset = dict(self.dataset)
        if not self.monitor_train:
            del mon_dataset['train']

        _learning_rule = (self.momentum_rule if self.use_momentum else None)
        self.algorithm = SGD(batch_size=1, learning_rate=self.lr,
                             termination_criterion=term,
                             monitoring_dataset=mon_dataset,
                             cost=cost,
                             learning_rule=_learning_rule,
                             )

        self.algorithm.setup(self, self.dataset['train'])
        if self.plot_monitor:
            cn = ["valid_objective", "test_objective"]
            if self.monitor_train:
                cn.append("train_objective")
            plots = Plots(channel_names=cn, save_path=self.plot_monitor)
            self.pm = PlotManager([plots], freq=1)
            self.pm.setup(self, None, self.algorithm)

    def train(self):
        """Run training epochs until the termination criterion is met.

        After every epoch the monitor is updated and the save-best,
        momentum, learning-rate and plotting hooks are invoked when present.
        """
        while True:
            if not self.algorithm.continue_learning(self):
                break
            self.algorithm.train(dataset=self.dataset['train'])
            self.monitor.report_epoch()
            self.monitor()
            self.mbsb.on_monitor(self, self.dataset['valid'], self.algorithm)
            if self.use_momentum:
                self.momentum_adjustor.on_monitor(self, self.dataset['valid'],
                                                  self.algorithm)
            if hasattr(self, 'learning_rate_adjustor'):
                self.learning_rate_adjustor.on_monitor(
                    self, self.dataset['valid'], self.algorithm)
            if hasattr(self, 'pm'):
                self.pm.on_monitor(
                    self, self.dataset['valid'], self.algorithm)

    def prepare_tagging(self):
        """Compile the forward function and cache the rows of ``A``."""
        X = self.get_input_space().make_theano_batch(batch_size=1)
        Y = self.fprop(X)
        self.f = theano.function([X[0], X[1]], Y)
        self.start = self.A.get_value()[0]
        self.end = self.A.get_value()[1]
        self.A_value = self.A.get_value()[2:]

    def process_input(self, words, feats):
        """Run the compiled forward function on one sentence."""
        return self.f(words, feats)

    def tag_sen(self, words, feats, debug=False, return_probs=False):
        """Tag one sentence with viterbi decoding.

        Returns row-normalised scores when `return_probs` is set; otherwise
        a column array of tags, paired with the raw tagger output when
        `debug` is set.
        """
        if not hasattr(self, 'f'):
            self.prepare_tagging()
        y = self.process_input(words, feats)
        # Skip the rows contributed by A (n_classes + 2 of them).
        tagger_out = y[2 + self.n_classes:]
        res = viterbi(self.start, self.A_value, self.end, tagger_out,
                      self.n_classes, return_probs)
        if return_probs:
            return res / res.sum(axis=1)[:, numpy.newaxis]
        if debug:
            return numpy.array([[e] for e in res[1]]), tagger_out
        return numpy.array([[e] for e in res[1]])

    def get_score(self, dataset, mode='pwp'):
        """Score the tagger on `dataset`.

        ``mode='pwp'`` returns ``[accuracy]`` over all tokens; ``mode='f1'``
        returns whatever ``FScCounter.count_score`` yields.  Any other mode
        returns None.
        """
        self.prepare_tagging()
        tagged = (self.tag_sen(w, f) for w, f in
                  izip(dataset.X1, dataset.X2))
        gold = dataset.y
        good, bad = 0., 0.
        if mode == 'pwp':
            for t, g in izip(tagged, gold):
                g = g.argmax(axis=1)
                t = t.flatten()
                good += sum(t == g)
                bad += sum(t != g)
            return [good / (good + bad)]
        elif mode == 'f1':
            i2t = [t for t, i in sorted(self.t2i.items(), key=lambda x: x[1])]
            f1c = FScCounter(i2t, binary_input=False)
            gold = map(lambda x: x.argmax(axis=1), gold)
            tagged = map(lambda x: x.flatten(), tagged)
            return f1c.count_score(gold, tagged)

    def set_embedding_weights(self, embedding_init):
        """Initialise the word embedding layer from a pre-trained model.

        The file is tried as word2vec text, word2vec binary, a saved gensim
        model and finally as a GloVe-style text file (optionally gzipped).

        Raises
        ------
        Exception
            If the loaded embedding dimension differs from ``self.edim``.
        """
        # load embedding with gensim
        from gensim.models import Word2Vec
        try:
            m = Word2Vec.load_word2vec_format(embedding_init, binary=False)
            edim = m.layer1_size
        except UnicodeDecodeError:
            try:
                m = Word2Vec.load_word2vec_format(embedding_init, binary=True)
                edim = m.layer1_size
            except UnicodeDecodeError:
                # not in word2vec format
                m = Word2Vec.load(embedding_init)
                edim = m.layer1_size
        except ValueError:
            # glove model
            m = {}
            edim = None
            if embedding_init.endswith('gz'):
                fp = gzip.open(embedding_init)
            else:
                fp = open(embedding_init)
            # Close the file even if a line fails to parse.
            try:
                for l in fp:
                    le = l.split()
                    m[le[0].decode('utf-8')] = numpy.array(
                        [float(x) for x in le[1:]],
                        dtype=theano.config.floatX)
                    edim = len(le) - 1
            finally:
                fp.close()

        if edim != self.edim:
            raise Exception("Embedding dim and edim doesn't match")
        m_lower = {}
        vocab = (m.vocab if hasattr(m, 'vocab') else m)
        for k in vocab:
            if k in ['UNKNOWN', 'PADDING']:
                continue
            if self.num:
                m_lower[replace_numerals(k.lower())] = m[k]
            else:
                m_lower[k.lower()] = m[k]
        # transform weight matrix with using self.w2i
        params = numpy.zeros(
            self.tagger.layers[0].layers[0].get_param_vector().shape,
            dtype=theano.config.floatX)
        e = self.edim
        for w in self.w2i:
            if w in m_lower:
                v = m_lower[w]
                i = self.w2i[w]
                params[i*e:(i+1)*e] = v
        # Read the vectors through m[...], like the loop above: for gensim
        # models ``vocab`` holds vocabulary entries, not the vectors.
        if 'UNKNOWN' in vocab:
            params[-1*e:] = m['UNKNOWN']
        if 'PADDING' in vocab:
            params[-2*e:-1*e] = m['PADDING']
        self.tagger.layers[0].layers[0].set_param_vector(params)