def AddTrainingOperators(model, mbox_layers, gt_label):
    MultiboxInput = mbox_layers
    # MultiboxInput.append(gt_label)
    mbox_loc_cpu = model.net.CopyGPUToCPU(mbox_layers[0], 'mbox_loc_cpu')
    mbox_conf_cpu = model.net.CopyGPUToCPU(mbox_layers[1], 'mbox_conf_cpu')
    loc_pred, loc_gt, conf_pred, conf_gt = model.net.MultiboxLoss(
        [mbox_loc_cpu, mbox_conf_cpu, mbox_layers[2], gt_label],
        ['loc_pred', 'loc_gt', 'conf_pred', 'conf_gt'])
    # loc_pred, loc_gt, conf_pred, conf_gt = model.net.MultiboxLoss(
    #     [mbox_layers[0], mbox_layers[1], mbox_layers[2], gt_label],
    #     ['loc_pred', 'loc_gt', 'conf_pred', 'conf_gt'])
    loc_pred_gpu = model.net.CopyCPUToGPU(loc_pred, 'loc_pred_gpu')
    loc_gt_gpu = model.net.CopyCPUToGPU(loc_gt, 'loc_gt_gpu')
    conf_pred_gpu = model.net.CopyCPUToGPU(conf_pred, 'conf_pred_gpu')
    conf_gt_gpu = model.net.CopyCPUToGPU(conf_gt, 'conf_gt_gpu')
    SmoothL1Loss = model.net.SmoothL1Loss([loc_pred_gpu, loc_gt_gpu], 'SmoothL1Loss')
    P, SoftmaxWithLoss = model.net.SoftmaxWithLoss(
        [conf_pred_gpu, conf_gt_gpu], ["P", "SoftmaxWithLoss"])
    model.AddGradientOperators([SmoothL1Loss, SoftmaxWithLoss])
    ITER = brew.iter(model, "iter")
    # ITER = model.param_init_net.ConstantFill([], 'ITER', shape=[1], value=0, dtype=core.DataType.INT32)
    LR = model.LearningRate(ITER, "LR", base_lr=-0.001, policy="step", stepsize=80000, gamma=0.1)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.WeightedSum([param, ONE, param_grad, LR], param)
def AddCheckpoints(model, checkpoint_iters, db_type):
    ITER = brew.iter(model, "iter")
    model.Checkpoint([ITER] + model.params, [],
                     db=os.path.join(unique_timestamp, "action_tufts_checkpoint_%05d.lmdb"),
                     db_type=db_type, every=checkpoint_iters)
def add_parameter_update_ops(model):
    brew.add_weight_decay(model, weight_decay)
    iter = brew.iter(model, "iter")
    lr = model.net.LearningRate(
        [iter],
        "lr",
        base_lr=base_learning_rate,
        policy="step",
        stepsize=stepsize,
        gamma=0.1,
    )
    for param in model.GetParams():
        param_grad = model.param_to_grad[param]
        param_momentum = model.param_init_net.ConstantFill(
            [param], param + '_momentum', value=0.0
        )
        # Update param_grad and param_momentum in place
        model.net.MomentumSGDUpdate(
            [param_grad, param_momentum, lr, param],
            [param_grad, param_momentum, param],
            momentum=0.9,
            # Nesterov momentum works slightly better than standard momentum
            nesterov=1,
        )
def AddTrainingOperators(model, output, label):
    loss = model.SquaredL2Distance([output, label], "loss")
    avgloss = model.AveragedLoss([loss], "avgloss")
    model.AddGradientOperators([avgloss])
    ITER = brew.iter(model, "iter")
    stepsize = int(train_iters / 2000)
    if stepsize > 30:
        stepsize = int(30 + (stepsize - 30) / 5)
    if stepsize < 1:
        stepsize = 1
    assert LEARN_RATE < 0
    LR = model.LearningRate(ITER, "LR", base_lr=LEARN_RATE, policy="step",
                            stepsize=stepsize, gamma=0.9995)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.WeightedSum([param, ONE, param_grad, LR], param)
    model.Checkpoint([ITER] + model.params, [],
                     db="checkpoint_%06d.lmdb", db_type="lmdb", every=10000)
def AddTrainingOperators(model):
    """
    opt = optimizer.build_sgd(model, base_learning_rate=1e-5, policy="step",
                              stepsize=1, gamma=0.999, momentum=0.9)
    # model.AddWeightDecay(1e-4)
    """
    # brew.add_weight_decay(model, 1e-4)
    ITER = brew.iter(model, "iter")
    # ITER = model.Iter("iter")
    LR = model.LearningRate(ITER, "LR", base_lr=0.01, policy="step",
                            stepsize=1, gamma=0.999, momentum=0.9)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.GetParams():
        param_grad = model.param_to_grad[param]
        param_momentum = model.param_init_net.ConstantFill([param], param + '_momentum', value=0.0)
        model.net.MomentumSGDUpdate(
            [param_grad, param_momentum, LR, param],
            [param_grad, param_momentum, param],
            momentum=0.9,
            nesterov=1,
        )
    return
def add_loss_and_backpropagation(model, probs, labels):
    # define cross-entropy
    x_entropy = model.LabelCrossEntropy([probs, labels], 'x_entropy')
    # compute the expected loss
    loss = model.AveragedLoss(x_entropy, "loss")
    # use the average loss we just computed to add gradient operators to the model
    model.AddGradientOperators([loss])
    # a counter
    ITER = brew.iter(model, "ITER")
    # set the learning rate schedule: stepsize is the iteration interval, and every
    # stepsize iterations the learning rate is multiplied by gamma
    LR = model.LearningRate(ITER, "LR", base_lr=-0.01, policy="step", stepsize=4, gamma=0.999)
    # regularization
    # brew.add_weight_decay(model, 0.001)
    # ONE is a constant value that is used in the gradient update. We only need
    # to create it once, so it is explicitly placed in param_init_net.
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    # Now, for each parameter, we do the gradient updates.
    for param in model.params:
        # Note how we get the gradient of each parameter - ModelHelper keeps
        # track of that.
        param_grad = model.param_to_grad[param]
        # The update is a simple weighted sum: param = param + param_grad * LR
        model.WeightedSum([param, ONE, param_grad, LR], param)
def AddTrainingOperators(model, softmax, label):
    """Adds training operators to the model."""
    xent = model.LabelCrossEntropy([softmax, label], 'xent')
    # compute the expected loss
    loss = model.AveragedLoss(xent, "loss")
    # track the accuracy of the model
    AddAccuracy(model, softmax, label)
    # use the average loss we just computed to add gradient operators to the model
    model.AddGradientOperators([loss])
    # do a simple stochastic gradient descent
    ITER = brew.iter(model, "iter")
    # set the learning rate schedule
    LR = model.LearningRate(ITER, "LR", base_lr=-0.1, policy="step", stepsize=1, gamma=0.999)
    # ONE is a constant value that is used in the gradient update. We only need
    # to create it once, so it is explicitly placed in param_init_net.
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    # Now, for each parameter, we do the gradient updates.
    for param in model.params:
        # Note how we get the gradient of each parameter - ModelHelper keeps
        # track of that.
        param_grad = model.param_to_grad[param]
        # The update is a simple weighted sum: param = param + param_grad * LR
        model.WeightedSum([param, ONE, param_grad, LR], param)
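# A hedged alternative to the hand-rolled WeightedSum loop above: Caffe2's
# optimizer helper can build the same plain-SGD update in one call. This is a
# minimal sketch, assuming the caffe2.python.optimizer module used in later
# snippets in this file; the function name below is hypothetical. Note that
# build_sgd takes a positive learning rate and applies the descent sign itself,
# unlike the negative base_lr passed to LearningRate above.
def AddTrainingOperatorsWithOptimizer(model, softmax, label):
    from caffe2.python import optimizer
    xent = model.LabelCrossEntropy([softmax, label], 'xent')
    loss = model.AveragedLoss(xent, "loss")
    AddAccuracy(model, softmax, label)
    model.AddGradientOperators([loss])
    # one helper call replaces the ITER / LR / ONE / WeightedSum boilerplate
    optimizer.build_sgd(model, base_learning_rate=0.1,
                        policy="step", stepsize=1, gamma=0.999)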
def add_check_points(model, time_stamp, checkpoint_iters, db_type):
    ITER = brew.iter(model, "iter")
    model.Checkpoint([ITER] + model.params, [],
                     db=os.path.join(time_stamp, "cifar10_checkpoint_%05d.lmdb"),
                     db_type=db_type, every=checkpoint_iters)
def AddCheckpoints(model, checkpoint_iters, db_type):
    ITER = brew.iter(model, "iter")
    model.Checkpoint([ITER] + model.params, [],
                     db=os.path.join(unique_timestamp, "cifar10_checkpoint_%05d.lmdb"),
                     db_type=db_type, every=checkpoint_iters)
def build_conv_model(model_name, batch_size):
    model_gen_map = conv_model_generators()
    assert model_name in model_gen_map, "Model " + model_name + " not found"
    model, input_size = model_gen_map[model_name]("NCHW", None)
    input_shape = [batch_size, 3, input_size, input_size]
    if model_name == "MLP":
        input_shape = [batch_size, input_size]
    model.param_init_net.GaussianFill(
        [], "data", shape=input_shape, mean=0.0, std=1.0
    )
    model.param_init_net.UniformIntFill(
        [], "label", shape=[batch_size, ], min=0, max=999
    )
    model.AddGradientOperators(["loss"])
    ITER = brew.iter(model, "iter")
    LR = model.net.LearningRate(
        ITER, "LR", base_lr=-1e-8, policy="step", stepsize=10000, gamma=0.999)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.net.WeightedSum([param, ONE, param_grad, LR], param)
    return model
def ScaffoldModelCheckpoints(model, checkpointFolder, every):
    newCheckpointFolder = str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    newCheckpointFolder = join(checkpointFolder, newCheckpointFolder)
    print('checkpoint folder: ' + newCheckpointFolder)
    makedirs(newCheckpointFolder)
    iter = brew.iter(model, 'iterations')
    model.Checkpoint([iter] + model.params, [],
                     db=join(newCheckpointFolder, 'dataset_checkpoint_%05d.lmdb'),
                     db_type='lmdb', every=every)
def AddParameterUpdate(model):
    """ Simple plain SGD update -- not tuned to actually train the models """
    ITER = brew.iter(model, "iter")
    LR = model.net.LearningRate(
        ITER, "LR", base_lr=-1e-8, policy="step", stepsize=10000, gamma=0.999)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.net.WeightedSum([param, ONE, param_grad, LR], param)
def AddParameterUpdate(model):
    """ Simple plain SGD update -- not tuned to actually train the models """
    ITER = brew.iter(model, "iter")
    LR = model.LearningRate(
        ITER, "LR", base_lr=-1e-8, policy="step", stepsize=10000, gamma=0.999)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.WeightedSum([param, ONE, param_grad, LR], param)
def add_parameter_update_ops(model):
    """A simple parameter update.

    :param model_helper.ModelHelper model: Model to add parameter update operators to.
    """
    iteration = brew.iter(model, "ITER")
    learning_rate = model.net.LearningRate([iteration], "LR", base_lr=0.01, policy="fixed")
    one = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.GetParams():
        grad = model.param_to_grad[param]
        model.WeightedSum([param, one, grad, learning_rate], param)
def AddTrainingOperators(model, softmax, label):
    """Adds training operators to the model."""
    # LabelCrossEntropy is used to compute the error of the predictions
    xent = model.LabelCrossEntropy([softmax, label], 'xent')
    # compute the expected loss from the cross-entropy xent
    loss = model.AveragedLoss(xent, "loss")
    # track the accuracy of the model
    AddAccuracy(model, softmax, label)
    # use the average loss we just computed to add gradient operators to the model;
    # the loss is the blob we want to differentiate and minimize
    '''
    Inputs:
    ys: a list or a dictionary specifying what blobs we want to compute
    derivatives of. If the input is a list, we will automatically generate
    their gradients with all-one values; if the input is a dictionary, for
    any dictionary entries that are not None, we will take the corresponding
    blobs as their gradients; for all those that are None, we will auto-fill
    them with 1.
    '''
    model.AddGradientOperators([loss])
    # do a simple stochastic gradient descent
    # ITER stores a single integer that gets incremented on each call to Run();
    # useful for tracking the iteration count during SGD, for example.
    ITER = brew.iter(model, "iter")
    # set the learning rate schedule
    LR = model.LearningRate(ITER, "LR", base_lr=-0.1, policy="step", stepsize=1, gamma=0.999)
    # ONE is a constant value that is used in the gradient update. We only need
    # to create it once, so it is explicitly placed in param_init_net.
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    # Now, for each parameter, we do the gradient updates.
    for param in model.params:
        # Note how we get the gradient of each parameter - ModelHelper keeps
        # track of that.
        param_grad = model.param_to_grad[param]
        # The update is a simple weighted sum: param = param + param_grad * LR
        model.WeightedSum([param, ONE, param_grad, LR], param)
    # We periodically checkpoint the parameters of the model via the Checkpoint
    # operator. Its `every` argument keeps us from checkpointing too often; here
    # we checkpoint every 20 iterations, which should be fine.
    model.Checkpoint([ITER] + model.params, [],
                     db="mnist_lenet_checkpoint_%05d.lmdb", db_type="lmdb", every=20)
def add_parameter_update_ops(model):
    brew.add_weight_decay(model, args.weight_decay)
    ITER = brew.iter(model, "ITER")
    stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
    LR = model.net.LearningRate(
        [ITER],
        "LR",
        base_lr=args.base_learning_rate,
        policy="step",
        stepsize=stepsz,
        gamma=0.1,
    )
    AddMomentumParameterUpdate(model, LR)
def AddTrainingParameters(model, softmax, label):
    xent = model.LabelCrossEntropy([softmax, label], 'xent')
    loss = model.AveragedLoss(xent, "loss")
    AddAccuracy(model, softmax, label)
    model.AddGradientOperators([loss])
    ITER = brew.iter(model, "iter")
    LR = model.LearningRate(ITER, "LR", base_lr=-0.1, policy="step", stepsize=1, gamma=0.999)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.WeightedSum([param, ONE, param_grad, LR], param)
def add_parameter_update_ops_resnet(model, base_learning_rate, weight_decay):
    brew.add_weight_decay(model, weight_decay)
    iter = brew.iter(model, "iter")
    lr = model.net.LearningRate([iter], "lr", base_lr=base_learning_rate,
                                policy="fixed", gamma=0.1)
    for param in model.GetParams():
        param_grad = model.param_to_grad[param]
        param_momentum = model.param_init_net.ConstantFill(
            [param], param + '_momentum', value=0.0
        )
        model.net.MomentumSGDUpdate(
            [param_grad, param_momentum, lr, param],
            [param_grad, param_momentum, param],
            momentum=0.9,
            nesterov=1)
def AddTrainingOperators(model, softmax, label):
    """Adds training operators to the model."""
    xent = model.LabelCrossEntropy([softmax, label], 'xent')
    # compute the expected loss
    loss = model.AveragedLoss(xent, "loss")
    # track the accuracy of the model
    AddAccuracy(model, softmax, label)
    # use the average loss we just computed to add gradient operators to the model
    model.AddGradientOperators([loss])
    ITER = brew.iter(model, "iter")
    LR = model.net.LearningRate(ITER, "LR", base_lr=-0.1, policy="step", stepsize=1, gamma=0.999)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.net.WeightedSum([param, ONE, param_grad, LR], param)
def add_parameter_update(model):
    """ Add a simple gradient based parameter update with stepwise adaptive learning rate. """
    # This counts the number of iterations we are making
    ITER = brew.iter(model, "iter")
    # Add an LR to the model, updated with a simple step policy every 1000 steps;
    # gamma is the decay factor applied at each step
    LR = model.LearningRate(ITER, "LR", base_lr=-args.base_learning_rate,
                            policy="step", stepsize=1000, gamma=0.999)
    # This is a constant used in the following loop
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    # Here we are essentially applying the gradients to the weights (using the classical method)
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.WeightedSum([param, ONE, param_grad, LR], param)
def AddTrainingOperators(model, predict, label, expect, base_lr, log=True):
    """Adds training operators to the model.

    predict: Predicted distribution by Policy Model
    expect: Expected distribution by MCTS, or transformed from Policy Model
    base_lr: Base Learning Rate. Always fixed
    """
    # compute the expected loss
    if label:
        onehot = AddOneHot(model, label)
        softmax, xent = model.SoftmaxWithLoss([predict, onehot], ['softmax', 'xent'], label_prob=1)
        AddAccuracy(model, softmax, label, log)
    else:
        softmax, xent = model.SoftmaxWithLoss([predict, expect], ['softmax', 'xent'], label_prob=1)
    loss = model.AveragedLoss(xent, "loss")
    # use the average loss we just computed to add gradient operators to the model
    model.AddGradientOperators([loss])
    # do a simple stochastic gradient descent
    ITER = brew.iter(model, "iter")
    # set the learning rate schedule
    LR = model.LearningRate(
        ITER, "LR", base_lr=base_lr,
        policy="fixed")  # when policy=fixed, stepsize and gamma are ignored
    # ONE is a constant value that is used in the gradient update. We only need
    # to create it once, so it is explicitly placed in param_init_net.
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    # Now, for each parameter, we do the gradient updates.
    for param in model.params:
        # Note how we get the gradient of each parameter - ModelHelper keeps
        # track of that.
        param_grad = model.param_to_grad[param]
        # The update is a simple weighted sum: param = param + param_grad * LR
        model.WeightedSum([param, ONE, param_grad, LR], param)
    if log:
        model.Print('loss', [], to_file=1)
def AddParameterUpdate_ops(model):
    brew.add_weight_decay(model, weight_decay)
    iter = brew.iter(model, "iter")
    lr = model.net.LearningRate(
        [iter],
        "lr",
        base_lr=base_learning_rate,
        policy="step",
        stepsize=stepsize,
        gamma=0.1,
    )
    for param in model.GetParams():
        param_grad = model.param_to_grad[param]
        param_momentum = model.param_init_net.ConstantFill([param], param + "_momentum", value=0.0)
        model.net.MomentumSGDUpdate(
            [param_grad, param_momentum, lr, param],
            [param_grad, param_momentum, param],
            momentum=0.9,
            nesterov=1,
        )
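# A hedged alternative to the explicit MomentumSGDUpdate loop above: a minimal
# sketch assuming optimizer.build_sgd forwards momentum and nesterov arguments
# to its SGD optimizer and creates the per-parameter momentum blobs itself.
# The function name is hypothetical, and it reuses the module-level
# weight_decay, base_learning_rate and stepsize globals assumed above.
def add_parameter_update_ops_with_optimizer(model):
    from caffe2.python import optimizer
    brew.add_weight_decay(model, weight_decay)
    optimizer.build_sgd(model, base_learning_rate=base_learning_rate,
                        policy="step", stepsize=stepsize, gamma=0.1,
                        momentum=0.9, nesterov=1)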
def AddTrainingOperators(model, softmax, label):
    """Adds training operators to the model."""
    xent = model.LabelCrossEntropy([softmax, label], 'xent')
    # compute the expected loss
    loss = model.AveragedLoss(xent, "loss")
    # track the accuracy of the model
    AddAccuracy(model, softmax, label)
    # use the average loss we just computed to add gradient operators to the model
    model.AddGradientOperators([loss])
    # do a simple stochastic gradient descent
    ITER = brew.iter(model, "iter")
    # set the learning rate schedule
    LR = model.LearningRate(
        ITER, "LR", base_lr=-0.1, policy="step", stepsize=1, gamma=0.999
    )
    # ONE is a constant value that is used in the gradient update. We only need
    # to create it once, so it is explicitly placed in param_init_net.
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    # Now, for each parameter, we do the gradient updates.
    for param in model.params:
        # Note how we get the gradient of each parameter - ModelHelper keeps
        # track of that.
        param_grad = model.param_to_grad[param]
        # The update is a simple weighted sum: param = param + param_grad * LR
        model.WeightedSum([param, ONE, param_grad, LR], param)
def AddCheckpoints(model, checkpoint_iters, db_type):
    ITER = brew.iter(model, "iter")
    model.Checkpoint([ITER] + model.params, [],
                     db="mstar_lenet_checkpoint_%05d.lmdb",
                     db_type=db_type, every=checkpoint_iters)
def Iter(self, *args, **kwargs):
    return brew.iter(self, *args, **kwargs)
def AddTrainingOperators(model, softmax, label):
    # compute the cross-entropy between the softmax scores and the labels
    xent = model.LabelCrossEntropy([softmax, label], 'xent')
    # compute the expected loss
    loss = model.AveragedLoss(xent, "loss")
    # track the accuracy of the model
    AddAccuracy(model, softmax, label)
    # use the average loss we just computed to add gradient operators to the model
    model.AddGradientOperators([loss])
    # do a simple stochastic gradient descent
    ITER = brew.iter(model, "iter")
    # set the learning rate schedule
    LR = model.LearningRate(ITER, "LR", base_lr=-0.1, policy="step", stepsize=1, gamma=0.999)
    # ONE is a constant value that is used in the gradient update. We only need
    # to create it once, so it is explicitly placed in param_init_net.
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    # Now, for each parameter, we do the gradient updates.
    for param in model.params:
        # Note how we get the gradient of each parameter - CNNModelHelper keeps
        # track of that.
        param_grad = model.param_to_grad[param]
        # The update is a simple weighted sum: param = param + param_grad * LR
        model.WeightedSum([param, ONE, param_grad, LR], param)
    # let's checkpoint every 20 iterations, which should probably be fine.
    # you may need to delete tutorial_files/tutorial-mnist to re-run the tutorial
    model.Checkpoint([ITER] + model.params, [],
                     db="mnist_lenet_checkpoint_%05d.leveldb",
                     db_type="leveldb", every=20)


arg_scope = {"order": "NCHW"}
train_model = model_helper.ModelHelper(name="mnist_train", arg_scope=arg_scope)
data, label = AddInput(train_model, batch_size=64,
                       db=os.path.join(data_folder, 'mnist-train-nchw-leveldb'),
                       db_type='leveldb')
softmax = AddLeNetModel(train_model, data)
AddTrainingOperators(train_model, softmax, label)

# Testing model. We will set the batch size to 100, so that the testing
# pass is 100 iterations (10,000 images in total).
# For the testing model, we need the data input part, the main LeNetModel
# part, and an accuracy part. Note that init_params is set False because
# we will be using the parameters obtained from the train model.
test_model = model_helper.ModelHelper(name="mnist_test", arg_scope=arg_scope, init_params=False)
data, label = AddInput(test_model, batch_size=100,
                       db=os.path.join(data_folder, 'mnist-test-nchw-leveldb'),
                       db_type='leveldb')
softmax = AddLeNetModel(test_model, data)
AddAccuracy(test_model, softmax, label)

# Deployment model. We simply need the main LeNetModel part.
deploy_model = model_helper.ModelHelper(name="mnist_deploy", arg_scope=arg_scope, init_params=False)
AddLeNetModel(deploy_model, "data")

graph = net_drawer.GetPydotGraphMinimal(train_model.net.Proto().op, "mnist",
                                        rankdir="LR", minimal_dependency=True)
display.Image(graph.create_png(), width=800)

with open(os.path.join(root_folder, "train_net.pbtxt"), 'w') as fid:
    fid.write(str(train_model.net.Proto()))
with open(os.path.join(root_folder, "train_init_net.pbtxt"), 'w') as fid:
    fid.write(str(train_model.param_init_net.Proto()))
with open(os.path.join(root_folder, "test_net.pbtxt"), 'w') as fid:
    fid.write(str(test_model.net.Proto()))
with open(os.path.join(root_folder, "test_init_net.pbtxt"), 'w') as fid:
    fid.write(str(test_model.param_init_net.Proto()))
with open(os.path.join(root_folder, "deploy_net.pbtxt"), 'w') as fid:
    fid.write(str(deploy_model.net.Proto()))

print("Protocol buffers files have been created in your root folder: " + root_folder)

# The parameter initialization network only needs to be run once.
workspace.RunNetOnce(train_model.param_init_net)
# creating the network
workspace.CreateNet(train_model.net)
# set the number of iterations and track the accuracy & loss
total_iters = 200
accuracy = np.zeros(total_iters)
loss = np.zeros(total_iters)
# Now, we will manually run the network for 200 iterations.
for i in range(total_iters):
    workspace.RunNet(train_model.net.Proto().name)
    accuracy[i] = workspace.FetchBlob('accuracy')
    loss[i] = workspace.FetchBlob('loss')
# After the execution is done, let's plot the values.
pyplot.plot(loss, 'b')
pyplot.plot(accuracy, 'r')
pyplot.legend(('Loss', 'Accuracy'), loc='upper right')

# Let's look at some of the data.
pyplot.figure()
data = workspace.FetchBlob('data')
_ = visualize.NCHW.ShowMultiple(data)
pyplot.figure()
softmax = workspace.FetchBlob('softmax')
_ = pyplot.plot(softmax[0], 'ro')
pyplot.title('Prediction for the first image')

# run a test pass on the test net
workspace.RunNetOnce(test_model.param_init_net)
workspace.CreateNet(test_model.net)
test_accuracy = np.zeros(100)
for i in range(100):
    workspace.RunNet(test_model.net.Proto().name)
    test_accuracy[i] = workspace.FetchBlob('accuracy')
# After the execution is done, let's plot the values.
pyplot.plot(test_accuracy, 'r')
pyplot.title('Accuracy over test batches.')
print('test_accuracy: %f' % test_accuracy.mean())
def CreateModel(self):
    log.debug("Start training")
    model = model_helper.ModelHelper(name="char_rnn")

    input_blob, seq_lengths, hidden_init, cell_init, target = \
        model.net.AddExternalInputs(
            'input_blob',
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )

    hidden_output_all, self.hidden_output, _, self.cell_state = LSTM(
        model, input_blob, seq_lengths, (hidden_init, cell_init),
        self.D, self.hidden_size, scope="LSTM")
    output = brew.fc(model, hidden_output_all, None,
                     dim_in=self.hidden_size, dim_out=self.D, axis=2)

    # axis is 2 as first two are T (time) and N (batch size).
    # We treat them as one big batch of size T * N
    softmax = model.net.Softmax(output, 'softmax', axis=2)

    softmax_reshaped, _ = model.net.Reshape(
        softmax, ['softmax_reshaped', '_'], shape=[-1, self.D])

    # Create a copy of the current net. We will use it on the forward
    # pass where we don't need loss and backward operators
    self.forward_net = core.Net(model.net.Proto())

    xent = model.net.LabelCrossEntropy([softmax_reshaped, target], 'xent')
    # Loss is averaged both across the batch and through time.
    # That's why the learning rate below is multiplied by self.seq_length
    loss = model.net.AveragedLoss(xent, 'loss')
    model.AddGradientOperators([loss])

    # Hand-made SGD update. Normally one can use helper functions
    # to build an optimizer
    ITER = brew.iter(model, "iter")
    LR = model.LearningRate(ITER, "LR", base_lr=-0.1 * self.seq_length,
                            policy="step", stepsize=1, gamma=0.9999)
    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)

    # Update weights for each of the model parameters
    for param in model.params:
        param_grad = model.param_to_grad[param]
        model.net.WeightedSum([param, ONE, param_grad, LR], param)

    self.model = model
    self.predictions = softmax
    self.loss = loss

    self.prepare_state = core.Net("prepare_state")
    self.prepare_state.Copy(self.hidden_output, hidden_init)
    self.prepare_state.Copy(self.cell_state, cell_init)
softmax = AddLeNetModel(train_model, data)

##################################################################################
#### Step 3: Add training operators to the model
# TODO: use the optimizer class here instead of doing sgd by hand
xent = train_model.LabelCrossEntropy(['softmax', 'label'], 'xent')
loss = train_model.AveragedLoss(xent, 'loss')
brew.accuracy(train_model, ['softmax', 'label'], 'accuracy')
train_model.AddGradientOperators([loss])
opt = optimizer.build_sgd(train_model, base_learning_rate=0.1)
for param in train_model.GetOptimizationParamInfo():
    opt(train_model.net, train_model.param_init_net, param)

# model.Checkpoint([ITER] + model.params, [], db="mnist_lenet_checkpoint_%05d.lmdb", db_type="lmdb", every=20)
ITER = brew.iter(train_model, "iter")
train_model.Checkpoint([ITER] + train_model.params, [],
                       db="mnist_lenet_checkpoint_%05d.lmdb",
                       db_type="lmdb", every=checkpoint_iters)

##################################################################################
#### Run the training procedure
# run the param init network once
workspace.RunNetOnce(train_model.param_init_net)
# create the network
workspace.CreateNet(train_model.net, overwrite=True)
# Set the total number of iterations and track the accuracy and loss
total_iters = training_iters
accuracy = np.zeros(total_iters)