def test_train_step_with_backprop(self):
    net = PredCodMLP([3, 2, 2])
    input = np.array([[1, 0, 1]])
    X = add_bias_col(input)
    params = [p.copy() for p in net.params]
    output = np.array([1])

    # Forward pass of an equivalent two-layer ReLU network.
    h = np.maximum(0, X.dot(params[0]))
    h = add_bias_col(h)
    scores = h.dot(params[1])
    loss, dscores = cross_entropy_loss(scores, output)

    # Backward pass (reference gradients from ordinary backprop).
    dw2 = h.T.dot(dscores)
    dh = dscores.dot(params[1][:-1, :].T)
    dh[h[:, :-1] <= 0] = 0
    dw1 = X.T.dot(dh)

    # Apply the same optimizer to the reference gradients.
    configs = [{}, {}]
    params[0], _ = adam(params[0], dw1, configs[0])
    params[1], _ = adam(params[1], dw2, configs[1])

    # The predictive-coding train step should reproduce the backprop update.
    pc_params, layers = net._PredCodMLP__train_step(
        net.params, input, output, [{}, {}])
    np.testing.assert_array_almost_equal(pc_params[0], params[0])
    np.testing.assert_array_equal(pc_params[1], params[1])
def build(self, deterministic=False):
    self.model()
    self.predict['pspout'] = lasagne.layers.get_output(self.network['pspout'])
    self.predict['angleout'] = lasagne.layers.get_output(self.network['angleout'])

    # RMSE losses for the perspective and angle outputs.
    self.loss['pspout'] = T.sqrt(lasagne.objectives.squared_error(
        self.predict['pspout'], self.psp_var).mean())
    self.loss['angleout'] = T.sqrt(lasagne.objectives.squared_error(
        self.predict['angleout'], self.angler_var).mean())
    self.loss['train'] = self.loss['pspout'] + self.loss['angleout']

    _inputs = [self.input_var, self.psp_var, self.angler_var]
    paras = lasagne.layers.get_all_params(self.network['angleout'], trainable=True)

    print('Building training functions...')
    # One learning-rate variable per parameter, grouped by layer block.
    lrvars = []
    self.layer_rep.append(len(paras))
    for lr, i in zip(self.learning_rate, range(len(self.layer_rep) - 1)):
        lrvars += [theano.shared(np.float32(lr))
                   for j in range(self.layer_rep[i], self.layer_rep[i + 1])]
    self.updates, self.update_params = adam(self.loss['train'], paras, lrvars)
    self.ftrain = theano.function(_inputs, [], updates=self.updates)

    print('Building validation functions...')
    # Mean relative error of each output.
    self.vals['pspout'] = abs((self.predict['pspout'] - self.psp_var) / self.psp_var).mean()
    self.vals['angleout'] = abs((self.predict['angleout'] - self.angler_var) / self.angler_var).mean()
    self.fval = theano.function(
        _inputs, [self.loss['train']] + [self.vals['pspout'], self.vals['angleout']])
def __train_step(self, params: List[np.ndarray], input: np.ndarray,
                 output: np.ndarray, optim_configs: List[dict],
                 lr=0.01) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    scores, layers = self.__predict(params, input)
    #layers[-1] = output
    X = add_bias_col(input)

    for t in range(self.num_layers - 1):
        # Prediction error of the first hidden layer against its bottom-up estimate.
        curr_mu = X.dot(params[0])
        curr_err = layers[1] - curr_mu
        if t == self.num_layers - 2:
            N = X.shape[0]
            dW = X.T.dot(curr_err) / N
            params[0], optim_configs[0] = adam(params[0], dW, optim_configs[0])

        """
        for i in range(1, self.num_layers - 2):
            update_W = t == self.num_layers - 2 - i
            config = optim_configs[i]
            layers[i], params[i], curr_err, optim_configs[i] = self.__update_layer(
                layers[i], curr_err, params[i], layers[i + 1], update_W, config, lr)
        """

        update_W = t == 0
        loss, dscores = cross_entropy_loss(scores, output)
        layers[1], params[1], optim_configs[1] = self.__update_final_layer(
            layers[1], curr_err, params[1], dscores, update_W, optim_configs[1], lr)

    return params, layers
def __update_final_layer(self, X: np.ndarray, err: np.ndarray, W: np.ndarray,
                         next_err: np.ndarray, update_W: bool, config=None,
                         lr=0.1) -> Tuple[np.ndarray, np.ndarray, dict]:
    h = add_bias_col(np.maximum(0, X))
    relu_mask = X > 0
    # Move the layer activity along the prediction errors: subtract its own
    # error and add the error propagated back from the layer above.
    X += -err + relu_mask * next_err.dot(W.T[:, :-1])
    if update_W:
        N = X.shape[0]
        dW = h.T.dot(next_err) / N
        W, config = adam(W, dW, config)
    return X, W, config
def build(self, deterministic=False):
    self.model()
    self.predict["pspout"] = lasagne.layers.get_output(self.network["pspout"], deterministic=deterministic)
    self.predict["pspimg"] = lasagne.layers.get_output(self.network["pspimg"], deterministic=deterministic)
    self.predict["angleout"] = lasagne.layers.get_output(self.network["angleout"], deterministic=deterministic)
    self.predict["angleimg"] = lasagne.layers.get_output(self.network["angleimg"], deterministic=deterministic)
    self.predict["mal"] = lasagne.layers.get_output(self.network["mal"], deterministic=deterministic)

    self.loss["pspout"] = T.sqrt(lasagne.objectives.squared_error(self.predict["pspout"], self.psp_var).mean())
    self.loss["angleout"] = lasagne.objectives.categorical_crossentropy(
        self.predict["angleout"], self.angle_var).mean()
    # self.loss['mal'] = self.predict['mal'].mean()
    self.loss["mal"] = 0.01 * T.sqrt(
        lasagne.objectives.squared_error(T.argmax(self.predict["angleout"]), self.angle_var).mean())

    # Sum the losses of the outputs selected in self.opt.
    self.loss["train"] = 0
    _inputs = [self.input_var]
    for i, _opt in enumerate(self.opt):
        if _opt == 1:
            if i < 3:
                _inputs.append(self.inputs[i])
            self.loss["train"] += self.loss[self.optkey[i]]
            paras = lasagne.layers.get_all_params(self.network[self.optkey[i]], trainable=True)
            print("Loss %s added..." % self.optkey[i])

    if deterministic == False:
        print("Building training functions...")
        # One learning-rate variable per parameter, grouped by layer block.
        lrvars = []
        self.layer_rep.append(len(paras))
        for lr, i in zip(self.learning_rate, range(len(self.layer_rep) - 1)):
            lrvars += [theano.shared(np.float32(lr))
                       for j in range(self.layer_rep[i], self.layer_rep[i + 1])]
        self.updates, self.update_params = adam(self.loss["train"], paras, lrvars)
        self.ftrain = theano.function(_inputs, [], updates=self.updates)

        print("Building validation functions...")
        self.vals["pspout"] = abs((self.predict["pspout"] - self.psp_var) / self.psp_var).mean()
        self.vals["angleout"] = T.eq(T.argmax(self.predict["angleout"], axis=1), self.angle_var).mean()
        self.vals["mal"] = self.loss["mal"]
        self.fval = theano.function(
            _inputs,
            [self.loss["train"]] + [self.vals[self.optkey[i]] for i in range(len(self.opt)) if self.opt[i] == 1],
        )
    else:
        print("Building middle output functions...")
        for f in self.feat:
            self.ffeat[f] = theano.function([self.input_var], lasagne.layers.get_output(self.network[f]))
        print("Building prediction functions...")
        for key in self.predict:
            if key != "mal":
                self.fpred[key] = theano.function([self.input_var], self.predict[key])
def __update_layer(self, X: np.ndarray, err: np.ndarray, W: np.ndarray,
                   next_X: np.ndarray, update_W: bool, config=None,
                   lr=.01) -> Tuple[np.ndarray, np.ndarray, np.ndarray, dict]:
    h = add_bias_col(np.maximum(0, X))
    next_mu = h.dot(W)
    next_err = next_X - next_mu

    # Move the layer activity along the prediction errors.
    relu_mask = X > 0
    X += -err + relu_mask * next_err.dot(W.T[:, :-1])

    if update_W:
        N = X.shape[0]
        dW = h.T.dot(next_err) / N
        W, config = adam(W, dW, config)

    # Recompute the prediction error of the layer above with the updated activity.
    h = add_bias_col(np.maximum(0, X))
    next_mu = h.dot(W)
    next_err = next_X - next_mu
    return X, W, next_err, config
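# ---------------------------------------------------------------------------
# Note: the predictive-coding helpers above call a NumPy-level
# adam(w, dw, config) that returns the updated parameter together with its
# optimizer state. That helper is not shown in this section; the sketch below
# only illustrates the conventional (CS231n-style) signature those calls
# imply -- the repository's own implementation may differ in its defaults.
# ---------------------------------------------------------------------------
import numpy as np

def adam(w, dw, config=None):
    """One Adam step on array w given gradient dw; returns (next_w, config).
    Sketch under assumed hyperparameter names stored in `config`."""
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(w))   # first-moment estimate
    config.setdefault('v', np.zeros_like(w))   # second-moment estimate
    config.setdefault('t', 0)                  # timestep for bias correction

    config['t'] += 1
    config['m'] = config['beta1'] * config['m'] + (1 - config['beta1']) * dw
    config['v'] = config['beta2'] * config['v'] + (1 - config['beta2']) * dw ** 2
    m_hat = config['m'] / (1 - config['beta1'] ** config['t'])
    v_hat = config['v'] / (1 - config['beta2'] ** config['t'])
    next_w = w - config['learning_rate'] * m_hat / (np.sqrt(v_hat) + config['epsilon'])
    return next_w, config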
def train(train_x, train_y, val_x, val_y, d, hl, ol, config, lf):
    print("Function Invoked: train")
    epochs, eta, alpha, init_strategy, optimiser, batch_size, ac = (
        config["epochs"], config["learning_rate"], config["weight_decay"],
        config["init_strategy"], config["optimiser"], config["batch_size"],
        config["ac"])

    if optimiser == "vgd":
        return vgd.vgd(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                       epochs, eta, init_strategy, alpha)
    elif optimiser == "sgd":
        return sgd.sgd(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                       epochs, eta, init_strategy, alpha)
    elif optimiser == "mgd":
        return mgd.mgd(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                       epochs, eta, init_strategy, batch_size, alpha)
    elif optimiser == "nag":
        return nag.nag(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                       epochs, eta, init_strategy, batch_size, alpha)
    elif optimiser == "adam":
        return adam.adam(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                         epochs, eta, init_strategy, batch_size)
    elif optimiser == "rmsprop":
        return rmsprop.rmsprop(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                               epochs, eta, init_strategy, batch_size)
    elif optimiser == "nadam":
        return nadam.nadam(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                           epochs, eta, init_strategy, batch_size)
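# ---------------------------------------------------------------------------
# Hypothetical usage of the dispatcher above. The key names come directly
# from the unpacking in train(); the concrete values and the shapes of the
# remaining positional arguments (d, hl, ol, lf) are illustrative guesses,
# not taken from the source.
# ---------------------------------------------------------------------------
config = {
    "epochs": 10,
    "learning_rate": 1e-3,
    "weight_decay": 0.0,
    "init_strategy": "xavier",
    "optimiser": "adam",
    "batch_size": 64,
    "ac": "relu",
}
# train_x, train_y, val_x, val_y, d, hl, ol, lf are assumed to be defined by
# the caller exactly as the selected optimiser module expects them.
result = train(train_x, train_y, val_x, val_y, d, hl, ol, config, lf)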
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='/home/hudson/ug/tzbq97/deeplearn/data/mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    # (the plain-SGD update rule from the original tutorial is kept below for
    # reference but disabled in favour of the Adam updates)
    '''grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]'''

    updates = adam(cost, params, learning_rate=0.001, b1=0.9, b2=0.999,
                   e=1e-8, gamma=1 - 1e-8)

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
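# ---------------------------------------------------------------------------
# Note: the call `adam(cost, params, learning_rate=..., b1=..., b2=..., e=...,
# gamma=...)` above assumes a Theano helper that builds the Adam update list
# symbolically, in the variant (circulated in early Theano code) where the
# first-moment coefficient b1 is slowly decayed by `gamma`. The helper itself
# is not shown in this section; the sketch below is only an assumption of
# that signature and may differ from the file actually used.
# ---------------------------------------------------------------------------
import numpy as np
import theano
import theano.tensor as T

def adam(cost, params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1 - 1e-8):
    """Return a Theano update list implementing Adam (sketch only)."""
    updates = []
    grads = T.grad(cost, params)
    t = theano.shared(np.float32(1.0))      # timestep
    b1_t = b1 * gamma ** (t - 1)            # decay the first-moment coefficient

    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)   # first-moment accumulator
        v = theano.shared(p.get_value() * 0.)   # second-moment accumulator

        m_t = b1_t * m + (1 - b1_t) * g
        v_t = b2 * v + (1 - b2) * g ** 2
        m_hat = m_t / (1 - b1 ** t)             # bias correction
        v_hat = v_t / (1 - b2 ** t)
        p_t = p - learning_rate * m_hat / (T.sqrt(v_hat) + e)

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t, t + 1.0))
    return updates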
inps_sg = inps_net + [target_gradients, activation, latent_gradients, samples]

tparams_net = OrderedDict()
tparams_sg = OrderedDict()
for key, val in tparams.iteritems():
    print key
    # running means and running variances should not be included in the
    # list for which gradients are computed
    if 'rm' in key or 'rv' in key:
        continue
    elif 'sg' in key:
        tparams_sg[key] = val
    else:
        tparams_net[key] = val

print "Setting up optimizers"
f_grad_shared, f_update = adam(lr, tparams_net, grads, inps_net, outs)
# f_grad_shared_sg, f_update_sg = adam(lr, tparams_sg, grads_sg, inps_sg, loss_sg)

# sgd with momentum updates
sgd = SGD(lr=args.learning_rate)
f_update_sg = theano.function(inps_sg, loss_sg,
                              updates=sgd.get_grad_updates(loss_sg, param_sg),
                              on_unused_input='ignore', profile=False)

print "Training"
cost_report = open('./Results/disc/SF/gradcomp_' + code_name + '_' +
                   str(args.batch_size) + '_' + str(args.learning_rate) + '.txt', 'w')
id_order = range(len(trc))

iters = 0
min_cost = 100000.0
epoch = 0
condition = False
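# ---------------------------------------------------------------------------
# Note: the `adam(lr, tparams, grads, inps, outs)` used above (and again
# further down, once with a `ups=` keyword for batch-norm updates) follows the
# two-function pattern from old dl4mt-style Theano code: one compiled function
# accumulates gradients into shared storage and returns the outputs, a second
# applies the Adam step given the learning rate. The actual helper is not part
# of this section; a rough sketch under those assumptions:
# ---------------------------------------------------------------------------
import numpy as np
import theano
import theano.tensor as T

def adam(lr, tparams, grads, inp, cost, b1=0.9, b2=0.999, e=1e-8, ups=None):
    """Sketch of a dl4mt-style Adam returning (f_grad_shared, f_update).
    `cost` may be a single expression or a list of outputs; `ups` carries
    extra updates (e.g. batch-norm running statistics)."""
    outputs = list(cost) if isinstance(cost, (list, tuple)) else [cost]

    # Shared storage for gradients, filled by f_grad_shared.
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    if ups:
        gsup += list(ups)
    f_grad_shared = theano.function(inp, outputs, updates=gsup)

    # Adam step, applied by f_update(learning_rate).
    updates = []
    t_prev = theano.shared(np.float32(0.))
    t = t_prev + 1.
    lr_t = lr * T.sqrt(1. - b2 ** t) / (1. - b1 ** t)
    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = b1 * m + (1. - b1) * g
        v_t = b2 * v + (1. - b2) * g ** 2
        p_t = p - lr_t * m_t / (T.sqrt(v_t) + e)
        updates += [(m, m_t), (v, v_t), (p, p_t)]
    updates.append((t_prev, t))
    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')
    return f_grad_shared, f_update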
for i in range(num_of_epochs):
    tmp_train_files = copy.copy(train_files)

    # progress bar (begin)
    sys.stdout.write("Epoch {}: ".format(i + 1))
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))

    # an epoch
    for j in range(train_size // batch_size):
        batch_files = random.sample(tmp_train_files, batch_size)
        for file in batch_files:
            tmp_train_files.remove(file)
        theta, moment_1, moment_2 = adam(batch_files, theta, moment_1, moment_2)

        # progress bar
        sys.stdout.write("#")
        sys.stdout.flush()

    # progress bar (end)
    sys.stdout.write("] Done.\n")

    loss_list_for_train.append(avg_loss(train_files, theta))
    loss_list_for_valid.append(avg_loss(valid_files, theta))
    accuracy_list_for_train.append(avg_accuracy(train_files, theta))
    accuracy_list_for_valid.append(avg_accuracy(valid_files, theta))

# plot graphs
title = "SGD loss on train_data"
batch_size = 5000
loss = "ce"
opt = "adam"
lr = 0.001
anneal = False
momentum = 0.1

total_loss_train, total_loss_val = [0] * 4, [0] * 4
for i in range(0, 4):
    weights = [0] * layers
    biases = [0] * layers
    weights, biases = randInit.randomInitializer(weights, biases, inputs_num,
                                                 outputs_num, layers, sizes[i])
    total_loss_train[i], total_loss_val[i], train_weights, train_biases = adam.adam(
        train_input, train_label, val_input, val_label, layers, biases, weights,
        activation, batch_size, loss, epochs, lr, anneal, gamma, expt_dir)
    print("value of i is %d" % i)

print(total_loss_train, total_loss_val)

# for i in range(0, 4):
#     total_loss_train[i] = total_loss_train[i].reshape(total_loss_train[i].shape[0], 1)
#     total_loss_val[i] = total_loss_val[i].reshape(total_loss_val[i].shape[0], 1)

epoch_count = np.arange(1, epochs + 1)
epoch_count = epoch_count.reshape(epoch_count.shape[0], 1)

plt.figure(1)
plt.plot(epoch_count, total_loss_train[0].T, 'r-', label='50')
plt.plot(epoch_count, total_loss_train[1].T, 'b-', label='100')
grads_sg = T.grad(loss_sg, wrt=sg_params_list)
grads_net = grad_list_3 + grad_list_2[:2] + grad_list_1[:2]

# key things
inps_sg = [semi_synth_grad_1, semi_synth_grad_2, h1, h2]
required_output_from_graph = [grad_list_2[2], grad_list_1[2], out1, out2]

lr = T.scalar('learning_rate', dtype='float32')
inps = [img_ids]

print "Setting up optimizer"
if train_rou == 'backprop':
    f_grad_shared, f_update = adam(lr, tparams, grads, inps, loss)

elif train_rou == 'synthetic_gradients':
    # split params for sg modules and network
    tparams_net = OrderedDict()
    tparams_sg = OrderedDict()
    for key, val in tparams.iteritems():
        if 'sg' in key:
            tparams_sg[key] = tparams[key]
        else:
            tparams_net[key] = tparams[key]

    f_grad_shared, f_update = adam(lr, tparams_net, grads_net, inps,
                                   [loss] + required_output_from_graph)
    f_grad_shared_sg, f_update_sg = adam(lr, tparams_sg, grads_sg,
grad = lambda x: H @ x

# l2 regularization
lam = 0.1
f_reg = lambda x: lam * x @ x
grad_reg = lambda x: 2 * lam * x

# covariance
C = 5.0 * np.eye(dim)

# risk function
rn = lambda x: x @ x + 0.5 * np.trace(H @ C)

from adam import adam

y_best, Y = adam(grad, x0, C,
                 max_iter=1000,
                 batch_size=10,
                 eta=2,
                 beta1=0.9,
                 beta2=0.9,
                 eps=1e-7,
                 func=f,
                 verbose=True)

# optimize
x_best, X = adam_cv(grad, x0, C, grad_reg,
                    max_iter=1000,
                    batch_size=10,
                    eta=2,
                    beta1=0.9,
                    beta2=0.9,
def __init__(self, We, params):

    lstm_layers_num = 1
    en_hidden_size = We.shape[1]
    self.eta = params.eta
    self.num_labels = params.num_labels
    self.en_hidden_size = en_hidden_size
    self.de_hidden_size = params.de_hidden_size
    self.lstm_layers_num = params.lstm_layers_num
    self._train = None
    self._utter = None
    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.hos = []
    self.Cos = []

    encoderInputs = tensor.imatrix()
    decoderInputs, decoderTarget = tensor.imatrices(2)
    encoderMask, TF, decoderMask, decoderInputs0, diagonal = tensor.fmatrices(5)

    self.lookuptable = theano.shared(We)

    #### the last entry is for the start symbol
    self.de_lookuptable = theano.shared(
        name="Decoder LookUpTable",
        value=init_xavier_uniform(self.num_labels + 1, self.de_hidden_size),
        borrow=True)

    self.linear = theano.shared(
        name="Linear",
        value=init_xavier_uniform(self.de_hidden_size + 2 * en_hidden_size, self.num_labels),
        borrow=True)
    self.linear_bias = theano.shared(
        name="Linear Bias",
        value=np.asarray(np.random.randn(self.num_labels, ) * 0., dtype=theano.config.floatX),
        borrow=True)

    self.hidden_decode = theano.shared(
        name="Hidden to Decode",
        value=init_xavier_uniform(2 * en_hidden_size, self.de_hidden_size),
        borrow=True)
    self.hidden_bias = theano.shared(
        name="Hidden to Bias",
        value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0., dtype=theano.config.floatX),
        borrow=True)

    #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]
    self.params += [self.linear, self.linear_bias, self.de_lookuptable,
                    self.hidden_decode, self.hidden_bias]    #concatenate

    #the initial hidden state of decoder lstm is zeros
    #(max_sent_size, batch_size, hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(
        (encoderInputs.shape[0], encoderInputs.shape[1], self.en_hidden_size))
    for _ in range(self.lstm_layers_num):
        enclstm_f = LSTM(self.en_hidden_size)
        enclstm_b = LSTM(self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)    #append
        self.encoder_lstm_layers.append(enclstm_b)    #append
        self.params += enclstm_f.params + enclstm_b.params    #concatenate

        hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
        hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

        hs = tensor.concatenate([hs_f, hs_b], axis=2)
        Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
        hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
        Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
        #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
        #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
        self.hos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                                 encoderInputs.shape[1], self.de_hidden_size),
        self.Cos += tensor.alloc(np.asarray(0., dtype=theano.config.floatX),
                                 encoderInputs.shape[1], self.de_hidden_size),
        state_below = hs

    Encoder = state_below

    Encoder_shuffle = Encoder.dimshuffle(1, 0, 2)
    ## Encoder_shuffle    : B x L x h_e
    ## Encoder_shuffle_re : B x L x h_d
    Encoder_shuffle_re = tensor.dot(Encoder_shuffle, self.hidden_decode) + \
        self.hidden_bias[None, None, :]

    state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
        (decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size)
        self.decoder_lstm_layers += declstm,    #append
        self.params += declstm.params    #concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

    ##### Here we include the representation from the decoder
    decoder_lstm_outputs = tensor.concatenate([state_below, Encoder], axis=2)

    ei, di, dt = tensor.imatrices(3)    #place holders
    em, dm, tf, di0, diag = tensor.fmatrices(5)

    #####################################################
    #####################################################
    linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) + \
        self.linear_bias[None, None, :]
    softmax_outputs, updates = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y])

    costs, _ = theano.scan(
        fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    updates = lasagne.updates.adam(loss, self.params, self.eta)
    #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ###################################################
    self._train = theano.function(
        inputs=[ei, em, di, dm, dt],
        outputs=[loss, softmax_outputs],
        updates=updates,
        givens={encoderInputs: ei, encoderMask: em, decoderInputs: di,
                decoderMask: dm, decoderTarget: dt})

    #########################################################################
    ### For schedule sampling
    #########################################################################

    ###### always use the previous prediction as the next input
    def _step2(diag_, state_, hs_, Cs_):
        hs, Cs = [], []
        token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
        msk_ = tensor.fill((tensor.zeros_like(token_idxs, dtype="float32")), 1)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, encoderInputs.shape[1], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])    #mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h

        hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape(
            (encoderInputs.shape[1], self.de_hidden_size))

        attn_index = tensor.nonzero(diag_, True)
        attn_value = tensor.nonzero_values(diag_)
        en_context = Encoder_shuffle[:, attn_index[0], :]
        attn_context = Encoder_shuffle_re[:, attn_index[0], :]
        attn_weight = tensor.batched_dot(attn_context, state_below0)
        attn_weight = tensor.nnet.softmax(attn_weight)

        #attn_weight *= (encoderMask.dimshuffle(1,0))
        attn_weight *= (attn_value.dimshuffle('x', 0))
        ##attn_weight = attn_weight/(tensor.sum(attn_weight, axis=1).dimshuffle(0,'x'))
        ####### ctx_ : (b, h)
        ctx_ = tensor.sum(en_context * attn_weight[:, :, None], axis=1)

        state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)

        newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = tensor.nnet.softmax(newpred)
        ##### the probability of the start symbol is 0
        extra_p = tensor.zeros_like(hs[:, :, 0])
        state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

        return state_below, hs, Cs

    hs0, Cs0 = tensor.as_tensor_variable(self.hos, name="hs0"), \
        tensor.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2,
                                   sequences=[diagonal],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=encoderInputs.shape[0])

    train_predict = train_outputs[0]
    train_costs, _ = theano.scan(
        fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

    train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
        lasagne.regularization.l2(x) for x in self.params)

    from adam import adam
    train_updates = adam(train_loss, self.params, self.eta)
    #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)

    self._train2 = theano.function(
        inputs=[ei, em, di0, dm, dt, diag],
        outputs=[train_loss, train_predict],
        updates=train_updates,
        givens={encoderInputs: ei, encoderMask: em, decoderInputs0: di0,
                decoderMask: dm, decoderTarget: dt, diagonal: diag}
        #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
    )

    listof_token_idx = train_predict.argmax(axis=-1)
    self._utter = theano.function(
        inputs=[ei, em, di0, diag],
        outputs=listof_token_idx,
        givens={encoderInputs: ei, encoderMask: em, decoderInputs0: di0, diagonal: diag})
anneal_rate = 0.00003

tparams_net = OrderedDict()
updates_bn = []
for key, val in tparams.iteritems():
    if ('rmu' in key) or ('rvu' in key):
        continue
    elif 'rm' in key or 'rv' in key:
        updates_bn.append((tparams[key], tparams[key + 'u']))
    else:
        tparams_net[key] = val

print "Setting up optimizer"
f_grad_shared, f_update = adam(lr, tparams_net, grads, inps,
                               [cost, xtranorm], ups=updates_bn)

print "Training"
cost_report = open('./Results/' + args.latent_type + '/' + args.estimator +
                   '/training_' + code_name + '_' + str(args.batch_size) + '_' +
                   str(args.learning_rate) + '.txt', 'w')
id_order = range(len(trc))

iters = 0
cur_temp = temperature_init
min_cost = 100000.0
epoch = 0
condition = False
def exp_raw(dtype):
    shp = (None, 3, 256, 256)
    input_var = T.tensor4('input_var', dtype='float32')
    psp = T.dmatrix("psp")

    network = OrderedDict()
    network['input'] = lasagne.layers.InputLayer(shape=shp, input_var=input_var)
    # network = make_vgg16(network, 'model/vgg16_weights_from_caffe.h5')

    # First conv and segmentation part
    network['conv1_1'] = lasagne.layers.Conv2DLayer(network['input'], num_filters=64, filter_size=(3, 3),
                                                    nonlinearity=lasagne.nonlinearities.rectify,
                                                    W=lasagne.init.GlorotUniform())
    network['conv1_2'] = lasagne.layers.Conv2DLayer(network['conv1_1'], num_filters=64, filter_size=(3, 3),
                                                    nonlinearity=lasagne.nonlinearities.rectify)
    network['pool1_1'] = lasagne.layers.MaxPool2DLayer(network['conv1_2'], pool_size=(2, 2))
    network['norm1_1'] = lasagne.layers.BatchNormLayer(network['pool1_1'])
    network['conv1_3'] = lasagne.layers.Conv2DLayer(network['norm1_1'], num_filters=128, filter_size=(3, 3),
                                                    nonlinearity=lasagne.nonlinearities.rectify)
    network['conv1_4'] = lasagne.layers.Conv2DLayer(network['conv1_3'], num_filters=128, filter_size=(3, 3),
                                                    nonlinearity=lasagne.nonlinearities.rectify)
    network['pool1_2'] = lasagne.layers.MaxPool2DLayer(network['conv1_4'], pool_size=(2, 2))
    network['norm1_2'] = lasagne.layers.BatchNormLayer(network['pool1_2'])
    network['conv1_5'] = lasagne.layers.Conv2DLayer(network['norm1_2'], num_filters=256, filter_size=(3, 3),
                                                    nonlinearity=lasagne.nonlinearities.rectify)
    network['pool1_3'] = lasagne.layers.MaxPool2DLayer(network['conv1_5'], pool_size=(2, 2))
    network['conv1_6'] = lasagne.layers.Conv2DLayer(network['pool1_3'], num_filters=256, filter_size=(3, 3),
                                                    nonlinearity=lasagne.nonlinearities.rectify)
    network['pool1_4'] = lasagne.layers.MaxPool2DLayer(network['conv1_6'], pool_size=(2, 2))

    # Perspective Transform
    network['norm2'] = lasagne.layers.BatchNormLayer(network['pool1_4'])
    # network['cast'] = CastingLayer(network['norm2'], dtype)
    theano.config.floatX = dtype
    network['pfc2_1'] = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network['norm2'], p=0.05),
        num_units=1024, nonlinearity=lasagne.nonlinearities.rectify)
    network['pfc2_2'] = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network['pfc2_1'], p=0.05),
        num_units=1024, nonlinearity=lasagne.nonlinearities.rectify)
    network['pfc2_3'] = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network['pfc2_2'], p=0.05),
        num_units=1024, nonlinearity=lasagne.nonlinearities.rectify)

    # loss target 2
    network['pfc_out'] = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network['pfc2_3'], p=0.05),
        num_units=8, nonlinearity=lasagne.nonlinearities.rectify)
    theano.config.floatX = 'float32'

    predict = lasagne.layers.get_output(network['pfc_out'])
    loss = T.sqrt(lasagne.objectives.squared_error(predict, psp).mean())
    paras = lasagne.layers.get_all_params(network['pfc_out'], trainable=True)
    updates = adam(loss, paras, [theano.shared(np.float32(0.0001)) for i in range(len(paras))])
    ftrain = theano.function([input_var, psp], [loss, predict], updates=updates)

    def get_inputs(meta, batch, path):
        # batchidx = [keys[i] for i in batch]
        input = np.array([read_image(path + 'patch/' + idx + '.jpg', shape=(256, 256))
                          for idx in batch]).astype(np.float32)
        seg = np.array([read_image(path + 'pmask/' + idx + '.jpg', shape=(256, 256))
                        for idx in batch]).astype(np.float32)
        dat = [meta[key] for key in batch]
        Ps = np.array([np.array(dat[i][0]).flatten()[0:8] for i in range(len(batch))])
        for P in Ps:
            P[6:8] = (P[6:8] + 1e-3) * 1e4
        return input, Ps

    path = '/home/yancz/text_generator/data/real/'
    dat, meta = load_data(path, 10000, False)

    for epoch in range(10):
        loss = 0
        trs = 0
        for batch in iterate_minibatch(dat['train'], 32, len(dat['train'])):
            inputs = get_inputs(meta, batch, path)
            l, valp = ftrain(*inputs)
            log(l)
            print(valp)
            loss += l
            trs += 1
        loss /= trs
        log('loss ' + str(epoch) + ' ' + str(l))

    return ftrain
def __init__(self, We_initial, char_embedd_table_initial, params):

    We = theano.shared(We_initial)
    We_inf = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    input_init = np.random.uniform(-0.1, 0.1, (10, MAX_lENGTH, 17)).astype('float32')
    self.input_init = theano.shared(input_init)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    t_t = T.fscalar()

    Wyy0 = np.random.uniform(-0.02, 0.02, (18, 18)).astype('float32')
    Wyy = theano.shared(Wyy0)

    char_input_var = T.itensor3()
    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize,
            W=We, name='word_embedding')
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    layer_char_input = lasagne.layers.InputLayer(
        shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(
        layer_char, input_size=char_dic_size, output_size=char_embedd_dim,
        W=char_embedd_table, name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    # first get some necessary dimensions or parameters
    conv_window = 3
    num_filters = params.num_filters

    # construct convolution layer
    cnn_layer = lasagne.layers.Conv1DLayer(
        layer_char, num_filters=num_filters, filter_size=conv_window, pad='full',
        nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # reshape the layer to match the lstm incoming layer
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

    # finally, concatenate the two incoming layers together.
    incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)

    l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, backwards=True)

    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(
        l_reshape_concat, num_units=17, nonlinearity=lasagne.nonlinearities.linear)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    #print len(network_params)

    f = open('NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(
        l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input: char_input_var})
    local_energy = local_energy.reshape((-1, length, 17))
    local_energy = local_energy * mask_var[:, :, None]

    #####################
    # for the end symbol of a sequence
    #####################
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    predy_init = self.input_init[:, :length, :]
    a_params = [self.input_init]

    predy = T.nnet.softmax(predy_init.reshape((-1, 17)))
    predy = predy.reshape((-1, length, 17))
    prediction = T.argmax(predy_init, axis=2)
    predy = predy * mask_var[:, :, None]

    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(
        fn=inner_function, outputs_info=initials,
        sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    predy_f = predy.reshape((-1, 17))
    y_f = target_var.flatten()

    cost = T.mean(-cost11)

    from adam import adam
    updates_a = adam(cost, a_params, params.eta)
    #updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
    #updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

    self.inf_fn = theano.function([input_var, char_input_var, mask_var, mask_var1, length],
                                  cost, updates=updates_a)

    #corr = T.eq(prediction, target_var)
    #corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    #num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function([input_var, char_input_var, mask_var, mask_var1, length],
                                   prediction, on_unused_input='ignore')

    if params.WarmStart:
        hidden_inf = params.hidden_inf
        char_embedd_table_inf = theano.shared(char_embedd_table_initial)

        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word_a = lasagne.layers.EmbeddingLayer(
            l_in_word_a, input_size=We_initial.shape[0], output_size=embsize,
            W=We_inf, name='inf_word_embedding')

        layer_char_input_a = lasagne.layers.InputLayer(
            shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input')
        layer_char_a = lasagne.layers.reshape(layer_char_input_a, (-1, [2]))
        layer_char_embedding_a = lasagne.layers.EmbeddingLayer(
            layer_char_a, input_size=char_dic_size, output_size=char_embedd_dim,
            W=char_embedd_table_inf, name='char_embedding')
        layer_char_a = lasagne.layers.DimshuffleLayer(layer_char_embedding_a, pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters
        #_, sent_length, _ = incoming2.output_shape

        # construct convolution layer
        cnn_layer_a = lasagne.layers.Conv1DLayer(
            layer_char_a, num_filters=num_filters, filter_size=conv_window, pad='full',
            nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
        # infer the pool size for pooling (pool size should go through all time steps of cnn)
        #_, _, pool_size = cnn_layer.output_shape
        # construct max pool layer
        pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a, pool_size=pool_size)
        # reshape the layer to match the lstm incoming layer
        # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer_a = lasagne.layers.reshape(pool_layer_a, (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        l_emb_word_a = lasagne.layers.concat([output_cnn_layer_a, l_emb_word_a], axis=2)

        l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a)
        l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a,
                                                  backwards=True)

        l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden_inf))
        l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden_inf))
        concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
        l_local_a = lasagne.layers.DenseLayer(
            concat2_a, num_units=17, nonlinearity=lasagne.nonlinearities.softmax)

        predy_inf = lasagne.layers.get_output(
            l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var, layer_char_input_a: char_input_var})
        predy_inf = predy_inf.reshape((-1, length, 17))

        a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)

        f = open('CRF_Inf_NER_.num_filters_50_dropout_1_LearningRate_0.001_1.0_emb_1_inf_0_hidden_200_annealing_0.pickle', 'r')
        data = pickle.load(f)
        f.close()
        for idx, p in enumerate(a_params):
            p.set_value(data[idx])

        self.start_fn = theano.function([input_var, char_input_var, mask_var, length],
                                        predy_inf, on_unused_input='ignore')