def train_and_test(args, print_config): assert args.conv_layer_n == len(args.filter_widths) == len( args.nkerns) == (len(args.L2_regs) - 2) == len(args.fold_flags) == len( args.ks) # \mod{dim, 2^{\sum fold_flags}} == 0 assert args.embed_dm % (2**sum(args.fold_flags)) == 0 ################### # get the data # ################### datasets = load_data(args.corpus_path) train_set_x, train_set_y = datasets[0] dev_set_x, dev_set_y = datasets[1] test_set_x, test_set_y = datasets[2] word2index = datasets[3] index2word = datasets[4] pretrained_embeddings = datasets[5] n_train_batches = train_set_x.get_value( borrow=True).shape[0] / args.batch_size n_dev_batches = dev_set_x.get_value( borrow=True).shape[0] / args.dev_test_batch_size n_test_batches = test_set_x.get_value( borrow=True).shape[0] / args.dev_test_batch_size train_sent_len = train_set_x.get_value(borrow=True).shape[1] possible_labels = set(train_set_y.get_value().tolist()) if args.use_pretrained_embedding: args.embed_dm = pretrained_embeddings.get_value().shape[1] ################################### # Symbolic variable definition # ################################### x = T.imatrix('x') # the word indices matrix y = T.ivector('y') # the sentiment labels batch_index = T.iscalar('batch_index') rng = np.random.RandomState(1234) ############################### # Construction of the network # ############################### # Layer 1, the embedding layer layer1 = WordEmbeddingLayer( rng, input=x, vocab_size=len(word2index), embed_dm=args.embed_dm, embeddings=(pretrained_embeddings if args.use_pretrained_embedding else None)) dropout_layers = [layer1] layers = [layer1] for i in range(args.conv_layer_n): fold_flag = args.fold_flags[i] # for the dropout layer dpl = DropoutLayer(input=dropout_layers[-1].output, rng=rng, dropout_rate=args.dropout_rates[0]) next_layer_dropout_input = dpl.output next_layer_input = layers[-1].output # for the conv layer filter_shape = (args.nkerns[i], (1 if i == 0 else args.nkerns[i - 1]), 1, 
args.filter_widths[i]) k = args.ks[i] print "For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" % ( args.conv_activation_unit, i + 2, filter_shape, k, args.dropout_rates[i], args.norm_w, fold_flag) # we have two layers adding to two paths repsectively, # one for training # the other for prediction(averaged model) dropout_conv_layer = ConvFoldingPoolLayer( rng, input=next_layer_dropout_input, filter_shape=filter_shape, k=k, norm_w=args.norm_w, fold=fold_flag, activation=args.conv_activation_unit) # for prediction # sharing weight with dropout layer conv_layer = ConvFoldingPoolLayer( rng, input=next_layer_input, filter_shape=filter_shape, k=k, activation=args.conv_activation_unit, fold=fold_flag, W=dropout_conv_layer.W * (1 - args.dropout_rates[i]), # model averaging b=dropout_conv_layer.b) dropout_layers.append(dropout_conv_layer) layers.append(conv_layer) # last, the output layer # both dropout and without dropout if sum(args.fold_flags) > 0: n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm / (2**sum( args.fold_flags)) else: n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm print "For output layer, n_in = %d, dropout_rate = %f" % ( n_in, args.dropout_rates[-1]) dropout_output_layer = LogisticRegression( rng, input=dropout_layers[-1].output.flatten(2), n_in=n_in, # divided by 2x(how many times are folded) n_out=len(possible_labels) # five sentiment level ) output_layer = LogisticRegression( rng, input=layers[-1].output.flatten(2), n_in=n_in, n_out=len(possible_labels), W=dropout_output_layer.W * (1 - args.dropout_rates[-1]), # sharing the parameters, don't forget b=dropout_output_layer.b) dropout_layers.append(dropout_output_layer) layers.append(output_layer) ############################### # Error and cost # ############################### # cost and error come from different model! 
dropout_cost = dropout_output_layer.nnl(y) errors = output_layer.errors(y) def prepare_L2_sqr(param_layers, L2_regs): assert len(L2_regs) == len(param_layers) return T.sum([ L2_reg / 2 * ((layer.W if hasattr(layer, "W") else layer.embeddings)**2).sum() for L2_reg, layer in zip(L2_regs, param_layers) ]) L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs) L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:]) if args.use_L2_reg: cost = dropout_cost + L2_sqr cost_no_ebd = dropout_cost + L2_sqr_no_ebd else: cost = dropout_cost cost_no_ebd = dropout_cost ############################### # Parameters to be used # ############################### print "Delay embedding learning by %d epochs" % ( args.embedding_learning_delay_epochs) print "param_layers: %r" % dropout_layers param_layers = dropout_layers ############################## # Parameter Update # ############################## print "Using AdaDelta with rho = %f and epsilon = %f" % (args.rho, args.epsilon) params = [param for layer in param_layers for param in layer.params] param_shapes = [ param for layer in param_layers for param in layer.param_shapes ] param_grads = [T.grad(cost, param) for param in params] # AdaDelta parameter update # E[g^2] # initialized to zero egs = [ theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX), borrow=True, name="Eg:" + param.name) for param_shape, param in zip(param_shapes, params) ] # E[\delta x^2], initialized to zero exs = [ theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX), borrow=True, name="Ex:" + param.name) for param_shape, param in zip(param_shapes, params) ] new_egs = [ args.rho * eg + (1 - args.rho) * g**2 for eg, g in zip(egs, param_grads) ] delta_x = [ -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g for new_eg, ex, g in zip(new_egs, exs, param_grads) ] new_exs = [ args.rho * ex + (1 - args.rho) * (dx**2) for ex, dx in zip(exs, delta_x) ] egs_updates = zip(egs, new_egs) exs_updates = zip(exs, 
new_exs) param_updates = [(p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)] updates = egs_updates + exs_updates + param_updates # updates WITHOUT embedding # exclude the embedding parameter egs_updates_no_ebd = zip(egs[1:], new_egs[1:]) exs_updates_no_ebd = zip(exs[1:], new_exs[1:]) param_updates_no_ebd = [ (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)[1:] ] updates_no_emb = egs_updates_no_ebd + exs_updates_no_ebd + param_updates_no_ebd def make_train_func(cost, updates): return theano.function( inputs=[batch_index], outputs=[cost], updates=updates, givens={ x: train_set_x[batch_index * args.batch_size:(batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size:(batch_index + 1) * args.batch_size] }) train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb) train_model = make_train_func(cost, updates) def make_error_func(x_val, y_val): return theano.function( inputs=[], outputs=errors, givens={ x: x_val, y: y_val }, ) dev_error = make_error_func(dev_set_x, dev_set_y) test_error = make_error_func(test_set_x, test_set_y) ############################# # Debugging purpose code # ############################# # : PARAMETER TUNING NOTE: # some demonstration of the gradient vanishing probelm train_data_at_index = { x: train_set_x[batch_index * args.batch_size:(batch_index + 1) * args.batch_size], } train_data_at_index_with_y = { x: train_set_x[batch_index * args.batch_size:(batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size:(batch_index + 1) * args.batch_size] } if print_config["nnl"]: get_nnl = theano.function( inputs=[batch_index], outputs=dropout_cost, givens={ x: train_set_x[batch_index * args.batch_size:(batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size:(batch_index + 1) * args.batch_size] }) if print_config["L2_sqr"]: get_L2_sqr = theano.function(inputs=[], outputs=L2_sqr) get_L2_sqr_no_ebd = theano.function(inputs=[], outputs=L2_sqr_no_ebd) if 
print_config["grad_abs_mean"]: print_grads = theano.function( inputs=[], outputs=[ theano.printing.Print(param.name)(T.mean(T.abs_(param_grad))) for param, param_grad in zip(params, param_grads) ], givens={ x: train_set_x, y: train_set_y }) activations = [l.output for l in dropout_layers[1:-1]] weight_grads = [T.grad(cost, l.W) for l in dropout_layers[1:-1]] if print_config["activation_hist"]: # turn into 1D array get_activations = theano.function( inputs=[batch_index], outputs=[val.flatten(1) for val in activations], givens=train_data_at_index) if print_config["weight_grad_hist"]: # turn into 1D array get_weight_grads = theano.function( inputs=[batch_index], outputs=[val.flatten(1) for val in weight_grads], givens=train_data_at_index_with_y) if print_config["activation_tracking"]: # get the mean and variance of activations for each conv layer get_activation_mean = theano.function( inputs=[batch_index], outputs=[T.mean(val) for val in activations], givens=train_data_at_index) get_activation_std = theano.function( inputs=[batch_index], outputs=[T.std(val) for val in activations], givens=train_data_at_index) if print_config["weight_grad_tracking"]: # get the mean and variance of activations for each conv layer get_weight_grad_mean = theano.function( inputs=[batch_index], outputs=[T.mean(g) for g in weight_grads], givens=train_data_at_index_with_y) get_weight_grad_std = theano.function( inputs=[batch_index], outputs=[T.std(g) for g in weight_grads], givens=train_data_at_index_with_y) #the training loop patience = args.patience # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) best_validation_loss = np.inf best_iter = 0 start_time = time.clock() done_looping = False epoch = 0 nnls = [] L2_sqrs = [] activation_means = [[] for i in range(args.conv_layer_n)] 
activation_stds = [[] for i in range(args.conv_layer_n)] weight_grad_means = [[] for i in range(args.conv_layer_n)] weight_grad_stds = [[] for i in range(args.conv_layer_n)] activation_hist_data = [[] for i in range(args.conv_layer_n)] weight_grad_hist_data = [[] for i in range(args.conv_layer_n)] train_errors = [] dev_errors = [] try: print "validation_frequency = %d" % validation_frequency while (epoch < args.n_epochs): epoch += 1 print "At epoch {0}".format(epoch) if epoch == (args.embedding_learning_delay_epochs + 1): print "########################" print "Start training embedding" print "########################" # shuffle the training data train_set_x_data = train_set_x.get_value(borrow=True) train_set_y_data = train_set_y.get_value(borrow=True) permutation = np.random.permutation( train_set_x.get_value(borrow=True).shape[0]) train_set_x.set_value(train_set_x_data[permutation]) train_set_y.set_value(train_set_y_data[permutation]) for minibatch_index in range(n_train_batches): if epoch >= (args.embedding_learning_delay_epochs + 1): train_cost = train_model(minibatch_index) else: train_cost = train_model_no_ebd(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # train_error_val = np.mean([train_error(i) # for i in range(n_train_batches)]) dev_error_val = dev_error() # print "At epoch %d and minibatch %d. \nTrain error %.2f%%\nDev error %.2f%%\n" %( # epoch, # minibatch_index, # train_error_val * 100, # dev_error_val * 100 # ) print "At epoch %d and minibatch %d. 
\nDev error %.2f%%\n" % ( epoch, minibatch_index, dev_error_val * 100) # train_errors.append(train_error_val) dev_errors.append(dev_error_val) if dev_error_val < best_validation_loss: best_iter = iter #improve patience if loss improvement is good enough if dev_error_val < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = dev_error_val test_error_val = test_error() print((' epoch %i, minibatch %i/%i, test error of' ' best dev error %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_error_val * 100.)) print "Dumping model to %s" % (args.model_path) dump_params(params, args.model_path) if (minibatch_index + 1) % 50 == 0 or minibatch_index == n_train_batches - 1: print "%d / %d minibatches completed" % ( minibatch_index + 1, n_train_batches) if print_config["nnl"]: print "`nnl` for the past 50 minibatches is %f" % ( np.mean(np.array(nnls))) nnls = [] if print_config["L2_sqr"]: print "`L2_sqr`` for the past 50 minibatches is %f" % ( np.mean(np.array(L2_sqrs))) L2_sqrs = [] ################## # Plotting stuff # ################## if print_config["nnl"]: nnl = get_nnl(minibatch_index) # print "nll for batch %d: %f" %(minibatch_index, nnl) nnls.append(nnl) if print_config["L2_sqr"]: if epoch >= (args.embedding_learning_delay_epochs + 1): L2_sqrs.append(get_L2_sqr()) else: L2_sqrs.append(get_L2_sqr_no_ebd()) if print_config["activation_tracking"]: layer_means = get_activation_mean(minibatch_index) layer_stds = get_activation_std(minibatch_index) for layer_ms, layer_ss, layer_m, layer_s in zip( activation_means, activation_stds, layer_means, layer_stds): layer_ms.append(layer_m) layer_ss.append(layer_s) if print_config["weight_grad_tracking"]: layer_means = get_weight_grad_mean(minibatch_index) layer_stds = get_weight_grad_std(minibatch_index) for layer_ms, layer_ss, layer_m, layer_s in zip( weight_grad_means, weight_grad_stds, layer_means, layer_stds): layer_ms.append(layer_m) 
layer_ss.append(layer_s) if print_config["activation_hist"]: for layer_hist, layer_data in zip( activation_hist_data, get_activations(minibatch_index)): layer_hist += layer_data.tolist() if print_config["weight_grad_hist"]: for layer_hist, layer_data in zip( weight_grad_hist_data, get_weight_grads(minibatch_index)): layer_hist += layer_data.tolist() except: import traceback traceback.print_exc(file=sys.stdout) finally: from plot_util import (plot_hist, plot_track, plot_error_vs_epoch, plt) if print_config["activation_tracking"]: plot_track(activation_means, activation_stds, "activation_tracking") if print_config["weight_grad_tracking"]: plot_track(weight_grad_means, weight_grad_stds, "weight_grad_tracking") if print_config["activation_hist"]: plot_hist(activation_hist_data, "activation_hist") if print_config["weight_grad_hist"]: plot_hist(weight_grad_hist_data, "weight_grad_hist") if print_config["error_vs_epoch"]: train_errors = [0] * len(dev_errors) ax = plot_error_vs_epoch( train_errors, dev_errors, title=('Best dev score: %f %% ' ' at iter %i with test error %f %%') % (best_validation_loss * 100., best_iter + 1, test_error_val * 100.)) if not args.task_signature: plt.show() else: plt.savefig("plots/" + args.task_signature + ".png") end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_error_val * 100.)) # save the result with open(args.output, "a") as f: f.write("%s\t%f\t%f\n" % (args.task_signature, best_validation_loss, test_error_val)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
n_out = 5 W_logreg = np.asarray(np.random.rand(n_in, n_out), dtype = theano.config.floatX) b_logreg = np.asarray(np.random.rand(n_out), dtype = theano.config.floatX) layer3 = LogisticRegression(rng = rng, input = layer2.output.flatten(2), n_in = n_in, n_out = n_out, W = theano.shared(value = W_logreg, name = "W_logreg"), b = theano.shared(value = b_logreg, name = "b_logreg") ) f1 = theano.function(inputs = [x_symbol, y_symbol], outputs = layer3.nnl(y_symbol) ) f2 = theano.function(inputs = [x_symbol, y_symbol], outputs = layer3.errors(y_symbol) ) f3 = theano.function(inputs = [x_symbol], outputs = layer3.p_y_given_x ) f_el = theano.function(inputs = [x_symbol], outputs = layer1.output ) f_cl = theano.function(inputs = [x_symbol],
print out.shape expected = (1, feat_map_n, EMB_DIM / 2, k) assert out.shape == expected, "%r != %r" % (out.shape, expected) ##### Test Part Three ############### # LogisticRegressionLayer ################################# print "############# LogisticRegressionLayer ##############" l3 = LogisticRegression( rng, input=l2.output.flatten(2), n_in=feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2 n_out=5, # five sentiment level ) print "n_in = %d" % (2 * 2 * math.ceil(EMB_DIM / 2.0)) y = T.ivector("y") # the sentence sentiment label p_y_given_x = theano.function(inputs=[x], outputs=l3.p_y_given_x, mode="DebugMode") print "p_y_given_x = " print p_y_given_x(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32)) cost = theano.function(inputs=[x, y], outputs=l3.nnl(y), mode="DebugMode") print "cost:\n", cost(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32), np.array([1, 2], dtype=np.int32))
class RNTN(object):
    """
    Recursive Neural Tensor Network architecture
    """

    def __init__(self, x, y, vocab_size, embed_dim, label_n):
        """
        x: theano.tensor.imatrix, (minibatch size, 3)
           the tree matrix of the minibatch
           for each row, (node id, left child id, right child id)

        y: theano.tensor.ivector, (minibatch size,)
           the labels

        vocab_size: int
           vocabulary size, including both the words and phrases

        embed_dim: int
           the embedding dimension

        label_n: int
           passed as `n_out` to the logistic regression layer

        After construction the instance exposes:
        - `update_embedding(x)`: compiled function that writes the embeddings
          computed for the nodes of `x` back into `self.embedding`
        - `train(x, y)`: compiled function that applies the updates built by
          `build_adadelta_updates` for the cost `logreg_layer.nnl(y)`
        - `params`: logistic regression params + RNTN layer params +
          the embedding matrix
        """
        assert x.ndim == 2
        assert y.ndim == 1

        parent_ids = x[:, 0]
        children_ids = x[:, 1:]

        rng = np.random.RandomState(1234)

        # Stored as floatX so that the embedding has the dtype Theano is
        # configured for instead of always being float64.
        self.embedding = theano.shared(
            value=np.asarray(rng.normal(0, 0.05, (vocab_size, embed_dim)),
                             dtype=theano.config.floatX),
            name='embedding',
            borrow=True,
        )

        self.rntn_layer = RNTNLayer(rng, embed_dim)

        # Update the embedding by
        # forwarding the embedding from bottom to up
        # and getting the vector for each node in each tree
        def update_embedding(child_indices, my_index, embedding):
            assert child_indices.ndim == 1
            assert my_index.ndim == 0

            # NOTE: not using all() because it's non-differentiable
            has_no_child = T.eq(child_indices[0], -1)

            # the embedding of this node as computed by the RNTN layer
            composed = T.set_subtensor(
                embedding[my_index],
                self.rntn_layer.output(embedding[child_indices[0]],
                                       embedding[child_indices[1]]))

            # if no child, return the word embedding unchanged,
            # otherwise the matrix with this node's row filled in
            return T.switch(has_no_child, embedding, composed)

        # The update dictionary returned by scan gets its own name so that it
        # is not confused with the AdaDelta updates built further down.
        final_embedding, _scan_updates = theano.scan(
            fn=update_embedding,
            sequences=[children_ids, parent_ids],
            # we should pass the whole matrix
            # and fill in the positions if necessary
            outputs_info=self.embedding,
        )

        self.update_embedding = theano.function(
            inputs=[x],
            updates=[(self.embedding,
                      T.set_subtensor(self.embedding[parent_ids],
                                      final_embedding[-1][parent_ids]))])

        # the logistic regression layer that predicts the label
        self.logreg_layer = LogisticRegression(
            rng,
            input=final_embedding[-1][parent_ids],
            n_in=embed_dim,
            n_out=label_n)

        cost = self.logreg_layer.nnl(y)

        params = (self.logreg_layer.params + self.rntn_layer.params +
                  [self.embedding])
        self.params = params

        param_shapes = (self.logreg_layer.param_shapes +
                        self.rntn_layer.param_shapes +
                        [(vocab_size, embed_dim)])

        grads = [T.grad(cost=cost, wrt=p) for p in params]

        updates = build_adadelta_updates(params,
                                         param_shapes,
                                         grads,
                                         epsilon=0.1)

        # TODO: in this step, forward propagation is done again besides the
        # one in `update_embedding`
        # this extra computation should be avoided
        self.train = theano.function(inputs=[x, y], updates=updates)
def train_and_test(args, print_config): assert args.conv_layer_n == len(args.filter_widths) == len(args.nkerns) == (len(args.L2_regs) - 2) == len(args.fold_flags) == len(args.ks) # \mod{dim, 2^{\sum fold_flags}} == 0 assert args.embed_dm % (2 ** sum(args.fold_flags)) == 0 ################### # get the data # ################### datasets = load_data(args.corpus_path) train_set_x, train_set_y = datasets[0] dev_set_x, dev_set_y = datasets[1] test_set_x, test_set_y = datasets[2] word2index = datasets[3] index2word = datasets[4] pretrained_embeddings = datasets[5] n_train_batches = train_set_x.get_value(borrow=True).shape[0] / args.batch_size n_dev_batches = dev_set_x.get_value(borrow=True).shape[0] / args.dev_test_batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / args.dev_test_batch_size train_sent_len = train_set_x.get_value(borrow=True).shape[1] possible_labels = set(train_set_y.get_value().tolist()) if args.use_pretrained_embedding: args.embed_dm = pretrained_embeddings.get_value().shape[1] ################################### # Symbolic variable definition # ################################### x = T.imatrix('x') # the word indices matrix y = T.ivector('y') # the sentiment labels batch_index = T.iscalar('batch_index') rng = np.random.RandomState(1234) ############################### # Construction of the network # ############################### # Layer 1, the embedding layer layer1 = WordEmbeddingLayer(rng, input = x, vocab_size = len(word2index), embed_dm = args.embed_dm, embeddings = ( pretrained_embeddings if args.use_pretrained_embedding else None ) ) dropout_layers = [layer1] layers = [layer1] for i in xrange(args.conv_layer_n): fold_flag = args.fold_flags[i] # for the dropout layer dpl = DropoutLayer( input = dropout_layers[-1].output, rng = rng, dropout_rate = args.dropout_rates[0] ) next_layer_dropout_input = dpl.output next_layer_input = layers[-1].output # for the conv layer filter_shape = ( args.nkerns[i], (1 if i == 0 else 
args.nkerns[i-1]), 1, args.filter_widths[i] ) k = args.ks[i] print "For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" %( args.conv_activation_unit, i+2, filter_shape, k, args.dropout_rates[i], args.norm_w, fold_flag ) # we have two layers adding to two paths repsectively, # one for training # the other for prediction(averaged model) dropout_conv_layer = ConvFoldingPoolLayer(rng, input = next_layer_dropout_input, filter_shape = filter_shape, k = k, norm_w = args.norm_w, fold = fold_flag, activation = args.conv_activation_unit) # for prediction # sharing weight with dropout layer conv_layer = ConvFoldingPoolLayer(rng, input = next_layer_input, filter_shape = filter_shape, k = k, activation = args.conv_activation_unit, fold = fold_flag, W = dropout_conv_layer.W * (1 - args.dropout_rates[i]), # model averaging b = dropout_conv_layer.b ) dropout_layers.append(dropout_conv_layer) layers.append(conv_layer) # last, the output layer # both dropout and without dropout if sum(args.fold_flags) > 0: n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm / (2**sum(args.fold_flags)) else: n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm print "For output layer, n_in = %d, dropout_rate = %f" %(n_in, args.dropout_rates[-1]) dropout_output_layer = LogisticRegression( rng, input = dropout_layers[-1].output.flatten(2), n_in = n_in, # divided by 2x(how many times are folded) n_out = len(possible_labels) # five sentiment level ) output_layer = LogisticRegression( rng, input = layers[-1].output.flatten(2), n_in = n_in, n_out = len(possible_labels), W = dropout_output_layer.W * (1 - args.dropout_rates[-1]), # sharing the parameters, don't forget b = dropout_output_layer.b ) dropout_layers.append(dropout_output_layer) layers.append(output_layer) ############################### # Error and cost # ############################### # cost and error come from different model! 
dropout_cost = dropout_output_layer.nnl(y) errors = output_layer.errors(y) def prepare_L2_sqr(param_layers, L2_regs): assert len(L2_regs) == len(param_layers) return T.sum([ L2_reg / 2 * ((layer.W if hasattr(layer, "W") else layer.embeddings) ** 2).sum() for L2_reg, layer in zip(L2_regs, param_layers) ]) L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs) L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:]) if args.use_L2_reg: cost = dropout_cost + L2_sqr cost_no_ebd = dropout_cost + L2_sqr_no_ebd else: cost = dropout_cost cost_no_ebd = dropout_cost ############################### # Parameters to be used # ############################### print "Delay embedding learning by %d epochs" %(args.embedding_learning_delay_epochs) print "param_layers: %r" %dropout_layers param_layers = dropout_layers ############################## # Parameter Update # ############################## print "Using AdaDelta with rho = %f and epsilon = %f" %(args.rho, args.epsilon) params = [param for layer in param_layers for param in layer.params] param_shapes= [param for layer in param_layers for param in layer.param_shapes] param_grads = [T.grad(cost, param) for param in params] # AdaDelta parameter update # E[g^2] # initialized to zero egs = [ theano.shared( value = np.zeros(param_shape, dtype = theano.config.floatX ), borrow = True, name = "Eg:" + param.name ) for param_shape, param in zip(param_shapes, params) ] # E[\delta x^2], initialized to zero exs = [ theano.shared( value = np.zeros(param_shape, dtype = theano.config.floatX ), borrow = True, name = "Ex:" + param.name ) for param_shape, param in zip(param_shapes, params) ] new_egs = [ args.rho * eg + (1 - args.rho) * g ** 2 for eg, g in zip(egs, param_grads) ] delta_x = [ -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g for new_eg, ex, g in zip(new_egs, exs, param_grads) ] new_exs = [ args.rho * ex + (1 - args.rho) * (dx ** 2) for ex, dx in zip(exs, delta_x) ] egs_updates = zip(egs, new_egs) 
exs_updates = zip(exs, new_exs) param_updates = [ (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params) ] updates = egs_updates + exs_updates + param_updates # updates WITHOUT embedding # exclude the embedding parameter egs_updates_no_ebd = zip(egs[1:], new_egs[1:]) exs_updates_no_ebd = zip(exs[1:], new_exs[1:]) param_updates_no_ebd = [ (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)[1:] ] updates_no_emb = egs_updates_no_ebd + exs_updates_no_ebd + param_updates_no_ebd def make_train_func(cost, updates): return theano.function(inputs = [batch_index], outputs = [cost], updates = updates, givens = { x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size] } ) train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb) train_model = make_train_func(cost, updates) def make_error_func(x_val, y_val): return theano.function(inputs = [], outputs = errors, givens = { x: x_val, y: y_val }, ) dev_error = make_error_func(dev_set_x, dev_set_y) test_error = make_error_func(test_set_x, test_set_y) ############################# # Debugging purpose code # ############################# # : PARAMETER TUNING NOTE: # some demonstration of the gradient vanishing probelm train_data_at_index = { x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size], } train_data_at_index_with_y = { x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size] } if print_config["nnl"]: get_nnl = theano.function( inputs = [batch_index], outputs = dropout_cost, givens = { x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size], y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size] } ) if print_config["L2_sqr"]: get_L2_sqr = theano.function( inputs = [], outputs = L2_sqr ) 
get_L2_sqr_no_ebd = theano.function( inputs = [], outputs = L2_sqr_no_ebd ) if print_config["grad_abs_mean"]: print_grads = theano.function( inputs = [], outputs = [theano.printing.Print(param.name)( T.mean(T.abs_(param_grad)) ) for param, param_grad in zip(params, param_grads) ], givens = { x: train_set_x, y: train_set_y } ) activations = [ l.output for l in dropout_layers[1:-1] ] weight_grads = [ T.grad(cost, l.W) for l in dropout_layers[1:-1] ] if print_config["activation_hist"]: # turn into 1D array get_activations = theano.function( inputs = [batch_index], outputs = [ val.flatten(1) for val in activations ], givens = train_data_at_index ) if print_config["weight_grad_hist"]: # turn into 1D array get_weight_grads = theano.function( inputs = [batch_index], outputs = [ val.flatten(1) for val in weight_grads ], givens = train_data_at_index_with_y ) if print_config["activation_tracking"]: # get the mean and variance of activations for each conv layer get_activation_mean = theano.function( inputs = [batch_index], outputs = [ T.mean(val) for val in activations ], givens = train_data_at_index ) get_activation_std = theano.function( inputs = [batch_index], outputs = [ T.std(val) for val in activations ], givens = train_data_at_index ) if print_config["weight_grad_tracking"]: # get the mean and variance of activations for each conv layer get_weight_grad_mean = theano.function( inputs = [batch_index], outputs = [ T.mean(g) for g in weight_grads ], givens = train_data_at_index_with_y ) get_weight_grad_std = theano.function( inputs = [batch_index], outputs = [ T.std(g) for g in weight_grads ], givens = train_data_at_index_with_y ) #the training loop patience = args.patience # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) best_validation_loss = np.inf 
best_iter = 0 start_time = time.clock() done_looping = False epoch = 0 nnls = [] L2_sqrs = [] activation_means = [[] for i in xrange(args.conv_layer_n)] activation_stds = [[] for i in xrange(args.conv_layer_n)] weight_grad_means = [[] for i in xrange(args.conv_layer_n)] weight_grad_stds = [[] for i in xrange(args.conv_layer_n)] activation_hist_data = [[] for i in xrange(args.conv_layer_n)] weight_grad_hist_data = [[] for i in xrange(args.conv_layer_n)] train_errors = [] dev_errors = [] try: print "validation_frequency = %d" %validation_frequency while (epoch < args.n_epochs): epoch += 1 print "At epoch {0}".format(epoch) if epoch == (args.embedding_learning_delay_epochs + 1): print "########################" print "Start training embedding" print "########################" # shuffle the training data train_set_x_data = train_set_x.get_value(borrow = True) train_set_y_data = train_set_y.get_value(borrow = True) permutation = np.random.permutation(train_set_x.get_value(borrow=True).shape[0]) train_set_x.set_value(train_set_x_data[permutation]) train_set_y.set_value(train_set_y_data[permutation]) for minibatch_index in xrange(n_train_batches): if epoch >= (args.embedding_learning_delay_epochs + 1): train_cost = train_model(minibatch_index) else: train_cost = train_model_no_ebd(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # train_error_val = np.mean([train_error(i) # for i in xrange(n_train_batches)]) dev_error_val = dev_error() # print "At epoch %d and minibatch %d. \nTrain error %.2f%%\nDev error %.2f%%\n" %( # epoch, # minibatch_index, # train_error_val * 100, # dev_error_val * 100 # ) print "At epoch %d and minibatch %d. 
\nDev error %.2f%%\n" %( epoch, minibatch_index, dev_error_val * 100 ) # train_errors.append(train_error_val) dev_errors.append(dev_error_val) if dev_error_val < best_validation_loss: best_iter = iter #improve patience if loss improvement is good enough if dev_error_val < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = dev_error_val test_error_val = test_error() print( ( ' epoch %i, minibatch %i/%i, test error of' ' best dev error %f %%' ) % ( epoch, minibatch_index + 1, n_train_batches, test_error_val * 100. ) ) print "Dumping model to %s" %(args.model_path) dump_params(params, args.model_path) if (minibatch_index+1) % 50 == 0 or minibatch_index == n_train_batches - 1: print "%d / %d minibatches completed" %(minibatch_index + 1, n_train_batches) if print_config["nnl"]: print "`nnl` for the past 50 minibatches is %f" %(np.mean(np.array(nnls))) nnls = [] if print_config["L2_sqr"]: print "`L2_sqr`` for the past 50 minibatches is %f" %(np.mean(np.array(L2_sqrs))) L2_sqrs = [] ################## # Plotting stuff # ################## if print_config["nnl"]: nnl = get_nnl(minibatch_index) # print "nll for batch %d: %f" %(minibatch_index, nnl) nnls.append(nnl) if print_config["L2_sqr"]: if epoch >= (args.embedding_learning_delay_epochs + 1): L2_sqrs.append(get_L2_sqr()) else: L2_sqrs.append(get_L2_sqr_no_ebd()) if print_config["activation_tracking"]: layer_means = get_activation_mean(minibatch_index) layer_stds = get_activation_std(minibatch_index) for layer_ms, layer_ss, layer_m, layer_s in zip(activation_means, activation_stds, layer_means, layer_stds): layer_ms.append(layer_m) layer_ss.append(layer_s) if print_config["weight_grad_tracking"]: layer_means = get_weight_grad_mean(minibatch_index) layer_stds = get_weight_grad_std(minibatch_index) for layer_ms, layer_ss, layer_m, layer_s in zip(weight_grad_means, weight_grad_stds, layer_means, layer_stds): layer_ms.append(layer_m) 
layer_ss.append(layer_s) if print_config["activation_hist"]: for layer_hist, layer_data in zip(activation_hist_data , get_activations(minibatch_index)): layer_hist += layer_data.tolist() if print_config["weight_grad_hist"]: for layer_hist, layer_data in zip(weight_grad_hist_data , get_weight_grads(minibatch_index)): layer_hist += layer_data.tolist() except: import traceback traceback.print_exc(file = sys.stdout) finally: from plot_util import (plot_hist, plot_track, plot_error_vs_epoch, plt) if print_config["activation_tracking"]: plot_track(activation_means, activation_stds, "activation_tracking") if print_config["weight_grad_tracking"]: plot_track(weight_grad_means, weight_grad_stds, "weight_grad_tracking") if print_config["activation_hist"]: plot_hist(activation_hist_data, "activation_hist") if print_config["weight_grad_hist"]: plot_hist(weight_grad_hist_data, "weight_grad_hist") if print_config["error_vs_epoch"]: train_errors = [0] * len(dev_errors) ax = plot_error_vs_epoch(train_errors, dev_errors, title = ('Best dev score: %f %% ' ' at iter %i with test error %f %%') %( best_validation_loss * 100., best_iter + 1, test_error_val * 100. ) ) if not args.task_signature: plt.show() else: plt.savefig("plots/" + args.task_signature + ".png") end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_error_val * 100.)) # save the result with open(args.output, "a") as f: f.write("%s\t%f\t%f\n" %(args.task_signature, best_validation_loss, test_error_val)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
np_l = LogisticRegression(W, b) ######################### # THEANO PART ######################### x_symbol = theano.tensor.dmatrix('x') y_symbol = theano.tensor.ivector('y') th_l = TheanoLogisticRegression(rng=np.random.RandomState(1234), input=x_symbol, n_in=10, n_out=5, W=theano.shared(value=W, name="W"), b=theano.shared(value=b, name="b")) f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=th_l.nnl(y_symbol)) actual = np_l.nnl(x, y) expected = f1(x, y) assert_matrix_eq(actual, expected, "nnl") f2 = theano.function(inputs=[x_symbol, y_symbol], outputs=th_l.errors(y_symbol)) actual = np_l.errors(x, y) expected = f2(x, y) assert_matrix_eq(actual, expected, "errors")
assert out.shape == expected, "%r != %r" % (out.shape, expected) ##### Test Part Three ############### # LogisticRegressionLayer ################################# print "############# LogisticRegressionLayer ##############" l3 = LogisticRegression( rng, input=l2.output.flatten(2), n_in=feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2 n_out=5 # five sentiment level ) print "n_in = %d" % (2 * 2 * math.ceil(EMB_DIM / 2.)) y = T.ivector('y') # the sentence sentiment label p_y_given_x = theano.function(inputs=[x], outputs=l3.p_y_given_x, mode="DebugMode") print "p_y_given_x = " print p_y_given_x(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32)) cost = theano.function(inputs=[x, y], outputs=l3.nnl(y), mode="DebugMode") print "cost:\n", cost(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32), np.array([1, 2], dtype=np.int32))
W=theano.shared(value=W, name="W"), b=theano.shared(value=b, name="b")) n_in = filter_shape[0] * k * embed_dm / 2 n_out = 5 W_logreg = np.asarray(np.random.rand(n_in, n_out), dtype=theano.config.floatX) b_logreg = np.asarray(np.random.rand(n_out), dtype=theano.config.floatX) layer3 = LogisticRegression(rng=rng, input=layer2.output.flatten(2), n_in=n_in, n_out=n_out, W=theano.shared(value=W_logreg, name="W_logreg"), b=theano.shared(value=b_logreg, name="b_logreg")) f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=layer3.nnl(y_symbol)) f2 = theano.function(inputs=[x_symbol, y_symbol], outputs=layer3.errors(y_symbol)) f3 = theano.function(inputs=[x_symbol], outputs=layer3.p_y_given_x) f_el = theano.function(inputs=[x_symbol], outputs=layer1.output) f_cl = theano.function(inputs=[x_symbol], outputs=layer2.output) ######################### # NUMPY PART # #########################
class RNTN(object):
    """
    Recursive Neural Tensor Network architecture

    Constructing an instance builds the symbolic graph and compiles two
    Theano functions:

    update_embedding(x): writes the composed node vectors of the minibatch
        back into the shared embedding matrix
    train(x, y): applies one adadelta update to all the parameters
    """

    def __init__(self, x, y, vocab_size, embed_dim, label_n):
        """
        x: theano.tensor.imatrix, (minibatch size, 3)
           the tree matrix of the minibatch
           for each row, (node id, left child id, right child id);
           a left child id of -1 marks a node without children

        y: theano.tensor.ivector, (minibatch size,)
           the labels

        vocab_size: int
           vocabulary size, including both the words and phrases

        embed_dim: int
           the embedding dimension

        label_n: int
           the number of labels, used as `n_out` of the logistic
           regression layer
        """
        assert x.ndim == 2
        assert y.ndim == 1

        parent_ids = x[:, 0]
        children_ids = x[:, 1:]

        rng = np.random.RandomState(1234)

        # `rng.normal` yields float64. Cast to floatX so that the embedding
        # has the same dtype as the rest of the graph: with floatX=float32 a
        # float64 embedding upcasts every gradient that flows through it and
        # the resulting updates no longer match their float32 parameters.
        self.embedding = theano.shared(
            value=np.asarray(
                rng.normal(0, 0.05, (vocab_size, embed_dim)),
                dtype=theano.config.floatX
            ),
            name='embedding',
            borrow=True,
        )

        self.rntn_layer = RNTNLayer(rng, embed_dim)

        # Update the embedding by
        # forwarding the embedding from bottom to up
        # and getting the vector for each node in each tree
        def update_embedding(child_indices, my_index, embedding):
            assert child_indices.ndim == 1
            assert my_index.ndim == 0

            # NOTE: not using all() because it's non-differentiable
            return T.switch(
                T.eq(child_indices[0], -1),
                # if no child, return the word embedding
                embedding,
                # otherwise, compute the embedding of RNTN layer
                T.set_subtensor(
                    embedding[my_index],
                    self.rntn_layer.output(embedding[child_indices[0]],
                                           embedding[child_indices[1]])
                )
            )

        # The updates dictionary returned by scan is not needed: the step
        # function only rewrites the matrix that is carried along.
        final_embedding, _ = theano.scan(
            fn=update_embedding,
            sequences=[children_ids, parent_ids],
            # we should pass the whole matrix and fill in the positions
            # if necessary
            outputs_info=self.embedding,
        )

        # final_embedding[-1] is the matrix after the last row of `x`
        # has been processed
        self.update_embedding = theano.function(
            inputs=[x],
            updates=[(self.embedding,
                      T.set_subtensor(self.embedding[parent_ids],
                                      final_embedding[-1][parent_ids]))]
        )

        # the logistic regression layer that predicts the label
        self.logreg_layer = LogisticRegression(
            rng,
            input=final_embedding[-1][parent_ids],
            n_in=embed_dim,
            n_out=label_n
        )

        cost = self.logreg_layer.nnl(y)

        params = (self.logreg_layer.params
                  + self.rntn_layer.params
                  + [self.embedding])
        self.params = params

        # must be listed in the same order as `params`
        param_shapes = (self.logreg_layer.param_shapes
                        + self.rntn_layer.param_shapes
                        + [(vocab_size, embed_dim)])

        grads = [T.grad(cost=cost, wrt=p) for p in params]

        updates = build_adadelta_updates(params, param_shapes, grads,
                                         epsilon=0.1)

        # TODO: in this step, forward propagation is done again besides the
        # one in `update_embedding`
        # this extra computation should be avoided
        self.train = theano.function(inputs=[x, y], updates=updates)
x_symbol = theano.tensor.dmatrix('x') y_symbol = theano.tensor.ivector('y') th_l = TheanoLogisticRegression(rng = np.random.RandomState(1234), input = x_symbol, n_in = 10, n_out = 5, W = theano.shared(value = W, name = "W"), b = theano.shared(value = b, name = "b") ) f1 = theano.function(inputs = [x_symbol, y_symbol], outputs = th_l.nnl(y_symbol) ) actual = np_l.nnl(x, y) expected = f1(x, y) assert_matrix_eq(actual, expected, "nnl") f2 = theano.function(inputs = [x_symbol, y_symbol], outputs = th_l.errors(y_symbol) ) actual = np_l.errors(x, y) expected = f2(x, y)
def main(): print "############# Load Datasets ##############" import stanfordSentimentTreebank as sst skip_unknown_words = bool(args.get("--skip")) shuffle_flag = bool(args.get("--shuffle")) datatype = args.get("--datatype") if datatype == 5: # Fine-grained 5-class n_class = 5 elif datatype == 2: # Binary 2-class n_class = 2 # print "skip_unknown_words",skip_unknown_words vocab, index2word, datasets, datasets_all_sentences, funcs = sst.load_stanfordSentimentTreebank_dataset( normalize=True, skip_unknown_words=skip_unknown_words, datatype=datatype ) train_set, test_set, dev_set = datasets train_set_sentences, test_set_sentences, dev_set_sentences = datasets_all_sentences get, sentence2ids, ids2sentence = funcs # 関数を読み込み scores, sentences = zip(*train_set_sentences) sentences = [[word for word in sentence.lower().split()] for sentence in sentences] vocab_size = len(vocab) dev_unknown_count = sum([unknown_word_count for score, (ids, unknown_word_count) in dev_set]) test_unknown_count = sum([unknown_word_count for score, (ids, unknown_word_count) in test_set]) train_set = [(score, ids) for score, (ids, unknown_word_count) in train_set] test_set = [(score, ids) for score, (ids, unknown_word_count) in test_set] dev_set = [(score, ids) for score, (ids, unknown_word_count) in dev_set] print "train_size : ", len(train_set) print "dev_size : ", len(dev_set) print "test_size : ", len(test_set) print "-" * 30 print "vocab_size: ", len(vocab) print "dev_unknown_words : ", dev_unknown_count print "test_unknown_words : ", test_unknown_count print args # EMB_DIM = 50 EMB_DIM = args.get("--emb_size") vocab_size = len(vocab) feat_map_n_1 = args.get("--feat_map_n_1") feat_map_n_final = args.get("--feat_map_n_final") height = 1 width1 = args.get("--width1") width2 = args.get("--width2") k_top = args.get("--k_top") n_class = n_class alpha = args.get("--alpha") n_epoch = args.get("--n_epoch") dropout_rate0 = args.get("--dropout_rate0") dropout_rate1 = args.get("--dropout_rate1") 
dropout_rate2 = args.get("--dropout_rate2") activation = args.get("--activation") learn = args.get("--learn") number_of_convolutinal_layer = 2 pretrain = args.get("--pretrain") if pretrain == "word2vec": print "*Using word2vec" embeddings_W, model = pretrained_embedding.use_word2vec( sentences=sentences, index2word=index2word, emb_dim=EMB_DIM ) # -0.5 ~ 0.5で初期化している elif pretrain == "glove": print "*Using glove" embeddings_W = pretrained_embedding.use_glove( sentences=sentences, index2word=index2word, emb_dim=EMB_DIM, model_file="glove_model/glove_50_iter2900.model", ) else: embeddings_W = np.asarray(rng.normal(0, 0.05, size=(vocab_size, EMB_DIM)), dtype=theano.config.floatX) embeddings_W[0, :] = 0 print np.amax(embeddings_W) print np.amin(embeddings_W) # print "*embeddings" print embeddings_W # print bool(embeddings) # input_x = [1, 3, 4, 5, 0, 22, 4, 5] print "############# Model Setting ##############" x = T.imatrix("x") length_x = T.iscalar("length_x") y = T.ivector("y") # the sentence sentiment label embeddings = WordEmbeddingLayer(rng=rng, input=x, vocab_size=vocab_size, embed_dm=EMB_DIM, embeddings=embeddings_W) def dropout(X, p=0.5): if p > 0: retain_prob = 1 - p X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) # X /= retain_prob return X # number_of_convolutinal_layer = theano.shared(number_of_convolutinal_layer) # dynamic_func = theano.function(inputs=[length_x], outputs=number_of_convolutinal_layer * length_x) # dynamic_func_test = theano.function( # inputs = [length_x], # outputs = dynamic_func(length_x), # ) # print dynamic_func(len([1,2,3])) l1 = DynamicConvFoldingPoolLayer( rng, input=dropout(embeddings.output, p=dropout_rate0), filter_shape=(feat_map_n_1, 1, height, width1), # two feature map, height: 1, width: 2, k_top=k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=1, length_x=length_x, activation=activation, ) l1_no_dropout = DynamicConvFoldingPoolLayer( rng, 
input=embeddings.output, W=l1.W * (1 - dropout_rate0), b=l1.b, filter_shape=(feat_map_n_1, 1, height, width1), # two feature map, height: 1, width: 2, k_top=k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=1, length_x=length_x, activation=activation, ) l2 = DynamicConvFoldingPoolLayer( rng, input=dropout(l1.output, p=dropout_rate1), filter_shape=(feat_map_n_final, feat_map_n_1, height, width2), # two feature map, height: 1, width: 2, k_top=k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=2, length_x=length_x, activation=activation, ) l2_no_dropout = DynamicConvFoldingPoolLayer( rng, input=l1_no_dropout.output, W=l2.W * (1 - dropout_rate1), b=l2.b, filter_shape=(feat_map_n_final, feat_map_n_1, height, width2), # two feature map, height: 1, width: 2, k_top=k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=2, length_x=length_x, activation=activation, ) # l2_output = theano.function( # inputs = [x,length_x], # outputs = l2.output, # # on_unused_input='ignore' # ) # TODO: # check the dimension # input: 1 x 1 x 6 x 4 # out = l2_output( # np.array([input_x], dtype = np.int32), # len(input_x), # ) # test = theano.function( # inputs = [x], # outputs = embeddings.output, # ) # print "--input--" # print np.array([input_x], dtype = np.int32).shape # print "--input embeddings--" # a = np.array([input_x], dtype = np.int32) # print test(a).shape # print "-- output --" # print out # print out.shape # x = T.dscalar("x") # b = T.dscalar("b") # a = 1 # f = theano.function(inputs=[x,b], outputs=b * x + a) # print f(2,2) # expected = (1, feat_map_n, EMB_DIM / 2, k) # assert out.shape == expected, "%r != %r" %(out.shape, expected) ##### Test Part Three ############### # LogisticRegressionLayer ################################# # print "############# LogisticRegressionLayer ##############" l_final = LogisticRegression( rng, 
input=dropout(l2.output.flatten(2), p=dropout_rate2), n_in=feat_map_n_final * k_top * EMB_DIM, # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2 n_out=n_class, # five sentiment level ) l_final_no_dropout = LogisticRegression( rng, input=l2_no_dropout.output.flatten(2), W=l_final.W * (1 - dropout_rate2), b=l_final.b, n_in=feat_map_n_final * k_top * EMB_DIM, # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2 n_out=n_class, # five sentiment level ) print "n_in : ", feat_map_n_final * k_top * EMB_DIM # print "n_in = %d" %(2 * 2 * math.ceil(EMB_DIM / 2.)) # p_y_given_x = theano.function( # inputs = [x, length_x], # outputs = l_final.p_y_given_x, # allow_input_downcast=True, # # mode = "DebugMode" # ) # print "p_y_given_x = " # print p_y_given_x( # np.array([input_x], dtype=np.int32), # len(input_x) # ) cost = theano.function( inputs=[x, length_x, y], outputs=l_final.nnl(y), allow_input_downcast=True, # mode = "DebugMode" ) # print "cost:\n", cost( # np.array([input_x], dtype = np.int32), # len(input_x), # np.array([1], dtype = np.int32) # ) print "############# Learning ##############" layers = [] layers.append(embeddings) layers.append(l1) layers.append(l2) layers.append(l_final) cost = l_final.nnl(y) params = [p for layer in layers for p in layer.params] param_shapes = [l.param_shapes for l in layers] param_grads = [T.grad(cost, param) for param in params] def sgd(cost, params, lr=0.05): grads = [T.grad(cost, param) for param in params] updates = [] for p, g in zip(params, grads): updates.append([p, p - g * lr]) return updates from sgd import rmsprop, adagrad, adadelta, adam # updates = sgd(cost, l_final.params) # print param_grads if learn == "sgd": updates = sgd(cost, params, lr=0.05) elif learn == "adam": updates = adam(loss_or_grads=cost, params=params, learning_rate=alpha) elif learn == "adagrad": updates = adagrad(loss_or_grads=cost, params=params, learning_rate=alpha) elif learn == "adadelta": updates = 
adadelta(loss_or_grads=cost, params=params) elif learn == "rmsprop": updates = rmsprop(loss_or_grads=cost, params=params, learning_rate=alpha) train = theano.function(inputs=[x, length_x, y], outputs=cost, updates=updates, allow_input_downcast=True) # predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True) predict = theano.function( inputs=[x, length_x], outputs=T.argmax(l_final_no_dropout.p_y_given_x, axis=1), allow_input_downcast=True, # mode = "DebugMode" ) def b(x_data): return np.array(x_data, dtype=np.int32) def test(test_set): # print "############# TEST ##############" y_pred = [] test_set_y = [] # for train_x, train_y in zip(X_data, Y_data): # print test_set # Accuracy_count = 0 for test_y, test_x in test_set: test_x = b([test_x]) p = predict(test_x, len(test_x))[0] y_pred.append(p) test_set_y.append(test_y) # if test_y == p: # Accuracy_count += 1 # print "*predict :",predict(train_x, len(train_x)), train_y # Accuracy = float(Accuracy_count) / len(test_set) # print " accuracy : %f" % Accuracy, return accuracy_score(test_set_y, y_pred) # print classification_report(test_set_y, y_pred) # train_set_rand = np.ndarray(train_set) train_set_rand = train_set[:] train_cost_sum = 0.0 for epoch in xrange(n_epoch): print "== epoch : %d ==" % epoch if shuffle_flag: np.random.shuffle(train_set_rand) # train_set_rand = np.random.permutation(train_set) for i, x_y_set in enumerate(train_set_rand): train_y, train_x = x_y_set train_x = b([train_x]) train_y = b([train_y]) train_cost = train(train_x, len(train_x), train_y) train_cost_sum += train_cost if i % 1000 == 0 or i == len(train_set) - 1: print "i : (%d/%d)" % (i, len(train_set)), print " (cost : %f )" % train_cost print " cost :", train_cost_sum print " train_set : %f" % test(train_set) print " dev_set : %f" % test(dev_set) print " test_set : %f" % test(test_set) """
def main(): print "############# Load Datasets ##############" import stanfordSentimentTreebank as sst skip_unknown_words = bool(args.get("--skip")) shuffle_flag = bool(args.get("--shuffle")) datatype = args.get("--datatype") if datatype == 5: # Fine-grained 5-class n_class = 5 elif datatype == 2: # Binary 2-class n_class = 2 # print "skip_unknown_words",skip_unknown_words vocab, index2word, datasets, datasets_all_sentences, funcs = sst.load_stanfordSentimentTreebank_dataset(normalize=True, skip_unknown_words=skip_unknown_words, datatype=datatype) train_set, test_set, dev_set = datasets train_set_sentences, test_set_sentences, dev_set_sentences = datasets_all_sentences get,sentence2ids, ids2sentence = funcs # 関数を読み込み scores, sentences = zip(*train_set_sentences) sentences = [[word for word in sentence.lower().split()] for sentence in sentences] vocab_size = len(vocab) dev_unknown_count = sum([unknown_word_count for score,(ids,unknown_word_count) in dev_set]) test_unknown_count = sum([unknown_word_count for score,(ids,unknown_word_count) in test_set]) train_set = [(score, ids) for score,(ids,unknown_word_count) in train_set] test_set = [(score, ids) for score,(ids,unknown_word_count) in test_set] dev_set = [(score, ids) for score,(ids,unknown_word_count) in dev_set] print "train_size : ", len(train_set) print "dev_size : ", len(dev_set) print "test_size : ", len(test_set) print "-"*30 print "vocab_size: ", len(vocab) print "dev_unknown_words : ", dev_unknown_count print "test_unknown_words : ", test_unknown_count print args # EMB_DIM = 50 EMB_DIM = args.get("--emb_size") vocab_size = len(vocab) feat_map_n_1 = args.get("--feat_map_n_1") feat_map_n_final = args.get("--feat_map_n_final") height = 1 width1 = args.get("--width1") width2 = args.get("--width2") k_top = args.get("--k_top") n_class = n_class alpha = args.get("--alpha") n_epoch = args.get("--n_epoch") dropout_rate0 = args.get("--dropout_rate0") dropout_rate1 = args.get("--dropout_rate1") dropout_rate2 = 
args.get("--dropout_rate2") activation = args.get("--activation") learn = args.get("--learn") number_of_convolutinal_layer = 2 use_regular = bool(args.get("--use_regular")) regular_c = args.get("--regular_c") pretrain = args.get('--pretrain') if pretrain == 'word2vec': print "*Using word2vec" embeddings_W, model = pretrained_embedding.use_word2vec(sentences=sentences, index2word=index2word, emb_dim=EMB_DIM) # -0.5 ~ 0.5で初期化している elif pretrain == 'glove': print "*Using glove" embeddings_W = pretrained_embedding.use_glove(sentences=sentences, index2word=index2word, emb_dim=EMB_DIM, model_file='glove_model/glove_50_iter2900.model') else: embeddings_W = np.asarray( rng.normal(0, 0.05, size = (vocab_size, EMB_DIM)), dtype = theano.config.floatX ) embeddings_W[0,:] = 0 print np.amax(embeddings_W) print np.amin(embeddings_W) # print "*embeddings" print embeddings_W # print bool(embeddings) # input_x = [1, 3, 4, 5, 0, 22, 4, 5] print "############# Model Setting ##############" x = T.imatrix('x') length_x = T.iscalar('length_x') y = T.ivector('y') # the sentence sentiment label embeddings = WordEmbeddingLayer(rng=rng, input=x, vocab_size=vocab_size, embed_dm=EMB_DIM, embeddings=embeddings_W) def dropout(X, p=0.5): if p > 0: retain_prob = 1 - p X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) # X /= retain_prob return X # number_of_convolutinal_layer = theano.shared(number_of_convolutinal_layer) # dynamic_func = theano.function(inputs=[length_x], outputs=number_of_convolutinal_layer * length_x) # dynamic_func_test = theano.function( # inputs = [length_x], # outputs = dynamic_func(length_x), # ) # print dynamic_func(len([1,2,3])) l1 = DynamicConvFoldingPoolLayer(rng, input = dropout(embeddings.output, p=dropout_rate0), filter_shape = (feat_map_n_1, 1, height, width1), # two feature map, height: 1, width: 2, k_top = k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=1, length_x=length_x, activation = activation ) 
l1_no_dropout = DynamicConvFoldingPoolLayer(rng, input = embeddings.output, W=l1.W * (1 - dropout_rate0), b=l1.b, filter_shape = (feat_map_n_1, 1, height, width1), # two feature map, height: 1, width: 2, k_top = k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=1, length_x=length_x, activation = activation ) l2 = DynamicConvFoldingPoolLayer(rng, input = dropout(l1.output, p=dropout_rate1), filter_shape = (feat_map_n_final, feat_map_n_1, height, width2), # two feature map, height: 1, width: 2, k_top = k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=2, length_x=length_x, activation = activation ) l2_no_dropout = DynamicConvFoldingPoolLayer(rng, input = l1_no_dropout.output, W=l2.W * (1 - dropout_rate1), b=l2.b, filter_shape = (feat_map_n_final, feat_map_n_1, height, width2), # two feature map, height: 1, width: 2, k_top = k_top, number_of_convolutinal_layer=number_of_convolutinal_layer, index_of_convolitonal_layer=2, length_x=length_x, activation = activation ) # l2_output = theano.function( # inputs = [x,length_x], # outputs = l2.output, # # on_unused_input='ignore' # ) # TODO: # check the dimension # input: 1 x 1 x 6 x 4 # out = l2_output( # np.array([input_x], dtype = np.int32), # len(input_x), # ) # test = theano.function( # inputs = [x], # outputs = embeddings.output, # ) # print "--input--" # print np.array([input_x], dtype = np.int32).shape # print "--input embeddings--" # a = np.array([input_x], dtype = np.int32) # print test(a).shape # print "-- output --" # print out # print out.shape # x = T.dscalar("x") # b = T.dscalar("b") # a = 1 # f = theano.function(inputs=[x,b], outputs=b * x + a) # print f(2,2) # expected = (1, feat_map_n, EMB_DIM / 2, k) # assert out.shape == expected, "%r != %r" %(out.shape, expected) ##### Test Part Three ############### # LogisticRegressionLayer ################################# # print "############# LogisticRegressionLayer ##############" 
l_final = LogisticRegression( rng, input = dropout(l2.output.flatten(2), p=dropout_rate2), n_in = feat_map_n_final * k_top * EMB_DIM, # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2 n_out = n_class, # five sentiment level ) l_final_no_dropout = LogisticRegression( rng, input = l2_no_dropout.output.flatten(2), W = l_final.W * (1 - dropout_rate2), b = l_final.b, n_in = feat_map_n_final * k_top * EMB_DIM, # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2 n_out = n_class, # five sentiment level ) print "n_in : ", feat_map_n_final * k_top * EMB_DIM # print "n_in = %d" %(2 * 2 * math.ceil(EMB_DIM / 2.)) # p_y_given_x = theano.function( # inputs = [x, length_x], # outputs = l_final.p_y_given_x, # allow_input_downcast=True, # # mode = "DebugMode" # ) # print "p_y_given_x = " # print p_y_given_x( # np.array([input_x], dtype=np.int32), # len(input_x) # ) cost = theano.function( inputs = [x, length_x, y], outputs = l_final.nnl(y), allow_input_downcast=True, # mode = "DebugMode" ) # print "cost:\n", cost( # np.array([input_x], dtype = np.int32), # len(input_x), # np.array([1], dtype = np.int32) # ) print "############# Learning ##############" from sgd import sgd, rmsprop, adagrad, adadelta, adam from regularizer import regularize_l2 layers = [] layers.append(embeddings) layers.append(l1) layers.append(l2) layers.append(l_final) cost = l_final.nnl(y) params = [p for layer in layers for p in layer.params] param_shapes = [l.param_shapes for l in layers] param_grads = [T.grad(cost, param) for param in params] # regularizer setting regularizers = {} regularizers['c'] = regular_c # 2.0, 4.0, 15.0 regularizers['func'] = [None for _ in range(len(params))] if use_regular: regularizers_func = [] regularizers_func.append([regularize_l2(l=0.0001)]) # [embeddings] regularizers_func.append([regularize_l2(l=0.00003), None]) # [W, b] regularizers_func.append([regularize_l2(l=0.000003), None]) # [W, b] 
regularizers_func.append([regularize_l2(l=0.0001), None]) # [logreg_W, logreg_b] regularizers_func = [r_func for r in regularizers_func for r_func in r] regularizers['func'] = regularizers_func # if third conv layer: 1e-5 print embeddings.params print l1.params print l2.params print l_final.params # updates = sgd(cost, l_final.params) # RegE = 1e-4 # print param_grads if learn == "sgd": updates = sgd(cost, params, lr=0.05) elif learn == "adam": updates = adam(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers) elif learn == "adagrad": updates = adagrad(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers) elif learn == "adadelta": updates = adadelta(loss_or_grads=cost, params=params, regularizers=regularizers) elif learn == "rmsprop": updates = rmsprop(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers) train = theano.function(inputs=[x, length_x, y], outputs=cost, updates=updates, allow_input_downcast=True) # predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True) predict = theano.function( inputs = [x, length_x], outputs = T.argmax(l_final_no_dropout.p_y_given_x, axis=1), allow_input_downcast=True, # mode = "DebugMode" ) def b(x_data): return np.array(x_data, dtype=np.int32) def test(test_set): # print "############# TEST ##############" y_pred = [] test_set_y = [] # for train_x, train_y in zip(X_data, Y_data): # print test_set # Accuracy_count = 0 for test_y,test_x in test_set: test_x = b([test_x]) p = predict(test_x, len(test_x))[0] y_pred.append(p) test_set_y.append(test_y) # if test_y == p: # Accuracy_count += 1 # print "*predict :",predict(train_x, len(train_x)), train_y # Accuracy = float(Accuracy_count) / len(test_set) # print " accuracy : %f" % Accuracy, return accuracy_score(test_set_y, y_pred) # print classification_report(test_set_y, y_pred) # train_set_rand = np.ndarray(train_set) train_set_rand = train_set[:] train_cost_sum = 0.0 for epoch 
in xrange(n_epoch): print "== epoch : %d ==" % epoch if shuffle_flag: np.random.shuffle(train_set_rand) # train_set_rand = np.random.permutation(train_set) for i,x_y_set in enumerate(train_set_rand): train_y, train_x = x_y_set train_x = b([train_x]) train_y = b([train_y]) train_cost = train(train_x, len(train_x) , train_y) train_cost_sum += train_cost if i % 1000 == 0 or i == len(train_set)-1: print "i : (%d/%d)" % (i, len(train_set)) , print " (cost : %f )" % train_cost print ' cost :', train_cost_sum print ' train_set : %f' % test(train_set) print ' dev_set : %f' % test(dev_set) print ' test_set : %f' % test(test_set) '''