def __init__(self, emb_dim, vocab_size, num_slots, max_sent_len,
             optimiser=lasagne.updates.adam):
    self.emb_dim = emb_dim
    self.vocab_size = vocab_size
    self.num_slots = num_slots  # number of hidden-state slots
    self.cell = RenCell(self.emb_dim, self.num_slots)
    self.optimiser = optimiser

    # Placeholders for input
    self.Stories = T.ltensor3(name='Stories')  # Num_stories x T x K_max
    self.Queries = T.ltensor3(name='Queries')  # Num_stories x Num_queries x K_max
    self.Indices = T.lmatrix(name='Indices')   # Num_stories x Num_queries
    self.Answers = T.lmatrix(name='Answers')   # Num_stories x Num_queries

    # Data set dimensions
    self.N = T.shape(self.Stories)[0]
    self.K = max_sent_len

    # Combine cell parameters with all the other parameters and initialise
    self.params = self.cell.params
    self.params.update(self._initialise_weights())

    # Build the computation graph and get the training function
    self._create_network(self.params)
    self.train_func = self._get_train_func()
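# A hedged usage sketch for the constructor above. The enclosing class name and
# the exact train_func signature are not shown in this snippet, so they are
# assumptions here and the model-specific calls are left commented out; the
# arrays merely illustrate the int64 inputs the T.ltensor3/T.lmatrix
# placeholders expect.
import numpy as np

stories = np.zeros((32, 15, 10), dtype='int64')  # Num_stories x T x K_max
queries = np.zeros((32, 5, 10), dtype='int64')   # Num_stories x Num_queries x K_max
indices = np.zeros((32, 5), dtype='int64')       # Num_stories x Num_queries
answers = np.zeros((32, 5), dtype='int64')       # Num_stories x Num_queries
# model = EntityNetworkModel(emb_dim=100, vocab_size=20000, num_slots=20,
#                            max_sent_len=10)    # hypothetical class name
# loss = model.train_func(stories, queries, indices, answers)  # assumed signature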
def build_graph(self):
    # theano variables
    iw_b = T.lmatrix('iw_b')
    ic_b = T.ltensor3('ic_b')
    it_b = T.lmatrix('it_b')
    il_b = T.lmatrix('il_b')
    v_b = T.lmatrix('v_b')      # valid action mask
    y_b = T.lvector('y_b')      # index of the correct action from the oracle
    steps = T.lscalar('steps')  # number of steps
    lr = self.args.learn_rate * self.args.decay**T.cast(
        T.floor(steps / 2000.), 'float32')

    iw, ic, it, il, self.actor = self.get_actor(False)
    iw_avg, ic_avg, it_avg, il_avg, self.actor_avg = self.get_actor(True)

    actor_prob = L.get_output(self.actor_avg, {
        iw_avg: iw_b,
        ic_avg: ic_b,
        it_avg: it_b,
        il_avg: il_b
    }, deterministic=True)
    # mask the probabilities of invalid actions to 0
    actor_rest = actor_prob * T.cast(v_b, theano.config.floatX)
    actor_pred = T.argmax(actor_rest, 1)
    self.actor_predict = theano.function([v_b, iw_b, ic_b, it_b, il_b],
                                         actor_pred,
                                         on_unused_input='ignore')

    y_hat = L.get_output(self.actor, {
        iw: iw_b,
        ic: ic_b,
        it: it_b,
        il: il_b
    }, deterministic=False)
    xent = T.mean(lasagne.objectives.categorical_crossentropy(y_hat, y_b))
    reg = lasagne.regularization.regularize_network_params(
        L.get_all_layers(self.actor), lasagne.regularization.l2)
    cost = xent + self.args.reg_rate * reg
    correct = T.eq(T.argmax(y_hat, 1), y_b).sum()

    params = L.get_all_params(self.actor)
    avg_params = L.get_all_params(self.actor_avg)
    grads = T.grad(cost, params)
    if self.args.grad_norm:
        grads, norm = lasagne.updates.total_norm_constraint(
            grads, self.args.grad_norm, return_norm=True)
    updates = lasagne.updates.momentum(grads, params, lr, self.args.momentum)
    updates = apply_moving_average(params, avg_params, updates, steps, 0.9999)

    inputs = [steps, y_b, v_b, iw_b, ic_b, it_b, il_b]
    self.train_actor_supervised = theano.function(inputs, [correct, cost],
                                                  updates=updates,
                                                  on_unused_input='ignore')
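# The training graph above relies on a project-specific helper,
# apply_moving_average(params, avg_params, updates, steps, decay). A minimal
# sketch of what such a Polyak/exponential-moving-average update could look
# like is given below; this is an assumption about the helper, not its actual
# implementation (in particular, the real helper also receives the step
# counter, which this sketch ignores).
def apply_moving_average_sketch(params, avg_params, updates, steps, decay):
    # For every trainable parameter p and its shadow copy p_avg, keep
    # p_avg <- decay * p_avg + (1 - decay) * p_new after the gradient step.
    # `updates` is the OrderedDict returned by lasagne.updates.momentum,
    # so updates[p] is the post-update expression for p.
    for p, p_avg in zip(params, avg_params):
        p_new = updates[p]
        updates[p_avg] = decay * p_avg + (1.0 - decay) * p_new
    return updates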
def test10():
    src = T.ltensor3("src")
    tgt = T.lmatrix("tgt")
    mask = T.matrix("mask")
    prd = T.matrix("prd")
    n_hids, vocab_size = 3, 60
    hs = HierarchicalSoftmax(src, n_hids, vocab_size)
    # prd = hs.test()
    res = hs.cost(tgt, mask)

    x = [
        [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
        [[3, 3, 3], [4, 4, 4], [5, 5, 5], [6, 6, 6]]
    ]
    y = [
        [1, 1, 1, 1],
        [1, 1, 1, 1]
    ]
    m = [
        [1, 1, 0, 0],
        [1, 1, 0, 0]
    ]

    fn3 = theano.function(inputs=[src, tgt, mask],
                          outputs=[res],
                          on_unused_input='ignore')
    res = fn3(x, y, m)
    print res, res[0].shape
    x_a = np.array(x)
    print x_a.shape, x_a[y]
def __init__(self, data, config, fast_predict=False):
    self.embedding_shapes = data.embedding_shapes
    self.lstm_type = config.lstm_cell
    self.lstm_hidden_size = int(config.lstm_hidden_size)
    self.num_lstm_layers = int(config.num_lstm_layers)
    self.max_grad_norm = float(config.max_grad_norm)

    self.vocab_size = data.word_dict.size()
    self.label_space_size = data.label_dict.size()
    self.unk_id = data.unk_id

    # Initialize layers and parameters
    self.embedding_layer = EmbeddingLayer(data.embedding_shapes, data.embeddings)
    self.params = [p for p in self.embedding_layer.params]

    self.rnn_layers = [None] * self.num_lstm_layers
    for l in range(self.num_lstm_layers):
        input_dim = self.embedding_layer.output_size if l == 0 else self.lstm_hidden_size
        input_dropout = config.input_dropout_prob if (
            config.per_layer_dropout or l == 0) else 0.0
        recurrent_dropout = config.recurrent_dropout_prob
        self.rnn_layers[l] = get_rnn_layer(self.lstm_type)(
            input_dim,
            self.lstm_hidden_size,
            input_dropout_prob=input_dropout,
            recurrent_dropout_prob=recurrent_dropout,
            fast_predict=fast_predict,
            prefix='lstm_{}'.format(l))
        self.params.extend(self.rnn_layers[l].params)

    self.softmax_layer = SoftmaxLayer(self.lstm_hidden_size, self.label_space_size)
    self.params.extend(self.softmax_layer.params)

    # Build model
    # Shape of x: [seq_len, batch_size, num_features]
    self.x0 = tensor.ltensor3('x')
    self.y0 = tensor.lmatrix('y')
    self.mask0 = tensor.matrix('mask', dtype=floatX)
    self.is_train = tensor.bscalar('is_train')

    self.x = self.x0.dimshuffle(1, 0, 2)
    self.y = self.y0.dimshuffle(1, 0)
    self.mask = self.mask0.dimshuffle(1, 0)

    self.inputs = [None] * (self.num_lstm_layers + 1)
    self.inputs[0] = self.embedding_layer.connect(self.x)
    self.rev_mask = self.mask[::-1]

    for l, rnn in enumerate(self.rnn_layers):
        outputs = rnn.connect(self.inputs[l],
                              self.mask if l % 2 == 0 else self.rev_mask,
                              self.is_train)
        self.inputs[l + 1] = outputs[::-1]

    self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
    self.pred0 = self.pred.reshape([self.x.shape[0],
                                    self.x.shape[1]]).dimshuffle(1, 0)
def get_distribution_by_ctx_emb_function(self):
    """ Return predictions and scores of shape
        [batch_size, time_steps, label_space_size]. Used at test time.
    """
    inputs_0 = tensor.ltensor3('inputs_0')

    self.inputs = [None] * (self.num_lstm_layers + 1)
    self.inputs[0] = inputs_0
    self.rev_mask = self.mask[::-1]

    for l, rnn in enumerate(self.rnn_layers):
        outputs = rnn.connect(self.inputs[l],
                              self.mask if l % 2 == 0 else self.rev_mask,
                              self.is_train)
        self.inputs[l + 1] = outputs[::-1]

    self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
    self.pred0 = self.pred.reshape(
        [self.mask.shape[0], self.mask.shape[1]]).dimshuffle(1, 0)
    # (sent_len, batch_size, label_space_size) --> (batch_size, sent_len, label_space_size)
    scores0 = self.scores.reshape([
        self.inputs[0].shape[0], self.inputs[0].shape[1],
        self.label_space_size
    ]).dimshuffle(1, 0, 2)

    return theano.function([inputs_0, self.mask0],
                           [self.pred0, scores0],
                           name='f_ctx_gemb_pred',
                           allow_input_downcast=True,
                           on_unused_input='warn',
                           givens=({self.is_train: numpy.cast['int8'](0)}))
def arch_memnet_selfsup(self):
    ''' memory net with self-supervision. '''
    contexts = T.ltensor3('contexts')
    querys = T.lmatrix('querys')
    yvs = T.lmatrix('yvs')
    params = []

    question_layer = Embed(self.vocab_size, self.hidden_dim)
    q = T.reshape(question_layer(querys.flatten()),
                  (self.batchsize, self.sen_maxlen, self.hidden_dim))
    if self.kwargs.get('position_encoding'):
        lmat = position_encoding(self.sen_maxlen,
                                 self.hidden_dim).dimshuffle('x', 0, 1)
        print '[memory network] use PE'
        q = q * lmat
    u = mean(q, axis=1)
    params.extend(question_layer.params)

    mem_layer = MemoryLayer(self.batchsize, self.mem_size, self.unit_size,
                            self.vocab_size, self.hidden_dim, **self.kwargs)
    probs = mem_layer.get_probs(contexts, u).dimshuffle(0, 2)

    inputs = {
        'contexts': contexts,
        'querys': querys,
        'yvs': yvs,
        'cvs': T.lmatrix('cvs')
    }
    return (probs, inputs, params)
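# The memory-network architectures here multiply word embeddings by a
# position_encoding(sentence_len, hidden_dim) matrix. Below is a minimal
# sketch of what such a helper might compute, following the position-encoding
# scheme of End-To-End Memory Networks, l_kj = (1 - j/J) - (k/d)(1 - 2j/J);
# the project's own helper may differ in details such as indexing or layout.
import numpy as np

def position_encoding_sketch(sentence_len, embedding_dim):
    """Return a (sentence_len, embedding_dim) float32 position-encoding matrix."""
    J, d = sentence_len, embedding_dim
    encoding = np.zeros((J, d), dtype='float32')
    for j in range(1, J + 1):        # word position, 1-based
        for k in range(1, d + 1):    # embedding dimension, 1-based
            encoding[j - 1, k - 1] = (1.0 - float(j) / J) - \
                (float(k) / d) * (1.0 - 2.0 * float(j) / J)
    return encoding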
def get_eval_with_gemb_function(self):
    inputs_0 = tensor.ltensor3('inputs_0')

    self.inputs = [None] * (self.num_lstm_layers + 1)
    self.inputs[0] = inputs_0
    self.rev_mask = self.mask[::-1]

    for l, rnn in enumerate(self.rnn_layers):
        outputs = rnn.connect(self.inputs[l],
                              self.mask if l % 2 == 0 else self.rev_mask,
                              self.is_train)
        self.inputs[l + 1] = outputs[::-1]

    self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
    self.pred0 = self.pred.reshape(
        [self.mask.shape[0], self.mask.shape[1]]).dimshuffle(1, 0)
    # (sent_len, batch_size, label_space_size) --> (batch_size, sent_len, label_space_size)
    scores0 = self.scores.reshape([
        self.inputs[0].shape[0], self.inputs[0].shape[1],
        self.label_space_size
    ]).dimshuffle(1, 0, 2)

    loss = CrossEntropyLoss().connect(self.scores, self.mask, self.y)

    return theano.function([inputs_0, self.mask0, self.y0],
                           [self.pred0, loss],
                           name='f_gemb_eval',
                           allow_input_downcast=True,
                           on_unused_input='warn',
                           givens=({self.is_train: numpy.cast['int8'](0)}))
def make_node(self, prediction, prediction_mask, groundtruth, groundtruth_mask):
    prediction = tensor.as_tensor_variable(prediction)
    prediction_mask = tensor.as_tensor_variable(prediction_mask)
    groundtruth = tensor.as_tensor_variable(groundtruth)
    groundtruth_mask = tensor.as_tensor_variable(groundtruth_mask)
    return theano.Apply(
        self,
        [prediction, prediction_mask, groundtruth, groundtruth_mask],
        [tensor.ltensor3()])
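# In make_node, tensor.ltensor3() serves purely as a type declaration: it
# creates an unnamed symbolic 3-D variable of dtype int64 that describes the
# Op's output. A small self-contained check of what that constructor returns:
import theano.tensor as tensor

out_type_example = tensor.ltensor3()
assert out_type_example.ndim == 3
assert out_type_example.dtype == 'int64'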
def test_inc_wrong_rank(self):
    self.assertRaises(TypeError, sparse_gram_inc, self.base, self.amt,
                      self.i0, tensor.lmatrix())
    self.assertRaises(TypeError, sparse_gram_inc, self.base, self.amt,
                      tensor.lscalar(), self.i1)
    self.assertRaises(TypeError, sparse_gram_inc, self.base,
                      tensor.ltensor3(), self.i0, self.i1)
    self.assertRaises(TypeError, sparse_gram_inc, tensor.vector(),
                      self.amt, self.i0, self.i1)
def test_language_model():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, suffix=".json") as path:
        dict_ = Dictionary(path)

    floatX = theano.config.floatX

    def make_data_and_mask(data):
        data = [[str2vec(s, 3) for s in row] for row in data]
        data = np.array(data)
        mask = np.ones((data.shape[0], data.shape[1]), dtype=floatX)
        return data, mask

    words_val, mask_val = make_data_and_mask([['p', 'e', 'a'], ['a', 'e', 'p']])
    mask_val[1, 2] = 0
    print "data:"
    print words_val
    print "mask:"
    print mask_val
    mask_def_emb_val = np.asarray([[0, 1], [0, 0]])

    # With the dictionary
    retrieval = Retrieval(vocab, dict_, exclude_top_k=7)
    lm = LanguageModel(7, 5, vocab.size(), vocab.size(),
                       vocab=vocab,
                       retrieval=retrieval,
                       compose_type='transform_and_sum',
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()

    words = tensor.ltensor3('words')
    mask = tensor.matrix('mask', dtype=floatX)
    costs = lm.apply(words, mask)
    cg = ComputationGraph(costs)

    def_mean, = VariableFilter(name='_dict_word_embeddings')(cg)
    def_mean_f = theano.function([words], def_mean)

    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    mask_def_emb, = VariableFilter(name='mask_def_emb')(cg)

    perplexities_f = theano.function([words, mask], perplexities)
    perplexities_v = perplexities_f(words_val, mask_val)

    mask_emb_f = theano.function([words, mask], mask_def_emb)
    mask_def_v = mask_emb_f(words_val, mask_val)

    for v, p in zip(perplexities_v, perplexities):
        print p.name, ":", v

    assert np.allclose(mask_def_v, mask_def_emb_val)
def arch_memnet_lexical(self):
    ''' each memory slot is a lexical item. '''
    contexts = T.ltensor3('contexts')
    querys = T.lmatrix('querys')
    yvs = T.lvector('yvs')
    hop = 1
    params = []

    question_layer = Embed(self.vocab_size, self.hidden_dim)
    q = T.reshape(question_layer(querys.flatten()),
                  (self.batchsize, self.sen_maxlen, self.hidden_dim))
    if self.kwargs.get('position_encoding'):
        lmat = position_encoding(self.sen_maxlen,
                                 self.hidden_dim).dimshuffle('x', 0, 1)
        print '[memory network] use PE'
        q = q * lmat
    u = mean(q, axis=1)
    params.extend(question_layer.params)

    mem_layers = []
    for hi in range(hop):
        mem_layer = MemoryLayer(self.batchsize, self.mem_size, self.unit_size,
                                self.vocab_size, self.hidden_dim, **self.kwargs)
        params.extend(mem_layer.params)
        mem_layers.append(mem_layer)
        o = mem_layer(contexts, u)
        u = u + o

    linear = LinearLayer(self.hidden_dim, self.vocab_size)
    params.extend(linear.params)
    probs = softmax(linear(u))

    inputs = {
        'contexts': contexts,
        'querys': querys,
        'yvs': yvs,
        'cvs': T.lmatrix('cvs')
    }
    return (probs, inputs, params)
def arch_lstmq(self, param_b=2):
    contexts = T.ltensor3('contexts')
    querys = T.lmatrix('querys')
    yvs = T.lvector('yvs')
    params = []

    question_layer = Embed(self.vocab_size, self.hidden_dim)
    params.extend(question_layer.params)
    q = T.reshape(question_layer(querys.flatten()),
                  (self.batchsize, self.sen_maxlen, self.hidden_dim))
    lmat = position_encoding(self.sen_maxlen,
                             self.hidden_dim).dimshuffle('x', 0, 1)
    q = q * lmat
    u = mean(q, axis=1)

    embed_layer = Embed(self.vocab_size, self.hidden_dim)
    params.extend(embed_layer.params)
    lmat = position_encoding(self.unit_size,
                             self.hidden_dim).dimshuffle('x', 'x', 0, 1)
    m = T.reshape(embed_layer(contexts.flatten()),
                  (self.batchsize, self.mem_size, self.unit_size, self.hidden_dim))
    m = mean(m * lmat, axis=2)

    lstm = LSTMq(self.batchsize, self.hidden_dim)
    params.extend(lstm.params)
    o = lstm(m.dimshuffle(1, 0, 2), u)

    linear = LinearLayer(self.hidden_dim, self.vocab_size)
    params.extend(linear.params)
    probs = softmax(linear(o))

    inputs = {
        'contexts': contexts,
        'querys': querys,
        'yvs': yvs,
        'cvs': T.lmatrix('cvs')
    }
    return (probs, inputs, params)
def main(): # ZEROUT_DUMMY_WORD = False ZEROUT_DUMMY_WORD = True ## Load data # mode = 'TRAIN-ALL' #mode = 'TRAIN_DATA' #mode = 'TRAIN_NO_OVERLAP' #if len(sys.argv) > 1: # mode = sys.argv[1] # if not mode in ['TRAIN', 'TRAIN-ALL']: # print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']" # sys.exit(1) mode = 'k_time_data1'.upper() print "Running training in the {} setting".format(mode) position_num = 10 select_model = "PSCM" if select_model == "PSCM": click_model_index = 4 #PSCM elif select_model == "UBM": click_model_index = 1 else: raise "MODEL SELECT ERROR!" data_dir = mode add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy')) q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy')) a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy')) y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy')) add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy')) q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy')) a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy')) #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy')) #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy')) y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy')) qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy')) add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy')) q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy')) a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy')) #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy')) #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy')) y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy')) qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy')) # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy')) # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy')) # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy')) # feats_ndim = x_train.shape[1] # from sklearn.preprocessing import StandardScaler # scaler = StandardScaler() # print "Scaling overlap features" # x_train = scaler.fit_transform(x_train) # x_dev = scaler.transform(x_dev) # x_test = scaler.transform(x_test) #multi dim #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0] #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0] #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0] #y_train = y_train_tmp #y_dev = y_dev_tmp #y_test = y_test_tmp max_query_id = numpy.max([ numpy.max(add_train[:, 0]), numpy.max(add_test[:, 0]), numpy.max(add_dev[:, 0]) ]) max_url_id = numpy.max([ numpy.max(add_train[:, 1:]), numpy.max(add_test[:, 1:]), numpy.max(add_dev[:, 1:]) ]) print 'max_query_id', max_query_id print 'max_url_id', max_url_id print 'y_train', numpy.unique(y_train, return_counts=True) print 'y_dev', numpy.unique(y_dev, return_counts=True) print 'y_test', numpy.unique(y_test, return_counts=True) print 'q_train', q_train.shape print 'q_dev', q_dev.shape print 'q_test', q_test.shape print 'a_train', a_train.shape print 'a_dev', a_dev.shape print 'a_test', a_test.shape ## Get the word embeddings from the nnet trained on SemEval # ndim = 40 # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14' # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat') # train_nnet, test_nnet = 
nn_layers.load_nnet(nnet_fname, params_fname) numpy_rng = numpy.random.RandomState(123) q_max_sent_size = q_train.shape[1] a_max_sent_size = a_train.shape[2] # print 'max', numpy.max(a_train) # print 'min', numpy.min(a_train) #ndim = 5 #print "Generating random vocabulary for word overlap indicator features with dim:", ndim #dummy_word_id = numpy.max(a_overlap_train) # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) #print "Gaussian" #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25 # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05 # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) #vocab_emb_overlap[-1] = 0 # Load word2vec embeddings fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy') print "Loading word embeddings from", fname vocab_emb = numpy.load(fname) ndim = vocab_emb.shape[1] dummpy_word_idx = numpy.max(a_train) print "Word embedding matrix size:", vocab_emb.shape x = T.dmatrix('x') x_q = T.lmatrix('q') #x_q_overlap = T.lmatrix('q_overlap') #x_a = T.lmatrix('a') x_a_all = T.ltensor3('a_all') #x_a_overlap = T.lmatrix('a_overlap') #y = T.ivector('y') y = T.imatrix('y') add_info = T.dmatrix('add_info') ####### n_outs = 2 n_epochs = 15 batch_size = 50 learning_rate = 0.1 max_norm = 0 print 'batch_size', batch_size print 'n_epochs', n_epochs print 'learning_rate', learning_rate print 'max_norm', max_norm ## 1st conv layer. #ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1] ndim = vocab_emb.shape[1] ### Nonlinearity type # activation = nn_layers.relu_f activation = T.tanh dropout_rate = 0.5 nkernels = 100 q_k_max = 1 a_k_max = 1 # filter_widths = [3,4,5] q_filter_widths = [5] a_filter_widths = [5] ###### QUESTION ###### lookup_table_words = nn_layers.LookupTableFastStatic( W=vocab_emb, pad=max(q_filter_widths) - 1) #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap]) lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words]) num_input_channels = 1 input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim) conv_layers = [] for filter_width in q_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_q = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) #nnet_q.set_input((x_q, x_q_overlap)) nnet_q.set_input([x_q]) ###### ###### ANSWER ###### nnet_a_list = [] #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) for i in xrange(position_num): #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap]) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words]) # 
num_input_channels = len(lookup_table.layers) #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) conv_layers = [] for filter_width in a_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet( layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_a = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) #nnet_a.set_input((x_a, x_a_overlap)) nnet_a.set_input([x_a_all[:, i, :]]) nnet_a_list.append(nnet_a) ####### # print 'nnet_q.output', nnet_q.output.ndim q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max print "q_logistic_n_in, ", q_logistic_n_in print "a_logistic_n_in, ", a_logistic_n_in #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num) pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer( q_in=q_logistic_n_in, a_in=a_logistic_n_in, position=position_num) pairwise_out_list = [nnet_q.output] for i in xrange(position_num): pairwise_out_list.append(nnet_a_list[i].output) pairwise_layer.set_input(pairwise_out_list) #pairwise_layer.set_input((nnet_q.output, nnet_a.output)) # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50 # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1 #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num #n_in = 1 * position_num + position_num * (position_num - 1) / 2 n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * ( position_num - 1) / 2 # n_in = feats_ndim + 1 # n_in = feats_ndim + 50 hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation) hidden_layer.set_input(pairwise_layer.output) #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) #classifier.set_input(hidden_layer.output) classifier = nn_layers.FeatureClickModelLayer( n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num, click_model_index=click_model_index) #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num) #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num) #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs) classifier.set_input([hidden_layer.output, add_info]) #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier], # name="Training nnet") train_nnet = nn_layers.FeedForwardNet( layers=[nnet_q] + nnet_a_list + [pairwise_layer, hidden_layer, classifier], name="Training nnet") test_nnet = train_nnet ####### #print train_nnet params = train_nnet.params ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S') nnet_outdir = 
'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format( select_model, mode, ndim, batch_size, max_norm, learning_rate, ts) if not os.path.exists(nnet_outdir): os.makedirs(nnet_outdir) nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') print "Saving to", nnet_fname cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) #total_params = sum([numpy.prod(param.shape.eval()) for param in params]) #print 'Total params number:', total_params cost = train_nnet.layers[-1].training_cost(y) # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32) # weights_data = numpy.sum(y_train_counts) / y_train_counts # weights_data_norm = numpy.linalg.norm(weights_data) # weights_data /= weights_data_norm # print 'weights_data', weights_data # weights = theano.shared(weights_data, borrow=True) # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights) predictions = test_nnet.layers[-1].y_pred #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2] predictions_prob = test_nnet.layers[-1].p_y_given_x ### L2 regularization # L2_word_emb = 1e-4 # L2_conv1d = 3e-5 # # L2_softmax = 1e-3 # L2_softmax = 1e-4 # print "Regularizing nnet weights" # for w in train_nnet.weights: # L2_reg = 0. # if w.name.startswith('W_emb'): # L2_reg = L2_word_emb # elif w.name.startswith('W_conv1d'): # L2_reg = L2_conv1d # elif w.name.startswith('W_softmax'): # L2_reg = L2_softmax # elif w.name == 'W': # L2_reg = L2_softmax # print w.name, L2_reg # cost += T.sum(w**2) * L2_reg # batch_x = T.dmatrix('batch_x') batch_x_q = T.lmatrix('batch_x_q') #batch_x_a = T.lmatrix('batch_x_a') batch_x_a_all = T.ltensor3('batch_x_a_all') #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap') #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap') #batch_y = T.ivector('batch_y') batch_y = T.imatrix('batch_y') batch_add_info = T.dmatrix('batch_add_info') # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6) updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='W_emb') inputs_pred = [ batch_x_q, batch_x_a_all, batch_add_info, #batch_x_q_overlap, #batch_x_a_overlap, # batch_x, ] givens_pred = { x_q: batch_x_q, x_a_all: batch_x_a_all, add_info: batch_add_info, #x_q_overlap: batch_x_q_overlap, #x_a_overlap: batch_x_a_overlap, # x: batch_x } inputs_train = [ batch_x_q, batch_x_a_all, #batch_x_q_overlap, #batch_x_a_overlap, # batch_x, batch_add_info, batch_y, ] givens_train = { x_q: batch_x_q, x_a_all: batch_x_a_all, #x_q_overlap: batch_x_q_overlap, #x_a_overlap: batch_x_a_overlap, # x: batch_x, add_info: batch_add_info, y: batch_y } train_fn = theano.function(inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train, on_unused_input='warn') pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred, on_unused_input='warn') pred_prob_fn = theano.function(inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred, on_unused_input='warn') def predict_batch(batch_iterator): #preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for # batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator]) preds = numpy.vstack([ pred_fn(batch_x_q, batch_x_a, batch_add_info) for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator ]) real_preds = preds[:, -1 * position_num:] inner_outputs = preds return 
real_preds[:batch_iterator. n_samples], inner_outputs[:batch_iterator.n_samples] def predict_prob_batch(batch_iterator): #preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for # batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator]) preds = numpy.vstack([ pred_prob_fn(batch_x_q, batch_x_a, batch_add_info) for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator ]) real_preds = preds[:, -1 * position_num:] inner_outputs = preds return real_preds[:batch_iterator. n_samples], inner_outputs[:batch_iterator.n_samples] train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_train, a_train, add_train, y_train], batch_size=batch_size, randomize=True) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_dev, a_dev, add_dev, y_dev], batch_size=batch_size, randomize=False) test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize( numpy_rng, [q_test, a_test, add_test, y_test], batch_size=batch_size, randomize=False) labels = sorted(numpy.unique(y_test[:, -1])) print 'labels', labels def perplexity_score(labels, preds): positionPerplexity = [0.0] * position_num positionPerplexityClickSkip = [[0.0, 0.0] for i in xrange(position_num)] counts = [0] * position_num countsClickSkip = [[0, 0] for i in xrange(position_num)] for label, pred in zip(labels, preds): for i in range(0, len(label)): click = 1 if label[i] else 0 tmp_pred = max(min(pred[i], 0.99999), 0.00001) logProb = math.log(tmp_pred, 2) if click == 0: logProb = math.log(1 - tmp_pred, 2) positionPerplexity[i] += logProb positionPerplexityClickSkip[i][click] += logProb counts[i] += 1 countsClickSkip[i][click] += 1 positionPerplexity = [ 2**(-x / count if count else x) for (x, count) in zip(positionPerplexity, counts) ] positionPerplexityClickSkip = [[2 ** (-x[click] / (count[click] if count[click] else 1) if count else x) \ for (x, count) in zip(positionPerplexityClickSkip, countsClickSkip)] for click in xrange(2)] perplexity = sum(positionPerplexity) / len(positionPerplexity) ret_str = "---------\n" ret_str += "Perplexity\t" + str(perplexity) + "\n" ret_str += "positionPerplexity" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexity[i]) ret_str += "\n" ret_str += "positionPerplexitySkip" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexityClickSkip[0][i]) ret_str += "\n" ret_str += "positionPerplexityClick" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexityClickSkip[1][i]) ret_str += "\n------------\n" #print ret_str return perplexity, ret_str def map_score(qids, labels, preds): qid2cand = defaultdict(list) for qid, label, pred in zip(qids, labels, preds): qid2cand[qid].append((pred, label)) average_precs = [] for qid, candidates in qid2cand.iteritems(): average_prec = 0 running_correct_count = 0 for i, (score, label) in enumerate(sorted(candidates, reverse=True), 1): if label > 0: running_correct_count += 1 average_prec += float(running_correct_count) / i average_precs.append(average_prec / (running_correct_count + 1e-6)) map_score = sum(average_precs) / len(average_precs) return map_score print "Zero out dummy word:", ZEROUT_DUMMY_WORD if ZEROUT_DUMMY_WORD: W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function( [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) # weights_dev = numpy.zeros(len(y_dev)) # weights_dev[y_dev == 0] = weights_data[0] # weights_dev[y_dev == 1] = weights_data[1] # 
print weights_dev best_dev_acc = -numpy.inf best_dev_perp = numpy.inf epoch = 0 timer_train = time.time() no_best_dev_update = 0 num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1): train_fn(x_q, x_a, add, y) # Make sure the null word in the word embeddings always remains zero if ZEROUT_DUMMY_WORD: zerout_dummy_word() if i % 10 == 0 or i == num_train_batches: y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator) #print "shape:" #print str(y_dev.shape) #print str(y_pred_dev.shape) # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100 dev_acc = metrics.roc_auc_score(y_dev[:, -1], y_pred_dev[:, -1]) * 100 dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev) if dev_acc > best_dev_acc: y_pred, y_inner = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100 print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc)) best_dev_acc = dev_acc if dev_perp < best_dev_perp: y_pred, y_inner = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100 test_perplexity, test_perplexity_str = perplexity_score( y_test, y_pred) print( 'epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}' .format(epoch, i, dev_acc, test_acc, best_dev_acc, dev_perp, best_dev_perp)) print str(test_perplexity_str) best_params = [ numpy.copy(p.get_value(borrow=True)) for p in params ] best_inner = y_inner no_best_dev_update = 0 best_dev_perp = dev_perp if no_best_dev_update >= 3: print "Quitting after of no update of the best score on dev set", no_best_dev_update break numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy' .format(epoch, i, best_dev_perp)), best_inner) print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100 test_perp, test_perp_str = perplexity_score(y_test, y_pred_test) print "FINAL ACCURACY" print str(test_acc) print "FINAL PERPLEXITY" print str(test_perp_str) fname = os.path.join( nnet_outdir, 'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format( epoch, i, best_dev_acc)) numpy.savetxt( os.path.join( nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'. format(epoch, i, best_dev_acc)), y_pred_test) numpy.savetxt( os.path.join( nnet_outdir, 'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy' .format(epoch, i, best_dev_acc)), best_inner) cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
def train_language_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed): c = config if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed data, lm, retrieval = initialize_data_and_model(config) # full main loop can be saved... main_loop_path = os.path.join(save_path, 'main_loop.tar') # or only state (log + params) which can be useful not to pickle embeddings state_path = os.path.join(save_path, 'training_state.tar') stream_path = os.path.join(save_path, 'stream.pkl') best_tar_path = os.path.join(save_path, "best_model.tar") words = tensor.ltensor3('words') words_mask = tensor.matrix('words_mask') if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4, max_length=5).get_epoch_iterator()) words.tag.test_value = test_value_data[0] words_mask.tag.test_value = test_value_data[1] costs, updates = lm.apply(words, words_mask) cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(words.shape[1], 'length') perplexity, = VariableFilter(name='perplexity')(cg) perplexities = VariableFilter(name_regex='perplexity.*')(cg) monitored_vars = [length, cost] + perplexities if c['dict_path']: num_definitions, = VariableFilter(name='num_definitions')(cg) monitored_vars.extend([num_definitions]) parameters = cg.get_parameter_dict() trained_parameters = parameters.values() saved_parameters = parameters.values() if c['embedding_path']: logger.debug("Exclude word embeddings from the trained parameters") trained_parameters = [ p for p in trained_parameters if not p == lm.get_def_embeddings_params() ] saved_parameters = [ p for p in saved_parameters if not p == lm.get_def_embeddings_params() ] if c['cache_size'] != 0: logger.debug("Enable fake recursivity for looking up embeddings") trained_parameters = [ p for p in trained_parameters if not p == lm.get_cache_params() ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) if c['cache_size'] != 0: algorithm.add_updates(updates) train_monitored_vars = list(monitored_vars) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg) main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg) train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS]) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) # We use a completely random seed on purpose. With Fuel server # it's currently not possible to restore the state of the training # stream. That's why it's probably better to just have it stateless. 
stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None training_stream = data.get_stream('train', batch_size=c['batch_size'], max_length=c['max_length'], seed=stream_seed) valid_stream = data.get_stream('valid', batch_size=c['batch_size_valid'], max_length=c['max_length'], seed=stream_seed) original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) validation = DataStreamMonitoring(monitored_vars, valid_stream, prefix="valid").set_conditions( before_first_epoch=not fast_start, on_resumption=True, every_n_batches=c['mon_freq_valid']) track_the_best = TrackTheBest(validation.record_name(perplexity), choose_best=min).set_conditions( on_resumption=True, after_epoch=True, every_n_batches=c['mon_freq_valid']) # don't save them the entire main loop to avoid pickling everything if c['fast_checkpoint']: load = (LoadNoUnpickling(state_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_main_loop': False, 'save_separately': ['log', 'iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(state_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches']: intermediate_cp = IntermediateCheckpoint( state_path, every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) else: load = (Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_separately': ['iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(main_loop_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches']: intermediate_cp = IntermediateCheckpoint( main_loop_path, every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) checkpoint = checkpoint.add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (best_tar_path, )) extensions = [ load, StartFuelServer(original_training_stream, stream_path, before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']) ] if retrieval: extensions.append( RetrievalPrintStats(retrieval=retrieval, every_n_batches=c['mon_freq_train'], before_training=not fast_start)) extensions.extend([ TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), validation, track_the_best, checkpoint ]) if c['checkpoint_every_n_batches']: extensions.append(intermediate_cp) extensions.extend([ DumpTensorflowSummaries(save_path, every_n_batches=c['mon_freq_train'], after_training=True), Printing(on_resumption=True, every_n_batches=c['mon_freq_train']), FinishIfNoImprovementAfter(track_the_best.notification_name, iterations=50 * c['mon_freq_valid'], every_n_batches=c['mon_freq_valid']), FinishAfter(after_n_batches=c['n_batches']) ]) logger.info("monitored variables during training:" + "\n" + pprint.pformat(train_monitored_vars, width=120)) logger.info("monitored variables during valid:" + "\n" + pprint.pformat(monitored_vars, width=120)) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "save_path", default="sine",
        help="The path to save the PyLearn2 model")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    parser.add_argument(
        "--reset", action="store_true", default=False,
        help="Start training from scratch")
    args = parser.parse_args()

    num_states = ChainDataset.num_states

    if args.mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states,
                          source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.debug("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape)
             for key, value in Selector(generator).get_params().items()],
            width=120))
        logger.debug("Markov chain entropy: {}".format(ChainDataset.entropy))
        logger.debug("Expected min error: {}".format(
            -ChainDataset.entropy * seq_len * batch_size))

        if os.path.isfile(args.save_path) and not args.reset:
            model = Pylearn2Model.load(args.save_path)
        else:
            model = Pylearn2Model(generator)

        # Build the cost computation graph.
        # Note: it would probably be nicer to make the cost part of the model.
        x = tensor.ltensor3('x')
        cost = Pylearn2Cost(model.brick.cost(x[:, :, 0]).sum())

        dataset = ChainDataset(rng, seq_len)
        sgd = SGD(learning_rate=0.0001, cost=cost,
                  batch_size=batch_size, batches_per_iter=10,
                  monitoring_dataset=dataset,
                  monitoring_batch_size=batch_size,
                  monitoring_batches=1,
                  learning_rule=Pylearn2LearningRule(
                      SGDLearningRule(),
                      dict(training_objective=cost.cost)))
        train = Pylearn2Train(dataset, model, algorithm=sgd,
                              save_path=args.save_path, save_freq=10)
        train.main_loop()
    elif args.mode == "sample":
        model = Pylearn2Model.load(args.save_path)
        generator = model.brick

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainDataset.trans_prob))
    else:
        assert False
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=10000, emb_size=50, margin=0.3, L2_weight=1e-10, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=16450007, neg_size=60, test_neg_size=300, comment=''):#L1Distance_ model_options = locals().copy() print "model options", model_options triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/freebase-subsets/' rng = numpy.random.RandomState(1234) # triples, entity_size, relation_size, entity_count, relation_count=load_triples(triple_path+'freebase_mtr100_mte100-train.txt', line_no, triple_path)#vocab_size contain train, dev and test triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,statistics=load_Train(triple_path+'freebase-FB5M2M-combined.txt', line_no, triple_path) train_h2t=statistics[0] train_t2h=statistics[1] train_r2t=statistics[2] train_r2h=statistics[3] train_r_replace_tail_prop=statistics[4] print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count) rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) entity_E=theano.shared(value=rand_values, borrow=True) rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321)) relation_E=theano.shared(value=rand_values, borrow=True) GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size) # GRU_U1, GRU_W1, GRU_b1=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size) # GRU_U2, GRU_W2, GRU_b2=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size) # GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) # para_to_load=[entity_E, relation_E, GRU_U, GRU_W, GRU_b] # load_model_from_file(triple_path+'Best_Paras_dim'+str(emb_size), para_to_load) #+'_hits10_63.616' # GRU_U_combine=[GRU_U0, GRU_U1, GRU_U2] # GRU_W_combine=[GRU_W0, GRU_W1, GRU_W2] # GRU_b_combine=[GRU_b0, GRU_b1, GRU_b2] # w2v_entity_rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # # w2v_relation_rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321)) # # w2v_entity_rand_values=load_word2vec_to_init(w2v_entity_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_entityEmb50.txt') # w2v_relation_rand_values=load_word2vec_to_init(w2v_relation_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_relationEmb50.txt') # w2v_entity_rand_values=theano.shared(value=w2v_entity_rand_values, borrow=True) # w2v_relation_rand_values=theano.shared(value=w2v_relation_rand_values, borrow=True) # entity_E_ensemble=entity_E+norm_matrix(w2v_entity_rand_values) # relation_E_ensemble=relation_E+norm_matrix(w2v_relation_rand_values) norm_entity_E=norm_matrix(entity_E) norm_relation_E=norm_matrix(relation_E) n_batchs=line_no/batch_size remain_triples=line_no%batch_size if remain_triples>0: batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size] else: batch_start=list(numpy.arange(n_batchs)*batch_size) # batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True) # batch_start=T.cast(batch_start, 'int64') # allocate symbolic variables for the data # index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer n_index_T = T.ltensor3('n_index_T') ###################### # 
BUILD ACTUAL MODEL # ###################### print '... building the model' dist_tail=one_batch_parallel_Ramesh(x_index_l, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size) loss__tail_is=one_neg_batches_parallel_Ramesh(n_index_T, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size) loss_tail_i=T.maximum(0.0, margin+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is) # loss_relation_i=T.maximum(0.0, margin+dist_relation.reshape((dist_relation.shape[0],1))-loss_relation_is) # loss_head_i=T.maximum(0.0, margin+dist_head.reshape((dist_head.shape[0],1))-loss_head_is) # loss_tail_i_test=T.maximum(0.0, 0.0+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is) # binary_matrix_test=T.gt(loss_tail_i_test, 0) # sum_vector_test=T.sum(binary_matrix_test, axis=1) # binary_vector_hits10=T.gt(sum_vector_test, 10) # test_loss=T.sum(binary_vector_hits10)*1.0/batch_size # loss_relation_i=T.maximum(0.0, margin+dis_relation.reshape((dis_relation.shape[0],1))-loss__relation_is) # loss_head_i=T.maximum(0.0, margin+dis_head.reshape((dis_head.shape[0],1))-loss__head_is) # def neg_slice(neg_matrix): # dist_tail_slice, dis_relation_slice, dis_head_slice=one_batch_parallel_Ramesh(neg_matrix, entity_E, relation_E, GRU_U_combine, GRU_W_combine, GRU_b_combine, emb_size) # loss_tail_i=T.maximum(0.0, margin+dist_tail-dist_tail_slice) # loss_relation_i=T.maximum(0.0, margin+dis_relation-dis_relation_slice) # loss_head_i=T.maximum(0.0, margin+dis_head-dis_head_slice) # return loss_tail_i, loss_relation_i, loss_head_i # # (loss__tail_is, loss__relation_is, loss__head_is), updates = theano.scan( # neg_slice, # sequences=n_index_T, # outputs_info=None) loss_tails=T.mean(T.sum(loss_tail_i, axis=1) ) # loss_relations=T.mean(T.sum(loss_relation_i, axis=1) ) # loss_heads=T.mean(T.sum(loss_head_i, axis=1) ) loss=loss_tails#+loss_relations+loss_heads L2_loss=debug_print((entity_E** 2).sum()+(relation_E** 2).sum()\ +(GRU_U** 2).sum()+(GRU_W** 2).sum(), 'L2_reg') # Div_loss=Diversify_Reg(GRU_U[0])+Diversify_Reg(GRU_U[1])+Diversify_Reg(GRU_U[2])+\ # Diversify_Reg(GRU_W[0])+Diversify_Reg(GRU_W[1])+Diversify_Reg(GRU_W[2]) cost=loss+L2_weight*L2_loss#+div_reg*Div_loss #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = [entity_E, relation_E, GRU_U, GRU_W, GRU_b] # params_conv = [conv_W, conv_b] params_to_store=[entity_E, relation_E, GRU_U, GRU_W, GRU_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-9))) #AdaGrad updates.append((acc_i, acc)) # grads = T.grad(cost, params) # updates = [] # for param_i, grad_i in zip(params, grads): # updates.append((param_i, param_i - learning_rate * grad_i)) #AdaGrad train_model = theano.function([x_index_l, n_index_T], [loss, cost], updates=updates,on_unused_input='ignore') # test_model = theano.function([x_index_l, n_index_T], test_loss, on_unused_input='ignore') # # train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], # givens={ # x_index_l: indices_train_l[index: index + batch_size], # x_index_r: indices_train_r[index: index + batch_size], # y: trainY[index: index + batch_size], # left_l: trainLeftPad_l[index], # right_l: trainRightPad_l[index], # left_r: 
trainLeftPad_r[index], # right_r: trainRightPad_r[index], # length_l: trainLengths_l[index], # length_r: trainLengths_r[index], # norm_length_l: normalized_train_length_l[index], # norm_length_r: normalized_train_length_r[index], # mts: mt_train[index: index + batch_size], # wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant # validation_frequency = min(n_train_batches/5, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False svm_max=0.0 best_epoch=0 # corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set best_train_loss=1000000 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 # learning_rate/=epoch # print 'lr:', learning_rate #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data loss_sum=0.0 for start in batch_start: if start%100000==0: print start, '...' pos_triples=triples[start:start+batch_size] all_negs=[] # count=0 for pos_triple in pos_triples: neg_triples=get_n_neg_triples_train(pos_triple, train_triples_set, train_entity_set, train_r_replace_tail_prop, neg_size) # # print 'neg_head_triples' # neg_relation_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 1, neg_size/3) # # print 'neg_relation_triples' # neg_tail_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 2, neg_size/3) # print 'neg_tail_triples' all_negs.append(neg_triples) # print 'neg..', count # count+=1 neg_tensor=numpy.asarray(all_negs).reshape((batch_size, neg_size, 3)).transpose(1,0,2) loss, cost= train_model(pos_triples, neg_tensor) loss_sum+=loss loss_sum/=len(batch_start) print 'Training loss:', loss_sum, 'cost:', cost # loss_test=0.0 # # for test_start in batch_start_test: # pos_triples=test_triples[test_start:test_start+batch_size] # all_negs=[] # for pos_triple in pos_triples: # neg_triples=get_n_neg_triples_new(pos_triple, corpus_triples_set, test_entity_set, test_relation_set, test_neg_size/2, True) # all_negs.append(neg_triples) # # neg_tensor=numpy.asarray(all_negs).reshape((batch_size, test_neg_size, 3)).transpose(1,0,2) # loss_test+= test_model(pos_triples, neg_tensor) # # # loss_test/=n_batchs_test # print '\t\t\tUpdating epoch', epoch, 'finished! 
Test hits10:', 1.0-loss_test if loss_sum< best_train_loss: store_model_to_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), params_to_store) # store_model_to_file(triple_path+'Divreg_Best_Paras_dim'+str(emb_size), params_to_store) best_train_loss=loss_sum print 'Finished storing best params' # exit(0) print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
logger = logging.getLogger(__name__)

configuration = getattr(configurations, args.proto)()
# added by Zhaopeng Tu, 2016-05-12
if args.state:
    configuration.update(eval(open(args.state).read()))
logger.info("\nModel options:\n{}".format(pprint.pformat(configuration)))

src = T.lmatrix()
src_mask = T.matrix()
trg = T.lmatrix()
trg_mask = T.matrix()

# added by Longyue
src_hist = T.ltensor3()
src_hist_mask = T.tensor3()

# added by Zhaopeng Tu, 2016-07-13
# for fast training of new parameters
ite = T.fscalar()

rng = numpy.random.RandomState(1234)

enc_dec = EncoderDecoder(rng, **configuration)
# modified by Zhaopeng Tu, 2016-07-13
# for fast training of new parameters
# enc_dec.build_trainer(src, src_mask, trg, trg_mask)
enc_dec.build_trainer(src, src_mask, src_hist, src_hist_mask, trg, trg_mask,
                      ite)
enc_dec.build_sampler()
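# The compiled trainer built above expects concrete numpy inputs whose rank and
# dtype match the symbolic declarations (lmatrix -> 2-D int64, ltensor3 -> 3-D
# int64, matrix/tensor3 -> float). A hedged sketch with purely illustrative
# shapes and axis order (2 sentences, source length 5, 3 history sentences);
# the actual call signature depends on EncoderDecoder.build_trainer.
import numpy

batch, src_len, hist_num = 2, 5, 3
src_batch = numpy.zeros((src_len, batch), dtype='int64')                        # T.lmatrix
src_mask_batch = numpy.ones((src_len, batch), dtype='float32')                  # T.matrix
src_hist_batch = numpy.zeros((hist_num, src_len, batch), dtype='int64')         # T.ltensor3
src_hist_mask_batch = numpy.ones((hist_num, src_len, batch), dtype='float32')   # T.tensor3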
def make_node(self, groundtruth, recognized):
    recognized = tensor.as_tensor_variable(recognized)
    groundtruth = tensor.as_tensor_variable(groundtruth)
    return theano.Apply(
        self, [groundtruth, recognized],
        [tensor.ltensor3(), tensor.ltensor3()])
def train(cfig, epochs, language, model_alias, models_name): sentence = T.lmatrix() sentence_mask = T.matrix() sentence_morph = T.ltensor3() sentence_morph_mask = T.tensor3() use_noise = T.iscalar() lm = rnnlm_quick(**cfig) if model_alias == 'lstm2layer': use_maxout = True getattr(lm, models_name['lstm2layer'])(sentence, sentence_mask, use_noise, use_maxout) elif model_alias == 'rnnlm': use_maxout = True getattr(lm, models_name['rnnlm'])(sentence, sentence_mask, use_noise, use_maxout) else: getattr(lm, models_name[model_alias])(sentence, sentence_mask, sentence_morph, sentence_morph_mask, use_noise) cost_sum = lm.cost cost_mean = lm.cost/sentence.shape[1] params = lm.params regular = lm.L1 * 1e-5 + lm.L2 * 1e-5 grads = T.grad(cost_mean, params) hard_clipping = cfig['hard_clipping'] soft_clipping = T.fscalar() skip_nan_batch = 0 grads, nan_num, inf_num = step_clipping(params, grads, soft_clipping, cfig['shrink_scale_after_skip_nan_grad'], cfig['skip_nan_grad']) updates = adadelta(params, grads, hard_clipping) vs , vs_morph , vs_morph_mask = DStream(datatype='valid', config=cfig) ts , ts_morph , ts_morph_mask = DStream(datatype='test', config=cfig) fn = theano.function([sentence, sentence_mask, sentence_morph, sentence_morph_mask, use_noise, soft_clipping], [cost_mean, nan_num, inf_num], updates=updates , on_unused_input='ignore') test_fn = theano.function([sentence, sentence_mask, sentence_morph, sentence_morph_mask, use_noise], [cost_sum] , on_unused_input='ignore') start_time = datetime.now() start_cpu_time = time.clock() cur_time = start_time cur_cpu_time = start_cpu_time print ('training start at {}'.format(start_time)) valid_errs = [] test_errs = [] time_his, time_cpu_his = [], [] patience = 200 bad_counter = 0 for epoch in range(epochs): ds , ds_morph , ds_morph_mask = DStream(datatype='train', config=cfig) for data_tuple , data_morph , mask_morph in zip(ds.get_epoch_iterator() , ds_morph , ds_morph_mask): data , mask = data_tuple #print data.shape , data_morph.shape , mask_morph.shape if cfig['drop_last_batch_if_small'] and (0.0 + len(data)) / cfig['batch_size'] < 0.95: #logger.info('drop batch with: {}/{} ratio'.format(len(data), cfig['batch_size'])) pass # FIXME any idea to identify the last batch? 
else: cur_clip = soft_clipping_curve(epoch, cfig['soft_clipping_epoch'], cfig['soft_clipping_begin'], cfig['soft_clipping_end']) cur_batch_time = datetime.now() cur_batch_cpu_time = time.clock() data_morph = data_morph.transpose((1 , 0 , 2)) mask_morph = mask_morph.transpose((1 , 0 , 2)) c, grad_nan_num, grad_inf_num = fn(data.T, mask.T, data_morph , mask_morph , 1, cur_clip) batch_elasped_seconds = (datetime.now() - cur_batch_time).total_seconds() batch_elasped_cpu_seconds = (time.clock() - cur_batch_cpu_time) #print data.shape , data_morph.shape , mask_morph.shape logger.info('grad nan/inf num: {} {} at epoch {} cost {},{}'.format(grad_nan_num, grad_inf_num, epoch, batch_elasped_seconds, batch_elasped_cpu_seconds)) valid_err = test(test_fn, vs , vs_morph , vs_morph_mask) test_err = test(test_fn, ts , ts_morph , ts_morph_mask) valid_errs.append(valid_err) test_errs.append(test_err) if valid_err <= numpy.array(valid_errs).min(): bad_counter = 0 if len(valid_errs) > patience and valid_err >= \ numpy.array(valid_errs)[:-patience].min(): bad_counter += 1 valid_min = numpy.min(valid_errs) valid_min_idx = numpy.argmin(valid_errs) valid_min_test = test_errs[valid_min_idx] pre_time, pre_cpu_time = cur_time, cur_cpu_time cur_time, cur_cpu_time= datetime.now(), time.clock() elasped_minutes = (cur_time - start_time).total_seconds() / 60. elasped_cpu_minutes = (cur_cpu_time - start_cpu_time) / 60. batch_elasped_seconds = (cur_time - pre_time).total_seconds() batch_elasped_cpu_seconds = (cur_cpu_time - pre_cpu_time) print ('{:>3} epoch {:>2} bad t/v {:>5.2f} {:>5.2f} itr:{:>3} min_itr:{:>3} t/vbest:{:>5.2f} {:>5.2f} batch {:>4.0f}s, all {:>5.1f}m, cpu {:>4.0f}s, all{:>5.1f}m nan {} inf {}'.\ format(epoch, bad_counter, test_err, valid_err, len(valid_errs)-1, valid_min_idx, valid_min_test, valid_min, batch_elasped_seconds, elasped_minutes, batch_elasped_cpu_seconds, elasped_cpu_minutes, grad_nan_num, grad_inf_num)); time_his.append(batch_elasped_seconds) time_cpu_his.append(batch_elasped_cpu_seconds) sys.stdout.flush() if bad_counter > patience: print "Early Stop! outter loop" break valid_min = numpy.min(valid_errs) valid_min_idx = numpy.argmin(valid_errs) valid_min_test = test_errs[valid_min_idx] test_min = numpy.min(test_errs) test_min_idx = numpy.argmin(test_errs) cur_cfig = str(valid_min_idx) + ':' + str(valid_min) cur_cfig += ',' + str(valid_min_test) + ',' cur_cfig += str(test_min_idx) + ':' + str(test_min) final_res = '#== epoch:valid_min,valid_min_test,epoch:test_min: ' + cur_cfig print final_res end_time = datetime.now() use_time = (end_time - start_time).total_seconds() end_cpu_time = time.clock() use_cpu_time = (end_cpu_time - start_cpu_time) print 'training cost time {}s'.format(use_time) print 'training cost cpu time {}s'.format(use_cpu_time) print 'epoch cost mean time {}'.format(numpy.mean(time_his)) print 'epoch cost mean cpu time {}'.format(numpy.mean(time_cpu_his)) vals = [ "#"*100,str(datetime.now()),language,model_alias,final_res,"#"*100] print '\n'.join(vals) result = [numpy.mean(time_his), numpy.mean(time_cpu_his), use_time, use_cpu_time, valid_min, valid_min_test, test_min] return result
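The loop above depends on helpers (soft_clipping_curve, step_clipping) that are not shown; the following is a hypothetical sketch of how such a clipping schedule and a global-norm gradient clip could look, not the project's actual implementation.

# Hypothetical sketch of the two helpers used above; the real soft_clipping_curve /
# step_clipping may differ. The schedule interpolates the clip threshold over epochs,
# and the clip rescales gradients to a maximum global L2 norm.
import theano.tensor as T


def soft_clipping_curve_sketch(epoch, clip_epochs, clip_begin, clip_end):
    # Linearly move from clip_begin to clip_end over the first clip_epochs epochs.
    if epoch >= clip_epochs:
        return clip_end
    frac = float(epoch) / clip_epochs
    return clip_begin + frac * (clip_end - clip_begin)


def global_norm_clip_sketch(grads, threshold):
    # Rescale all gradients so their joint L2 norm does not exceed `threshold`.
    norm = T.sqrt(sum((g ** 2).sum() for g in grads))
    scale = T.minimum(1.0, threshold / (norm + 1e-8))
    return [g * scale for g in grads], norm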
def build_sampler(self): # added by Longyue x_hist = T.ltensor3() x_hist_mask = T.tensor3() annotations_1 = self.encoder_hist_1.apply_1(x_hist, x_hist_mask) annotations_1 = annotations_1[-1] annotations_2 = self.encoder_hist_2.apply_2(annotations_1) annotations_3 = annotations_2[-1] x = T.lmatrix() # Build Networks # src_mask is None c = self.encoder.apply(x, None, annotations_3) #init_context = ctx[0, :, -self.n_hids_src:] # mean pooling init_context = c.mean(0) # added by Longyue init_context = concatenate([init_context, annotations_3], axis=annotations_3.ndim - 1) init_state = self.decoder.create_init_state(init_context) outs = [init_state, c, annotations_3] if not self.with_attention: outs.append(init_context) # compile function print 'Building compile_init_state_and_context function ...' self.compile_init_and_context = theano.function( [x, x_hist, x_hist_mask], outs, name='compile_init_and_context') print 'Done' y = T.lvector() cur_state = T.matrix() # if it is the first word, emb should be all zero, and it is indicated by -1 trg_emb = T.switch(y[:, None] < 0, T.alloc(0., 1, self.n_in_trg), self.table_trg.apply(y)) # added by Zhaopeng Tu, 2016-06-09 # for with_attention=False if self.with_attention and self.with_coverage: cov_before = T.tensor3() if self.coverage_type is 'linguistic': print 'Building compile_fertility ...' fertility = self.decoder._get_fertility(c) fertility = T.addbroadcast(fertility, 1) self.compile_fertility = theano.function( [c], [fertility], name='compile_fertility') print 'Done' else: fertility = None else: cov_before = None fertility = None # apply one step # modified by Zhaopeng Tu, 2016-04-29 # [next_state, ctxs] = self.decoder.apply(state_below=trg_emb, results = self.decoder.apply( state_below=trg_emb, init_state=cur_state, # added by Zhaopeng Tu, 2016-06-09 init_context=None if self.with_attention else init_context, c=c if self.with_attention else None, hist=annotations_3, # added by Longyue one_step=True, # added by Zhaopeng Tu, 2016-04-27 cov_before=cov_before, fertility=fertility) next_state = results[0] if self.with_attention: ctxs, alignment = results[1], results[2] if self.with_coverage: cov = results[3] else: # if with_attention=False, we always use init_context as the source representation ctxs = init_context readout = self.decoder.readout(next_state, ctxs, trg_emb) # maxout if self.maxout_part > 1: readout = self.decoder.one_step_maxout(readout) # apply dropout if self.dropout < 1.0: readout = Dropout(self.trng, readout, 0, self.dropout) # compute the softmax probability next_probs = self.logistic_layer.get_probs(readout) # sample from softmax distribution to get the sample next_sample = self.trng.multinomial(pvals=next_probs).argmax(1) # compile function print 'Building compile_next_state_and_probs function ...' 
inps = [y, cur_state] if self.with_attention: inps.append(c) else: inps.append(init_context) # added by Longyue inps.append(annotations_3) outs = [next_probs, next_state, next_sample] # added by Zhaopeng Tu, 2016-06-09 if self.with_attention: outs.append(alignment) # added by Zhaopeng Tu, 2016-04-29 if self.with_coverage: inps.append(cov_before) if self.coverage_type is 'linguistic': inps.append(fertility) outs.append(cov) self.compile_next_state_and_probs = theano.function( inps, outs, name='compile_next_state_and_probs') print 'Done' # added by Zhaopeng Tu, 2016-07-18 # for reconstruction if self.with_reconstruction: # Build Networks # trg_mask is None inverse_c = T.tensor3() # mean pooling inverse_init_context = inverse_c.mean(0) inverse_init_state = self.inverse_decoder.create_init_state( inverse_init_context) outs = [inverse_init_state] if not self.with_attention: outs.append(inverse_init_context) # compile function print 'Building compile_inverse_init_state_and_context function ...' self.compile_inverse_init_and_context = theano.function( [inverse_c], outs, name='compile_inverse_init_and_context') print 'Done' src = T.lvector() inverse_cur_state = T.matrix() trg_mask = T.matrix() # if it is the first word, emb should be all zero, and it is indicated by -1 src_emb = T.switch(src[:, None] < 0, T.alloc(0., 1, self.n_in_src), self.table_src.apply(src)) # apply one step # modified by Zhaopeng Tu, 2016-04-29 inverse_results = self.inverse_decoder.apply( state_below=src_emb, init_state=inverse_cur_state, # added by Zhaopeng Tu, 2016-06-09 init_context=None if self.with_attention else inverse_init_context, c=inverse_c if self.with_attention else None, c_mask=trg_mask, one_step=True) inverse_next_state = inverse_results[0] if self.with_attention: inverse_ctxs, inverse_alignment = inverse_results[ 1], inverse_results[2] else: # if with_attention=False, we always use init_context as the source representation inverse_ctxs = init_context inverse_readout = self.inverse_decoder.readout( inverse_next_state, inverse_ctxs, src_emb) # maxout if self.maxout_part > 1: inverse_readout = self.inverse_decoder.one_step_maxout( inverse_readout) # apply dropout if self.dropout < 1.0: inverse_readout = Dropout(self.srng, inverse_readout, 0, self.dropout) # compute the softmax probability inverse_next_probs = self.inverse_logistic_layer.get_probs( inverse_readout) # sample from softmax distribution to get the sample inverse_next_sample = self.srng.multinomial( pvals=inverse_next_probs).argmax(1) # compile function print 'Building compile_inverse_next_state_and_probs function ...' inps = [src, trg_mask, inverse_cur_state] if self.with_attention: inps.append(inverse_c) else: inps.append(inverse_init_context) outs = [ inverse_next_probs, inverse_next_state, inverse_next_sample ] # added by Zhaopeng Tu, 2016-06-09 if self.with_attention: outs.append(inverse_alignment) self.compile_inverse_next_state_and_probs = theano.function( inps, outs, name='compile_inverse_next_state_and_probs') print 'Done'
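A hedged sketch of how the two compiled sampler functions might be driven for greedy decoding; the argument order follows the inps/outs lists above for with_attention=True and with_coverage=False, and the batch-axis convention used here is an assumption.

# Hedged greedy-decoding sketch around the compiled functions built in build_sampler.
# Everything not visible above (batch axis, eos handling) is an assumption.
import numpy


def greedy_decode_sketch(enc_dec, x, x_hist, x_hist_mask, eos_id, max_len=50):
    init_state, c, hist = enc_dec.compile_init_and_context(x, x_hist, x_hist_mask)
    state = init_state
    prev_word = -1 * numpy.ones((x.shape[1],), dtype='int64')  # -1 => zero embedding
    result = []
    for _ in range(max_len):
        probs, state, sample, _align = enc_dec.compile_next_state_and_probs(
            prev_word, state, c, hist)
        prev_word = probs.argmax(axis=1).astype('int64')  # greedy instead of sampling
        result.append(prev_word)
        if (prev_word == eos_id).all():
            break
    return numpy.array(result)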
def train_extractive_qa(new_training_job, config, save_path, params, fast_start, fuel_server, seed): if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed root_path = os.path.join(save_path, 'training_state') extension = '.tar' tar_path = root_path + extension best_tar_path = root_path + '_best' + extension c = config data, qam = initialize_data_and_model(c) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', shuffle=True, batch_size=4, max_length=5).get_epoch_iterator(as_dict=True)) for var in qam.input_vars.values(): var.tag.test_value = test_value_data[var.name] costs = qam.apply_with_default_vars() cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(qam.contexts.shape[1], 'length') batch_size = rename(qam.contexts.shape[0], 'batch_size') predicted_begins, = VariableFilter(name='predicted_begins')(cg) predicted_ends, = VariableFilter(name='predicted_ends')(cg) exact_match, = VariableFilter(name='exact_match')(cg) exact_match_ratio = rename(exact_match.mean(), 'exact_match_ratio') context_unk_ratio, = VariableFilter(name='context_unk_ratio')(cg) monitored_vars = [ length, batch_size, cost, exact_match_ratio, context_unk_ratio ] if c['dict_path']: def_unk_ratio, = VariableFilter(name='def_unk_ratio')(cg) num_definitions = rename(qam.input_vars['defs'].shape[0], 'num_definitions') max_definition_length = rename(qam.input_vars['defs'].shape[1], 'max_definition_length') monitored_vars.extend( [def_unk_ratio, num_definitions, max_definition_length]) if c['def_word_gating'] == 'self_attention': def_gates = VariableFilter(name='def_gates')(cg) def_gates_min = tensor.minimum(*[x.min() for x in def_gates]) def_gates_max = tensor.maximum(*[x.max() for x in def_gates]) monitored_vars.extend([ rename(def_gates_min, 'def_gates_min'), rename(def_gates_max, 'def_gates_max') ]) text_match_ratio = TextMatchRatio(data_path=os.path.join( fuel.config.data_path[0], 'squad/dev-v1.1.json'), requires=[ predicted_begins, predicted_ends, tensor.ltensor3('contexts_text'), tensor.lmatrix('q_ids') ], name='text_match_ratio') parameters = cg.get_parameter_dict() trained_parameters = parameters.values() if c['embedding_path']: logger.debug("Exclude word embeddings from the trained parameters") trained_parameters = [ p for p in trained_parameters if not p == qam.embeddings_var() ] if c['train_only_def_part']: def_reading_parameters = qam.def_reading_parameters() trained_parameters = [ p for p in trained_parameters if p in def_reading_parameters ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) # apply dropout to the training cost and to all the variables # that we monitor during training train_cost = cost train_monitored_vars = list(monitored_vars) if c['dropout']: regularized_cg = ComputationGraph([cost] + train_monitored_vars) # Dima: the dropout that I implemented first bidir_outputs, = VariableFilter(bricks=[Bidirectional], roles=[OUTPUT])(cg) readout_layers = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg) dropout_vars = [bidir_outputs] + readout_layers logger.debug("applying dropout to {}".format(", ".join( [v.name for v in dropout_vars]))) regularized_cg = apply_dropout(regularized_cg, dropout_vars, 
c['dropout']) # a new dropout with exactly same mask at different steps emb_vars = VariableFilter(roles=[EMBEDDINGS])(regularized_cg) emb_dropout_mask = get_dropout_mask(emb_vars[0], c['emb_dropout']) if c['emb_dropout_type'] == 'same_mask': regularized_cg = apply_dropout2(regularized_cg, emb_vars, c['emb_dropout'], dropout_mask=emb_dropout_mask) elif c['emb_dropout_type'] == 'regular': regularized_cg = apply_dropout(regularized_cg, emb_vars, c['emb_dropout']) else: raise ValueError("unknown dropout type {}".format( c['emb_dropout_type'])) train_cost = regularized_cg.outputs[0] train_monitored_vars = regularized_cg.outputs[1:] rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=train_cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) training_stream = data.get_stream('train', batch_size=c['batch_size'], shuffle=True, max_length=c['max_length']) original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) extensions = [ LoadNoUnpickling(tar_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job), StartFuelServer(original_training_stream, os.path.join(save_path, 'stream.pkl'), before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), ] validation = DataStreamMonitoring( [text_match_ratio] + monitored_vars, data.get_stream('dev', batch_size=c['batch_size_valid'], raw_text=True, q_ids=True), prefix="dev").set_conditions(before_training=not fast_start, after_epoch=True) dump_predictions = DumpPredictions(save_path, text_match_ratio, before_training=not fast_start, after_epoch=True) track_the_best_exact = TrackTheBest( validation.record_name(exact_match_ratio), choose_best=max).set_conditions(before_training=True, after_epoch=True) track_the_best_text = TrackTheBest( validation.record_name(text_match_ratio), choose_best=max).set_conditions(before_training=True, after_epoch=True) extensions.extend([ validation, dump_predictions, track_the_best_exact, track_the_best_text ]) # We often use pretrained word embeddings and we don't want # to load and save them every time. To avoid that, we use # save_main_loop=False, we only save the trained parameters, # and we save the log and the iterations state separately # in the tar file. 
extensions.extend([ Checkpoint(tar_path, parameters=trained_parameters, save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], every_n_batches=c['save_freq_batches'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best_text.notification_name), (best_tar_path, )), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_train'], after_training=True), RetrievalPrintStats(retrieval=data._retrieval, every_n_batches=c['mon_freq_train'], before_training=not fast_start), Printing(after_epoch=True, every_n_batches=c['mon_freq_train']), FinishAfter(after_n_batches=c['n_batches'], after_n_epochs=c['n_epochs']), Annealing(c['annealing_learning_rate'], after_n_epochs=c['annealing_start_epoch']), LoadNoUnpickling(best_tar_path, after_n_epochs=c['annealing_start_epoch']) ]) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
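A stripped-down sketch of the track-the-best-then-checkpoint pattern used in the extensions above; the record name and file names here are placeholders, not the script's actual values.

# Minimal sketch of the TrackTheBest + Checkpoint.add_condition pattern used above
# (Blocks API as used in this script; 'dev_text_match_ratio' and the tar paths are
# placeholder names).
from blocks.extensions.training import TrackTheBest
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.predicates import OnLogRecord

track_best = TrackTheBest('dev_text_match_ratio', choose_best=max).set_conditions(
    before_training=True, after_epoch=True)

checkpoint = Checkpoint('training_state.tar',
                        save_main_loop=False,
                        after_epoch=True).add_condition(
    ['after_epoch'],
    OnLogRecord(track_best.notification_name),
    ('training_state_best.tar',))

extensions = [track_best, checkpoint]  # plus monitoring/printing, as in the script above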
def build_mlp(args, netid, input_var=None, mask_inputs=False):
    """Build MLP model"""
    # pylint: disable=bad-continuation
    # This creates an MLP of two hidden layers of 200 units each, followed by
    # a softmax output layer of 10 units. It applies dropout with rate
    # args.input_dropout_rate to the input data and args.dropout_rate to the
    # hidden layers.
    # Input layer, specifying the expected input shape of the network
    # (unspecified batchsize, 1 channel, 28 rows and 28 columns) and
    # linking it to the given Theano variable `input_var`, if any:
    l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
                                     input_var=input_var,
                                     name="%d_%s" % (netid, "l_in"))
    mask_in = None
    if mask_inputs:
        mask_in = T.ltensor3()
    # Apply dropout to the input data:
    l_in_drop = dropout.DropoutLayer(l_in,
                                     mask=mask_in,
                                     p=args.input_dropout_rate,
                                     name="%d_%s" % (netid, "l_in_drop"))
    # Add a fully-connected layer of 200 units, using the linear rectifier, and
    # initializing weights with Glorot's scheme (which is the default anyway):
    l_hid1 = lasagne.layers.DenseLayer(
        l_in_drop,
        num_units=200,
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.GlorotUniform(),
        name="%d_%s" % (netid, "l_hid1"))
    # We'll now add dropout to the first hidden layer:
    mask_hid1 = None
    if mask_inputs:
        mask_hid1 = T.lvector()
    l_hid1_drop = dropout.DropoutLayer(l_hid1,
                                       mask=mask_hid1,
                                       p=args.dropout_rate,
                                       name="%d_%s" % (netid, "l_hid1_drop"))
    # Another 200-unit layer:
    l_hid2 = lasagne.layers.DenseLayer(
        l_hid1_drop,
        num_units=200,
        nonlinearity=lasagne.nonlinearities.rectify,
        name="%d_%s" % (netid, "l_hid2"))
    # Dropout again:
    mask_hid2 = None
    if mask_inputs:
        mask_hid2 = T.lvector()
    l_hid2_drop = dropout.DropoutLayer(l_hid2,
                                       mask=mask_hid2,
                                       p=args.dropout_rate,
                                       name="%d_%s" % (netid, "l_hid2_drop"))
    # Finally, we'll add the fully-connected output layer of 10 softmax units:
    l_out = lasagne.layers.DenseLayer(
        l_hid2_drop,
        num_units=10,
        nonlinearity=lasagne.nonlinearities.softmax,
        name="%d_%s" % (netid, "l_out"))
    masks = [mask_in, mask_hid1, mask_hid2]
    # Each layer is linked to its incoming layer(s), so we only need to pass
    # the output layer to give access to a network in Lasagne:
    return l_out, masks
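A possible usage sketch for build_mlp; the args namespace values and the project's custom dropout module are assumptions, and only standard Theano/Lasagne calls are added.

# Usage sketch for build_mlp (the dropout rates below and the custom `dropout`
# module are assumptions from the surrounding project).
import argparse
import theano
import theano.tensor as T
import lasagne

args = argparse.Namespace(input_dropout_rate=0.2, dropout_rate=0.5)  # hypothetical values
input_var = T.tensor4('inputs')
network, masks = build_mlp(args, netid=0, input_var=input_var, mask_inputs=False)

# Deterministic forward pass (dropout disabled) for prediction:
test_output = lasagne.layers.get_output(network, deterministic=True)
predict_fn = theano.function([input_var], T.argmax(test_output, axis=1))

params = lasagne.layers.get_all_params(network, trainable=True)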
def generate_embeddings(config, tar_path, part, dest_path, format_, average=False, encoder_embeddings=None, **kwargs): """ generate embeddings for all the defintions, average them and serialize OR if encoder_embeddings, serialize the models' encoder embeddings config: name of the config of the model tar_path: tar path of the model parameters part: part of the dataset (should be either 'train', 'valid', 'test' or 'all') dest_path: directory where the serialized embeddings will be written format: either 'dict' or 'glove' encoder_embeddings: None, 'only', 'mixed', 'if_missing' - None: don't include encoder embeddings - 'only': don't read any data, just serialize the encoder embeddings - 'mixed': add the encoder embeddings to the list of definition embeddings - 'if_missing': add the encoder embeddings when there is no corresponding def average: if true, multi-prototype embeddings will be averaged """ if not os.path.exists(dest_path): os.makedirs(dest_path) c = config data, model = initialize_data_and_model(c, train_phase=False) words = T.ltensor3('words') words_mask = T.matrix('words_mask') keys = T.lmatrix('keys') n_identical_keys = T.lvector('n_identical_keys') sym_args = [words, words_mask] if format_ not in ['dict', 'glove']: raise ValueError("format should be either: dict, glove") if not c['encoder'] and encoder_embeddings != 'only': raise ValueError('Error: this model does not have an encoder.') if use_keys(c): sym_args.append(keys) if use_n_identical_keys(c): sym_args.append(n_identical_keys) costs = model.apply(*sym_args, train_phase=False) cg = Model(costs) with open(tar_path) as src: cg.set_parameter_values(load_parameters(src)) if encoder_embeddings: if encoder_embeddings == 'only' and not c['encoder']: embeddings_array = model.get_def_embeddings_params('key').eval() else: embeddings_array = model.get_def_embeddings_params('main').eval() entries = model.get_embeddings_entries() enc_embeddings = { e: np.asarray(a) for e, a in zip(entries, embeddings_array) } if encoder_embeddings == 'only': serialize_embeddings(enc_embeddings, format_, dest_path, "encoder_embeddings") return 0 embeddings_var, = VariableFilter(name='embeddings')(cg) compute = dict({"embeddings": embeddings_var}) if c['proximity_coef'] != 0: prox_var, = VariableFilter(name='proximity_term')(cg) compute["proximity_term"] = prox_var print "sym args", sym_args predict_f = theano.function(sym_args, compute) batch_size = 256 # size of test_unseen stream = data.get_stream(part, batch_size=batch_size, max_length=c['max_length'], remove_keys=False, remove_n_identical_keys=False) raw_data = [] # list of dicts containing the inputs and computed outputs i = 0 vocab = model._vocab print "start computing" embeddings = defaultdict(list) for input_data in stream.get_epoch_iterator(as_dict=True): if i % 10 == 0: print "iteration:", i words = input_data['words'] words_mask = input_data['words_mask'] keys = input_data['keys'] n_identical_keys = input_data['n_identical_keys'] args = [words, words_mask] if use_keys(c): args.append(keys) if use_n_identical_keys(c): args.append(n_identical_keys) to_save = predict_f(*args) for k, h in zip(keys, to_save['embeddings']): key = vec2str(k) if encoder_embeddings == 'if_missing': try: del enc_embeddings[key] except KeyError: pass embeddings[key].append(h) i += 1 if encoder_embeddings in ['mixed', 'if_missing']: for k, e in enc_embeddings.iteritems(): embeddings[k].append(e) if encoder_embeddings == 'mixed': prefix_fname = 'mix_e_' elif encoder_embeddings == 'if_missing': prefix_fname = 'if_mis_e_' 
else: prefix_fname = '' # combine: if average: mean_embeddings = {} for k in embeddings.keys(): mean_embeddings[k] = np.mean(np.asarray(embeddings[k]), axis=0) serialize_embeddings(mean_embeddings, format_, dest_path, prefix_fname + "mean_embeddings") else: serialize_embeddings(embeddings, format_, dest_path, prefix_fname + "embeddings")
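A tiny numpy illustration of the multi-prototype averaging performed in the branch above.

# Tiny numpy illustration of reducing several definition embeddings per word
# to one mean embedding, as the `average` branch above does.
import numpy as np
from collections import defaultdict

embeddings = defaultdict(list)
embeddings['bank'].append(np.array([1.0, 0.0]))   # e.g. one definition's embedding
embeddings['bank'].append(np.array([0.0, 1.0]))   # e.g. another definition's embedding

mean_embeddings = {k: np.mean(np.asarray(v), axis=0) for k, v in embeddings.items()}
print(mean_embeddings['bank'])  # -> [0.5 0.5]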
def main(mode, save_path, steps, time_budget, reset): num_states = ChainDataset.num_states if mode == "train": # Experiment configuration rng = numpy.random.RandomState(1) batch_size = 50 seq_len = 100 dim = 10 feedback_dim = 8 # Build the bricks and initialize them transition = GatedRecurrent(name="transition", activation=Tanh(), dim=dim) generator = SequenceGenerator( LinearReadout(readout_dim=num_states, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedbacker=LookupFeedback( num_states, feedback_dim, name='feedback'), name="readout"), transition, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in Selector(generator).get_params().items()], width=120)) logger.info("Markov chain entropy: {}".format( ChainDataset.entropy)) logger.info("Expected min error: {}".format( -ChainDataset.entropy * seq_len * batch_size)) if os.path.isfile(save_path) and not reset: model = Pylearn2Model.load(save_path) else: model = Pylearn2Model(generator) # Build the cost computation graph. # Note: would be probably nicer to make cost part of the model. x = tensor.ltensor3('x') cost = Pylearn2Cost(model.brick.cost(x[:, :, 0]).sum()) dataset = ChainDataset(rng, seq_len) sgd = SGD(learning_rate=0.0001, cost=cost, batch_size=batch_size, batches_per_iter=10, monitoring_dataset=dataset, monitoring_batch_size=batch_size, monitoring_batches=1, learning_rule=Pylearn2LearningRule( SGDLearningRule(), dict(training_objective=cost.cost))) train = Pylearn2Train(dataset, model, algorithm=sgd, save_path=save_path, save_freq=10) train.main_loop(time_budget=time_budget) elif mode == "sample": model = Pylearn2Model.load(save_path) generator = model.brick sample = ComputationGraph(generator.generate( n_steps=steps, batch_size=1, iterate=True)).function() states, outputs, costs = [data[:, 0] for data in sample()] numpy.set_printoptions(precision=3, suppress=True) print("Generation cost:\n{}".format(costs.sum())) freqs = numpy.bincount(outputs).astype(floatX) freqs /= freqs.sum() print("Frequencies:\n {} vs {}".format(freqs, ChainDataset.equilibrium)) trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX) for a, b in zip(outputs, outputs[1:]): trans_freqs[a, b] += 1 trans_freqs /= trans_freqs.sum(axis=1)[:, None] print("Transition frequencies:\n{}\nvs\n{}".format( trans_freqs, ChainDataset.trans_prob)) else: assert False
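A small self-contained check of the transition-frequency estimate computed in the sample branch above, on a hand-made two-state sequence.

# Empirical transition frequencies from a generated state sequence, as in the
# "sample" branch above; row i estimates P(next state | current state i).
import numpy

outputs = numpy.array([0, 1, 1, 0, 1, 0, 0, 1])
num_states = 2
trans_freqs = numpy.zeros((num_states, num_states))
for a, b in zip(outputs, outputs[1:]):
    trans_freqs[a, b] += 1
trans_freqs /= trans_freqs.sum(axis=1)[:, None]
print(trans_freqs)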
def evaluate_lm(config, tar_path, part, num_examples, dest_path, **kwargs): c = config if part not in ['valid', 'test_unseen', 'test']: raise ValueError() data, lm, _ = initialize_data_and_model(c) words = T.ltensor3('words') words_mask = T.matrix('words_mask') costs = lm.apply(words, words_mask) cg = Model(costs) with open(tar_path) as src: cg.set_parameter_values(load_parameters(src)) perplexities = VariableFilter(name_regex='perplexity.*')(cg) mask_sums = [p.tag.aggregation_scheme.denominator for p in perplexities] CEs = [p.tag.aggregation_scheme.numerator for p in perplexities] proba_out, = VariableFilter(name='proba_out')(cg) unk_ratios = VariableFilter(name_regex='unk_ratio.*')(cg) #num_definitions, = VariableFilter(name='num_definitions')(cg) print perplexities print CEs print mask_sums name_to_aggregate = [p.name for p in perplexities] for CE, mask_sum, name in zip(CEs, mask_sums, name_to_aggregate): CE.name = name + "_num" mask_sum.name = name + "_denom" compute_l = CEs + mask_sums + unk_ratios if part == 'test_unseen': compute_l.append(proba_out) compute = dict({p.name: p for p in compute_l}) print "to compute:", compute.keys() predict_f = theano.function([words, words_mask], compute) if part == 'test_unseen': batch_size = 1 else: batch_size = 128 # size of test_unseen stream = data.get_stream(part, batch_size=batch_size, max_length=100) raw_data = [] # list of dicts containing the inputs and computed outputs i = 0 print "start computing" for input_data in stream.get_epoch_iterator(as_dict=True): if i and i % 100 == 0: print "iteration:", i words = input_data['words'] words_mask = input_data['words_mask'] to_save = predict_f(words, words_mask) to_save.update(input_data) raw_data.append(to_save) i += 1 # aggregate in the log space aggregated = Counter() sum_mask_track = Counter() for d in raw_data: coef = d['words_mask'].sum() # over timesteps and batches for name in name_to_aggregate: aggregated[name] += d[name + "_num"] sum_mask_track[name] += d[name + "_denom"] for k, v in aggregated.iteritems(): print "k, v, m:", k, v, sum_mask_track[k] aggregated[k] = np.exp(v / sum_mask_track[k]) n_params = sum([np.prod(p.shape.eval()) for p in cg.parameters]) aggregated['n_params'] = n_params print "aggregated stats:", aggregated print "# of parameters {}".format(n_params) #TODO: check that different batch_size yields same validation error than # end of training validation error. # TODO: I think blocks aggreg is simply mean which should break # when we use masks??? investigate if not os.path.exists(dest_path): os.makedirs(dest_path) if part == 'test_unseen': np.savez( os.path.join(dest_path, "predictions"), words=input_data['words'], words_mask=input_data['words_mask'], #unk_ratio = to_save['unk_ratio'], #def_unk_ratio = to_save['def_unk_ratio'], proba_out=to_save['languagemodel_apply_proba_out'], vocab_in=lm._vocab.words[:c['num_input_words']], vocab_out=lm._vocab.words[:c['num_output_words']]) json.dump(aggregated, open(os.path.join(dest_path, "aggregates.json"), "w"), sort_keys=True, indent=2)
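A numeric sketch of the log-space perplexity aggregation used above: cross-entropy numerators and mask denominators are summed over all batches before a single exponentiation.

# Perplexity aggregation sketch: sum per-batch cross-entropy totals (nats) and
# token counts, then exponentiate once at the end, as the Counter loop above does.
import numpy as np

batch_ce_sums = [120.0, 95.0, 30.0]       # sum of token NLLs per batch
batch_token_counts = [100.0, 80.0, 25.0]  # mask sums per batch

perplexity = np.exp(sum(batch_ce_sums) / sum(batch_token_counts))
print(perplexity)  # exp(245 / 205) ~= 3.3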
def train_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed): c = config if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed data, model = initialize_data_and_model(config, train_phase=True) # full main loop can be saved... main_loop_path = os.path.join(save_path, 'main_loop.tar') # or only state (log + params) which can be useful not to pickle embeddings state_path = os.path.join(save_path, 'training_state.tar') stream_path = os.path.join(save_path, 'stream.pkl') best_tar_path = os.path.join(save_path, "best_model.tar") keys = tensor.lmatrix('keys') n_identical_keys = tensor.lvector('n_identical_keys') words = tensor.ltensor3('words') words_mask = tensor.matrix('words_mask') if theano.config.compute_test_value != 'off': #TODO test_value_data = next( data.get_stream('train', batch_size=4, max_length=5).get_epoch_iterator()) words.tag.test_value = test_value_data[0] words_mask.tag.test_value = test_value_data[1] if use_keys(c) and use_n_identical_keys(c): costs = model.apply(words, words_mask, keys, n_identical_keys, train_phase=True) elif use_keys(c): costs = model.apply(words, words_mask, keys, train_phase=True) else: costs = model.apply(words, words_mask, train_phase=True) cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(words.shape[1], 'length') perplexity, = VariableFilter(name='perplexity')(cg) monitored_vars = [length, cost, perplexity] if c['proximity_coef']: proximity_term, = VariableFilter(name='proximity_term')(cg) monitored_vars.append(proximity_term) print "inputs of the model:", cg.inputs parameters = cg.get_parameter_dict() trained_parameters = parameters.values() saved_parameters = parameters.values() if c['embedding_path']: if c['freeze_pretrained']: logger.debug( "Exclude pretrained encoder embeddings from the trained parameters" ) to_freeze = 'main' elif c['provide_targets']: logger.debug( "Exclude pretrained targets from the trained parameters") to_freeze = 'target' trained_parameters = [ p for p in trained_parameters if not p == model.get_def_embeddings_params(to_freeze) ] saved_parameters = [ p for p in saved_parameters if not p == model.get_def_embeddings_params(to_freeze) ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) train_monitored_vars = list(monitored_vars) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) # We use a completely random seed on purpose. With Fuel server # it's currently not possible to restore the state of the training # stream. That's why it's probably better to just have it stateless. 
stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None training_stream = data.get_stream( 'train', batch_size=c['batch_size'], max_length=c['max_length'], seed=stream_seed, remove_keys=not use_keys(c), remove_n_identical_keys=not use_n_identical_keys(c)) print "trainin_stream will contains sources:", training_stream.sources original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) validate = c['mon_freq_valid'] > 0 if validate: valid_stream = data.get_stream( 'valid', batch_size=c['batch_size_valid'], max_length=c['max_length'], seed=stream_seed, remove_keys=not use_keys(c), remove_n_identical_keys=not use_n_identical_keys(c)) validation = DataStreamMonitoring( monitored_vars, valid_stream, prefix="valid").set_conditions(before_first_epoch=not fast_start, on_resumption=True, every_n_batches=c['mon_freq_valid']) track_the_best = TrackTheBest(validation.record_name(cost), choose_best=min).set_conditions( on_resumption=True, after_epoch=True, every_n_batches=c['mon_freq_valid']) # don't save them the entire main loop to avoid pickling everything if c['fast_checkpoint']: cp_path = state_path load = (LoadNoUnpickling(cp_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_main_loop': False, 'save_separately': ['log', 'iteration_state'], 'parameters': saved_parameters } else: cp_path = main_loop_path load = (Load(cp_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_separately': ['iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(cp_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches'] > 0 or c[ 'checkpoint_every_n_epochs'] > 0: intermediate_cp = IntermediateCheckpoint( cp_path, every_n_epochs=c['checkpoint_every_n_epochs'], every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) if validate: checkpoint = checkpoint.add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (best_tar_path, )) extensions = [ load, StartFuelServer(original_training_stream, stream_path, before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']) ] extensions.extend([ TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), ]) if validate: extensions.extend([validation, track_the_best]) extensions.append(checkpoint) if c['checkpoint_every_n_batches'] > 0 or c[ 'checkpoint_every_n_epochs'] > 0: extensions.append(intermediate_cp) extensions.extend( [Printing(on_resumption=True, every_n_batches=c['mon_freq_train'])]) if validate and c['n_valid_early'] > 0: extensions.append( FinishIfNoImprovementAfter(track_the_best.notification_name, iterations=c['n_valid_early'] * c['mon_freq_valid'], every_n_batches=c['mon_freq_valid'])) extensions.append(FinishAfter(after_n_epochs=c['n_epochs'])) logger.info("monitored variables during training:" + "\n" + pprint.pformat(train_monitored_vars, width=120)) logger.info("monitored variables during valid:" + "\n" + pprint.pformat(monitored_vars, width=120)) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
def main(): # ZEROUT_DUMMY_WORD = False ZEROUT_DUMMY_WORD = True ## Load data # mode = 'TRAIN-ALL' #mode = 'TRAIN_DATA' #mode = 'TRAIN_NO_OVERLAP' #if len(sys.argv) > 1: # mode = sys.argv[1] # if not mode in ['TRAIN', 'TRAIN-ALL']: # print "ERROR! The two possible training settings are: ['TRAIN', 'TRAIN-ALL']" # sys.exit(1) mode = 'k_time_data1'.upper() print "Running training in the {} setting".format(mode) position_num = 10 select_model = "PSCM" if select_model == "PSCM": click_model_index = 4 #PSCM elif select_model == "UBM": click_model_index = 1 else: raise "MODEL SELECT ERROR!" data_dir = mode add_train = numpy.load(os.path.join(data_dir, 'train.additions.npy')) q_train = numpy.load(os.path.join(data_dir, 'train.questions.npy')) a_train = numpy.load(os.path.join(data_dir, 'train.answers.npy')) y_train = numpy.load(os.path.join(data_dir, 'train.labels.npy')) add_dev = numpy.load(os.path.join(data_dir, 'dev.additions.npy')) q_dev = numpy.load(os.path.join(data_dir, 'dev.questions.npy')) a_dev = numpy.load(os.path.join(data_dir, 'dev.answers.npy')) #q_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.q_overlap_indices.npy')) #a_overlap_dev = numpy.load(os.path.join(data_dir, 'dev.a_overlap_indices.npy')) y_dev = numpy.load(os.path.join(data_dir, 'dev.labels.npy')) qids_dev = numpy.load(os.path.join(data_dir, 'dev.qids.npy')) add_test = numpy.load(os.path.join(data_dir, 'test.additions.npy')) q_test = numpy.load(os.path.join(data_dir, 'test.questions.npy')) a_test = numpy.load(os.path.join(data_dir, 'test.answers.npy')) #q_overlap_test = numpy.load(os.path.join(data_dir, 'test.q_overlap_indices.npy')) #a_overlap_test = numpy.load(os.path.join(data_dir, 'test.a_overlap_indices.npy')) y_test = numpy.load(os.path.join(data_dir, 'test.labels.npy')) qids_test = numpy.load(os.path.join(data_dir, 'test.qids.npy')) # x_train = numpy.load(os.path.join(data_dir, 'train.overlap_feats.npy')) # x_dev = numpy.load(os.path.join(data_dir, 'dev.overlap_feats.npy')) # x_test = numpy.load(os.path.join(data_dir, 'test.overlap_feats.npy')) # feats_ndim = x_train.shape[1] # from sklearn.preprocessing import StandardScaler # scaler = StandardScaler() # print "Scaling overlap features" # x_train = scaler.fit_transform(x_train) # x_dev = scaler.transform(x_dev) # x_test = scaler.transform(x_test) #multi dim #y_train_tmp = numpy.dstack((y_train, y_train, y_train))[0] #y_dev_tmp = numpy.dstack((y_dev, y_dev, y_dev))[0] #y_test_tmp = numpy.dstack((y_test, y_test, y_test))[0] #y_train = y_train_tmp #y_dev = y_dev_tmp #y_test = y_test_tmp max_query_id = numpy.max([numpy.max(add_train[:, 0]), numpy.max(add_test[:, 0]), numpy.max(add_dev[:, 0])]) max_url_id = numpy.max([numpy.max(add_train[:, 1:]), numpy.max(add_test[:, 1:]), numpy.max(add_dev[:, 1:])]) print 'max_query_id', max_query_id print 'max_url_id', max_url_id print 'y_train', numpy.unique(y_train, return_counts=True) print 'y_dev', numpy.unique(y_dev, return_counts=True) print 'y_test', numpy.unique(y_test, return_counts=True) print 'q_train', q_train.shape print 'q_dev', q_dev.shape print 'q_test', q_test.shape print 'a_train', a_train.shape print 'a_dev', a_dev.shape print 'a_test', a_test.shape ## Get the word embeddings from the nnet trained on SemEval # ndim = 40 # nnet_outdir = 'exp/ndim=60;batch=100;max_norm=0;learning_rate=0.1;2014-12-02-15:53:14' # nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') # params_fname = os.path.join(nnet_outdir, 'best_dev_params.epoch=00;batch=14640;dev_f1=83.12;test_acc=85.00.dat') # train_nnet, test_nnet = 
nn_layers.load_nnet(nnet_fname, params_fname) numpy_rng = numpy.random.RandomState(123) q_max_sent_size = q_train.shape[1] a_max_sent_size = a_train.shape[2] # print 'max', numpy.max(a_train) # print 'min', numpy.min(a_train) #ndim = 5 #print "Generating random vocabulary for word overlap indicator features with dim:", ndim #dummy_word_id = numpy.max(a_overlap_train) # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) #print "Gaussian" #vocab_emb_overlap = numpy_rng.randn(dummy_word_id + 1, ndim) * 0.25 # vocab_emb_overlap = numpy_rng.randn(dummy_word_id+1, ndim) * 0.05 # vocab_emb_overlap = numpy_rng.uniform(-0.25, 0.25, size=(dummy_word_id+1, ndim)) #vocab_emb_overlap[-1] = 0 # Load word2vec embeddings fname = os.path.join(data_dir, 'emb_vectors.skip.1124.4m.10w.npy') print "Loading word embeddings from", fname vocab_emb = numpy.load(fname) ndim = vocab_emb.shape[1] dummpy_word_idx = numpy.max(a_train) print "Word embedding matrix size:", vocab_emb.shape x = T.dmatrix('x') x_q = T.lmatrix('q') #x_q_overlap = T.lmatrix('q_overlap') #x_a = T.lmatrix('a') x_a_all = T.ltensor3('a_all') #x_a_overlap = T.lmatrix('a_overlap') #y = T.ivector('y') y = T.imatrix('y') add_info = T.dmatrix('add_info') ####### n_outs = 2 n_epochs = 15 batch_size = 50 learning_rate = 0.1 max_norm = 0 print 'batch_size', batch_size print 'n_epochs', n_epochs print 'learning_rate', learning_rate print 'max_norm', max_norm ## 1st conv layer. #ndim = vocab_emb.shape[1] + vocab_emb_overlap.shape[1] ndim = vocab_emb.shape[1] ### Nonlinearity type # activation = nn_layers.relu_f activation = T.tanh dropout_rate = 0.5 nkernels = 100 q_k_max = 1 a_k_max = 1 # filter_widths = [3,4,5] q_filter_widths = [5] a_filter_widths = [5] ###### QUESTION ###### lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap]) lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words]) num_input_channels = 1 input_shape = (batch_size, num_input_channels, q_max_sent_size + 2 * (max(q_filter_widths) - 1), ndim) conv_layers = [] for filter_width in q_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=q_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_q = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) #nnet_q.set_input((x_q, x_q_overlap)) nnet_q.set_input([x_q]) ###### ###### ANSWER ###### nnet_a_list = [] #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) for i in xrange(position_num): #lookup_table_words = nn_layers.LookupTableFastStatic(W=vocab_emb, pad=max(q_filter_widths) - 1) #lookup_table_overlap = nn_layers.LookupTableFast(W=vocab_emb_overlap, pad=max(q_filter_widths) - 1) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words, lookup_table_overlap]) #lookup_table = nn_layers.ParallelLookupTable(layers=[lookup_table_words]) # 
num_input_channels = len(lookup_table.layers) #input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) input_shape = (batch_size, num_input_channels, a_max_sent_size + 2 * (max(a_filter_widths) - 1), ndim) conv_layers = [] for filter_width in a_filter_widths: filter_shape = (nkernels, num_input_channels, filter_width, ndim) conv = nn_layers.Conv2dLayer(rng=numpy_rng, filter_shape=filter_shape, input_shape=input_shape) non_linearity = nn_layers.NonLinearityLayer(b_size=filter_shape[0], activation=activation) pooling = nn_layers.KMaxPoolLayer(k_max=a_k_max) conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[conv, non_linearity, pooling]) conv_layers.append(conv2dNonLinearMaxPool) join_layer = nn_layers.ParallelLayer(layers=conv_layers) flatten_layer = nn_layers.FlattenLayer() nnet_a = nn_layers.FeedForwardNet(layers=[ lookup_table, join_layer, flatten_layer, ]) #nnet_a.set_input((x_a, x_a_overlap)) nnet_a.set_input([x_a_all[:, i, :]]) nnet_a_list.append(nnet_a) ####### # print 'nnet_q.output', nnet_q.output.ndim q_logistic_n_in = nkernels * len(q_filter_widths) * q_k_max #a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max a_logistic_n_in = nkernels * len(a_filter_widths) * a_k_max print "q_logistic_n_in, ", q_logistic_n_in print "a_logistic_n_in, ", a_logistic_n_in #pairwise_layer = nn_layers.PositionPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num) pairwise_layer = nn_layers.PositionOnlySimPairwiseNoFeatsLayer(q_in=q_logistic_n_in, a_in=a_logistic_n_in,position=position_num) pairwise_out_list = [nnet_q.output] for i in xrange(position_num): pairwise_out_list.append(nnet_a_list[i].output) pairwise_layer.set_input(pairwise_out_list) #pairwise_layer.set_input((nnet_q.output, nnet_a.output)) # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + a_logistic_n_in # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 50 # n_in = q_logistic_n_in + a_logistic_n_in + feats_ndim + 1 #n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num #n_in = 1 * position_num + position_num * (position_num - 1) / 2 n_in = q_logistic_n_in + a_logistic_n_in * position_num + 1 * position_num + position_num * (position_num - 1) / 2 # n_in = feats_ndim + 1 # n_in = feats_ndim + 50 hidden_layer = nn_layers.LinearLayer(numpy_rng, n_in=n_in, n_out=n_in, activation=activation) hidden_layer.set_input(pairwise_layer.output) #classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs) #classifier.set_input(hidden_layer.output) classifier = nn_layers.FeatureClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num,click_model_index=click_model_index) #classifier = nn_layers.SimpleClickModelLayer(n_in=n_in, n_out=n_outs, max_q_id=max_query_id, max_u_id=max_url_id, dim=position_num) #classifier = nn_layers.MultiDimLogisticRegression(n_in=n_in, n_out=n_outs, dim=position_num) #classifier = nn_layers.LogisticRegression2(n_in=n_in, n_out=n_outs) classifier.set_input([hidden_layer.output, add_info]) #train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q, nnet_a, pairwise_layer, hidden_layer, classifier], # name="Training nnet") train_nnet = nn_layers.FeedForwardNet(layers=[nnet_q] + nnet_a_list + [pairwise_layer, hidden_layer, classifier], name="Training nnet") test_nnet = train_nnet ####### #print train_nnet params = train_nnet.params ts = datetime.now().strftime('%Y-%m-%d-%H.%M.%S') nnet_outdir = 
'exp.multi.out/model={},data={};ndim={};batch={};max_norm={};learning_rate={};{}'.format(select_model,mode, ndim, batch_size, max_norm,learning_rate, ts) if not os.path.exists(nnet_outdir): os.makedirs(nnet_outdir) nnet_fname = os.path.join(nnet_outdir, 'nnet.dat') print "Saving to", nnet_fname cPickle.dump([train_nnet, test_nnet], open(nnet_fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) #total_params = sum([numpy.prod(param.shape.eval()) for param in params]) #print 'Total params number:', total_params cost = train_nnet.layers[-1].training_cost(y) # y_train_counts = numpy.unique(y_train, return_counts=True)[1].astype(numpy.float32) # weights_data = numpy.sum(y_train_counts) / y_train_counts # weights_data_norm = numpy.linalg.norm(weights_data) # weights_data /= weights_data_norm # print 'weights_data', weights_data # weights = theano.shared(weights_data, borrow=True) # cost = train_nnet.layers[-1].training_cost_weighted(y, weights=weights) predictions = test_nnet.layers[-1].y_pred #predictions_prob = test_nnet.layers[-1].p_y_given_x[:, position_num:position_num * 2] predictions_prob = test_nnet.layers[-1].p_y_given_x ### L2 regularization # L2_word_emb = 1e-4 # L2_conv1d = 3e-5 # # L2_softmax = 1e-3 # L2_softmax = 1e-4 # print "Regularizing nnet weights" # for w in train_nnet.weights: # L2_reg = 0. # if w.name.startswith('W_emb'): # L2_reg = L2_word_emb # elif w.name.startswith('W_conv1d'): # L2_reg = L2_conv1d # elif w.name.startswith('W_softmax'): # L2_reg = L2_softmax # elif w.name == 'W': # L2_reg = L2_softmax # print w.name, L2_reg # cost += T.sum(w**2) * L2_reg # batch_x = T.dmatrix('batch_x') batch_x_q = T.lmatrix('batch_x_q') #batch_x_a = T.lmatrix('batch_x_a') batch_x_a_all = T.ltensor3('batch_x_a_all') #batch_x_q_overlap = T.lmatrix('batch_x_q_overlap') #batch_x_a_overlap = T.lmatrix('batch_x_a_overlap') #batch_y = T.ivector('batch_y') batch_y = T.imatrix('batch_y') batch_add_info = T.dmatrix('batch_add_info') # updates = sgd_trainer.get_adagrad_updates(cost, params, learning_rate=learning_rate, max_norm=max_norm, _eps=1e-6) updates = sgd_trainer.get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=max_norm, word_vec_name='W_emb') inputs_pred = [batch_x_q, batch_x_a_all, batch_add_info, #batch_x_q_overlap, #batch_x_a_overlap, # batch_x, ] givens_pred = {x_q: batch_x_q, x_a_all: batch_x_a_all, add_info: batch_add_info, #x_q_overlap: batch_x_q_overlap, #x_a_overlap: batch_x_a_overlap, # x: batch_x } inputs_train = [batch_x_q, batch_x_a_all, #batch_x_q_overlap, #batch_x_a_overlap, # batch_x, batch_add_info, batch_y, ] givens_train = {x_q: batch_x_q, x_a_all: batch_x_a_all, #x_q_overlap: batch_x_q_overlap, #x_a_overlap: batch_x_a_overlap, # x: batch_x, add_info: batch_add_info, y: batch_y} train_fn = theano.function(inputs=inputs_train, outputs=cost, updates=updates, givens=givens_train, on_unused_input='warn') pred_fn = theano.function(inputs=inputs_pred, outputs=predictions, givens=givens_pred, on_unused_input='warn') pred_prob_fn = theano.function(inputs=inputs_pred, outputs=predictions_prob, givens=givens_pred, on_unused_input='warn') def predict_batch(batch_iterator): #preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for # batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator]) preds = numpy.vstack([pred_fn(batch_x_q, batch_x_a, batch_add_info) for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator]) real_preds = preds[:, -1 * position_num:] inner_outputs = preds return 
real_preds[:batch_iterator.n_samples], inner_outputs[:batch_iterator.n_samples] def predict_prob_batch(batch_iterator): #preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap) for # batch_x_q, batch_x_a, batch_x_q_overlap, batch_x_a_overlap, _ in batch_iterator]) preds = numpy.vstack([pred_prob_fn(batch_x_q, batch_x_a, batch_add_info) for batch_x_q, batch_x_a, batch_add_info, _ in batch_iterator]) real_preds = preds[:, -1 * position_num:] inner_outputs = preds return real_preds[:batch_iterator.n_samples], inner_outputs[:batch_iterator.n_samples] train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng, [q_train, a_train, add_train, y_train],batch_size=batch_size, randomize=True) dev_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,[q_dev, a_dev, add_dev, y_dev], batch_size=batch_size, randomize=False) test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(numpy_rng,[q_test, a_test, add_test, y_test], batch_size=batch_size, randomize=False) labels = sorted(numpy.unique(y_test[:, -1])) print 'labels', labels def perplexity_score(labels, preds): positionPerplexity = [0.0] * position_num positionPerplexityClickSkip = [[0.0, 0.0] for i in xrange(position_num)] counts = [0] * position_num countsClickSkip = [[0, 0] for i in xrange(position_num)] for label, pred in zip(labels, preds): for i in range(0, len(label)): click = 1 if label[i] else 0 tmp_pred = max(min(pred[i], 0.99999), 0.00001) logProb = math.log(tmp_pred, 2) if click == 0: logProb = math.log(1 - tmp_pred, 2) positionPerplexity[i] += logProb positionPerplexityClickSkip[i][click] += logProb counts[i] += 1 countsClickSkip[i][click] += 1 positionPerplexity = [2 ** (-x / count if count else x) for (x, count) in zip(positionPerplexity, counts)] positionPerplexityClickSkip = [[2 ** (-x[click] / (count[click] if count[click] else 1) if count else x) \ for (x, count) in zip(positionPerplexityClickSkip, countsClickSkip)] for click in xrange(2)] perplexity = sum(positionPerplexity) / len(positionPerplexity) ret_str = "---------\n" ret_str += "Perplexity\t" + str(perplexity) + "\n" ret_str += "positionPerplexity" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexity[i]) ret_str += "\n" ret_str += "positionPerplexitySkip" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexityClickSkip[0][i]) ret_str += "\n" ret_str += "positionPerplexityClick" for i in range(0, position_num): ret_str += "\t" + str(positionPerplexityClickSkip[1][i]) ret_str += "\n------------\n" #print ret_str return perplexity, ret_str def map_score(qids, labels, preds): qid2cand = defaultdict(list) for qid, label, pred in zip(qids, labels, preds): qid2cand[qid].append((pred, label)) average_precs = [] for qid, candidates in qid2cand.iteritems(): average_prec = 0 running_correct_count = 0 for i, (score, label) in enumerate(sorted(candidates, reverse=True), 1): if label > 0: running_correct_count += 1 average_prec += float(running_correct_count) / i average_precs.append(average_prec / (running_correct_count + 1e-6)) map_score = sum(average_precs) / len(average_precs) return map_score print "Zero out dummy word:", ZEROUT_DUMMY_WORD if ZEROUT_DUMMY_WORD: W_emb_list = [w for w in params if w.name == 'W_emb'] zerout_dummy_word = theano.function([], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list]) # weights_dev = numpy.zeros(len(y_dev)) # weights_dev[y_dev == 0] = weights_data[0] # weights_dev[y_dev == 1] = weights_data[1] # print 
weights_dev best_dev_acc = -numpy.inf best_dev_perp = numpy.inf epoch = 0 timer_train = time.time() no_best_dev_update = 0 num_train_batches = len(train_set_iterator) while epoch < n_epochs: timer = time.time() for i, (x_q, x_a, add, y) in enumerate(tqdm(train_set_iterator), 1): train_fn(x_q, x_a, add, y) # Make sure the null word in the word embeddings always remains zero if ZEROUT_DUMMY_WORD: zerout_dummy_word() if i % 10 == 0 or i == num_train_batches: y_pred_dev, y_inner_dev = predict_prob_batch(dev_set_iterator) #print "shape:" #print str(y_dev.shape) #print str(y_pred_dev.shape) # # dev_acc = map_score(qids_dev, y_dev, predict_prob_batch(dev_set_iterator)) * 100 dev_acc = metrics.roc_auc_score(y_dev[:, -1], y_pred_dev[:, -1]) * 100 dev_perp, dev_perp_str = perplexity_score(y_dev, y_pred_dev) if dev_acc > best_dev_acc: y_pred, y_inner = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100 print('epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}'.format(epoch, i, dev_acc, test_acc, best_dev_acc)) best_dev_acc = dev_acc if dev_perp < best_dev_perp: y_pred, y_inner = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred[:, -1]) * 100 test_perplexity, test_perplexity_str = perplexity_score(y_test, y_pred) print('epoch: {} batch: {} dev auc: {:.4f}; test map: {:.4f}; best_dev_acc: {:.4f}; dev_perp: {:.4f}; best_dev_perp: {:.4f}'.format(epoch, i, dev_acc, test_acc, best_dev_acc, dev_perp, best_dev_perp)) print str(test_perplexity_str) best_params = [numpy.copy(p.get_value(borrow=True)) for p in params] best_inner = y_inner no_best_dev_update = 0 best_dev_perp = dev_perp if no_best_dev_update >= 3: print "Quitting after of no update of the best score on dev set", no_best_dev_update break numpy.savetxt(os.path.join(nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_perp={:.2f}.best_inner.npy'.format(epoch, i, best_dev_perp)), best_inner) print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer)) epoch += 1 no_best_dev_update += 1 print('Training took: {:.4f} seconds'.format(time.time() - timer_train)) for i, param in enumerate(best_params): params[i].set_value(param, borrow=True) y_pred_test, y_inner_test = predict_prob_batch(test_set_iterator) test_acc = map_score(qids_test, y_test[:, -1], y_pred_test[:, -1]) * 100 test_perp, test_perp_str = perplexity_score(y_test, y_pred_test) print "FINAL ACCURACY" print str(test_acc) print "FINAL PERPLEXITY" print str(test_perp_str) fname = os.path.join(nnet_outdir, 'best_dev_params.epoch={:02d};batch={:05d};dev_acc={:.2f}.dat'.format(epoch, i, best_dev_acc)) numpy.savetxt(os.path.join(nnet_outdir, 'test.epoch={:02d};batch={:05d};dev_acc={:.2f}.predictions.npy'.format(epoch, i, best_dev_acc)), y_pred_test) numpy.savetxt(os.path.join(nnet_outdir, 'test.final.epoch={:02d};batch={:05d};dev_acc={:.2f}.best_inner.npy'.format(epoch, i, best_dev_acc)), best_inner) cPickle.dump(best_params, open(fname, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
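Two compact sketches of Theano idioms used throughout the script above: rebinding symbolic inputs with givens, and zeroing the padding embedding row after updates (as zerout_dummy_word does); shapes and names here are illustrative only.

# (1) `givens` rebinds the model's symbolic inputs to per-batch placeholders;
# (2) a parameterless update zeroes the last (padding) row of an embedding matrix.
import numpy
import theano
import theano.tensor as T

x_q = T.lmatrix('q')
batch_x_q = T.lmatrix('batch_x_q')
score = x_q.sum()  # stand-in for the network's cost/prediction
f = theano.function([batch_x_q], score, givens={x_q: batch_x_q})
print(f(numpy.arange(6, dtype='int64').reshape(2, 3)))  # -> 15

W_emb = theano.shared(numpy.ones((5, 4), dtype=theano.config.floatX), name='W_emb')
zerout_dummy_word = theano.function(
    [], updates=[(W_emb, T.set_subtensor(W_emb[-1:], 0.))])
zerout_dummy_word()
print(W_emb.get_value()[-1])  # -> [0. 0. 0. 0.]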