def evaluate_perplexity(self, data, nll_func):
    nll_preds = []
    nll_masks = []
    for idbs, idts in data:
        nll, mask = nll_func(idbs, idts)
        assert nll.shape == mask.shape
        nll_preds.append(nll)
        nll_masks.append(mask)
    avg_nll = evaluate_average(predictions=nll_preds, masks=nll_masks)
    return np.exp(avg_nll)
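# Illustrative sketch (not part of the original file): the masked averaging and
# exponentiation that evaluate_perplexity relies on, written out with plain
# numpy. It assumes evaluate_average computes sum(nll * mask) / sum(mask); the
# helper name below is hypothetical.
def _perplexity_from_batches(nll_batches, mask_batches):
    total_nll = sum((nll * mask).sum() for nll, mask in zip(nll_batches, mask_batches))
    total_tokens = sum(mask.sum() for mask in mask_batches)
    avg_nll = total_nll / float(total_tokens)  # mean per-token negative log-likelihood
    return np.exp(avg_nll)                     # perplexity = exp(average NLL)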
def train(self, args, train, dev, test=None):
    embedding_layer = self.layers[0]
    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)

    batch_size = args["batch_size"]
    unroll_size = args["unroll_size"]

    train = create_batches(train, embedding_layer.map_to_ids, batch_size)
    dev = create_batches(dev, embedding_layer.map_to_ids, batch_size)
    if test is not None:
        test = create_batches(test, embedding_layer.map_to_ids, batch_size)

    # NLL summed over the unrolled window, averaged over sequences in the batch
    cost = T.sum(self.nll) / self.idxs.shape[1]
    updates, lr, gnorm = create_optimization_updates(
        cost=cost,
        params=self.params,
        lr=args["learning_rate"],
        beta1=args["beta1"],
        beta2=args["beta2"],
        rho=args["rho"],
        momentum=args["momentum"],
        gamma=args["gamma"],
        eps=args["eps"],
        method=args["learning"])[:3]
    #if args["learning"] == "adadelta":
    #    lr.set_value(args["learning_rate"])

    train_func = theano.function(
        inputs=[self.idxs, self.idys, self.init_state],
        outputs=[cost, self.last_state, gnorm],
        updates=updates)
    eval_func = theano.function(
        inputs=[self.idxs, self.idys, self.init_state],
        outputs=[self.nll, self.last_state])

    # number of truncated-BPTT windows per epoch
    N = (len(train[0]) - 1) / unroll_size + 1
    say(" train: {} tokens, {} mini-batches\n".format(
        len(train[0].ravel()), N))
    say(" dev: {} tokens\n".format(len(dev[0].ravel())))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    decay_lr = args["decay_lr"] and args["learning"].lower() != "adadelta" and \
        args["learning"].lower() != "adagrad"
    lr_0 = args["learning_rate"]
    iter_cnt = 0

    unchanged = 0
    best_dev = 1e+10
    start_time = 0
    max_epoch = args["max_epoch"]
    for epoch in xrange(max_epoch):
        if unchanged > 5:
            break
        start_time = time.time()

        # recurrent state (size 2*n_d) carried across unrolled windows
        prev_state = np.zeros((batch_size, self.n_d * 2),
                              dtype=theano.config.floatX)

        train_loss = 0.0
        for i in xrange(N):
            # get current batch
            x = train[0][i * unroll_size:(i + 1) * unroll_size]
            y = train[1][i * unroll_size:(i + 1) * unroll_size]
            iter_cnt += 1
            if decay_lr:
                lr.set_value(np.float32(lr_0 / iter_cnt**0.5))

            cur_loss, prev_state, grad_norm = train_func(x, y, prev_state)
            train_loss += cur_loss / len(x)
            if math.isnan(cur_loss) or math.isnan(grad_norm):
                say("\nNaN !!\n")
                return
            if i % 10 == 0:
                say("\r{}".format(i))

            if i == N - 1:
                # end of epoch: evaluate on dev with dropout disabled
                self.dropout.set_value(0.0)
                dev_preds = self.evaluate(eval_func, dev, batch_size, unroll_size)
                dev_loss = evaluate_average(predictions=dev_preds, masks=None)
                dev_ppl = np.exp(dev_loss)
                self.dropout.set_value(dropout_prob)

                say("\r\n")
                say(("Epoch={} lr={:.3f} train_loss={:.3f} train_ppl={:.1f} "
                     "dev_loss={:.3f} dev_ppl={:.1f}\t|g|={:.3f}\t[{:.1f}m]\n").format(
                        epoch,
                        float(lr.get_value(borrow=True)),
                        train_loss / N,
                        np.exp(train_loss / N),
                        dev_loss,
                        dev_ppl,
                        float(grad_norm),
                        (time.time() - start_time) / 60.0))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                # halve the learning rate
                #if args["learning"] == "sgd" and dev_ppl > best_dev-1:
                #    lr.set_value(np.max([lr.get_value()/2.0, np.float32(0.0001)]))

                if dev_ppl < best_dev:
                    best_dev = dev_ppl
                    if test is None:
                        continue
                    # new best dev perplexity: also evaluate on test
                    self.dropout.set_value(0.0)
                    test_preds = self.evaluate(eval_func, test, batch_size, unroll_size)
                    test_loss = evaluate_average(predictions=test_preds, masks=None)
                    test_ppl = np.exp(test_loss)
                    self.dropout.set_value(dropout_prob)
                    say("\tbest_dev={:.1f} test_loss={:.3f} test_ppl={:.1f}\n"
                        .format(best_dev, test_loss, test_ppl))

                if best_dev > 200:
                    unchanged += 1

        say("\n")
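# Illustrative sketch (not part of the original file): the hyper-parameter keys
# that the train() method above reads from `args`. The values are placeholders
# chosen for the example, not the repository's defaults.
example_args = {
    "dropout": 0.5,           # output dropout probability
    "batch_size": 32,
    "unroll_size": 35,        # truncated-BPTT window length
    "learning": "sgd",        # optimizer name passed to create_optimization_updates
    "learning_rate": 1.0,
    "beta1": 0.9,
    "beta2": 0.999,           # Adam-style coefficients
    "rho": 0.95,              # used by adadelta/rmsprop-style methods
    "momentum": 0.0,
    "gamma": 0.95,
    "eps": 1e-8,
    "decay_lr": True,         # enables the lr_0 / sqrt(iter_cnt) schedule
    "max_epoch": 50,
}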
def train(self, args, train, dev, test=None):
    embedding_layer = self.layers[-2]
    dropout_prob = np.float64(args["dropout"]).astype(theano.config.floatX)
    rnn_dropout_prob = np.float64(args["rnn_dropout"]).astype(
        theano.config.floatX)
    batch_size = args["batch_size"]
    unroll_size = args["unroll_size"]

    train = create_batches(train, embedding_layer.map_to_ids, batch_size)
    dev = create_batches(dev, embedding_layer.map_to_ids, 1)
    if test is not None:
        test = create_batches(test, embedding_layer.map_to_ids, 1)

    # NLL summed over the unrolled window, averaged over sequences in the batch
    cost = T.sum(self.nll) / self.idxs.shape[1]
    updates, lr, gnorm = create_optimization_updates(
        cost=cost,
        params=self.params,
        lr=args["learning_rate"],
        eps=args["eps"],
        method=args["learning"])[:3]

    # init_state / last_state are lists here: one state vector per recurrent
    # layer and direction, passed as separate function inputs / outputs
    train_func = theano.function(
        inputs=[self.idxs, self.idys] + self.init_state,
        outputs=[cost, gnorm] + self.last_state,
        updates=updates)
    eval_func = theano.function(
        inputs=[self.idxs, self.idys] + self.init_state,
        outputs=[self.nll] + self.last_state)

    # number of truncated-BPTT windows per epoch
    N = (len(train[0]) - 1) / unroll_size + 1
    say(" train: {} tokens, {} mini-batches\n".format(
        len(train[0].ravel()), N))
    say(" dev: {} tokens\n".format(len(dev[0].ravel())))
    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

    decay_epoch = args["lr_decay_epoch"]
    decay_rate = args["lr_decay"]
    lr_0 = args["learning_rate"]
    iter_cnt = 0

    depth = args["depth"]
    unchanged = 0
    best_dev = 1e+10
    start_time = 0
    max_epoch = args["max_epoch"]
    for epoch in xrange(max_epoch):
        unchanged += 1
        if unchanged > 20:
            break

        # step decay: multiply the learning rate by decay_rate once per epoch
        # after decay_epoch
        if decay_epoch > 0 and epoch >= decay_epoch:
            lr.set_value(np.float32(lr.get_value() * decay_rate))

        start_time = time.time()

        # two recurrent state vectors per layer, carried across windows
        prev_state = [
            np.zeros((batch_size, self.n_d), dtype=theano.config.floatX)
            for i in xrange(depth * 2)
        ]

        train_loss = 0.0
        for i in xrange(N):
            # get current batch
            x = train[0][i * unroll_size:(i + 1) * unroll_size]
            y = train[1][i * unroll_size:(i + 1) * unroll_size]
            iter_cnt += 1

            ret = train_func(x, y, *prev_state)
            cur_loss, grad_norm, prev_state = ret[0], ret[1], ret[2:]
            train_loss += cur_loss / len(x)
            if i % 10 == 0:
                say("\r{}".format(i))

            if i == N - 1:
                # end of epoch: evaluate on dev with both dropouts disabled
                self.dropout.set_value(0.0)
                self.rnn_dropout.set_value(0.0)
                dev_preds = self.evaluate(eval_func, dev, 1, unroll_size)
                dev_loss = evaluate_average(predictions=dev_preds, masks=None)
                dev_ppl = np.exp(dev_loss)
                self.dropout.set_value(dropout_prob)
                self.rnn_dropout.set_value(rnn_dropout_prob)

                say("\r\n")
                say(("Epoch={} lr={:.4f} train_loss={:.3f} train_ppl={:.1f} "
                     "dev_loss={:.3f} dev_ppl={:.1f}\t|g|={:.3f}\t[{:.1f}m]\n").format(
                        epoch,
                        float(lr.get_value(borrow=True)),
                        train_loss / N,
                        np.exp(train_loss / N),
                        dev_loss,
                        dev_ppl,
                        float(grad_norm),
                        (time.time() - start_time) / 60.0))
                say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                if dev_ppl < best_dev:
                    best_dev = dev_ppl
                    if test is None:
                        continue
                    # new best dev perplexity: also evaluate on test
                    self.dropout.set_value(0.0)
                    self.rnn_dropout.set_value(0.0)
                    test_preds = self.evaluate(eval_func, test, 1, unroll_size)
                    test_loss = evaluate_average(predictions=test_preds, masks=None)
                    test_ppl = np.exp(test_loss)
                    self.dropout.set_value(dropout_prob)
                    self.rnn_dropout.set_value(rnn_dropout_prob)
                    say("\tbest_dev={:.1f} test_loss={:.3f} test_ppl={:.1f}\n"
                        .format(best_dev, test_loss, test_ppl))

                # reset the early-stopping counter while dev ppl stays below 200
                if best_dev < 200:
                    unchanged = 0

        say("\n")
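# Illustrative sketch (not part of the original file): the step-decay schedule
# implemented inside the epoch loop above, written as a standalone function.
# The rate stays at lr_0 until epoch lr_decay_epoch, then is multiplied by
# lr_decay once per epoch. The function name is hypothetical.
def _lr_at_epoch(lr_0, decay_rate, decay_epoch, epoch):
    if decay_epoch <= 0 or epoch < decay_epoch:
        return lr_0
    return lr_0 * decay_rate ** (epoch - decay_epoch + 1)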