# bi_lstm_cnn_crf: training script for the bi-directional LSTM-CNN-CRF model.
# NOTE: the import block below is an assumption; the original file header is
# not part of this section, so adjust module paths to the repo's actual layout.
import argparse
import time

import theano
import theano.tensor as T
import lasagne

import utils
import data_processor
from networks import build_BiLSTM_CNN_CRF  # assumed location
from crf import crf_loss, crf_accuracy    # assumed location


def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'],
                        help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', default=None, help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'],
                        help='update algorithm', default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"
    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table,
                                                            name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        # [batch, max_sent_length, max_char_length] -> [batch * max_sent_length, max_char_length]
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')
        # -> [batch * max_sent_length, char_embedd_dim, max_char_length] for the 1-d CNN
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        embedd_table, label_alphabet, \
        C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(
            train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding,
            embedding_path=embedding_path, use_character=True)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert max_length == max_sent_length
    assert num_data == num_data_char

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-lstm-cnn-crf
    num_units = args.num_units
    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels,
                                           mask=layer_mask, grad_clipping=grad_clipping, peepholes=peepholes,
                                           num_filters=num_filters, dropout=dropout)
    logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    # output of bi-lstm-cnn-crf: energies of shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True)
    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    _, corr_train = crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper-parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                               [loss_train, corr_train, num_tokens], updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval])

    # Finally, launch the training loop.
    logger.info("Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s "
                "(#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..."
                % (update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune,
                   num_data, batch_size, grad_clipping, peepholes))
    num_batches = num_data / batch_size
    num_epochs = 50
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        logger.info('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave
            # update log
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_inst, train_corr * 100 / train_total, time_left)
            logger.info(log_info)
        # update training log after each epoch
        assert train_inst == num_data
        logger.info('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / num_data, train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                # appends the predictions of this batch (assumes tmp3/ exists)
                utils.output_predictions(predictions, targets, masks, 'tmp3/dev%d' % epoch, label_alphabet,
                                         is_flattened=False)
        logger.info('dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total))
        logger.info('dev_err: %.4f, best_loss: %.4f, best_acc: %.4f, dev_corr: %.4f, dev_total: %.4f, '
                    '(dev_corr/dev_total): %.4f' % (
                        dev_err, best_loss, best_acc, dev_corr, dev_total, dev_corr / dev_total))

        # reset the update flags every epoch, so that stale values from a
        # previous epoch cannot overwrite the best test results (bug fix)
        update_loss = False
        update_acc = False
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

        logger.info('stop_count: %d' % stop_count)
        # stop early when the dev score has not improved for `patience` epochs in a row.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                       [loss_train, corr_train, num_tokens], updates=updates)

        # evaluate on test data; the results are recorded below when a better dev score was detected
        test_err = 0.0
        test_corr = 0.0
        test_total = 0
        test_inst = 0
        for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            test_err += err * inputs.shape[0]
            test_corr += corr
            test_total += num
            test_inst += inputs.shape[0]
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp4/test%d' % epoch, label_alphabet,
                                         is_flattened=False)
        logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total))
        if update_loss:
            best_loss_test_err = test_err
            best_loss_test_corr = test_corr
        if update_acc:
            best_acc_test_err = test_err
            best_acc_test_corr = test_corr

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_inst, best_loss_test_corr, test_total,
        best_loss_test_corr * 100 / test_total))
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_inst, best_acc_test_corr, test_total,
        best_acc_test_corr * 100 / test_total))


if __name__ == '__main__':
    main()
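# ---------------------------------------------------------------------------
# The crf_loss / crf_accuracy helpers used above are imported from the repo and
# are not shown in this section. As a reference for what crf_loss computes, the
# following is a minimal numpy sketch of the negative log-likelihood of a
# linear-chain CRF for a single sequence. It illustrates the standard forward
# recursion, NOT the repo's implementation, and it uses a plain
# emission/transition parameterization rather than the packed
# [length, num_labels, num_labels] energies produced by the network.
import numpy as np


def _logsumexp(x, axis):
    m = np.max(x, axis=axis, keepdims=True)
    return np.squeeze(m, axis=axis) + np.log(np.sum(np.exp(x - m), axis=axis))


def crf_negative_log_likelihood(emit, trans, labels):
    """emit: [T, L] per-position label scores; trans: [L, L] with trans[i, j]
    scoring the transition i -> j; labels: [T] gold label ids."""
    seq_len = emit.shape[0]
    positions = np.arange(seq_len)
    # unnormalized score of the gold path: emissions plus transitions
    gold_score = emit[positions, labels].sum() + trans[labels[:-1], labels[1:]].sum()
    # forward recursion for the log partition function log Z
    alpha = emit[0]  # [L]
    for t in range(1, seq_len):
        alpha = emit[t] + _logsumexp(alpha[:, None] + trans, axis=0)
    log_z = _logsumexp(alpha, axis=0)
    # NLL = log Z - score(gold path); crf_loss above averages this over a batch
    return log_z - gold_score
# ---------------------------------------------------------------------------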
# bi_lstm: training script for the bi-directional LSTM tagger (softmax output
# layer, no CRF). NOTE: the import block below is an assumption; the original
# file header is not part of this section.
import argparse
import sys
import time

import theano
import theano.tensor as T
import lasagne
from lasagne import nonlinearities

import utils
import data_processor
from networks import build_BiLSTM  # assumed location


def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna'], help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', default='data/word2vec/GoogleNews-vectors-negative300.bin',
                        help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov'], help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"
    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table,
                                                            name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    logger = utils.get_logger("BiLSTM")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        embedd_table, label_alphabet, _, _, _, _ = data_processor.load_dataset_sequence_labeling(
            train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding,
            embedding_path=embedding_path)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape

    # construct input and mask layers
    layer_incoming = construct_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-lstm
    num_units = args.num_units
    bi_lstm = build_BiLSTM(layer_incoming, num_units, mask=layer_mask, grad_clipping=grad_clipping,
                           peepholes=peepholes, dropout=dropout)

    # reshape bi-lstm output to [batch * max_length, num_units]
    bi_lstm = lasagne.layers.reshape(bi_lstm, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_lstm, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax, name='softmax')

    # get output of bi-lstm, shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(layer_output)
    prediction_eval = lasagne.layers.get_output(layer_output, deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    # flatten target_var and mask_var to vectors
    target_var_flatten = target_var.flatten()
    mask_var_flatten = mask_var.flatten()

    # compute loss
    num_loss = mask_var_flatten.sum(dtype=theano.config.floatX)
    # for training, we use the mean of the loss over the number of labels
    loss_train = lasagne.objectives.categorical_crossentropy(prediction_train, target_var_flatten)
    loss_train = (loss_train * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(layer_output, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty
    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval, target_var_flatten)
    loss_eval = (loss_eval * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss

    # compute number of correct labels
    corr_train = lasagne.objectives.categorical_accuracy(prediction_train, target_var_flatten)
    corr_train = (corr_train * mask_var_flatten).sum(dtype=theano.config.floatX)
    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval, target_var_flatten)
    corr_eval = (corr_eval * mask_var_flatten).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper-parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(layer_output, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var], [loss_train, corr_train, num_loss],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var],
                              [loss_eval, corr_eval, num_loss, final_prediction])

    # Finally, launch the training loop.
    logger.info("Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s "
                "(#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..."
                % (update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune,
                   num_data, batch_size, grad_clipping, peepholes))
    num_batches = num_data / batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = 5
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, _ = batch
            err, corr, num = train_fn(inputs, targets, masks)
            train_err += err * num
            train_corr += corr
            train_total += num
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave
            # update log: erase the previous progress line, then rewrite it in place
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_total, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        sys.stdout.write("\b" * num_back)
        print('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / train_total, train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            dev_err += err * num
            dev_corr += corr
            dev_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet)
        print('dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_total, dev_corr, dev_total, dev_corr * 100 / dev_total))

        # reset the update flags every epoch, so that stale values cannot
        # overwrite the best test results (bug fix)
        update_loss = False
        update_acc = False
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

        # evaluate on test data when better performance is detected
        test_err = 0.0
        test_corr = 0.0
        test_total = 0
        for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            test_err += err * num
            test_corr += corr
            test_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet)
        print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            test_err / test_total, test_corr, test_total, test_corr * 100 / test_total))
        if update_loss:
            best_loss_test_err = test_err
            best_loss_test_corr = test_corr
        if update_acc:
            best_acc_test_err = test_err
            best_acc_test_corr = test_corr

        # stop early when the dev score has not improved for `patience` epochs in a row.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        lr = learning_rate / (1.0 + epoch * decay_rate)
        updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
        train_fn = theano.function([input_var, target_var, mask_var], [loss_train, corr_train, num_loss],
                                   updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_total, best_loss_test_corr, test_total,
        best_loss_test_corr * 100 / test_total))
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_total, best_acc_test_corr, test_total,
        best_acc_test_corr * 100 / test_total))


if __name__ == '__main__':
    main()
# bi_lstm_bilstm_crf (adversarial): word-level BiLSTM-CRF with a character-level
# BiLSTM encoder, plus optional adversarial training on the embeddings.
# NOTE: the import block below is an assumption; the original file header is
# not part of this section.
import argparse
import os
import subprocess
import sys
import time

import numpy as np
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as Lyrs

import utils
import data_processor
from networks import build_BiLSTM_BiLSTM_CRF, Normalized_EmbeddingLayer, adversarial_loss  # assumed location
from crf import crf_loss, crf_accuracy  # assumed location


def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-BiLSTM-CRF')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random', 'polyglot'],
                        help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', default=None, help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta', 'adam'],
                        help='update algorithm', default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')
    parser.add_argument('--dev')
    parser.add_argument('--test')
    parser.add_argument('--exp_dir')
    parser.add_argument('--adv', type=float, default=0)
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--reload', default=None, help='path for reloading')
    args = parser.parse_args()

    np.random.seed(args.seed)
    lasagne.random.set_rng(np.random)

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = Normalized_EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                        output_size=embedd_dim, vocab_freqs=word_freqs,
                                                        W=embedd_table, name='embedding')
            raw_layer = layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            raw_layer = layer_input
        return raw_layer  # [batch, max_sent_length, embedd_dim]

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        # [batch * max_sent_length, max_char_length]
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        # [n_examples, max_char_length, char_embedd_dim]
        layer_char_embedding = Normalized_EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                         output_size=char_embedd_dim, vocab_freqs=char_freqs,
                                                         W=char_embedd_table, name='char_embedding')
        # no dimshuffle here: the character encoder is a BiLSTM, not a CNN
        return layer_char_embedding

    logger = utils.get_logger("BiLSTM-BiLSTM-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    exp_dir = args.exp_dir
    if not os.path.isdir(exp_dir):
        os.mkdir(exp_dir)
    exp_name = exp_dir.split('/')[-1]
    exp_mode = exp_name.split('_')[0]  # 'pos' or 'ner', etc.
    save_dir = os.path.join(exp_dir, 'save')
    eval_dir = os.path.join(exp_dir, 'eval')
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)
    if not os.path.isdir(eval_dir):
        os.mkdir(eval_dir)
    eval_script = "./conlleval"

    if exp_mode == 'pos':
        (word_col_in_data, label_col_in_data) = (0, 1)
    elif exp_mode == 'ner':
        (word_col_in_data, label_col_in_data) = (0, 3)
    elif exp_mode == 'chunk':
        (word_col_in_data, label_col_in_data) = (0, 2)
    else:
        (word_col_in_data, label_col_in_data) = (1, 3)  # assume CoNLL-U style

    # load data
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        (embedd_table, word_freqs), label_alphabet, \
        C_train, C_dev, C_test, (char_embedd_table, char_freqs) = data_processor.load_dataset_sequence_labeling(
            train_path, dev_path, test_path, word_col_in_data, label_col_in_data, label_name=exp_mode,
            oov=oov, fine_tune=True, embedding=embedding, embedding_path=embedding_path, use_character=True)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert max_length == max_sent_length
    assert num_data == num_data_char

    # prepare initial input layers and embeddings
    char_layer = construct_char_input_layer()
    word_layer = construct_input_layer()
    char_emb = Lyrs.get_output(char_layer)
    word_emb = Lyrs.get_output(word_layer)

    # construct input and mask layers; the embeddings are fed in through these
    # input layers so the loss can also be evaluated on perturbed embeddings
    char_in_layer = Lyrs.InputLayer(shape=(None, max_char_length, char_embedd_dim))
    word_in_layer = Lyrs.InputLayer(shape=(None, max_length, embedd_dim))
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bilstm_bilstm_crf
    num_units = args.num_units
    num_filters = args.num_filters
    logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters))
    bilstm_bilstm_crf = build_BiLSTM_BiLSTM_CRF(char_in_layer, word_in_layer, num_units, num_labels,
                                                mask=layer_mask, grad_clipping=grad_clipping,
                                                peepholes=peepholes, num_filters=num_filters, dropout=dropout)

    # compute loss
    def loss_from_embedding(char_emb, word_emb, deterministic=False, return_all=True):
        # output of bilstm-bilstm-crf: energies of shape [batch, length, num_labels, num_labels]
        energies = Lyrs.get_output(bilstm_bilstm_crf,
                                   inputs={char_in_layer: char_emb, word_in_layer: word_emb},
                                   deterministic=deterministic)
        loss = crf_loss(energies, target_var, mask_var).mean()
        if return_all:
            predict, corr = crf_accuracy(energies, target_var)
            corr = (corr * mask_var).sum(dtype=theano.config.floatX)
            return loss, predict, corr
        else:
            return loss

    loss_eval, prediction_eval, corr_eval = loss_from_embedding(char_emb, word_emb, deterministic=True)
    loss_train_ori, _, corr_train = loss_from_embedding(char_emb, word_emb)

    if args.adv:
        logger.info('Preparing adversarial training...')
        loss_train_adv = adversarial_loss(char_emb, word_emb, loss_from_embedding, loss_train_ori,
                                          perturb_scale=args.adv)
        loss_train = (loss_train_ori + loss_train_adv) / 2.0
    else:
        loss_train_adv = T.as_tensor_variable(np.asarray(0.0, dtype=theano.config.floatX))
        loss_train = loss_train_ori + loss_train_adv

    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(bilstm_bilstm_crf,
                                                                      lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    # Create update expressions for training.
    # hyper-parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = Lyrs.get_all_params(bilstm_bilstm_crf, trainable=True) + \
        Lyrs.get_all_params(char_layer, trainable=True) + \
        Lyrs.get_all_params(word_layer, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                               [loss_train_ori, loss_train_adv, corr_train, num_tokens], updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval])

    # reload saved model
    if args.reload is not None:
        logger.info('Reloading saved parameters from %s ...\n' % args.reload)
        with np.load(args.reload) as f:
            param_values = [f['arr_%d' % j] for j in range(len(f.files))]
        Lyrs.set_all_param_values(word_layer, param_values[0:1])
        Lyrs.set_all_param_values(char_layer, param_values[1:2])
        Lyrs.set_all_param_values(bilstm_bilstm_crf, param_values[2:])

    # Finally, launch the training loop.
    logger.info("Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s "
                "(#training data: %d, batch size: %d, clip: %.1f, peepholes: %s) ..."
                % (update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune,
                   num_data, batch_size, grad_clipping, peepholes))
    num_batches = num_data / batch_size
    num_epochs = 1000
    best_acc = np.array([0.0, 0.0, 0.0])
    best_epoch_acc = np.array([0, 0, 0])
    best_acc_test_err = np.array([0.0, 0.0, 0.0])
    best_acc_test_corr = np.array([0.0, 0.0, 0.0])
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print
        print 'Epoch %d (learning rate=%.7f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err_ori = 0.0
        train_err_adv = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        train_batches = 0
        epoch_save_dir = os.path.join(save_dir, 'epoch%d' % epoch)
        os.mkdir(epoch_save_dir)
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err_ori, err_adv, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err_ori += err_ori * inputs.shape[0]
            train_err_adv += err_adv * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave
            # update log roughly ten times per epoch (the max() guards tiny datasets)
            if train_batches % max(1, num_batches // 10) == 0:
                log_info = 'train: %d/%d L_ori: %.4f, L_adv: %.4f, acc: %.2f%%, time left: %.2fs\n' % (
                    min(train_batches * batch_size, num_data), num_data,
                    train_err_ori / train_inst, train_err_adv / train_inst,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()

        # save the parameter values
        param_values = Lyrs.get_all_param_values(word_layer) + Lyrs.get_all_param_values(char_layer) + \
            Lyrs.get_all_param_values(bilstm_bilstm_crf)
        np.savez(epoch_save_dir + '/final.npz', *param_values)

        # update training log after each epoch
        assert train_inst == num_data
        print 'train: %d/%d L_ori: %.4f, L_adv: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err_ori / train_inst, train_err_adv / train_inst,
            train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                output_file = eval_dir + '/dev%d' % epoch
                utils.output_predictions(predictions, targets, masks, output_file, label_alphabet,
                                         is_flattened=False)
        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total)

        update_acc = False
        if best_acc.min() > dev_corr / dev_total:
            stop_count += 1
        else:
            stop_count = 0
            if best_acc.min() < dev_corr / dev_total:
                update_acc = True
                idx_to_update = best_acc.argmin()
                best_acc[idx_to_update] = dev_corr / dev_total
                best_epoch_acc[idx_to_update] = epoch

        # evaluate on test data
        test_err = 0.0
        test_corr = 0.0
        test_total = 0
        test_inst = 0
        for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            test_err += err * inputs.shape[0]
            test_corr += corr
            test_total += num
            test_inst += inputs.shape[0]
            if output_predict:
                output_file = eval_dir + '/test%d' % epoch
                utils.output_predictions(predictions, targets, masks, output_file, label_alphabet,
                                         is_flattened=False)

        # print out the test result
        if stop_count > 0:
            print '(cf.',
        print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total),
        if output_predict and exp_mode in ['ner', 'chunk']:
            stdout = subprocess.check_output([eval_script], stdin=open(output_file))
            f1_score = stdout.split("\n")[1].split()[7]  # note: this is a string
            print ", f1:", f1_score
        else:
            print
        sys.stdout.flush()
        if update_acc:
            best_acc_test_err[idx_to_update] = test_err
            best_acc_test_corr[idx_to_update] = test_corr

        # stop early when the dev accuracy has not improved for `patience` epochs in a row.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        if update_algo not in ['adam', 'adadelta']:
            if decay_rate >= 0:
                lr = learning_rate / (1.0 + epoch * decay_rate)
            else:
                # a negative decay_rate selects halving on plateaus instead
                if stop_count > 0 and stop_count % 3 == 0:
                    learning_rate /= 2.0
                lr = learning_rate
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                       [loss_train_ori, loss_train_adv, corr_train, num_tokens],
                                       updates=updates)

    # print best performance on test data.
    for i in range(len(best_epoch_acc)):
        logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc[i])
        print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            best_acc_test_err[i] / test_inst, best_acc_test_corr[i], test_total,
            best_acc_test_corr[i] * 100 / test_total)


if __name__ == '__main__':
    main()
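# ---------------------------------------------------------------------------
# The adversarial_loss helper imported by the script above is not shown in this
# section. A common construction for adversarial training on embeddings
# (Miyato et al., 2017: perturb the embeddings along the gradient of the clean
# loss, treating the perturbation as a constant) that is consistent with the
# call adversarial_loss(char_emb, word_emb, loss_from_embedding, loss_train_ori,
# perturb_scale=args.adv) would look like the sketch below. This is an
# assumption about the helper, not the repo's actual code.
import theano
import theano.tensor as T


def adversarial_loss_sketch(char_emb, word_emb, loss_fn, clean_loss, perturb_scale):
    # gradients of the clean loss w.r.t. the two embedding tensors
    g_char, g_word = theano.grad(clean_loss, [char_emb, word_emb])
    # stop gradients from flowing through the perturbation itself
    g_char = theano.gradient.disconnected_grad(g_char)
    g_word = theano.gradient.disconnected_grad(g_word)

    def l2_perturb(g):
        # global L2 normalization; per-example normalization is a common variant
        return perturb_scale * g / (T.sqrt(T.sum(g ** 2)) + 1e-12)

    # CRF loss re-evaluated on the adversarially perturbed embeddings
    return loss_fn(char_emb + l2_perturb(g_char), word_emb + l2_perturb(g_word),
                   return_all=False)
# ---------------------------------------------------------------------------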
# bi_rnn: training script for the bi-directional RNN tagger.
# NOTE: the import block below is an assumption; the original file header is
# not part of this section.
import argparse
import sys
import time

import theano
import theano.tensor as T
import lasagne
from lasagne import nonlinearities

import utils
import data_processor
from networks import build_BiRNN  # assumed location


def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional RNN')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna'], help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', default='data/word2vec/GoogleNews-vectors-negative300.bin',
                        help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in RNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov'], help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"
    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table,
                                                            name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    logger = utils.get_logger("BiRNN")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        embedd_table, label_alphabet, _, _, _, _ = data_processor.load_dataset_sequence_labeling(
            train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding,
            embedding_path=embedding_path)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape

    # construct input and mask layers
    layer_incoming = construct_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-rnn
    num_units = args.num_units
    bi_rnn = build_BiRNN(layer_incoming, num_units, mask=layer_mask, grad_clipping=grad_clipping,
                         dropout=dropout)

    # reshape bi-rnn output to [batch * max_length, num_units]
    bi_rnn = lasagne.layers.reshape(bi_rnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_rnn, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax, name='softmax')

    # get output of bi-rnn, shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(layer_output)
    prediction_eval = lasagne.layers.get_output(layer_output, deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    # flatten target_var and mask_var to vectors
    target_var_flatten = target_var.flatten()
    mask_var_flatten = mask_var.flatten()

    # compute loss
    num_loss = mask_var_flatten.sum(dtype=theano.config.floatX)
    # for training, we use the mean of the loss over the number of labels
    loss_train = lasagne.objectives.categorical_crossentropy(prediction_train, target_var_flatten)
    loss_train = (loss_train * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(layer_output, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty
    # dima regularization?
    # if regular == 'dima':
    #     params_regular = utils.get_all_params_by_name(layer_output, name=['forward.hidden_to_hidden.W',
    #                                                                       'backward.hidden_to_hidden.W'])
    #     dima_penalty = lasagne.regularization.apply_penalty(params_regular, dima)
    #     loss_train = loss_train + gamma * dima_penalty
    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval, target_var_flatten)
    loss_eval = (loss_eval * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss

    # compute number of correct labels
    corr_train = lasagne.objectives.categorical_accuracy(prediction_train, target_var_flatten)
    corr_train = (corr_train * mask_var_flatten).sum(dtype=theano.config.floatX)
    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval, target_var_flatten)
    corr_eval = (corr_eval * mask_var_flatten).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper-parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(layer_output, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var], [loss_train, corr_train, num_loss],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var],
                              [loss_eval, corr_eval, num_loss, final_prediction])

    # Finally, launch the training loop.
    logger.info("Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s "
                "(#training data: %d, batch size: %d, clip: %.1f)..."
                % (update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune,
                   num_data, batch_size, grad_clipping))
    num_batches = num_data / batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = 5
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, _ = batch
            err, corr, num = train_fn(inputs, targets, masks)
            train_err += err * num
            train_corr += corr
            train_total += num
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave
            # update log: erase the previous progress line, then rewrite it in place
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_total, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / train_total, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            dev_err += err * num
            dev_corr += corr
            dev_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet)
        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_total, dev_corr, dev_total, dev_corr * 100 / dev_total)

        # reset the update flags every epoch, so that stale values cannot
        # overwrite the best test results (bug fix)
        update_loss = False
        update_acc = False
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

        # evaluate on test data when better performance is detected
        test_err = 0.0
        test_corr = 0.0
        test_total = 0
        for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            test_err += err * num
            test_corr += corr
            test_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet)
        print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            test_err / test_total, test_corr, test_total, test_corr * 100 / test_total)
        if update_loss:
            best_loss_test_err = test_err
            best_loss_test_corr = test_corr
        if update_acc:
            best_acc_test_err = test_err
            best_acc_test_corr = test_corr

        # stop early when the dev score has not improved for `patience` epochs in a row.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        lr = learning_rate / (1.0 + epoch * decay_rate)
        updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
        train_fn = theano.function([input_var, target_var, mask_var], [loss_train, corr_train, num_loss],
                                   updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_total, best_loss_test_corr, test_total,
        best_loss_test_corr * 100 / test_total)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_total, best_acc_test_corr, test_total,
        best_acc_test_corr * 100 / test_total)


if __name__ == '__main__':
    main()
# Refactored bi-directional RNN training script: argument parsing and logging
# are factored into helpers (read_args, log_start, log_loss, safe_mkdir) whose
# definitions are not included in this section. NOTE: the import block below is
# an assumption; the original file header is not part of this section.
import time

import theano
import theano.tensor as T
import lasagne
from lasagne import nonlinearities

import utils
import data_processor
from networks import build_BiRNN  # assumed location
from helpers import read_args, log_start, log_loss, safe_mkdir  # assumed location


def main():
    args = read_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table,
                                                            name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    logger = utils.get_logger("BiRNN")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, \
        mask_test, embedd_table, label_alphabet, _, _, _, _ = data_processor.load_dataset_sequence_labeling(
            train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding,
            embedding_path=embedding_path)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape

    # construct input and mask layers
    layer_incoming = construct_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-rnn
    num_units = args.num_units
    bi_rnn = build_BiRNN(layer_incoming, num_units, mask=layer_mask, grad_clipping=grad_clipping,
                         dropout=dropout)

    # reshape bi-rnn output to [batch * max_length, num_units]
    bi_rnn = lasagne.layers.reshape(bi_rnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_rnn, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax, name='softmax')

    # get output of bi-rnn, shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(layer_output)
    prediction_eval = lasagne.layers.get_output(layer_output, deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    # flatten target_var and mask_var to vectors
    target_var_flatten = target_var.flatten()
    mask_var_flatten = mask_var.flatten()

    # compute loss
    num_loss = mask_var_flatten.sum(dtype=theano.config.floatX)
    # for training, we use the mean of the loss over the number of labels
    loss_train = lasagne.objectives.categorical_crossentropy(prediction_train, target_var_flatten)
    loss_train = (loss_train * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(layer_output, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty
    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval, target_var_flatten)
    loss_eval = (loss_eval * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss

    # compute number of correct labels
    corr_train = lasagne.objectives.categorical_accuracy(prediction_train, target_var_flatten)
    corr_train = (corr_train * mask_var_flatten).sum(dtype=theano.config.floatX)
    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval, target_var_flatten)
    corr_eval = (corr_eval * mask_var_flatten).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper-parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(layer_output, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var], [loss_train, corr_train, num_loss],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var],
                              [loss_eval, corr_eval, num_loss, final_prediction])

    # Finally, launch the training loop.
    log_start(batch_size, dropout, fine_tune, gamma, grad_clipping, logger, num_data, regular, update_algo)
    num_batches = num_data / batch_size
    num_epochs = args.epochs
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = 5
    safe_mkdir('tmp')
    for epoch in range(1, num_epochs + 1):
        logger.info('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, _ = batch
            err, corr, num = train_fn(inputs, targets, masks)
            train_err += err * num
            train_corr += corr
            train_total += num
            train_batches += 1
        # update training log after each epoch
        print('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / train_total, train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            dev_err += err * num
            dev_corr += corr
            dev_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet)
        log_loss('dev', dev_corr, dev_err, dev_total, logger)

        # reset the update flags every epoch, so that stale values cannot
        # overwrite the best test results (bug fix)
        update_loss = False
        update_acc = False
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

        # evaluate on test data when better performance is detected
        test_err = 0.0
        test_corr = 0.0
        test_total = 0
        for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            test_err += err * num
            test_corr += corr
            test_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet)
        log_loss('test', test_corr, test_err, test_total, logger)
        if update_loss:
            best_loss_test_err = test_err
            best_loss_test_corr = test_corr
        if update_acc:
            best_acc_test_err = test_err
            best_acc_test_corr = test_corr

        # stop early when the dev score has not improved for `patience` epochs in a row.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        lr = learning_rate / (1.0 + epoch * decay_rate)
        updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
        train_fn = theano.function([input_var, target_var, mask_var], [loss_train, corr_train, num_loss],
                                   updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    log_loss('best loss in test', corr=best_loss_test_corr, error=best_loss_test_err, total=test_total,
             logger=logger)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    log_loss('best accuracy in test', corr=best_acc_test_corr, error=best_acc_test_err, total=test_total,
             logger=logger)

    # Log the final predictions.
    # Compile a third function evaluating the final predictions only
    predict_fn = theano.function([input_var, mask_var], [final_prediction], allow_input_downcast=True)
    predictions = predict_fn(X_test, mask_test)[0]
    # bug fix: the gold labels for the test predictions are Y_test (the original passed Y_dev)
    utils.output_predictions(predictions, Y_test, mask_test, 'tmp/final_test', label_alphabet)


if __name__ == '__main__':
    main()
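# ---------------------------------------------------------------------------
# The read_args / log_start / log_loss / safe_mkdir helpers used by the
# refactored script above are not shown. From their call sites, log_loss
# presumably formats the same evaluation line the other scripts print, and
# safe_mkdir creates a directory if it does not exist. Hypothetical
# definitions, for illustration only:
import os


def log_loss(name, corr, error, total, logger):
    logger.info('%s loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        name, error / total, corr, total, corr * 100 / total))


def safe_mkdir(path):
    if not os.path.isdir(path):
        os.makedirs(path)
# ---------------------------------------------------------------------------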
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', default=None, help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table,
                                                            name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim),
                                                    input_var=input_var, name='input')
            return layer_input

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        embedd_table, label_alphabet, \
        C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(
            train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune,
            embedding=embedding, embedding_path=embedding_path, use_character=True)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-lstm-cnn-crf
    num_units = args.num_units
    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels,
                                           mask=layer_mask, grad_clipping=grad_clipping,
                                           peepholes=peepholes, num_filters=num_filters, dropout=dropout)
    logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    # get output of bi-lstm-cnn-crf, shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True)
    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf,
                                                                      lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    _, corr_train = crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # Hyper-parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                               [loss_train, corr_train, num_tokens], updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s "
        "(#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..."
        % (update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune,
           num_data, batch_size, grad_clipping, peepholes))

    num_batches = num_data / batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience

    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave
            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_inst, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)

        # update training log after each epoch
        assert train_inst == num_data
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / num_data, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch,
                                         label_alphabet, is_flattened=False)
        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total)

        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance is detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            test_inst = 0
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
                test_err += err * inputs.shape[0]
                test_corr += corr
                test_total += num
                test_inst += inputs.shape[0]
                if output_predict:
                    utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch,
                                             label_alphabet, is_flattened=False)
            print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
                test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total)
            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # stop if the dev accuracy has not improved for `patience` epochs in a row
        if stop_count == patience:
            break

        # re-compile a training function with the new (decayed) learning rate
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                       [loss_train, corr_train, num_tokens], updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_inst, best_loss_test_corr, test_total,
        best_loss_test_corr * 100 / test_total)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_inst, best_acc_test_corr, test_total,
        best_acc_test_corr * 100 / test_total)
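# Both training loops above rely on utils.iterate_minibatches to yield
# (inputs, targets, masks, char_inputs) tuples. The real implementation lives
# in utils.py; this is only a minimal sketch of a compatible helper, under the
# assumption that it shuffles on request and drops any final partial batch
# (the actual helper may pad or yield the remainder instead):
import numpy as np

def iterate_minibatches_sketch(inputs, targets, masks=None, char_inputs=None,
                               batch_size=10, shuffle=False):
    assert len(inputs) == len(targets)
    indices = np.arange(len(inputs))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(inputs) - batch_size + 1, batch_size):
        excerpt = indices[start:start + batch_size]
        yield (inputs[excerpt], targets[excerpt],
               None if masks is None else masks[excerpt],
               None if char_inputs is None else char_inputs[excerpt])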
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', default=None, help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    # note: with store_true the original default='true' made this flag effectively
    # always on; default=True preserves that behavior with a proper boolean
    parser.add_argument('--output_prediction', default=True, action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"
    parser.add_argument('--model')  # model name

    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table,
                                                            name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim),
                                                    input_var=input_var, name='input')
            return layer_input

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout
    modelname = args.model

    # load the training, dev, and test sets
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        embedd_table, label_alphabet, word_alphabet, \
        C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(
            train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune,
            embedding=embedding, embedding_path=embedding_path, use_character=True)

    print 'label_alphabet'
    for i in range(label_alphabet.size()):
        print i
        print label_alphabet.get_instance(i)
    # print Y_test, Y_test.shape; sys.exit(1)

    my_size = data_processor.MAX_LENGTH_TRAIN
    my_size = data_processor.MY_MAX_LENGTH
    print "\tMY_SIZE", my_size, data_processor.MAX_LENGTH_TRAIN
    # my_size = data_processor.MAX_LENGTH_DEV
    print "\tMYSIZE", my_size

    num_labels = label_alphabet.size() - 1

    # build the network
    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    logger.info("construct input and mask layers...")
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-lstm-cnn-crf
    logger.info("construct bi-lstm-cnn-crf...")
    num_units = args.num_units
    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels,
                                           mask=layer_mask, grad_clipping=grad_clipping,
                                           peepholes=peepholes, num_filters=num_filters, dropout=dropout)
    logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    # get output of bi-lstm-cnn-crf, shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True)
    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    # print loss_train; sys.exit(1)
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf,
                                                                      lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    _, corr_train = crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # Hyper-parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                               [loss_train, corr_train, num_tokens], updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s "
        "(#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..."
        % (update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune,
           num_data, batch_size, grad_clipping, peepholes))

    num_batches = num_data / batch_size
    num_epochs = 1000
    # num_epochs = 1
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience

    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave
            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_inst, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)

        # update training log after each epoch
        assert train_inst == num_data
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / num_data, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        test_err_sentence = 0
        my_f1 = {}
        my_prs = []
        my_trs = []
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            # print "-->HERE COMES THE PREDS", predictions
            # print predictions.shape, type(predictions)
            # collect the unpadded gold/predicted labels for macro-F1
            for i in xrange(batch_size):
                try:
                    input_clear = [word_alphabet.get_instance(y) for y in list(inputs[i, :])]
                except IndexError:
                    continue
                if len(input_clear) == 0:
                    continue
                try:
                    dev_size = input_clear.index(None)
                except ValueError:
                    dev_size = my_size
                # print dev_size
                my_trs += list(targets[i, :dev_size])
                my_prs += list(predictions[i, :dev_size])
                # for j in xrange(len(targets[i, :dev_size])):
                #     pr = predictions[i, j]
                #     tr = targets[i, j]
                #     my_f1[(pr, tr)] = my_f1.get((pr, tr), 0) + 1
            # input_clear = [word_alphabet.get_instance(y) for y in list(inputs[0, :])]
            # print input_clear
            # my_f1 = f1_score(my_trs, my_prs, average="macro")
            # print [label_alphabet.get_instance(y + 1) for y in list(targets[0, :])]
            # print targets[0, :], predictions[0, :], inputs.shape[0], my_f1; sys.exit(1)
            # print err, inputs.shape[0]
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch,
                                         label_alphabet, is_flattened=False)

        dev_f1 = f1_score(my_trs, my_prs, average="macro")
        classify_report = metrics.classification_report(my_trs, my_prs)
        print 'dev classify_report'
        print classify_report
        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%, f1: %.4f' % (
            dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total, dev_f1)

        # Model selection criterion on dev (SE); change if necessary:
        # macro-F1 (useF1) or token accuracy, optionally loss-based (useLoss)
        useF1 = True
        useLoss = False
        criterion = dev_f1 if useF1 else dev_corr / dev_total
        if best_loss < dev_err and best_acc > criterion:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < criterion:
                update_acc = True
                best_acc = criterion
                best_epoch_acc = epoch
            else:
                # no improvement on the dev criterion; skip the test evaluation
                # unless model selection is done on the loss
                if not useLoss:
                    continue

            # evaluate on test data when better performance is detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            test_inst = 0
            test_err_sentence = 0
            test_sentences = 0
            print "#SAVING MODEL"
            np.savez(modelname, *lasagne.layers.get_all_param_values(bi_lstm_cnn_crf))
            test_prs = []
            test_trs = []
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
                # print "-->HERE COMES THE PREDS", predictions
                # print predictions.shape, type(predictions)
                for i in xrange(batch_size):
                    try:
                        input_clear = [word_alphabet.get_instance(y) for y in list(inputs[i, :])]
                    except IndexError:
                        continue
                    if len(input_clear) == 0:
                        continue
                    try:
                        test_size = input_clear.index(None)
                    except ValueError:
                        test_size = my_size
                    test_trs += list(targets[i, :test_size])
                    test_prs += list(predictions[i, :test_size])

                # sentence-level diagnostics (SE)
                # print "AAA", inputs[0], len(inputs[0]), inputs[0][0]
                input_clear = [word_alphabet.get_instance(y) for x in inputs for y in list(x)]
                target_clear = [label_alphabet.get_instance(y + 1) for x in targets for y in list(x)]
                target_clear_pred = [label_alphabet.get_instance(y + 1) for x in predictions for y in list(x)]
                # print my_size
                # my_size = 652  # comment this out
                # print input_clear; sys.exit(1)
                for ii in range(batch_size):
                    Z = input_clear[ii * my_size:(ii + 1) * my_size]
                    if len(Z) == 0:
                        continue
                    try:
                        size = Z.index(None)
                    except ValueError:
                        size = my_size
                    # print size
                    itruth = input_clear[ii * my_size:(ii + 1) * my_size][:size]
                    EMPTY = "EMPTY"
                    EMPTY = "EMPTY_EMPTY"  # the padding label actually used
                    otruth = filter(lambda z: z != EMPTY, target_clear[ii * my_size:(ii + 1) * my_size][:size])
                    opred = filter(lambda z: z != EMPTY, target_clear_pred[ii * my_size:(ii + 1) * my_size][:size])
                    if otruth == opred:
                        # counts exact-match sentences, despite the name
                        test_err_sentence += 1
                        # print "CORRECT", itruth, otruth, opred
                        print "#CORRECT %%%", len(itruth), len(opred), len(otruth)
                        printout(itruth, opred, otruth)
                        print
                    else:
                        print "#WRONG %%%"  # , itruth, otruth, opred
                        printout(itruth, opred, otruth)
                        print
                    test_sentences += 1

                test_err += err * inputs.shape[0]
                test_corr += corr
                test_total += num
                test_inst += inputs.shape[0]
                if output_predict:
                    utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch,
                                             label_alphabet, is_flattened=False)

            test_f1 = f1_score(test_trs, test_prs, average="macro")
            test_classify_report = metrics.classification_report(test_trs, test_prs)
            print 'label_alphabet'
            for i in range(label_alphabet.size()):
                print i
                print label_alphabet.get_instance(i)
            print 'Epoch %d test classify_report' % epoch
            print test_classify_report
            print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%% f1: %.4f' % (
                test_err / test_inst, test_corr, test_total,
                test_corr * 100 / test_total, test_f1), \
                test_err_sentence * 1.0 / test_sentences, test_err_sentence, test_sentences
            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # stop if the dev criterion has not improved for `patience` epochs in a row
        if stop_count == patience:
            break

        # re-compile a training function with the new (decayed) learning rate
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                       [loss_train, corr_train, num_tokens], updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_inst, best_loss_test_corr, test_total,
        best_loss_test_corr * 100 / test_total)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_inst, best_acc_test_corr, test_total,
        best_acc_test_corr * 100 / test_total)
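# The variant above checkpoints the network with np.savez whenever the dev
# criterion improves, and the evaluation-only variant below restores it with
# set_all_param_values. A minimal sketch of that save/load round trip, using
# the same Lasagne/NumPy calls as the scripts (save_model and load_model are
# hypothetical wrapper names, not part of the original code):
import numpy as np
import lasagne

def save_model(network, path):
    # one 'arr_%d' entry per parameter tensor, in get_all_param_values order
    np.savez(path, *lasagne.layers.get_all_param_values(network))

def load_model(network, path):
    with np.load(path) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(network, param_values)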
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', default=None, help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"
    parser.add_argument('--realtest')
    parser.add_argument('--mymodel')

    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table,
                                                            name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim),
                                                    input_var=input_var, name='input')
            return layer_input

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    real_test_path = args.realtest
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout
    mymodel = args.mymodel
    print "Model is", mymodel, test_path, real_test_path

    # load twice: once to build the alphabets/tables from the usual splits,
    # and once more to read the real test set
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
        _X_real_test, _Y_real_test, _mask_real_test, \
        embedd_table, label_alphabet, word_alphabet, \
        C_train, C_dev, C_test, _C_real_test, char_embedd_table = \
        data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path, test_path,
                                                      oov=oov, fine_tune=fine_tune, embedding=embedding,
                                                      embedding_path=embedding_path, use_character=True)
    _X_train, _Y_train, _mask_train, _X_dev, _Y_dev, _mask_dev, _X_test, _Y_test, _mask_test, \
        X_real_test, Y_real_test, mask_real_test, \
        _embedd_table, _label_alphabet, _word_alphabet, \
        _C_train, _C_dev, _C_test, C_real_test, _char_embedd_table = \
        data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path, real_test_path,
                                                      oov=oov, fine_tune=fine_tune, embedding=embedding,
                                                      embedding_path=embedding_path, use_character=True)
    # print _C_train.shape, _C_dev.shape, _C_test.shape, C_real_test.shape
    # sys.exit(1)

    my_size = data_processor.MAX_LENGTH_TRAIN
    my_size = data_processor.MY_MAX_LENGTH
    # my_size = data_processor.MAX_LENGTH_DEV
    print "\tMYSIZE", my_size, C_real_test.shape, C_test.shape, C_train.shape

    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-lstm-cnn-crf
    num_units = args.num_units
    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels,
                                           mask=layer_mask, grad_clipping=grad_clipping,
                                           peepholes=peepholes, num_filters=num_filters, dropout=dropout)
    # bi_lstm_cnn_crf = None
    logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    # get output of bi-lstm-cnn-crf, shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True)
    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf,
                                                                      lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    _, corr_train = crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # Hyper-parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                               [loss_train, corr_train, num_tokens], updates=updates)
    # Compile a second function evaluating the loss and accuracy of the network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval])
    # decode-only function: predictions without gold targets
    my_prediction_eval = my_crf_accuracy(energies_eval)
    my_eval_fn = theano.function([input_var, mask_var, char_input_var], [my_prediction_eval])

    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s "
        "(#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..."
        % (update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune,
           num_data, batch_size, grad_clipping, peepholes))

    num_batches = num_data / batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience

    print "#LOADING MODEL"
    # np.savez("model.npz", *lasagne.layers.get_all_param_values(bi_lstm_cnn_crf))
    # just load the parameters, see:
    # https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py
    # note: the original re-read sys.argv[1] here, which clashes with the
    # argparse flags above; use the --mymodel argument parsed earlier instead
    # try:
    #     mymodel = sys.argv[1]
    # except IndexError:
    #     mymodel = "models.npz"
    with np.load(mymodel) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(bi_lstm_cnn_crf, param_values)

    correct = 0
    total = 0
    print dir(bi_lstm_cnn_crf)
    # print bi_lstm_cnn_crf.predict([1, 2, 3, 4])
    # sys.exit(1)
    print X_real_test.shape, Y_real_test.shape, C_real_test.shape, mask_real_test.shape
    # that's a stupid hack
    # C_real_test = C_real_test[:len(X_real_test)]
    # print X_real_test[0:1]
    # print my_eval_fn(X_real_test[0:1], mask_real_test[0:1], C_real_test[0:1])
    # sys.exit(1)

    for batch in utils.iterate_minibatches(X_real_test, Y_real_test, masks=mask_real_test,
                                           char_inputs=C_real_test, batch_size=batch_size):
        inputs, targets, masks, char_inputs = batch
        # print inputs, targets, masks, char_inputs; sys.exit(1)
        err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
        # print predictions
        # SE
        # print "AAA", inputs[0], len(inputs[0]), inputs[0][0]
        input_clear = [word_alphabet.get_instance(y) for x in inputs for y in list(x)]
        target_clear = [label_alphabet.get_instance(y + 1) for x in targets for y in list(x)]
        target_clear_pred = [label_alphabet.get_instance(y + 1) for x in predictions for y in list(x)]
        # print my_size
        # my_size = 557  # comment this out
        # print input_clear; sys.exit(1)
        for ii in range(batch_size):
            Z = input_clear[ii * my_size:(ii + 1) * my_size]
            if len(Z) == 0:
                continue
            try:
                size = Z.index(None)
            except ValueError:
                size = my_size
            # print size
            itruth = input_clear[ii * my_size:(ii + 1) * my_size][:size]
            otruth = filter(lambda z: z != "EMPTY", target_clear[ii * my_size:(ii + 1) * my_size][:size])
            opred = filter(lambda z: z != "EMPTY", target_clear_pred[ii * my_size:(ii + 1) * my_size][:size])
            total += len(opred)
            # token-level matches (assumes len(otruth) == len(opred) after filtering)
            correct += len(filter(lambda x: x == True,
                                  [otruth[jj] == opred[jj] for jj in xrange(len(opred))]))
            if otruth == opred:
                # test_err_sentence += 1
                # print "CORRECT", itruth, otruth, opred
                # print "#CORRECT %%%"
                printout(itruth, opred, otruth)
                print
            else:
                # print "#WRONG %%%"  # , itruth, otruth, opred
                printout(itruth, opred, otruth)
                print

    print correct, total, correct * 1.0 / total
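# The evaluation loop above accumulates a token-level accuracy (correct/total)
# and checks exact sentence matches (otruth == opred). A compact sketch of the
# same two metrics over a list of (gold, predicted) label-sequence pairs;
# sequence_metrics is a hypothetical helper, not part of the original scripts:
def sequence_metrics(pairs):
    """Return (token accuracy, exact-match sentence accuracy)."""
    token_correct = token_total = sent_correct = 0
    for gold, pred in pairs:
        token_total += len(pred)
        token_correct += sum(1 for g, p in zip(gold, pred) if g == p)
        sent_correct += int(gold == pred)
    return (token_correct * 1.0 / max(token_total, 1),
            sent_correct * 1.0 / max(len(pairs), 1))

# e.g. sequence_metrics([(['B', 'I', 'O'], ['B', 'O', 'O'])]) -> (0.666..., 0.0)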