def main():
    """Train and evaluate a bi-directional RNN-CNN tagger on CoNLL-03 style NER data.

    The function parses the command line, builds the alphabets and the word
    embedding table, constructs a ``BiRecurrentConv`` (``--dropout std``) or
    ``BiVarRecurrentConv`` (any other accepted value) network and trains it
    with Nesterov-momentum SGD.

    After every epoch the dev set predictions are written to a file under
    ``tmp/`` and scored with ``evaluate``.  Whenever the dev F1 improves, the
    test set is written and scored the same way.  Every ``--schedule`` epochs
    the learning rate is decayed and the optimizer is rebuilt.

    ``--schedule`` is mandatory and must be non-zero: it is formatted with
    ``%d`` in the per-epoch banner and used as a modulus, so a missing or
    zero value could only crash after all data had been loaded.
    """
    parser = argparse.ArgumentParser(description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', required=True)
    parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--hidden_size', type=int, default=128, help='Number of hidden units in RNN')
    parser.add_argument('--tag_space', type=int, default=0, help='Dimension of tag space')
    parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN')
    parser.add_argument('--num_filters', type=int, default=30, help='Number of filters in CNN')
    parser.add_argument('--char_dim', type=int, default=30, help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization')
    parser.add_argument('--dropout', choices=['std', 'variational'], help='type of dropout', required=True)
    parser.add_argument('--p_rnn', nargs=2, type=float, required=True, help='dropout rate for RNN')
    parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input embeddings')
    parser.add_argument('--p_out', type=float, default=0.33, help='dropout rate for output layer')
    parser.add_argument('--schedule', type=int, required=True, help='schedule for learning rate decay')
    parser.add_argument('--unk_replace', type=float, default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding', choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()
    if args.schedule == 0:
        # `epoch % schedule` below would raise ZeroDivisionError.
        parser.error('--schedule must be non-zero')

    logger = get_logger("NER")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    embedding = args.embedding
    embedding_path = args.embedding_dict

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet = \
        conll03_data.create_alphabets("data/alphabets/ner/", train_path,
                                      data_paths=[dev_path, test_path],
                                      embedd_dict=embedd_dict,
                                      max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = conll03_data.read_data_to_variable(train_path, word_alphabet, char_alphabet,
                                                    pos_alphabet, chunk_alphabet, ner_alphabet,
                                                    use_gpu=use_gpu)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data_to_variable(dev_path, word_alphabet, char_alphabet,
                                                  pos_alphabet, chunk_alphabet, ner_alphabet,
                                                  use_gpu=use_gpu, volatile=True)
    data_test = conll03_data.read_data_to_variable(test_path, word_alphabet, char_alphabet,
                                                   pos_alphabet, chunk_alphabet, ner_alphabet,
                                                   use_gpu=use_gpu, volatile=True)
    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        """Build the (vocab, embedd_dim) float32 table of initial word vectors.

        Words found in ``embedd_dict`` (exact match first, then lower-cased)
        take their pretrained vector; all others get a uniform random vector
        in ``[-scale, scale]`` and are counted as out-of-vocabulary.
        """
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                vector = embedd_dict[word]
            elif word.lower() in embedd_dict:
                vector = embedd_dict[word.lower()]
            else:
                vector = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = vector
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform

    # Both network classes are called with the same arguments; only the
    # dropout flavour selects which class is instantiated.
    network_cls = BiRecurrentConv if args.dropout == 'std' else BiVarRecurrentConv
    network = network_cls(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
                          num_filters, window, mode, hidden_size, num_layers, num_labels,
                          tag_space=tag_space, embedd_word=word_table,
                          p_in=p_in, p_out=p_out, p_rnn=p_rnn, initializer=initializer)
    if use_gpu:
        network.cuda()

    lr = learning_rate
    optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    logger.info("Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d" % (
        mode, num_layers, hidden_size, num_filters, tag_space))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (
        gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" % (p_in, p_out, p_rnn))

    # Floor division: true division yields a float under Python 3, which
    # range() below rejects.
    num_batches = num_data // batch_size + 1

    def write_and_score(data, split, epoch):
        """Write predictions for ``data`` to ``tmp/<uid>_<split><epoch>`` and score the file.

        Returns whatever ``evaluate`` returns for that file, unpacked by the
        callers as ``(acc, precision, recall, f1)``.
        """
        tmp_filename = 'tmp/%s_%s%d' % (str(uid), split, epoch)
        writer.start(tmp_filename)
        for batch in conll03_data.iterate_batch_variable(data, batch_size):
            word, char, pos, chunk, labels, masks, lengths = batch
            _, _, preds = network.loss(word, char, labels, mask=masks, length=lengths,
                                       leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(),
                         chunk.data.cpu().numpy(), preds.data.cpu().numpy(),
                         labels.data.cpu().numpy(), lengths.cpu().numpy())
        writer.close()
        return evaluate(tmp_filename)

    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' % (
            epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths = conll03_data.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss, corr, _ = network.loss(word, char, labels, mask=masks, length=lengths,
                                         leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optim.step()

            num_tokens = masks.data.sum()
            # The loss is multiplied by the token count here and divided by
            # the running token total when reported.
            train_err += loss.data[0] * num_tokens
            train_corr += corr.data[0]
            train_total += num_tokens

            # update log
            if batch % 100 == 0:
                time_ave = (time.time() - start_time) / batch
                time_left = (num_batches - batch) * time_ave
                # Erase the previous progress line in place before rewriting it.
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            num_batches, train_err / train_total, train_corr * 100 / train_total,
            time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        acc, precision, recall, f1 = write_and_score(data_dev, 'dev', epoch)
        print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (
            acc, precision, recall, f1))

        if dev_f1 < f1:
            dev_f1 = f1
            dev_acc = acc
            dev_precision = precision
            dev_recall = recall
            best_epoch = epoch

            # evaluate on test data when better performance detected
            test_acc, test_precision, test_recall, test_f1 = write_and_score(data_test, 'test', epoch)

        print("best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        print("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            test_acc, test_precision, test_recall, test_f1, best_epoch))

        if epoch % schedule == 0:
            # The optimizer is rebuilt with the decayed rate, as in the
            # original; this also discards its accumulated momentum state.
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
def main():
    """Train and evaluate a bi-directional RNN-CNN POS tagger on CoNLL-X style data.

    The function parses the command line, loads the GloVe embedding
    dictionary from a fixed path, builds the alphabets and the word embedding
    table, constructs a ``BiRecurrentConv`` (``--dropout std``) or
    ``BiVarRecurrentConv`` network and trains it with Nesterov-momentum SGD.

    After every epoch the number of correct dev predictions is counted.
    Whenever that count improves, the test set is counted too.  Every
    ``--schedule`` epochs the learning rate is decayed and the optimizer is
    rebuilt.

    ``--schedule`` is mandatory and must be non-zero: it is formatted with
    ``%d`` in the per-epoch banner and used as a modulus.
    """
    parser = argparse.ArgumentParser(description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', required=True)
    parser.add_argument('--num_epochs', type=int, default=1000, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--hidden_size', type=int, default=128, help='Number of hidden units in RNN')
    parser.add_argument('--num_filters', type=int, default=30, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization')
    parser.add_argument('--dropout', choices=['std', 'variational'], help='type of dropout', required=True)
    parser.add_argument('--p', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--schedule', type=int, required=True, help='schedule for learning rate decay')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()
    if args.schedule == 0:
        # `epoch % schedule` below would raise ZeroDivisionError.
        parser.error('--schedule must be non-zero')

    logger = get_logger("POSTagger")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    p = args.p

    embedd_dict, embedd_dim = utils.load_word_embedding_dict('glove', "data/glove/glove.6B/glove.6B.100d.gz")
    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, type_alphabet = \
        conllx_data.create_alphabets("data/alphabets/pos/", train_path,
                                     data_paths=[dev_path, test_path],
                                     max_vocabulary_size=50000,
                                     embedd_dict=embedd_dict)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = conllx_data.read_data_to_variable(train_path, word_alphabet, char_alphabet,
                                                   pos_alphabet, type_alphabet, use_gpu=use_gpu)
    num_data = sum(data_train[1])
    num_labels = pos_alphabet.size()

    data_dev = conllx_data.read_data_to_variable(dev_path, word_alphabet, char_alphabet,
                                                 pos_alphabet, type_alphabet, use_gpu=use_gpu)
    data_test = conllx_data.read_data_to_variable(test_path, word_alphabet, char_alphabet,
                                                  pos_alphabet, type_alphabet, use_gpu=use_gpu)

    def construct_word_embedding_table():
        """Build the (vocab, embedd_dim) float32 table of initial word vectors.

        Words found in ``embedd_dict`` (exact match first, then lower-cased)
        take their pretrained vector; all others get a uniform random vector
        in ``[-scale, scale]`` and are counted as out-of-vocabulary.
        """
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conllx_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                vector = embedd_dict[word]
            elif word.lower() in embedd_dict:
                vector = embedd_dict[word.lower()]
            else:
                vector = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = vector
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = 30
    window = 3
    num_layers = 1

    # Both network classes are called with the same arguments; only the
    # dropout flavour selects which class is instantiated.
    network_cls = BiRecurrentConv if args.dropout == 'std' else BiVarRecurrentConv
    network = network_cls(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
                          num_filters, window, mode, hidden_size, num_layers, num_labels,
                          embedd_word=word_table, p_rnn=p)
    if use_gpu:
        network.cuda()

    lr = learning_rate
    optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    logger.info("Network: %s, num_layer=%d, hidden=%d, filter=%d" % (
        mode, num_layers, hidden_size, num_filters))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, dropout: %.2f)" % (
        gamma, num_data, batch_size, p))

    # Floor division: true division yields a float under Python 3, which
    # range() below rejects.
    num_batches = num_data // batch_size + 1

    def count_correct(data):
        """Return ``(correct, total)`` accumulated over every batch of ``data``.

        ``correct`` sums the second value returned by ``network.loss``;
        ``total`` sums the mask entries of each batch.
        """
        correct = 0.0
        total = 0
        for batch in conllx_data.iterate_batch_variable(data, batch_size):
            word, char, labels, _, _, masks, _ = batch
            _, corr, _ = network.loss(word, char, labels, mask=masks,
                                      leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
            correct += corr.data[0]
            total += masks.data.sum()
        return correct, total

    dev_correct = 0.0
    best_epoch = 0
    test_correct = 0.0
    test_total = 0
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (%d)): ' % (
            epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, labels, _, _, masks, lengths = conllx_data.get_batch_variable(data_train, batch_size)

            optim.zero_grad()
            loss, corr, _ = network.loss(word, char, labels, mask=masks,
                                         leading_symbolic=conllx_data.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optim.step()

            num_tokens = masks.data.sum()
            # The loss is multiplied by the token count here and divided by
            # the running token total when reported.
            train_err += loss.data[0] * num_tokens
            train_corr += corr.data[0]
            train_total += num_tokens

            # update log
            if batch % 100 == 0:
                time_ave = (time.time() - start_time) / batch
                time_left = (num_batches - batch) * time_ave
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                # Flush so the progress line is pushed out now rather than
                # whenever the stdout buffer happens to fill.
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            epoch * num_batches, train_err / train_total, train_corr * 100 / train_total,
            time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        dev_corr, dev_total = count_correct(data_dev)
        print('dev corr: %d, total: %d, acc: %.2f%%' % (dev_corr, dev_total, dev_corr * 100 / dev_total))

        if dev_correct < dev_corr:
            dev_correct = dev_corr
            best_epoch = epoch

            # evaluate on test data when better performance detected
            test_correct, test_total = count_correct(data_test)

        print("best dev  corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            dev_correct, dev_total, dev_correct * 100 / dev_total, best_epoch))
        print("best test corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            test_correct, test_total, test_correct * 100 / test_total, best_epoch))

        # `schedule` is an int, so the original `epoch in schedule` raised
        # TypeError; decay every `schedule` epochs instead.
        if epoch % schedule == 0:
            # The optimizer is rebuilt with the decayed rate, as in the
            # original; this also discards its accumulated momentum state.
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
def main():
    """Train and evaluate a bi-directional RNN-CNN (optionally CRF) NER tagger.

    Model hyper-parameters are read from the JSON file given by ``--config``
    and a copy is written to ``<model_path>/config.json``.  The network class
    is selected by the ``dropout`` (``std`` / ``variational``) and ``crf``
    entries of that file.

    After every epoch the dev set is scored with ``eval``.  When the dev F1
    improves, the network state dict is saved to ``<model_path>/model.pt``
    and the test set is scored as well.  After more than four consecutive
    epochs without improvement, ``scheduler.reset_state()`` is called and the
    patience counter is cleared.

    Raises:
        ValueError: if the embedding dimension of the loaded embeddings does
            not match ``embedd_dim`` in the config, or if the config names an
            unknown dropout type.
    """
    parser = argparse.ArgumentParser(description='NER with bi-directional RNN-CNN')
    parser.add_argument('--config', type=str, help='config file', required=True)
    parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--loss_type', choices=['sentence', 'token'], default='sentence',
                        help='loss type (default: sentence)')
    parser.add_argument('--optim', choices=['sgd', 'adam'], help='type of optimizer', required=True)
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--lr_decay', type=float, default=0.999995, help='Decay rate of learning rate')
    parser.add_argument('--amsgrad', action='store_true', help='AMS Grad')
    parser.add_argument('--grad_clip', type=float, default=0,
                        help='max norm for gradient clip (default 0: no clip')
    parser.add_argument('--warmup_steps', type=int, default=0, metavar='N',
                        help='number of steps to warm up (default: 0)')
    parser.add_argument('--weight_decay', type=float, default=0.0, help='weight for l2 norm decay')
    parser.add_argument('--unk_replace', type=float, default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding', choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--train', help='path for training file.', required=True)
    parser.add_argument('--dev', help='path for dev file.', required=True)
    parser.add_argument('--test', help='path for test file.', required=True)
    parser.add_argument('--model_path', help='path for saving model file.', required=True)

    args = parser.parse_args()

    logger = get_logger("NER")

    args.cuda = torch.cuda.is_available()
    device = torch.device('cuda', 0) if args.cuda else torch.device('cpu')
    train_path = args.train
    dev_path = args.dev
    test_path = args.test

    num_epochs = args.num_epochs
    batch_size = args.batch_size
    optim = args.optim
    learning_rate = args.learning_rate
    lr_decay = args.lr_decay
    amsgrad = args.amsgrad
    warmup_steps = args.warmup_steps
    weight_decay = args.weight_decay
    grad_clip = args.grad_clip

    loss_ty_token = args.loss_type == 'token'
    unk_replace = args.unk_replace

    model_path = args.model_path
    model_name = os.path.join(model_path, 'model.pt')
    embedding = args.embedding
    embedding_path = args.embedding_dict

    print(args)

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    logger.info("Creating Alphabets")
    alphabet_path = os.path.join(model_path, 'alphabets')
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet = \
        conll03_data.create_alphabets(alphabet_path, train_path,
                                      data_paths=[dev_path, test_path],
                                      embedd_dict=embedd_dict,
                                      max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")

    data_train = conll03_data.read_bucketed_data(train_path, word_alphabet, char_alphabet,
                                                 pos_alphabet, chunk_alphabet, ner_alphabet)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data(dev_path, word_alphabet, char_alphabet,
                                      pos_alphabet, chunk_alphabet, ner_alphabet)
    data_test = conll03_data.read_data(test_path, word_alphabet, char_alphabet,
                                       pos_alphabet, chunk_alphabet, ner_alphabet)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        """Build the (vocab, embedd_dim) float32 table of initial word vectors.

        Words found in ``embedd_dict`` (exact match first, then lower-cased)
        take their pretrained vector; all others get a uniform random vector
        in ``[-scale, scale]`` and are counted as out-of-vocabulary.
        """
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                vector = embedd_dict[word]
            elif word.lower() in embedd_dict:
                vector = embedd_dict[word.lower()]
            else:
                vector = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = vector
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()

    logger.info("constructing network...")

    # Context managers close both handles deterministically, so the copied
    # config is fully written to disk before training starts.
    with open(args.config, 'r') as config_file:
        hyps = json.load(config_file)
    with open(os.path.join(model_path, 'config.json'), 'w') as config_copy:
        json.dump(hyps, config_copy, indent=2)

    dropout = hyps['dropout']
    crf = hyps['crf']
    bigram = hyps['bigram']
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if embedd_dim != hyps['embedd_dim']:
        raise ValueError('embedd_dim mismatch: embeddings have %s, config has %s' % (
            embedd_dim, hyps['embedd_dim']))
    char_dim = hyps['char_dim']
    mode = hyps['rnn_mode']
    hidden_size = hyps['hidden_size']
    out_features = hyps['out_features']
    num_layers = hyps['num_layers']
    p_in = hyps['p_in']
    p_out = hyps['p_out']
    p_rnn = hyps['p_rnn']
    activation = hyps['activation']

    if dropout == 'std':
        network_cls = BiRecurrentConvCRF if crf else BiRecurrentConv
    elif dropout == 'variational':
        network_cls = BiVarRecurrentConvCRF if crf else BiVarRecurrentConv
    else:
        raise ValueError('Unkown dropout type: {}'.format(dropout))

    # Only the CRF variants are given the `bigram` option.
    crf_kwargs = {'bigram': bigram} if crf else {}
    network = network_cls(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
                          mode, hidden_size, out_features, num_layers, num_labels,
                          embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn,
                          activation=activation, **crf_kwargs)

    network = network.to(device)

    optimizer, scheduler = get_optimizer(network.parameters(), optim, learning_rate, lr_decay,
                                         amsgrad, weight_decay, warmup_steps)
    model = "{}-CNN{}".format(mode, "-CRF" if crf else "")
    logger.info("Network: %s, num_layer=%d, hidden=%d, act=%s" % (model, num_layers, hidden_size, activation))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (
        weight_decay, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): %s(%.2f, %.2f, %s)" % (dropout, p_in, p_out, p_rnn))
    print('# of Parameters: %d' % (sum([param.numel() for param in network.parameters()])))

    best_f1 = 0.0
    best_acc = 0.0
    best_precision = 0.0
    best_recall = 0.0

    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0

    best_epoch = 0
    patient = 0
    num_batches = num_data // batch_size + 1
    result_path = os.path.join(model_path, 'tmp')
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    for epoch in range(1, num_epochs + 1):
        start_time = time.time()
        train_loss = 0.
        num_insts = 0
        num_words = 0
        num_back = 0
        network.train()
        lr = scheduler.get_lr()[0]
        print('Epoch %d (%s, lr=%.6f, lr decay=%.6f, amsgrad=%s, l2=%.1e): ' % (
            epoch, optim, lr, lr_decay, amsgrad, weight_decay))
        if args.cuda:
            torch.cuda.empty_cache()
        gc.collect()
        for step, data in enumerate(iterate_data(data_train, batch_size, bucketed=True,
                                                 unk_replace=unk_replace, shuffle=True)):
            optimizer.zero_grad()
            words = data['WORD'].to(device)
            chars = data['CHAR'].to(device)
            labels = data['NER'].to(device)
            masks = data['MASK'].to(device)

            nbatch = words.size(0)
            nwords = masks.sum().item()

            loss_total = network.loss(words, chars, labels, mask=masks).sum()
            # Normalise by the mask sum ('token') or by the batch's first
            # dimension ('sentence', the default).
            if loss_ty_token:
                loss = loss_total.div(nwords)
            else:
                loss = loss_total.div(nbatch)
            loss.backward()
            if grad_clip > 0:
                clip_grad_norm_(network.parameters(), grad_clip)
            optimizer.step()
            scheduler.step()

            with torch.no_grad():
                num_insts += nbatch
                num_words += nwords
                train_loss += loss_total.item()

            # update log
            if step % 100 == 0:
                if args.cuda:
                    torch.cuda.empty_cache()
                # Erase the previous progress line in place before rewriting it.
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                curr_lr = scheduler.get_lr()[0]
                log_info = '[%d/%d (%.0f%%) lr=%.6f] loss: %.4f (%.4f)' % (
                    step, num_batches, 100. * step / num_batches, curr_lr,
                    train_loss / num_insts, train_loss / num_words)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('total: %d (%d), loss: %.4f (%.4f), time: %.2fs' % (
            num_insts, num_words, train_loss / num_insts, train_loss / num_words,
            time.time() - start_time))
        print('-' * 100)

        # evaluate performance on dev data
        with torch.no_grad():
            outfile = os.path.join(result_path, 'pred_dev%d' % epoch)
            scorefile = os.path.join(result_path, "score_dev%d" % epoch)
            acc, precision, recall, f1 = eval(data_dev, network, writer, outfile, scorefile, device)
            print('Dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (
                acc, precision, recall, f1))
            if best_f1 < f1:
                torch.save(network.state_dict(), model_name)
                best_f1 = f1
                best_acc = acc
                best_precision = precision
                best_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                outfile = os.path.join(result_path, 'pred_test%d' % epoch)
                scorefile = os.path.join(result_path, "score_test%d" % epoch)
                test_acc, test_precision, test_recall, test_f1 = eval(
                    data_test, network, writer, outfile, scorefile, device)
                print('test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (
                    test_acc, test_precision, test_recall, test_f1))
                patient = 0
            else:
                patient += 1
            print('-' * 100)

            print("Best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))" % (
                best_acc, best_precision, best_recall, best_f1, best_epoch, patient))
            print("Best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d (%d))" % (
                test_acc, test_precision, test_recall, test_f1, best_epoch, patient))
            print('=' * 100)

            if patient > 4:
                logger.info('reset optimizer momentums')
                scheduler.reset_state()
                patient = 0
def _str2bool(value):
    """Convert a command-line token such as 'true' / 'False' / '0' to a bool.

    Raises argparse.ArgumentTypeError for tokens that are not recognisably
    boolean, so argparse reports a usage error instead of silently treating
    every non-empty string as True.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got %r' % value)


def _make_optimizer(name, params, lr, momentum, weight_decay):
    """Build the optimizer: Nesterov SGD when name == "SGD", Adam otherwise."""
    if name == "SGD":
        return SGD(params, lr=lr, momentum=momentum,
                   weight_decay=weight_decay, nesterov=True)
    return Adam(params, lr=lr, betas=(0.9, 0.9), weight_decay=weight_decay)


def _erase_progress(width):
    """Blank out the last `width` characters written to stdout."""
    sys.stdout.write("\b" * width)
    sys.stdout.write(" " * width)
    sys.stdout.write("\b" * width)


def _write_predictions(network, data, batch_size, writer, filename):
    """Run `network` over every batch of `data` and write predictions to `filename`."""
    writer.start(filename)
    try:
        for batch in data_loader.iterate_batch_variable(data, batch_size):
            word, features, sents, char, labels, masks, lengths = batch
            _, _, preds, _ = network.loss(
                features, char, labels, mask=masks, length=lengths,
                leading_symbolic=data_loader.NUM_SYMBOLIC_TAGS)
            writer.write(word.data.cpu().numpy(),
                         sents.data.cpu().numpy(),
                         preds.data.cpu().numpy(),
                         labels.data.cpu().numpy(),
                         lengths.cpu().numpy())
    finally:
        # Always release the output file, even if prediction fails mid-way.
        writer.close()


def main():
    """Train a recurrent tagger from the command line and evaluate it.

    Parses the command-line options, builds the alphabets and the
    train/dev/test data through `data_loader`, trains for `--num_epochs`
    epochs, and after each epoch scores the dev set with `evaluate`. The
    network is saved with `torch.save` whenever dev accuracy improves. After
    training, the best saved network is reloaded and scored on dev and test;
    results are printed and logged.
    """
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'],
                        help='architecture of rnn', required=True)
    parser.add_argument('--num_epochs', type=int, default=100,
                        help='Number of training epochs')
    parser.add_argument('--num_layers', type=int, default=2,
                        help='Number of layers')
    parser.add_argument('--batch_size', type=int, default=16,
                        help='Number of sentences in each batch')
    # Parsed explicitly: without a type, "--bidirectional False" would be the
    # non-empty (truthy) string "False".
    parser.add_argument('--bidirectional', type=_str2bool, default=True,
                        help='Use a bidirectional RNN (true/false)')
    parser.add_argument('--hidden_size', type=int, default=128,
                        help='Number of hidden units in RNN')
    parser.add_argument('--tag_space', type=int, default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_filters', type=int, default=30,
                        help='Number of filters in CNN')
    parser.add_argument('--char_dim', type=int, default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate', type=float, default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--gamma', type=float, default=0.0,
                        help='weight for regularization')
    parser.add_argument('--dropout', choices=['std', 'variational'],
                        help='type of dropout', required=True)
    parser.add_argument('--p', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--schedule', type=int,
                        help='schedule for learning rate decay')
    parser.add_argument('--unk_replace', type=float, default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--data_path')
    parser.add_argument('--modelname', default="ASR_ERR_LSTM.json.pth.tar",
                        help='model name')
    parser.add_argument('--task', default="MEDIA",
                        help='task name : MEDIA or ATIS')
    parser.add_argument('--optim', default="SGD",
                        help=' Optimizer : SGD or ADAM')
    # e.g. "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--train')
    # e.g. "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--dev')
    # e.g. "data/POS-penn/wsj/split1/wsj1.test.original"
    parser.add_argument('--test')
    args = parser.parse_args()

    log_file = '%s/log/log_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_optim_%s_lr_%f_tag_space_%s.txt' % (
        args.data_path, args.modelname, args.mode, args.num_epochs,
        args.batch_size, args.hidden_size, args.num_layers, args.optim,
        args.learning_rate, str(args.tag_space))
    logger = get_logger("SLU_BLSTM", log_file)

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    data_path = args.data_path
    bidirectional = args.bidirectional
    p = args.p
    unk_replace = args.unk_replace
    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space

    feat_vec_dim = data_loader.get_feature_dim(train_path)
    out_path = args.data_path

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, target_alphabet = data_loader.create_alphabets(
        '%s/data_dic' % (data_path), train_path,
        data_paths=[dev_path, test_path], max_vocabulary_size=50000)

    logger.info("Word feature_vec_dim: %d" % feat_vec_dim)
    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("Target Alphabet Size: %d" % target_alphabet.size())
    logger.info("Bidirectionnal %s" % bidirectional)

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = data_loader.read_data_to_variable(
        train_path, word_alphabet, char_alphabet, target_alphabet,
        use_gpu=use_gpu)
    num_data = sum(data_train[1])
    num_labels = target_alphabet.size()

    data_dev = data_loader.read_data_to_variable(
        dev_path, word_alphabet, char_alphabet, target_alphabet,
        use_gpu=use_gpu, volatile=True)
    data_test = data_loader.read_data_to_variable(
        test_path, word_alphabet, char_alphabet, target_alphabet,
        use_gpu=use_gpu, volatile=True)

    writer = DataWriter(word_alphabet, char_alphabet, target_alphabet)

    # NOTE(review): the label says num_labels but the value printed is the
    # word alphabet size -- confirm which one was intended.
    print("num_labels ", word_alphabet.size())

    if args.dropout == 'std':
        network = BiRecurrentConv(feat_vec_dim, word_alphabet.size(),
                                  num_filters, window, mode, hidden_size,
                                  num_layers, num_labels, tag_space=tag_space,
                                  p_rnn=p, bidirectional=bidirectional)
    else:
        # NOTE(review): embedd_dim and word_table are not defined anywhere in
        # this function, so this branch depends on names from outside it (or
        # raises NameError) -- confirm before using --dropout variational.
        network = BiVarRecurrentConv(embedd_dim, word_alphabet.size(),
                                     char_dim, char_alphabet.size(),
                                     num_filters, window, mode, hidden_size,
                                     num_layers, num_labels,
                                     tag_space=tag_space,
                                     embedd_word=word_table, p_rnn=p)
    print(network)

    if use_gpu:
        network.cuda()

    lr = learning_rate
    optimizer = _make_optimizer(args.optim, network.parameters(), lr,
                                momentum, gamma)

    logger.info(
        "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d"
        % (mode, num_layers, hidden_size, num_filters, tag_space))
    logger.info(
        "training: l2: %f, (#training data: %d, batch: %d, dropout: %.2f, unk replace: %.2f)"
        % (gamma, num_data, batch_size, p, unk_replace))

    # Floor division keeps this an int on Python 3; range() below needs one.
    num_batches = num_data // batch_size + 1

    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    model_path = ""

    for epoch in range(1, num_epochs + 1):
        # %s (not %d) so that an omitted --schedule prints as None instead of
        # raising TypeError; integer schedules render exactly as before.
        print(
            'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%s)): '
            % (epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            _, features, _, char, labels, masks, lengths = data_loader.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)

            optimizer.zero_grad()
            loss, corr, _, _ = network.loss(
                features, char, labels, mask=masks, length=lengths,
                leading_symbolic=data_loader.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optimizer.step()

            num_tokens = masks.data.sum()
            # NOTE(review): `.data[0]` is the pre-0.4 PyTorch way to read a
            # scalar, consistent with the `volatile=True` usage above.
            train_err += loss.data[0] * num_tokens
            train_corr += corr.data[0]
            train_total += num_tokens

            # update log
            if batch % 100 == 0:
                time_ave = (time.time() - start_time) / batch
                time_left = (num_batches - batch) * time_ave
                _erase_progress(num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        _erase_progress(num_back)
        print('train: %d loss: %.4f, time: %.2fs'
              % (num_batches, train_err / train_total,
                 time.time() - start_time))
        logger.info(
            'train: %d loss: %.4f, time: %.2fs'
            % (num_batches, train_err / train_total,
               time.time() - start_time))
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs'
              % (num_batches, train_err / train_total,
                 train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        tmp_filename = '%s/predictions/dev_%s_num_layers_%s_%s.txt' % (
            out_path, args.optim, str(args.num_layers), str(uid))
        _write_predictions(network, data_dev, batch_size, writer,
                           tmp_filename)
        acc, precision, recall, f1 = evaluate(tmp_filename, data_path, "dev",
                                              args.task, args.optim)
        print(
            'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
            % (acc, precision, recall, f1))
        logger.info(
            'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
            % (acc, precision, recall, f1))

        if dev_acc < acc:
            dev_f1 = f1
            dev_acc = acc
            dev_precision = precision
            dev_recall = recall
            best_epoch = epoch

            # save best model
            model_path = "%s/models/best_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_bestdevacc_%f_bestepoch_%d_optim_%s_lr_%f_tag_space_%s" % (
                args.data_path, args.modelname, mode, num_epochs, batch_size,
                hidden_size, args.num_layers, dev_acc, best_epoch,
                args.optim, args.learning_rate, str(tag_space))
            torch.save(network, model_path)

        logger.info(
            "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
            % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))

        # Learning-rate decay is skipped when --schedule is omitted or 0;
        # either value would otherwise raise in the modulo below.
        if schedule and epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            # A fresh optimizer is built with the decayed rate, as before.
            optimizer = _make_optimizer(args.optim, network.parameters(), lr,
                                        momentum, gamma)

    # Final evaluation with the best saved model.
    print("model path ", model_path)
    if model_path:
        network = torch.load(model_path)
        if use_gpu:
            network.cuda()
    else:
        # No epoch improved dev accuracy, so nothing was saved; keep the
        # in-memory network instead of failing on torch.load("").
        logger.info("no best model was saved; evaluating the current network")
    network.eval()

    # dev set
    tmp_filename = '%s/predictions/dev_best_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_bestdevacc_%f_bestF1_%f_bestepoch_%d_optim_%s_lr_%f_tag_space_%s' % (
        out_path, args.modelname, mode, num_epochs, batch_size, hidden_size,
        num_layers, dev_acc, dev_f1, best_epoch, args.optim,
        args.learning_rate, tag_space)
    _write_predictions(network, data_dev, batch_size, writer, tmp_filename)
    dev_acc, dev_precision, dev_recall, dev_f1 = evaluate(
        tmp_filename, data_path, "dev", args.task, args.optim)

    # test set
    tmp_filename = '%s/predictions/test_best_model_%s_mode_%s_num_epochs_%d_batch_size_%d_hidden_size_%d_num_layers_%d_bestdevacc_%f_bestF1_%f_bestepoch_%d_optim_%s_lr_%f_tag_space_%s' % (
        out_path, args.modelname, mode, num_epochs, batch_size, hidden_size,
        num_layers, dev_acc, dev_f1, best_epoch, args.optim,
        args.learning_rate, tag_space)
    _write_predictions(network, data_test, batch_size, writer, tmp_filename)
    test_acc, test_precision, test_recall, test_f1 = evaluate(
        tmp_filename, data_path, "test", args.task, args.optim)

    print(
        "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
        % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
    print(
        "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
        % (test_acc, test_precision, test_recall, test_f1, best_epoch))
    logger.info(
        "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
        % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
    logger.info(
        "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
        % (test_acc, test_precision, test_recall, test_f1, best_epoch))