def test(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr)

    """ load vocab """
    print 'Loading vocabularies...\n'
    vocab_word = io_utils.load_data('vocab_word')
    vocab_char = io_utils.load_data('vocab_char')
    vocab_tag = io_utils.load_data('vocab_tag')
    print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ load data """
    print '\nLoading data set...\n'
    test_corpus, test_vocab_word, test_vocab_char, test_vocab_tag = io_utils.load_conll(args.dev_data)
    print '\tTest Sentences: %d' % len(test_corpus)

    """ converting into ids """
    print '\nConverting into IDs...\n'
    test_x, test_c, test_b, test_y = preprocessor.convert_into_ids(test_corpus, vocab_word, vocab_char, vocab_tag)

    """ tagger set up """
    tagger = io_utils.load_data(args.load)
    dev_f = theano.function(
        inputs=tagger.input[:-1],
        outputs=tagger.result,
        mode='FAST_RUN'
    )

    """ Prediction """
    print '\nPREDICTION START\n'
    print '\tBatch Index: ',
    start = time.time()

    total = 0.0
    correct = 0

    for index in xrange(len(test_x)):
        if index % 100 == 0 and index != 0:
            print index,
            sys.stdout.flush()

        if tagger.name == 'char':
            corrects = dev_f(test_x[index], test_c[index], test_b[index], test_y[index])
        else:
            corrects = dev_f(test_x[index], test_y[index])

        total += len(corrects)
        correct += np.sum(corrects)

    end = time.time()
    print '\n\tTime: %f seconds' % (end - start)
    print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)
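
# The vocabulary objects loaded above are only accessed through the attribute
# `i2w` and the methods `add_word()` and `size()`.  The minimal sketch below
# illustrates that assumed interface; the class actually provided by io_utils
# (and the `w2i` mapping used here) may differ, so treat this as an
# illustration rather than the project's implementation.
class VocabSketch(object):

    def __init__(self):
        self.w2i = {}   # word -> id
        self.i2w = []   # id -> word

    def add_word(self, word):
        # register a word once and assign it the next free id
        if word not in self.w2i:
            self.w2i[word] = len(self.i2w)
            self.i2w.append(word)
        return self.w2i[word]

    def size(self):
        return len(self.i2w)
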
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr)

    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = load_train_data(args)
    dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, dev_max_char_len = load_dev_data(args)

    if dev_corpus:
        # extend the training vocabularies with dev entries;
        # the word vocabulary is capped at args.vocab_size if given
        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')

    """ load word embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading pre-trained word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        print '\tUsing randomly initialized word embeddings...\n'
        w_emb_dim = args.w_emb_dim

    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=lambda a: len(a))

    """ converting into ids """
    print '\nConverting into IDs...\n'
    tr_x, tr_c, tr_b, tr_y = preprocessor.convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag)

    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = preprocessor.convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag)
        print '\tTrain Sentences: %d Dev Sentences: %d' % (len(train_corpus), len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ tagger set up """
    tagger = set_model(args, init_w_emb, w_emb_dim, vocab_word, vocab_char, vocab_tag)

    train_f = theano.function(
        inputs=tagger.input,
        outputs=[tagger.nll, tagger.result],
        updates=tagger.updates,
        mode='FAST_RUN'
    )

    dev_f = theano.function(
        inputs=tagger.input[:-1],
        outputs=tagger.result,
        mode='FAST_RUN'
    )

    def _train():
        print '\nTRAINING START\n'

        for epoch in xrange(args.epoch):
            # simple inverse decay of the learning rate over epochs
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_x))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\n\tTrain set'
            print '\t\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                if args.model == 'char':
                    loss, corrects = train_f(tr_x[index], tr_c[index], tr_b[index], tr_y[index], _lr)
                else:
                    loss, corrects = train_f(tr_x[index], tr_y[index], _lr)

                assert not math.isnan(loss), index

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\n\t\tTime: %f seconds' % (end - start)
            print '\t\tNegative Log Likelihood: %f' % losses
            print '\t\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

            if args.save:
                io_utils.dump_data(tagger, 'model-%s.epoch-%d' % (args.model, epoch + 1))

            if args.dev_data:  # skip evaluation when no dev set is given
                _dev(dev_f)

    def _dev(_dev_f):
        print '\n\tDev set'
        print '\t\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_x)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            if args.model == 'char':
                corrects = _dev_f(dev_x[index], dev_c[index], dev_b[index], dev_y[index])
            else:
                corrects = _dev_f(dev_x[index], dev_y[index])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()
        print '\n\t\tTime: %f seconds' % (end - start)
        print '\t\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct)

    _train()
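
# Hypothetical command-line entry point.  The flag names below simply mirror
# the attributes that train() and test() read from `args`; the `--mode` switch,
# the defaults, and any further flags consumed by load_train_data()/set_model()
# (defined elsewhere) are assumptions, so treat this as a sketch rather than
# the project's actual CLI.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Neural POS tagger')
    parser.add_argument('--mode', default='train', help='train or test')
    parser.add_argument('--dev_data', default=None, help='path to a dev/test CoNLL file')
    parser.add_argument('--load', default=None, help='path to a dumped model (test mode)')
    parser.add_argument('--save', action='store_true', help='dump vocabularies and models')
    parser.add_argument('--model', default='word', help='word or char model')
    parser.add_argument('--word_list', default=None, help='word list for pre-trained embeddings')
    parser.add_argument('--emb_list', default=None, help='pre-trained embedding vectors')
    parser.add_argument('--w_emb_dim', type=int, default=100, help='word embedding dim (placeholder default)')
    parser.add_argument('--w_hidden_dim', type=int, default=100, help='word-level hidden dim (placeholder default)')
    parser.add_argument('--c_emb_dim', type=int, default=10, help='char embedding dim (placeholder default)')
    parser.add_argument('--c_hidden_dim', type=int, default=50, help='char-level hidden dim (placeholder default)')
    parser.add_argument('--opt', default='sgd', help='optimization method')
    parser.add_argument('--lr', type=float, default=0.1, help='initial learning rate')
    parser.add_argument('--epoch', type=int, default=10, help='number of training epochs')
    parser.add_argument('--data_size', type=int, default=None, help='limit on training sentences')
    parser.add_argument('--vocab_size', type=int, default=None, help='cap on the word vocabulary')
    args = parser.parse_args()

    if args.mode == 'test':
        test(args)
    else:
        train(args)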