def evaluate(sess, model):
    Yhat = inference(model)
    # Return the index with the largest value across the class axis
    Ypredict = tf.argmax(Yhat, axis=1, output_type=tf.int32)

    # predictions on the test set
    predictions_test = [
        list(map(lambda x: idx2label[x],
                 sess.run(Ypredict, feed_dict={x_input: [sentence]})))
        for sentence in test_lex
    ]
    groundtruth_test = [list(map(lambda x: idx2label[x], label)) for label in test_y]
    words_test = [list(map(lambda x: idx2word[x], word)) for word in test_lex]

    # predictions on the validation set
    predictions_valid = [
        list(map(lambda x: idx2label[x],
                 sess.run(Ypredict, feed_dict={x_input: [sentence]})))
        for sentence in valid_lex
    ]
    groundtruth_valid = [list(map(lambda x: idx2label[x], label)) for label in valid_y]
    words_valid = [list(map(lambda x: idx2word[x], w)) for w in valid_lex]

    # evaluation // compute the accuracy using conlleval.pl
    res_test = conlleval(predictions_test, groundtruth_test, words_test,
                         folder + "current.test.txt")
    res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid,
                          folder + "current.valid.txt")
    return res_test, res_valid
def validate(model_filename):
    word_model = CustomEmbedding()
    valid_set, indexes = word_model.valid_set, word_model.indexes

    w2idx, la2idx = indexes['w2idx'], indexes['la2idx']
    idx2w, idx2la = indexes['idx2w'], indexes['idx2la']

    n_classes = len(idx2la)
    n_vocab = len(idx2w)

    valid_x, valid_label = valid_set

    log("Processing word indexes... ")
    words_val = [list(map(lambda x: idx2w[x], w)) for w in valid_x]
    groundtruth_val = [list(map(lambda x: idx2la[x], y)) for y in valid_label]
    log("Done processing word indexes!")

    process = Process()
    process.load(model_filename)

    predword_val = process.validate(valid_set)

    metrics = conlleval(predword_val, groundtruth_val, words_val, 'diff.txt')
    log('Precision = {}, Recall = {}, F1 = {}'.format(
        metrics['precision'], metrics['recall'], metrics['f1']))
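# A toy illustration (not from the original code) of the data layout these snippets
# feed to conlleval(): three parallel lists of per-sentence token lists plus a scratch
# file name for the conlleval.pl wrapper. The IOB labels and words below are made up,
# and the metric keys ('precision'/'recall'/'f1' here, 'p'/'r'/'f1' in other snippets)
# depend on which version of the helper is used.
toy_pred = [['O', 'B-toloc.city_name'], ['B-round_trip', 'I-round_trip']]
toy_gold = [['O', 'B-toloc.city_name'], ['B-round_trip', 'O']]
toy_words = [['to', 'boston'], ['round', 'trip']]
toy_metrics = conlleval(toy_pred, toy_gold, toy_words, 'toy.txt')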
            # View each sentence as a batch
            sent = sent[np.newaxis, :]
            if sent.shape[1] > 1:  # ignore 1-word sentences
                model.train_on_batch(sent, label)

        from metrics.accuracy import conlleval

        labels_pred_val = []
        bar = progressbar.ProgressBar(max_value=len(val_x))
        for n_batch, sent in bar(enumerate(val_x)):
            label = val_label[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]

            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)

        labels_pred_val = [list(map(lambda x: idx2la[x], y))
                           for y in labels_pred_val]
        con_dict = conlleval(labels_pred_val, labels_val, words_val, 'measure.txt')
        print('Precision = {}, Recall = {}, F1 = {}'.format(
            con_dict['p'], con_dict['r'], con_dict['f1']))

        # model.fit(x=train_x, y=train_label, steps_per_epoch=1, callbacks=[checkpointer, tensorboard])
        model.save_weights(path + "MUSIC_LSTM-" + str(i) + ".h5")
        model_json = model.to_json()
        with open(path + "model_embed_lstm.json", "w") as jf:
            jf.write(model_json)
        # evaluation // back into the real world : idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32'), f))
                            for x, f in zip(test_lex, test_feat)]
        ground_truth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32'), f))
                             for x, f in zip(valid_lex, valid_feat)]
        ground_truth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, ground_truth_test, words_test,
                             model_folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, ground_truth_valid, words_valid,
                              model_folder + '/current.valid.txt')

        if res_test['f1'] > best_f1_test:
            rnn.save(model_folder)
            best_f1_test, best_f1_test_val = res_test['f1'], res_valid['f1']
            if s['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            s['tf1'], s['tp'], s['tr'] = res_test['f1'], res_test['p'], res_test['r']
            s['be'] = e
def main():
    settings = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop-through-time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(settings['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(settings['seed'])
    random.seed(settings['seed'])
    if LOAD:
        print "Loading model from %s..." % folder
        rnn = ElmanRNNModel.load(folder)
    else:
        rnn = ElmanRNNModel(
            hidden_dims=settings['nhidden'],
            num_classes=nclasses,
            vocab_size=vocsize,
            embed_dims=settings['emb_dimension'],
            context_size=settings['win']
        )

    # train with early stopping on the validation set
    best_f1 = -numpy.inf
    settings['current_lr'] = settings['lr']
    for e in xrange(settings['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], settings['seed'])
        settings['current_epoch'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], settings['win'])
            words = map(
                lambda x: numpy.asarray(x).astype('int32'),
                minibatch(cwords, settings['bs'])
            )
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, settings['current_lr'])
                rnn.normalize()
            if settings['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [
            map(lambda x: idx2label[x],
                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
            for x in test_lex
        ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [
            map(lambda idx: idx2label[idx],
                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
            for x in valid_lex
        ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid,
                              folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if settings['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            settings['vf1'], settings['vp'], settings['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            settings['tf1'], settings['tp'], settings['tr'] = res_test['f1'], res_test['p'], res_test['r']
            settings['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        if settings['decay'] and abs(settings['be'] - settings['current_epoch']) >= 10:
            settings['current_lr'] *= 0.5
        if settings['current_lr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', e, 'valid F1', settings['vf1'], 'best test F1', settings['tf1'], 'with the model', folder
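# The training loops above rely on the small helpers from the classic ATIS / Elman
# slot-filling tutorial (contextwin, minibatch, shuffle). A minimal sketch of how
# they are usually defined is given below; treat it as an assumption about the
# utilities shipped with these scripts, not their exact source.
import random

def contextwin(l, win):
    """For each index in l, return the window of `win` indices centred on it,
    padding both ends with -1 (the PADDING row of the embedding matrix)."""
    assert win % 2 == 1 and win >= 1
    l = list(l)
    lpadded = (win // 2) * [-1] + l + (win // 2) * [-1]
    out = [lpadded[i:i + win] for i in range(len(l))]
    assert len(out) == len(l)
    return out

def minibatch(l, bs):
    """Return the growing prefixes of l used for backprop-through-time,
    capped at bs context windows per step (one entry per word in l)."""
    out = [l[:i] for i in range(1, min(bs, len(l) + 1))]
    out += [l[i - bs:i] for i in range(bs, len(l) + 1)]
    assert len(l) == len(out)
    return out

def shuffle(lol, seed):
    """Shuffle each list in lol with the same seed so they stay aligned."""
    for l in lol:
        random.seed(seed)
        random.shuffle(l)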
def train(self):
    # Prepare data
    sentence_train, slot_train, sentence_dev, slot_dev, vocab_sentence, \
        vocab_slot = data_helper.prepare_data(
            "data",
            sentence_training_file, slot_training_file,
            sentence_developing_file, slot_developing_file,
            from_vocabulary_size=2000, to_vocabulary_size=2000,
            tokenizer=None)

    sentence_developing, slot_devloping = data_helper.read_data(
        sentence_dev, slot_dev, max_size=None)
    sentence_training, slot_training = data_helper.read_data(
        sentence_train, slot_train, max_size=None)

    ## TODO:
    #sentence_training, slot_training = sentence_training[:1000],\
    #    slot_training[:1000]

    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(vocab_slot)

    # For the conlleval script
    words_train = [list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
                   for w in sentence_training]
    labels_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                    for y in slot_training]
    words_val = [list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
                 for w in sentence_developing]
    labels_val = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                  for y in slot_devloping]

    # Define model
    n_vocab = len(w2id_sentence)
    n_classes = len(w2id_slot)

    #model = Sequential()
    #model.add(Embedding(n_vocab, 100))
    #model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
    #model.add(Dropout(0.25))
    #model.add(GRU(100, return_sequences=True))
    #model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
    #model.compile('rmsprop', 'categorical_crossentropy')

    ## Training
    ##n_epochs = 30
    #n_epochs = 1

    train_f_scores = []
    val_f_scores = []
    best_val_f1 = 0

    #print("Training =>")
    #train_pred_label = []
    #avgLoss = 0
    #for i in range(n_epochs):
    #    print("Training epoch {}".format(i))
    #    bar = progressbar.ProgressBar(max_value=len(sentence_training))
    #    for n_batch, sent in bar(enumerate(sentence_training)):
    #        label = slot_training[n_batch]
    #        # Make labels one hot
    #        label = np.eye(n_classes)[label][np.newaxis, :]
    #        # View each sentence as a batch
    #        sent = sent[np.newaxis, :]
    #        if sent.shape[1] > 1:  # ignore 1 word sentences
    #            loss = model.train_on_batch(sent, label)
    #            avgLoss += loss
    #        pred = model.predict_on_batch(sent)
    #        pred = np.argmax(pred, -1)[0]
    #        train_pred_label.append(pred)
    #    avgLoss = avgLoss / n_batch
    #    predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
    #                      for y in train_pred_label]
    #    con_dict = conlleval(predword_train, labels_train, words_train, 'measure.txt')
    #    train_f_scores.append(con_dict['f1'])
    #    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
    #        avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))
    #    # Save model
    #    model.save(filepath_model)
    #    gc.collect()

    print("Validating =>")
    from keras.models import load_model
    model = load_model(filepath_model)

    labels_pred_val = []
    avgLoss = 0
    bar = progressbar.ProgressBar(max_value=len(sentence_developing))
    for n_batch, sent in bar(enumerate(sentence_developing)):
        label = slot_devloping[n_batch]
        label = np.eye(n_classes)[label][np.newaxis, :]
        sent = sent[np.newaxis, :]
        if sent.shape[1] > 1:  # some bug in keras
            loss = model.test_on_batch(sent, label)
            avgLoss += loss
        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred, -1)[0]
        labels_pred_val.append(pred)

    avgLoss = avgLoss / n_batch
    gc.collect()

    predword_val = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                    for y in labels_pred_val]
    con_dict = conlleval(predword_val, labels_val, words_val, 'measure.txt')
    val_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

    if con_dict['f1'] > best_val_f1:
        best_val_f1 = con_dict['f1']
        print('here')
        with open('model_architecture.json', 'w') as outf:
            outf.write(model.to_json())
        model.save_weights('best_model_weights.h5', overwrite=True)
        print("Best validation F1 score = {}".format(best_val_f1))
    print()
def play_with_spelling():
    """Play with spelling mistakes"""
    print CONF
    np.random.seed(CONF['seed'])
    random.seed(CONF['seed'])

    print "Calculate output"
    session_files = get_session_files(number_of_files=CONF['number_of_files'], random_seed=CONF['seed'])
    sentences = get_sentences(session_files)
    print len(sentences)
    labels2idx = char2idx = get_char_to_idx(sentences)

    print "Prepare train, validation and test sets"
    train_valid_sentences, test_sentences = train_test_split(sentences, test_size=0.15, random_state=CONF['seed'])
    train_sentences, valid_sentences = train_test_split(train_valid_sentences, test_size=0.2, random_state=CONF['seed'])
    print len(train_valid_sentences), len(test_sentences)
    test_lex, test_y = create_tests(test_sentences, CONF['error_probability'], labels2idx, char2idx)
    valid_lex, valid_y = create_tests(valid_sentences, CONF['error_probability'], labels2idx, char2idx)
    train_lex = []
    train_y = []
    for error_probability in (CONF['error_probability'], CONF['error_probability'] / 10, CONF['error_probability'] / 100, 0):
        _train_idxes, _train_labels_idxes = create_tests(train_sentences, error_probability, labels2idx, char2idx)
        train_lex.extend(_train_idxes)
        train_y.extend(_train_labels_idxes)
    # train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_idxes, train_valid_labels_idxes, test_size=0.2, random_state=CONF['seed'])
    print len(train_lex), len(valid_lex), len(train_y), len(valid_y)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems())  # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems())  # Reverse the dictionary
    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
    windowed_test_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in test_lex]
    windowed_valid_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in valid_lex]
    words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
    groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
    words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 1 + len(set(item for _y in (train_y, test_y, valid_y) for sublist in _y for item in sublist))
    nsentences = len(train_lex)
    words_lex = []
    for i in xrange(nsentences):
        cwords = contextwin(train_lex[i], CONF['win'])
        words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, CONF['batch_size'])]
        words_lex.append(words)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Create a Neural Network"
    rnn = regular_elman(nh=CONF['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=CONF['emb_dimension'],
                        cs=CONF['win'],)

    # train with early stopping on the validation set
    best_f1 = -np.inf
    CONF['current_learning_rate'] = CONF['learning_rate']
    print "Start training"
    start_time = print_time = time.time()
    for epoch in xrange(CONF['nepochs']):
        # shuffle
        shuffle([words_lex, train_y], CONF['seed'])
        CONF['ce'] = epoch
        tic = time.time()
        percentage_of_sentences_to_train = (epoch + 1) / CONF['nepochs']
        numer_of_sentences_to_train = int(nsentences * percentage_of_sentences_to_train)
        print "starting an epoch, numer_of_sentences_to_train =", numer_of_sentences_to_train
        test_size = int(len(windowed_test_lex) * percentage_of_sentences_to_train)
        print "test_size", test_size
        validation_size = int(len(windowed_valid_lex) * percentage_of_sentences_to_train)
        print "validation_size", validation_size
        for _ in xrange(30):  # Trauma!
            print "_", _
            for i in xrange(numer_of_sentences_to_train):
                words = words_lex[i]
                labels = train_y[i]
                for word_batch, label_last_word in zip(words, labels):
                    rnn.train(word_batch, label_last_word, CONF['current_learning_rate'])
                    rnn.normalize()
                if CONF['verbose'] and time.time() - print_time > 30:
                    print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / numer_of_sentences_to_train), \
                        'completed in %.2f (sec) <<\r' % (time.time() - tic),
                    print_time = time.time()

        # evaluation // back into the real world : idx -> words
        if CONF['verbose']:
            print "Classify test"
        predictions_test = [[idx2label[x] for x in rnn.classify(windowed_test_lex_item)]
                            for windowed_test_lex_item in windowed_test_lex[:test_size]]
        if CONF['verbose']:
            print "Classify validation"
        predictions_valid = [[idx2label[x] for x in rnn.classify(windowed_valid_lex_item)]
                             for windowed_valid_lex_item in windowed_valid_lex[:validation_size]]

        # evaluation // compute the accuracy using conlleval.pl
        if CONF['verbose']:
            print "Evaluate test and validation"
        res_test = conlleval(predictions_test, groundtruth_test[:test_size],
                             words_test[:test_size], folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid[:validation_size],
                              words_valid[:validation_size], folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            CONF['vf1'], CONF['vp'], CONF['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            CONF['tf1'], CONF['tp'], CONF['tr'] = res_test['f1'], res_test['p'], res_test['r']
            CONF['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ' : epoch', epoch, 'valid F1', res_valid['f1'], ' test F1', res_test['f1'], ' ' * 20
            # rnn.load(folder)

        # learning rate decay if no improvement in 10 epochs
        if CONF['decay'] and abs(CONF['be'] - CONF['ce']) >= 10:
            CONF['current_learning_rate'] *= 0.5
        if CONF['current_learning_rate'] < 1e-5:
            break

    print 'BEST RESULT: epoch', CONF['be'], 'valid F1', best_f1, 'best test F1', CONF['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = {
        # 'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 15,  # number of characters in the context window
        'bs': 5,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 30,  # dimension of character embedding
        'nepochs': 10}
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])

    print "Calculate output"
    session_files = get_session_files(number_of_files=number_of_files, random_seed=conf['seed'])  # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(np.fromiter((char2idx[char] for char in sentence_out), dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems())  # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems())  # Reverse the dictionary
    # vocsize = 1 + len(set(reduce(\
    #     lambda x, y: list(x)+list(y),\
    #     train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 2  # len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Create a Neural Network"
    rnn = regular_elman(nh=conf['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=conf['emb_dimension'],
                        cs=conf['win'],)

    # train with early stopping on the validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ' : epoch', epoch, 'valid F1', res_valid['f1'], ' test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, 'best test F1', conf['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
        ground_truth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [
            map(lambda x: idx2label[x],
                rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32'), f))
            for x, f in zip(valid_lex, valid_feat)
        ]
        ground_truth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, ground_truth_test, words_test,
                             model_folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, ground_truth_valid, words_valid,
                              model_folder + '/current.valid.txt')

        if res_test['f1'] > best_f1_test:
            rnn.save(model_folder)
            best_f1_test, best_f1_test_val = res_test['f1'], res_valid['f1']
            if s['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            s['tf1'], s['tp'], s['tr'] = res_test['f1'], res_test['p'], res_test['r']
            sent = sent[np.newaxis, :]
            #print(sent)
            if sent.shape[1] > 1:  # some bug in keras
                loss = model.train_on_batch(sent, label)
                avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            #print(pred)
            train_pred_label.append(pred)

        avgLoss = avgLoss / n_batch

        predword_train = [list(map(lambda x: idx2la[x], y)) for y in train_pred_label]
        con_dict = conlleval(predword_train, trainY, trainY, 'r.txt')
        train_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

        print("Validating =>")

        val_pred_label = []
        avgLoss = 0

        bar = progressbar.ProgressBar(max_value=len(val_x))
        for n_batch, sent in bar(enumerate(val_x)):
            label = val_label[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]
def train(self):
    sentence_developing, slot_devloping = data_helper.read_data(
        self.sentence_dev, self.slot_dev, max_size=None)
    sentence_training, slot_training = data_helper.read_data(
        self.sentence_train, self.slot_train, max_size=None)

    # Make toy data; comment this block to train on the full dataset
    #n_toy = 1000
    #sentence_training, slot_training = sentence_training[:n_toy],\
    #    slot_training[:n_toy]
    #sentence_developing, slot_devloping = sentence_developing[:round(n_toy/2)],\
    #    slot_devloping[:round(n_toy/2)]

    # Dictionaries
    w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(self.vocab_sentence)
    w2id_slot, id2w_slot = data_helper.initialize_vocabulary(self.vocab_slot)

    # For the conlleval script
    words_train = [list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
                   for w in sentence_training]
    labels_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                    for y in slot_training]
    words_val = [list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
                 for w in sentence_developing]
    labels_val = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                  for y in slot_devloping]

    # Define model
    n_vocab = len(w2id_sentence)
    n_classes = len(w2id_slot)

    model = Sequential()
    model.add(Embedding(n_vocab, 100))
    model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
    model.add(Dropout(0.25))
    model.add(GRU(100, return_sequences=True))
    model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
    model.compile('rmsprop', 'categorical_crossentropy')

    # Training
    #n_epochs = 30
    n_epochs = 1

    train_f_scores = []
    val_f_scores = []
    best_val_f1 = 0

    print("Training =>")
    train_pred_label = []
    avgLoss = 0

    for i in range(n_epochs):
        print("Training epoch {}".format(i))

        bar = progressbar.ProgressBar(max_value=len(sentence_training))
        for n_batch, sent in bar(enumerate(sentence_training)):
            label = slot_training[n_batch]
            # Make labels one hot
            label = np.eye(n_classes)[label][np.newaxis, :]
            # View each sentence as a batch
            sent = sent[np.newaxis, :]
            if sent.shape[1] > 1:  # ignore 1 word sentences
                loss = model.train_on_batch(sent, label)
                avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            train_pred_label.append(pred)

        avgLoss = avgLoss / n_batch

        predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                          for y in train_pred_label]
        con_dict = conlleval(predword_train, labels_train, words_train, 'measure.txt')
        train_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

        # Save model
        model.save(model_file)

        print("Validating =>")

        labels_pred_val = []
        avgLoss = 0

        bar = progressbar.ProgressBar(max_value=len(sentence_developing))
        for n_batch, sent in bar(enumerate(sentence_developing)):
            label = slot_devloping[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]
            if sent.shape[1] > 1:  # some bug in keras
                loss = model.test_on_batch(sent, label)
                avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)

        avgLoss = avgLoss / n_batch

        predword_val = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                        for y in labels_pred_val]
        con_dict = conlleval(predword_val, labels_val, words_val, 'measure.txt')
        val_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

        if con_dict['f1'] > best_val_f1:
            best_val_f1 = con_dict['f1']
            with open('model_architecture.json', 'w') as outf:
                outf.write(model.to_json())
            model.save_weights('best_model_weights.h5', overwrite=True)
            print("Best validation F1 score = {}".format(best_val_f1))
        print()

    # Prevent tensorflow bugs in BaseSession.__del__ at interpreter shutdown
    gc.collect()
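# A small usage sketch (not part of the original script): reloading the best model
# that train() saves above. The file names match what the training loop writes;
# the compile arguments mirror the model definition and are otherwise illustrative.
from keras.models import model_from_json

def load_best_model():
    with open('model_architecture.json') as f:
        model = model_from_json(f.read())
    model.load_weights('best_model_weights.h5')
    model.compile('rmsprop', 'categorical_crossentropy')
    return model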
def train(model=None, re_train=False):
    word_model = CustomEmbedding(re_train=re_train)  # Prevents creating embeddings again if re_train is True
    train_set, valid_set, indexes = word_model.train_set, word_model.valid_set, word_model.indexes

    w2idx, la2idx = indexes['w2idx'], indexes['la2idx']
    idx2w, idx2la = indexes['idx2w'], indexes['idx2la']

    n_classes = len(idx2la)
    n_vocab = len(idx2w)

    train_x, train_label = train_set
    valid_x, valid_label = valid_set

    log("Processing word indexes... ")
    words_val = [list(map(lambda x: idx2w[x], w)) for w in valid_x]
    groundtruth_val = [list(map(lambda x: idx2la[x], y)) for y in valid_label]
    log("Done processing word indexes!")

    if re_train == False:
        '''
        DEFINE MODEL
        '''
        model = Sequential()
        model.add(word_model.EmbeddingLayer())
        model.add(Conv1D(128, 5, padding="same", activation='relu'))
        model.add(Dropout(Config.DROPOUT))
        model.add(Bidirectional(LSTM(units=Config.EMBEDDING_SIZE,
                                     dropout=Config.DROPOUT,
                                     recurrent_dropout=Config.DROPOUT,
                                     kernel_initializer=he_normal(),
                                     return_sequences=True)))
        model.add(SeqSelfAttention(attention_activation='sigmoid'))
        # model.add(GRU(units=Config.EMBEDDING_SIZE,
        #               dropout=Config.DROPOUT,
        #               recurrent_dropout=Config.DROPOUT,
        #               kernel_initializer=he_normal(),
        #               return_sequences=True))
        model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
        model.add(CRF(n_classes, sparse_target=False, learn_mode='join'))
        model.compile(Config.OPTIMIZER, Config.LOSS, metrics=[crf_viterbi_accuracy])

    process = Process(model)
    max_f1 = 0

    try:
        for i in range(Config.N_EPOCHS):
            log("Epoch " + str(i + 1), display=False)
            highlight('violet', 'Epoch ' + str(i + 1))
            partition(80)

            log("Training ")
            process.train(train_set)

            log("Validating ")
            predword_val = process.validate(valid_set)

            # Accuracy tests here using (predword_val, groundtruth_val, words_val); save the best model
            metrics = conlleval(predword_val, groundtruth_val, words_val, 'diff.txt')
            log('Precision = {}, Recall = {}, F1 = {}'.format(
                metrics['precision'], metrics['recall'], metrics['f1']))

            if metrics['f1'] > max_f1:
                max_f1 = metrics['f1']
                process.save('trained_model_' + str(Config.FILE_PATTERN) + '_' + str(max_f1))
                log("New model saved!", display=False)

        highlight('white', 'Best validation F1 score : ' + str(max_f1))
        log('Best validation F1 score : ' + str(max_f1), display=False)

        log('Cleaning /trained_model folder...')
        clean()
        log('Removed all other saved models, kept the best model only!')
    except KeyboardInterrupt:  # If ctrl + c is pressed, clean up and exit
        log("\nTraining interrupted with ctrl + c ...")
        log('Cleaning /trained_model folder...')
        clean()
        log('Removed all other saved models, kept the best model only!')
        sys.exit()
            if sent.shape[1] > 1:  # ignore 1-word sentences
                model.train_on_batch(sent, label)

        from metrics.accuracy import conlleval

        labels_pred_val = []
        bar = progressbar.ProgressBar(max_value=len(train_encoded[851:]))
        for n_batch, sent in bar(enumerate(train_encoded[851:])):
            label = lbl_encoded[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]

            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)

        labels_pred_val = [list(map(lambda x: idx2la[x], y))
                           for y in labels_pred_val]
        con_dict = conlleval(labels_pred_val, labels_train[851:], words_train[851:], 'measure.txt')
        print('Precision = {}, Recall = {}, F1 = {}'.format(
            con_dict['p'], con_dict['r'], con_dict['f1']))

        # model.fit(x=train_x, y=train_label, steps_per_epoch=1, callbacks=[checkpointer, tensorboard])
        model.save_weights(path + "music_LSTM-" + str(i) + ".h5")
        model_json = model.to_json()
        with open(path + "model_embed_lstm.json", "w") as jf:
            jf.write(model_json)
def run(params):
    start_time = time.time()
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    rhoList = numpy.array([100, 50]).astype(numpy.int32)  # 100,90,80,70,60,50,0 # combining forward and backward layers

    # load the dataset
    eval_options = []
    params['measure'] = 'F1score'
    if params['dataset'] == 'atis':
        train_set, valid_set, test_set, dic = loadData.atisfold(params['fold'])
    if params['dataset'] == 'ner':
        train_set, valid_set, test_set, dic = loadData.ner()
    if params['dataset'] == 'chunk':
        train_set, valid_set, test_set, dic = loadData.chunk()
    if params['dataset'] == 'pos':
        train_set, valid_set, test_set, dic = loadData.pos()
        eval_options = ['-r']
        params['measure'] = 'Accuracy'

    idx2label = dict((k, v) for v, k in dic['labels2idx'].items())
    idx2word = dict((k, v) for v, k in dic['words2idx'].items())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    ## :( hack
    # train_lex = train_lex[::100]
    # train_ne = train_ne[::100]
    # train_y = train_y[::100]
    # valid_lex = valid_lex[::100]
    # valid_ne = valid_ne[::100]
    # valid_y = valid_y[::100]
    # test_lex = test_lex[::100]
    # test_ne = test_ne[::100]
    # test_y = test_y[::100]

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    wv = None
    if params['WVFolder'] != 'random':
        if '[' in params['WVFolder'] and ']' in params['WVFolder']:
            folderSet = set(eval(params['WVFolder'].replace('[', '[\'').replace(']', '\']').replace(',', '\',\'')))
            print(folderSet)
            wv = numpy.zeros((vocsize + 1, params['WVModel']['emb_dimension'] * len(folderSet)))
            modelIndex = 0
            for folder in folderSet:
                params['WVFile'] = folder + '/' + 'words' + str(params['WVModel']['emb_dimension']) + '.npy'
                params['WVVocabFile'] = folder + '/' + 'words' + str(params['WVModel']['emb_dimension']) + '.vocab'

                # load word vectors
                wvnp = np.load(params['WVFile'])

                # load vocab
                with open(params['WVVocabFile']) as f:
                    vocab = [line.strip() for line in f if len(line) > 0]
                wi = dict([(a, i) for i, a in enumerate(vocab)])

                random_v = math.sqrt(6.0 / numpy.sum(params['WVModel']['emb_dimension'])) * \
                    numpy.random.uniform(-1.0, 1.0, (params['WVModel']['emb_dimension']))

                miss = 0  # the number of missing words in the pre-trained word embeddings
                for i in range(0, vocsize):
                    word = idx2word[i]
                    if word in wi:
                        wv[i][params['WVModel']['emb_dimension'] * modelIndex:
                              params['WVModel']['emb_dimension'] * (modelIndex + 1)] = wvnp[wi[word]]
                        # print wvnp[wi[word]]
                    else:
                        wv[i][params['WVModel']['emb_dimension'] * modelIndex:
                              params['WVModel']['emb_dimension'] * (modelIndex + 1)] = random_v
                        miss += 1
                print("missing words rate : ", miss, '/', vocsize)
                params['WVModel']['vocab_size'] = len(vocab)
                modelIndex = modelIndex + 1
            params['WVModel']['emb_dimension'] *= len(folderSet)
            # return
        else:
            folder = params['WVFolder']
            params['WVFile'] = folder + '/' + 'words' + str(params['WVModel']['emb_dimension']) + '.npy'
            params['WVVocabFile'] = folder + '/' + 'words' + str(params['WVModel']['emb_dimension']) + '.vocab'

            # load word vectors
            wvnp = np.load(params['WVFile'])
            params['WVModel']['emb_dimension'] = len(wvnp[0])

            # load vocab
            with open(params['WVVocabFile']) as f:
                vocab = [line.strip() for line in f if len(line) > 0]
            wi = dict([(a, i) for i, a in enumerate(vocab)])

            wv = numpy.zeros((vocsize + 1, params['WVModel']['emb_dimension']))
            random_v = math.sqrt(6.0 / numpy.sum(params['WVModel']['emb_dimension'])) * \
                numpy.random.uniform(-1.0, 1.0, (params['WVModel']['emb_dimension']))

            miss = 0  # the number of missing words in the pre-trained word embeddings
            for i in range(0, vocsize):
                word = idx2word[i]
                if word in wi:
                    wv[i] = wvnp[wi[word]]
                    # print wvnp[wi[word]]
                else:
                    wv[i] = random_v
                    miss += 1
            print("missing words rate : ", miss, '/', vocsize)
            params['WVModel']['vocab_size'] = len(vocab)

    print(json.dumps(params, sort_keys=True, indent=4, separators=(',', ': ')))

    rhoSuffix = "%_forward"
    best_valid = {}
    best_test = {}
    for i_rho in range(len(rhoList)):
        best_valid[str(rhoList[i_rho]) + rhoSuffix] = -numpy.inf
        best_test[str(rhoList[i_rho]) + rhoSuffix] = -numpy.inf
    validMeasureList = {}
    testMeasureList = {}  # this is used for drawing a line chart
    for i_rho in range(len(rhoList)):
        validMeasureList[str(rhoList[i_rho]) + rhoSuffix] = []
        testMeasureList[str(rhoList[i_rho]) + rhoSuffix] = []

    # instantiate the model
    numpy.random.seed(params['seed'])
    random.seed(params['seed'])
    rnn = elman_attention.model(nh=params['nhidden'],
                                nc=nclasses,
                                ne=vocsize,
                                de=params['WVModel']['emb_dimension'],
                                attention=params['attention'],
                                h_win=(params['h_win_left'], params['h_win_right']),
                                lvrg=params['lvrg'],
                                wv=wv)

    # train
    for e in range(params['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], params['seed'])
        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i])
            labels = train_y[i]
            nl, aaL = rnn.train(cwords, labels, params['dropRate'], 1)
            # rnn.normalize()
            if params['verbose']:
                sys.stdout.write(('\r[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / nsentences) +
                                  (' average speed in %.2f (min) <<' % ((time.time() - tic) / 60 / (i + 1) * nsentences)) +
                                  (' completed in %.2f (sec) <<' % (time.time() - tic))))
                sys.stdout.flush()

        print('start test', time.time() / 60)
        print('start pred train', time.time() / 60)
        predictions_train = [[map(lambda variable: idx2label[variable], w)
                              for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'),
                                                    params['dropRate'], 0, rhoList)]
                             for x in train_lex]
        predictions_test = [[map(lambda variable: idx2label[variable], w)
                             for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'),
                                                   params['dropRate'], 0, rhoList)]
                            for x in test_lex]
        predictions_valid = [[map(lambda variable: idx2label[variable], w)
                              for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'),
                                                    params['dropRate'], 0, rhoList)]
                             for x in valid_lex]

        for i_rho in range(len(rhoList)):
            groundtruth_train = [map(lambda x: idx2label[x], y) for y in train_y]
            words_train = [map(lambda x: idx2word[x], w) for w in train_lex]
            groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
            words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
            groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
            words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

            ptrain = [p[i_rho] for p in predictions_train]
            ptest = [p[i_rho] for p in predictions_test]
            pvalid = [p[i_rho] for p in predictions_valid]

            res_train = conlleval(ptrain, groundtruth_train, words_train,
                                  folder + '/current.train.txt' + str(i_rho) + str(params['seed']),
                                  eval_options)
            res_test = conlleval(ptest, groundtruth_test, words_test,
                                 folder + '/current.test.txt' + str(i_rho) + str(params['seed']),
                                 eval_options)
            res_valid = conlleval(pvalid, groundtruth_valid, words_valid,
                                  folder + '/current.valid.txt' + str(i_rho) + str(params['seed']),
                                  eval_options)

            print(' epoch', e, ' rhoList ', i_rho,
                  ' train p', res_train['p'], 'valid p', res_valid['p'],
                  ' train r', res_train['r'], 'valid r', res_valid['r'],
                  ' train ', params['measure'], res_train['measure'],
                  'valid ', params['measure'], res_valid['measure'],
                  'best test ', params['measure'], res_test['measure'],
                  ' ' * 20)

            validMeasureList[str(rhoList[i_rho]) + rhoSuffix].append(res_valid['measure'])
            testMeasureList[str(rhoList[i_rho]) + rhoSuffix].append(res_test['measure'])

            if res_valid['measure'] > best_valid[str(rhoList[i_rho]) + rhoSuffix]:
                best_valid[str(rhoList[i_rho]) + rhoSuffix] = res_valid['measure']
                best_test[str(rhoList[i_rho]) + rhoSuffix] = res_test['measure']

        for i_rho in range(len(rhoList)):  # this is used for drawing a line chart
            print(i_rho, params['dataset'], end=' ')
            for v in testMeasureList[str(rhoList[i_rho]) + rhoSuffix]:
                print(v, end=' ')
            print('')

        for i_rho in range(len(rhoList)):
            print('current best results', rhoList[i_rho], ' ',
                  best_valid[str(rhoList[i_rho]) + rhoSuffix], '/',
                  best_test[str(rhoList[i_rho]) + rhoSuffix])

    end_time = time.time()

    with open(params['JSONOutputFile'], 'w') as outputFile:
        params['results'] = {}
        params['results']['best_valid_' + params['measure']] = best_valid
        params['results']['best_test_' + params['measure']] = best_test
        params['results']['valid_' + params['measure'] + 'ListBasedOnEpochs'] = validMeasureList
        params['results']['test_' + params['measure'] + 'ListBasedOnEpochs'] = testMeasureList

        params['running_time'] = {}
        params['running_time']['start'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
        params['running_time']['end'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time))
        params['running_time']['duration'] = end_time - start_time

        res = json.dump(params, outputFile, sort_keys=True, indent=4, separators=(',', ': '))
        print(res)
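# A hypothetical configuration for run() above, built only from the keys the
# function actually reads; the concrete values are illustrative and not the
# authors' settings.
example_params = {
    'dataset': 'atis',        # one of: atis, ner, chunk, pos
    'fold': 3,
    'seed': 345,
    'nepochs': 25,
    'verbose': 1,
    'nhidden': 100,
    'attention': 'general',   # passed straight through to elman_attention.model
    'h_win_left': 0,
    'h_win_right': 0,
    'lvrg': 0,
    'dropRate': 0.5,
    'WVFolder': 'random',     # 'random' skips loading pre-trained vectors
    'WVModel': {'emb_dimension': 100},
    'JSONOutputFile': 'results.json',
}
# run(example_params)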
            sent = sent[np.newaxis, :]
            # if sent.shape[1] > 1:  # some bug in keras
            #     loss = model.train_on_batch(sent, label)
            #     avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            train_pred_label.append(pred)

        avgLoss = avgLoss / n_batch

        predword_train = [list(map(lambda x: idx2la[x], y)) for y in train_pred_label]
        con_dict = conlleval(predword_train, groundtruth_train, words_train, 'r.txt')
        train_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

        print("Validating =>")

        val_pred_label = []
        avgLoss = 0

        bar = progressbar.ProgressBar(max_value=len(val_x))
        for n_batch, sent in bar(enumerate(val_x)):
            label = val_label[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]
            label = train_label[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]
            if sent.shape[1] > 1:  # some bug in keras
                loss = model.train_on_batch(sent, label)
                avgLoss += loss
            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            train_pred_label.append(pred)

        avgLoss = avgLoss / n_batch

        predword_train = [list(map(lambda x: idx2la[x], y)) for y in train_pred_label]
        con_dict = conlleval(predword_train, groundtruth_train, words_train, 'r.txt')
        train_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

        print("Validating =>")

        val_pred_label = []
        avgLoss = 0

        bar = progressbar.ProgressBar(max_value=len(val_x))
        for n_batch, sent in bar(enumerate(val_x)):
            label = val_label[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]
def prepare_data():
    """Prepare the data"""
    conf = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': True,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 300,  # dimension of word embedding
        'nepochs': 50
    }
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])

    session_files = get_session_files(number_of_files=None, random_seed=conf['seed'])  # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []

    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]

    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
        # token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(np.fromiter((words2idx.get(token, unknown) for token in token_list), dtype=np.int32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    idx2label = dict((k, v) for v, k in LABELS2IDX.iteritems())  # Reverse the dictionary
    idx2word = dict((k, v) for v, k in words2idx.iteritems())  # Reverse the dictionary
    vocsize = len(idx2word)
    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME, binary=True)  # C binary format

    print "Calculate word embeddings"
    # add one row for PADDING at the end @UndefinedVariable
    embeddings = 0.2 * np.random.uniform(-1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(theano.config.floatX)
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx]  # Keep it random
        embeddings[idx] = embedding
    del word2vec  # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                    nc=nclasses,
                    ne=vocsize,
                    de=conf['emb_dimension'],
                    cs=conf['win'],
                    embeddings=embeddings)

    # train with early stopping on the validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                # rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ' : epoch', epoch, 'valid F1', res_valid['f1'], ' test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], 'with the model', folder