def run_on_ollie_dataset(iob_ollie_dataset_path, use_cross_validation):
    settings = {'partial_training': 0.8,
                'partial_testing': 0.2,
                'fold': 10,  # number of cross-validation folds
                'lr': 0.05,
                'verbose': 1,
                'decay': False,  # decay on the learning rate if improvement stops
                'win': 7,  # number of words in the context window
                'bs': 9,  # number of backprop through time steps
                'nhidden': 100,  # number of hidden units
                'seed': 345,
                'emb_dimension': 100,  # dimension of word embedding
                'nepochs': 50}

    # iob_ollie_dataset_file = open(iob_ollie_dataset_path, 'r')
    indices = create_word2ind(iob_ollie_dataset_path)
    words_index = indices['wordIndex']
    labels_index = indices['labelIndex']
    word2index = words_index.getCurrentIndex()
    index2word = words_index.getIndex2Word()
    label2index = labels_index.getCurrentIndex()
    index2label = labels_index.getIndex2Word()
    vocsize = len(word2index)
    nclasses = len(label2index)

    new_network_folder = datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M')
    rnn, model_folder = create_network(settings, nclasses, vocsize, new_network_folder)
    print('RNN model created and saved under %s' % model_folder)

    labeled_data, labeled_data_size = get_labeled_data(iob_ollie_dataset_path)
    print('Labeled data size for articles: ', labeled_data_size)
    sentences_list, labels_list = labeled_data.getData()
    while [] in sentences_list:
        print('Empty sentences were found. They will be removed')
        empty = sentences_list.index([])
        sentences_list.pop(empty)
        labels_list.pop(empty)
    assert len(sentences_list) == len(labels_list)
    number_labeled_sentences = len(sentences_list)

    print('The training phase of the RNN model on the Ollie dataset will begin now')
    rnn = rnn.load(model_folder)

    #########################################################
    # training with consideration to parameters in settings #
    #########################################################
    if not use_cross_validation:
        print('No cross-validation techniques will be used in this training process')
        shuffle([sentences_list, labels_list], settings['seed'])
        training_size = int(math.floor(settings['partial_training'] * number_labeled_sentences))
        testing_size = int(math.floor(settings['partial_testing'] * number_labeled_sentences))
        print('Training size: [0:{0}] = {0}'.format(training_size))
        train_sentences = sentences_list[0:training_size]
        train_labels = labels_list[0:training_size]
        print('Testing size: [{0}:{1}] = {2}'.format(training_size, training_size + testing_size, testing_size))
        test_sentences = sentences_list[training_size:training_size + testing_size]
        test_labels = labels_list[training_size:training_size + testing_size]
    else:
        print('Cross-validation will be used')

    ####################
    # training process #
    ####################
    # number_train_sentences = len(train_sentences)
    # number_train_labels_toGuess = sum([len(x) for x in test_labels])
    # print('Starting training with {0} labeled sentences in total for {1} epochs.'
    #       .format(number_train_sentences, settings['nepochs']))
    best_accuracy = -numpy.inf
    current_learning_rate = settings['lr']
    best_epoch = 0
    f1_of_best_acc = 0
    conf_mat_of_best_acc = None
    for e in range(0, settings['nepochs']):
        print('Epoch {0}'.format(e))
        print('----------------------------------------------')
        if use_cross_validation:
            ####################
            # validation phase #
            ####################
            print('Validation phase in process')
            shuffle([sentences_list, labels_list], settings['seed'])
            divide_in_folds = lambda lst, sz: [lst[i:i + sz] for i in range(0, len(lst), sz)]
            if len(sentences_list) % settings['fold'] == 0:
                size_of_fold = math.floor(len(sentences_list) / settings['fold'])
            else:
                size_of_fold = math.floor(len(sentences_list) / settings['fold']) + 1
            sentences_in_folds = divide_in_folds(sentences_list, size_of_fold)
            labels_in_folds = divide_in_folds(labels_list, size_of_fold)
            assert len(sentences_in_folds) == settings['fold']
            assert len(sentences_in_folds) == len(labels_in_folds)
            all_validation_accuracies = []
            for j in range(0, len(sentences_in_folds)):
                ex_tr_sent = sentences_in_folds[:]
                ex_tr_labels = labels_in_folds[:]
                # val_sent = sentences_in_folds[j]
                # val_labels = labels_in_folds[j]
                # assert len(val_sent) == len(val_labels)
                val_sent = ex_tr_sent.pop(j)
                val_labels = ex_tr_labels.pop(j)
                assert len(val_sent) == len(val_labels)
                assert len(ex_tr_sent) == len(ex_tr_labels)
                tr_sent = []
                tr_labels = []
                for c in range(0, len(ex_tr_sent)):
                    tr_sent.extend(ex_tr_sent[c])
                    tr_labels.extend(ex_tr_labels[c])
                assert len(tr_sent) == len(tr_labels)
                train_dict = {'sentences': tr_sent, 'labels': tr_labels}
                validation_dict = {'sentences': val_sent, 'labels': val_labels}
                print('Training the fold number %i will begin now' % (j + 1))
                [current_validation_accuracy, f1, conf_mat] = get_accuracy(
                    rnn, train_dict, validation_dict, word2index, label2index,
                    settings, current_learning_rate, e, index2word, is_validation=True)
                all_validation_accuracies.append(current_validation_accuracy)
            assert len(all_validation_accuracies) == settings['fold']
            mean_validation = sum(all_validation_accuracies) / len(all_validation_accuracies)
            if mean_validation > best_accuracy:
                best_accuracy = mean_validation
                f1_of_best_acc = f1
                conf_mat_of_best_acc = conf_mat
                print('New best validation accuracy: %2.2f%%' % best_accuracy)
                # rnn.save(model_folder)
                print('A new RNN has been saved.')
            else:
                print('Validation phase did not come up with a better accuracy (only %2.2f%%). '
                      'A new epoch will begin' % mean_validation)
                # rnn = rnn.load(model_folder)
                # continue
        ##################
        # Training phase #
        ##################
        else:
            shuffle([train_sentences, train_labels], settings['seed'])
            print('Training in progress')
            # rnn = rnn.load(model_folder)
            # print('RNN saved during the validation phase has been loaded')
            training_dict = {'sentences': train_sentences, 'labels': train_labels}
            testing_dict = {'sentences': test_sentences, 'labels': test_labels}
            [testing_accuracy, f1, conf_mat] = get_accuracy(
                rnn, training_dict, testing_dict, word2index, label2index,
                settings, current_learning_rate, e, index2word, is_validation=False)
            print('Accuracy during the testing phase (number of correctly guessed labels) at %2.2f%%.'
                  % testing_accuracy)
            # check whether the current epoch is the best
            if testing_accuracy > best_accuracy:
                best_accuracy = testing_accuracy
                best_epoch = e
                f1_of_best_acc = f1
                conf_mat_of_best_acc = conf_mat
                rnn.save(model_folder)
                print('Better testing accuracy!')
            else:
                rnn = rnn.load(model_folder)
        if abs(best_epoch - e) >= 5:
            current_learning_rate *= 0.5
        if current_learning_rate < 1e-5:
            break

    print('BEST RESULT: epoch ', best_epoch, 'with best accuracy: ', best_accuracy, '.')
    # iob_ollie_dataset_file.close()
    with open('perf.pck', 'wb') as perf_file:
        pickle.dump([best_accuracy, f1_of_best_acc, conf_mat_of_best_acc], perf_file)

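# The scripts in this collection call a shuffle(list_of_lists, seed) helper that
# shuffles several parallel lists in place with the same seed, so that sentences
# stay aligned with their labels across epochs. The helper itself is not part of
# this excerpt; a minimal sketch consistent with how it is called could look
# like this (the version actually imported by each script may differ):
import random

def shuffle(list_of_lists, seed):
    """Shuffle each list in place, reseeding before each one so that all
    parallel lists end up in the same order. Sketch only."""
    for l in list_of_lists:
        random.seed(seed)
        random.shuffle(l)
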
def trainingSession(model, best_f1):
    for epoch in range(epochs_num):
        # Shuffle
        tools.shuffle([train_lex, train_ne, train_y], seed)
        tic = time.time()
        train_loss = np.inf
        # Train
        for i in range(nsentences):
            labels = train_y[i]
            train_x = np.array([train_lex[i]])
            temp_loss, _ = sess.run([total_loss, train_op],
                                    feed_dict={x_input: train_x, y_labels: labels})
            if train_loss > temp_loss:
                train_loss = temp_loss
            if verbose:
                print("[Learning] Epoch %i >> %2.2f%%" % (epoch + 1, (i + 1) * 100. / nsentences),
                      "completed in %.2f (sec) <<\r" % (time.time() - tic))
        print("[Learning] Epoch %i Loss %2.2f" % (epoch + 1, train_loss * 100000))

        # =============================#
        #          Evaluation          #
        #  back into the real world:   #
        #         idx -> words         #
        # =============================#
        print("Evaluating...")
        istraining = False
        res_test, res_valid = evaluate(sess, model)
        istraining = True
        if res_valid['f1'] > best_f1:
            # Save the variables to disk.
            # save_path = saver.save(sess, "/tmp/model.ckpt")
            # print("Model saved in path: %s" % save_path)
            best_f1 = res_valid['f1']
            print("NEW BEST: epoch", epoch + 1, "valid F1", res_valid['f1'],
                  "best test F1", res_test['f1'], " " * 20)
            vf1, vp, vr = res_valid['f1'], res_valid['p'], res_valid['r']
            tf1, tp, tr = res_test['f1'], res_test['p'], res_test['r']
            best_epoch = epoch
            subprocess.call(['mv', folder + "current.test.txt", folder + "best.test.txt"])
            subprocess.call(['mv', folder + "current.valid.txt", folder + "best.valid.txt"])
        else:
            print()
        # if preTraining and train_loss < 0.5:
        #     break

    # print the best result
    print('BEST RESULT: epoch', best_epoch + 1, 'valid F1', vf1,
          'best test F1', tf1, 'with the model', folder)
    return best_f1

                     init=initialize,
                     featdim=14)
    else:
        print "Invalid RNN type: ", rnn_type
        sys.exit(-1)

    # create a folder to store the models
    if not os.path.exists(model_folder):
        os.mkdir(model_folder)

    # train with early stopping on the validation set
    best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
    s['clr'] = s['lr']  # learning rate
    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_y, train_feat], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in xrange(num_sentences):
            # list of lists of indexes corresponding to the context windows
            # surrounding each word in the sentence
            context_words = contextwin(train_lex[i], s['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(context_words, s['bs']))
            features = minibatch(train_feat[i], s['bs'])
            labels = train_y[i]
            for word_batch, feature_batch, label_last_word in zip(words, features, labels):

def train(dim_word=100,  # word vector dimensionality
          dim_char=10,  # character vector dimensionality
          max_char=10,  # maximum number of characters per word
          dim=100,  # the number of LSTM units
          win=5,  # window size
          bs=5,  # number of backprop through time steps
          seed=123,
          verbose=1,
          use_model='GRU',  # choose the model from LSTM, DEEPLSTM, RNN
          patience=10,  # early stopping patience
          max_epochs=50,
          lrate=0.0005,  # learning rate
          maxlen=100,  # maximum length of the description
          data_train=['data/qe/train/train.src.lc',
                      'data/qe/train/train.mt.lc',
                      'data/qe/train/train.align'],
          data_train_y='data/qe/train/train.tags',
          data_valid=['data/qe/dev/dev.src.lc',
                      'data/qe/dev/dev.mt.lc',
                      'data/qe/dev/dev.align'],
          data_valid_y='data/qe/dev/dev.tags',
          data_test=['data/qe/test/test.src.lc',
                     'data/qe/test/test.mt.lc',
                     'data/qe/test/test.align'],
          data_test_y='data/qe/test/test.tags',
          dictionaries=['data/qe/train/train.src.lc.json',
                        'data/qe/train/train.mt.lc.json'],
          character2index=['data/qe/train/train.src.lc.dict_char.json',
                           'data/qe/train/train.mt.lc.dict_char.json'],
          label2index='data/qe/train/train.tags.json',
          embeddings=['data/qe/pretrain/ep_qe.en.vector.txt',
                      'data/qe/pretrain/ep_qe.de.vector.txt'],
          use_adadelta=False,
          use_bilingual=False,
          use_pretrain=False,
          use_quest=False,
          use_tag=False,
          use_char=False,
          saveto=False,
          shuffle_each_epoch=True,
          load_data=None,
          ):
    model_options = OrderedDict(sorted(locals().copy().items()))
    print 'Model_Options:', model_options

    model_name = model_options['use_model'][0]
    if model_options['use_adadelta']:
        model_name += '_adadelta'
    if model_options['use_char']:
        model_name += '_char'
    if model_options['use_bilingual']:
        model_name += '_bilingual'
    if model_options['use_pretrain']:
        model_name += '_pretrain'
    print 'Using model:', model_name

    processed_data = []
    if load_data:
        with gzip.open(load_data[0], 'rb') as fp:
            processed_data = cPickle.load(fp)
    else:
        processed_data = preprocess_data(
            data_train=model_options['data_train'],
            data_train_y=model_options['data_train_y'][0],
            data_valid=model_options['data_valid'],
            data_valid_y=model_options['data_valid_y'][0],
            data_test=model_options['data_test'],
            data_test_y=model_options['data_test_y'][0],
            dictionaries=model_options['dictionaries'],
            character2index=model_options['character2index'],
            label2index=model_options['label2index'][0],
            embeddings=model_options['embeddings'],
            use_bilingual=model_options['use_bilingual'],
            use_char=model_options['use_char'],
            use_pretrain=model_options['use_pretrain'])

    # saving the model/data with model_name
    save_data = folder = ''
    if use_tag:
        save_data = 'tag.data_' + model_name + '.pkl.gz'
        folder = 'tag.' + model_name
    if use_quest:
        save_data = 'quest.data_' + model_name + '.pkl.gz'
        folder = 'quest.' + model_name
    if saveto:
        with gzip.open(save_data, 'wb') as fp:
            cPickle.dump(processed_data, fp)
    if not os.path.exists(folder):
        os.mkdir(folder)

    train, train_y, test, test_y, valid, valid_y, w2idxs, char2idxs, label2idxs, embs = processed_data
    idx2label = dict((k, v) for v, k in label2idxs.iteritems())
    # print len(train), len(test), len(valid)
    vocsize_s = vocsize_t = vocsize_schar = vocsize_tchar = 0
    emb_s, emb_t, train_s, train_schar, train_t, train_tchar, test_s, test_schar, \
        test_t, test_tchar, valid_s, valid_schar, valid_t, valid_tchar = ([] for i in range(14))

    if (use_bilingual or len(train) == 4) and use_char:
        emb_s, emb_t = embs
        train_s, train_t, train_schar, train_tchar = train
        test_s, test_t, test_schar, test_tchar = test
        valid_s, valid_t, valid_schar, valid_tchar = valid
        vocsize_s = len(w2idxs[0])
        vocsize_t = len(w2idxs[1])
        vocsize_schar = len(char2idxs[0])
        vocsize_tchar = len(char2idxs[1])
    elif use_char:
        emb_t = embs[0]
        train_t, train_tchar = train
        test_t, test_tchar = test
        valid_t, valid_tchar = valid
        vocsize_t = len(w2idxs[0])
        vocsize_tchar = len(char2idxs[0])
    elif use_bilingual or len(train) == 2:
        emb_s, emb_t = embs
        train_s, train_t = train
        test_s, test_t = test
        valid_s, valid_t = valid
        vocsize_s = len(w2idxs[0])
        vocsize_t = len(w2idxs[1])
    else:
        emb_t = embs[0]
        train_t = train[0]
        test_t = test[0]
        valid_t = valid[0]
        vocsize_t = len(w2idxs[0])

    nclasses = len(label2idxs)
    nsentences = len(train_t)

    numpy.random.seed(model_options['seed'])
    # instantiate the model
    rnn = select_model[model_name](nh=model_options['dim'],
                                   nc=nclasses,
                                   de=model_options['dim_word'],
                                   cs=model_options['win'],
                                   de_char=model_options['dim_char'],
                                   ne_char=vocsize_tchar,
                                   ne_src=vocsize_s,
                                   ne_tgt=vocsize_t,
                                   emb_src=emb_s,
                                   emb_tgt=emb_t,
                                   max_char=model_options['max_char'])

    # train with early stopping on the validation set
    best_f1 = -numpy.inf
    model_options['patience'] = 2
    batch_size = (nsentences / 100) * 10
    n_batches = nsentences // batch_size
    print n_batches

    for e in xrange(model_options['max_epochs']):
        model_options['ce'] = e
        # shuffle
        if shuffle_each_epoch:
            shuffle([train_t, train_s, train_tchar, train_y], model_options['seed'])
        tic = time.time()
        for k in xrange(n_batches):
            # create the batches
            batch_train_s = []
            batch_train_char = []
            if model_options['use_bilingual']:
                batch_train_s = train_s[k * batch_size:(k + 1) * batch_size]
            if model_options['use_char']:
                batch_train_char = train_tchar[k * batch_size:(k + 1) * batch_size]
            batch_train_t = train_t[k * batch_size:(k + 1) * batch_size]
            batch_train_y = train_y[k * batch_size:(k + 1) * batch_size]
            batch_err = 0
            for i in xrange(batch_size):
                cwords_src = []
                padded_chars = []
                if model_options['use_bilingual']:
                    cwords_src = contextwin(batch_train_s[i], model_options['win'])
                if model_options['use_char']:
                    padded_chars = add_padding(batch_train_char[i], model_options['max_char'])
                    # print batch_train_char[0]
                    # print padded_chars
                cwords_tgt = contextwin(batch_train_t[i], model_options['win'])
                labels = batch_train_y[i]
                if model_options['use_bilingual'] and model_options['use_char']:
                    err = rnn.train_grad_shared(cwords_src, cwords_tgt, padded_chars,
                                                labels, model_options['lrate'])
                elif model_options['use_char']:
                    err = rnn.train_grad_shared(cwords_tgt, padded_chars, labels,
                                                model_options['lrate'])
                elif model_options['use_bilingual']:
                    err = rnn.train_grad_shared(cwords_src, cwords_tgt, labels,
                                                model_options['lrate'])
                elif model_options['use_adadelta']:
                    err = rnn.train_grad_shared(cwords_tgt, labels, model_options['lrate'])
                else:
                    err = rnn.train(cwords_tgt, labels, model_options['lrate'])
                if model_options['use_adadelta']:
                    rnn.train_update(model_options['lrate'])
                rnn.normalize()
                if model_options['verbose']:
                    print '[learning] epoch %i batch %i >> %2.2f%%' % (e, k, (i + 1) * 100. / batch_size), \
                        'completed in %.2f (sec) <<\r' % (time.time() - tic),
                    sys.stdout.flush()

            if k % model_options['patience'] == 0:
                predictions_test, groundtruth_test, predictions_valid, \
                    groundtruth_valid = ([] for i in range(4))
                if model_options['use_bilingual'] and model_options['use_char']:
                    predictions_test = [map(lambda x: idx2label[x],
                                            rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32'),
                                                         numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
                                                         numpy.asarray(add_padding(__x, model_options['max_char'])).astype('int32')))
                                        for x, _x, __x in zip(test_s, test_t, test_tchar)]
                    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
                    # words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
                    predictions_valid = [map(lambda x: idx2label[x],
                                             rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32'),
                                                          numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
                                                          numpy.asarray(add_padding(__x, model_options['max_char'])).astype('int32')))
                                         for x, _x, __x in zip(valid_s, valid_t, valid_tchar)]
                    groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
                elif model_options['use_bilingual']:
                    # evaluation // back into the real world: idx -> words
                    predictions_test = [map(lambda x: idx2label[x],
                                            rnn.classify(numpy.asarray(contextwin(x_src, model_options['win'])).astype('int32'),
                                                         numpy.asarray(contextwin(x_tgt, model_options['win'])).astype('int32')))
                                        for x_src, x_tgt in zip(test_s, test_t)]
                    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
                    # words_test = [map(lambda x: idx2word_de[x], w) for w in test_lex]
                    predictions_valid = [map(lambda x: idx2label[x],
                                             rnn.classify(numpy.asarray(contextwin(x_src, model_options['win'])).astype('int32'),
                                                          numpy.asarray(contextwin(x_tgt, model_options['win'])).astype('int32')))
                                         for x_src, x_tgt in zip(valid_s, valid_t)]
                    groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
                    # words_valid = [map(lambda x: idx2word_de[x], w) for w in valid_lex]
                elif model_options['use_char']:
                    predictions_test = [map(lambda x: idx2label[x],
                                            rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32'),
                                                         numpy.asarray(add_padding(_x, model_options['max_char'])).astype('int32')))
                                        for x, _x in zip(test_t, test_tchar)]
                    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
                    # words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
                    predictions_valid = [map(lambda x: idx2label[x],
                                             rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32'),
                                                          numpy.asarray(add_padding(_x, model_options['max_char'])).astype('int32')))
                                         for x, _x in zip(valid_t, valid_tchar)]
                    groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
                else:
                    # evaluation // back into the real world: idx -> words
                    predictions_test = [map(lambda x: idx2label[x],
                                            rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32')))
                                        for x in test_t]
                    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
                    # words_test = [map(lambda x: idx2word[x], w) for w in test_t]
                    predictions_valid = [map(lambda x: idx2label[x],
                                             rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32')))
                                         for x in valid_t]
                    groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
                    # words_valid = [map(lambda x: idx2word[x], w) for w in valid_t]

                # evaluation // compute the accuracy using conlleval.pl
                res_test = []
                res_valid = []
                current_score = 0
                if model_options['use_quest']:
                    res_test = wmt_eval(predictions_test, groundtruth_test, folder + '/current.test.txt')
                    res_valid = wmt_eval(predictions_valid, groundtruth_valid, folder + '/current.valid.txt')
                    current_score = res_valid[2][0]
                if model_options['use_tag']:
                    res_test = icon_eval(predictions_test, groundtruth_test, folder + '/current.test.txt')
                    res_valid = icon_eval(predictions_valid, groundtruth_valid, folder + '/current.valid.txt')
                    current_score = res_valid[1]

                if current_score > best_f1:
                    # save the model and model parameters
                    rnn.save(folder)
                    filename = folder + '/model'
                    with open('%s.json' % filename, 'wb') as f:
                        json.dump(model_options, f, indent=2)
                    best_f1 = current_score
                    if model_options['verbose']:
                        print 'NEW BEST: epoch', e, 'valid F1', res_valid, 'test F1', res_test, ' ' * 20
                    model_options['be'] = e
                    subprocess.call(['mv', folder + '/current.test.txt.hyp', folder + '/best.test.txt'])
                    subprocess.call(['mv', folder + '/current.valid.txt.hyp', folder + '/best.valid.txt'])
                else:
                    print ''

        # break if no improvement in 10 epochs
        if abs(model_options['be'] - model_options['ce']) >= 10:
            break

    print 'BEST RESULT: epoch', model_options['be'], 'valid F1', best_f1, 'with the model', folder

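# train() above also relies on an add_padding(char_indices, max_char) helper
# that is not defined in this excerpt. A plausible sketch, assuming index 0 is
# the padding symbol (the real padding index depends on how character2index
# was built):
def add_padding(char_idx_lists, max_char, pad_idx=0):
    """Truncate or right-pad every word's character-index list to exactly
    max_char entries. Sketch only -- pad_idx=0 is an assumption."""
    return [list(w)[:max_char] + [pad_idx] * max(0, max_char - len(w))
            for w in char_idx_lists]
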
em=dataset["embeddings"], init=initialize) else: print "Invalid RNN type: ", rnn_type sys.exit(-1) # create a folder for store the models if not os.path.exists(model_folder): os.mkdir(model_folder) # train with early stopping on validation set best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf s['clr'] = s['lr'] # learning rate for e in xrange(s['nepochs']): # shuffle shuffle([train_lex, train_y, train_feat], s['seed']) s['ce'] = e tic = time.time() for i in xrange(num_sentences): context_words = contextwin(train_lex[i], s['win']) words = map(lambda x: numpy.asarray(x).astype('int32'), minibatch(context_words, s['bs'])) features = minibatch(train_feat[i], s['bs']) labels = train_y[i] for word_batch, feature_batch, label_last_word in zip(words, features, labels): rnn.train(word_batch, feature_batch, label_last_word, s['clr']) rnn.normalize() if s['verbose']:
    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model(nh=s['nhidden'],
                nc=nclasses,
                ne=vocsize,
                de=s['emb_dimension'],
                cs=s['win'])

    # train with early stopping on the validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in range(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(cwords, s['bs']))
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()
            if s['verbose']:
                print('[learning] epoch {} >> {:2.2f}, completed in {:.2f} (sec) '

def main():
    settings = {
        'fold': 3,  # 5 folds: 0, 1, 2, 3, 4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(settings['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(settings['seed'])
    random.seed(settings['seed'])
    if LOAD:
        print "Loading model from %s..." % folder
        rnn = ElmanRNNModel.load(folder)
    else:
        rnn = ElmanRNNModel(hidden_dims=settings['nhidden'],
                            num_classes=nclasses,
                            vocab_size=vocsize,
                            embed_dims=settings['emb_dimension'],
                            context_size=settings['win'])

    # train with early stopping on the validation set
    best_f1 = -numpy.inf
    settings['current_lr'] = settings['lr']
    for e in xrange(settings['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], settings['seed'])
        settings['current_epoch'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], settings['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(cwords, settings['bs']))
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, settings['current_lr'])
                rnn.normalize()
            if settings['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world: idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [map(lambda idx: idx2label[idx],
                                 rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if settings['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], \
                    'best test F1', res_test['f1'], ' ' * 20
            settings['vf1'], settings['vp'], settings['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            settings['tf1'], settings['tp'], settings['tr'] = res_test['f1'], res_test['p'], res_test['r']
            settings['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        if settings['decay'] and abs(settings['be'] - settings['current_epoch']) >= 10:
            settings['current_lr'] *= 0.5
        if settings['current_lr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', settings['be'], 'valid F1', settings['vf1'], \
        'best test F1', settings['tf1'], 'with the model', folder

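# Most of these scripts share the contextwin/minibatch helpers popularized by
# the Theano RNN slot-filling tutorial. For reference, a sketch of both (the
# copies bundled with each repository may differ in detail):
def contextwin(l, win):
    """For each word index in sentence l, return the window of `win`
    surrounding indices, padding with -1 at both ends. win must be odd."""
    assert win % 2 == 1 and win >= 1
    l = list(l)
    lpadded = (win // 2) * [-1] + l + (win // 2) * [-1]
    out = [lpadded[i:i + win] for i in range(len(l))]
    assert len(out) == len(l)
    return out

def minibatch(l, bs):
    """Return one batch per word: all prefixes of l up to length bs, then
    sliding windows of length bs, as expected by rnn.train."""
    out = [l[:i] for i in range(1, min(bs, len(l) + 1))]
    out += [l[i - bs:i] for i in range(bs, len(l) + 1)]
    assert len(l) == len(out)
    return out
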
def run_process(articles, use_cross_validation):
    settings = {
        'partial_training': 0.8,
        'partial_testing': 0.2,
        'fold': 10,  # number of cross-validation folds
        'lr': 0.05,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    indices = create_word2ind(articles)
    word_index = indices['wordIndex']
    label_index = indices['labelIndex']
    word2index = word_index.getCurrentIndex()
    index2word = word_index.getIndex2Word()
    label2index = label_index.getCurrentIndex()
    index2label = label_index.getIndex2Word()
    vocsize = len(word2index)
    nclasses = len(label2index)

    new_network_folder = datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M')
    rnn, model_folder = create_network(settings, nclasses, vocsize, new_network_folder)
    print('RNN model created and saved under %s' % model_folder)

    labeled_data, labeled_data_size_for_each_article = get_labeled_data(articles)
    print('Labeled data sizes for articles: ', labeled_data_size_for_each_article)
    sentences_list, labels_list = labeled_data.getData()
    while [] in sentences_list:
        print('Empty sentences found. They will be removed')
        empty = sentences_list.index([])
        sentences_list.pop(empty)
        labels_list.pop(empty)
    assert len(sentences_list) == len(labels_list)
    number_labeled_sentences = len(sentences_list)

    # for i in range(0, len(articles)):
    #     article = articles[i]
    print('Training for ', articles, ' will begin now')
    rnn = rnn.load(model_folder)
    # use_cross_validation = False

    ###############################################
    # specific articles for training and testing #
    ###############################################
    # train_sentences = sentences_list[0:labeled_data_size_for_each_article[2]]
    # train_labels = labels_list[0:labeled_data_size_for_each_article[2]]
    #
    # test_sentences = sentences_list[labeled_data_size_for_each_article[2]:]
    # test_labels = labels_list[labeled_data_size_for_each_article[2]:]
    # print('Training + validation size: [0:{0}]={0}'.format(labeled_data_size_for_each_article[2]))
    # print('Testing size: [{0}:{1}]={2}'.format(labeled_data_size_for_each_article[2], len(sentences_list),
    #                                            len(sentences_list) - labeled_data_size_for_each_article[2]))

    ############################################################
    # training and testing according to parameters in settings #
    ############################################################
    if not use_cross_validation:
        print('No cross-validation techniques will be used in this training process')
        shuffle([sentences_list, labels_list], settings['seed'])
        training_size = int(math.floor(settings['partial_training'] * number_labeled_sentences))
        testing_size = int(math.floor(settings['partial_testing'] * number_labeled_sentences))
        print('Training size: [0:{0}] = {0}'.format(training_size))
        train_sentences = sentences_list[0:training_size]
        train_labels = labels_list[0:training_size]
        print('Testing size: [{0}:{1}] = {2}'.format(training_size, training_size + testing_size, testing_size))
        test_sentences = sentences_list[training_size:training_size + testing_size]
        test_labels = labels_list[training_size:training_size + testing_size]

        ####################
        # training process #
        ####################
        number_train_sentences = len(train_sentences)
        number_train_labels_toGuess = sum([len(x) for x in test_labels])
        print('Starting training with {0} labeled sentences in total for {1} epochs.'
              .format(number_train_sentences, settings['nepochs']))

    best_accuracy = -numpy.inf
    current_learning_rate = settings['lr']
    best_epoch = 0
    for e in range(0, settings['nepochs']):
        print('Epoch {0}'.format(e))
        print('----------------------------------------------')
        if use_cross_validation:
            ####################
            # validation phase #
            ####################
            print('Validation phase in process')
            shuffle([sentences_list, labels_list], settings['seed'])
            divide_in_folds = lambda lst, sz: [lst[i:i + sz] for i in range(0, len(lst), sz)]
            if len(sentences_list) % settings['fold'] == 0:
                size_of_fold = math.floor(len(sentences_list) / settings['fold'])
            else:
                size_of_fold = math.floor(len(sentences_list) / settings['fold']) + 1
            sentences_in_folds = divide_in_folds(sentences_list, size_of_fold)
            labels_in_folds = divide_in_folds(labels_list, size_of_fold)
            assert len(sentences_in_folds) == settings['fold']
            assert len(sentences_in_folds) == len(labels_in_folds)
            all_validation_accuracies = []
            for j in range(0, len(sentences_in_folds)):
                ex_tr_sent = sentences_in_folds[:]
                ex_tr_labels = labels_in_folds[:]
                val_sent = sentences_in_folds[j]
                val_labels = labels_in_folds[j]
                assert len(val_sent) == len(val_labels)
                ex_tr_sent.pop(j)
                ex_tr_labels.pop(j)
                assert len(ex_tr_sent) == len(ex_tr_labels)
                tr_sent = []
                tr_labels = []
                for c in range(0, len(ex_tr_sent)):
                    tr_sent.extend(ex_tr_sent[c])
                    tr_labels.extend(ex_tr_labels[c])
                assert len(tr_sent) == len(tr_labels)
                train_dict = {'sentences': tr_sent, 'labels': tr_labels}
                validation_dict = {'sentences': val_sent, 'labels': val_labels}
                print('Training the fold number %i will begin now' % (j + 1))
                [current_validation_accuracy, f1, conf_mat] = get_accuracy(
                    rnn, train_dict, validation_dict, word2index, label2index,
                    settings, current_learning_rate, e, index2label, is_validation=True)
                all_validation_accuracies.append(current_validation_accuracy)
            assert len(all_validation_accuracies) == settings['fold']
            mean_validation = sum(all_validation_accuracies) / len(all_validation_accuracies)
            if mean_validation > best_accuracy:
                best_accuracy = mean_validation
                print('New best validation accuracy: %2.2f%%' % best_accuracy)
                rnn.save(model_folder)
                print('A new RNN has been saved.')
            else:
                print('Validation phase did not come up with a better accuracy (only %2.2f%%). '
                      'A new epoch will begin' % mean_validation)
                rnn = rnn.load(model_folder)
                # continue
        ##################
        # Training phase #
        ##################
        else:
            shuffle([train_sentences, train_labels], settings['seed'])
            print('Training in progress')
            # rnn = rnn.load(model_folder)
            # print('RNN saved during the validation phase has been loaded')
            training_dict = {'sentences': train_sentences, 'labels': train_labels}
            testing_dict = {'sentences': test_sentences, 'labels': test_labels}
            [testing_accuracy, f1, conf_mat] = get_accuracy(
                rnn, training_dict, testing_dict, word2index, label2index,
                settings, current_learning_rate, e, index2label, is_validation=False)
            print('Accuracy during the testing phase (number of correctly guessed labels) at %2.2f%%.'
                  % testing_accuracy)
            # check whether the current epoch is the best
            if testing_accuracy > best_accuracy:
                best_accuracy = testing_accuracy
                best_epoch = e
                print('Better testing accuracy!')
        if abs(best_epoch - e) >= 5:
            current_learning_rate *= 0.5
        if current_learning_rate < 1e-5:
            break

    print('BEST RESULT: epoch ', best_epoch, 'with best accuracy: ', best_accuracy, '.')

def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = {
        # 'fold': 3,  # 5 folds: 0, 1, 2, 3, 4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 15,  # number of characters in the context window
        'bs': 5,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 30,  # dimension of character embedding
        'nepochs': 10
    }
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])

    print "Calculate output"
    # limit the scope to speed things up...
    session_files = get_session_files(number_of_files=number_of_files, random_seed=conf['seed'])
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(np.fromiter((char2idx[char] for char in sentence_out), dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(
        train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems())  # reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems())  # reverse the dictionary
    # vocsize = 1 + len(set(reduce(
    #     lambda x, y: list(x) + list(y),
    #     train_lex + valid_lex + test_lex)))
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex)
                          for sublist in lex for item in sublist))
    nclasses = 2  # len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Create a Neural Network"
    rnn = regular_elman(nh=conf['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=conf['emb_dimension'],
                        cs=conf['win'])

    # train with early stopping on the validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world: idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], \
                'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ' : epoch', epoch, 'valid F1', res_valid['f1'], \
                ' test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, \
        'best test F1', conf['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)

def play_with_spelling():
    """Play with spelling mistakes"""
    print CONF
    np.random.seed(CONF['seed'])
    random.seed(CONF['seed'])

    print "Calculate output"
    session_files = get_session_files(number_of_files=CONF['number_of_files'], random_seed=CONF['seed'])
    sentences = get_sentences(session_files)
    print len(sentences)
    labels2idx = char2idx = get_char_to_idx(sentences)

    print "Prepare train, validation and test sets"
    train_valid_sentences, test_sentences = train_test_split(sentences, test_size=0.15,
                                                             random_state=CONF['seed'])
    train_sentences, valid_sentences = train_test_split(train_valid_sentences, test_size=0.2,
                                                        random_state=CONF['seed'])
    print len(train_valid_sentences), len(test_sentences)
    test_lex, test_y = create_tests(test_sentences, CONF['error_probability'], labels2idx, char2idx)
    valid_lex, valid_y = create_tests(valid_sentences, CONF['error_probability'], labels2idx, char2idx)
    train_lex = []
    train_y = []
    for error_probability in (CONF['error_probability'], CONF['error_probability'] / 10,
                              CONF['error_probability'] / 100, 0):
        _train_idxes, _train_labels_idxes = create_tests(train_sentences, error_probability,
                                                         labels2idx, char2idx)
        train_lex.extend(_train_idxes)
        train_y.extend(_train_labels_idxes)
    # train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_idxes, train_valid_labels_idxes,
    #                                                           test_size=0.2, random_state=CONF['seed'])
    print len(train_lex), len(valid_lex), len(train_y), len(valid_y)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems())  # reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems())  # reverse the dictionary
    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
    windowed_test_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in test_lex]
    windowed_valid_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in valid_lex]
    words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
    groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
    words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex)
                          for sublist in lex for item in sublist))
    nclasses = 1 + len(set(item for _y in (train_y, test_y, valid_y)
                           for sublist in _y for item in sublist))
    nsentences = len(train_lex)
    words_lex = []
    for i in xrange(nsentences):
        cwords = contextwin(train_lex[i], CONF['win'])
        words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, CONF['batch_size'])]
        words_lex.append(words)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Create a Neural Network"
    rnn = regular_elman(nh=CONF['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=CONF['emb_dimension'],
                        cs=CONF['win'])

    # train with early stopping on the validation set
    best_f1 = -np.inf
    CONF['current_learning_rate'] = CONF['learning_rate']
    print "Start training"
    start_time = print_time = time.time()
    for epoch in xrange(CONF['nepochs']):
        # shuffle
        shuffle([words_lex, train_y], CONF['seed'])
        CONF['ce'] = epoch
        tic = time.time()
        percentage_of_sentences_to_train = float(epoch + 1) / CONF['nepochs']
        number_of_sentences_to_train = int(nsentences * percentage_of_sentences_to_train)
        print "starting an epoch, number_of_sentences_to_train =", number_of_sentences_to_train
        test_size = int(len(windowed_test_lex) * percentage_of_sentences_to_train)
        print "test_size", test_size
        validation_size = int(len(windowed_valid_lex) * percentage_of_sentences_to_train)
        print "validation_size", validation_size
        for _ in xrange(30):  # Trauma!
            print "_", _
            for i in xrange(number_of_sentences_to_train):
                words = words_lex[i]
                labels = train_y[i]
                for word_batch, label_last_word in zip(words, labels):
                    rnn.train(word_batch, label_last_word, CONF['current_learning_rate'])
                    rnn.normalize()
                if CONF['verbose'] and time.time() - print_time > 30:
                    print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / number_of_sentences_to_train), \
                        'completed in %.2f (sec) <<\r' % (time.time() - tic),
                    print_time = time.time()

        # evaluation // back into the real world: idx -> words
        if CONF['verbose']:
            print "Classify test"
        predictions_test = [[idx2label[x] for x in rnn.classify(windowed_test_lex_item)]
                            for windowed_test_lex_item in windowed_test_lex[:test_size]]
        if CONF['verbose']:
            print "Classify validation"
        predictions_valid = [[idx2label[x] for x in rnn.classify(windowed_valid_lex_item)]
                             for windowed_valid_lex_item in windowed_valid_lex[:validation_size]]

        # evaluation // compute the accuracy using conlleval.pl
        if CONF['verbose']:
            print "Evaluate test and validation"
        res_test = conlleval(predictions_test, groundtruth_test[:test_size],
                             words_test[:test_size], folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid[:validation_size],
                              words_valid[:validation_size], folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], \
                'best test F1', res_test['f1'], ' ' * 20
            CONF['vf1'], CONF['vp'], CONF['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            CONF['tf1'], CONF['tp'], CONF['tr'] = res_test['f1'], res_test['p'], res_test['r']
            CONF['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ' : epoch', epoch, 'valid F1', res_valid['f1'], \
                ' test F1', res_test['f1'], ' ' * 20
            # rnn.load(folder)

        # learning rate decay if no improvement in 10 epochs
        if CONF['decay'] and abs(CONF['be'] - CONF['ce']) >= 10:
            CONF['current_learning_rate'] *= 0.5
        if CONF['current_learning_rate'] < 1e-5:
            break

    print 'BEST RESULT: epoch', CONF['be'], 'valid F1', best_f1, \
        'best test F1', CONF['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)

def prepare_data():
    """Prepare the data"""
    conf = {
        'fold': 3,  # 5 folds: 0, 1, 2, 3, 4
        'lr': 0.0627142536696559,
        'verbose': True,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 300,  # dimension of word embedding
        'nepochs': 50
    }
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    # limit the scope to speed things up...
    session_files = get_session_files(number_of_files=None, random_seed=conf['seed'])
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []

    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]

    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
        # token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(np.fromiter((words2idx.get(token, unknown) for token in token_list),
                                 dtype=np.int32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(
        train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    idx2label = dict((k, v) for v, k in LABELS2IDX.iteritems())  # reverse the dictionary
    idx2word = dict((k, v) for v, k in words2idx.iteritems())  # reverse the dictionary
    vocsize = len(idx2word)
    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME, binary=True)  # C binary format

    print "Calculate word embeddings"
    # add one row for PADDING at the end
    embeddings = 0.2 * np.random.uniform(
        -1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(theano.config.floatX)
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx]  # keep it random
        embeddings[idx] = embedding
    del word2vec  # it is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                    nc=nclasses,
                    ne=vocsize,
                    de=conf['emb_dimension'],
                    cs=conf['win'],
                    embeddings=embeddings)

    # train with early stopping on the validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                # rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world: idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], \
                'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ' : epoch', epoch, 'valid F1', res_valid['f1'], \
                ' test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, \
        'best test F1', conf['tf1'], 'with the model', folder

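# The conlleval(predictions, groundtruth, words, filename) wrapper used
# throughout writes the three parallel sequences into a CoNLL-format file and
# scores it with the conlleval.pl script, returning a dict with 'p', 'r' and
# 'f1'. A minimal sketch of such a wrapper, assuming a get_perf helper (not
# shown here) that runs the Perl script and parses its output:
def conlleval(p, g, w, filename):
    """Write word / gold / predicted triples in CoNLL format and score the
    file with conlleval.pl. Sketch only -- get_perf is assumed to exist in
    the surrounding codebase."""
    out = ''
    for sl, sp, sw in zip(g, p, w):
        out += 'BOS O O\n'
        for wl, wp, ww in zip(sl, sp, sw):
            out += ww + ' ' + wl + ' ' + wp + '\n'
        out += 'EOS O O\n\n'
    with open(filename, 'w') as f:
        f.write(out)
    return get_perf(filename)  # -> {'p': ..., 'r': ..., 'f1': ...}
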
def run(params):
    start_time = time.time()
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    # 100, 90, 80, 70, 60, 50, 0 # combining forward and backward layers
    rhoList = numpy.array([100, 50]).astype(numpy.int32)

    # load the dataset
    eval_options = []
    params['measure'] = 'F1score'
    if params['dataset'] == 'atis':
        train_set, valid_set, test_set, dic = loadData.atisfold(params['fold'])
    if params['dataset'] == 'ner':
        train_set, valid_set, test_set, dic = loadData.ner()
    if params['dataset'] == 'chunk':
        train_set, valid_set, test_set, dic = loadData.chunk()
    if params['dataset'] == 'pos':
        train_set, valid_set, test_set, dic = loadData.pos()
        eval_options = ['-r']
        params['measure'] = 'Accuracy'

    idx2label = dict((k, v) for v, k in dic['labels2idx'].items())
    idx2word = dict((k, v) for v, k in dic['words2idx'].items())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set
    ## :( hack
    # train_lex = train_lex[::100]
    # train_ne = train_ne[::100]
    # train_y = train_y[::100]
    # valid_lex = valid_lex[::100]
    # valid_ne = valid_ne[::100]
    # valid_y = valid_y[::100]
    # test_lex = test_lex[::100]
    # test_ne = test_ne[::100]
    # test_y = test_y[::100]

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    wv = None
    if params['WVFolder'] != 'random':
        if '[' in params['WVFolder'] and ']' in params['WVFolder']:
            folderSet = set(eval(params['WVFolder'].replace('[', '[\'')
                                 .replace(']', '\']').replace(',', '\',\'')))
            print(folderSet)
            wv = numpy.zeros((vocsize + 1, params['WVModel']['emb_dimension'] * len(folderSet)))
            modelIndex = 0
            for folder in folderSet:
                params['WVFile'] = folder + '/' + 'words' + str(params['WVModel']['emb_dimension']) + '.npy'
                params['WVVocabFile'] = folder + '/' + 'words' + str(params['WVModel']['emb_dimension']) + '.vocab'
                # load the word vectors
                wvnp = np.load(params['WVFile'])
                # load the vocabulary
                with open(params['WVVocabFile']) as f:
                    vocab = [line.strip() for line in f if len(line) > 0]
                wi = dict([(a, i) for i, a in enumerate(vocab)])
                random_v = math.sqrt(6.0 / numpy.sum(params['WVModel']['emb_dimension'])) * \
                    numpy.random.uniform(-1.0, 1.0, (params['WVModel']['emb_dimension']))
                miss = 0  # the number of missing words in the pre-trained word embeddings
                for i in range(0, vocsize):
                    word = idx2word[i]
                    if word in wi:
                        wv[i][params['WVModel']['emb_dimension'] * modelIndex:
                              params['WVModel']['emb_dimension'] * (modelIndex + 1)] = wvnp[wi[word]]
                        # print wvnp[wi[word]]
                    else:
                        wv[i][params['WVModel']['emb_dimension'] * modelIndex:
                              params['WVModel']['emb_dimension'] * (modelIndex + 1)] = random_v
                        miss += 1
                print("missing words rate : ", miss, '/', vocsize)
                params['WVModel']['vocab_size'] = len(vocab)
                modelIndex = modelIndex + 1
            params['WVModel']['emb_dimension'] *= len(folderSet)
            # return
        else:
            folder = params['WVFolder']
            params['WVFile'] = folder + '/' + 'words' + str(params['WVModel']['emb_dimension']) + '.npy'
            params['WVVocabFile'] = folder + '/' + 'words' + str(params['WVModel']['emb_dimension']) + '.vocab'
            # load the word vectors
            wvnp = np.load(params['WVFile'])
            params['WVModel']['emb_dimension'] = len(wvnp[0])
            # load the vocabulary
            with open(params['WVVocabFile']) as f:
                vocab = [line.strip() for line in f if len(line) > 0]
            wi = dict([(a, i) for i, a in enumerate(vocab)])
            wv = numpy.zeros((vocsize + 1, params['WVModel']['emb_dimension']))
            random_v = math.sqrt(6.0 / numpy.sum(params['WVModel']['emb_dimension'])) * \
                numpy.random.uniform(-1.0, 1.0, (params['WVModel']['emb_dimension']))
            miss = 0  # the number of missing words in the pre-trained word embeddings
            for i in range(0, vocsize):
                word = idx2word[i]
                if word in wi:
                    wv[i] = wvnp[wi[word]]
                    # print wvnp[wi[word]]
                else:
                    wv[i] = random_v
                    miss += 1
            print("missing words rate : ", miss, '/', vocsize)
            params['WVModel']['vocab_size'] = len(vocab)

    print(json.dumps(params, sort_keys=True, indent=4, separators=(',', ': ')))

    rhoSuffix = "%_forward"
    best_valid = {}
    best_test = {}
    for i_rho in range(len(rhoList)):
        best_valid[str(rhoList[i_rho]) + rhoSuffix] = -numpy.inf
        best_test[str(rhoList[i_rho]) + rhoSuffix] = -numpy.inf
    validMeasureList = {}
    testMeasureList = {}  # this is used for drawing the line chart
    for i_rho in range(len(rhoList)):
        validMeasureList[str(rhoList[i_rho]) + rhoSuffix] = []
        testMeasureList[str(rhoList[i_rho]) + rhoSuffix] = []

    # instantiate the model
    numpy.random.seed(params['seed'])
    random.seed(params['seed'])
    rnn = elman_attention.model(nh=params['nhidden'],
                                nc=nclasses,
                                ne=vocsize,
                                de=params['WVModel']['emb_dimension'],
                                attention=params['attention'],
                                h_win=(params['h_win_left'], params['h_win_right']),
                                lvrg=params['lvrg'],
                                wv=wv)

    # train
    for e in range(params['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], params['seed'])
        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i])
            labels = train_y[i]
            nl, aaL = rnn.train(cwords, labels, params['dropRate'], 1)
            # rnn.normalize()
            if params['verbose']:
                sys.stdout.write(('\r[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / nsentences) +
                                  (' average speed in %.2f (min) <<' %
                                   ((time.time() - tic) / 60 / (i + 1) * nsentences)) +
                                  (' completed in %.2f (sec) <<' % (time.time() - tic))))
                sys.stdout.flush()

        print('start test', time.time() / 60)
        print('start pred train', time.time() / 60)
        predictions_train = [[map(lambda variable: idx2label[variable], w)
                              for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'),
                                                    params['dropRate'], 0, rhoList)]
                             for x in train_lex]
        predictions_test = [[map(lambda variable: idx2label[variable], w)
                             for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'),
                                                   params['dropRate'], 0, rhoList)]
                            for x in test_lex]
        predictions_valid = [[map(lambda variable: idx2label[variable], w)
                              for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'),
                                                    params['dropRate'], 0, rhoList)]
                             for x in valid_lex]

        for i_rho in range(len(rhoList)):
            groundtruth_train = [map(lambda x: idx2label[x], y) for y in train_y]
            words_train = [map(lambda x: idx2word[x], w) for w in train_lex]
            groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
            words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
            groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
            words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

            ptrain = [p[i_rho] for p in predictions_train]
            ptest = [p[i_rho] for p in predictions_test]
            pvalid = [p[i_rho] for p in predictions_valid]

            res_train = conlleval(ptrain, groundtruth_train, words_train,
                                  folder + '/current.train.txt' + str(i_rho) + str(params['seed']),
                                  eval_options)
            res_test = conlleval(ptest, groundtruth_test, words_test,
                                 folder + '/current.test.txt' + str(i_rho) + str(params['seed']),
                                 eval_options)
            res_valid = conlleval(pvalid, groundtruth_valid, words_valid,
                                  folder + '/current.valid.txt' + str(i_rho) + str(params['seed']),
                                  eval_options)

            print(' epoch', e, ' rhoList ', i_rho,
                  ' train p', res_train['p'], 'valid p', res_valid['p'],
                  ' train r', res_train['r'], 'valid r', res_valid['r'],
                  ' train ', params['measure'], res_train['measure'],
                  'valid ', params['measure'], res_valid['measure'],
                  'best test ', params['measure'], res_test['measure'], ' ' * 20)

            validMeasureList[str(rhoList[i_rho]) + rhoSuffix].append(res_valid['measure'])
            testMeasureList[str(rhoList[i_rho]) + rhoSuffix].append(res_test['measure'])
            if res_valid['measure'] > best_valid[str(rhoList[i_rho]) + rhoSuffix]:
                best_valid[str(rhoList[i_rho]) + rhoSuffix] = res_valid['measure']
                best_test[str(rhoList[i_rho]) + rhoSuffix] = res_test['measure']

        for i_rho in range(len(rhoList)):  # this is used for drawing the line chart
            print(i_rho, params['dataset'], end=' ')
            for v in testMeasureList[str(rhoList[i_rho]) + rhoSuffix]:
                print(v, end=' ')
            print('')
        for i_rho in range(len(rhoList)):
            print('current best results', rhoList[i_rho], ' ',
                  best_valid[str(rhoList[i_rho]) + rhoSuffix], '/',
                  best_test[str(rhoList[i_rho]) + rhoSuffix])

    end_time = time.time()
    with open(params['JSONOutputFile'], 'w') as outputFile:
        params['results'] = {}
        params['results']['best_valid_' + params['measure']] = best_valid
        params['results']['best_test_' + params['measure']] = best_test
        params['results']['valid_' + params['measure'] + 'ListBasedOnEpochs'] = validMeasureList
        params['results']['test_' + params['measure'] + 'ListBasedOnEpochs'] = testMeasureList
        params['running_time'] = {}
        params['running_time']['start'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
        params['running_time']['end'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time))
        params['running_time']['duration'] = end_time - start_time
        json.dump(params, outputFile, sort_keys=True, indent=4, separators=(',', ': '))

# instantiate the model
numpy.random.seed(s['seed'])
random.seed(s['seed'])
rnn = model(nh=s['nhidden'],
            nc=nclasses,
            ne=vocsize,
            de=s['emb_dimension'],
            cs=s['win'])

# train with early stopping on the validation set
best_f1 = -numpy.inf
s['clr'] = s['lr']
for e in xrange(s['nepochs']):
    # shuffle
    shuffle([train_lex, train_ne, train_y], s['seed'])
    s['ce'] = e
    tic = time.time()
    for i in xrange(nsentences):
        cwords = contextwin(train_lex[i], s['win'])
        words = map(lambda x: numpy.asarray(x).astype('int32'),
                    minibatch(cwords, s['bs']))
        labels = train_y[i]
        for word_batch, label_last_word in zip(words, labels):
            rnn.train(word_batch, label_last_word, s['clr'])
            rnn.normalize()
        if s['verbose']:
            print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
            sys.stdout.flush()

    # evaluation // back into the real world: idx -> words
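# The loop above relies on contextwin / minibatch / shuffle, which are not
# defined in this listing. The sketch below follows the standard helpers from
# the Theano Elman slot-filling tutorial that this code appears to be based
# on; treat these definitions as an assumption, not the original source.
import random

def contextwin(l, win):
    """Return, for each index in `l`, a window of `win` indices centred on it;
    -1 is the padding index used outside the sentence."""
    assert (win % 2) == 1 and win >= 1
    l = list(l)
    lpadded = win // 2 * [-1] + l + win // 2 * [-1]
    out = [lpadded[i:(i + win)] for i in xrange(len(l))]
    assert len(out) == len(l)
    return out

def minibatch(l, bs):
    """Return cumulative slices of `l`, each at most `bs` windows long, so the
    recurrent net is trained on growing prefixes of the sentence."""
    out = [l[:i] for i in xrange(1, min(bs, len(l) + 1))]
    out += [l[i - bs:i] for i in xrange(bs, len(l) + 1)]
    assert len(l) == len(out)
    return out

def shuffle(lol, seed):
    """Shuffle each list in `lol` in place with the same seed so that parallel
    lists (sentences and their label sequences) stay aligned."""
    for l in lol:
        random.seed(seed)
        random.shuffle(l)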
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = {  # 'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 15,  # number of characters in the context window
        'bs': 5,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 30,  # dimension of character embedding
        'nepochs': 10}
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])

    print "Calculate output"
    # limit the scope to speed things up
    session_files = get_session_files(number_of_files=number_of_files, random_seed=conf['seed'])
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(np.fromiter((char2idx[char] for char in sentence_out), dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(
        train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems())  # reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems())  # reverse the dictionary
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex)
                          for sublist in lex for item in sublist))
    nclasses = 2  # len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Create a Neural Network"
    rnn = regular_elman(nh=conf['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=conf['emb_dimension'],
                        cs=conf['win'])

    # train with early stopping on the validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world: idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid,
                              folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '         : epoch', epoch, 'valid F1', res_valid['f1'], ' test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, 'best test F1', conf['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
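# Hedged usage sketch (not in the original): play_with_splitting_sentences()
# takes no arguments and seeds numpy/random itself, so a bare entry point is
# enough to reproduce a character-level run end to end.
if __name__ == '__main__':
    play_with_splitting_sentences()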
def prepare_data():
    """Prepare the data"""
    conf = {'fold': 3,  # 5 folds 0,1,2,3,4
            'lr': 0.0627142536696559,
            'verbose': True,
            'decay': True,  # decay on the learning rate if improvement stops
            'win': 7,  # number of words in the context window
            'bs': 9,  # number of back-propagation through time steps
            'nhidden': 100,  # number of hidden units
            'seed': 345,
            'emb_dimension': 300,  # dimension of word embedding
            'nepochs': 50}
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    # limit the scope to speed things up
    session_files = get_session_files(number_of_files=None, random_seed=conf['seed'])
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []

    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]

    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
        # token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(np.fromiter((words2idx.get(token, unknown) for token in token_list),
                                 dtype=np.int32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(
        train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    idx2label = dict((k, v) for v, k in LABELS2IDX.iteritems())  # reverse the dictionary
    idx2word = dict((k, v) for v, k in words2idx.iteritems())  # reverse the dictionary
    vocsize = len(idx2word)
    nclasses = len({label for label_seq in labels_idxes for label in label_seq})
    nsentences = len(train_lex)

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME, binary=True)  # C binary format

    print "Calculate word embeddings"
    # add one row for PADDING at the end
    embeddings = 0.2 * np.random.uniform(
        -1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(theano.config.floatX)
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx]  # keep the random initialisation
        embeddings[idx] = embedding
    del word2vec  # it is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                    nc=nclasses,
                    ne=vocsize,
                    de=conf['emb_dimension'],
                    cs=conf['win'],
                    embeddings=embeddings)

    # train with early stopping on the validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                # rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world: idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid,
                              folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '         : epoch', epoch, 'valid F1', res_valid['f1'], ' test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    # report the best epoch's scores (the original printed the last epoch's
    # res_valid/res_test here, which need not be the best ones)
    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, 'best test F1', conf['tf1'], 'with the model', folder
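# Hedged helper sketch (not in the original): the embedding-initialisation
# fallback used in prepare_data(), factored out for reuse. `model` is any
# gensim-style mapping from token to vector; a token missing in both its raw
# and capitalised forms keeps the random vector it already has, which is
# exactly what the inline loop above does.
def lookup_embedding(model, word, fallback):
    """Return model[word], trying word.capitalize() before falling back."""
    try:
        return model[word]
    except KeyError:
        try:
            return model[word.capitalize()]
        except KeyError:
            return fallback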