Exemplo n.º 1
0
def _split_into_folds(items, n_folds):
    """Split ``items`` into exactly ``n_folds`` contiguous, near-equal chunks.

    The first ``len(items) % n_folds`` chunks receive one extra element, so
    chunk sizes differ by at most one and no chunk is empty.

    Raises:
        ValueError: if ``n_folds`` is not positive or there are fewer items
            than folds.
    """
    if n_folds <= 0:
        raise ValueError('n_folds must be positive, got %r' % (n_folds,))
    base, extra = divmod(len(items), n_folds)
    if base == 0:
        raise ValueError('Cannot split %d items into %d non-empty folds'
                         % (len(items), n_folds))
    folds = []
    start = 0
    for k in range(n_folds):
        stop = start + base + (1 if k < extra else 0)
        folds.append(items[start:stop])
        start = stop
    return folds


def run_on_ollie_dataset(iob_ollie_dataset_path,use_cross_validation):
    """Train an RNN on the labeled data at ``iob_ollie_dataset_path``.

    The function builds the word/label indices, creates a network in a
    time-stamped folder, and then runs up to ``settings['nepochs']`` epochs.

    * Without cross-validation the data is shuffled once and cut into a
      leading training slice and a following testing slice; every epoch is
      scored on the testing slice and the network is saved whenever the
      score improves (otherwise the last saved network is reloaded).
    * With cross-validation every epoch scores each of ``settings['fold']``
      folds against the remaining folds and compares the mean accuracy
      with the best one seen so far.

    The best accuracy together with the F1 value and confusion matrix that
    accompanied it are pickled to ``perf.pck`` in the working directory.

    Args:
        iob_ollie_dataset_path: dataset location, passed through to
            ``create_word2ind`` and ``get_labeled_data``.
        use_cross_validation: truthy to use k-fold cross-validation, falsy
            for the fixed train/test split.

    Raises:
        ValueError: in cross-validation mode, if there are fewer non-empty
            sentences than folds.
    """
    settings = {'partial_training': 0.8,
                'partial_testing': 0.2,
                'fold': 10,  # number of cross-validation folds
                'lr': 0.05,
                'verbose': 1,
                'decay': False,  # NOTE(review): not consulted below; decay is always applied
                'win': 7,  # number of words in the context window
                'bs': 9,  # number of backprop through time steps
                'nhidden': 100,  # number of hidden units
                'seed': 345,
                'emb_dimension': 100,  # dimension of word embedding
                'nepochs': 50}

    indices = create_word2ind(iob_ollie_dataset_path)
    words_index = indices['wordIndex']
    labels_index = indices['labelIndex']
    word2index = words_index.getCurrentIndex()
    index2word = words_index.getIndex2Word()
    label2index = labels_index.getCurrentIndex()
    index2label = labels_index.getIndex2Word()

    vocsize = len(word2index)
    nclasses = len(label2index)
    new_network_folder = datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M')
    rnn, model_folder = create_network(settings, nclasses, vocsize, new_network_folder)
    print('RNN model created and saved under %s' % model_folder)

    labeled_data, labeled_data_size = get_labeled_data(iob_ollie_dataset_path)
    print('Labeled data size for articles: ',labeled_data_size)
    sentences_list, labels_list = labeled_data.getData()
    # Drop empty sentences together with their label rows so both lists stay aligned.
    while [] in sentences_list:
        print('Empty sentences were found. They will be removed')
        empty = sentences_list.index([])
        sentences_list.pop(empty)
        labels_list.pop(empty)
    assert len(sentences_list) == len(labels_list)
    number_labeled_sentences = len(sentences_list)

    print('The training phase of the RNN model on the Ollie dataset will begin now')
    rnn = rnn.load(model_folder)

    if not use_cross_validation:
        print('No cross-validation techniques will be used in this training process')
        shuffle([sentences_list, labels_list], settings['seed'])
        training_size = int(math.floor(settings['partial_training'] * number_labeled_sentences))
        testing_size = int(math.floor(settings['partial_testing'] * number_labeled_sentences))
        testing_end = training_size + testing_size
        print('Training size: [0:{0}] = {0}'.format(training_size))
        train_sentences = sentences_list[0:training_size]
        train_labels = labels_list[0:training_size]
        print('Testing size: [{0}:{1}] = {2}'.format(training_size, training_size + testing_size, testing_size))
        test_sentences = sentences_list[training_size:testing_end]
        test_labels = labels_list[training_size:testing_end]
    else:
        print('Cross validation will be used')

    best_accuracy = -numpy.inf
    current_learning_rate = settings['lr']
    best_epoch = 0

    f1_of_best_acc = 0
    conf_mat_of_best_acc = None

    for e in range(0, settings['nepochs']):
        print('Epoch {0}'.format(e))
        print('----------------------------------------------')

        if use_cross_validation:
            print('Validation phase in process')
            shuffle([sentences_list, labels_list], settings['seed'])
            n_folds = settings['fold']
            # Balanced split: always yields exactly n_folds chunks, unlike a
            # ceil-sized chunking which can come up short (e.g. 21 items / 10).
            sentences_in_folds = _split_into_folds(sentences_list, n_folds)
            labels_in_folds = _split_into_folds(labels_list, n_folds)

            all_validation_accuracies = []
            for j in range(n_folds):
                val_sent = sentences_in_folds[j]
                val_labels = labels_in_folds[j]
                assert len(val_sent) == len(val_labels)

                # Everything except fold j is the training material.
                tr_sent = []
                tr_labels = []
                for k in range(n_folds):
                    if k != j:
                        tr_sent.extend(sentences_in_folds[k])
                        tr_labels.extend(labels_in_folds[k])
                assert len(tr_sent) == len(tr_labels)

                train_dict = {'sentences': tr_sent, 'labels': tr_labels}
                validation_dict = {'sentences': val_sent, 'labels': val_labels}

                print('Training the fold number %i will begin now' % (j+1))
                current_validation_accuracy, f1, conf_mat = get_accuracy(
                    rnn, train_dict, validation_dict, word2index, label2index, settings,
                    current_learning_rate, e, index2word, is_validation=True)

                all_validation_accuracies.append(current_validation_accuracy)

            mean_validation = sum(all_validation_accuracies) / len(all_validation_accuracies)
            if mean_validation > best_accuracy:
                best_accuracy = mean_validation
                # Track the epoch so the decay check and the final report
                # refer to the real best epoch instead of staying at 0.
                best_epoch = e
                # NOTE(review): f1 / conf_mat come from the last fold only,
                # not from an aggregate over all folds.
                f1_of_best_acc = f1
                conf_mat_of_best_acc = conf_mat
                print('New best validation accuracy: %2.2f%%' % best_accuracy)
                # NOTE(review): no rnn.save() call is made in this mode, so
                # the message below does not match what actually happens.
                print('A new RNN has been saved.')
            else:
                print('Validation phase did not come up with a better accuracy (only %2.2f%%).'
                      '. A new epoch will begin' % mean_validation)
        else:
            shuffle([train_sentences, train_labels], settings['seed'])
            print('Training in progress')
            training_dict = {'sentences': train_sentences, 'labels': train_labels}
            testing_dict = {'sentences': test_sentences, 'labels': test_labels}
            testing_accuracy, f1, conf_mat = get_accuracy(
                rnn, training_dict, testing_dict, word2index, label2index, settings,
                current_learning_rate, e, index2word, is_validation=False)

            print('Accuracy during the testing phase (number of correct guessed labels) at %2.2f%%.' % testing_accuracy)

            if testing_accuracy > best_accuracy:
                best_accuracy = testing_accuracy
                best_epoch = e
                f1_of_best_acc = f1
                conf_mat_of_best_acc = conf_mat
                rnn.save(model_folder)
                print('Better testing accuracy !!')
            else:
                # Roll back to the last saved (best) network.
                rnn = rnn.load(model_folder)

        # Halve the learning rate once five epochs have passed without a new best.
        if abs(best_epoch - e) >= 5:
            current_learning_rate *= 0.5

        if current_learning_rate < 1e-5:
            break

    print('BEST RESULT: epoch ', best_epoch, 'with best accuracy: ', best_accuracy, '.',)
    # Context manager guarantees the results file is flushed and closed.
    with open('perf.pck', 'wb') as perf_file:
        pickle.dump([best_accuracy, f1_of_best_acc, conf_mat_of_best_acc], perf_file)
Exemplo n.º 2
0
def trainingSession(model, best_f1):
    """Run ``epochs_num`` training epochs and return the best validation F1.

    Each epoch shuffles the training lists, feeds every sentence through
    ``train_op`` in the module-level session, evaluates the model, and, when
    the validation F1 beats ``best_f1``, promotes the ``current.*.txt``
    result files in ``folder`` to ``best.*.txt``.

    Relies on module-level names (``sess``, ``train_lex``, ``train_y``,
    ``nsentences``, ``epochs_num``, ``folder``, ...) that are defined
    outside this function.

    Args:
        model: passed through unchanged to ``evaluate``.
        best_f1: validation F1 an epoch has to exceed to count as a new best.

    Returns:
        The highest validation F1 seen, or the incoming ``best_f1`` if no
        epoch exceeded it.
    """
    # Pre-bind the values reported at the end so the summary cannot fail
    # when no epoch beats the incoming best_f1 (or when epochs_num is 0).
    best_epoch = None
    vf1 = tf1 = None

    for epoch in range(epochs_num):
        tools.shuffle([train_lex, train_ne, train_y], seed)
        tic = time.time()
        # Tracks the smallest per-sentence loss observed during this epoch.
        train_loss = np.inf

        for i in range(nsentences):
            labels = train_y[i]
            train_x = np.array([train_lex[i]])

            temp_loss, _ = sess.run([total_loss, train_op],
                                    feed_dict={
                                        x_input: train_x,
                                        y_labels: labels
                                    })

            if train_loss > temp_loss:
                train_loss = temp_loss
            if verbose:
                print(
                    "[Learning] Epoch %i >> %2.2f%%" %
                    (epoch + 1, (i + 1) * 100. / nsentences),
                    "completed in %.2f (sec) <<\r" % (time.time() - tic))
        print("[Learning] Epoch %i Loss %2.2f" %
              (epoch + 1, train_loss * 100000))

        # Evaluation: map indices back to words and score both splits.
        print("Evaluating...")
        # NOTE(review): these assignments only bind a local name that is
        # never read here; if a module-level flag was intended, a ``global``
        # declaration is missing -- confirm against the model code.
        istraining = False
        res_test, res_valid = evaluate(sess, model)
        istraining = True

        if res_valid['f1'] > best_f1:
            best_f1 = res_valid['f1']
            print("NEW BEST: epoch", epoch + 1, "valid F1",
                  res_valid['f1'], "best test F1", res_test['f1'],
                  " " * 20)

            vf1 = res_valid['f1']
            tf1 = res_test['f1']
            best_epoch = epoch
            # Promote this epoch's result files to "best".
            subprocess.call(
                ['mv', folder + "current.test.txt", folder + "best.test.txt"])
            subprocess.call([
                'mv', folder + "current.valid.txt", folder + "best.valid.txt"
            ])
        else:
            print()

    if best_epoch is None:
        print('NO IMPROVEMENT: validation F1 never exceeded', best_f1,
              'with the model', folder)
    else:
        print('BEST RESULT: epoch', best_epoch + 1, 'valid F1', vf1,
              'best test F1', tf1, 'with the model', folder)
    return best_f1
Exemplo n.º 3
0
                              init=initialize,
                              featdim=14)
    else:
        print "Invalid RNN type: ", rnn_type
        sys.exit(-1)

    # create a folder for store the models
    if not os.path.exists(model_folder): os.mkdir(model_folder)

    # train with early stopping on validation set
    best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
    s['clr'] = s['lr']  # learning rate

    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_y, train_feat], s['seed'])
        s['ce'] = e
        tic = time.time()

        for i in xrange(num_sentences):
            context_words = contextwin(
                train_lex[i], s['win']
            )  #list of list of indexes corresponding to context windows surrounding each word in the sentence
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(context_words, s['bs']))
            features = minibatch(train_feat[i], s['bs'])

            labels = train_y[i]

            for word_batch, feature_batch, label_last_word in zip(
                    words, features, labels):
Exemplo n.º 4
0
def _label_predictions(rnn, idx2label, model_options, sents_src, sents_tgt,
                       sents_char):
    """Classify every sentence of one data split and return label names.

    Which inputs are handed to ``rnn.classify`` depends on the
    ``use_bilingual`` / ``use_char`` options: source windows, target
    windows and padded characters are passed only when the matching option
    is enabled.

    Returns:
        One list of label names per sentence.
    """
    win = model_options['win']
    max_char = model_options['max_char']

    def windows(sentence):
        return numpy.asarray(contextwin(sentence, win)).astype('int32')

    def padded(chars):
        return numpy.asarray(add_padding(chars, max_char)).astype('int32')

    if model_options['use_bilingual'] and model_options['use_char']:
        outputs = [rnn.classify(windows(src), windows(tgt), padded(chars))
                   for src, tgt, chars in zip(sents_src, sents_tgt, sents_char)]
    elif model_options['use_bilingual']:
        outputs = [rnn.classify(windows(src), windows(tgt))
                   for src, tgt in zip(sents_src, sents_tgt)]
    elif model_options['use_char']:
        outputs = [rnn.classify(windows(tgt), padded(chars))
                   for tgt, chars in zip(sents_tgt, sents_char)]
    else:
        outputs = [rnn.classify(windows(tgt)) for tgt in sents_tgt]

    return [[idx2label[idx] for idx in output] for output in outputs]


def train(dim_word=100,  # word vector dimensionality
          dim_char=10,  # passed to the model as ``de_char``
          max_char=10,  # passed to the model and to ``add_padding``
          dim=100,  # number of hidden units (``nh``)
          win=5,  # context window size
          bs=5,  # number of backprop through time steps
          seed=123,
          verbose=1,
          use_model='GRU',  # choose the model from LSTM, DEEPLSTM, RNN, ...
          patience=10,  # NOTE(review): overwritten with 2 inside the body
          max_epochs=50,
          lrate=0.0005,  # learning rate
          maxlen=100,  # maximum length of the description
          data_train=['data/qe/train/train.src.lc',
                      'data/qe/train/train.mt.lc',
                      'data/qe/train/train.align'],
          data_train_y='data/qe/train/train.tags',
          data_valid=['data/qe/dev/dev.src.lc',
                      'data/qe/dev/dev.mt.lc',
                      'data/qe/dev/dev.align'],
          data_valid_y='data/qe/dev/dev.tags',
          data_test=['data/qe/test/test.src.lc',
                     'data/qe/test/test.mt.lc',
                     'data/qe/test/test.align'],
          data_test_y='data/qe/test/test.tags',
          dictionaries=['data/qe/train/train.src.lc.json',
                        'data/qe/train/train.mt.lc.json'],
          character2index=['data/qe/train/train.src.lc.dict_char.json',
                           'data/qe/train/train.mt.lc.dict_char.json'],
          label2index='data/qe/train/train.tags.json',
          embeddings=['data/qe/pretrain/ep_qe.en.vector.txt',
                      'data/qe/pretrain/ep_qe.de.vector.txt'],
          use_adadelta=False,
          use_bilingual=False,
          use_pretrain=False,
          use_quest=False,
          use_tag=False,
          use_char=False,
          saveto=False,
          shuffle_each_epoch=True,
          load_data=None,
          ):
    """Train a sequence-labelling model, keeping the best-scoring snapshot.

    The data is either unpickled from ``load_data[0]`` or produced by
    ``preprocess_data``. Training runs in batches of sentences; every
    ``model_options['patience']`` batches the test and validation splits
    are classified and scored with ``wmt_eval`` (``use_quest``) and/or
    ``icon_eval`` (``use_tag``). When the validation score improves, the
    model, its options (as JSON) and the hypothesis files are stored under
    the output folder. Training stops once ten epochs pass without a new
    best.

    NOTE(review): ``use_model``, ``data_*_y`` and ``label2index`` are
    indexed with ``[0]``, so the body expects one-element sequences rather
    than the plain-string defaults -- presumably the values come from a
    command-line parser; confirm against the caller.

    Raises:
        ValueError: if neither ``use_tag`` nor ``use_quest`` is enabled, as
            no output folder can be derived in that case.
    """
    # Must stay the first statement: it snapshots the call arguments only.
    model_options = OrderedDict(sorted(locals().copy().items()))
    print('Model_Options: %s' % (model_options,))

    model_name = model_options['use_model'][0]
    for flag, suffix in (('use_adadelta', '_adadelta'),
                         ('use_char', '_char'),
                         ('use_bilingual', '_bilingual'),
                         ('use_pretrain', '_pretrain')):
        if model_options[flag]:
            model_name += suffix

    print('Using model: %s' % (model_name,))

    if load_data:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with gzip.open(load_data[0], 'rb') as fp:
            processed_data = cPickle.load(fp)
    else:
        processed_data = preprocess_data(
            data_train=model_options['data_train'],
            data_train_y=model_options['data_train_y'][0],
            data_valid=model_options['data_valid'],
            data_valid_y=model_options['data_valid_y'][0],
            data_test=model_options['data_test'],
            data_test_y=model_options['data_test_y'][0],
            dictionaries=model_options['dictionaries'],
            character2index=model_options['character2index'],
            label2index=model_options['label2index'][0],
            embeddings=model_options['embeddings'],
            use_bilingual=model_options['use_bilingual'],
            use_char=model_options['use_char'],
            use_pretrain=model_options['use_pretrain'])

    # Name the saved data file and the output folder after the model.
    save_data = folder = ''
    if use_tag:
        save_data = 'tag.data_' + model_name + '.pkl.gz'
        folder = 'tag.' + model_name
    if use_quest:
        save_data = 'quest.data_' + model_name + '.pkl.gz'
        folder = 'quest.' + model_name
    if not folder:
        # Previously this surfaced as an obscure failure on an empty path.
        raise ValueError('Either use_tag or use_quest must be enabled to '
                         'determine the output folder')

    if saveto:
        with gzip.open(save_data, 'wb') as fp:
            cPickle.dump(processed_data, fp)
    if not os.path.exists(folder):
        os.mkdir(folder)

    (train_data, train_y, test_data, test_y, valid_data, valid_y,
     w2idxs, char2idxs, label2idxs, embs) = processed_data
    idx2label = dict((idx, label) for label, idx in label2idxs.items())

    # Defaults for the inputs that a given configuration does not provide.
    vocsize_s = vocsize_tchar = 0
    emb_s = []
    train_s, train_tchar = [], []
    test_s, test_tchar = [], []
    valid_s, valid_tchar = [], []

    if (use_bilingual or len(train_data) == 4) and use_char:
        emb_s, emb_t = embs
        train_s, train_t, train_schar, train_tchar = train_data
        test_s, test_t, test_schar, test_tchar = test_data
        valid_s, valid_t, valid_schar, valid_tchar = valid_data
        vocsize_s = len(w2idxs[0])
        vocsize_t = len(w2idxs[1])
        vocsize_tchar = len(char2idxs[1])
    elif use_char:
        emb_t = embs[0]
        train_t, train_tchar = train_data
        test_t, test_tchar = test_data
        valid_t, valid_tchar = valid_data
        vocsize_t = len(w2idxs[0])
        vocsize_tchar = len(char2idxs[0])
    elif use_bilingual or len(train_data) == 2:
        emb_s, emb_t = embs
        train_s, train_t = train_data
        test_s, test_t = test_data
        valid_s, valid_t = valid_data
        vocsize_s = len(w2idxs[0])
        vocsize_t = len(w2idxs[1])
    else:
        emb_t = embs[0]
        train_t = train_data[0]
        test_t = test_data[0]
        valid_t = valid_data[0]
        vocsize_t = len(w2idxs[0])

    nclasses = len(label2idxs)
    nsentences = len(train_t)

    numpy.random.seed(model_options['seed'])
    # instantiate the model
    rnn = select_model[model_name](nh=model_options['dim'],
                                   nc=nclasses,
                                   de=model_options['dim_word'],
                                   cs=model_options['win'],
                                   de_char=model_options['dim_char'],
                                   ne_char=vocsize_tchar,
                                   ne_src=vocsize_s,
                                   ne_tgt=vocsize_t,
                                   emb_src=emb_s,
                                   emb_tgt=emb_t,
                                   max_char=model_options['max_char'])

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    model_options['patience'] = 2
    # Initialised up front so the early-stopping check and the final report
    # work even if no evaluation ever ran.
    model_options['be'] = 0
    batch_size = (nsentences // 100) * 10
    if batch_size == 0:
        # Fewer than 100 sentences: fall back to roughly ten batches instead
        # of dividing by zero below.
        batch_size = max(1, nsentences // 10)
    n_batches = nsentences // batch_size
    print(n_batches)

    for e in range(model_options['max_epochs']):
        model_options['ce'] = e
        if shuffle_each_epoch:
            shuffle([train_t, train_s, train_tchar, train_y],
                    model_options['seed'])

        tic = time.time()
        for k in range(n_batches):
            lo, hi = k * batch_size, (k + 1) * batch_size
            batch_train_s = []
            batch_train_char = []
            if model_options['use_bilingual']:
                batch_train_s = train_s[lo:hi]
            if model_options['use_char']:
                batch_train_char = train_tchar[lo:hi]
            batch_train_t = train_t[lo:hi]
            batch_train_y = train_y[lo:hi]

            for i in range(batch_size):
                cwords_src = []
                padded_chars = []
                if model_options['use_bilingual']:
                    cwords_src = contextwin(batch_train_s[i],
                                            model_options['win'])
                if model_options['use_char']:
                    padded_chars = add_padding(batch_train_char[i],
                                               model_options['max_char'])

                cwords_tgt = contextwin(batch_train_t[i], model_options['win'])
                labels = batch_train_y[i]

                if model_options['use_bilingual'] and model_options['use_char']:
                    rnn.train_grad_shared(cwords_src, cwords_tgt, padded_chars,
                                          labels, model_options['lrate'])
                elif model_options['use_char']:
                    rnn.train_grad_shared(cwords_tgt, padded_chars, labels,
                                          model_options['lrate'])
                elif model_options['use_bilingual']:
                    rnn.train_grad_shared(cwords_src, cwords_tgt, labels,
                                          model_options['lrate'])
                elif model_options['use_adadelta']:
                    rnn.train_grad_shared(cwords_tgt, labels,
                                          model_options['lrate'])
                else:
                    rnn.train(cwords_tgt, labels, model_options['lrate'])

                if model_options['use_adadelta']:
                    rnn.train_update(model_options['lrate'])

                rnn.normalize()

                if model_options['verbose']:
                    # Progress line ends in '\r' and has no newline, so the
                    # next update overwrites it.
                    sys.stdout.write(
                        '[learning] epoch %i batch %i >> %2.2f%%'
                        % (e, k, (i + 1) * 100. / batch_size)
                        + ' '
                        + 'completed in %.2f (sec) <<\r' % (time.time() - tic))
                    sys.stdout.flush()

            if k % model_options['patience'] == 0:
                # evaluation // back into the real world : idx -> labels
                predictions_test = _label_predictions(
                    rnn, idx2label, model_options, test_s, test_t, test_tchar)
                groundtruth_test = [[idx2label[x] for x in y] for y in test_y]

                predictions_valid = _label_predictions(
                    rnn, idx2label, model_options,
                    valid_s, valid_t, valid_tchar)
                groundtruth_valid = [[idx2label[x] for x in y] for y in valid_y]

                # evaluation // score the predictions with the task's scorer
                res_test = []
                res_valid = []
                current_score = 0
                if model_options['use_quest']:
                    res_test = wmt_eval(predictions_test, groundtruth_test,
                                        folder + '/current.test.txt')
                    res_valid = wmt_eval(predictions_valid, groundtruth_valid,
                                         folder + '/current.valid.txt')
                    current_score = res_valid[2][0]
                if model_options['use_tag']:
                    res_test = icon_eval(predictions_test, groundtruth_test,
                                         folder + '/current.test.txt')
                    res_valid = icon_eval(predictions_valid, groundtruth_valid,
                                          folder + '/current.valid.txt')
                    current_score = res_valid[1]

                if current_score > best_f1:
                    # Save the model and the options it was trained with.
                    rnn.save(folder)
                    filename = folder + '/model'
                    with open('%s.json' % filename, 'w') as f:
                        json.dump(model_options, f, indent=2)

                    best_f1 = current_score
                    if model_options['verbose']:
                        print('NEW BEST: epoch %s valid F1 %s test F1 %s %s'
                              % (e, res_valid, res_test, ' ' * 20))
                    model_options['be'] = e
                    subprocess.call(['mv', folder + '/current.test.txt.hyp',
                                     folder + '/best.test.txt'])
                    subprocess.call(['mv', folder + '/current.valid.txt.hyp',
                                     folder + '/best.valid.txt'])
                else:
                    print('')

        # Break if no improvement in 10 epochs
        if abs(model_options['be'] - model_options['ce']) >= 10:
            break

    print('BEST RESULT: epoch %s valid F1 %s with the model %s'
          % (model_options['be'], best_f1, folder))
Exemplo n.º 5
0
                           em=dataset["embeddings"],
                           init=initialize)
    else:
        print "Invalid RNN type: ", rnn_type
        sys.exit(-1)

    # create a folder for store the models
    if not os.path.exists(model_folder): os.mkdir(model_folder)

    # train with early stopping on validation set
    best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
    s['clr'] = s['lr'] # learning rate

    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_y, train_feat], s['seed'])
        s['ce'] = e
        tic = time.time()

        for i in xrange(num_sentences):
            context_words = contextwin(train_lex[i], s['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'), minibatch(context_words, s['bs']))
            features = minibatch(train_feat[i], s['bs'])

            labels   = train_y[i]

            for word_batch, feature_batch, label_last_word in zip(words, features, labels):
                rnn.train(word_batch, feature_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
Exemplo n.º 6
0
    # instanciate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model(nh=s['nhidden'],
                nc=nclasses,
                ne=vocsize,
                de=s['emb_dimension'],
                cs=s['win'])

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in range(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words  = map(lambda x: numpy.asarray(x).astype('int32'),\
                         minibatch(cwords, s['bs']))
            labels = train_y[i]

            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print(
                    '[learning] epoch {} >> {:2.2f}, completed in {:.2f} (sec) '
Exemplo n.º 7
0
def main():
    """Train an Elman RNN on one ATIS fold with early stopping on validation F1.

    The output folder is named after this script. Each epoch trains on every
    sentence, classifies the test and validation splits, scores them with
    ``conlleval`` and, when the validation F1 improves, saves the model and
    promotes the ``current.*.txt`` result files to ``best.*.txt``.
    """
    settings = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    folder = os.path.basename(__file__).split('.')[0]

    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(settings['fold'])
    idx2label = dict((idx, label) for label, idx in dic['labels2idx'].items())
    idx2word = dict((idx, word) for word, idx in dic['words2idx'].items())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(settings['seed'])
    random.seed(settings['seed'])

    if LOAD:
        print("Loading model from %s..." % folder)

        rnn = ElmanRNNModel.load(folder)
    else:
        rnn = ElmanRNNModel(
            hidden_dims=settings['nhidden'],
            num_classes=nclasses,
            vocab_size=vocsize,
            embed_dims=settings['emb_dimension'],
            context_size=settings['win']
        )

    def predict_labels(sentences):
        # Classify each sentence and map the class ids back to label names.
        return [
            [idx2label[idx] for idx in rnn.classify(
                numpy.asarray(contextwin(x, settings['win'])).astype('int32'))]
            for x in sentences
        ]

    # Ground truth and surface words do not depend on the model, so they
    # are built once instead of on every epoch.
    groundtruth_test = [[idx2label[x] for x in y] for y in test_y]
    words_test = [[idx2word[x] for x in w] for w in test_lex]
    groundtruth_valid = [[idx2label[x] for x in y] for y in valid_y]
    words_valid = [[idx2word[x] for x in w] for w in valid_lex]

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    settings['current_lr'] = settings['lr']
    # Initialised so the decay check and the final report cannot hit a
    # missing key when no epoch has improved yet.
    settings['be'] = 0
    for e in range(settings['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], settings['seed'])
        settings['current_epoch'] = e
        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i], settings['win'])

            words = [numpy.asarray(x).astype('int32')
                     for x in minibatch(cwords, settings['bs'])]

            labels = train_y[i]

            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, settings['current_lr'])
                rnn.normalize()

            if settings['verbose']:
                # Progress line ends in '\r' and has no newline, so the next
                # update overwrites it.
                sys.stdout.write(
                    '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / nsentences)
                    + ' '
                    + 'completed in %.2f (sec) <<\r' % (time.time() - tic))
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = predict_labels(test_lex)
        predictions_valid = predict_labels(valid_lex)

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if settings['verbose']:
                print('NEW BEST: epoch %s valid F1 %s best test F1 %s %s'
                      % (e, res_valid['f1'], res_test['f1'], ' ' * 20))
            settings['vf1'], settings['vp'], settings['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            settings['tf1'], settings['tp'], settings['tr'] = res_test['f1'],  res_test['p'],  res_test['r']
            settings['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print('')

        # learning rate decay if no improvement in 10 epochs
        if settings['decay'] and abs(settings['be'] - settings['current_epoch']) >= 10:
            settings['current_lr'] *= 0.5

        if settings['current_lr'] < 1e-5:
            break

    # Report the epoch that produced the best validation F1, not the index
    # of the last epoch that happened to run.
    print('BEST RESULT: epoch %s valid F1 %s best test F1 %s with the model %s'
          % (settings['be'], settings.get('vf1'), settings.get('tf1'), folder))
def _split_into_chunks(items, chunk_size):
    """Cut ``items`` into consecutive slices of ``chunk_size`` elements.

    The last slice is shorter when ``len(items)`` is not a multiple of
    ``chunk_size``.
    """
    return [
        items[start:start + chunk_size]
        for start in range(0, len(items), chunk_size)
    ]


def run_process(articles, use_cross_validation):
    """Create an RNN for ``articles`` and run its epoch loop.

    Word and label indices are built with ``create_word2ind``, a network is
    created in a time-stamped folder with ``create_network``, and the labeled
    sentences returned by ``get_labeled_data`` (minus empty ones) are then
    used in one of two modes:

    * ``use_cross_validation`` falsy: the shuffled data is cut into a leading
      ``partial_training`` share and a following ``partial_testing`` share;
      every epoch passes both to ``get_accuracy`` and remembers the epoch
      with the highest returned accuracy.
    * ``use_cross_validation`` truthy: every epoch splits all sentences into
      ``settings['fold']`` folds, calls ``get_accuracy`` once per fold with
      that fold held out, and saves the network whenever the mean accuracy
      improves (otherwise the last saved network is reloaded).

    Args:
        articles: forwarded unchanged to ``create_word2ind`` and
            ``get_labeled_data``.
        use_cross_validation: selects the mode described above.

    Returns:
        None. Progress and the best result are printed.
    """
    settings = {
        'partial_training': 0.8,
        'partial_testing': 0.2,
        'fold': 10,  # number of cross-validation folds
        'lr': 0.05,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    indices = create_word2ind(articles)
    word_index = indices['wordIndex']
    label_index = indices['labelIndex']
    word2index = word_index.getCurrentIndex()
    label2index = label_index.getCurrentIndex()
    index2label = label_index.getIndex2Word()
    vocsize = len(word2index)
    nclasses = len(label2index)

    new_network_folder = datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M')
    rnn, model_folder = create_network(settings, nclasses, vocsize,
                                       new_network_folder)
    print('RNN model created and saved under %s' % model_folder)

    # Load the labeled data once and read both parts from the same result.
    labeled = get_labeled_data(articles)
    labeled_data = labeled[0]
    labeled_data_size_for_each_article = labeled[1]
    print('Labeled data sizes for articles: ',
          labeled_data_size_for_each_article)
    sentences_list, labels_list = labeled_data.getData()
    # Drop empty sentences in place, together with their label rows.
    while [] in sentences_list:
        print('Empty sentences found. They will be removed')
        empty = sentences_list.index([])
        sentences_list.pop(empty)
        labels_list.pop(empty)
    assert len(sentences_list) == len(labels_list)
    number_labeled_sentences = len(sentences_list)

    print('Training for ', articles, ' will begin now')
    rnn = rnn.load(model_folder)

    ############################################################
    # training and testing according to parameters in settings #
    ############################################################
    # With cross-validation every labeled sentence takes part in the folds,
    # so there is no separate hold-out split. Defining the names here keeps
    # that mode from failing on unassigned locals further down.
    train_sentences, train_labels = sentences_list, labels_list
    test_sentences, test_labels = [], []
    if not use_cross_validation:
        print(
            'No cross-validation techniques will be used in this training process'
        )
        shuffle([sentences_list, labels_list], settings['seed'])
        training_size = int(
            math.floor(settings['partial_training'] *
                       number_labeled_sentences))
        testing_size = int(
            math.floor(settings['partial_testing'] * number_labeled_sentences))
        print('Training size: [0:{0}] = {0}'.format(training_size))
        train_sentences = sentences_list[0:training_size]
        train_labels = labels_list[0:training_size]
        print('Testing size: [{0}:{1}] = {2}'.format(
            training_size, training_size + testing_size, testing_size))
        test_sentences = sentences_list[training_size:training_size +
                                        testing_size]
        test_labels = labels_list[training_size:training_size + testing_size]

    ####################
    # training process #
    ####################
    number_train_sentences = len(train_sentences)
    print(
        'Starting training with {0} labeled sentences in total for {1} epochs.'
        .format(number_train_sentences, settings['nepochs']))

    best_accuracy = -numpy.inf
    current_learning_rate = settings['lr']
    best_epoch = 0
    for e in range(0, settings['nepochs']):
        print('Epoch {0}'.format(e))
        print('----------------------------------------------')
        if use_cross_validation:
            ####################
            # validation phase #
            ####################
            print('Validation phase in process')
            shuffle([sentences_list, labels_list], settings['seed'])
            fold_count = settings['fold']
            # Integer ceiling division: the fold size stays an int and the
            # last fold simply ends up shorter when the split is uneven.
            size_of_fold = (len(sentences_list) + fold_count - 1) // fold_count
            sentences_in_folds = _split_into_chunks(sentences_list,
                                                    size_of_fold)
            labels_in_folds = _split_into_chunks(labels_list, size_of_fold)
            assert len(sentences_in_folds) == fold_count
            assert len(sentences_in_folds) == len(labels_in_folds)
            all_validation_accuracies = []
            for j, (val_sent, val_labels) in enumerate(
                    zip(sentences_in_folds, labels_in_folds)):
                assert len(val_sent) == len(val_labels)

                # Everything outside fold j is this round's training material.
                tr_sent = []
                tr_labels = []
                for k, (fold_sent, fold_labels) in enumerate(
                        zip(sentences_in_folds, labels_in_folds)):
                    if k != j:
                        tr_sent.extend(fold_sent)
                        tr_labels.extend(fold_labels)
                assert len(tr_sent) == len(tr_labels)

                train_dict = {'sentences': tr_sent, 'labels': tr_labels}
                validation_dict = {'sentences': val_sent, 'labels': val_labels}

                print('Training the fold number %i will begin now' % (j + 1))
                [current_validation_accuracy, f1,
                 conf_mat] = get_accuracy(rnn,
                                          train_dict,
                                          validation_dict,
                                          word2index,
                                          label2index,
                                          settings,
                                          current_learning_rate,
                                          e,
                                          index2label,
                                          is_validation=True)

                all_validation_accuracies.append(current_validation_accuracy)
            assert len(all_validation_accuracies) == fold_count
            mean_validation = sum(all_validation_accuracies) / len(
                all_validation_accuracies)
            if mean_validation > best_accuracy:
                best_accuracy = mean_validation
                # Remember the epoch so the final report and the decay
                # counter below refer to the real best epoch.
                best_epoch = e
                print('New best validation accuracy: %2.2f%%' % best_accuracy)
                rnn.save(model_folder)
                print('A new RNN has been saved.')
            else:
                print(
                    'Validation phase did not come up with a better accuracy (only %2.2f%%).'
                    '. A new epoch will begin' % mean_validation)
                # Fall back to the last saved network.
                rnn = rnn.load(model_folder)
        else:
            ##################
            # Training phase #
            ##################
            shuffle([train_sentences, train_labels], settings['seed'])
            print('Training in progress')
            training_dict = {
                'sentences': train_sentences,
                'labels': train_labels
            }
            testing_dict = {'sentences': test_sentences, 'labels': test_labels}
            [testing_accuracy, f1,
             conf_mat] = get_accuracy(rnn,
                                      training_dict,
                                      testing_dict,
                                      word2index,
                                      label2index,
                                      settings,
                                      current_learning_rate,
                                      e,
                                      index2label,
                                      is_validation=False)

            print(
                'Accuracy during the testing phase (number of correct guessed labels) at %2.2f%%.'
                % testing_accuracy)

            # check if current epoch is the best
            if testing_accuracy > best_accuracy:
                best_accuracy = testing_accuracy
                best_epoch = e
                print('Better testing accuracy !!')

        # Halve the learning rate once the best epoch is 5 or more epochs old.
        # NOTE(review): this runs even though settings['decay'] is False;
        # confirm whether the flag is meant to gate it.
        if e - best_epoch >= 5:
            current_learning_rate *= 0.5

        if current_learning_rate < 1e-5:
            break

    print(
        'BEST RESULT: epoch ',
        best_epoch,
        'with best accuracy: ',
        best_accuracy,
        '.',
    )
Exemplo n.º 9
0
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = {  # 'fold': 3, # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 15,  # number of characters in the context window
        'bs': 5,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 30,  # dimension of character embedding
        'nepochs': 10
    }
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    print "Calculate output"
    session_files = get_session_files(
        number_of_files=number_of_files,
        random_seed=conf['seed'])  # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(
            np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(
            np.fromiter((char2idx[char] for char in sentence_out),
                        dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex,
                                                              train_valid_y,
                                                              test_size=0.2,
                                                              random_state=42)
    print "Some more prep"
    idx2label = dict(
        (k, v) for v, k in labels2idx.iteritems())  # Reverse the dictionary
    idx2word = dict(
        (k, v) for v, k in char2idx.iteritems())  # Reverse the dictionary

    #     vocsize = 1 + len(set(reduce(\
    #                                  lambda x, y: list(x)+list(y),\
    #                                  train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(
        set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex
            for item in sublist))
    nclasses = 2  #len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)
    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(
        nh=conf['nhidden'],
        nc=nclasses,
        ne=vocsize,
        de=conf['emb_dimension'],
        cs=conf['win'],
    )

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [
                np.asarray(x).astype(np.int32)
                for x in minibatch(cwords, conf['bs'])
            ]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (
                    epoch, (i + 1) * 100. / nsentences
                ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid[
                'f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test[
                'p'], res_test['r']
            conf['be'] = epoch
            subprocess.call([
                'mv', folder + '/current.test.txt', folder + '/best.test.txt'
            ])
            subprocess.call([
                'mv', folder + '/current.valid.txt', folder + '/best.valid.txt'
            ])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid[
                'f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf[
        'be'], 'valid F1', best_f1, 'best test F1', conf[
            'tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
Exemplo n.º 10
0
def play_with_spelling():
    """Play with spelling mistakes"""
    print CONF
    np.random.seed(CONF['seed'])
    random.seed(CONF['seed'])
    print "Calculate output"
    session_files = get_session_files(number_of_files=CONF['number_of_files'], random_seed=CONF['seed'])
    sentences = get_sentences(session_files)
    print len(sentences)
    labels2idx = char2idx = get_char_to_idx(sentences)

    print "Prepare train, validation and test sets"
    train_valid_sentences, test_sentences = train_test_split(sentences, test_size=0.15, random_state=CONF['seed'])
    train_sentences, valid_sentences = train_test_split(train_valid_sentences, test_size=0.2, random_state=CONF['seed'])
    print len(train_valid_sentences), len(test_sentences)
    test_lex, test_y = create_tests(test_sentences, CONF['error_probability'], labels2idx, char2idx)
    valid_lex, valid_y = create_tests(valid_sentences, CONF['error_probability'], labels2idx, char2idx)
    train_lex = []
    train_y = []
    for error_probability in (CONF['error_probability'], CONF['error_probability'] / 10, CONF['error_probability'] / 100, 0):
        _train_idxes, _train_labels_idxes = create_tests(train_sentences, error_probability, labels2idx, char2idx)
        train_lex.extend(_train_idxes)
        train_y.extend(_train_labels_idxes)
#     train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_idxes, train_valid_labels_idxes, test_size=0.2, random_state=CONF['seed'])
    print len(train_lex), len(valid_lex), len(train_y), len(valid_y)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems()) # Reverse the dictionary
    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
    windowed_test_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in test_lex]
    windowed_valid_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in valid_lex]

    words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]
    groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
    words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 1 + len(set(item for _y in (train_y, test_y, valid_y) for sublist in _y for item in sublist))
    nsentences = len(train_lex)

    words_lex = []
    for i in xrange(nsentences):
        cwords = contextwin(train_lex[i], CONF['win'])
        words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, CONF['batch_size'])]
        words_lex.append(words)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(nh=CONF['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=CONF['emb_dimension'],
                        cs=CONF['win'],)

    # train with early stopping on validation set
    best_f1 = -np.inf
    CONF['current_learning_rate'] = CONF['learning_rate']
    print "Start training"
    start_time = print_time = time.time()
    for epoch in xrange(CONF['nepochs']):
        # shuffle
        shuffle([words_lex, train_y], CONF['seed'])
        CONF['ce'] = epoch
        tic = time.time()
        percentage_of_sentences_to_train = (epoch + 1) / CONF['nepochs']
        numer_of_sentences_to_train = int(nsentences * percentage_of_sentences_to_train)
        print "starting an epoch, numer_of_sentences_to_train =", numer_of_sentences_to_train
        test_size = int(len(windowed_test_lex) * percentage_of_sentences_to_train)
        print "test_size", test_size
        validation_size = int(len(windowed_valid_lex) * percentage_of_sentences_to_train)
        print "validation_size", validation_size
        for _ in xrange(30): # Trauma!
            print "_", _
            for i in xrange(numer_of_sentences_to_train):
                words = words_lex[i]
                labels = train_y[i]
                for word_batch, label_last_word in zip(words, labels):
                    rnn.train(word_batch, label_last_word, CONF['current_learning_rate'])
                    rnn.normalize()
                if CONF['verbose'] and time.time() - print_time > 30:
                    print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / numer_of_sentences_to_train), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                    print_time = time.time()            

        # evaluation // back into the real world : idx -> words
        if CONF['verbose']:
            print "Classify test"
        predictions_test = [[idx2label[x] for x in rnn.classify(windowed_test_lex_item)]
                            for windowed_test_lex_item in windowed_test_lex[:test_size]]

        if CONF['verbose']:
            print "Classify validation"
        predictions_valid = [[idx2label[x] for x in rnn.classify(windowed_valid_lex_item)]
                             for windowed_valid_lex_item in windowed_valid_lex[:validation_size]]
        # evaluation // compute the accuracy using conlleval.pl
        if CONF['verbose']:
            print "Evaluate test and validation"
        res_test = conlleval(predictions_test, groundtruth_test[:test_size], words_test[:test_size], folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid[:validation_size], words_valid[:validation_size], folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            CONF['vf1'], CONF['vp'], CONF['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            CONF['tf1'], CONF['tp'], CONF['tr'] = res_test['f1'], res_test['p'], res_test['r']
            CONF['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20
#             rnn.load(folder)

        # learning rate decay if no improvement in 10 epochs
        if CONF['decay'] and abs(CONF['be'] - CONF['ce']) >= 10:
            CONF['current_learning_rate'] *= 0.5
        if CONF['current_learning_rate'] < 1e-5:
            break

    print 'BEST RESULT: epoch', CONF['be'], 'valid F1', best_f1, 'best test F1', CONF['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
Exemplo n.º 11
0
def prepare_data():
    """Prepare the data"""
    conf = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': True,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 300,  # dimension of word embedding
        'nepochs': 50
    }
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    session_files = get_session_files(
        number_of_files=None,
        random_seed=conf['seed'])  # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []
    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]
    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(
            np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
        #         token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(
            np.fromiter(
                (words2idx.get(token, unknown) for token in token_list),
                dtype=np.int32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex,
                                                              train_valid_y,
                                                              test_size=0.2,
                                                              random_state=42)

    idx2label = dict(
        (k, v) for v, k in LABELS2IDX.iteritems())  # Reverse the dictionary
    idx2word = dict(
        (k, v) for v, k in words2idx.iteritems())  # Reverse the dictionary

    vocsize = len(idx2word)

    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))

    nsentences = len(train_lex)
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME,
                                             binary=True)  # C binary format

    print "Calculate word embeddings"
    embeddings = 0.2 * np.random.uniform(
        -1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(
            theano.config.floatX
        )  # add one for PADDING at the end @UndefinedVariable
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx]  # Keep it random
        embeddings[idx] = embedding

    del word2vec  # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                    nc=nclasses,
                    ne=vocsize,
                    de=conf['emb_dimension'],
                    cs=conf['win'],
                    embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [
                np.asarray(x).astype(np.int32)
                for x in minibatch(cwords, conf['bs'])
            ]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])


#                 rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (
                    epoch, (i + 1) * 100. / nsentences
                ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid[
                'f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test[
                'p'], res_test['r']
            conf['be'] = epoch
            subprocess.call([
                'mv', folder + '/current.test.txt', folder + '/best.test.txt'
            ])
            subprocess.call([
                'mv', folder + '/current.valid.txt', folder + '/best.valid.txt'
            ])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid[
                'f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid[
        'f1'], 'best test F1', res_test['f1'], 'with the model', folder
Exemplo n.º 12
0
def _load_word_vectors(wv_folder, emb_dimension):
    """Read a pre-trained embedding matrix and its vocabulary from *wv_folder*.

    The files read are ``<wv_folder>/words<emb_dimension>.npy`` and
    ``<wv_folder>/words<emb_dimension>.vocab``; line ``i`` of the vocab file
    names the word stored in row ``i`` of the matrix.

    Returns:
        ``(wv_file, vocab_file, vectors, vocab, word_to_row)``.
    """
    stem = wv_folder + '/' + 'words' + str(emb_dimension)
    wv_file = stem + '.npy'
    vocab_file = stem + '.vocab'
    vectors = numpy.load(wv_file)
    with open(vocab_file) as f:
        vocab = [line.strip() for line in f if len(line) > 0]
    word_to_row = dict((word, row) for row, word in enumerate(vocab))
    return wv_file, vocab_file, vectors, vocab, word_to_row


def _fill_embedding_columns(wv, columns, vectors, word_to_row, idx2word,
                            vocsize, fallback):
    """Fill ``wv[i, columns]`` for every word index ``i < vocsize``.

    Words present in *word_to_row* get their pre-trained vector, all others
    share the single *fallback* vector. Returns the number of missing words.
    """
    miss = 0
    for i in range(vocsize):
        word = idx2word[i]
        if word in word_to_row:
            wv[i, columns] = vectors[word_to_row[word]]
        else:
            wv[i, columns] = fallback
            miss += 1
    return miss


def run(params):
    """Train the attention Elman model and log per-epoch evaluation results.

    Args:
        params: configuration dict. Keys read here: ``dataset`` (one of
            ``'atis'``, ``'ner'``, ``'chunk'``, ``'pos'``), ``fold`` (atis
            only), ``WVFolder`` (``'random'``, a folder, or a bracketed
            comma-separated list of folders), ``WVModel['emb_dimension']``,
            ``seed``, ``nhidden``, ``attention``, ``h_win_left``,
            ``h_win_right``, ``lvrg``, ``nepochs``, ``dropRate``,
            ``verbose`` and ``JSONOutputFile``. The dict is updated in place
            with ``measure``, ``WVFile``, ``WVVocabFile``,
            ``WVModel['vocab_size']``, ``results`` and ``running_time``.

    Returns:
        None. The final ``params`` dict is written as JSON to
        ``params['JSONOutputFile']``; conlleval files are written to a folder
        named after this script.

    Raises:
        ValueError: if ``params['dataset']`` is not a known dataset or the
            bracketed ``WVFolder`` list is empty.
    """
    start_time = time.time()

    # Output directory for the evaluation files, named after this script.
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)
    # 100,90,80,70,60,50,0 # combining forward and backward layers
    rhoList = numpy.array([100, 50]).astype(numpy.int32)

    # load the dataset
    eval_options = []
    params['measure'] = 'F1score'
    dataset = params['dataset']
    if dataset == 'atis':
        train_set, valid_set, test_set, dic = loadData.atisfold(params['fold'])
    elif dataset == 'ner':
        train_set, valid_set, test_set, dic = loadData.ner()
    elif dataset == 'chunk':
        train_set, valid_set, test_set, dic = loadData.chunk()
    elif dataset == 'pos':
        train_set, valid_set, test_set, dic = loadData.pos()
        eval_options = ['-r']
        params['measure'] = 'Accuracy'
    else:
        raise ValueError('unknown dataset: %r' % (dataset,))

    idx2label = dict((k, v) for v, k in dic['labels2idx'].items())
    idx2word = dict((k, v) for v, k in dic['words2idx'].items())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # Optional pre-trained embeddings. The word-vector directories use their
    # own variable (wv_folder) so that `folder`, the evaluation output
    # directory, is not overwritten before the conlleval calls below.
    wv = None
    if params['WVFolder'] != 'random':
        if '[' in params['WVFolder'] and ']' in params['WVFolder']:
            # "[a,b,c]" -> ['a', 'b', 'c']; parsed as plain text (no eval),
            # de-duplicated while keeping the listed order so the column
            # layout of the concatenated embeddings is reproducible.
            wv_folders = []
            for name in params['WVFolder'].strip().strip('[]').split(','):
                name = name.strip()
                if name and name not in wv_folders:
                    wv_folders.append(name)
            if not wv_folders:
                raise ValueError('empty word-vector folder list: %r' %
                                 (params['WVFolder'],))
            print(wv_folders)

            dim = params['WVModel']['emb_dimension']
            # One extra row beyond the vocabulary, left at zero.
            wv = numpy.zeros((vocsize + 1, dim * len(wv_folders)))

            for modelIndex, wv_folder in enumerate(wv_folders):
                (params['WVFile'], params['WVVocabFile'], wvnp, vocab,
                 wi) = _load_word_vectors(wv_folder, dim)

                # A single random vector shared by every out-of-vocabulary
                # word of this model.
                random_v = math.sqrt(6.0 / dim) * numpy.random.uniform(
                    -1.0, 1.0, dim)
                columns = slice(dim * modelIndex, dim * (modelIndex + 1))
                # miss: number of words absent from the pre-trained embeddings
                miss = _fill_embedding_columns(wv, columns, wvnp, wi,
                                               idx2word, vocsize, random_v)
                print("missing words rate : ", miss, '/', vocsize)
                params['WVModel']['vocab_size'] = len(vocab)

            params['WVModel']['emb_dimension'] *= len(wv_folders)
        else:
            wv_folder = params['WVFolder']
            (params['WVFile'], params['WVVocabFile'], wvnp, vocab,
             wi) = _load_word_vectors(wv_folder,
                                      params['WVModel']['emb_dimension'])
            # The loaded matrix decides the actual embedding width.
            dim = len(wvnp[0])
            params['WVModel']['emb_dimension'] = dim

            wv = numpy.zeros((vocsize + 1, dim))
            random_v = math.sqrt(6.0 / dim) * numpy.random.uniform(
                -1.0, 1.0, dim)
            miss = _fill_embedding_columns(wv, slice(None), wvnp, wi,
                                           idx2word, vocsize, random_v)
            print("missing words rate : ", miss, '/', vocsize)
            params['WVModel']['vocab_size'] = len(vocab)

    print(json.dumps(params, sort_keys=True, indent=4, separators=(',', ': ')))

    rhoSuffix = "%_forward"
    rho_keys = [str(rho) + rhoSuffix for rho in rhoList]
    best_valid = dict((key, -numpy.inf) for key in rho_keys)
    best_test = dict((key, -numpy.inf) for key in rho_keys)
    # Per-epoch history, used for drawing line charts.
    validMeasureList = dict((key, []) for key in rho_keys)
    testMeasureList = dict((key, []) for key in rho_keys)

    # instantiate the model
    numpy.random.seed(params['seed'])
    random.seed(params['seed'])
    rnn = elman_attention.model(nh=params['nhidden'],
                                nc=nclasses,
                                ne=vocsize,
                                de=params['WVModel']['emb_dimension'],
                                attention=params['attention'],
                                h_win=(params['h_win_left'],
                                       params['h_win_right']),
                                lvrg=params['lvrg'],
                                wv=wv)

    def predict(lex):
        """Return, per sentence, one decoded label sequence per rho value."""
        return [[[idx2label[t] for t in w]
                 for w in rnn.classify(
                     numpy.asarray(contextwin(x)).astype('int32'),
                     params['dropRate'], 0, rhoList)]
                for x in lex]

    # train
    for e in range(params['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], params['seed'])

        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i])
            labels = train_y[i]

            rnn.train(cwords, labels, params['dropRate'], 1)

            if params['verbose']:
                elapsed = time.time() - tic
                sys.stdout.write(
                    '\r[learning] epoch %i >> %2.2f%%' %
                    (e, (i + 1) * 100. / nsentences) +
                    '  average speed in %.2f (min) <<' %
                    (elapsed / 60 / (i + 1) * nsentences) +
                    ' completed in %.2f (sec) <<' % elapsed)
                sys.stdout.flush()

        print('start test', time.time() / 60)

        print('start pred train', time.time() / 60)
        predictions_train = predict(train_lex)
        predictions_test = predict(test_lex)
        predictions_valid = predict(valid_lex)

        # Built once per epoch as real lists, so they can be reused for
        # every rho value below.
        groundtruth_train = [[idx2label[t] for t in y] for y in train_y]
        words_train = [[idx2word[t] for t in w] for w in train_lex]
        groundtruth_test = [[idx2label[t] for t in y] for y in test_y]
        words_test = [[idx2word[t] for t in w] for w in test_lex]
        groundtruth_valid = [[idx2label[t] for t in y] for y in valid_y]
        words_valid = [[idx2word[t] for t in w] for w in valid_lex]

        for i_rho, key in enumerate(rho_keys):
            ptrain = [p[i_rho] for p in predictions_train]
            ptest = [p[i_rho] for p in predictions_test]
            pvalid = [p[i_rho] for p in predictions_valid]

            file_suffix = str(i_rho) + str(params['seed'])
            res_train = conlleval(
                ptrain, groundtruth_train, words_train,
                folder + '/current.train.txt' + file_suffix, eval_options)
            res_test = conlleval(
                ptest, groundtruth_test, words_test,
                folder + '/current.test.txt' + file_suffix, eval_options)
            res_valid = conlleval(
                pvalid, groundtruth_valid, words_valid,
                folder + '/current.valid.txt' + file_suffix, eval_options)

            print('                                     epoch', e, ' rhoList ',
                  i_rho, '  train p', res_train['p'], 'valid p',
                  res_valid['p'], '  train r', res_train['r'], 'valid r',
                  res_valid['r'], '  train ', params['measure'],
                  res_train['measure'], 'valid ', params['measure'],
                  res_valid['measure'], 'best test ', params['measure'],
                  res_test['measure'], ' ' * 20)

            validMeasureList[key].append(res_valid['measure'])
            testMeasureList[key].append(res_test['measure'])

            # Model selection on the validation score; the test score of the
            # same epoch is recorded alongside it.
            if res_valid['measure'] > best_valid[key]:
                best_valid[key] = res_valid['measure']
                best_test[key] = res_test['measure']

        for i_rho, key in enumerate(rho_keys):
            # this is used for drawing line chart.
            print(i_rho, params['dataset'], end=' ')
            for v in testMeasureList[key]:
                print(v, end=' ')
            print('')

        for i_rho, key in enumerate(rho_keys):
            print('current best results', rhoList[i_rho], ' ',
                  best_valid[key], '/', best_test[key])

    end_time = time.time()

    measure = params['measure']
    params['results'] = {
        'best_valid_' + measure: best_valid,
        'best_test_' + measure: best_test,
        'valid_' + measure + 'ListBasedOnEpochs': validMeasureList,
        'test_' + measure + 'ListBasedOnEpochs': testMeasureList,
    }
    params['running_time'] = {
        'start': time.strftime("%Y-%m-%d %H:%M:%S",
                               time.localtime(start_time)),
        'end': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)),
        'duration': end_time - start_time,
    }

    with open(params['JSONOutputFile'], 'w') as outputFile:
        json.dump(params,
                  outputFile,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))
Exemplo n.º 13
0
    # instanciate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model(    nh = s['nhidden'],
                    nc = nclasses,
                    ne = vocsize,
                    de = s['emb_dimension'],
                    cs = s['win'] )

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words  = map(lambda x: numpy.asarray(x).astype('int32'),\
                         minibatch(cwords, s['bs']))
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()
            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./nsentences),'completed in %.2f (sec) <<\r'%(time.time()-tic),
                sys.stdout.flush()
            
        # evaluation // back into the real world : idx -> words
Exemplo n.º 14
0
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = { # 'fold': 3, # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True, # decay on the learning rate if improvement stops
        'win': 15, # number of characters in the context window
        'bs': 5, # number of back-propagation through time steps
        'nhidden': 100, # number of hidden units
        'seed': 345,
        'emb_dimension': 30, # dimension of character embedding
        'nepochs': 10}
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    print "Calculate output"
    session_files = get_session_files(number_of_files=number_of_files, random_seed=conf['seed']) # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(np.fromiter((char2idx[char] for char in sentence_out), dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)
    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems()) # Reverse the dictionary

#     vocsize = 1 + len(set(reduce(\
#                                  lambda x, y: list(x)+list(y),\
#                                  train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 2  #len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)
    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(nh=conf['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=conf['emb_dimension'],
                        cs=conf['win'],)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, 'best test F1', conf['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
Exemplo n.º 15
0
def prepare_data():
    """Prepare the data"""
    conf = {'fold': 3, # 5 folds 0,1,2,3,4
            'lr': 0.0627142536696559,
            'verbose': True,
            'decay': True, # decay on the learning rate if improvement stops
            'win': 7, # number of words in the context window
            'bs': 9, # number of back-propagation through time steps
            'nhidden': 100, # number of hidden units
            'seed': 345,
            'emb_dimension': 300, # dimension of word embedding
            'nepochs': 50}
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    session_files = get_session_files(number_of_files=None, random_seed=conf['seed']) # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []
    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]
    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
#         token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(np.fromiter((words2idx.get(token, unknown) for token in token_list), dtype=np.int32))



    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    idx2label = dict((k, v) for v, k in LABELS2IDX.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in words2idx.iteritems()) # Reverse the dictionary

    vocsize = len(idx2word)

    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))

    nsentences = len(train_lex)
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME, binary=True) # C binary format

    print "Calculate word embeddings"
    embeddings = 0.2 * np.random.uniform(-1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(theano.config.floatX) # add one for PADDING at the end @UndefinedVariable
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx] # Keep it random
        embeddings[idx] = embedding

    del word2vec # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                nc=nclasses,
                ne=vocsize,
                de=conf['emb_dimension'],
                cs=conf['win'],
                embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
#                 rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], 'with the model', folder