示例#1
0
def train(dim_word=100,  # word vector dimensionality
          dim_char=10,  # the number of LSTM units
          max_char=10,  # the number of LSTM units
          dim=100,  # the number of LSTM units
	  win=5, #Window size
	  bs=5, #number of backprop through time steps
	  seed=123,
	  verbose=1,
          use_model='GRU', #Choose the model from- LSTM, DEEPLSTM, RNN, 
          patience=10,  # early stopping patience
          max_epochs=50,
          lrate=0.0005,  # learning rate
          maxlen=100,  # maximum length of the description
          data_train=['data/qe/train/train.src.lc',
              'data/qe/train/train.mt.lc',
              'data/qe/train/train.align'],
          data_train_y = 'data/qe/train/train.tags',
          data_valid=['data/qe/dev/dev.src.lc',
                'data/qe/dev/dev.mt.lc',
                'data/qe/dev/dev.align'],
          data_valid_y = 'data/qe/dev/dev.tags',
          data_test=['data/qe/test/test.src.lc',
                'data/qe/test/test.mt.lc',
                'data/qe/test/test.align'],
          data_test_y = 'data/qe/test/test.tags',
          dictionaries=['data/qe/train/train.src.lc.json',
              'data/qe/train/train.mt.lc.json'],
          character2index=['data/qe/train/train.src.lc.dict_char.json',
              'data/qe/train/train.mt.lc.dict_char.json'],
	  label2index = 'data/qe/train/train.tags.json',
          embeddings=['data/qe/pretrain/ep_qe.en.vector.txt',
              'data/qe/pretrain/ep_qe.de.vector.txt'],
	  use_adadelta=False,
          use_bilingual=False,
          use_pretrain=False,
          use_quest=False,
          use_tag=False,
          use_char=False,
          saveto=False,
          shuffle_each_epoch=True,
	  load_data=None,
    ):

	model_options = OrderedDict(sorted(locals().copy().items()))
	print 'Model_Options:', model_options

	model_name = model_options['use_model'][0]
	if model_options['use_adadelta']:
		model_name += '_adadelta'
	if model_options['use_char']:
		model_name += '_char'
	if model_options['use_bilingual']:
		model_name += '_bilingual'
	if model_options['use_pretrain']:
		model_name += '_pretrain'

	print 'Using model:', model_name

	processed_data = []
	if load_data:
	    with gzip.open(load_data[0],'rb') as fp:
			processed_data = cPickle.load(fp)
	else:
	    processed_data = preprocess_data(data_train=model_options['data_train'], 
		data_train_y=model_options['data_train_y'][0],
		data_valid=model_options['data_valid'], data_valid_y=model_options['data_valid_y'][0], 
		data_test=model_options['data_test'], data_test_y=model_options['data_test_y'][0], 
		dictionaries=model_options['dictionaries'],
		character2index=model_options['character2index'],
		label2index = model_options['label2index'][0],
		embeddings = model_options['embeddings'],
		use_bilingual=model_options['use_bilingual'], 
		use_char=model_options['use_char'], 
		use_pretrain=model_options['use_pretrain'])

	"""
	Savinn the model/data with model_name
	"""
	save_data = folder = ''
	if use_tag:
		save_data = 'tag.data_' + model_name + '.pkl.gz'
		folder = 'tag.' + model_name
	if use_quest:
		save_data = 'quest.data_' + model_name + '.pkl.gz'
		folder = 'quest.' + model_name

	if saveto:
		with gzip.open(save_data,'wb') as fp:
       			cPickle.dump(processed_data, fp)
    	if not os.path.exists(folder): os.mkdir(folder)

	train, train_y, test, test_y, valid, valid_y, w2idxs, char2idxs, label2idxs, embs=processed_data
	idx2label = dict((k,v) for v,k in label2idxs.iteritems())
	#print len(train), len(test), len(valid)

	vocsize_s = vocsize_t = vocsize_schar = vocsize_tchar = 0
        emb_s, emb_t, train_s, train_schar, train_t, train_tchar, test_s, test_schar, test_t, test_tchar, valid_s, valid_schar, valid_t, valid_tchar = ([] for i in range(14))
		
	if (use_bilingual or len(train) == 4) and use_char:
		emb_s, emb_t = embs
		train_s, train_t, train_schar, train_tchar = train
		test_s, test_t, test_schar, test_tchar = test
		valid_s, valid_t, valid_schar, valid_tchar = valid
    		vocsize_s = len(w2idxs[0])
    		vocsize_t = len(w2idxs[1])
		vocsize_schar = len(char2idxs[0])
		vocsize_tchar = len(char2idxs[1])

	elif use_char:
		emb_t = embs[0]
		train_t, train_tchar = train
		test_t, test_tchar = test
		valid_t, valid_tchar = valid
    		vocsize_t = len(w2idxs[0])
		vocsize_tchar = len(char2idxs[0])

	elif use_bilingual or len(train) == 2:
		emb_s, emb_t = embs
		train_s, train_t = train
		test_s, test_t = test
		valid_s, valid_t = valid
    		vocsize_s = len(w2idxs[0])
    		vocsize_t = len(w2idxs[1])
	else :
		emb_t = embs[0]
		train_t = train[0]
		test_t = test[0]
		valid_t = valid[0]
    		vocsize_t = len(w2idxs[0])

    	nclasses = len(label2idxs)
    	nsentences = len(train_t)

    	numpy.random.seed(model_options['seed'])
    	# instanciate the model
    	rnn = select_model[model_name]( nh = model_options['dim'],
                    nc = nclasses,
                    de = model_options['dim_word'],
                    cs = model_options['win'],
                    de_char = model_options['dim_char'],
		    ne_char = vocsize_tchar,
		    ne_src = vocsize_s,
		    ne_tgt = vocsize_t,
		    emb_src = emb_s,
		    emb_tgt = emb_t,
		    max_char = model_options['max_char'])

    	# train with early stopping on validation set
    	best_f1 = -numpy.inf
    	model_options['patience'] = 2
    	batch_size = (nsentences/100) * 10
    	n_batches = nsentences//batch_size
    	print n_batches
    	for e in xrange(model_options['max_epochs']):
	  model_options['ce'] = e
      	  #shuffle
	  if shuffle_each_epoch:
      	  	shuffle([train_t, train_s, train_tchar, train_y], model_options['seed'])

      	  tic = time.time()
      	  for k in xrange(n_batches):
            #Creating batches
	    batch_train_s = []
	    batch_train_char = []

	    if model_options['use_bilingual']:
            	batch_train_s = train_s[k*batch_size:(k+1)*batch_size]
	    if model_options['use_char']:
            	batch_train_char = train_tchar[k*batch_size:(k+1)*batch_size]

            batch_train_t = train_t[k*batch_size:(k+1)*batch_size]
            batch_train_y = train_y[k*batch_size:(k+1)*batch_size]
            batch_err = 0
            for i in xrange(batch_size):
		cwords_src = []
		padded_chars = []
		if model_options['use_bilingual']:
                	cwords_src = contextwin(batch_train_s[i], model_options['win'])
		if model_options['use_char']:
			padded_chars = add_padding(batch_train_char[i], model_options['max_char'])

		#print batch_train_char[0]
		#print padded_chars
                cwords_tgt = contextwin(batch_train_t[i], model_options['win'])
                labels = batch_train_y[i]

		if model_options['use_bilingual'] and model_options['use_char']:
                     err = rnn.train_grad_shared(cwords_src, cwords_tgt, padded_chars, labels, model_options['lrate'])
		elif model_options['use_char']:
                     err = rnn.train_grad_shared(cwords_tgt, padded_chars, labels, model_options['lrate'])
		elif model_options['use_bilingual']:
                     err = rnn.train_grad_shared(cwords_src, cwords_tgt, labels, model_options['lrate'])
		elif model_options['use_adadelta']:
                     err = rnn.train_grad_shared(cwords_tgt, labels, model_options['lrate'])
		else:
		     err = rnn.train(cwords_tgt, labels, model_options['lrate'])
                
		if model_options['use_adadelta']:
		     rnn.train_update(model_options['lrate'])

                rnn.normalize()
                
                if model_options['verbose']:
                    print '[learning] epoch %i batch %i >> %2.2f%%'%(e, k, (i+1)*100./batch_size),'completed in %.2f (sec) <<\r'%(time.time()-tic),
		    sys.stdout.flush()

	    if(k % model_options['patience'] == 0):

		predictions_test, groundtruth_test, predictions_valid, \
			groundtruth_valid = ([] for i in range(4))

		if model_options['use_bilingual'] and model_options['use_char']:
			predictions_test = [ map(lambda x: idx2label[x],
				rnn.classify(numpy.asarray(contextwin(x, 
				model_options['win'])).astype('int32'),
				numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
				numpy.asarray(add_padding(__x, 
				model_options['max_char'])).astype('int32')))
				for x, _x, __x in zip(test_s, test_t, test_tchar) ]
                	groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                	#words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

                	predictions_valid = [ map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x, 
				 model_options['win'])).astype('int32'),
				 numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
				 numpy.asarray(add_padding(__x, 
				 model_options['max_char'])).astype('int32')))
                                 for x, _x, __x in zip(valid_s, valid_t, valid_tchar) ]
                	groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]

		elif model_options['use_bilingual']:
			#evaluation // back into the real world : idx -> words
            		predictions_test = [ map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x_src, 
				 model_options['win'])).astype('int32'),
                                 numpy.asarray(contextwin(x_tgt,model_options['win'])).astype('int32')))
                                 for x_src, x_tgt in zip(test_s, test_t) ]
            		groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
           		#words_test = [ map(lambda x: idx2word_de[x], w) for w in test_lex]

            		predictions_valid = [ map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x_src, 
				 model_options['win'])).astype('int32'),
                                 numpy.asarray(contextwin(x_tgt,model_options['win'])).astype('int32')))
                                 for x_src, x_tgt in zip(valid_s, valid_t) ]
            		groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
            		#words_valid = [ map(lambda x: idx2word_de[x], w) for w in valid_lex]


		elif model_options['use_char']:
			predictions_test = [ map(lambda x: idx2label[x],
				rnn.classify(numpy.asarray(contextwin(x, 
				model_options['win'])).astype('int32'),
				numpy.asarray(add_padding(_x, 
				model_options['max_char'])).astype('int32')))
                                for x, _x, in zip(test_t, test_tchar) ]
                	groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                	#words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

                	predictions_valid = [ map(lambda x: idx2label[x],
				rnn.classify(numpy.asarray(contextwin(x, 
				model_options['win'])).astype('int32'),
				numpy.asarray(add_padding(_x,
				model_options['max_char'])).astype('int32')))
                                for x, _x, in zip(valid_t, valid_tchar) ]
                	groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
		else:
                	#evaluation // back into the real world : idx -> words
                	predictions_test = [ map(lambda x: idx2label[x],
				rnn.classify(numpy.asarray(contextwin(x,
				model_options['win'])).astype('int32'))) for x in test_t ]

                	groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                	#words_test = [ map(lambda x: idx2word[x], w) for w in test_t]

                	predictions_valid = [ map(lambda x: idx2label[x], 
				rnn.classify(numpy.asarray(contextwin(x, 
				model_options['win'])).astype('int32'))) for x in valid_t ]
                	groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
                	#words_valid = [ map(lambda x: idx2word[x], w) for w in valid_t]

                #evaluation // compute the accuracy using conlleval.pl
		res_test = []
		res_valid = []
		current_score = 0
		if model_options['use_quest']:
                   res_test=wmt_eval(predictions_test, groundtruth_test, folder+'/current.test.txt')
               	   res_valid=wmt_eval(predictions_valid, groundtruth_valid, folder+'/current.valid.txt')
		   current_score = res_valid[2][0]
		if model_options['use_tag']:
                  res_test=icon_eval(predictions_test, groundtruth_test, folder+'/current.test.txt')
                  res_valid=icon_eval(predictions_valid, groundtruth_valid, folder+'/current.valid.txt')
		  current_score = res_valid[1]

                if current_score > best_f1:

		    """
			Save the model and model parameters
		    """
                    rnn.save(folder)
		    filename = folder +'/model'
		    with open('%s.json'%filename, 'wb') as f:
			  json.dump(model_options, f, indent=2)

                    best_f1 = current_score
                    if model_options['verbose']:
                        print 'NEW BEST: epoch', e, 'valid F1', res_valid, 'test F1' , res_test , ' '*20
                    model_options['be'] = e
		    subprocess.call(['mv', folder + '/current.test.txt.hyp', folder+'/best.test.txt'])
                    subprocess.call(['mv', folder + '/current.valid.txt.hyp', folder+'/best.valid.txt'])
                else:
                    print ''
          #Break if no improvement in 10 epochs
          if abs(model_options['be']-model_options['ce']) >= 10:  break
        print 'BEST RESULT: epoch', model_options['be'] , 'valid F1', best_f1 , 'with the model', folder
示例#2
0
    rnn = model(nh=s['nhidden'],
                nc=nclasses,
                ne=vocsize,
                de=s['emb_dimension'],
                cs=s['win'])

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in range(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words  = map(lambda x: numpy.asarray(x).astype('int32'),\
                         minibatch(cwords, s['bs']))
            labels = train_y[i]

            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print(
                    '[learning] epoch {} >> {:2.2f}, completed in {:.2f} (sec) '
                    .format(e, (i + 1) * 100. / nsentences,
                            time.time() - tic))

        # evaluation // back into the real world : idx -> words
示例#3
0
    # create a folder for store the models
    if not os.path.exists(model_folder): os.mkdir(model_folder)

    # train with early stopping on validation set
    best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
    s['clr'] = s['lr']  # learning rate

    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_y, train_feat], s['seed'])
        s['ce'] = e
        tic = time.time()

        for i in xrange(num_sentences):
            context_words = contextwin(
                train_lex[i], s['win']
            )  #list of list of indexes corresponding to context windows surrounding each word in the sentence
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(context_words, s['bs']))
            features = minibatch(train_feat[i], s['bs'])

            labels = train_y[i]

            for word_batch, feature_batch, label_last_word in zip(
                    words, features, labels):
                rnn.train(word_batch, feature_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./num_sentences),\
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
示例#4
0
def main():
    settings = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50
    }

    folder = os.path.basename(__file__).split('.')[0]

    if not os.path.exists(folder):
        os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(settings['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex,  test_ne,  test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(settings['seed'])
    random.seed(settings['seed'])

    if LOAD:
        print "Loading model from %s..." % folder

        rnn = ElmanRNNModel.load(folder)
    else:
        rnn = ElmanRNNModel(
            hidden_dims=settings['nhidden'],
            num_classes=nclasses,
            vocab_size=vocsize,
            embed_dims=settings['emb_dimension'],
            context_size=settings['win']
        )

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    settings['current_lr'] = settings['lr']
    for e in xrange(settings['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], settings['seed'])
        settings['current_epoch'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], settings['win'])

            words = map(
                lambda x: numpy.asarray(x).astype('int32'),
                minibatch(cwords, settings['bs'])
            )

            labels = train_y[i]

            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, settings['current_lr'])
                rnn.normalize()

            if settings['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./nsentences), \
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [
            map(lambda x: idx2label[x],
                rnn.classify(numpy.asarray(contextwin(x, settings['win'])).astype('int32')))
            for x in test_lex
        ]

        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y ]

        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [
            map(
                lambda idx: idx2label[idx],
                rnn.classify(
                    numpy.asarray(contextwin(x, settings['win'])).astype('int32'))
            )
            for x in valid_lex
        ]

        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]

        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if settings['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20
            settings['vf1'], settings['vp'], settings['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            settings['tf1'], settings['tp'], settings['tr'] = res_test['f1'],  res_test['p'],  res_test['r']
            settings['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        if settings['decay'] and abs(settings['be'] - settings['current_epoch']) >= 10:
            settings['current_lr'] *= 0.5

        if settings['current_lr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', e, 'valid F1', settings['vf1'], 'best test F1', settings['tf1'], 'with the model', folder
示例#5
0
    # create a folder for store the models
    if not os.path.exists(model_folder): os.mkdir(model_folder)

    # train with early stopping on validation set
    best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
    s['clr'] = s['lr'] # learning rate

    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_y, train_feat], s['seed'])
        s['ce'] = e
        tic = time.time()

        for i in xrange(num_sentences):
            context_words = contextwin(train_lex[i], s['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'), minibatch(context_words, s['bs']))
            features = minibatch(train_feat[i], s['bs'])

            labels   = train_y[i]

            for word_batch, feature_batch, label_last_word in zip(words, features, labels):
                rnn.train(word_batch, feature_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./num_sentences),\
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
示例#6
0
def play_with_spelling():
    """Play with spelling mistakes"""
    print CONF
    np.random.seed(CONF['seed'])
    random.seed(CONF['seed'])
    print "Calculate output"
    session_files = get_session_files(number_of_files=CONF['number_of_files'], random_seed=CONF['seed'])
    sentences = get_sentences(session_files)
    print len(sentences)
    labels2idx = char2idx = get_char_to_idx(sentences)

    print "Prepare train, validation and test sets"
    train_valid_sentences, test_sentences = train_test_split(sentences, test_size=0.15, random_state=CONF['seed'])
    train_sentences, valid_sentences = train_test_split(train_valid_sentences, test_size=0.2, random_state=CONF['seed'])
    print len(train_valid_sentences), len(test_sentences)
    test_lex, test_y = create_tests(test_sentences, CONF['error_probability'], labels2idx, char2idx)
    valid_lex, valid_y = create_tests(valid_sentences, CONF['error_probability'], labels2idx, char2idx)
    train_lex = []
    train_y = []
    for error_probability in (CONF['error_probability'], CONF['error_probability'] / 10, CONF['error_probability'] / 100, 0):
        _train_idxes, _train_labels_idxes = create_tests(train_sentences, error_probability, labels2idx, char2idx)
        train_lex.extend(_train_idxes)
        train_y.extend(_train_labels_idxes)
#     train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_idxes, train_valid_labels_idxes, test_size=0.2, random_state=CONF['seed'])
    print len(train_lex), len(valid_lex), len(train_y), len(valid_y)

    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems()) # Reverse the dictionary
    groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
    windowed_test_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in test_lex]
    windowed_valid_lex = [np.asarray(contextwin(x, CONF['win'])).astype('int32') for x in valid_lex]

    words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]
    groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
    words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 1 + len(set(item for _y in (train_y, test_y, valid_y) for sublist in _y for item in sublist))
    nsentences = len(train_lex)

    words_lex = []
    for i in xrange(nsentences):
        cwords = contextwin(train_lex[i], CONF['win'])
        words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, CONF['batch_size'])]
        words_lex.append(words)

    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(nh=CONF['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=CONF['emb_dimension'],
                        cs=CONF['win'],)

    # train with early stopping on validation set
    best_f1 = -np.inf
    CONF['current_learning_rate'] = CONF['learning_rate']
    print "Start training"
    start_time = print_time = time.time()
    for epoch in xrange(CONF['nepochs']):
        # shuffle
        shuffle([words_lex, train_y], CONF['seed'])
        CONF['ce'] = epoch
        tic = time.time()
        percentage_of_sentences_to_train = (epoch + 1) / CONF['nepochs']
        numer_of_sentences_to_train = int(nsentences * percentage_of_sentences_to_train)
        print "starting an epoch, numer_of_sentences_to_train =", numer_of_sentences_to_train
        test_size = int(len(windowed_test_lex) * percentage_of_sentences_to_train)
        print "test_size", test_size
        validation_size = int(len(windowed_valid_lex) * percentage_of_sentences_to_train)
        print "validation_size", validation_size
        for _ in xrange(30): # Trauma!
            print "_", _
            for i in xrange(numer_of_sentences_to_train):
                words = words_lex[i]
                labels = train_y[i]
                for word_batch, label_last_word in zip(words, labels):
                    rnn.train(word_batch, label_last_word, CONF['current_learning_rate'])
                    rnn.normalize()
                if CONF['verbose'] and time.time() - print_time > 30:
                    print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / numer_of_sentences_to_train), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                    print_time = time.time()            

        # evaluation // back into the real world : idx -> words
        if CONF['verbose']:
            print "Classify test"
        predictions_test = [[idx2label[x] for x in rnn.classify(windowed_test_lex_item)]
                            for windowed_test_lex_item in windowed_test_lex[:test_size]]

        if CONF['verbose']:
            print "Classify validation"
        predictions_valid = [[idx2label[x] for x in rnn.classify(windowed_valid_lex_item)]
                             for windowed_valid_lex_item in windowed_valid_lex[:validation_size]]
        # evaluation // compute the accuracy using conlleval.pl
        if CONF['verbose']:
            print "Evaluate test and validation"
        res_test = conlleval(predictions_test, groundtruth_test[:test_size], words_test[:test_size], folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid[:validation_size], words_valid[:validation_size], folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            CONF['vf1'], CONF['vp'], CONF['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            CONF['tf1'], CONF['tp'], CONF['tr'] = res_test['f1'], res_test['p'], res_test['r']
            CONF['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20
#             rnn.load(folder)

        # learning rate decay if no improvement in 10 epochs
        if CONF['decay'] and abs(CONF['be'] - CONF['ce']) >= 10:
            CONF['current_learning_rate'] *= 0.5
        if CONF['current_learning_rate'] < 1e-5:
            break

    print 'BEST RESULT: epoch', CONF['be'], 'valid F1', best_f1, 'best test F1', CONF['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
示例#7
0
    # create a folder for store the models
    if not os.path.exists(model_folder): os.mkdir(model_folder)

    # train with early stopping on validation set
    best_f1_test, best_f1_test_val = -numpy.inf, -numpy.inf
    s['clr'] = s['lr']  # learning rate

    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_y, train_feat], s['seed'])
        s['ce'] = e
        tic = time.time()

        for i in xrange(num_sentences):
            context_words = contextwin(train_lex[i], s['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(context_words, s['bs']))
            features = minibatch(train_feat[i], s['bs'])

            labels = train_y[i]

            for word_batch, feature_batch, label_last_word in zip(
                    words, features, labels):
                rnn.train(word_batch, feature_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./num_sentences),\
                    'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()
示例#8
0
def prepare_data():
    """Prepare the data"""
    conf = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': True,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 300,  # dimension of word embedding
        'nepochs': 50
    }
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    session_files = get_session_files(
        number_of_files=None,
        random_seed=conf['seed'])  # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []
    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]
    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(
            np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
        #         token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(
            np.fromiter(
                (words2idx.get(token, unknown) for token in token_list),
                dtype=np.int32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex,
                                                              train_valid_y,
                                                              test_size=0.2,
                                                              random_state=42)

    idx2label = dict(
        (k, v) for v, k in LABELS2IDX.iteritems())  # Reverse the dictionary
    idx2word = dict(
        (k, v) for v, k in words2idx.iteritems())  # Reverse the dictionary

    vocsize = len(idx2word)

    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))

    nsentences = len(train_lex)
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME,
                                             binary=True)  # C binary format

    print "Calculate word embeddings"
    embeddings = 0.2 * np.random.uniform(
        -1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(
            theano.config.floatX
        )  # add one for PADDING at the end @UndefinedVariable
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx]  # Keep it random
        embeddings[idx] = embedding

    del word2vec  # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                    nc=nclasses,
                    ne=vocsize,
                    de=conf['emb_dimension'],
                    cs=conf['win'],
                    embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [
                np.asarray(x).astype(np.int32)
                for x in minibatch(cwords, conf['bs'])
            ]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])


#                 rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (
                    epoch, (i + 1) * 100. / nsentences
                ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid[
                'f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test[
                'p'], res_test['r']
            conf['be'] = epoch
            subprocess.call([
                'mv', folder + '/current.test.txt', folder + '/best.test.txt'
            ])
            subprocess.call([
                'mv', folder + '/current.valid.txt', folder + '/best.valid.txt'
            ])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid[
                'f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid[
        'f1'], 'best test F1', res_test['f1'], 'with the model', folder
示例#9
0
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = {  # 'fold': 3, # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True,  # decay on the learning rate if improvement stops
        'win': 15,  # number of characters in the context window
        'bs': 5,  # number of back-propagation through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 30,  # dimension of character embedding
        'nepochs': 10
    }
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    print "Calculate output"
    session_files = get_session_files(
        number_of_files=number_of_files,
        random_seed=conf['seed'])  # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(
            np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(
            np.fromiter((char2idx[char] for char in sentence_out),
                        dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(
        idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex,
                                                              train_valid_y,
                                                              test_size=0.2,
                                                              random_state=42)
    print "Some more prep"
    idx2label = dict(
        (k, v) for v, k in labels2idx.iteritems())  # Reverse the dictionary
    idx2word = dict(
        (k, v) for v, k in char2idx.iteritems())  # Reverse the dictionary

    #     vocsize = 1 + len(set(reduce(\
    #                                  lambda x, y: list(x)+list(y),\
    #                                  train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(
        set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex
            for item in sublist))
    nclasses = 2  #len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)
    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(
        nh=conf['nhidden'],
        nc=nclasses,
        ne=vocsize,
        de=conf['emb_dimension'],
        cs=conf['win'],
    )

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [
                np.asarray(x).astype(np.int32)
                for x in minibatch(cwords, conf['bs'])
            ]
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (
                    epoch, (i + 1) * 100. / nsentences
                ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test,
                             folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid[
                'f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid[
                'p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test[
                'p'], res_test['r']
            conf['be'] = epoch
            subprocess.call([
                'mv', folder + '/current.test.txt', folder + '/best.test.txt'
            ])
            subprocess.call([
                'mv', folder + '/current.valid.txt', folder + '/best.valid.txt'
            ])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid[
                'f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf[
        'be'], 'valid F1', best_f1, 'best test F1', conf[
            'tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
示例#10
0
def run(params):
    """Train an attention Elman tagger and write per-epoch scores to JSON.

    Args:
        params: dict of experiment settings. Keys read here: ``dataset``
            (``'atis'``, ``'ner'``, ``'chunk'`` or ``'pos'``), ``fold``
            (atis only), ``WVFolder`` (``'random'``, one folder, or a
            bracketed comma-separated list of folders), ``WVModel``
            (with ``emb_dimension``), ``seed``, ``nhidden``, ``attention``,
            ``h_win_left``, ``h_win_right``, ``lvrg``, ``nepochs``,
            ``dropRate``, ``verbose`` and ``JSONOutputFile``.
            The dict is updated in place with ``measure``, ``WVFile``,
            ``WVVocabFile``, ``results`` and ``running_time``, and the
            ``WVModel`` entry gets ``vocab_size`` (and possibly a new
            ``emb_dimension``).

    Raises:
        ValueError: if ``params['dataset']`` is not a supported name.

    The conlleval scratch files are written to a folder named after this
    script; the final ``params`` dict is dumped to ``JSONOutputFile``.
    """
    start_time = time.time()

    # Output directory for the conlleval files. It must keep this value for
    # the whole run, so the embedding code below uses its own variable
    # (wv_folder) for the word-vector directories.
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)
    # 100,90,80,70,60,50,0 # combining forward and backward layers
    rhoList = numpy.array([100, 50]).astype(numpy.int32)

    # load the dataset
    eval_options = []
    params['measure'] = 'F1score'
    dataset = params['dataset']
    if dataset == 'atis':
        train_set, valid_set, test_set, dic = loadData.atisfold(params['fold'])
    elif dataset == 'ner':
        train_set, valid_set, test_set, dic = loadData.ner()
    elif dataset == 'chunk':
        train_set, valid_set, test_set, dic = loadData.chunk()
    elif dataset == 'pos':
        train_set, valid_set, test_set, dic = loadData.pos()
        eval_options = ['-r']
        params['measure'] = 'Accuracy'
    else:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError('unsupported dataset: %r' % (dataset,))

    idx2label = dict((k, v) for v, k in dic['labels2idx'].items())
    idx2word = dict((k, v) for v, k in dic['words2idx'].items())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex, test_ne, test_y = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # Pre-trained word vectors: None means the model initialises its own.
    wv = None
    if params['WVFolder'] != 'random':
        if '[' in params['WVFolder'] and ']' in params['WVFolder']:
            # Several embedding folders given as "[a,b,...]": their vectors
            # are concatenated column-wise, one slice per folder.
            # NOTE(review): eval() executes whatever params['WVFolder']
            # contains - make sure this value never comes from an untrusted
            # source, or replace it with explicit string parsing.
            folderSet = set(
                eval(params['WVFolder'].replace('[', '[\'').replace(
                    ']', '\']').replace(',', '\',\'')))
            print(folderSet)
            dim = params['WVModel']['emb_dimension']
            wv = numpy.zeros((vocsize + 1, dim * len(folderSet)))

            # Sorted so that the column layout does not depend on set
            # iteration order (which may vary between runs).
            for modelIndex, wv_folder in enumerate(sorted(folderSet)):
                params['WVFile'] = (wv_folder + '/' + 'words' + str(dim) +
                                    '.npy')
                params['WVVocabFile'] = (wv_folder + '/' + 'words' +
                                         str(dim) + '.vocab')
                # load word vector
                wvnp = np.load(params['WVFile'])

                # load vocab
                with open(params['WVVocabFile']) as f:
                    vocab = [line.strip() for line in f if len(line) > 0]
                wi = {a: i for i, a in enumerate(vocab)}

                # A single random vector shared by every missing word.
                random_v = math.sqrt(6.0 / dim) * numpy.random.uniform(
                    -1.0, 1.0, dim)
                lo, hi = dim * modelIndex, dim * (modelIndex + 1)
                miss = 0  # the number of missing words in pre-trained word embeddings
                for i in range(0, vocsize):
                    word = idx2word[i]
                    if word in wi:
                        wv[i][lo:hi] = wvnp[wi[word]]
                    else:
                        wv[i][lo:hi] = random_v
                        miss += 1
                print("missing words rate : ", miss, '/', vocsize)
                params['WVModel']['vocab_size'] = len(vocab)

            params['WVModel']['emb_dimension'] *= len(folderSet)
        else:
            wv_folder = params['WVFolder']
            params['WVFile'] = wv_folder + '/' + 'words' + str(
                params['WVModel']['emb_dimension']) + '.npy'
            params['WVVocabFile'] = wv_folder + '/' + 'words' + str(
                params['WVModel']['emb_dimension']) + '.vocab'

            # load word vector
            wvnp = np.load(params['WVFile'])
            # The loaded matrix decides the actual embedding dimension.
            params['WVModel']['emb_dimension'] = len(wvnp[0])
            dim = params['WVModel']['emb_dimension']

            # load vocab
            with open(params['WVVocabFile']) as f:
                vocab = [line.strip() for line in f if len(line) > 0]
            wi = {a: i for i, a in enumerate(vocab)}
            wv = numpy.zeros((vocsize + 1, dim))
            # A single random vector shared by every missing word.
            random_v = math.sqrt(6.0 / dim) * numpy.random.uniform(
                -1.0, 1.0, dim)

            miss = 0  # the number of missing words in pre-trained word embeddings
            for i in range(0, vocsize):
                word = idx2word[i]
                if word in wi:
                    wv[i] = wvnp[wi[word]]
                else:
                    wv[i] = random_v
                    miss += 1
            print("missing words rate : ", miss, '/', vocsize)
            params['WVModel']['vocab_size'] = len(vocab)

    print(json.dumps(params, sort_keys=True, indent=4, separators=(',', ': ')))

    # One result slot per rho value, keyed e.g. "100%_forward".
    rhoSuffix = "%_forward"
    rho_keys = [str(rho) + rhoSuffix for rho in rhoList]
    best_valid = {key: -numpy.inf for key in rho_keys}
    best_test = {key: -numpy.inf for key in rho_keys}
    # Per-epoch histories; this is used for drawing line chart.
    validMeasureList = {key: [] for key in rho_keys}
    testMeasureList = {key: [] for key in rho_keys}

    # instanciate the model
    numpy.random.seed(params['seed'])
    random.seed(params['seed'])
    rnn = elman_attention.model(nh=params['nhidden'],
                                nc=nclasses,
                                ne=vocsize,
                                de=params['WVModel']['emb_dimension'],
                                attention=params['attention'],
                                h_win=(params['h_win_left'],
                                       params['h_win_right']),
                                lvrg=params['lvrg'],
                                wv=wv)

    def _predict(lex):
        """Classify every sentence; yields one label list per rho value."""
        return [[[idx2label[v] for v in w]
                 for w in rnn.classify(
                     numpy.asarray(contextwin(x)).astype('int32'),
                     params['dropRate'], 0, rhoList)]
                for x in lex]

    def _to_labels(ys):
        """Map sequences of label indices to sequences of label names."""
        return [[idx2label[v] for v in y] for y in ys]

    def _to_words(lex):
        """Map sequences of word indices to sequences of words."""
        return [[idx2word[v] for v in w] for w in lex]

    # train
    for e in range(params['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], params['seed'])

        tic = time.time()
        for i in range(nsentences):
            cwords = contextwin(train_lex[i])
            labels = train_y[i]

            nl, aaL = rnn.train(cwords, labels, params['dropRate'], 1)

            if params['verbose']:
                elapsed = time.time() - tic
                sys.stdout.write(
                    ('\r[learning] epoch %i >> %2.2f%%' %
                     (e, (i + 1) * 100. / nsentences) +
                     ('  average speed in %.2f (min) <<' %
                      (elapsed / 60 / (i + 1) * nsentences)) +
                     (' completed in %.2f (sec) <<' % elapsed)))
                sys.stdout.flush()

        print('start test', time.time() / 60)

        print('start pred train', time.time() / 60)
        predictions_train = _predict(train_lex)
        predictions_test = _predict(test_lex)
        predictions_valid = _predict(valid_lex)

        # Built once per epoch (the training set is reshuffled every epoch)
        # as real lists, so they can be reused for every rho value.
        groundtruth_train = _to_labels(train_y)
        words_train = _to_words(train_lex)
        groundtruth_test = _to_labels(test_y)
        words_test = _to_words(test_lex)
        groundtruth_valid = _to_labels(valid_y)
        words_valid = _to_words(valid_lex)

        for i_rho, rho_key in enumerate(rho_keys):
            ptrain = [p[i_rho] for p in predictions_train]
            ptest = [p[i_rho] for p in predictions_test]
            pvalid = [p[i_rho] for p in predictions_valid]

            file_suffix = str(i_rho) + str(params['seed'])
            res_train = conlleval(
                ptrain, groundtruth_train, words_train,
                folder + '/current.train.txt' + file_suffix, eval_options)
            res_test = conlleval(
                ptest, groundtruth_test, words_test,
                folder + '/current.test.txt' + file_suffix, eval_options)
            res_valid = conlleval(
                pvalid, groundtruth_valid, words_valid,
                folder + '/current.valid.txt' + file_suffix, eval_options)

            print('                                     epoch', e, ' rhoList ',
                  i_rho, '  train p', res_train['p'], 'valid p',
                  res_valid['p'], '  train r', res_train['r'], 'valid r',
                  res_valid['r'], '  train ', params['measure'],
                  res_train['measure'], 'valid ', params['measure'],
                  res_valid['measure'], 'best test ', params['measure'],
                  res_test['measure'], ' ' * 20)

            validMeasureList[rho_key].append(res_valid['measure'])
            testMeasureList[rho_key].append(res_test['measure'])

            # Model selection on the validation score; the test score of
            # that same epoch is recorded alongside it.
            if res_valid['measure'] > best_valid[rho_key]:
                best_valid[rho_key] = res_valid['measure']
                best_test[rho_key] = res_test['measure']

        # this is used for drawing line chart.
        for i_rho, rho_key in enumerate(rho_keys):
            print(i_rho, params['dataset'], end=' ')
            for v in testMeasureList[rho_key]:
                print(v, end=' ')
            print('')

        for i_rho, rho_key in enumerate(rho_keys):
            print('current best results', rhoList[i_rho], ' ',
                  best_valid[rho_key], '/', best_test[rho_key])

    end_time = time.time()

    params['results'] = {
        'best_valid_' + params['measure']: best_valid,
        'best_test_' + params['measure']: best_test,
        'valid_' + params['measure'] + 'ListBasedOnEpochs': validMeasureList,
        'test_' + params['measure'] + 'ListBasedOnEpochs': testMeasureList,
    }
    params['running_time'] = {
        'start': time.strftime("%Y-%m-%d %H:%M:%S",
                               time.localtime(start_time)),
        'end': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)),
        'duration': end_time - start_time,
    }

    # json.dump writes to the file and returns None, so there is nothing
    # worth printing afterwards.
    with open(params['JSONOutputFile'], 'w') as outputFile:
        json.dump(params,
                  outputFile,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))
示例#11
0
    rnn = model(    nh = s['nhidden'],
                    nc = nclasses,
                    ne = vocsize,
                    de = s['emb_dimension'],
                    cs = s['win'] )

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words  = map(lambda x: numpy.asarray(x).astype('int32'),\
                         minibatch(cwords, s['bs']))
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()
            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./nsentences),'completed in %.2f (sec) <<\r'%(time.time()-tic),
                sys.stdout.flush()
            
        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                             rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\
                             for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
示例#12
0
def test(data_test=['data/qe/test/test.src.lc',
                'data/qe/test/test.mt.lc',
                'data/qe/test/test.align'],
          data_test_y = 'data/qe/test/test.tags',
          dictionaries=['data/qe/train/train.src.lc.json',
              'data/qe/train/train.mt.lc.json'],
          character2index=['data/qe/train/train.src.lc.dict_char.json',
              'data/qe/train/train.mt.lc.dict_char.json'],
          embeddings=['data/qe/pretrain/ep_qe.en.vector.txt',
              'data/qe/pretrain/ep_qe.de.vector.txt'],
	  label2index = 'data/qe/train/train.tags.json',
	  load_model = None
    ):
	model_dir = load_model[0]
	current_options = OrderedDict(sorted(locals().copy().items()))
	train_options = json.load(open(model_dir+'/model.json'))
	model_options = merg_dicts(current_options, train_options)

	print 'model_options:', model_options

	model_name = model_options['use_model'][0]
	if model_options['use_adadelta']:
		model_name += '_adadelta'
	if model_options['use_char']:
		model_name += '_char'
	if model_options['use_bilingual']:
		model_name += '_bilingual'
	if model_options['use_pretrain']:
		model_name += '_pretrain'

	print 'Using model:', model_name

	processed_data = preprocess_data(
		data_test=model_options['data_test'], data_test_y=model_options['data_test_y'][0], 
		dictionaries=model_options['dictionaries'],
		embeddings = model_options['embeddings'],
		character2index = model_options['character2index'],
		label2index = model_options['label2index'][0],
		use_bilingual=model_options['use_bilingual'], 
		use_char=model_options['use_char'], 
		use_pretrain=model_options['use_pretrain'])

	test, test_y, w2idxs, char2idxs, label2idxs, embs = processed_data
	idx2label = dict((k,v) for v,k in label2idxs.iteritems())

        vocsize_s = vocsize_t = vocsize_schar = vocsize_tchar = 0
        emb_s, emb_t, test_s, test_schar, test_t, test_tchar, = ([] for i in range(6))

        if (model_options['use_bilingual'] or len(test) == 4) and model_options['use_char']:
                emb_s, emb_t = embs
                test_s, test_t, test_schar, test_tchar = test
                vocsize_s = len(w2idxs[0])
                vocsize_t = len(w2idxs[1])
                vocsize_schar = len(char2idxs[0])
                vocsize_tchar = len(char2idxs[1])

        elif model_options['use_char']:
                emb_t = embs[0]
                test_t, test_tchar = test
                vocsize_t = len(w2idxs[0])
                vocsize_tchar = len(char2idxs[0])

	elif model_options['use_bilingual'] or len(test) == 2:
                emb_s, emb_t = embs
                test_s, test_t = test
                vocsize_s = len(w2idxs[0])
                vocsize_t = len(w2idxs[1])
        else :
                emb_t = embs[0]
                test_t = test[0]
                vocsize_t = len(w2idxs[0])

    	numpy.random.seed(model_options['seed'])
    	# instanciate the model

	params = load(model_dir)

    	rnn = select_model[model_name](
                    de = model_options['dim_word'],
                    cs = model_options['win'],
                    de_char = model_options['dim_char'],
                    max_char = model_options['max_char'],
		    params = params)

	predictions_test, groundtruth_test, predictions_valid,groundtruth_valid = ([] for i in range(4))

   	if model_options['use_bilingual'] and model_options['use_char']:
                        predictions_test = [ map(lambda x: idx2label[x],
                                rnn.classify(numpy.asarray(contextwin(x,
                                model_options['win'])).astype('int32'),
                                numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
                                numpy.asarray(add_padding(__x,
                                model_options['max_char'])).astype('int32')))
                                for x, _x, __x in zip(test_s, test_t, test_tchar) ]

	elif model_options['use_bilingual']:
		#evaluation // back into the real world : idx -> words
            	predictions_test = [ map(lambda x: idx2label[x],
                	rnn.classify(numpy.asarray(contextwin(x_src, 
			model_options['win'])).astype('int32'),
			numpy.asarray(contextwin(x_tgt,model_options['win'])).astype('int32')))
			for x_src, x_tgt in zip(test_s, test_t) ]

	elif model_options['use_char']:
                        predictions_test = [ map(lambda x: idx2label[x],
                                rnn.classify(numpy.asarray(contextwin(x,
                                model_options['win'])).astype('int32'),
                                numpy.asarray(add_padding(_x,
                                model_options['max_char'])).astype('int32')))
                                for x, _x, in zip(test_t, test_tchar) ]
	else:
                #evaluation // back into the real world : idx -> words
                predictions_test = [ map(lambda x: idx2label[x],
			rnn.classify(numpy.asarray(contextwin(x,
			model_options['win'])).astype('int32'))) for x in test_t ]

	if data_test_y:
                groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                #words_test = [ map(lambda x: idx2word[x], w) for w in test_t]

        #evaluation // compute the accuracy using conlleval.pl
	res_test = []
	output_file = model_dir + '/test_output.txt'
	if model_options['use_quest']:
		if data_test_y:
		   print '\nWriting the output into:', output_file
		   res_test=wmt_eval(predictions_test, groundtruth_test, output_file)
		else:
		   print '\nWriting the output into:', output_file
		   _write_text(predictions_test, output_file)
	if model_options['use_tag']:
		if data_test_y:
		   print '\nWriting the output into:', output_file
                   res_test=icon_eval(predictions_test, groundtruth_test, output_file)
		else:
		   print '\nWriting the output into:', output_file
		   _write_text(predictions_test, output_file)

	if data_test_y:
        	print 'test F1' , res_test , ' '*20
示例#13
0
def play_with_splitting_sentences():
    """Play with splitting sentences"""
    conf = { # 'fold': 3, # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': False,
        'decay': True, # decay on the learning rate if improvement stops
        'win': 15, # number of characters in the context window
        'bs': 5, # number of back-propagation through time steps
        'nhidden': 100, # number of hidden units
        'seed': 345,
        'emb_dimension': 30, # dimension of character embedding
        'nepochs': 10}
    number_of_files = 50000
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    print "Calculate output"
    session_files = get_session_files(number_of_files=number_of_files, random_seed=conf['seed']) # Limit the scope To speed things up...
    labels2idx = {"O": 0, "X": 1}
    sentences = []
    idxes = []
    labels_idxes = []
    labels = []
    char2idx = get_char_to_idx(session_files)
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentence_out, label = create_test(sentence, probability=0.2)
        sentences.append(sentence_out)
        labels.append(label)
        labels_idxes.append(np.fromiter((labels2idx[l] for l in label), dtype=np.uint32))
        idxes.append(np.fromiter((char2idx[char] for char in sentence_out), dtype=np.uint32))

    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)
    print "Some more prep"
    idx2label = dict((k, v) for v, k in labels2idx.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in char2idx.iteritems()) # Reverse the dictionary

#     vocsize = 1 + len(set(reduce(\
#                                  lambda x, y: list(x)+list(y),\
#                                  train_lex+valid_lex+test_lex)))
    vocsize = 1 + len(set(item for lex in (train_lex, valid_lex, test_lex) for sublist in lex for item in sublist))
    nclasses = 2  #len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))
    nsentences = len(train_lex)
    print "Some file os calls"
    folder = os.path.basename(__file__).split('.')[0] + "_3"
    if not os.path.exists(folder):
        os.mkdir(folder)
    print "Create a Neural Network"
    rnn = regular_elman(nh=conf['nhidden'],
                        nc=nclasses,
                        ne=vocsize,
                        de=conf['emb_dimension'],
                        cs=conf['win'],)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    start_time = time.time()
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
                rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', conf['be'], 'valid F1', best_f1, 'best test F1', conf['tf1'], 'with the model', folder
    print "total time = {} seconds".format(time.time() - start_time)
示例#14
0
def prepare_data():
    """Prepare the data"""
    conf = {'fold': 3, # 5 folds 0,1,2,3,4
            'lr': 0.0627142536696559,
            'verbose': True,
            'decay': True, # decay on the learning rate if improvement stops
            'win': 7, # number of words in the context window
            'bs': 9, # number of back-propagation through time steps
            'nhidden': 100, # number of hidden units
            'seed': 345,
            'emb_dimension': 300, # dimension of word embedding
            'nepochs': 50}
    np.random.seed(conf['seed'])
    random.seed(conf['seed'])
    session_files = get_session_files(number_of_files=None, random_seed=conf['seed']) # Limit the scope To speed things up...
    sentences = []
    idxes = []
    labels = []
    labels_idxes = []
    print "Calculate words2idx"
    words2idx = get_words2idx(session_files)
    unknown = words2idx["<UNK>"]
    print "Calculate output"
    for session_file in session_files:
        session = json.loads(open(session_file, "rb").read())
        sentence = session_to_text0(session)
        if not sentence.strip():
            continue
        sentences.append(sentence)
        token_list = tokenize(sentence.lower())
        dtp_search_res = dtp_search(sentence, None)
        iobes = to_iob(token_list, dtp_search_res)
        labels.append(iobes)
        labels_idxes.append(np.fromiter((LABELS2IDX[iob] for iob in iobes), dtype=np.int32))
#         token_list = [re.sub(r"\d", "DIGIT", token) for token in token_list]
        idxes.append(np.fromiter((words2idx.get(token, unknown) for token in token_list), dtype=np.int32))



    print "Prepare train, validation and test sets"
    train_valid_lex, test_lex, train_valid_y, test_y = train_test_split(idxes, labels_idxes, test_size=0.15, random_state=42)
    train_lex, valid_lex, train_y, valid_y = train_test_split(train_valid_lex, train_valid_y, test_size=0.2, random_state=42)

    idx2label = dict((k, v) for v, k in LABELS2IDX.iteritems()) # Reverse the dictionary
    idx2word = dict((k, v) for v, k in words2idx.iteritems()) # Reverse the dictionary

    vocsize = len(idx2word)

    nclasses = len({label for labels in labels_idxes for label in labels})
    # nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y)))

    nsentences = len(train_lex)
    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder):
        os.mkdir(folder)

    print "Loading Word2Vec"
    word2vec = Word2Vec.load_word2vec_format(WORD2VEC_FILENAME, binary=True) # C binary format

    print "Calculate word embeddings"
    embeddings = 0.2 * np.random.uniform(-1.0, 1.0, (vocsize + 1, conf['emb_dimension'])).astype(theano.config.floatX) # add one for PADDING at the end @UndefinedVariable
    for idx, word in idx2word.iteritems():
        try:
            embedding = word2vec[word]
        except KeyError:
            try:
                embedding = word2vec[word.capitalize()]
            except KeyError:
                embedding = embeddings[idx] # Keep it random
        embeddings[idx] = embedding

    del word2vec # It is huge

    print "Create a Neural Network"
    rnn = elman2vec(nh=conf['nhidden'],
                nc=nclasses,
                ne=vocsize,
                de=conf['emb_dimension'],
                cs=conf['win'],
                embeddings=embeddings)

    # train with early stopping on validation set
    best_f1 = -np.inf
    conf['clr'] = conf['lr']
    print "Start training"
    for epoch in xrange(conf['nepochs']):
        # shuffle
        shuffle([train_lex, train_y], conf['seed'])
        conf['ce'] = epoch
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], conf['win'])
            words = [np.asarray(x).astype(np.int32) for x in minibatch(cwords, conf['bs'])]
            labels = train_y[i]
            for word_batch , label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, conf['clr'])
#                 rnn.normalize()
            if conf['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (epoch, (i + 1) * 100. / nsentences), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [ map(lambda x: idx2label[x], \
                         rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                         for x in test_lex ]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [ map(lambda x: idx2label[x], \
                             rnn.classify(np.asarray(contextwin(x, conf['win'])).astype('int32')))\
                             for x in valid_lex ]
        groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            print 'NEW BEST: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' ' * 20
            conf['vf1'], conf['vp'], conf['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            conf['tf1'], conf['tp'], conf['tr'] = res_test['f1'], res_test['p'], res_test['r']
            conf['be'] = epoch
            subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print '        : epoch', epoch, 'valid F1', res_valid['f1'], '     test F1', res_test['f1'], ' ' * 20

        # learning rate decay if no improvement in 10 epochs
        if conf['decay'] and abs(conf['be'] - conf['ce']) >= 10:
            conf['clr'] *= 0.5
        if conf['clr'] < 1e-5:
            break

    print 'BEST RESULT: epoch', epoch, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], 'with the model', folder