def mpqa(filename):
    """Load a pickled dataset and split it 80/10/10 into train/valid/test.

    The pickle must contain a pair ``(data, dicts)`` where ``data`` unpacks
    into ``(sentences, labels)``.  Both sequences are passed to ``shuffle``
    with the fixed seed 69 before splitting.

    Args:
        filename: path of the pickle file to read.

    Returns:
        ``(train_set, valid_set, test_set, dicts)`` where every ``*_set`` is a
        ``(sentences, labels)`` pair and ``dicts`` is returned exactly as it
        was stored in the pickle.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager closes the file even if unpickling raises.
    with open(filename, 'rb') as f:
        data, dicts = cPickle.load(f)
    sentences, labels = data

    # presumably applies the same permutation to both lists in place --
    # confirm against the shuffle helper
    shuffle([sentences, labels], 69)

    n = len(sentences)
    training_size = int(floor(n * 0.8))
    valid_size = int(floor(n * 0.1))
    test_size = int(floor(n * 0.1))

    # Slice ends are exclusive, so the boundaries are plain running sums.
    # (Subtracting 1 from them silently dropped the last sample of every
    # split and could leave the validation/test sets empty.)
    valid_end = training_size + valid_size
    test_end = valid_end + test_size

    train_set = (sentences[:training_size], labels[:training_size])
    valid_set = (sentences[training_size:valid_end],
                 labels[training_size:valid_end])
    # Because of the floor() rounding a few trailing samples may stay unused.
    test_set = (sentences[valid_end:test_end], labels[valid_end:test_end])

    return (train_set, valid_set, test_set, dicts)
# instanciate the model numpy.random.seed(s['seed']) random.seed(s['seed']) rnn = model(nh=s['nhidden'], nc=nclasses, ne=vocsize, de=s['emb_dimension'], cs=s['win']) # train with early stopping on validation set best_f1 = -numpy.inf s['clr'] = s['lr'] for e in xrange(s['nepochs']): # shuffle shuffle([train_lex, train_ne, train_y], s['seed']) s['ce'] = e tic = time.time() for i in xrange(nsentences): cwords = contextwin(train_lex[i], s['win']) words = map(lambda x: numpy.asarray(x).astype('int32'), \ minibatch(cwords, s['bs'])) labels = train_y[i] for word_batch, label_last_word in zip(words, labels): rnn.train(word_batch, label_last_word, s['clr']) rnn.normalize() if s['verbose']: print '[learning] epoch %i >> %2.2f%%' % ( e, (i + 1) * 100. / nsentences
# instanciate the model numpy.random.seed(s['seed']) random.seed(s['seed']) rnn = model( nh = s['nhidden'], nc = nclasses, ne = vocsize, de = s['emb_dimension'], cs = s['win'], cue = args.c) # train with early stopping on validation set best_f1 = -numpy.inf s['clr'] = s['lr'] for e in xrange(s['nepochs']): # shuffle shuffle([train_lex,train_y,train_cue], s['seed']) print '[learning] epoch %d' % e s['ce'] = e tic = time.time() for i in xrange(nsentences): # take the context win of both # merge the results cwords = contextwin(train_lex[i], s['win']) words = map(lambda x: numpy.asarray(x).astype('int32'),\ minibatch(cwords, s['bs'])) if args.c: ccues = contextwin(train_cue[i],s['win']) cues_bs = map(lambda x: numpy.asarray(x).astype('int32'),\ minibatch(ccues, s['bs'])) labels = train_y[i] if not args.c:
def main(): parser = argparse.ArgumentParser() parser.add_argument('-v', '--verbose', action='count', default=1, help='Adjust level of verbosity.') parser.add_argument('-nh', '--num-hidden', dest='num_hidden', type=int, default=20, help='Set dimension of hidden units.') parser.add_argument('-d', '--depth', type=int, default=3, help='Set number of stacked layers') parser.add_argument('-l', '--lambda', dest='lam', type=float, default=0.00000001, help='Set lambda value used for L2-regularization') parser.add_argument('--seed', type=int, default=345, help='Set PRNG seed') parser.add_argument('--emb-file', dest='emb_file', type=str, help='Location of file containing word embeddings') parser.add_argument('-e', '--num-epochs', dest='num_epochs', type=int, default=200, help='Set number of epochs to train') parser.add_argument('-a', '--alpha', dest='alpha', type=float, default=0.01, help='Set the initial learning rate') parser.add_argument('--ex', dest='examples_file', type=str, default='./mpqa2data.pkl', help='Path to file containing the pkled complete dataset') parser.add_argument('--adagrad', dest='adagrad', type=bool, default=True, help='Enable adagrad') args = parser.parse_args() s = {'lr': args.alpha, 'verbose': args.verbose, 'decay': True, # decay on the learning rate if improvement stops 'nhidden': args.num_hidden, # number of hidden units 'depth': args.depth, # number of layers in space 'seed': args.seed, 'nepochs': args.num_epochs} folder = os.path.basename(__file__).split('.')[0] if not os.path.exists(folder): os.mkdir(folder) # load the dataset train_set, valid_set, test_set, dic = mpqa_load.mpqa(args.examples_file) idx2label = dic['idx2label'] idx2word = dic['idx2word'] print idx2label train_lex, train_y = train_set valid_lex, valid_y = valid_set test_lex, test_y = test_set vocsize = max(set([item for sublist in train_lex+valid_lex+test_lex for item in sublist])) + 1 nclasses = len(set([item for sublist in train_y+valid_y+test_y for item in sublist])) 
nsentences = len(train_lex) # instantiate the model numpy.random.seed(s['seed']) random.seed(s['seed']) rnn = model( nh = s['nhidden'], nc = nclasses, ne = vocsize, depth = s['depth'], embeddings = load_embeddings(args.emb_file, idx2word, vocsize), lam=args.lam, adagrad=args.adagrad ) # train with early stopping on validation set best_f1 = -numpy.inf s['clr'] = s['lr'] s['be'] = 0 for e in xrange(s['nepochs']): # shuffle shuffle([train_lex, train_y], s['seed']) s['ce'] = e tic = time.time() for i in xrange(nsentences): words = numpy.asarray(train_lex[i]).astype('int32').reshape((len(train_lex[i]), 1)) labels = numpy.asarray(train_y[i]).astype('int32').reshape(len(train_y[i])) if len(words) == 0: continue cost, _s = rnn.train(words, labels, s['clr']) if args.verbose > 0 and i % nsentences/4 == 0: for idx in xrange(len(words)): print [round(item, 3) for item in _s[idx,0,:].tolist()], labels[idx], numpy.argmax(_s[idx,0,:]), idx2word[words[idx]] print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./nsentences), '\tCurrent cost: %.3f' % cost sys.stdout.flush() #pdb.set_trace() # evaluation // back into the real world : idx -> words predictions_test = [ map(lambda x: idx2label[x], \ rnn.classify(numpy.asarray(x).astype('int32').reshape((len(x), 1)))) \ for x in test_lex if len(x) > 0 ] groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y if len(y) > 0 ] words_test = [ map(lambda x: idx2word[x], w) for w in test_lex if len(w) > 0 ] predictions_valid = [ map(lambda x: idx2label[x], \ rnn.classify(numpy.asarray(x).astype('int32').reshape((len(x), 1)))) \ for x in valid_lex if len(x) > 0 ] groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y if len(y) > 0 ] words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex if len(w) > 0 ] #pdb.set_trace() predictions_test_labels = [item for sublist in predictions_test for item in sublist] zero_predictions = [item for item in predictions_test_labels if item == '0'] nonzero_predictions = [item 
for item in predictions_test_labels if item != '0'] print "Num zero_predictions: %d" % len(zero_predictions) print "Num nonzero_predictions: %d" % len(nonzero_predictions) errors = 0 for idx in xrange(len(groundtruth_test)): cur_gt_labels = groundtruth_test[idx] cur_pred_labels = predictions_test[idx] for subidx in xrange(len(cur_gt_labels)): if cur_gt_labels[subidx] != cur_pred_labels[subidx]: errors += 1 accuracy = (1.0 - errors / float(len(predictions_test_labels))) # evaluation // compute the accuracy using conlleval.pl # error_rate = accuracy([item for sublist in predictions_test for item in sublist], # [item for sublist in groundtruth_test for item in sublist]) if accuracy > best_f1: best_f1 = accuracy s['be'] = e #print "Nonzero prediction count: %d" % len([item for item ]) print "accuracy after %d epochs: %g" % (e, accuracy)
# language model tensors # lm_init_op = sess.graph.get_operation_by_name('Train/Model/states_init') # char_inputs_in_tensor = sess.graph.get_tensor_by_name('Train/Model/char_inputs_in') # inputs_in_tensor = sess.graph.get_tensor_by_name('Train/Model/inputs_in') # targets_in_tensor = sess.graph.get_tensor_by_name('Train/Model/targets_in') # targets_weights_in_tensor = sess.graph.get_tensor_by_name('Train/Model/target_weights_in') init_op = tf.global_variables_initializer() sess.run(init_op) # train with early stopping on training set best_f1 = -np.inf # early_stop_count = 0 for e in range(FLAGS.nepochs): # shuffle shuffle([train_x, train_ne, train_y, train_lm], FLAGS.seed) record['current epoch'] = e tic = time.time() # training for i in range(FLAGS.nsentences): X = np.asarray([train_x[i]]) Y = to_categorical( np.asarray(train_y[i])[:, np.newaxis], nclasses)[np.newaxis, :, :] if FLAGS.with_lm: # don't use the lm_embedding for the start token <S> [_] = sess.run([train_op], feed_dict={ inputs: X,
# instanciate the model numpy.random.seed(s['seed']) random.seed(s['seed']) rnn = model( nh = s['nhidden'], nc = nclasses, ne = vocsize, de = s['emb_dimension'], cs = s['win'] ) # train with early stopping on validation set best_f1 = -numpy.inf s['clr'] = s['lr'] for e in xrange(s['nepochs']): # shuffle shuffle([train_lex, train_ne, train_y], s['seed']) s['ce'] = e tic = time.time() for i in xrange(nsentences): cwords = contextwin(train_lex[i], s['win']) words = map(lambda x: numpy.asarray(x).astype('int32'),\ minibatch(cwords, s['bs'])) labels = train_y[i] for word_batch , label_last_word in zip(words, labels): print "word_batch: ", word_batch print "label_last_word: ", label_last_word rnn.train(word_batch, label_last_word, s['clr']) rnn.normalize() if s['verbose']: print '[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./nsentences),'completed in %.2f (sec) <<\r'%(time.time()-tic), sys.stdout.flush()
# Both numpy's and the stdlib's RNG are seeded with the configured seed
# before the model is constructed.
numpy.random.seed(s.seed)
random.seed(s.seed)
rnn = model(    nh = s.hidden_size,
                nc = nclasses,
                ne = vocsize,
                de = s.emb_size,
                cs = s.win,
                memory_size = s.memory_size,
                n_memory_slots = s.n_memory_slots )

# train with early stopping on validation set
best_f1 = -numpy.inf
s.clr = s.lr  # current learning rate starts at the configured one
for e in xrange(s.n_epochs):
    # shuffle
    # presumably permutes the three lists in step -- confirm in the helper
    shuffle([train_lex, train_ne, train_y], s.seed)
    s.ce = e  # current epoch
    tic = time.time()
    for i in xrange(nsentences):
        # context windows for sentence i, grouped into minibatches of at
        # most s.bs -- TODO confirm against contextwin/minibatch
        cwords = contextwin(train_lex[i], s.win)
        words  = map(lambda x: numpy.asarray(x).astype('int32'),\
                     minibatch(cwords, s.bs))
        labels = train_y[i]
        for word_batch , label_last_word in zip(words, labels):
            rnn.train(word_batch, label_last_word, s.clr)
            rnn.normalize()
        if s.verbose:
            # trailing comma + '\r': the progress line overwrites itself
            print '[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./nsentences),'completed in %.2f (sec) <<\r'%(time.time()-tic),
            sys.stdout.flush()

    # evaluation // back into the real world : idx -> words
nclasses = len(set(reduce(lambda x, y: list(x) + list(y), train_y + test_y + valid_y))) nsentences = len(train_lex) # instanciate the model numpy.random.seed(s["seed"]) random.seed(s["seed"]) rnn = model(nh=s["nhidden"], nc=nclasses, ne=vocsize, de=s["emb_dimension"], cs=s["win"]) # train with early stopping on validation set best_f1 = -numpy.inf s["clr"] = s["lr"] for e in xrange(s["nepochs"]): # shuffle shuffle([train_lex, train_ne, train_y], s["seed"]) s["ce"] = e tic = time.time() for i in xrange(nsentences): cwords = contextwin(train_lex[i], s["win"]) words = map(lambda x: numpy.asarray(x).astype("int32"), minibatch(cwords, s["bs"])) labels = train_y[i] for word_batch, label_last_word in zip(words, labels): rnn.train(word_batch, label_last_word, s["clr"]) rnn.normalize() if s["verbose"]: print "[learning] epoch %i >> %2.2f%%" % ( e, (i + 1) * 100.0 / nsentences,
# Both numpy's and the stdlib's RNG are seeded with the configured seed
# before the model is constructed.
numpy.random.seed(s.seed)
random.seed(s.seed)
rnn = model(nh=s.hidden_size,
            nc=nclasses,
            ne=vocsize,
            de=s.emb_size,
            cs=s.win,
            memory_size=s.memory_size,
            n_memory_slots=s.n_memory_slots)

# train with early stopping on validation set
best_f1 = -numpy.inf
s.clr = s.lr  # current learning rate starts at the configured one
for e in xrange(s.n_epochs):
    # shuffle
    # presumably permutes the three lists in step -- confirm in the helper
    shuffle([train_lex, train_ne, train_y], s.seed)
    s.ce = e  # current epoch
    tic = time.time()
    for i in xrange(nsentences):
        # context windows for sentence i, grouped into minibatches of at
        # most s.bs -- TODO confirm against contextwin/minibatch
        cwords = contextwin(train_lex[i], s.win)
        words = map(lambda x: numpy.asarray(x).astype('int32'),\
                     minibatch(cwords, s.bs))
        labels = train_y[i]
        for word_batch, label_last_word in zip(words, labels):
            rnn.train(word_batch, label_last_word, s.clr)
            rnn.normalize()
        if s.verbose:
            # trailing comma + '\r': the progress line overwrites itself
            print '[learning] epoch %i >> %2.2f%%' % (
                e, (i + 1) * 100. / nsentences
            ), 'completed in %.2f (sec) <<\r' % (time.time() - tic),
            sys.stdout.flush()
def callRNN(): s = {'reload':False, 'model':'the path of the model', 'isemb':True, 'lr':0.0627142536696559, 'verbose':1, 'decay':True, # decay on the learning rate if improvement stops 'win':5, # number of words in the context window 'bs':9, # number of backprop through time steps 'nhidden':100, # number of hidden units 'seed':345, 'emb_dimension':100, # dimension of word embedding 'nepochs':20} #获取当前文件名 folder = os.path.basename(__file__).split('.')[0] if not os.path.exists(folder): os.mkdir(folder) # load the dataset 训练集、开发集、测试集、词典 train_set, valid_set, test_set, dic = pp.preProcess(segfile, labelfile, embfile) #train_set, valid_set, test_set, dic = load.atisfold(s['fold']) # 字典中存在labels字典和词典 词-》编号 编号-》词 idx2label = dict((k,v) for v,k in dic['labels2idx'].iteritems()) idx2word = dict((k,v) for v,k in dic['words2idx'].iteritems()) #对同一个文件进行处理,处理完成后进行切分,现在没做的 #数据集中包括编号、每行个数、编号, 训练集4:1切分为训练和开发 train_lex, train_ne, train_y = train_set valid_lex, valid_ne, valid_y = valid_set test_lex, test_ne, test_y = test_set #vocsize = len(set(reduce(\ # lambda x, y: list(x)+list(y),\ # train_lex+valid_lex+test_lex))) #分类个数,一共多少种类,这个可以直接赋值的 nclasses = len(idx2word) #句子数,训练语料的训练句子,用于对句子进行遍历,把握进度 nsentences = len(train_lex) # instanciate the model numpy.random.seed(s['seed']) random.seed(s['seed']) #初始化模型参数 print 'init model' rnn = model( nh = s['nhidden'], nc = nclasses, ne = 1, isemb = s['isemb'], de = s['emb_dimension'], cs = s['win'] ) if s['reload']: print 'load model' rnn.load(s[model]) # train with early stopping on validation set best_f1 = -numpy.inf s['clr'] = s['lr'] print 'start train' for e in xrange(s['nepochs']): # shuffle shuffle([train_lex, train_ne, train_y], s['seed']) s['ce'] = e tic = time.time() for i in xrange(nsentences): cwords = contextwin(train_lex[i], s['win']) words = map(lambda x: numpy.asarray(x).astype('int32'),\ minibatch(cwords, s['bs'])) labels = train_y[i] for word_batch , label_last_word in zip(words, labels): rnn.train(word_batch, 
label_last_word, s['clr']) #开始训练 rnn.normalize() if s['verbose']: print '[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./nsentences),'completed in %.2f (sec) <<\r'%(time.time()-tic), sys.stdout.flush() # evaluation // back into the real world : idx -> words #通过开发集进行调参,主要调节学习率 #对测试集进行测试,并将结果转化为字母标签 predictions_test = [ map(lambda x: idx2label[x], \ rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\ for x in test_lex ] #将test_y的值使用字母标签进行代替 groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ] #进test_lex使用词本身代替 words_test = [ map(lambda x: idx2word[x], w) for w in test_lex] #对开发集结果进行测试,并将结果转化为字母标签 predictions_valid = [ map(lambda x: idx2label[x], \ rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\ for x in valid_lex ] #将开发集标签使用字母标签替换 groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ] #将valid_lex使用词替换 words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex] # evaluation // compute the accuracy using conlleval.pl # 调用conlleval.pl,对test和valid数据集进行结果分析,并将结果进行保存 res_test = conlleval(predictions_test, groundtruth_test, words_test, folder +'/test'+str(e)+'.txt') res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/valid'+str(e)+'.txt') #保存模型 if not os.path.exists('result'): os.mkdir('result') rnn.save('result/'+folder+str(e)) #对测试集的F值进行比较 print '第',e,'次迭代的F值为:',res_test['f1'],'开发集F值为',res_valid['f1'] if res_valid['f1'] > best_f1: best_f1 = res_valid['f1'] if s['verbose']: print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20 s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid['p'], res_valid['r'] s['tf1'], s['tp'], s['tr'] = res_test['f1'], res_test['p'], res_test['r'] s['be'] = e #开启子线程执行mv命令,其实就是改名 subprocess.call(['mv', folder + '/test'+str(e)+'.txt', folder + '/best.test'+str(e)+'.txt']) subprocess.call(['mv', folder + '/valid'+str(e)+'.txt', folder + '/best.valid'+str(e)+'.txt']) else: print '' # learning rate 
decay if no improvement in 10 epochs if s['decay'] and abs(s['be']-s['ce']) >= 5: s['clr'] *= 0.5 print '学习率修改为=',s['clr'] if s['clr'] < 1e-5: break print 'BEST RESULT: epoch', e, 'valid F1', s['vf1'], 'best test F1', s['tf1'], 'with the model', folder
def callRNN(): s = { 'reload': False, 'model': 'the path of the model', 'isemb': True, 'lr': 0.0627142536696559, 'verbose': 1, 'decay': True, # decay on the learning rate if improvement stops 'win': 5, # number of words in the context window 'bs': 9, # number of backprop through time steps 'nhidden': 100, # number of hidden units 'seed': 345, 'emb_dimension': 100, # dimension of word embedding 'nepochs': 20 } #获取当前文件名 folder = os.path.basename(__file__).split('.')[0] if not os.path.exists(folder): os.mkdir(folder) # load the dataset 训练集、开发集、测试集、词典 train_set, valid_set, test_set, dic = pp.preProcess( segfile, labelfile, embfile) #train_set, valid_set, test_set, dic = load.atisfold(s['fold']) # 字典中存在labels字典和词典 词-》编号 编号-》词 idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems()) idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems()) #对同一个文件进行处理,处理完成后进行切分,现在没做的 #数据集中包括编号、每行个数、编号, 训练集4:1切分为训练和开发 train_lex, train_ne, train_y = train_set valid_lex, valid_ne, valid_y = valid_set test_lex, test_ne, test_y = test_set #vocsize = len(set(reduce(\ # lambda x, y: list(x)+list(y),\ # train_lex+valid_lex+test_lex))) #分类个数,一共多少种类,这个可以直接赋值的 nclasses = len(idx2word) #句子数,训练语料的训练句子,用于对句子进行遍历,把握进度 nsentences = len(train_lex) # instanciate the model numpy.random.seed(s['seed']) random.seed(s['seed']) #初始化模型参数 print 'init model' rnn = model(nh=s['nhidden'], nc=nclasses, ne=1, isemb=s['isemb'], de=s['emb_dimension'], cs=s['win']) if s['reload']: print 'load model' rnn.load(s[model]) # train with early stopping on validation set best_f1 = -numpy.inf s['clr'] = s['lr'] print 'start train' for e in xrange(s['nepochs']): # shuffle shuffle([train_lex, train_ne, train_y], s['seed']) s['ce'] = e tic = time.time() for i in xrange(nsentences): cwords = contextwin(train_lex[i], s['win']) words = map(lambda x: numpy.asarray(x).astype('int32'),\ minibatch(cwords, s['bs'])) labels = train_y[i] for word_batch, label_last_word in zip(words, labels): rnn.train(word_batch, 
label_last_word, s['clr']) #开始训练 rnn.normalize() if s['verbose']: print '[learning] epoch %i >> %2.2f%%' % ( e, (i + 1) * 100. / nsentences ), 'completed in %.2f (sec) <<\r' % (time.time() - tic), sys.stdout.flush() # evaluation // back into the real world : idx -> words #通过开发集进行调参,主要调节学习率 #对测试集进行测试,并将结果转化为字母标签 predictions_test = [ map(lambda x: idx2label[x], \ rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\ for x in test_lex ] #将test_y的值使用字母标签进行代替 groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y] #进test_lex使用词本身代替 words_test = [map(lambda x: idx2word[x], w) for w in test_lex] #对开发集结果进行测试,并将结果转化为字母标签 predictions_valid = [ map(lambda x: idx2label[x], \ rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\ for x in valid_lex ] #将开发集标签使用字母标签替换 groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y] #将valid_lex使用词替换 words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex] # evaluation // compute the accuracy using conlleval.pl # 调用conlleval.pl,对test和valid数据集进行结果分析,并将结果进行保存 res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/test' + str(e) + '.txt') res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/valid' + str(e) + '.txt') #保存模型 if not os.path.exists('result'): os.mkdir('result') rnn.save('result/' + folder + str(e)) #对测试集的F值进行比较 print '第', e, '次迭代的F值为:', res_test['f1'], '开发集F值为', res_valid['f1'] if res_valid['f1'] > best_f1: best_f1 = res_valid['f1'] if s['verbose']: print 'NEW BEST: epoch', e, 'valid F1', res_valid[ 'f1'], 'best test F1', res_test['f1'], ' ' * 20 s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid[ 'p'], res_valid['r'] s['tf1'], s['tp'], s['tr'] = res_test['f1'], res_test[ 'p'], res_test['r'] s['be'] = e #开启子线程执行mv命令,其实就是改名 subprocess.call([ 'mv', folder + '/test' + str(e) + '.txt', folder + '/best.test' + str(e) + '.txt' ]) subprocess.call([ 'mv', folder + '/valid' + str(e) + '.txt', folder + '/best.valid' 
+ str(e) + '.txt' ]) else: print '' # learning rate decay if no improvement in 10 epochs if s['decay'] and abs(s['be'] - s['ce']) >= 5: s['clr'] *= 0.5 print '学习率修改为=', s['clr'] if s['clr'] < 1e-5: break print 'BEST RESULT: epoch', e, 'valid F1', s['vf1'], 'best test F1', s[ 'tf1'], 'with the model', folder
def main(): parser = argparse.ArgumentParser() parser.add_argument('-v', '--verbose', action='count', default=1, help='Adjust level of verbosity.') parser.add_argument('-nh', '--num-hidden', dest='num_hidden', type=int, default=100, help='Set dimension of hidden units.') parser.add_argument('-w', '--window', type=int, default=5, help='Set size of context window (in words).') parser.add_argument('-d', '--depth', type=int, default=3, help='Set number of stacked layers') parser.add_argument('--seed', type=int, default=345, help='Set PRNG seed') parser.add_argument('--emb-dim', dest='emb_dimension', type=int, default=100, help='Set size of word embeddings') parser.add_argument('-e', '--num-epochs', dest='num_epochs', type=int, default=50, help='Set number of epochs to train') args = parser.parse_args() s = {'fold':3, # 5 folds 0,1,2,3,4 'lr':0.0627142536696559, 'verbose': args.verbose, 'decay': False, # decay on the learning rate if improvement stops 'win': args.window, # number of words in the context window 'bs':9, # number of backprop through time steps 'nhidden': args.num_hidden, # number of hidden units 'depth': args.depth, # number of layers in space 'seed': args.seed, 'emb_dimension': args.emb_dimension, # dimension of word embedding 'nepochs': args.num_epochs} folder = os.path.basename(__file__).split('.')[0] if not os.path.exists(folder): os.mkdir(folder) # load the dataset train_set, valid_set, test_set, dic = load.atisfold(s['fold']) idx2label = dict((k,v) for v,k in dic['labels2idx'].iteritems()) idx2word = dict((k,v) for v,k in dic['words2idx'].iteritems()) train_lex, train_ne, train_y = train_set valid_lex, valid_ne, valid_y = valid_set test_lex, test_ne, test_y = test_set vocsize = len(set(reduce(\ lambda x, y: list(x)+list(y),\ train_lex+valid_lex+test_lex))) nclasses = len(set(reduce(\ lambda x, y: list(x)+list(y),\ train_y+test_y+valid_y))) nsentences = len(train_lex) # instantiate the model numpy.random.seed(s['seed']) random.seed(s['seed']) rnn = 
model( nh = s['nhidden'], nc = nclasses, ne = vocsize, de = s['emb_dimension'], cs = s['win'], depth = s['depth'] ) # train with early stopping on validation set best_f1 = -numpy.inf s['clr'] = s['lr'] for e in xrange(s['nepochs']): # shuffle shuffle([train_lex, train_ne, train_y], s['seed']) s['ce'] = e tic = time.time() for i in xrange(nsentences): cwords = contextwin(train_lex[i], s['win']) words = map(lambda x: numpy.asarray(x).astype('int32'),\ minibatch(cwords, s['bs'])) labels = train_y[i] for word_batch, label_last_word in zip(words, labels): print word_batch #print label_last_word #pdb.set_trace() rnn.train(word_batch, label_last_word, s['clr']) rnn.normalize() if s['verbose']: print '[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./nsentences),'completed in %.2f (sec) <<\r'%(time.time()-tic), sys.stdout.flush() # evaluation // back into the real world : idx -> words predictions_test = [ map(lambda x: idx2label[x], \ rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\ for x in test_lex ] groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ] words_test = [ map(lambda x: idx2word[x], w) for w in test_lex] predictions_valid = [ map(lambda x: idx2label[x], \ rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))\ for x in valid_lex ] groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ] words_valid = [ map(lambda x: idx2word[x], w) for w in valid_lex] # evaluation // compute the accuracy using conlleval.pl res_test = conlleval(predictions_test, groundtruth_test, words_test, folder + '/current.test.txt') res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt') if res_valid['f1'] > best_f1: rnn.save(folder) best_f1 = res_valid['f1'] if s['verbose']: print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20 s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid['p'], res_valid['r'] s['tf1'], s['tp'], s['tr'] = 
res_test['f1'], res_test['p'], res_test['r'] s['be'] = e subprocess.call(['mv', folder + '/current.test.txt', folder + '/best.test.txt']) subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt']) else: print '' # learning rate decay if no improvement in 10 epochs if s['decay'] and abs(s['be']-s['ce']) >= 10: s['clr'] *= 0.5 if s['clr'] < 1e-5: break print 'BEST RESULT: epoch', e, 'valid F1', s['vf1'], 'best test F1', s['tf1'], 'with the model', folder
def run(s) : print s folder = os.path.basename(__file__).split('.')[0] if not os.path.exists(folder): os.mkdir(folder) #print folder # load the dataset eval_options = [] if s['dataset'] == 'atis': train_set, valid_set, test_set, dic = load.atisfold(s['fold']) if s['dataset'] == 'ner': train_set, valid_set, test_set, dic = load.ner() if s['dataset'] == 'chunk': train_set, valid_set, test_set, dic = load.chunk() if s['dataset'] == 'pos': train_set, valid_set, test_set, dic = load.pos() eval_options = ['-r'] idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems()) idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems()) train_lex, train_ne, train_y = train_set valid_lex, valid_ne, valid_y = valid_set test_lex, test_ne, test_y = test_set vocsize = len(dic['words2idx']) nclasses = len(dic['labels2idx']) nsentences = len(train_lex) wv = None if 'WVFolderName' in s: # load word vector # wv = numpy.zeros((vocsize+1, s['emb_dimension'])) # input = open(s['wv_folder'] + str(s['emb_dimension']), 'r') # for line in input: # tokens = line.split(' ') # wv[int(tokens[0])] = [float(tokens[j]) for j in xrange(1, len(tokens) - 1)] # load word vector wvnp = np.load("./../WV/" + s['WVFolderName'] + "/" + s['model']+".words" + str(s['emb_dimension']) + ".npy") # load vocab with open("./../WV/" + s['WVFolderName'] + "/" + s['model']+".words" + str(s['emb_dimension']) + ".vocab") as f: vocab = [line.strip() for line in f if len(line) > 0] wi = dict([(a, i) for i, a in enumerate(vocab)]) iw = vocab wv = numpy.zeros((vocsize + 1, s['emb_dimension'])) random_v = math.sqrt(6.0 / numpy.sum(s['emb_dimension'])) * numpy.random.uniform(-1.0, 1.0, (s['emb_dimension'])) miss = 0 for i in range(0, vocsize): word = idx2word[i] if word in wi: wv[i] = wvnp[wi[word]] # print wvnp[wi[word]] else: wv[i] = random_v miss += 1 print miss, '/', vocsize best_valid = numpy.zeros(len(s['rho'])) - numpy.inf best_test = numpy.zeros(len(s['rho'])) - numpy.inf test_f1List = [[],[],[],[],[],[] ] # 
print 111 # print test_f1List # instanciate the model numpy.random.seed(s['seed']) random.seed(s['seed']) rnn = elman_attention.model(nh=s['nhidden'], nc=nclasses, ne=vocsize, de=s['emb_dimension'], attention=s['attention'], h_win=s['h_win'], lvrg=s['lvrg'], wv=wv) # train with early stopping on validation set for e in xrange(s['nepochs']): # shuffle shuffle([train_lex, train_ne, train_y], s['seed']) s['ce'] = e tic = time.time() for i in xrange(nsentences): cwords = contextwin(train_lex[i]) labels = train_y[i] # for j in xrange(len(words)): # if j >= 2 : # rnn.train(words[j], [labels[j-2], labels[j-1], labels[j]], s['clr']) nl, aaL = rnn.train(cwords, labels, s['dropRate'], 1) #if i % 1 == 0: # print aaL # rnn.normalize() if s['verbose']: sys.stdout.write(('\r[learning] epoch %i >> %2.2f%%' % ( e, (i + 1) * 100. / nsentences) + (' average speed in %.2f (min) <<' % ( (time.time() - tic) / 60 / (i + 1) * nsentences)) + (' completed in %.2f (sec) <<' % ( (time.time() - tic))))) sys.stdout.flush() print 'start test', time.time() / 60 # print avgSentenceLength / (nsentences) # evaluation // back into the real world : idx -> words # evaluation // back into the real world : idx -> words print 'start pred train', time.time() / 60 predictions_train = [[map(lambda varible: idx2label[varible], w)\ for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'), s['dropRate'], 0, s['rho'])] for x in train_lex] groundtruth_train = [map(lambda x: idx2label[x], y) for y in train_y] words_train = [map(lambda x: idx2word[x], w) for w in train_lex] #print 'start pred test', time.time() / 60 predictions_test = [[map(lambda varible: idx2label[varible], w)\ for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'), s['dropRate'], 0, s['rho'])] for x in test_lex] groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y] words_test = [map(lambda x: idx2word[x], w) for w in test_lex] #print 'start pred valid', time.time() / 60 predictions_valid = [[map(lambda 
varible: idx2label[varible], w)\ for w in rnn.classify(numpy.asarray(contextwin(x)).astype('int32'), s['dropRate'], 0, s['rho'])] for x in valid_lex] groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y] words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex] #print 'end pred, start eval', time.time() / 60 # evaluation // compute the accuracy using conlleval.pl for i_rho in xrange(len(s['rho'])) : ptrain = [p[i_rho] for p in predictions_train] ptest = [p[i_rho] for p in predictions_test] pvalid = [p[i_rho] for p in predictions_valid] res_train = conlleval(ptrain, groundtruth_train, words_train, folder + '/current.train.txt' + str(s['seed']), eval_options) res_test = conlleval(ptest, groundtruth_test, words_test, folder + '/current.test.txt' + str(s['seed']), eval_options) res_valid = conlleval(pvalid, groundtruth_valid, words_valid, folder + '/current.valid.txt' + str(s['seed']), eval_options) print ' epoch', e, ' rho ', i_rho, ' train p', res_train[ 'p'], 'valid p', res_valid[ 'p'],' train r', res_train[ 'r'], 'valid r', res_valid[ 'r'],' train F1', res_train[ 'f1'], 'valid F1', res_valid[ 'f1'], 'best test F1', res_test['f1'], ' ' * 20 test_f1List[i_rho].append(res_test['f1']) if res_valid['f1'] > best_valid[i_rho]: best_valid[i_rho] = res_valid['f1'] best_test[i_rho] = res_test['f1'] for i_rho in xrange(len(s['rho'])) : print i_rho, s['dataset'], if s['model'] == 'glove': print s['WVFolderName'].replace('skip', 'glove'), else: print s['WVFolderName'], for iff1 in test_f1List[i_rho]: print iff1, print '' for i_rho in xrange(len(s['rho'])) : print s['rho'][i_rho], ' ', best_valid[i_rho] , '/' , best_test[i_rho] #print 'end eval', time.time() / 60 print 'BEST RESULT: epoch', e, 'valid F1', s['vf1'], 'best test F1', s['tf1'], 'with the model', folder