def get_data(self):
    '''
    Read the training and validation data.
    :return:
    '''
    train_path = os.path.join('.', args.train_data, 'dev.txt')
    train_data = read_corpus(train_path)
    dev_path = os.path.join('.', args.dev_data, 'dev.txt')
    dev_data = read_corpus(dev_path)
    return train_data, dev_data
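# The snippets in this section all depend on a read_corpus helper whose
# implementation is not shown. Below is a minimal sketch of the single-file
# variant, assuming the common CoNLL-style format these BiLSTM-CRF examples
# use (one "char<TAB>tag" pair per line, blank lines between sentences).
# The name read_corpus_sketch and the format are assumptions; the real
# implementation in each project may differ.
def read_corpus_sketch(corpus_path):
    """Return a list of (char_list, tag_list) pairs, one pair per sentence."""
    data, chars, tags = [], [], []
    with open(corpus_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                char, tag = line.split()
                chars.append(char)
                tags.append(tag)
            elif chars:  # blank line marks the end of a sentence
                data.append((chars, tags))
                chars, tags = [], []
    if chars:  # handle a missing trailing blank line
        data.append((chars, tags))
    return data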
def train():
    train_path = os.path.join('./data_path/', 'train_data')
    test_path = os.path.join('./data_path/', 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
    model.build_graph()
    print("train data: {}".format(len(train_data)))
    # we could use test_data as the dev_data to observe overfitting
    model.train(train_data, test_data)
def test(opt):
    log = helpers.Logger(opt.verbose)
    timer = helpers.Timer()
    # Load data =========================================================
    log.info('Reading corpora')
    # Read vocabs
    widss, ids2ws, widst, ids2wt = helpers.get_dictionaries(opt, test=True)
    # Read test
    tests_data = np.asarray(data.read_corpus(opt.test_src, widss), dtype=list)
    # Test output
    if not opt.test_out:
        opt.test_out = helpers.exp_filename(opt, 'test.out')
    # Get target language model
    lang_model = helpers.get_language_model(opt, None, widst, test=True)
    # Create model ======================================================
    log.info('Creating model')
    s2s = helpers.build_model(opt, widss, widst, lang_model, test=True)
    # Print configuration ===============================================
    if opt.verbose:
        options.print_config(opt, src_dict_size=len(widss), trg_dict_size=len(widst))
    # Start testing =====================================================
    log.info('Start running on test set, buckle up!')
    timer.restart()
    translations = []
    s2s.set_test_mode()
    for x in tests_data:
        y = s2s.translate(x, beam_size=opt.beam_size)
        translations.append(' '.join([ids2wt[w] for w in y[1:-1]]))
    np.savetxt(opt.test_out, translations, fmt='%s')
    translations = np.asarray(translations, dtype=str)
    BLEU, details = evaluation.bleu_score(opt.test_dst, opt.test_out)
    log.info('Finished running on test set, %.2f s elapsed.' % timer.tick())
    log.info(details)
def data_concat(base_path):
    dev_data = []
    test_data = []
    dev_data_ori = read_corpus(base_path + '/dev_data')
    test_data_ori = read_corpus(base_path + '/test_data')
    dev_data_predicted = read_corpus_3(base_path + '/label_dev')
    test_data_predicted = read_corpus_3(base_path + '/label_test')
    for sent_, sent_predicted_ in zip(dev_data_ori, dev_data_predicted):
        dev_data.append([sent_predicted_[0], sent_predicted_[1],
                         sent_predicted_[2], sent_[2], sent_[3]])
    for sent_, sent_predicted_ in zip(test_data_ori, test_data_predicted):
        test_data.append([sent_predicted_[0], sent_predicted_[1],
                          sent_predicted_[2], sent_[2], sent_[3]])
    return dev_data, test_data
def get_data(self):
    '''
    Read the test set.
    :return:
    '''
    test_path = os.path.join('.', args.test_data, 'test.txt')
    test_data = read_corpus(test_path)
    return test_data
def data_format(base_path):
    train_data = read_corpus(train_file_path)
    dev_data_bieo = read_predicted_corpus(dev_file_path)
    test_data_bieo = read_predicted_corpus(test_file_path)
    write_data(train_data, base_path + '/train.txt')
    write_data(dev_data_bieo, base_path + '/dev.txt')
    write_data(test_data_bieo, base_path + '/test.txt')
def test(opt):
    # Load data =========================================================
    if opt.verbose:
        print('Reading corpora')
    # Read vocabs
    if opt.dic_src:
        widss, ids2ws = data.load_dic(opt.dic_src)
    else:
        widss, ids2ws = data.read_dic(opt.train_src, max_size=opt.src_vocab_size)
        data.save_dic(opt.exp_name + '_src_dic.txt', widss)
    if opt.dic_dst:
        widst, ids2wt = data.load_dic(opt.dic_dst)
    else:
        widst, ids2wt = data.read_dic(opt.train_dst, max_size=opt.trg_vocab_size)
        data.save_dic(opt.exp_name + '_trg_dic.txt', widst)
    # Read test
    tests_data = data.read_corpus(opt.test_src, widss)
    # Create model ======================================================
    if opt.verbose:
        print('Creating model')
        sys.stdout.flush()
    s2s = seq2seq.Seq2SeqModel(opt.emb_dim, opt.hidden_dim, opt.att_dim, widss,
                               widst, model_file=opt.model, bidir=opt.bidir,
                               word_emb=opt.word_emb, dropout=opt.dropout_rate,
                               max_len=opt.max_len)
    if s2s.model_file is not None:
        s2s.load()
    s2s.model_file = opt.exp_name + '_model'
    # Print configuration ===============================================
    if opt.verbose:
        options.print_config(opt, src_dict_size=len(widss), trg_dict_size=len(widst))
        sys.stdout.flush()
    # Start testing =====================================================
    print('Start running on test set, buckle up!')
    sys.stdout.flush()
    test_start = time.time()
    with open(opt.test_out, 'w+') as of:
        for x in tests_data:
            y = s2s.translate(x, beam_size=opt.beam_size)
            translation = ' '.join([ids2wt[w] for w in y[1:-1]])
            of.write(translation + '\n')
    _, details = evaluation.bleu_score(opt.test_dst, opt.test_out)
    test_elapsed = time.time() - test_start
    print('Finished running on test set,', test_elapsed, 's elapsed.')
    print(details)
    sys.stdout.flush()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--epoch', '-e', default=400, type=int,
                        help='number of epochs to learn')
    parser.add_argument('--unit', '-u', default=30, type=int,
                        help='number of units')
    parser.add_argument('--batchsize', '-b', type=int, default=25,
                        help='learning minibatch size')
    parser.add_argument('--label', '-l', type=int, default=5,
                        help='number of labels')
    parser.add_argument('--epocheval', '-p', type=int, default=5,
                        help='number of epochs per evaluation')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    vocab = {}
    max_size = None
    train_trees = data.read_corpus('trees/train.txt', max_size)
    test_trees = data.read_corpus('trees/test.txt', max_size)

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = numpy

    train_data = [linearize_tree(vocab, t, xp) for t in train_trees]
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_data = [linearize_tree(vocab, t, xp) for t in test_trees]
    test_iter = chainer.iterators.SerialIterator(
        test_data, args.batchsize, repeat=False, shuffle=False)

    model = ThinStackRecursiveNet(len(vocab), args.unit, args.label)
    if args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.AdaGrad(0.1)
    optimizer.setup(model)

    updater = training.StandardUpdater(
        train_iter, optimizer, device=None, converter=convert)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'))
    trainer.extend(
        extensions.Evaluator(test_iter, model, converter=convert, device=None),
        trigger=(args.epocheval, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.MicroAverage(
        'main/correct', 'main/total', 'main/accuracy'))
    trainer.extend(extensions.MicroAverage(
        'validation/main/correct', 'validation/main/total',
        'validation/main/accuracy'))
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
    trainer.run()
acc_node = 100.0 * result['correct_node'] / result['total_node']
acc_root = 100.0 * result['correct_root'] / result['total_root']
print(' Node accuracy: {0:.2f} % ({1:,d}/{2:,d})'.format(
    acc_node, result['correct_node'], result['total_node']))
print(' Root accuracy: {0:.2f} % ({1:,d}/{2:,d})'.format(
    acc_root, result['correct_root'], result['total_root']))

vocab = {}
if args.test:
    max_size = 10
else:
    max_size = None
train_trees = [convert_tree(vocab, tree)
               for tree in data.read_corpus('trees/train.txt', max_size)]
test_trees = [convert_tree(vocab, tree)
              for tree in data.read_corpus('trees/test.txt', max_size)]
develop_trees = [convert_tree(vocab, tree)
                 for tree in data.read_corpus('trees/dev.txt', max_size)]

model = RecursiveNet(len(vocab), n_units)
if args.gpu >= 0:
    model.to_gpu()

# Setup optimizer
optimizer = optimizers.AdaGrad(lr=0.1)
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(0.0001))
## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

if args.mode != 'demo':
    train_path_text = os.path.join('.', args.train_data, 'train_data_text')
    train_path_tag = os.path.join('.', args.train_data, 'train_data_tag')
    test_path_text = os.path.join('.', args.test_data, 'test_data_text')
    test_path_tag = os.path.join('.', args.test_data, 'test_data_tag')
    train_data = read_corpus(train_path_text, train_path_tag)
    test_data = read_corpus(test_path_text, test_path_tag)
    test_size = len(test_data)
else:
    demo_tag = os.path.join('.', args.demo_data, 'demo_tag')

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
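# Some projects in this section (this one, and the ones reading
# source_data.txt / source_label.txt below) keep text and tags in two
# parallel files instead of one. A minimal sketch of that two-file variant,
# assuming one space-separated sentence per line in each file;
# read_corpus_two_files_sketch is a hypothetical name, and the real
# two-argument read_corpus used above is not shown.
def read_corpus_two_files_sketch(text_path, tag_path):
    """Return a list of (token_list, tag_list) pairs from parallel files."""
    data = []
    with open(text_path, encoding='utf-8') as ft, \
            open(tag_path, encoding='utf-8') as fl:
        for text_line, tag_line in zip(ft, fl):
            tokens = text_line.strip().split()
            tags = tag_line.strip().split()
            assert len(tokens) == len(tags), 'text/tag length mismatch'
            data.append((tokens, tags))
    return data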
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    # train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    # train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
result_path = os.path.join(output_path, "results")
if not os.path.exists(result_path):
    os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
get_logger(log_path).info(str(args))
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    f_source_data = os.path.join('.', args.train_data, 'source_data.txt')
    f_source_label = os.path.join('.', args.train_data, 'source_label.txt')
    f_test_data = os.path.join('.', args.train_data, 'test_data.txt')
    f_test_label = os.path.join('.', args.train_data, 'test_label.txt')
    train_data = read_corpus(f_source_data, f_source_label)
    test_data = read_corpus(f_test_data, f_test_label)
    test_size = len(test_data)

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
def main():
    # if args.mode == 'train'
    ap = []
    with open('../../../china_medical_char_data_cleaned/vocab.tags.txt', 'r') as fin:
        for line in fin:
            ap.append(line.strip())
    length = len(ap)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.625)
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf.ConfigProto(
        # device_count={"CPU": 48},
        # inter_op_parallelism_threads=10,
        allow_soft_placement=True,
        # intra_op_parallelism_threads=20,
        gpu_options=gpu_options))

    generator = Generator_BiLSTM_CRF(0.5, 1, batch_size, params, filter_sizes,
                                     num_filters, 0.75, length)
    generator.build_graph()
    tvars = tf.trainable_variables()
    (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
        tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    # finally, initialize the variables
    # sess.run(tf.global_variables_initializer())
    sess.run(generator.init_op)
    sess.run(generator.table_op)
    sess.run(generator.init_op_1)
    saver = tf.train.Saver(tf.global_variables())
    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        print("  name = %s, shape = %s%s" % (var.name, var.shape, init_string))

    train_path = os.path.join('.', args.train_data, 'train_data1')
    train_unlabel_path = os.path.join('.', args.train_data_unlabel, 'train_unlabel')
    train_unlabel_path_1 = os.path.join('.', args.train_data_unlabel, 'train_unlabel1')
    test_path = os.path.join('.', args.test_data, 'test_data1')
    sub_test_path = os.path.join('.', args.sub_test_data, 'sub_test_data')
    train_data = read_corpus(train_path)
    train_data_unlabel = read_corpus_unlabel(train_unlabel_path)
    train_data_unlabel_1 = read_corpus_unlabel(train_unlabel_path_1)
    test_data = read_corpus(test_path)
    test_size = len(test_data)
    sub_test_data = read_corpus(sub_test_path)

    batches_labeled = list(batch_yield(train_data, batch_size, shuffle=True))
    # print(len(batches_labeled))
    num_batches = (len(train_data) + batch_size - 1) // batch_size
    batches_unlabeled = list(batch_yield_for_unla_da(train_data_unlabel,
                                                     batch_size, shuffle=True))
    # print(len(batches_unlabeled))
    batches_labeled_for_dis = list(batch_yield_for_discri(train_data,
                                                          batch_size, shuffle=True))
    batches_unlabeled_for_dis = list(batch_yield_for_discri_unlabeled(
        train_data_unlabel, batch_size, shuffle=True))
    dev = batch_yield(test_data, batch_size, shuffle=True)
    # num_batches = min(len(batches_labeled), len(batches_unlabeled))
    num_batches_unlabel = (len(train_data_unlabel) + batch_size - 1) // batch_size
    num_batches_1 = min(len(batches_labeled_for_dis), len(batches_unlabeled_for_dis))
    index = 0

    if args.mode == 'train':
        for epoch_total in range(30):
            print('epoch_total and index are {} and {}'.format(epoch_total + 1, index))
            medi_lis = get_metrics(sess, generator, dev, test_size, batch_size, flag=0)
            for ele in medi_lis:
                print('entity recognition:', ele)
            print('the whole epoch training accuracy finished!!!!!!!!!!!!')
            for i, (words, labels) in enumerate(batches_labeled):
                run_one_epoch(sess, words, labels, tags=[], dev=test_data,
                              epoch=epoch_total, gen=generator,
                              num_batches=num_batches, batch=i, label=0,
                              it=0, iteration=0, saver=saver)
            dev1 = batch_yield(test_data, batch_size, shuffle=True)
            medi_lis_from_cross_entropy_training = get_metrics(
                sess, generator, dev1, test_size, batch_size, flag=0)
            for ele in medi_lis_from_cross_entropy_training:
                print('first pass:', ele)
            print('the accuracy after cross entropy training finished!!!!!!!!!!!!!!!!!!')
            # if epoch_total > 3:
            #     # batches_labeled_for_dis = batches_labeled_for_dis[0:len(batches_labeled_for_dis) - 5]
            #     batch_dis_for_label = len(batches_labeled_for_dis)
            #     batch_dis_for_unlabel = len(batches_unlabeled_for_dis)
            #     for (ele, ele2) in zip(enumerate(batches_labeled_for_dis),
            #                            enumerate(batches_unlabeled_for_dis)):
            #         index += 1
            #         # if index > 70:
            #         #     break
            #         run_one_epoch(sess, ele[1][0], ele[1][1], ele[1][2], dev=test_data,
            #                       epoch=epoch_total, gen=generator,
            #                       num_batches=batch_dis_for_label, batch=index,
            #                       label=2, it=0, iteration=0, saver=saver)
            #         run_one_epoch(sess, ele2[1][0], ele2[1][1], ele2[1][2], dev=test_data,
            #                       epoch=epoch_total, gen=generator,
            #                       num_batches=batch_dis_for_unlabel, batch=index,
            #                       label=3, it=0, iteration=0, saver=saver)
            #     index = 0
            #     print('the whole dis phase I finished')
            #     # index += 1
            #     for it in range(5):
            #         for i, (words, labels, tags) in enumerate(batches_unlabeled):
            #             # print(i)
            #             run_one_epoch(sess, words, labels, tags=tags, dev=test_data,
            #                           epoch=epoch_total, gen=generator,
            #                           num_batches=num_batches_unlabel, batch=i,
            #                           label=1, it=it, iteration=i, saver=saver)
            #     dev2 = batch_yield(test_data, batch_size, shuffle=True)
            #     medi_lis_from_adversarial_training = get_metrics(
            #         sess, generator, dev2, test_size, batch_size, flag=0)
            #     for ele in medi_lis_from_adversarial_training:
            #         print('second pass:', ele)
            #     print('the accuracy after adversarial training of the generator finished!!!!!!!!!!!!!!')
            # print('epoch {} finished!'.format(epoch_total))

    if args.mode == 'test':
        sub_dev = batch_yield_for_discri_unlabeled(sub_test_data, batch_size,
                                                   shuffle=True)
        # print(list(sub_dev))
        ckpt_file = tf.train.latest_checkpoint(model_path)
        generator = Generator_BiLSTM_CRF(0.5, batch_size, params, filter_sizes,
                                         num_filters, 0.75, length,
                                         is_training=False)
        generator.build_graph()
        generator.test(sess, sub_dev, test_size, 20)
## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)  # (3905, 300)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'train':
    # train_path = os.path.join('.', args.train_data, 'train_data')
    # test_path = os.path.join('.', args.test_data, 'test_data')
    train_path = os.path.join('.', args.train_data, 'processed_downloadfile3')
    test_path = os.path.join('.', args.test_data, 'processed_downloadfile4')
    train_data = read_corpus(train_path)  # list of (sentence, label) pairs
    test_data = read_corpus(test_path)
    test_size = len(test_data)  # number of sentences in the test set

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
print(timestamp)
# output_path: ./data_path_save/<timestamp>
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path):
    os.makedirs(output_path)
# summary_path: ./data_path_save/<timestamp>/summaries
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
# -*- coding: utf-8 -*-
"""
Created on Wed Nov  6 11:09:20 2019

@author: 37112
"""
import time
import data
import utils
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

start = time.time()
# load the data
data_path = "data/train-v2.0.json"
qlist, alist = data.read_corpus(data_path)
# show the most frequent words
# word_dic = Counter([q for l in utils.cut(qlist) for q in l])
# utils.show_most_word_freq(word_dic, 50)
# load the preprocessed questions
qlist_new = utils.load_qlist('data/q_prepro.txt')
# question = input("What would you like to ask?")
question = "When did Beyonce start become popular"
# use the tf-idf method
idx = utils.find_top_similar_ask1(question, qlist_new)
alist = np.array(alist)
print(alist[idx])
end = time.time()
print(end - start, "s")
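# A minimal sketch of what a TF-IDF retrieval helper in the spirit of
# utils.find_top_similar_ask1 could look like (hypothetical; the real utils
# implementation is not shown). It fits a TfidfVectorizer on the preprocessed
# questions and returns the index of the most similar one by cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_top_similar_tfidf(question, qlist):
    vectorizer = TfidfVectorizer()
    q_matrix = vectorizer.fit_transform(qlist)   # (n_questions, vocab_size)
    q_vec = vectorizer.transform([question])     # (1, vocab_size)
    sims = cosine_similarity(q_vec, q_matrix)[0]
    return int(sims.argmax())                    # index of the best match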
def eval_user_adaptation(opt):
    log = utils.Logger(opt.verbose)
    timer = utils.Timer()
    # Read vocabs
    lexicon = helpers.get_lexicon(opt)
    # Read data
    filepairs = load_user_filepairs(opt.usr_file_list)
    # Get target language model
    lang_model = None
    # Load model
    s2s = helpers.build_model(opt, lexicon, lang_model, test=True)
    if opt.update_mode == 'mixture_weights' and opt.user_recognizer != 'fact_voc':
        log.info('Updating only the mixture weights doesn\'t make sense here')
        exit()
    s2s.lm = lexicon.trg_unigrams
    # s2s.freeze_parameters()
    # Trainer
    trainer = helpers.get_trainer(opt, s2s)
    # Print config
    if opt.verbose:
        options.print_config(opt, src_dict_size=len(lexicon.w2ids),
                             trg_dict_size=len(lexicon.w2idt))
    # This will store translations and gold sentences
    base_translations = []
    adapt_translations = []
    gold = []
    # Run training
    for usr_id, (src_file, trg_file) in enumerate(filepairs):
        log.info('Evaluating on files %s' % os.path.basename(src_file).split()[0])
        # Load file pair
        src_data = data.read_corpus(src_file, lexicon.w2ids, raw=True)
        trg_data = data.read_corpus(trg_file, lexicon.w2idt, raw=True)
        # Split train/test
        train_src, test_src, train_trg, test_trg, order = split_user_data(
            src_data, trg_data, n_test=opt.n_test)
        # Convert train data to indices
        train_src = lexicon.sents_to_ids(train_src)
        train_trg = lexicon.sents_to_ids(train_trg, trg=True)
        # Save test data
        for s in test_trg:
            gold.append(' '.join(s))
        # Reset model
        s2s.load()
        s2s.reset_usr_vec()
        # Translate with baseline model
        base_translations.extend(evaluate_model(s2s, test_src, opt.beam_size))
        # Start loop
        n_train = opt.max_n_train
        adapt_translations.extend(
            adapt_user(s2s, trainer, train_src[:n_train], train_trg[:n_train],
                       test_src, opt))
    # Temp files
    temp_gold = utils.exp_temp_filename(opt, 'gold.txt')
    temp_base = utils.exp_temp_filename(opt, '%s_base.txt' % opt.update_mode)
    temp_adapt = utils.exp_temp_filename(opt, '%s_adapt.txt' % opt.update_mode)
    utils.savetxt(temp_gold, gold)
    utils.savetxt(temp_base, base_translations)
    utils.savetxt(temp_adapt, adapt_translations)
    # Evaluate base translations
    bleu, details = evaluation.bleu_score(temp_gold, temp_base)
    log.info('Base BLEU score: %.2f' % bleu)
    # Evaluate adapted translations
    bleu, details = evaluation.bleu_score(temp_gold, temp_adapt)
    log.info('Adaptation BLEU score: %.2f' % bleu)
    # Compare both with paired bootstrap resampling
    temp_bootstrap_gold = utils.exp_temp_filename(opt, 'bootstrap_gold.txt')
    temp_bootstrap_base = utils.exp_temp_filename(opt, 'bootstrap_base.txt')
    temp_bootstrap_adapt = utils.exp_temp_filename(opt, 'bootstrap_adapt.txt')
    bleus = evaluation.paired_bootstrap_resampling(
        temp_gold, temp_base, temp_adapt, opt.bootstrap_num_samples,
        opt.bootstrap_sample_size, temp_bootstrap_gold, temp_bootstrap_base,
        temp_bootstrap_adapt)
    evaluation.print_paired_stats(bleus)
    os.remove(temp_bootstrap_gold)
    os.remove(temp_bootstrap_base)
    os.remove(temp_bootstrap_adapt)
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from data import read_corpus, build_dict, TAG_MAP, NER_DataSet, condtraints
from bi_lstm_crf import BiLSTM_CRF
from trainer import train, evaluate, load_model

train_corpus_path = './datasets/train_data'
test_corpus_path = './datasets/test_data'

if __name__ == '__main__':
    # prepare data
    corpus = read_corpus(train_corpus_path)
    dct = build_dict(corpus)

    # build dataloaders: hold out the last 5000 sentences for validation
    np.random.shuffle(corpus)
    train_ds = NER_DataSet(corpus[:-5000], dct)
    val_ds = NER_DataSet(corpus[-5000:], dct)
    train_dl = DataLoader(train_ds, batch_size=32, shuffle=True,
                          drop_last=True, num_workers=0)
    val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)
label2id = ner_cfg.generate_tag_to_label()
logger = logging.getLogger(__name__)
current_dir = os.path.dirname(os.path.abspath(__file__))

## get char embeddings
word2id_pos2id = read_dictionary('word2id_pos2id_new.pkl')
word2id = word2id_pos2id['word2id']
pos2id = word2id_pos2id['pos2id']
word_embedding = np.array(np.load('word2vec.npy'), dtype=np.float32)
pos_embedding = np.array(np.load('pos2vec.npy'), dtype=np.float32)

config = Config(word2id, pos2id, label2id, batch_size=128, n_epochs=200,
                n_neurons=60)
config.word_embedding = word_embedding
config.pos_embedding = pos_embedding

## read corpus and get training data
train_data, test_data = read_corpus('train_data')
# test_data = read_corpus('test_data')
# test_size = len(test_data)

model = BiLSTM_CRF(is_training=True, config=config)
model.build_graph()
model.train(train_data=train_data, valid_data=test_data)
# model.test(test_data)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result', type=str,
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', type=str,
                        help='Resume the training from snapshot')
    parser.add_argument('--epoch', '-e', default=400, type=int,
                        help='number of epochs to learn')
    parser.add_argument('--unit', '-u', default=30, type=int,
                        help='number of units')
    parser.add_argument('--batchsize', '-b', type=int, default=25,
                        help='learning minibatch size')
    parser.add_argument('--label', '-l', type=int, default=5,
                        help='number of labels')
    parser.add_argument('--epocheval', '-p', type=int, default=5,
                        help='number of epochs per evaluation')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    n_epoch = args.epoch             # number of epochs
    n_units = args.unit              # number of units per layer
    batchsize = args.batchsize       # minibatch size
    n_label = args.label             # number of labels
    epoch_per_eval = args.epocheval  # number of epochs per evaluation

    if args.test:
        max_size = 10
    else:
        max_size = None

    vocab = {}
    train_data = [convert_tree(vocab, tree)
                  for tree in data.read_corpus('trees/train.txt', max_size)]
    train_iter = chainer.iterators.SerialIterator(train_data, batchsize)
    validation_data = [convert_tree(vocab, tree)
                       for tree in data.read_corpus('trees/dev.txt', max_size)]
    validation_iter = chainer.iterators.SerialIterator(
        validation_data, batchsize, repeat=False, shuffle=False)
    test_data = [convert_tree(vocab, tree)
                 for tree in data.read_corpus('trees/test.txt', max_size)]

    model = RecursiveNet(len(vocab), n_units, n_label)

    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Setup optimizer
    optimizer = optimizers.AdaGrad(lr=0.1)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(0.0001))

    def _convert(batch, _):
        return batch

    # Setup updater
    updater = chainer.training.StandardUpdater(
        train_iter, optimizer, device=args.gpu, converter=_convert)

    # Setup trainer and run
    trainer = chainer.training.Trainer(updater, (n_epoch, 'epoch'), args.out)
    trainer.extend(
        extensions.Evaluator(validation_iter, model, device=args.gpu,
                             converter=_convert),
        trigger=(epoch_per_eval, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.MicroAverage(
        'main/correct', 'main/total', 'main/accuracy'))
    trainer.extend(extensions.MicroAverage(
        'validation/main/correct', 'validation/main/total',
        'validation/main/accuracy'))
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'),
        trigger=(epoch_per_eval, 'epoch'))
    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    print('Test evaluation')
    evaluate(model, test_data)
def train(opt):
    # Load data =========================================================
    if opt.verbose:
        print('Reading corpora')
    # Read vocabs
    if opt.dic_src:
        widss, ids2ws = data.load_dic(opt.dic_src)
    else:
        widss, ids2ws = data.read_dic(opt.train_src, max_size=opt.src_vocab_size)
        data.save_dic(opt.exp_name + '_src_dic.txt', widss)
    if opt.dic_dst:
        widst, ids2wt = data.load_dic(opt.dic_dst)
    else:
        widst, ids2wt = data.read_dic(opt.train_dst, max_size=opt.trg_vocab_size)
        data.save_dic(opt.exp_name + '_trg_dic.txt', widst)
    # Read training
    trainings_data = data.read_corpus(opt.train_src, widss)
    trainingt_data = data.read_corpus(opt.train_dst, widst)
    # Read validation
    valids_data = data.read_corpus(opt.valid_src, widss)
    validt_data = data.read_corpus(opt.valid_dst, widst)
    # Create model ======================================================
    if opt.verbose:
        print('Creating model')
        sys.stdout.flush()
    s2s = seq2seq.Seq2SeqModel(opt.emb_dim, opt.hidden_dim, opt.att_dim, widss,
                               widst, model_file=opt.model, bidir=opt.bidir,
                               word_emb=opt.word_emb, dropout=opt.dropout_rate,
                               max_len=opt.max_len)
    if s2s.model_file is not None:
        s2s.load()
    s2s.model_file = opt.exp_name + '_model.txt'
    # Trainer ===========================================================
    if opt.trainer == 'sgd':
        trainer = dy.SimpleSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    elif opt.trainer == 'clr':
        trainer = dy.CyclicalSGDTrainer(s2s.model,
                                        e0_min=opt.learning_rate / 10,
                                        e0_max=opt.learning_rate,
                                        edecay=opt.learning_rate_decay)
    elif opt.trainer == 'momentum':
        trainer = dy.MomentumSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    elif opt.trainer == 'rmsprop':
        trainer = dy.RMSPropTrainer(s2s.model, e0=opt.learning_rate,
                                    edecay=opt.learning_rate_decay)
    elif opt.trainer == 'adam':
        trainer = dy.AdamTrainer(s2s.model, opt.learning_rate,
                                 edecay=opt.learning_rate_decay)
    else:
        print('Trainer name invalid or not provided, using SGD', file=sys.stderr)
        trainer = dy.SimpleSGDTrainer(
            s2s.model, e0=opt.learning_rate, edecay=opt.learning_rate_decay)
    if opt.verbose:
        print('Using ' + opt.trainer + ' optimizer')
    trainer.set_clip_threshold(opt.gradient_clip)
    # Print configuration ===============================================
    if opt.verbose:
        options.print_config(opt, src_dict_size=len(widss), trg_dict_size=len(widst))
        sys.stdout.flush()
    # Create batch loaders ==============================================
    if opt.verbose:
        print('Creating batch loaders')
        sys.stdout.flush()
    trainbatchloader = data.BatchLoader(trainings_data, trainingt_data, opt.batch_size)
    devbatchloader = data.BatchLoader(valids_data, validt_data, opt.dev_batch_size)
    # Start training ====================================================
    if opt.verbose:
        print('starting training')
        sys.stdout.flush()
    start = time.time()
    train_loss = 0
    processed = 0
    best_bleu = 0
    i = 0
    for epoch in range(opt.num_epochs):
        for x, y in trainbatchloader:
            processed += sum(map(len, y))
            bsize = len(y)
            # Compute loss
            loss = s2s.calculate_loss(x, y)
            # Backward pass and parameter update
            loss.backward()
            trainer.update()
            train_loss += loss.scalar_value() * bsize
            if (i + 1) % opt.check_train_error_every == 0:
                # Check average training error from time to time
                logloss = train_loss / processed
                ppl = np.exp(logloss)
                elapsed = time.time() - start
                trainer.status()
                print(" Training_loss=%f, ppl=%f, time=%f s, tokens processed=%d" %
                      (logloss, ppl, elapsed, processed))
                start = time.time()
                train_loss = 0
                processed = 0
                sys.stdout.flush()
            if (i + 1) % opt.check_valid_error_every == 0:
                # Check generalization error on the validation set from time to time
                dev_loss = 0
                dev_processed = 0
                dev_start = time.time()
                for x, y in devbatchloader:
                    dev_processed += sum(map(len, y))
                    bsize = len(y)
                    loss = s2s.calculate_loss(x, y, test=True)
                    dev_loss += loss.scalar_value() * bsize
                dev_logloss = dev_loss / dev_processed
                dev_ppl = np.exp(dev_logloss)
                dev_elapsed = time.time() - dev_start
                print("[epoch %d] Dev loss=%f, ppl=%f, time=%f s, tokens processed=%d" %
                      (epoch, dev_logloss, dev_ppl, dev_elapsed, dev_processed))
                sys.stdout.flush()
                start = time.time()
            if (i + 1) % opt.valid_bleu_every == 0:
                # Check BLEU score on the validation set from time to time
                print('Start translating validation set, buckle up!')
                sys.stdout.flush()
                bleu_start = time.time()
                with open(opt.valid_out, 'w+') as f:
                    for x in valids_data:
                        y_hat = s2s.translate(x, beam_size=opt.beam_size)
                        translation = [ids2wt[w] for w in y_hat[1:-1]]
                        print(' '.join(translation), file=f)
                bleu, details = evaluation.bleu_score(opt.valid_dst, opt.valid_out)
                bleu_elapsed = time.time() - bleu_start
                print('Finished translating validation set,', bleu_elapsed, 's elapsed.')
                print(details)
                # Early stopping: save the latest best model
                if bleu > best_bleu:
                    best_bleu = bleu
                    print('Best BLEU score up to date, saving model to', s2s.model_file)
                    s2s.save()
                sys.stdout.flush()
                start = time.time()
            i = i + 1
        trainer.update_epoch()
    return {
        "muc": muc_score,
        "b3": b3_score,
        "ceafe": ceaf_score,
        "avg": avg_score
    }

if __name__ == "__main__":
    args = parser.parse_args()
    if args.random_seed:
        torch.random.manual_seed(args.random_seed)
        np.random.seed(args.random_seed)

    documents = read_corpus(args.dataset)

    def create_model_instance(model_name, **override_kwargs):
        return BaselineController(
            MentionPairFeatures.num_features(),
            model_name=model_name,
            learning_rate=override_kwargs.get("learning_rate", args.learning_rate),
            dataset_name=override_kwargs.get("dataset", args.dataset))

    # Train model
    if args.dataset == "coref149":
        INNER_K, OUTER_K = 3, 10
        logging.info(
            f"Performing {OUTER_K}-fold (outer) and {INNER_K}-fold (inner) CV...")
# import numpy as np
# import os, argparse, time, random
from BiLSTMmodel import bilstm_model
from data import read_corpus, read_dictionary, random_embedding
from config import config

## get char embeddings
word2id = read_dictionary('vocab')
## randomly generate the embeddings
embeddings = random_embedding(word2id, config.embedding_size)
paths = {'log_path': 'logger//', 'model_path': './model2/', 'result_path': 'result//'}
# TODO note: model_path!! This one is a real pitfall!!
model = bilstm_model(embeddings, paths, word2id, config=config)
model.build_graph()

## train model on the whole training data
train_data = read_corpus('pku_training.utf8')
print("train data: {}".format(len(train_data)))
model.train(train_data=train_data)

## test model
# test_data = read_corpus('pku_test_gold.utf8')
# print("test data: {}".format(len(test_data)))
# model.test(test_data=test_data)
if args.embedding_type == 'random':
    # randomly generate the embedding matrix (3905 characters in total,
    # 300 features by default, so the shape is 3905 x 300)
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    # Chinese word vectors trained with gensim (word2vec) on the wiki corpus
    embeddings = load_embeddings(args.embedding_dim, word2id, args.embedding_type)
print("\n=========embeddings==========\n", embeddings,
      "\ndim(embeddings)=", embeddings.shape)

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'ner_train_data')
    test_path = os.path.join('.', args.test_data, 'ner_test_data')
    train_data = read_corpus(train_path)  # read the training set
    test_data = read_corpus(test_path)
    test_size = len(test_data)  # read the test set
    print('train_data=\n', train_data)
    # print("\n==========train_data================\n", train_data)
    # print("\n==========test_data================\n", test_data)

## paths setting: create the corresponding directories
paths = {}
# A timestamp marks a point in time; it is typically used to make synchronized
# updates more efficient. For example, if a file has not been modified, its
# timestamp does not change, so there is no need to write it back. Writing
# everything back regardless of modification would clearly be less efficient.
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
# output_path is a timestamp-named directory under <train_data>_save
output_path = os.path.join('.', args.train_data + "_save", timestamp)
parser.add_argument("--source_dataset", type=str, default="senticoref") parser.add_argument("--target_dataset", type=str, default="coref149") parser.add_argument("--kfold_state_cache_path", type=str, default=None) if __name__ == "__main__": logger = logging.getLogger() logger.setLevel(logging.INFO) logger.addHandler(logging.StreamHandler(sys.stdout)) args = parser.parse_args() if args.random_seed: torch.random.manual_seed(args.random_seed) np.random.seed(args.random_seed) src_docs = read_corpus(args.source_dataset) tgt_docs = read_corpus(args.target_dataset) all_tok2id, _ = extract_vocab(src_docs + tgt_docs, lowercase=True, top_n=10**9) logging.info(f"Total vocabulary size: {len(all_tok2id)} tokens") pretrained_embs = None embedding_size = args.embedding_size if args.use_pretrained_embs == "word2vec": # Note: pretrained word2vec embeddings we use are uncased logging.info("Loading pretrained Slovene word2vec embeddings") with codecs.open(args.embedding_path, "r", encoding="utf-8", errors="ignore") as f: num_tokens, embedding_size = list(map(int, f.readline().split(" "))) embs = {} for line in f:
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

## training model
if args.mode == 'train':
    model = BiLSTM_CRF(args, embeddings, dictname2id, word2id, paths, config=config)
    model.build_graph()
    train_path = os.path.join('.', args.train_data, 'train.txt')
    train_data = read_corpus(train_path, word2id, word2dictname, dictname2id)
    test_path = os.path.join('.', args.test_data, 'test.txt')
    test_data = read_corpus(test_path, word2id, word2dictname, dictname2id)
    test_size = len(test_data)
    ## train model on the whole training data
    print("train data: {}".format(len(train_data)))
    # use test_data as the dev_data to observe overfitting
    model.train(train=train_data, dev=test_data)
## testing model
elif args.mode == 'test':
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args,
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path):
    os.makedirs(result_path)
log_path = os.path.join(result_path, args.dataset_name + log_pre + "_log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

# read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('data_path', args.dataset_name, train_file)
    paths['train_path'] = train_path
    test_path = os.path.join('data_path', args.dataset_name, test_file)
    paths['test_path'] = test_path
    train_data = read_corpus(train_path)[:100]
    test_data = read_corpus(test_path)
    test_size = len(test_data)
    print("train data: {}".format(len(train_data)))
    print("test data: {}".format(test_size))

# training model
if args.mode == 'train':
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
args = parser.parse_args()

import torch
torch.manual_seed(args.seed)
args.use_cuda = True

# load data
from data_loader import DataLoader
from data import read_corpus, tag2label
import os
from eval import conlleval

sents_train, labels_train, args.word_size, _ = read_corpus(
    os.path.join('.', args.data, 'source_data.txt'),
    os.path.join('.', args.data, 'source_label.txt'))
sents_test, labels_test, _, data_origin = read_corpus(
    os.path.join('.', args.data, 'test_data.txt'),
    os.path.join('.', args.data, 'test_label.txt'),
    is_train=False)
args.label_size = len(tag2label)

train_data = DataLoader(sents_train, labels_train, cuda=args.use_cuda,
                        batch_size=args.batch_size)
test_data = DataLoader(sents_test, labels_test, cuda=args.use_cuda,
                       shuffle=False)
config = tf.ConfigProto()

## hyperparameters
embedding_dim = 128
tag2label = {"N": 0, "解剖部位": 1, "手术": 2, "药物": 3, "独立症状": 4, "症状描述": 5}

## get char embeddings
word2id = read_dictionary('./vocab.pkl')
embeddings = random_embedding(word2id, embedding_dim)
train_data = read_corpus('./c.txt')
# BiLSTM_CRF args: embeddings, tag2label, vocab, batch_size, epoch,
# hidden_dim, CRF, update_embedding, shuffle

## training model
if __name__ == '__main__':
    model = BiLSTM_CRF(embeddings, tag2label, word2id, 4, 80, 128, False, True, True)
    model.build_graph()
    test_report = open('test_report.txt', 'w', encoding='utf-8')
    print("train data: {}".format(len(train_data)))
    model.test(test_report)
    # model.train(train=train_data)  # use test_data as the dev_data to observe overfitting
args = parser.parse_args()

## get char embeddings
word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

## read corpus and get training data
if args.mode != 'demo':
    train_path = os.path.join('.', args.train_data, 'train_data')
    pre_train_path = os.path.join('.', args.train_data, 'resume_data')
    test_path = os.path.join('.', args.test_data, 'test_data')
    train_data = read_corpus(train_path)
    pre_train_data = read_pre_train_data(pre_train_path, args.seq_length)
    test_data = read_corpus(test_path)
    test_size = len(test_data)

## paths setting
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join('.', args.train_data + "_save", timestamp)
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path):
    os.makedirs(model_path)
def getTrainData(filename):
    return read_corpus(filename)