import itertools
import os
import pickle
import time

import numpy as np
import tensorflow as tf

# Project-local modules assumed by these snippets: data_loader, data_utils,
# model_utils, the Model class, the load_word2vec helper, and FLAGS.


def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tagging scheme (e.g. BIO -> BIOES)
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # Create the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    # Split the data into batches
    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    # Create any missing directories
    model_utils.make_path(FLAGS)

    # Load the config file if it exists, otherwise build and save one
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    # Set up the logger
    log_path = os.path.join('log', FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info('Start training')
        loss = []
        start = time.time()
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration {}: step {}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch,
                            steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, 'dev', dev_manager, id_to_tag, logger)
            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, 'test', test_manager, id_to_tag, logger)
        t = time.time() - start
        logger.info('cost time: %f' % t)
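
# The loop above depends on a data_utils.BatchManager that is not shown in
# these snippets. Below is a minimal sketch of the contract the loop assumes
# (sort by sentence length, fixed-size padded batches, iter_batch with
# optional shuffling, len_data = number of batches per epoch); the real
# implementation may differ in details such as the padding symbol.
import math
import random


class BatchManager(object):
    """Sketch: group sentences of similar length and pad within each batch."""

    def __init__(self, data, batch_size):
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)  # number of batches per epoch

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        # Sorting by length keeps padding per batch small
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        return [self.pad_data(sorted_data[i * batch_size:(i + 1) * batch_size])
                for i in range(num_batch)]

    @staticmethod
    def pad_data(data):
        # Pad every field of every sentence to the batch's max length;
        # 0 is used as the pad value here (an assumption of this sketch)
        word_lists, word_id_lists, seg_lists, tag_id_lists = [], [], [], []
        max_length = max(len(sentence[0]) for sentence in data)
        for words, word_ids, segs, tag_ids in data:
            padding = [0] * (max_length - len(words))
            word_lists.append(words + padding)
            word_id_lists.append(word_ids + padding)
            seg_lists.append(segs + padding)
            tag_id_lists.append(tag_ids + padding)
        return [word_lists, word_id_lists, seg_lists, tag_id_lists]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch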
def prepare_dataset(sentences, word_to_id, tag_to_id, train=True):
    """
    Convert tagged sentences into lists of indices and features.
    :param sentences:
    :param word_to_id:
    :param tag_to_id:
    :param train:
    :return:
    """
    none_index = tag_to_id['O']
    data = []
    for s in sentences:
        word_list = [w[0] for w in s]
        # Map each word to its id, falling back to the <UNK> token
        word_id_list = [word_to_id[w if w in word_to_id else '<UNK>']
                        for w in word_list]
        # Segmentation features over the whole sentence
        segs = data_utils.get_seg_features("".join(word_list))
        if train:
            tag_id_list = [tag_to_id[w[-1]] for w in s]
        else:
            tag_id_list = [none_index for w in s]
        data.append([word_list, word_id_list, segs, tag_id_list])
    return data


if __name__ == "__main__":
    path = "data/ner.dev"
    sentences = load_sentences(path)
    update_tag_scheme(sentences, "BIOES")
    _, word_to_id, id_to_word = word_mapping(sentences)
    _, tag_to_id, id_to_tag = tag_mapping(sentences)
    dev_data = prepare_dataset(sentences, word_to_id, tag_to_id)
    data_utils.BatchManager(dev_data, 120)
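
# prepare_dataset above calls data_utils.get_seg_features, which is not
# included in these snippets. Code in this family usually derives BIES-style
# segmentation ids per character from a word segmenter; a sketch assuming
# jieba is the segmenter:
import jieba


def get_seg_features(sentence):
    """Sketch: encode word-boundary info per character:
    0 = single-character word, 1 = word start, 2 = word middle, 3 = word end."""
    seg_features = []
    for word in jieba.cut(sentence):
        if len(word) == 1:
            seg_features.append(0)
        else:
            features = [2] * len(word)
            features[0] = 1
            features[-1] = 3
            seg_features.extend(features)
    return seg_features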
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tagging scheme from BIO to BIOES
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Create the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)
    print('train_data_num %i, dev_data_num %i, test_data_num %i'
          % (len(train_data), len(dev_data), len(test_data)))

    model_utils.make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info("Start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch,
                        steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
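
# model_utils.config_model is called above but not shown. In this code
# family it typically just packs the FLAGS hyperparameters and vocabulary
# sizes into a dict that save_config/load_config round-trip; the exact keys
# and flag names below (word_dim, lstm_dim, lr, ...) are assumptions for
# illustration, not the author's definition.
from collections import OrderedDict


def config_model(FLAGS, word_to_id, tag_to_id):
    config = OrderedDict()
    config['num_words'] = len(word_to_id)
    config['num_tags'] = len(tag_to_id)
    config['word_dim'] = FLAGS.word_dim      # assumed flag
    config['lstm_dim'] = FLAGS.lstm_dim      # assumed flag
    config['batch_size'] = FLAGS.batch_size
    config['optimizer'] = FLAGS.optimizer    # assumed flag
    config['lr'] = FLAGS.lr                  # assumed flag
    config['tag_schema'] = FLAGS.tag_schema
    config['pre_emb'] = FLAGS.pre_emb
    return config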
def train():
    # Load data sets. Each sentence is a list of [char, tag] pairs, e.g.
    # ['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC']
    # train_sentences = loader.load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = loader.load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # test_sentences = loader.load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    train_sentences = loader.load_folder_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = loader.load_folder_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = loader.load_folder_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use the selected tagging scheme (IOB / IOBES);
    # the sentences change very little after update_tag_scheme
    loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # Create maps if they do not exist: if there is no maps.pkl file,
    # read the training data to build char_to_id and tag_to_id
    if not os.path.isfile(FLAGS.map_file):
        # Create a dictionary and mapping for characters
        if FLAGS.pre_emb:
            dico_chars_train = loader.char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = loader.augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = loader.char_mapping(train_sentences, FLAGS.lower)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = loader.tag_mapping(train_sentences)
        print('tag_to_id: ', tag_to_id)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        print('tag_to_id: ', tag_to_id)

    # Prepare data: get a collection of lists containing indices
    train_data = loader.prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = loader.prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = loader.prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test."
          % (len(train_data), len(dev_data), len(test_data)))

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, 100)
    test_manager = data_utils.BatchManager(test_data, 100)

    # Make paths for storing logs and models if they do not exist
    utils.make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = utils.load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        utils.save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)  # ./log/train.log
    logger = utils.get_logger(log_path)
    utils.print_config(config, logger)

    # Limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = utils.create_model(sess, Model, FLAGS.ckpt_path,
                                   data_utils.load_word2vec, config,
                                   id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.iterations):
            logger.info('epoch: {}'.format(i))
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
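
# Both NER training loops call an evaluate() helper that is not shown. The
# convention in this code family: decode the split, score it with a
# conlleval-style report, persist the best dev F1 inside the graph, and
# return True when it improves. A sketch; test_ner, FLAGS.result_path, the
# report layout, and the model.best_dev_f1 / best_test_f1 variables are all
# assumptions.
def evaluate(sess, model, name, data_manager, id_to_tag, logger):
    logger.info("evaluate: {}".format(name))
    # Decode the whole split into tagged sentences
    ner_results = model.evaluate(sess, data_manager, id_to_tag)
    eval_lines = test_ner(ner_results, FLAGS.result_path)  # hypothetical scorer
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])  # overall F1 from the report
    if name == "dev":
        best = model.best_dev_f1.eval()
        if f1 > best:
            tf.assign(model.best_dev_f1, f1).eval()
            logger.info("new best dev f1 score: {:>.3f}".format(f1))
        return f1 > best
    elif name == "test":
        best = model.best_test_f1.eval()
        if f1 > best:
            tf.assign(model.best_test_f1, f1).eval()
            logger.info("new best test f1 score: {:>.3f}".format(f1))
        return f1 > best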
def train():
    train_batch_manager = data_utils.BatchManager("data/train.txt.id40000.in", config.batch_size)
    test_batch_manager = data_utils.BatchManager("data/test.txt.id40000.in", config.batch_size)
    with tf.Session() as sess:
        graph_writer = tf.summary.FileWriter(config.model_dir, graph=sess.graph)
        model_obj = model.Seq2SeqModel('train')
        model_obj.model_restore(sess)
        # Optional: freeze the graph with weights into a .pb file
        # output_tensors = [model_obj.decoder_pred_decode.name.replace(":0", "")]
        # output_graph_with_weight = tf.graph_util.convert_variables_to_constants(
        #     sess, sess.graph_def, output_tensors)
        # with tf.gfile.FastGFile(os.path.join(config.model_dir, "weight_seq2seq.pb"), 'wb') as gf:
        #     gf.write(output_graph_with_weight.SerializeToString())
        print("Start training")
        loss = 0.0
        start_time = time.time()
        best_loss = 10000.0
        for epoch_id in range(config.max_epochs):
            for step, train_batch in enumerate(train_batch_manager.iterbatch()):
                if train_batch['encode'] is None:
                    continue
                print("Starting epoch %d, batch %d, global_step %d"
                      % (epoch_id + 1, step + 1, model_obj.global_step.eval()))
                # Execute a single training step
                step_loss, summary = model_obj.train(
                    sess,
                    encoder_inputs=train_batch['encode'],
                    decoder_inputs=train_batch['decode'],
                    encoder_inputs_length=train_batch['encode_lengths'],
                    decoder_inputs_length=train_batch['decode_lengths'])
                loss += float(step_loss) / config.display_freq
                if (model_obj.global_step.eval() + 1) % config.display_freq == 0:
                    if loss < best_loss:
                        best_loss = loss
                        print("Saving model...")
                        checkpoint_path = model_obj.mode_save_path
                        model_obj.saver.save(sess, checkpoint_path,
                                             global_step=model_obj.global_step)
                    # Guard against overflow in exp() for large losses
                    avg_perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                    # Timing
                    time_cost = time.time() - start_time
                    step_time = time_cost / config.display_freq
                    print('Epoch %d, step %d: loss %.2f, perplexity %.2f, time %f'
                          % (epoch_id, model_obj.global_step.eval(), loss,
                             avg_perplexity, step_time))
                    loss = 0.0
                    start_time = time.time()
                    # Record the training summary for the current batch
                    graph_writer.add_summary(summary, model_obj.global_step.eval())
                # Validate the model
                if (model_obj.global_step.eval() + 1) % config.valid_freq == 0:
                    print("Validating model...")
                    valid_loss = 0.0
                    total_sentences = 0
                    for test_batch in test_batch_manager.iterbatch():
                        step_loss, summary = model_obj.eval(
                            sess,
                            encoder_inputs=test_batch['encode'],
                            decoder_inputs=test_batch['decode'],
                            encoder_inputs_length=test_batch['encode_lengths'],
                            decoder_inputs_length=test_batch['decode_lengths'])
                        batch_size = test_batch['encode_lengths'].shape[0]
                        valid_loss += step_loss * batch_size
                        total_sentences += batch_size
                    valid_loss = valid_loss / total_sentences
                    # Same overflow guard as above
                    valid_ppl = math.exp(valid_loss) if valid_loss < 300 else float("inf")
                    print("Validation loss %.2f, perplexity %.2f"
                          % (valid_loss, valid_ppl))
                if (model_obj.global_step.eval() + 1) % config.save_freq == 0:
                    print("Saving model...")
                    checkpoint_path = model_obj.mode_save_path
                    model_obj.saver.save(sess, checkpoint_path,
                                         global_step=model_obj.global_step)
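
# The seq2seq loop above assumes a different BatchManager from the NER one:
# it is constructed from a file of id sequences, and iterbatch() yields dicts
# of padded id matrices plus length vectors. A minimal sketch of that yield
# contract (the input pairing and pad id are assumptions):
import numpy as np


def iterbatch_sketch(pairs, batch_size, pad_id=0):
    """Yield batches in the dict format the training loop consumes.
    `pairs` is a list of (encoder_ids, decoder_ids) sequences."""
    for i in range(0, len(pairs), batch_size):
        chunk = pairs[i:i + batch_size]
        enc_lens = np.array([len(e) for e, _ in chunk])
        dec_lens = np.array([len(d) for _, d in chunk])
        # Pad every sequence in the batch to the batch maximum
        enc = np.full((len(chunk), enc_lens.max()), pad_id, dtype=np.int32)
        dec = np.full((len(chunk), dec_lens.max()), pad_id, dtype=np.int32)
        for row, (e, d) in enumerate(chunk):
            enc[row, :len(e)] = e
            dec[row, :len(d)] = d
        yield {'encode': enc, 'decode': dec,
               'encode_lengths': enc_lens, 'decode_lengths': dec_lens}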