tf.flags.DEFINE_integer("seed", 123, "random seed") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() MAXLEN = 30 np.random.seed(FLAGS.seed) train_file = "dataset/train.tsv" valid_file = "dataset/valid.tsv" test_file = "dataset/test.tsv" # statement, credit_history, topic, speaker, job, state, party, location, label train_statement, train_ch, train_topic, train_speaker, train_job, train_state, \ train_party, train_location, train_y = load_data(train_file) valid_statement, valid_ch, valid_topic, valid_speaker, valid_job, valid_state, \ valid_party, valid_location, valid_y = load_data(valid_file) test_statement, test_ch, test_topic, test_speaker, test_job, test_state, \ test_party, test_location, test_y = load_data(test_file) train_location = [clean_text(lc) for lc in train_location] valid_location = [clean_text(lc) for lc in valid_location] test_location = [clean_text(lc) for lc in test_location] train_location = texts_to_tokens(train_location) valid_location = texts_to_tokens(valid_location) test_location = texts_to_tokens(test_location)
# Assumes json, numpy (np), and tensorflow (tf) are imported, and that the
# helpers load_data, create_maps, prepare_dataset, BatchManager, make_path,
# get_logger, Model, load_word2vec, evaluate, and save_model are defined elsewhere.

def do_train(config):
    train, dev, test = load_data(config)  # load data
    word_to_id, id_to_word, tag_to_id, id_to_tag = create_maps(train, config)  # create or load maps

    # Record vocabulary/tag sizes and persist the config.
    config["num_chars"] = len(word_to_id)  # total number of words
    config["num_tags"] = len(tag_to_id)    # total number of tags
    with open(config["config_file"], "w") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    # Convert raw sentences into model-ready features.
    train_data = prepare_dataset(train, word_to_id, tag_to_id, config["lower"])
    dev_data = prepare_dataset(dev, word_to_id, tag_to_id, config["lower"])
    test_data = prepare_dataset(test, word_to_id, tag_to_id, config["lower"])
    print("train/dev/test sentence counts: {} / {} / {}".format(
        len(train_data), len(dev_data), len(test_data)))

    # Split into batches.
    train_manager = BatchManager(train_data, config["batch_size"])
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    steps_per_epoch = train_manager.len_data  # steps per epoch

    make_path(config)  # create output directories
    logger = get_logger(config["log_file"])

    # Let GPU memory grow on demand instead of grabbing it all at once.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        # Create the model; existing checkpoint parameters are reused if available.
        model = Model(config)
        ckpt = tf.train.get_checkpoint_state(config["ckpt_path"])  # look up checkpoint in model dir
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Restoring existing model...")
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Creating new model...")
            sess.run(tf.global_variables_initializer())  # random init, no pre-trained embeddings yet
            # Optionally overwrite the embedding table with pre-trained vectors.
            if config["pre_emb"]:
                emb_weights = sess.run(model.char_lookup.read_value())
                emb_weights = load_word2vec(config["emb_file"], id_to_word,
                                            config["char_dim"], emb_weights)
                sess.run(model.char_lookup.assign(emb_weights))
                logger.info("Loaded pre-trained embeddings.")

        logger.info("Starting training...")
        loss = []
        for i in range(config["max_epoch"]):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % config["steps_check"] == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            # Evaluate on dev after each epoch; save and run the test set on a new best.
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger, config)
            if best:
                save_model(sess, model, config["ckpt_path"], logger)
                evaluate(sess, model, "test", test_manager, id_to_tag, logger, config)
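For reference, here is a minimal sketch of invoking do_train. The keys shown are the ones the function reads above; the values are illustrative, and any additional keys consumed by load_data or Model (e.g. data file paths or layer sizes) are assumed to be supplied as well.

config = {
    "config_file": "config.json",  # where the resolved config is dumped
    "lower": True,                 # lowercase input when preparing datasets
    "batch_size": 20,
    "log_file": "log/train.log",
    "ckpt_path": "ckpt",           # checkpoint directory
    "pre_emb": True,               # load pre-trained embeddings on fresh init
    "emb_file": "wiki_100.utf8",   # illustrative embedding file name
    "char_dim": 100,               # must match the embedding dimension
    "max_epoch": 100,
    "steps_check": 100,            # log running loss every N steps
}
do_train(config)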