def train(self):
    batch_manager = BatchManager()
    self.head_input_size = batch_manager.head_vocab_size
    self.tail_input_size = batch_manager.tail_vocab_size
    self.relation_input_size = batch_manager.relation_vocab_size
    data_map = {
        "head_size": self.head_input_size,
        "tail_size": self.tail_input_size,
        "relation_size": self.relation_input_size,
        "head_vocab": batch_manager.head_vocab,
        "tail_vocab": batch_manager.tail_vocab,
        "relation_vocab": batch_manager.relation_vocab
    }
    with open("models/data_map.pkl", "wb") as f:
        pickle.dump(data_map, f)
    self.init_model()
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print("[->] restore model")
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("[->] no model, initializing")
            sess.run(tf.global_variables_initializer())
        for i in range(200):
            print("epoch {}".format(i))
            for batch in batch_manager.get_batch():
                loss = self.step(batch, sess)
                print("\tloss: {}".format(loss))
            self.saver.save(sess, self.checkpoint_path)
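# The snippet above assumes a BatchManager that builds head/tail/relation vocabularies from
# (head, relation, tail) triples and yields index batches via get_batch(). The sketch below is
# only a plausible shape of that interface; the triples file path, its tab-separated format and
# the default batch size are assumptions, not the original implementation.
class BatchManager(object):
    def __init__(self, path="data/triples.txt", batch_size=32):  # hypothetical defaults
        self.batch_size = batch_size
        self.triples = []
        with open(path, encoding="utf-8") as f:
            for line in f:
                head, relation, tail = line.strip().split("\t")  # assumed "head\trelation\ttail" lines
                self.triples.append((head, relation, tail))
        # vocabularies: entity / relation string -> integer id
        self.head_vocab = self._build_vocab(t[0] for t in self.triples)
        self.relation_vocab = self._build_vocab(t[1] for t in self.triples)
        self.tail_vocab = self._build_vocab(t[2] for t in self.triples)
        self.head_vocab_size = len(self.head_vocab)
        self.relation_vocab_size = len(self.relation_vocab)
        self.tail_vocab_size = len(self.tail_vocab)

    @staticmethod
    def _build_vocab(items):
        vocab = {}
        for item in items:
            vocab.setdefault(item, len(vocab))
        return vocab

    def get_batch(self):
        # yield lists of (head_id, relation_id, tail_id) index triples, one batch at a time
        for start in range(0, len(self.triples), self.batch_size):
            chunk = self.triples[start:start + self.batch_size]
            yield [(self.head_vocab[h], self.relation_vocab[r], self.tail_vocab[t])
                   for h, r, t in chunk]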
def get_data(self):
    train_valid_test = os.path.join(FLAGS.pkl_dir, "train_valid_test.pkl")
    if os.path.exists(train_valid_test):  # the processed train/valid/test split has already been saved
        with open(train_valid_test, 'rb') as data_f:
            train_data, valid_data, test_data, true_label_pert = pickle.load(data_f)
    else:  # read the raw dataset and build the train / valid / test sets
        with open(FLAGS.traning_data_path, "r", encoding="utf-8") as data_f:
            all_data = csv.reader(data_f, delimiter='\t', quotechar='|')
            # tf-idf values
            tfidf_path = "data/tfidf.txt"  # file that stores the tf-idf values
            if not os.path.exists(tfidf_path):
                get_tfidf_and_save(all_data, tfidf_path)
            tfidf_dict = load_tfidf_dict(tfidf_path)
        # fastText word vectors
        fasttext_path = FLAGS.fasttext_model_path
        fasttext_dict = load_vector(fasttext_path)
        # word2vec word vectors
        word2vec_path = "data/word2vec_word_model.txt"
        word2vec_dict = load_vector(word2vec_path)
        with open(FLAGS.traning_data_path, "r", encoding="utf-8") as data_f:
            all_data = csv.reader(data_f, delimiter='\t', quotechar='|')
            # feature engineering based on sentence length, shared words, tf-idf values,
            # fastText vectors and word2vec vectors; returns the feature vectors
            features_vector = features_engineer(all_data, fasttext_dict, word2vec_dict,
                                                tfidf_dict, FLAGS.tokenize_style, n_gram=8)
        with open(FLAGS.traning_data_path, "r", encoding="utf-8") as data_f:
            all_data = csv.reader(data_f, delimiter='\t', quotechar='|')
            # serialize the sentences: map every word to its index, used as the input features
            sentences_1, sentences_2, labels = sentence_word_to_index(
                all_data, self.word_to_index, self.label_to_index, FLAGS.tokenize_style)
        """
        Shuffle and pad the data, attach features_vector, and split into train / valid / test
        by ratio; each of train / valid / test in turn contains sentences_1, sentences_2,
        features_vector and labels.
        """
        train_data, valid_data, test_data, true_label_pert = shuffle_padding_split(
            sentences_1, sentences_2, labels, features_vector, train_valid_test, FLAGS.sentence_len)
    self.features_vector_size = len(train_data[2][0])
    # print("features_vector_size:", self.features_vector_size)
    print("train size:", len(train_data[0]), "valid size:", len(valid_data[0]),
          "positive label ratio:", true_label_pert)
    # batch managers for the train / valid / test data
    self.train_batch_manager = BatchManager(train_data, int(FLAGS.batch_size))
    print("number of training batches:", self.train_batch_manager.len_data)
    self.valid_batch_manager = BatchManager(valid_data, int(FLAGS.batch_size))
    self.test_batch_manager = BatchManager(test_data, int(FLAGS.batch_size))
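# get_tfidf_and_save / load_tfidf_dict above are only referenced, not shown. A minimal sketch of
# what such helpers could do is given below: compute a word -> tf-idf weight table over the
# corpus and persist it as "word<TAB>score" lines. The whitespace tokenisation, the file format
# and the assumption that `texts` is an iterable of plain strings are illustrative only.
import math
from collections import Counter

def get_tfidf_and_save(texts, tfidf_path):
    doc_freq = Counter()
    term_freq = Counter()
    n_docs = 0
    for text in texts:
        n_docs += 1
        words = text.split()
        term_freq.update(words)
        doc_freq.update(set(words))
    with open(tfidf_path, "w", encoding="utf-8") as f:
        for word, tf in term_freq.items():
            idf = math.log((1.0 + n_docs) / (1.0 + doc_freq[word])) + 1.0
            f.write("{}\t{}\n".format(word, tf * idf))

def load_tfidf_dict(tfidf_path):
    tfidf_dict = {}
    with open(tfidf_path, encoding="utf-8") as f:
        for line in f:
            word, score = line.rstrip("\n").split("\t")
            tfidf_dict[word] = float(score)
    return tfidf_dict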
def train():
    # load the training data
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # load the dev and test sets
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES): I = inside, O = outside, B = begin | E = end, S = single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # count character frequencies and assign an id to every character
    _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
    # count entity-tag frequencies and assign an id to every tag
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, FLAGS.id_to_tag_path, FLAGS.tag_to_id_path)

    # dump the mapping dictionaries into a pkl file
    with open(FLAGS.map_file, "wb") as f:
        pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)

    # prepare the data: build the index lists that are fed into the network.
    # train_data[0][0]: the sentence; train_data[0][1]: the id of every character;
    # train_data[0][2]: word-segmentation features (0 for a single-character word,
    # 1,2,...,2,3 for words of two or more characters); train_data[0][3]: the tag of every character
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)

    # split the data into batches of 60 sentences and get an iterable object
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    config = config_model(char_to_id, tag_to_id)  # fill in the model configuration

    # limit GPU usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, load_word2vec, config, id_to_char)
        saver = tf.train.Saver()  # used to save the model
        with tf.device("/cpu:0"):
            for i in range(100):
                for batch in train_manager.iter_batch(shuffle=True):
                    # train the model batch by batch; training starts here, trace backwards
                    # from this call to see how the whole network is trained
                    step, batch_loss = model.run_step(sess, True, batch)
                # evaluate on the dev set and compute F1 every 5 epochs
                if (i + 1) % 5 == 0:
                    f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag)
                    print("dev F1:", f1)
                # save the model every 20 epochs
                if (i + 1) % 20 == 0:
                    saver.save(sess, save_path=FLAGS.ckpt_path)
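# update_tag_scheme above rewrites the gold tags of every sentence in place according to the
# selected scheme. The sketch below shows the IOB -> IOBES conversion it is expected to perform
# (each sentence is a list of [token, tag] pairs, as produced by load_sentences); the validity
# checks and the plain-IOB branch of the real helper are omitted here.
def iob_to_iobes(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.startswith('B-'):
            # B- stays B- when followed by I- of the same type, otherwise it is a single-token entity
            if i + 1 < len(tags) and tags[i + 1] == 'I-' + tag[2:]:
                new_tags.append(tag)
            else:
                new_tags.append('S-' + tag[2:])
        elif tag.startswith('I-'):
            # I- stays I- inside the entity and becomes E- on the last token
            if i + 1 < len(tags) and tags[i + 1] == 'I-' + tag[2:]:
                new_tags.append(tag)
            else:
                new_tags.append('E-' + tag[2:])
        else:
            raise ValueError('Invalid IOB tag: {}'.format(tag))
    return new_tags

def update_tag_scheme(sentences, tag_scheme):
    for sentence in sentences:
        tags = [word[-1] for word in sentence]
        if tag_scheme == 'iobes':
            for word, new_tag in zip(sentence, iob_to_iobes(tags)):
                word[-1] = new_tag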
def main(_):
    if not os.path.isdir(FLAGS.log_path):
        os.makedirs(FLAGS.log_path)
    if not os.path.isdir(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    if not os.path.isdir(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)
    tag_to_id = FLAGS.tag_to_id
    # specific_file = "data/mor-test/test_set.mor"
    # other candidates: "../addr_all.test", "data/mor-test_code/mor_iter_v1/mor_person_label_v2.txt",
    # FLAGS.test_file, "data/rule_gen/rule_gen.test_code", "data/sighan/sighan.test_code"
    specific_file = "../mor_v1_addr.test"

    # load data
    id_to_word, id_to_tag, _, _, test_data = load_data(FLAGS, tag_to_id,
                                                       only_use_test=True,
                                                       specific_file=specific_file)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model = create_model(sess, id_to_word, id_to_tag)
        # test model
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        ner_results = model.predict(sess, test_manager)
        eval_lines = conll_eval(ner_results, FLAGS.result_path)
        for line in eval_lines:
            model.logger.info(line)
def predict():
    """
    Run entity recognition over a whole data set.
    :return:
    """
    config = load_config(FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # limit GPU memory
    # restore the mapping dictionaries from the map_file created during training
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower, train=False)
    test_manager = BatchManager(test_data, 1)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config, id_to_char, logger)
        logger.info("predict data......")
        ner_results = model.predict(sess, test_manager, id_to_tag)
        result_write_evaluate(ner_results, FLAGS.result_path, "test")
def main(_):
    if FLAGS.train:
        if FLAGS.clean:
            clean(FLAGS)
        train()
    else:
        # evaluate the model on the test data
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        log_path = os.path.join("log", FLAGS.log_file)
        config = load_config(FLAGS.config_file)
        logger = get_logger(log_path)
        tf_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
        test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
        test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
        test_manager = BatchManager(test_data, 100)
        with tf.Session(config=tf_config) as sess:
            sess.run(tf.global_variables_initializer())
            model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def test():
    # load the configuration file
    config = load_config(FLAGS.config_file)
    # set up the logger
    log_path = os.path.join("log", FLAGS.test_log_file)
    logger = get_logger(log_path)
    # configure the GPU
    tf_config = tf.ConfigProto()
    # load the test set
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    # read the mapping dictionaries
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    # convert the test sentences into features
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    # build the test batches
    test_manager = BatchManager(test_data, 20)
    with tf.Session(config=tf_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # restore the model from the saved checkpoint
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        # evaluate on the test batches
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
        logger.info("The best_f1 on test_dataset is {}".format(model.best_test_f1.eval()))
        logger.info('Time test for 10 batch is {} sec\n'.format(time.time() - start))
def get_batch_data(self):
    """
    Build the batch managers for the train and dev sets: first turn the train / dev sentences
    into feature lists plus gold-label lists using the mapping dictionaries, then wrap them in
    BatchManager objects that generate the batches.
    :return:
    """
    if not os.path.isfile(FLAGS.train_dev_file):
        train_data = prepare_dataset(self.train_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower)
        dev_data = prepare_dataset(self.dev_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower)
        with open(FLAGS.train_dev_file, "wb") as f:
            pickle.dump([train_data, dev_data], f)
    else:
        with open(FLAGS.train_dev_file, "rb") as f:
            train_data, dev_data = pickle.load(f)
    print("%i / %i sentences in train / dev ." % (len(train_data), len(dev_data)))
    self.train_batch_manager = BatchManager(train_data, int(FLAGS.batch_size))
    self.dev_batch_manager = BatchManager(dev_data, int(FLAGS.batch_size))
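# The BatchManager used throughout these snippets is assumed to bucket the prepared sentences by
# length, pad every batch to its longest sentence, and expose len_data (the number of batches)
# plus an iter_batch(shuffle=...) generator. A compact sketch of that behaviour, assuming each
# item of `data` is [chars, char_ids, seg_ids, tag_ids]:
import math
import random

class BatchManager(object):
    def __init__(self, data, batch_size):
        self.batch_data = self._sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def _sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        sorted_data = sorted(data, key=lambda x: len(x[0]))  # group similar lengths together
        return [self._pad_data(sorted_data[i * int(batch_size): (i + 1) * int(batch_size)])
                for i in range(num_batch)]

    @staticmethod
    def _pad_data(batch):
        strings, chars, segs, targets = [], [], [], []
        max_length = max(len(sentence[0]) for sentence in batch)
        for string, char, seg, target in batch:
            padding = [0] * (max_length - len(string))  # pad every column to the batch maximum
            strings.append(string + padding)
            chars.append(char + padding)
            segs.append(seg + padding)
            targets.append(target + padding)
        return [strings, chars, segs, targets]

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch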
def main():
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    train_sentences = load_sentences(args.train_file)
    dev_sentences = load_sentences(args.dev_file)
    test_sentences = load_sentences(args.test_file)

    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    with open(args.map_file, 'rb') as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)

    train_manager = BatchManager(train_data, args.batch_size, args.num_steps)
    dev_manager = BatchManager(dev_data, 100, args.num_steps)
    test_manager = BatchManager(test_data, 100, args.num_steps)

    if args.cuda >= 0:
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        device = torch.device(args.cuda)
    else:
        device = torch.device('cpu')
    print("device: ", device)

    if args.train:
        train(id_to_char, id_to_tag, train_manager, dev_manager, device)
    f1, res_info = eval_model(id_to_char, id_to_tag, test_manager, device, args.log_name)
    log_handler.info("\n resinfo {} \n F1: {} ".format(res_info, f1))
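# prepare_dataset above turns each [token, tag] sentence into parallel index lists. A minimal
# sketch of that mapping is shown below; the '<UNK>' fallback entry in char_to_id is an
# assumption, and the word-segmentation feature column used by some of the other snippets is
# omitted here for brevity.
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False):
    def f(x):
        return x.lower() if lower else x

    data = []
    for sentence in sentences:
        string = [w[0] for w in sentence]
        char_ids = [char_to_id[f(c) if f(c) in char_to_id else '<UNK>'] for c in string]
        tag_ids = [tag_to_id[w[-1]] for w in sentence]
        data.append([string, char_ids, tag_ids])
    return data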
def train():
    # ----------------------------------- data preparation -----------------------------------
    train_manager = BatchManager(batch_size=20, name='train')
    test_manager = BatchManager(batch_size=100, name='test')
    # ----------------------------------- load the dictionary --------------------------------
    mapping_dict = get_dict(dict_file)
    # ----------------------------------- build the model ------------------------------------
    model = Model(mapping_dict)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(5):
            j = 1
            for batch in train_manager.iter_batch(shuffle=True):
                start = time.time()
                loss = model.run_step(sess, batch)
                end = time.time()
                if j % 5 == 0:
                    print('epoch:{},step:{}/{},loss:{},elapse:{},estimate:{}'.format(
                        i + 1, j, train_manager.len_data, loss, end - start,
                        (end - start) * (train_manager.len_data - j)))
                j += 1
        for batch in test_manager.iter_batch(shuffle=True):
            test_result = model.predict(sess, batch, istrain=False, istest=True)
            print('precision rate: {} %'.format(test_result[1]))
def test():
    make_path(FLAGS)
    config = load_config(FLAGS.config_file)
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    test_manager = BatchManager(test_data, 100)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
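# evaluate() above reports entity-level scores (these pipelines typically go through the CoNLL
# evaluation script). The self-contained sketch below shows the same idea under simplified
# assumptions: extract entity spans from gold and predicted IOB/IOBES tag sequences and compute
# micro precision / recall / F1. Function names here are illustrative, not the original helpers.
def extract_entities(tags):
    """Return a set of (start, end, entity_type) spans from an IOB/IOBES tag sequence."""
    entities = set()
    start, ent_type = None, None
    for i, tag in enumerate(list(tags) + ['O']):   # the 'O' sentinel flushes a trailing entity
        starts_new = tag.startswith(('B-', 'S-'))
        outside = tag == 'O'
        if ent_type is not None and (starts_new or outside or tag[2:] != ent_type):
            entities.add((start, i - 1, ent_type))
            ent_type = None
        if not outside and ent_type is None:
            start, ent_type = i, tag[2:]
    return entities

def entity_f1(gold_sequences, pred_sequences):
    tp = n_gold = n_pred = 0
    for gold, pred in zip(gold_sequences, pred_sequences):
        g, p = extract_entities(gold), extract_entities(pred)
        tp += len(g & p)
        n_gold += len(g)
        n_pred += len(p)
    precision = tp / n_pred if n_pred else 0.0
    recall = tp / n_gold if n_gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1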
def test(param):
    # sanity-check the parameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"
    # build the batch manager
    test_manager = BatchManager(param.test_batch_size, name='test')
    number_dataset = test_manager.len_data
    print("total of number test data is {}".format(number_dataset))
    # set up the logger
    logger = get_logger(param.test_log_file)
    # load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # build the model
    model = Model(param, mapping_dict)
    # configure the GPU
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # first check whether a trained model exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(ckpt.model_checkpoint_path))
            # a trained model exists, so restore it
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        # start the evaluation
        evaluate(sess, param, model, "test", test_manager, logger)
        logger.info("The best_f1 on test_dataset is {:.2f}".format(model.best_test_f1.eval()))
        logger.info('Time test for {:.2f} batch is {:.2f} sec\n'.format(
            param.test_batch_size, time.time() - start))
def train(): # load data sets # 句子集合 = [[句子1],[句子2],[句子3]],句子1 = [我 O,在 O,。。。] #<class 'list'>: [['海', 'O'], ['钓', 'O'], ['比', 'O'], ['赛', 'O'], ['地', 'O'], ['点', 'O'], ['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC'], ['与', 'O'], ['金', 'B-LOC'], ['门', 'I-LOC'], ['之', 'O'], ['间', 'O'], ['的', 'O'], ['海', 'O'], ['域', 'O'], ['。', 'O']] # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) # test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) from xlnet_base.xlnet_data_utils import XLNetDataUtils sp_model = spm.SentencePieceProcessor() sp_model.Load('./chinese_xlnet_base_L-12_H-768_A-12/spiece.model') train_data = XLNetDataUtils(sp_model, batch_size=FLAGS.batch_size, entry="train") dev_data = XLNetDataUtils(sp_model, batch_size=FLAGS.batch_size, entry="dev") test_data = XLNetDataUtils(sp_model, batch_size=FLAGS.batch_size, entry="test") dev_batch = dev_data.iteration() def datapadding(data): alldatalist = [] datalist = data.data max_length = 64 for i in range(len(datalist)): tmpdatalist = [] token = datalist[i][0] segmentid = datalist[i][1] inputid = datalist[i][2] inputmask = datalist[i][3] labellist = datalist[i][4] #token label if len(labellist) < max_length: for i in range(max_length - len(token)): labellist.append(0) elif len(labellist) > max_length: tmplabellist = [] for i in range(max_length): tmplabellist.append(labellist[i]) labellist = tmplabellist #segmentid inputid inputmask if len(segmentid) < max_length: for i in range(max_length - len(segmentid)): segmentid.append(0) inputid.append(0) inputmask.append(0) elif len(segmentid) > max_length: tmpsegmentid = [] tmpinputid = [] tmpinputmask = [] for i in range(max_length): tmpsegmentid.append(segmentid[i]) tmpinputid.append(inputid[i]) tmpinputmask.append(inputmask[i]) segmentid = tmpsegmentid inputid = tmpinputid inputmask = tmpinputmask tmpdatalist.append(token) tmpdatalist.append(segmentid) tmpdatalist.append(inputid) tmpdatalist.append(inputmask) tmpdatalist.append(labellist) alldatalist.append(tmpdatalist) return alldatalist ftraindata = datapadding(train_data) fdevdata = datapadding(dev_data) ftestdata = datapadding(test_data) print(len(ftraindata)) print(len(fdevdata)) print(len(ftestdata)) # traindata = { # "batch_size": train_data.batch_size, # "input_size": train_data.input_size, # "vocab": train_data.vocab, # "tag_map": train_data.tag_map, # } # devdata = { # "batch_size": dev_data.batch_size, # "input_size": dev_data.input_size, # "vocab": dev_data.vocab, # "tag_map": dev_data.tag_map, # } # testdata = { # "batch_size": test_data.batch_size, # "input_size": test_data.input_size, # "vocab": test_data.vocab, # "tag_map": test_data.tag_map, # } # if not os.path.exists("./model/train_data_map.pkl"): # f = open("./model/train_data_map.pkl", "wb") # pickle.dump(traindata, f) # f.close() # if not os.path.exists("./model/dev_data_map.pkl"): # f = open("./model/dev_data_map.pkl", "wb") # pickle.dump(devdata, f) # f.close() # if not os.path.exists("./model/test_data_map.pkl"): # f = open("./model/test_data_map.pkl", "wb") # pickle.dump(testdata, f) # f.close() # Use selected tagging scheme (IOB / IOBES) #update_tag_scheme(train_sentences, FLAGS.tag_schema) #update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # Create a dictionary and a mapping for tags ''' _t:{'O': 869087, 'B-LOC': 16571, 'I-LOC': 22531, 'B-PER': 8144, 'I-PER': 15881, 'B-ORG': 9277, 
'I-ORG': 37689, '[SEP]': 8, '[CLS]': 10} id_to_tag:{0: 'O', 1: 'I-ORG', 2: 'I-LOC', 3: 'B-LOC', 4: 'I-PER', 5: 'B-ORG', 6: 'B-PER', 7: '[CLS]', 8: '[SEP]'} tag_to_id:{'O': 0, 'I-ORG': 1, 'I-LOC': 2, 'B-LOC': 3, 'I-PER': 4, 'B-ORG': 5, 'B-PER': 6, '[CLS]': 7, '[SEP]': 8} ''' tag_to_id = train_data.tag_map id_to_tag = {v: k for k, v in tag_to_id.items()} with open(FLAGS.map_file, "wb") as f: pickle.dump([tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index ''' [['在', '这', '里', '恕', '弟', '不', '恭', '之', '罪', ',', '敢', '在', '尊', '前', '一', '诤', ':', '前', '人', '论', '书', ',', '每', '曰', '“', '字', '字', '有', '来', '历', ',', '笔', '笔', '有', '出', '处', '”', ',', '细', '读', '公', '字', ',', '何', '尝', '跳', '出', '前', '人', '藩', '篱', ',', '自', '隶', '变', '而', '后', ',', '直', '至', '明', '季', ',', '兄', '有', '何', '新', '出', '?'], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1762, 6821, 7027, 2609, 2475, 679, 2621, 722, 5389, 8024, 3140, 1762, 2203, 1184, 671, 6420, 8038, 1184, 782, 6389, 741, 8024, 3680, 3288, 100, 2099, 2099, 3300, 3341, 1325, 8024, 5011, 5011, 3300, 1139, 1905, 100, 8024, 5301, 6438, 1062, 2099, 8024, 862, 2214, 6663, 1139, 1184, 782, 5974, 5075, 8024, 5632, 7405, 1359, 5445, 1400, 8024, 4684, 5635, 3209, 2108, 8024, 1040, 3300, 862, 3173, 1139, 8043, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] ''' # train_data = prepare_dataset( # train_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower # ) # dev_data = prepare_dataset( # dev_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower # ) # test_data = prepare_dataset( # test_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower # ) print("%i / %i / %i sentences in train / dev / test." 
% (len(train_data.data), len(dev_data.data), len(test_data.data))) train_manager = BatchManager(ftraindata, FLAGS.batch_size) dev_manager = BatchManager(fdevdata, FLAGS.batch_size) test_manager = BatchManager(ftestdata, FLAGS.batch_size) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, config, logger) logger.info("start training") loss = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger, global_steps=step) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                # print batch
                step, batch_loss = model.run_step(sess, True, batch)
                # print step
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
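# char_mapping / tag_mapping above build frequency dictionaries and the two-way item <-> id maps
# that end up in map_file. A compact sketch of that pattern is given below; the exact special
# tokens ('<PAD>', '<UNK>') and their artificially high counts are conventions assumed here so
# they keep the lowest ids, not necessarily the original values.
def create_mapping(freq_dico):
    """Sort items by decreasing frequency and assign consecutive ids."""
    sorted_items = sorted(freq_dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: item for i, (item, _) in enumerate(sorted_items)}
    item_to_id = {item: i for i, item in id_to_item.items()}
    return item_to_id, id_to_item

def char_mapping(sentences, lower=False):
    dico = {'<PAD>': 10000001, '<UNK>': 10000000}   # keep the special tokens at the front
    for sentence in sentences:
        for word in sentence:
            char = word[0].lower() if lower else word[0]
            dico[char] = dico.get(char, 0) + 1
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char

def tag_mapping(sentences):
    dico = {}
    for sentence in sentences:
        for word in sentence:
            dico[word[-1]] = dico.get(word[-1], 0) + 1
    tag_to_id, id_to_tag = create_mapping(dico)
    return dico, tag_to_id, id_to_tag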
def do_train(config):
    train, dev, test = load_data(config)  # load the data
    word_to_id, id_to_word, tag_to_id, id_to_tag = create_maps(train, config)  # create or load the maps

    # fill in and save the configuration
    config["num_chars"] = len(word_to_id)  # total number of words
    config["num_tags"] = len(tag_to_id)    # total number of tags
    with open(config["config_file"], "w") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    # data preparation
    train_data = prepare_dataset(train, word_to_id, tag_to_id, config["lower"])
    dev_data = prepare_dataset(dev, word_to_id, tag_to_id, config["lower"])
    test_data = prepare_dataset(test, word_to_id, tag_to_id, config["lower"])
    print("number of train/dev/test sentences: {} / {} / {}".format(
        len(train_data), len(dev_data), len(test_data)))

    # batching
    train_manager = BatchManager(train_data, config["batch_size"])
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    steps_per_epoch = train_manager.len_data  # steps per epoch

    # create the needed paths
    make_path(config)

    # logger
    logger = get_logger(config["log_file"])

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        # build the model; an existing checkpoint can be reused
        model = Model(config)
        ckpt = tf.train.get_checkpoint_state(config["ckpt_path"])  # look for a checkpoint in the model path
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):  # an existing model
            logger.info("Reading existing model...")
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Creating new model...")
            sess.run(tf.global_variables_initializer())  # without pre-trained embeddings
            # if pre-trained embeddings are used
            if config["pre_emb"]:
                emb_weights = sess.run(model.char_lookup.read_value())
                emb_weights = load_word2vec(config["emb_file"], id_to_word, config["char_dim"], emb_weights)
                sess.run(model.char_lookup.assign(emb_weights))
                logger.info("Load pre-trained embedding.")

        logger.info("start training...")
        loss = []
        for i in range(config["max_epoch"]):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % config["steps_check"] == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger, config)
            if best:
                save_model(sess, model, config["ckpt_path"], logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger, config)
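# load_word2vec above fills the rows of the randomly initialised lookup table with pretrained
# vectors where available, keeping the random rows for out-of-vocabulary words. A sketch of such
# a loader, assuming a plain-text embedding file with one "word v1 v2 ... vN" entry per line:
import numpy as np

def load_word2vec(emb_file, id_to_word, word_dim, old_weights):
    new_weights = old_weights.copy()
    pretrained = {}
    with open(emb_file, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) == word_dim + 1:          # skip header lines or malformed rows
                pretrained[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    found = 0
    for idx, word in id_to_word.items():
        if word in pretrained:
            new_weights[idx] = pretrained[word]
            found += 1
        elif word.lower() in pretrained:
            new_weights[idx] = pretrained[word.lower()]
            found += 1
    print("{} / {} words initialised from pretrained vectors".format(found, len(id_to_word)))
    return new_weights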
def main(): # load data sets global args args = parser.parse_args() pp.pprint(vars(args)) # running_name = 'X' use_cuda = cuda_model.ifUseCuda(args.gpu_id, args.multiGpu) # use_cuda = False # train_file = 'data/example.train' # dev_file = 'data/example.dev' test_file = 'data/example.test' # embedding_file = 'data/vec.txt' map_file = 'map.pkl' # config_file = 'config_file_pytorch' tag_file = 'tag.pkl' # embedding_easy_file = 'data/easy_embedding.npy' # train_sentences = load_sentences(train_file) # dev_sentences = load_sentences(dev_file) test_sentences = load_sentences(test_file) # train_sentences = dev_sentences # update_tag_scheme(train_sentences, args.tag_schema) update_tag_scheme(test_sentences, args.tag_schema) # update_tag_scheme(dev_sentences, args.tag_schema) if not os.path.isfile(tag_file): print("Tag file {:s} Not found".format(tag_file)) sys.exit(-1) else: with open(tag_file, 'rb') as t: tag_to_id, id_to_tag = pickle.load(t) if not os.path.isfile(map_file): print("Map file {:s} Not found".format(map_file)) # create dictionary for word # dico_chars_train = char_mapping(train_sentences)[0] # dico_chars, char_to_id, id_to_char = augment_with_pretrained( # dico_chars_train.copy(), # embedding_file, # list(itertools.chain.from_iterable( # [[w[0] for w in s] for s in test_sentences]) # ) # ) # # _, tag_to_id, id_to_tag = tag_mapping(train_sentences) # # with open(map_file, "wb") as f: # pickle.dump([char_to_id, id_to_char], f) else: with open(map_file, "rb") as f: char_to_id, id_to_char = pickle.load(f) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id) print("{:d} sentences in test.".format(len(test_data))) test_manager = BatchManager(test_data, 1) save_places = dir_utils.save_places(args.eval) # log_path = os.path.join("log", FLAGS.log_file) logger = get_logger( os.path.join(save_places.log_save_dir, 'evaluation-{:d}.txt'.format(args.fileid))) config = config_model(char_to_id, tag_to_id, args) print_config(config, logger) logger.info("start training") #Update: create model and embedding! model = NERModel.CNERPointer(char_dim=args.char_dim, seg_dim=args.seg_dim, hidden_dim=args.hidden_dim, max_length=15, output_classes=4, dropout=args.dropout, embedding_path=None, id_to_word=id_to_char, easy_load=None) print("Number of Params\t{:d}".format( sum([p.data.nelement() for p in model.parameters()]))) #Update: this won't work! # model = cuda_model.convertModel2Cuda(model, gpu_id=args.gpu_id, multiGpu=args.multiGpu) if use_cuda: model = model.cuda() model.eval() if args.eval is not None: # if os.path.isfile(args.resume): ckpt_filename = os.path.join( save_places.model_save_dir, 'checkpoint_{:04d}.pth.tar'.format(args.fileid)) assert os.path.isfile( ckpt_filename), 'Error: no checkpoint directory found!' checkpoint = torch.load(ckpt_filename, map_location=lambda storage, loc: storage) model.load_state_dict(checkpoint['state_dict'], strict=True) train_iou = checkpoint['IoU'] print("=> loading checkpoint '{}', current iou: {:.04f}".format( ckpt_filename, train_iou)) ner_results = evaluate(model, test_manager, id_to_tag, use_cuda, max_len=5) eval_lines = test_ner(ner_results, save_places.summary_save_dir) for line in eval_lines: logger.info(line) f1 = float(eval_lines[1].strip().split()[-1]) return f1
def annotation():
    # train()
    # read the whole csv file
    try:
        whole_data = pd.read_csv(raw_data, sep=',', encoding='UTF-8')
    except UnicodeDecodeError:
        whole_data = pd.read_csv(raw_data, sep=',', encoding='GBK', errors='ignore')
    except Exception as e:
        print(e)
    row_num = whole_data.shape[0]
    print(row_num)

    # ----------------------------------- load the dictionary --------------------------------
    mapping_dict = get_dict(dict_file)
    # ----------------------------------- build the model ------------------------------------
    model = Model(mapping_dict)

    feature_list = ['Per', 'Com', 'Time', 'Job', 'Nat', 'Bir', 'Age', 'Gdr', 'Uni', 'Edu',
                    'Sch', 'Col', 'Maj', 'Zhi', 'Hon']
    feature_dataframe = pd.DataFrame(columns=feature_list)

    # create the Test folder and iterate over every row to predict
    for i in range(row_num):
        # shutil is used so the nested directory can be removed and recreated
        if os.path.exists('data/Test'):
            shutil.rmtree('data/Test')
        if not os.path.exists('data/Test'):
            os.makedirs('data/Test')
        cur_data = whole_data['ManagerResume'][i]
        print(cur_data)
        filename = 'data/Test/need_annotation.txt'
        # the file is created if it does not exist; 'w' truncates any existing content before writing!
        with open(filename, 'w') as f:
            f.write(cur_data)
        task_process(split_text)
        get_data('task')

        # ------------------------------- data preparation -----------------------------------
        task_manager = BatchManager(batch_size=1, name='task')
        # ------------------------------- run the model --------------------------------------
        item_T = {}
        item_T = pd.DataFrame(item_T)
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            for _ in range(1):
                for batch in task_manager.iter_batch(shuffle=True):
                    task_result, item = model.predict(sess, batch, istrain=False, istest=False)
                    # item_Entity = pd.DataFrame(item['entities'])
                    # item_T = item_T.append(item_Entity)
                    item_T = pd.DataFrame(item['entities'])
                    # print('predict result:{} %', task_result)
                    print(item_T)
                    # num_samples = len(item)  # number of sentences, i.e. number of samples
                    # print(num_samples)

        # ------------------------------- store the annotated data ---------------------------
        f_Key = {}
        for feature in feature_list:
            l_type = []
            for j in range(item_T.shape[0]):
                if item_T['type'].iloc[j] == feature:
                    return_word = [item_T['word'].iloc[j]]
                    l_type = l_type + return_word
            f_Key.update({feature: l_type})
        feature_dataframe = feature_dataframe.append(f_Key, ignore_index=True)

    FinalResult = pd.concat([whole_data, feature_dataframe], axis=1)
    fpath = 'FinalResult.csv'
    pd.DataFrame(FinalResult).to_csv(fpath)
def train(): # load data sets train_sentences = load_sentences( FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # dimension:num_sentence*len_sentence*2 dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme( train_sentences, FLAGS.tag_schema) # dimension:num_sentence*len_sentence*2 update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: # 如果使用预训练的词嵌入 dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[ 0] # dico_chars_train dimension: 训练数据集中出现的字符类别数*2, dico_chars, char_to_id, id_to_char = augment_with_pretrained( # 利用测试数据样本集中的字对dico_chars_train进行补充 dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: # 创建map_file文件 pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset( train_sentences, char_to_id, tag_to_id, FLAGS.lower) # dimension: NumSentence*4*LenSentence dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." % (len(train_data), len(dev_data), len(test_data))) train_manager = BatchManager( train_data, FLAGS.batch_size ) # batch_data dimension: BatchNum*4*BatchSize*MaxLenSentence dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): # 若已有config_file则读取加载 config = load_config(FLAGS.config_file) else: # 若没有config_file则新建并保存为文件 config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # 将config打印到日志文件 # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # 动态申请内存 steps_per_epoch = train_manager.len_data # len_data: ceil(NumSentence/BatchSize) with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(FLAGS.max_epoch): # 括号中数字是epoach数量 for batch in train_manager.iter_batch( shuffle=True ): # 一次从batch_data中取出一个batch,Shuffle为True表示打乱batch_data的顺序 step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger) # View the tensorboard graph by running the following code and then going to the terminal and typing: # tensorboard --logdir = 
tensorboard_logs merged = tf.summary.merge_all() if not os.path.exists('tensorboard_logs/'): os.makedirs('tensorboard_logs/') my_writer = tf.summary.FileWriter('tensorboard_logs/', sess.graph)
class Main: def __init__(self): self.train_sentences = None # 用于存储训练集语句中的字符及标签 self.dev_sentences = None # 用于存储验证集语句中字符及标签 self.char_to_id = None # 字符char到索引id的映射字典 self.id_to_char = None # 索引id到字符char的映射字典 self.tag_to_id = None # 标签tag到索引id的映射字典 self.id_to_tag = None # 索引id到标签tag的映射字典 self.train_batch_manager = None # 训练集的batch管理类 self.dev_batch_manager = None # 验证集的batch管理类 @staticmethod def config_model(char_to_id, tag_to_id): """ 设置模型参数 :param char_to_id:词到索引的映射字典 :param tag_to_id:标签到索引的映射字典 :return:config:dict """ config = OrderedDict() config["num_chars"] = len(char_to_id) config["char_dim"] = FLAGS.char_dim config["num_tags"] = len(tag_to_id) config["seg_dim"] = FLAGS.seg_dim config["lstm_dim"] = FLAGS.lstm_dim config["batch_size"] = FLAGS.batch_size config["emb_file"] = FLAGS.emb_file config["clip"] = FLAGS.clip config["dropout_keep"] = 1.0 - FLAGS.dropout config["optimizer"] = FLAGS.optimizer config["lr"] = FLAGS.lr config["tag_schema"] = FLAGS.tag_schema config["pre_emb"] = FLAGS.pre_emb config["zeros"] = FLAGS.zeros config["lower"] = FLAGS.lower return config @staticmethod def evaluate(sess, model, name, data, id_to_tag, logger): if name == "dev": logger.info("evaluate dev data......") ner_results = model.predict(sess, data, id_to_tag) # 对验证集进行预测,得到对各个实体的预测 # 将预测结果写入到原数据并输出,然后计算并评估识别性能 eval_lines = result_write_evaluate(ner_results, FLAGS.result_path, name, size_train_data) for line in eval_lines: logger.info(line) f1 = float(eval_lines[1].strip().split()[-1]) best_test_f1 = model.best_dev_f1.eval() if f1 > best_test_f1: tf.assign(model.best_dev_f1, f1).eval() logger.info("new best dev f1 score:{:>.3f}".format(f1)) return f1 > best_test_f1 def get_sentences_dict(self): """ 加载数据集中的语句,将每个语句的字符和标签存储为列表,然后生成字符和标签与索引id的双向映射字典 :return: """ # 加载数据集中的语句,将每个语句的字符和标签存储为列表 self.train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) self.dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) # print("dev_sentences:", self.dev_sentences) # 原数据的标注模式与需要的标注模式不同时用update_tag_scheme函数对标注模式进行转换,转换成指定的IOB或者IOBES # update_tag_scheme(train_sentences, FLAGS.tag_schema) # update_tag_scheme(test_sentences, FLAGS.tag_schema) if not os.path.isfile(FLAGS.map_file): # 若map_file不存在,则根据数据集和预训练词向量文件初始化各个映射字典 # 若使用预训练的词向量 if FLAGS.pre_emb: # 得到train_sentences中字符的字典,键值对为word-词频 dico_chars_train = char_mapping(self.train_sentences, FLAGS.lower)[0] # 用预训练词向量文件扩充字典(目的为尽可能地扩充字典、使更多字符能基于预训练的词向量进行初始化)并得到word与id的双向映射字典。 dico_chars, self.char_to_id, self.id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list(itertools.chain.from_iterable( [[w[0] for w in s] for s in self.dev_sentences]) ) ) else: # 若不使用预训练的词向量 _c, self.char_to_id, self.id_to_char = char_mapping(self.train_sentences, FLAGS.lower) _t, self.tag_to_id, self.id_to_tag = tag_mapping(self.train_sentences) # 标签和索引之间的双向映射字典 print("tag_to_id", self.tag_to_id, len(self.tag_to_id)) # 将得到的映射字典存入文件,以免重复初始化 with open(FLAGS.map_file, "wb") as f: pickle.dump([self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag], f) else: # 若map_file存在,则直接从文件中恢复各个映射字典 with open(FLAGS.map_file, "rb") as f: self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag = pickle.load(f) def get_batch_data(self): """ 得到训练集和验证集的batch管理类:首先基于各映射字典对训练集和验证集的语句序列进行处理,得到每个语句的各特征列表以及 真实标签列表,然后获取batch管理类,用于生成batch数据 :return: """ if not os.path.isfile(FLAGS.train_dev_file): train_data = prepare_dataset(self.train_sentences, self.char_to_id, self.tag_to_id, FLAGS.lower) dev_data = prepare_dataset(self.dev_sentences, 
self.char_to_id, self.tag_to_id, FLAGS.lower) with open(FLAGS.train_dev_file, "wb") as f: pickle.dump([train_data, dev_data], f) else: with open(FLAGS.train_dev_file, "rb") as f: train_data, dev_data = pickle.load(f) print("%i / %i sentences in train / dev ." % (len(train_data), len(dev_data))) self.train_batch_manager = BatchManager(train_data, int(FLAGS.batch_size)) self.dev_batch_manager = BatchManager(dev_data, int(FLAGS.batch_size)) def get_config(self): """ 从模型参数配置文件中获取参数或者用config_model函数生成参数并存储 :return:日志logger及参数列表config """ make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = self.config_model(self.char_to_id, self.tag_to_id) save_config(config, FLAGS.config_file) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) return logger, config def train(self): self.get_sentences_dict() self.get_batch_data() logger, config = self.get_config() tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # limit GPU memory steps_per_epoch = self.train_batch_manager.len_data # 每一轮epoch的batch数量 with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, config, self.id_to_char, logger) logger.info("start training") loss = [] for i in range(FLAGS.max_epoch): for batch in self.train_batch_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, ""NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] # 对验证集进行预测和评估 best = self.evaluate(sess, model, "dev", self.dev_batch_manager, self.id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) @ staticmethod def predict(): """ 对一个数据集进行实体识别 :return: """ config = load_config(FLAGS.config_file) logger = get_logger(FLAGS.log_file) tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # limit GPU memory # 从训练阶段生成的map_file中恢复各映射字典 with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower, train=False) test_manager = BatchManager(test_data, 1) with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, config, id_to_char, logger) logger.info("predict data......") ner_results = model.predict(sess, test_manager, id_to_tag) result_write_evaluate(ner_results, FLAGS.result_path, "test") @staticmethod def predict_line(): """ 对一个语句实例进行实体识别测试 :return: """ config = load_config(FLAGS.config_file) logger = get_logger(FLAGS.log_file) tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, config, id_to_char, logger) # 对单个句子进行预测 while True: line = input("请输入测试句子:") result = model.predict_line(sess, input_from_line(line, char_to_id), id_to_tag) print(result)
def train(): # load data sets # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) all_train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) train_sentences, dev_sentences = split_train_dev(all_train_sentences) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # update_tag_scheme(dev_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: # dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars_train = char_mapping(all_train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(all_train_sentences, FLAGS.lower) # _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(all_train_sentences) # _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # nlp = StanfordCoreNLP(r'E:\DC\dataset\泰一指尚评测数据\stanford-corenlp-full-2017-06-09') #l_sorted_lexcion = load_lexcion(FLAGS.lexcion_file, nlp) l_sorted_lexcion = [] # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." 
% (len(train_data), len(dev_data), len(test_data))) max_len = max( [len(sentence[0]) for sentence in train_data + test_data + dev_data]) train_manager = BatchManager(train_data, FLAGS.batch_size, max_len) dev_manager = BatchManager(dev_data, 800, max_len) test_manager = BatchManager(test_data, 800, max_len) # random.shuffle(train_data) # pad_test_data = pad_data(test_data) # pad_dev_data = pad_data(dev_data) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id, max_len) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(FLAGS.max_epoch): random.shuffle(train_data) pad_train_data = pad_data(train_data, max_len) strings, chars, lexcion_teatures, pos_ids, dep_ids, head_ids, targets = pad_train_data for j in range(0, len(strings), FLAGS.batch_size): batch = [ strings[j:j + FLAGS.batch_size], chars[j:j + FLAGS.batch_size], lexcion_teatures[j:j + FLAGS.batch_size], pos_ids[j:j + FLAGS.batch_size], dep_ids[j:j + FLAGS.batch_size], head_ids[j:j + FLAGS.batch_size], targets[j:j + FLAGS.batch_size] ] step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "AS loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger, i) evaluate(sess, model, "test", test_manager, id_to_tag, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def train():
    # load data sets
    datasets = load_sentences(FLAGS.train_file, FLAGS.lower)
    random.shuffle(datasets)
    train_sentences = datasets[:14000]
    test_sentences = datasets[14000:]

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        char_to_id, _ = elmo_char_mapping(FLAGS.elmo_vocab)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i sentences in train / dev." % (len(train_data), len(test_data)))

    elmo_batcher = get_batcher()
    train_manager = BatchManager(train_data, FLAGS.batch_size, elmo_batcher)
    test_manager = BatchManager(test_data, FLAGS.batch_size, elmo_batcher)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        elmo_model = load_elmo()
        model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "test", test_manager, id_to_tag, logger)
            # evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
def train(param):
    # sanity-check the parameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"

    # data preparation
    train_manager = BatchManager(param.batch_size, name='train')
    number_dataset = train_manager.len_data
    print("total of number train data is {}".format(number_dataset))

    # create the needed folders
    make_path(param)
    # set up the logger
    logger = get_logger(param.train_log_file)
    # load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # load senc_tag in preparation for loading the pretrained vectors
    senc_tag = get_sent_tag(param.sent_tag_file)
    # augment the dictionary with the pretrained vectors
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        mapping_dict['word'][2].copy(),
        param.emb_file,
        list(itertools.chain.from_iterable([[w[0] for w in s] for s in senc_tag])))

    # total number of training batches
    steps_per_epoch = train_manager.len_data

    # configure the GPU
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        # initialize the model
        model = creat_model(sess, Model, param.ckpt_path, load_word2vec, param,
                            id_to_char, logger, map_all=mapping_dict)
        for i in range(param.max_epoch):
            loss = []
            total_loss = 0
            # record the start time
            start = time.time()
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, batch)
                # running average loss over the check interval
                loss.append(batch_loss)
                # accumulate the total loss, averaged over the whole epoch later
                total_loss += batch_loss
                if step % 5 == 0:
                    logger.info("epoch:{}, step:{}/{}, avg_loss:{:>9.4f}".format(
                        i + 1, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
            # save the model
            model.save_model(sess, logger, i)
            logger.info('Epoch {}, total Loss {:.4f}'.format(
                i + 1, total_loss / train_manager.len_data))
            logger.info(
                'Time taken for one epoch {:.4f} min, take {:.2f} h for rest of epoch\n'.format(
                    (time.time() - start) / 60,
                    ((param.max_epoch - i + 1) * (time.time() - start)) / 3600))
def train(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist, load data if exists maps if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." % (len(train_data), 0, len(test_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # 设置训练日志目录 train_log = os.path.join(FLAGS.logdir, "train") if not os.path.exists(train_log): os.makedirs(train_log) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data # the nums of batch data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) # 观察所建立的计算图 train_writer = tf.summary.FileWriter(train_log, sess.graph) logger.info("start training") loss = [] dev_f1 = [] test_f1 = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss, merged = model.run_step( sess, True, batch) # step是global step # 在迭代中输出到结果 train_writer.add_summary(merged, step) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] # use dev data to validation the model best, dev_f1_value = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) # store the dev f1 dev_f1.append(dev_f1_value) if best: save_model(sess, model, FLAGS.ckpt_path, logger) # use current the model to test _, test_f1_value = evaluate(sess, model, "test", test_manager, id_to_tag, logger) # store the test f1 test_f1.append(test_f1_value) # write the dev_f1 and test_f1 to file f1_result = {} f1_result["dev_f1"] = dev_f1 
f1_result["test_f1"] = test_f1 write_data_to_file(f1_result, "f1_result")
def train(X_train, X_dev, X_test):
    # load data sets
    train_sentences = X_train
    dev_sentences = X_dev
    test_sentences = X_test
    train_sentences_loc = load_sentences(FLAGS.train_file_loc, FLAGS.lower, FLAGS.zeros)
    dev_sentences_loc = load_sentences(FLAGS.dev_file_loc, FLAGS.lower, FLAGS.zeros)
    test_sentences_loc = load_sentences(FLAGS.test_file_loc, FLAGS.lower, FLAGS.zeros)
    train_sentences_org = load_sentences(FLAGS.train_file_org, FLAGS.lower, FLAGS.zeros)
    dev_sentences_org = load_sentences(FLAGS.dev_file_org, FLAGS.lower, FLAGS.zeros)
    test_sentences_org = load_sentences(FLAGS.test_file_org, FLAGS.lower, FLAGS.zeros)
    train_sentences_per = load_sentences(FLAGS.train_file_per, FLAGS.lower, FLAGS.zeros)
    dev_sentences_per = load_sentences(FLAGS.dev_file_per, FLAGS.lower, FLAGS.zeros)
    test_sentences_per = load_sentences(FLAGS.test_file_per, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_loc, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_per, FLAGS.tag_schema)
    update_tag_scheme(train_sentences_org, FLAGS.tag_schema)
    update_tag_scheme(test_sentences_org, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences]))
            )
            dico_chars_train_loc = char_mapping(train_sentences_loc, FLAGS.lower)[0]
            dico_chars_loc, char_to_id_loc, id_to_char_loc = augment_with_pretrained(
                dico_chars_train_loc.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_loc]))
            )
            dico_chars_train_per = char_mapping(train_sentences_per, FLAGS.lower)[0]
            dico_chars_per, char_to_id_per, id_to_char_per = augment_with_pretrained(
                dico_chars_train_per.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_per]))
            )
            dico_chars_train_org = char_mapping(train_sentences_org, FLAGS.lower)[0]
            dico_chars_org, char_to_id_org, id_to_char_org = augment_with_pretrained(
                dico_chars_train_org.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences_org]))
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
            _c_loc, char_to_id_loc, id_to_char_loc = char_mapping(train_sentences_loc, FLAGS.lower)
            _c_per, char_to_id_per, id_to_char_per = char_mapping(train_sentences_per, FLAGS.lower)
            _c_org, char_to_id_org, id_to_char_org = char_mapping(train_sentences_org, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        _t_loc, tag_to_id_loc, id_to_tag_loc = tag_mapping(train_sentences_loc)
        _t_per, tag_to_id_per, id_to_tag_per = tag_mapping(train_sentences_per)
        _t_org, tag_to_id_org, id_to_tag_org = tag_mapping(train_sentences_org)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,
                         char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
                         char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
                         char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            (char_to_id, id_to_char, tag_to_id, id_to_tag,
             char_to_id_loc, id_to_char_loc, tag_to_id_loc, id_to_tag_loc,
             char_to_id_per, id_to_char_per, tag_to_id_per, id_to_tag_per,
             char_to_id_org, id_to_char_org, tag_to_id_org, id_to_tag_org) = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_data_loc = prepare_dataset_ner(
        train_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    dev_data_loc = prepare_dataset_ner(
        dev_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    test_data_loc = prepare_dataset_ner(
        test_sentences_loc, char_to_id_loc, tag_to_id_loc, FLAGS.lower
    )
    print("%i / %i / %i sentences_loc in train / dev / test." % (
        len(train_data_loc), len(dev_data_loc), len(test_data_loc)))

    train_data_per = prepare_dataset_ner(
        train_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    dev_data_per = prepare_dataset_ner(
        dev_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    test_data_per = prepare_dataset_ner(
        test_sentences_per, char_to_id_per, tag_to_id_per, FLAGS.lower
    )
    print("%i / %i / %i sentences_per in train / dev / test." % (
        len(train_data_per), len(dev_data_per), len(test_data_per)))

    train_data_org = prepare_dataset_ner(
        train_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    dev_data_org = prepare_dataset_ner(
        dev_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    test_data_org = prepare_dataset_ner(
        test_sentences_org, char_to_id_org, tag_to_id_org, FLAGS.lower
    )
    print("%i / %i / %i sentences_org in train / dev / test." % (
        len(train_data_org), len(dev_data_org), len(test_data_org)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    train_manager_loc = BatchManager(train_data_loc, FLAGS.batch_size)
    train_manager_per = BatchManager(train_data_per, FLAGS.batch_size)
    train_manager_org = BatchManager(train_data_org, FLAGS.batch_size)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id,
                              char_to_id_loc, tag_to_id_loc,
                              char_to_id_per, tag_to_id_per,
                              char_to_id_org, tag_to_id_org)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    steps_per_epoch_loc = train_manager_loc.len_data
    steps_per_epoch_per = train_manager_per.len_data
    steps_per_epoch_org = train_manager_org.len_data
    model = create_model(Model, FLAGS.ckpt_path, load_word2vec, config,
                         id_to_char, id_to_char_loc, id_to_char_per, id_to_char_org, logger)
    with tf.Session(config=tf_config, graph=model.graph) as sess:
        sess.run(tf.global_variables_initializer())
        if config["pre_emb"]:
            emb_weights = sess.run(model.char_lookup.read_value())
            emb_weights_ner = sess.run(model.char_lookup.read_value())
            emb_weights, emb_weights_ner = load_word2vec(
                config["emb_file"], id_to_char, id_to_char_loc, id_to_char_per, id_to_char_org,
                config["char_dim"], emb_weights, emb_weights_ner)
            sess.run(model.char_lookup.assign(emb_weights))
            logger.info("Load pre-trained embedding.")

        logger.info("start training")
        loss = []
        loss_loc = []
        loss_per = []
        loss_org = []
        for i in range(100):
            # auxiliary LOC NER task
            for batch_loc in train_manager_loc.iter_batch(shuffle=True):
                step_loc, batch_loss_loc = model.run_step_ner(sess, True, batch_loc)
                loss_loc.append(batch_loss_loc)
                if step_loc % FLAGS.steps_check == 0:
                    iteration_loc = step_loc // steps_per_epoch_loc + 1
                    logger.info("iteration:{} step_loc:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration_loc, step_loc % steps_per_epoch_loc,
                                    steps_per_epoch_loc, np.mean(loss_loc)))
                    loss_loc = []
            # main SKILL task
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_1 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "SKILL loss:{:>9.6f}".format(
                                    iteration_1, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_loc_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_loc_test = model.precision(sess, test_manager, id_to_tag)

            # auxiliary PER NER task
            for batch_per in train_manager_per.iter_batch(shuffle=True):
                step_per, batch_loss_per = model.run_step_ner(sess, True, batch_per)
                loss_per.append(batch_loss_per)
                if step_per % FLAGS.steps_check == 0:
                    iteration_per = step_per // steps_per_epoch_per + 1
                    logger.info("iteration:{} step_per:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration_per, step_per % steps_per_epoch_per,
                                    steps_per_epoch_per, np.mean(loss_per)))
                    loss_per = []
            # main SKILL task
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_2 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "SKILL loss:{:>9.6f}".format(
                                    iteration_2, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_per_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_per_test = model.precision(sess, test_manager, id_to_tag)

            # auxiliary ORG NER task
            for batch_org in train_manager_org.iter_batch(shuffle=True):
                step_org, batch_loss_org = model.run_step_ner(sess, True, batch_org)
                loss_org.append(batch_loss_org)
                if step_org % FLAGS.steps_check == 0:
                    iteration_org = step_org // steps_per_epoch_org + 1
                    logger.info("iteration:{} step_org:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration_org, step_org % steps_per_epoch_org,
                                    steps_per_epoch_org, np.mean(loss_org)))
                    loss_org = []
            # main SKILL task
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration_3 = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "SKILL loss:{:>9.6f}".format(
                                    iteration_3, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            precision_org_dev = model.precision(sess, dev_manager, id_to_tag)
            precision_org_test = model.precision(sess, test_manager, id_to_tag)

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag,
                            precision_loc_dev, precision_per_dev, precision_org_dev, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            best_test, results = evaluate(sess, model, "test", test_manager, id_to_tag,
                                          precision_loc_test, precision_per_test, precision_org_test, logger)
            with open("CDTL_PSE-result.csv", "a", encoding='utf-8') as st_re:
                st_re.write(str(results).replace("[", "").replace("]", ""))
                st_re.write("\n")
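# The map file above is a flat 16-element list, so pickle.dump and pickle.load must agree
# on the exact order of the entries. A small sketch of an order-independent alternative
# using a dict (the helper names and file layout below are illustrative assumptions, not
# the project's actual format):
import pickle

def save_maps_sketch(path, **maps):
    # e.g. save_maps_sketch("maps.pkl", char_to_id=..., tag_to_id_loc=..., ...)
    with open(path, "wb") as f:
        pickle.dump(maps, f)

def load_maps_sketch(path):
    with open(path, "rb") as f:
        maps = pickle.load(f)
    return maps  # access by key, e.g. maps["char_to_id"]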
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower, FLAGS.self_seg
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    # sentences shorter than the maximum length are padded with 0
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)
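# BatchManager is a project class; the comment above only states that short sentences are
# padded with 0. A minimal, self-contained sketch of that behaviour (sorting by length and
# padding each batch to its own maximum are assumptions about the real class, not its
# actual implementation):
import math
import random

class PaddingBatchManagerSketch(object):
    def __init__(self, data, batch_size):
        # data: list of samples, each sample a list of parallel sequences of equal length
        self.batch_data = self._sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def _sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / float(batch_size)))
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        return [self._pad(sorted_data[i * batch_size:(i + 1) * batch_size])
                for i in range(num_batch)]

    @staticmethod
    def _pad(batch):
        # pad every sequence of every sample with 0 up to the longest sample in the batch
        max_len = max(len(sample[0]) for sample in batch)
        padded = []
        for sample in batch:
            pad = [0] * (max_len - len(sample[0]))
            padded.append([seq + pad for seq in sample])
        return padded

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch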
def main(_):
    if not os.path.isdir(FLAGS.log_path):
        os.makedirs(FLAGS.log_path)
    if not os.path.isdir(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    if not os.path.isdir(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)
    tag_to_id = FLAGS.tag_to_id

    # load data
    id_to_word, id_to_tag, train_data, dev_data, test_data = load_data(FLAGS, tag_to_id)
    train_manager = BatchManager(train_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model = create_model(sess, id_to_word, id_to_tag)
        loss = 0
        best_test_f1 = 0
        steps_per_epoch = len(train_data) // FLAGS.batch_size + 1
        for _ in range(FLAGS.max_epoch):
            iteration = (model.global_step.eval()) // steps_per_epoch + 1
            train_manager.shuffle()
            for batch in train_manager.iter_batch():
                global_step = model.global_step.eval()
                step = global_step % steps_per_epoch
                batch_loss = model.run_step(sess, True, batch)
                loss += batch_loss / FLAGS.steps_per_checkpoint
                if global_step % FLAGS.steps_per_checkpoint == 0:
                    model.logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step, steps_per_epoch, loss))
                    loss = 0

            model.logger.info("validating ner")
            ner_results = model.predict(sess, dev_manager)
            eval_lines = conll_eval(ner_results, FLAGS.result_path)
            for line in eval_lines:
                model.logger.info(line)
            test_f1 = float(eval_lines[1].strip().split()[-1])
            if test_f1 > best_test_f1:
                best_test_f1 = test_f1
                model.logger.info("new best f1 score:{:>.3f}".format(test_f1))
                model.logger.info("saving model ...")
                checkpoint_path = os.path.join(FLAGS.model_path, "translate.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)

        # test model
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        ner_results = model.predict(sess, test_manager)
        eval_lines = conll_eval(ner_results, FLAGS.result_path)
        for line in eval_lines:
            model.logger.info(line)
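# test_f1 above is pulled out of the evaluation report with eval_lines[1].strip().split()[-1].
# Assuming a conlleval-style summary (which is what this indexing implies), the second line
# looks like the example below, and the last whitespace-separated token is the overall FB1:
example_line = "accuracy:  98.40%; precision:  85.00%; recall:  82.00%; FB1:  83.47"
print(float(example_line.strip().split()[-1]))  # 83.47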
    SRL_Model = create_model(Model, logger, word2idx, pumsa2idx, char2idx, label2idx,
                             lemma2idx, word_embedding_matrix)
    if config.mode == "train":
        ELMo_dict, context_embeddings_op, ELMo_context, ELMo_ids = load_ELMo()
        sess.run(tf.global_variables_initializer())
        train_dataset = load_data(config.train_path)
        test_dataset = load_data(config.test_path)
        train_data = prepare_dataset(train_dataset, word2idx, pumsa2idx, char2idx,
                                     lemma2idx, label2idx, ELMo_dict)
        test_data = prepare_dataset(test_dataset, word2idx, pumsa2idx, char2idx,
                                    lemma2idx, label2idx, ELMo_dict)
        print("%i / %i sentences in train / test " % (len(train_data), len(test_data)))
        train_manager = BatchManager(train_data, label2idx)
        test_manager = BatchManager(test_data, label2idx)
        train(sess)
    elif config.mode == "tagging":
        saver = tf.train.Saver()
        saver.restore(sess, config.ckpt_path)
        ELMo_dict, context_embeddings_op, ELMo_context, ELMo_ids = load_ELMo()
        init_ELMo(sess)
        unlabeled_dataset = load_data(config.unlabeld_path)
        unlabeled_data = prepare_dataset(unlabeled_dataset, word2idx, pumsa2idx, char2idx,
                                         lemma2idx, label2idx, ELMo_dict)
        print("%i sentences in unlabeled " % len(unlabeled_data))
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES); I: inside, O: outside, B: begin | E: end, S: single
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # the map file stores char_to_id, id_to_char, tag_to_id and id_to_tag
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences]))
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences, FLAGS.id_to_tag_path, FLAGS.tag_to_id_path)
        # with open('maps.txt', 'w', encoding='utf8') as f1:
        #     f1.writelines(str(char_to_id) + " " + id_to_char + " " + str(tag_to_id) + " " + id_to_tag + '\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices:
    # train_data[0][0]: the sentence;
    # train_data[0][1]: the id of each character;
    # train_data[0][2]: segmentation features after word segmentation
    #                   (0 for a single-character word, 1,2,...,2,3 for longer words);
    # train_data[0][3]: the tag of each character
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    # split the data into batches of batch_size
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        # tf.device("/cpu:0") pins these ops to the CPU (by default they would run on GPU:0)
        with tf.device("/cpu:0"):
            for i in range(100):
                # train the model batch by batch; training starts here, so the whole
                # network can be traced backwards from this call
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    # log progress: iteration is the current epoch number
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                                        iteration, step % steps_per_epoch,
                                        steps_per_epoch, np.mean(loss)))
                        loss = []
                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i % 7 == 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
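# The comment on prepare_dataset above describes each training sample as four parallel
# lists. A hand-made example of what one sample might look like for the two-word sentence
# "北京 欢迎" (the character ids and tag ids below are made up for illustration; only the
# structure matches the comment):
example_sample = [
    list("北京欢迎"),   # train_data[0][0]: the characters of the sentence
    [15, 27, 102, 88],   # train_data[0][1]: the id of each character
    [1, 3, 1, 3],        # train_data[0][2]: segmentation features (two 2-character words)
    [2, 3, 0, 0],        # train_data[0][3]: tag id per character, e.g. B-LOC, I-LOC, O, O
]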
def get_batch_manager(id_to_tag, tag_to_id, text, word_to_id):
    test_file = get_test_data2(text)
    # load the data; the manager below is an iterator over padded batches
    test_data = prepare_data(test_file, word_to_id, tag_to_id, FLAGS.word_max_len)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len,
                                FLAGS.valid_batch_size)
    return test_manager
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            # if best:
            save_model(sess, model, FLAGS.ckpt_path, logger)
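# The logging above derives the epoch number and the position inside the epoch from the
# global step counter. A small worked example of that arithmetic (numbers chosen purely
# for illustration):
steps_per_epoch_example = 250
for global_step_example in (1, 250, 251, 755):
    iteration_example = global_step_example // steps_per_epoch_example + 1
    step_in_epoch_example = global_step_example % steps_per_epoch_example
    print(iteration_example, step_in_epoch_example)
# prints: 1 1 / 2 0 / 2 1 / 4 5, i.e. step 755 is the 5th step of the 4th epoch
# (the 250 case shows the off-by-one quirk at an exact epoch boundary)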
def train_new():
    train_sent = load_sentences(FLAGS.filepath)
    update_tag_scheme(train_sent, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        _c, char_to_id, id_to_char = char_mapping(train_sent, FLAGS.lower)
        print("random embedding")
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sent)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare the data: shuffle it and split it into training and validation sets
    np.random.seed(10)
    train_sent_ = np.array(train_sent)
    shuffle_indices = np.random.permutation(np.arange(len(train_sent)))
    sent_shuffled = train_sent_[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_percentage * float(len(train_sent)))
    train_sent_new, dev_sent = sent_shuffled[:dev_sample_index], sent_shuffled[dev_sample_index:]

    train_data = prepare_dataset(train_sent_new, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sent, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i sentences in train / dev." % (len(train_data), len(dev_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = FLAGS.log_file
    logger = get_logger(log_path)
    print_config(config, logger)

    # let TensorFlow grow GPU memory usage on demand
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        # live plots: training loss (top panel) and dev F1 (bottom panel)
        fig = plt.figure()
        ax = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        plt.grid(True)
        plt.ion()

        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % 20 == 0:
                    ax.scatter(step, np.mean(loss), c='b', marker='.')
                    plt.pause(0.001)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best, f1 = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            ax2.scatter(i + 1, f1, c='b', marker='.')
            plt.pause(0.001)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, "best")
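# train_new splits off the validation set with a negative index:
# dev_sample_index = -int(dev_percentage * len(data)), so the last |dev_sample_index|
# shuffled sentences become the dev set. A small self-contained illustration of that
# arithmetic (the 0.2 dev percentage and the 10-element toy list are arbitrary):
import numpy as np

toy_data = list(range(10))
np.random.seed(10)
toy_shuffled = np.array(toy_data)[np.random.permutation(len(toy_data))]
toy_dev_index = -1 * int(0.2 * float(len(toy_data)))   # -2
toy_train, toy_dev = toy_shuffled[:toy_dev_index], toy_shuffled[toy_dev_index:]
print(len(toy_train), len(toy_dev))  # 8 2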