def create_instance():
    tag_to_id = FLAGS.tag_to_id
    id_to_tag = {v: k for k, v in tag_to_id.items()}

    # build the word dictionary
    print("dict building......")
    if not isExists(FLAGS.dict_file):
        print("build dict starting...")
        train_file = read_conll_file(FLAGS.train_file)
        word_to_id, _ = word_mapping(train_file, FLAGS.min_freq)
        write_file(word_to_id, FLAGS.dict_file)
    else:
        print("build dict from pickle...")
        word_to_id = load_dict(FLAGS.dict_file)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        print("begin to create model...")
        model = create_model(sess, word_to_id, id_to_tag)  # build the graph only

        # load model parameters from the latest checkpoint
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    return word_to_id, tag_to_id, id_to_tag, sess, model
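# write_file and load_dict used above are assumed to be simple pickle-based
# helpers for persisting the word dictionary; a minimal sketch (names and
# behaviour assumed here, not taken from the source):
import pickle

def write_file(word_to_id, dict_file):
    # persist the word-to-id mapping so later runs can skip rebuilding it
    with open(dict_file, 'wb') as f:
        pickle.dump(word_to_id, f)

def load_dict(dict_file):
    # restore the word-to-id mapping written by write_file
    with open(dict_file, 'rb') as f:
        return pickle.load(f)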
def train():
    # load the datasets
    train_sentences = dl.load_sentences(FLAGS.train_file)
    dev_sentences = dl.load_sentences(FLAGS.dev_file)
    test_sentences = dl.load_sentences(FLAGS.test_file)

    # convert the tag scheme (BIO -> BIOES)
    dl.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    dl.update_tag_scheme(test_sentences, FLAGS.tag_schema)
    dl.update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        _, word_to_id, id_to_word = dl.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = dl.tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            unpickler = pickle.Unpickler(f)
            mappings = unpickler.load()
            word_to_id, id_to_word, tag_to_id, id_to_tag = mappings

    # convert each split with the same mappings
    train_data = dl.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = dl.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = dl.prepare_dataset(test_sentences, word_to_id, tag_to_id)
    print('train_data_num %i, dev_data_num %i, test_data_num %i' %
          (len(train_data), len(dev_data), len(test_data)))

    mu.make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = mu.load_config(FLAGS.config_file)
    else:
        config = mu.config_model(FLAGS, word_to_id, tag_to_id)
        mu.save_config(config, FLAGS.config_file)

    log_path = os.path.join('log', FLAGS.log_file)
    logger = mu.get_logger(log_path)
    mu.print_config(config, logger)
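# word_mapping and tag_mapping come from the data loader and are not shown here;
# a minimal sketch of what they are assumed to do: count items over the training
# sentences (lists of [word, tag] pairs) and derive id mappings sorted by
# frequency. The <PAD>/<UNK> entries are an assumption, not taken from the source.
def word_mapping(sentences):
    dico = {}
    for s in sentences:
        for w in s:
            dico[w[0]] = dico.get(w[0], 0) + 1
    dico['<PAD>'] = 10000001  # assumed special tokens with artificially high counts
    dico['<UNK>'] = 10000000
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_word = {i: v[0] for i, v in enumerate(sorted_items)}
    word_to_id = {v: k for k, v in id_to_word.items()}
    return dico, word_to_id, id_to_word

# tag_mapping is assumed to do the same over w[-1] (the tag column), without
# the special tokens.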
def train():
    # 1. load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # 2. convert the tag scheme (BIO -> BIOES)
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # 3. build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            # pickle.dump(obj, file[, protocol]) serializes obj and writes it to file
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        # pickle.load reads the file (through its read()/readline() interface)
        # and deserializes the data back into Python objects
        with open(FLAGS.map_file, "rb") as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # 4. convert the sentences into model-ready features
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    model_utils.make_path(FLAGS)
    config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
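# prepare_dataset is assumed to turn each sentence into the id features the
# model consumes; a minimal sketch (the [words, word_ids, tag_ids] layout and
# the <UNK> fallback are assumptions):
def prepare_dataset(sentences, word_to_id, tag_to_id):
    data = []
    for s in sentences:
        words = [w[0] for w in s]
        word_ids = [word_to_id.get(w, word_to_id['<UNK>']) for w in words]
        tag_ids = [tag_to_id[w[-1]] for w in s]
        data.append([words, word_ids, tag_ids])
    return data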
def train():
    # load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # convert the tag scheme (BIO -> BIOES)
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            # extend the training vocabulary with words covered by the
            # pretrained embeddings
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # prepare the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    # split the data into batches
    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    # create any missing output directories
    model_utils.make_path(FLAGS)

    # load or create the model configuration
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    # set up the logger
    log_path = os.path.join('log', FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    step_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info('start training')
        loss = []
        start = time.time()
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.setps_chech == 0:
                    iteration = step // step_per_epoch + 1
                    logger.info(
                        "iteration{}: step{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % step_per_epoch,
                            step_per_epoch, np.mean(loss)))
                    loss = []
            # evaluate on the dev set, keep the best checkpoint, then
            # report performance on the test set
            best = evaluate(sess, model, 'dev', dev_manager, id_to_tag, logger)
            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, 'test', test_manager, id_to_tag, logger)
        t = time.time() - start
        logger.info('cost time: %f' % t)
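# update_tag_scheme converts BIO tags to BIOES in place (FLAGS.tag_schema);
# a minimal sketch of the per-sentence conversion it is assumed to perform,
# assuming the input is already valid BIO:
def bio_to_bioes(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
        if tag == 'O':
            new_tags.append(tag)
        elif tag.startswith('B-'):
            # single-token entities become S-, otherwise keep B-
            new_tags.append(tag if nxt.startswith('I-') else 'S-' + tag[2:])
        elif tag.startswith('I-'):
            # the last token of a multi-token entity becomes E-
            new_tags.append(tag if nxt.startswith('I-') else 'E-' + tag[2:])
    return new_tags

# e.g. bio_to_bioes(['B-PER', 'I-PER', 'O', 'B-LOC'])
# -> ['B-PER', 'E-PER', 'O', 'S-LOC']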
data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)
data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)

# build the word and tag mappings
if not os.path.isfile(FLAGS.map_file):
    if FLAGS.pre_emb:
        dico_words_train = data_loader.word_mapping(train_sentences)[0]
        dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
            dico_words_train.copy(),
            FLAGS.emb_file,
            list(
                itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences]
                )
            )
        )
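# data_utils.augment_with_pretrained extends the training dictionary with words
# that have a pretrained embedding and occur in the given text; a minimal sketch,
# assuming a plain-text embedding file with the word as the first column
# (the behaviour and return layout are assumptions, not taken from the source):
def augment_with_pretrained(dictionary, emb_path, words):
    pretrained = set()
    with open(emb_path, 'r', encoding='utf-8') as f:
        for line in f:
            pretrained.add(line.rstrip().split()[0])
    # add every word from the given text that the embeddings cover
    for w in words:
        if w in pretrained and w not in dictionary:
            dictionary[w] = 0
    sorted_items = sorted(dictionary.items(), key=lambda x: (-x[1], x[0]))
    id_to_word = {i: v[0] for i, v in enumerate(sorted_items)}
    word_to_id = {v: k for k, v in id_to_word.items()}
    return dictionary, word_to_id, id_to_word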
def train():
    # load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # convert the tag scheme (BIO -> BIOES)
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable(
                        [[w[0] for w in s] for s in test_sentences]
                    )
                )
            )
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # prepare the data and split it into batches
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)
    print('train_data_num %i, dev_data_num %i, test_data_num %i' %
          (len(train_data), len(dev_data), len(test_data)))

    model_utils.make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, load_word2vec,
                                   config, id_to_word, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.setps_chech == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []
            # evaluate on the dev set, keep the best checkpoint, then
            # report performance on the test set
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
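# data_utils.BatchManager is not shown; a minimal sketch of the interface the
# training loop relies on (len_data and iter_batch). The [words, word_ids, tag_ids]
# item layout and zero-padding are assumptions, not taken from the source:
import random

class BatchManager(object):
    def __init__(self, data, batch_size):
        # sort by sentence length so each batch needs little padding
        data = sorted(data, key=lambda x: len(x[0]))
        self.batch_data = [self._pad(data[i:i + batch_size])
                           for i in range(0, len(data), batch_size)]
        self.len_data = len(self.batch_data)

    @staticmethod
    def _pad(batch):
        # pad word ids and tag ids to the longest sentence in the batch
        max_len = max(len(item[0]) for item in batch)
        words, word_ids, tag_ids = [], [], []
        for w, w_ids, t_ids in batch:
            pad = [0] * (max_len - len(w))
            words.append(w + [''] * (max_len - len(w)))
            word_ids.append(w_ids + pad)
            tag_ids.append(t_ids + pad)
        return [words, word_ids, tag_ids]

    def iter_batch(self, shuffle=False):
        # yield one padded batch at a time, optionally in random order
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch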