def load(train_file='example.train', dev_file='example.dev', test_file='example.test',
         lower=True, zeros=True, tag_schema='iobes', map_file='map.pkl',
         pre_emb=True, emb_file='wiki_100.utf8'):
    train_file = get_data_path(train_file)
    dev_file = get_data_path(dev_file)
    test_file = get_data_path(test_file)
    map_file = get_data_path(map_file)
    emb_file = get_data_path(emb_file)

    train_sentences = load_sentences(train_file, lower, zeros)
    dev_sentences = load_sentences(dev_file, lower, zeros)
    test_sentences = load_sentences(test_file, lower, zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_schema)
    update_tag_scheme(test_sentences, tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(map_file):
        # create dictionary for characters
        if pre_emb:
            dico_chars_train = char_mapping(train_sentences, lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, lower)
        # create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of index lists per sentence
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))
    return train_data, dev_data, test_data, char_to_id, tag_to_id, id_to_char, id_to_tag
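# load() relies on update_tag_scheme(sentences, 'iobes') to convert IOB tags to the
# IOBES scheme. The helper below is a minimal sketch of that conversion for a single
# sentence, written here for illustration; it is an assumption about the loader's
# behaviour, not the repo's actual implementation (which also validates IOB first).
def iob_to_iobes_sketch(tags):
    """Convert one sentence's IOB2 tag sequence to IOBES."""
    iobes = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
        if tag == 'O':
            iobes.append(tag)
        elif tag.startswith('B-'):
            # single-token entities become S-, multi-token entities keep B-
            iobes.append(tag if nxt == 'I-' + tag[2:] else 'S-' + tag[2:])
        elif tag.startswith('I-'):
            # the last token of an entity becomes E-
            iobes.append(tag if nxt == 'I-' + tag[2:] else 'E-' + tag[2:])
        else:
            raise ValueError('invalid IOB tag: %s' % tag)
    return iobes

# e.g. ['B-PER', 'I-PER', 'O', 'B-LOC'] -> ['B-PER', 'E-PER', 'O', 'S-LOC']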
def evaluate_test():
    config = load_config(args.config_file)
    logger = get_logger(args.log_file)
    with open(args.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag, intent_to_id, id_to_intent = \
            pickle.load(f)

    test_sentences = load_sentences(args.test_file, args.lower, args.zeros)
    update_tag_scheme(test_sentences, args.tag_schema)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                intent_to_id, args.lower)
    test_manager = BatchManager(test_data, 100)

    # limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, args.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
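# BatchManager(data, batch_size) is used above and in the training functions below but
# is not defined in this section. The class below is a minimal sketch of what such a
# manager typically does (sort by length, split into fixed-size batches, pad each batch
# to its longest sentence); the class name, data layout, and padding value 0 are
# assumptions for illustration only.
import random

class SimpleBatchManager(object):
    def __init__(self, data, batch_size):
        # each item is a list of parallel sequences, e.g. [chars, seg_ids, tag_ids]
        data = sorted(data, key=lambda item: len(item[0]))
        self.batch_data = [self._pad(data[i:i + batch_size])
                           for i in range(0, len(data), batch_size)]
        self.len_data = len(self.batch_data)

    @staticmethod
    def _pad(batch):
        max_len = max(len(item[0]) for item in batch)
        padded = []
        for item in batch:
            pad = [0] * (max_len - len(item[0]))
            padded.append([seq + pad for seq in item])
        return padded

    def iter_batch(self, shuffle=False):
        order = list(range(self.len_data))
        if shuffle:
            random.shuffle(order)
        for idx in order:
            # regroup per field so a batch unpacks as e.g. chars, segs, tags
            yield [list(field) for field in zip(*self.batch_data[idx])]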
def predict_sentences_given_model(sentences_string, model):
    """
    :type sentences_string: str
    :type model: MainTaggerModel
    :param model: Mappings must be loaded.
    """
    from utils import tokenize_sentences_string
    from utils.loader import load_sentences, prepare_dataset
    from utils.morph_analyzer_caller import get_morph_analyzes, \
        create_single_word_single_line_format

    tokenized_sentences = tokenize_sentences_string(sentences_string)

    # run the morphological analyzer sentence by sentence and convert its output
    # to the single-word-per-line (CoNLL-like) format used by the loader
    dataset_file_string = ""
    morph_analyzer_output_for_all_sentences = ""
    for tokenized_sentence in tokenized_sentences:
        morph_analyzer_output_for_a_single_sentence = get_morph_analyzes(
            " ".join(tokenized_sentence))
        morph_analyzer_output_for_all_sentences += \
            morph_analyzer_output_for_a_single_sentence + "\n"
        dataset_file_string += create_single_word_single_line_format(
            morph_analyzer_output_for_a_single_sentence,
            conll=True, for_prediction=True)
    # Python 2: the analyzer output is a byte string encoded in ISO-8859-9
    dataset_file_string = dataset_file_string.decode('iso-8859-9')

    # We now have the input sentences in our native format
    train_sentences, _, _ = load_sentences(dataset_file_string.split("\n"),
                                           model.parameters["zeros"])
    char_to_id, id_to_char, id_to_morpho_tag, id_to_tag, id_to_word, \
        morpho_tag_to_id, tag_to_id, word_to_id = \
        extract_mapping_dictionaries_from_model(model)
    _, _, _, sentences_data = prepare_dataset(
        train_sentences, word_to_id, char_to_id, tag_to_id, morpho_tag_to_id,
        model.parameters['lower'], model.parameters['mt_d'],
        model.parameters['mt_t'], model.parameters['mt_ci'],
        morpho_tag_separator=("+" if model.parameters['lang_name'] == "turkish" else "|"))

    f_scores, morph_accuracies, labeled_sentences = \
        predict_tags_given_model_and_input([('tagger_output', sentences_data)],
                                           model, return_result=True)
    print(labeled_sentences)
    return labeled_sentences, dataset_file_string
def train_ner():
    clean(FLAGS)
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for characters
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of index lists per sentence
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for storing logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(25):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
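# augment_with_pretrained() above extends the training character dictionary with
# characters that never occur in the training data but do occur in the test data and
# have a pretrained embedding, so they still receive an embedding row at test time.
# A minimal sketch of that idea; the embedding-file format ("<char> <floats...>" per
# line) and the helper name are assumptions for illustration.
import codecs

def augment_dico_with_pretrained_sketch(dico_train, emb_path, chars):
    # collect the vocabulary of the pretrained embedding file
    pretrained = set()
    with codecs.open(emb_path, 'r', 'utf-8') as f:
        for line in f:
            if line.strip():
                pretrained.add(line.rstrip().split()[0])
    # add every requested character that has a pretrained vector but no training count
    for char in chars:
        if char in pretrained and char not in dico_train:
            dico_train[char] = 0
    return dico_train

# the caller would then rebuild char_to_id / id_to_char from the augmented dictionary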
def train():
    # load data sets
    train_sentences = load_sentences(args.train_file, args.lower, args.zeros)
    dev_sentences = load_sentences(args.dev_file, args.lower, args.zeros)
    test_sentences = load_sentences(args.test_file, args.lower, args.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # check and normalize the tag annotations of the datasets
    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    # create maps if they do not exist:
    # build the char_to_id, id_to_char, tag_to_id, id_to_tag dictionaries from the
    # datasets and store them as a pkl file
    if not os.path.isfile(args.map_file):
        # create dictionary for characters
        if args.pre_emb:
            dico_chars_train = char_mapping(train_sentences, args.lower)[0]
            # augment (extend) the character dictionary with the pretrained embedding
            # vocabulary, then return the character/index mappings
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), args.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, args.lower)
        # create dictionaries and mappings for slot tags and intents
        tag_to_id, id_to_tag, intent_to_id, id_to_intent = tag_mapping(train_sentences)
        with open(args.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag,
                         intent_to_id, id_to_intent], f)
    else:
        with open(args.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag, intent_to_id, id_to_intent = \
                pickle.load(f)

    # extract sentence features:
    # prepare data, get a collection of index lists per sentence
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 intent_to_id, args.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               intent_to_id, args.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                intent_to_id, args.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # build per-batch data for model training
    train_manager = BatchManager(train_data, args.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for storing logs and models if they do not exist
    make_path(args)
    if os.path.isfile(args.config_file):
        config = load_config(args.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, intent_to_id)
        save_config(config, args.config_file)
    make_path(args)

    logger = get_logger(args.log_file)
    print_config(config, logger)

    # limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # number of steps in one full pass over the training set
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        # model creation is the core of this project
        model = create_model(sess, Model, args.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss_slot = []
        loss_intent = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss_slot, batch_loss_intent = model.run_step(
                    sess, True, batch)
                loss_slot.append(batch_loss_slot)
                loss_intent.append(batch_loss_intent)
                if step % args.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "INTENT loss:{:>9.6f}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss_intent),
                                    np.mean(loss_slot)))
                    loss_slot = []
                    loss_intent = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, args.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def train():
    tf.io.gfile.mkdir(FLAGS.output)
    log_path = os.path.join(FLAGS.output, 'model.log')
    logger = get_logger(log_path)

    # load data sets
    train_sentences = load_sentences(os.path.join(FLAGS.data, "train.txt"), FLAGS.zeros)
    dev_sentences = load_sentences(os.path.join(FLAGS.data, "dev.txt"), FLAGS.zeros)
    test_sentences = load_sentences(os.path.join(FLAGS.data, "test.txt"), FLAGS.zeros)

    # create maps if they do not exist
    map_file = os.path.join(FLAGS.output, 'maps.pkl')
    if not os.path.isfile(map_file):
        # create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(map_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(map_file, "rb") as f:
            tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of index lists per sentence
    train_data = prepare_dataset(train_sentences, FLAGS.max_seq_len, tag_to_id)
    dev_data = prepare_dataset(dev_sentences, FLAGS.max_seq_len, tag_to_id)
    test_data = prepare_dataset(test_sentences, FLAGS.max_seq_len, tag_to_id)
    logger.info("%i / %i / %i sentences in train / dev / test." %
                (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, FLAGS.batch_size)

    # load the model config if it exists, otherwise create and save it
    config_file = os.path.join(FLAGS.output, 'config.json')
    if os.path.isfile(config_file):
        config = load_config(config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, config_file)
    print_config(config, logger)

    # limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, os.path.join(FLAGS.output, 'checkpoint'),
                             config, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, os.path.join(FLAGS.output, 'checkpoint'),
                           logger, global_steps=step)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
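# prepare_dataset(sentences, FLAGS.max_seq_len, tag_to_id) above takes a fixed maximum
# sequence length, which suggests every example is truncated or padded to the same
# length (as BERT-style models require). A minimal sketch of that per-sentence step,
# assuming 0 is the padding id for both inputs and labels; the helper name is
# hypothetical.
def pad_or_truncate_sketch(input_ids, tag_ids, max_seq_len, pad_id=0):
    input_ids = input_ids[:max_seq_len]
    tag_ids = tag_ids[:max_seq_len]
    padding = [pad_id] * (max_seq_len - len(input_ids))
    return input_ids + padding, tag_ids + padding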
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # update_tag_scheme(train_sentences, FLAGS.tag_schema)
    # update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for characters
        _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        os.makedirs('%s' % FLAGS.save_path)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data: get a collection of index lists per sentence, padded to max_seq_len
    train_data = prepare_padding_dataset(train_sentences, FLAGS.max_seq_len,
                                         char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_padding_dataset(dev_sentences, FLAGS.max_seq_len,
                                       char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_padding_dataset(test_sentences, FLAGS.max_seq_len,
                                        char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for storing logs and models if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join(FLAGS.save_path, "log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory usage
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = TransformerCRFModel(config, is_training=True)
        sess.run(tf.global_variables_initializer())
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            # evaluate on the dev set
            predict_lists = []
            source_tag = []
            best_dev_f1 = 0.0
            best_test_f1 = 0.0
            for batch in dev_manager.iter_batch(shuffle=False):
                lengths, logits = model.run_step(sess, False, batch)
                _, chars, segs, tags = batch
                transition = model.transition.eval(session=sess)
                pre_seq = model.predict(logits, transition, lengths)
                pre_label = recover_label(pre_seq, lengths, id_to_tag)
                source_label = recover_label(tags, lengths, id_to_tag)
                predict_lists.extend(pre_label)
                source_tag.extend(source_label)
            train_loss_v = np.round(float(np.mean(loss)), 4)
            print('****************************************************')
            acc, p, r, f = get_ner_fmeasure(source_tag, predict_lists,
                                            config["tag_schema"])
            logger.info('epoch:\t{}\ttrain loss:\t{}\t'.format(i + 1, train_loss_v))
            logger.info('dev acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(acc, p, r, f))

            # evaluate on the test set; reset the accumulators so dev predictions
            # are not counted in the test scores
            predict_lists = []
            source_tag = []
            for batch in test_manager.iter_batch(shuffle=False):
                lengths, logits = model.run_step(sess, False, batch)
                _, chars, segs, tags = batch
                transition = model.transition.eval(session=sess)
                pre_seq = model.predict(logits, transition, lengths)
                pre_label = recover_label(pre_seq, lengths, id_to_tag)
                source_label = recover_label(tags, lengths, id_to_tag)
                predict_lists.extend(pre_label)
                source_tag.extend(source_label)
            acc_t, p_t, r_t, f_t = get_ner_fmeasure(source_tag, predict_lists,
                                                    config["tag_schema"])
            logger.info('test acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(
                acc_t, p_t, r_t, f_t))

            if f > best_dev_f1:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                best_dev_f1 = f
                best_test_f1 = f_t
                logger.info('save epoch:\t{} model with best dev f1-score'.format(i + 1))
            print('****************************************************\n\n')
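# recover_label(pred_ids, lengths, id_to_tag) above converts batches of predicted tag
# ids back to tag strings and drops the padded positions before scoring with
# get_ner_fmeasure. The helper below is a minimal sketch of that behaviour; the real
# implementation may differ, for instance in how it treats special tokens.
def recover_label_sketch(batch_ids, lengths, id_to_tag):
    labels = []
    for ids, length in zip(batch_ids, lengths):
        labels.append([id_to_tag[int(i)] for i in ids[:length]])
    return labels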