def _load_or_create_mappings(train_sentences, test_sentences):
    """Return the char / tag / POS lookup tables, building them on first use.

    When ``FLAGS.map_file`` already exists the tables are read back from it.
    Otherwise they are built from ``train_sentences`` and pickled to that path.

    Args:
        train_sentences: Sentences handed to ``char_mapping``, ``tag_mapping``
            and ``pos_mapping``.
        test_sentences: Sentences whose first token field (``w[0]``) is passed
            to ``augment_with_pretrained`` when ``FLAGS.pre_emb`` is set.

    Returns:
        The tuple ``(char_to_id, id_to_char, tag_to_id, id_to_tag, pos_to_id,
        id_to_pos)``, in the same order as stored in the map file.
    """
    if os.path.isfile(FLAGS.map_file):
        # NOTE(review): pickle.load executes whatever the file contains; only
        # point FLAGS.map_file at files written by this script.
        with open(FLAGS.map_file, "rb") as f:
            (char_to_id, id_to_char, tag_to_id, id_to_tag,
             pos_to_id, id_to_pos) = pickle.load(f)
        return (char_to_id, id_to_char, tag_to_id, id_to_tag,
                pos_to_id, id_to_pos)

    # Character dictionary, optionally augmented with pretrained embeddings.
    if FLAGS.pre_emb:
        dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
        test_chars = [w[0] for s in test_sentences for w in s]
        _, char_to_id, id_to_char = augment_with_pretrained(
            dico_chars_train.copy(), FLAGS.emb_file, test_chars)
    else:
        _, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

    # Tag and POS dictionaries (the POS mapping is the addition marked
    # "author : wn" in the original code).
    _, tag_to_id, id_to_tag = tag_mapping(train_sentences)
    _, pos_to_id, id_to_pos = pos_mapping(train_sentences)

    with open(FLAGS.map_file, "wb") as f:
        pickle.dump([
            char_to_id, id_to_char, tag_to_id, id_to_tag, pos_to_id, id_to_pos
        ], f)
    return (char_to_id, id_to_char, tag_to_id, id_to_tag,
            pos_to_id, id_to_pos)


def train():
    """Train the NER model configured through ``FLAGS``.

    Steps, in order:

    1. Load the train / dev / test sentences and convert them to the tagging
       scheme named by ``FLAGS.tag_schema``.
    2. Load or build the char / tag / POS mappings (cached in
       ``FLAGS.map_file``).
    3. Index the sentences with ``prepare_dataset`` and wrap them in
       ``BatchManager`` objects.
    4. Load the model config from ``FLAGS.config_file`` or create and save it.
    5. Run epochs from ``FLAGS.current_epoch`` up to ``FLAGS.max_epoch``.
       After each epoch the model is evaluated on dev, saved when
       ``evaluate`` returns a truthy value, and then evaluated on train and
       test.

    The loss reported per epoch (and passed to ``save_model``) is the mean
    over all batches of that epoch. The periodic step report keeps its own
    window, which is reset every ``FLAGS.steps_check`` steps.
    """
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    for sentences in (train_sentences, dev_sentences, test_sentences):
        update_tag_scheme(sentences, FLAGS.tag_schema)

    # create maps if not exist
    (char_to_id, id_to_char, tag_to_id, id_to_tag,
     pos_to_id, id_to_pos) = _load_or_create_mappings(train_sentences,
                                                      test_sentences)
    print(tag_to_id)
    print(pos_to_id)

    # prepare data, get a collection of list containing index
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 pos_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               pos_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                pos_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    eval_batch_size = 100
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    # Same training data, batched for evaluation rather than optimisation.
    train_dev_manager = BatchManager(train_data, eval_batch_size)
    dev_manager = BatchManager(dev_data, eval_batch_size)
    test_manager = BatchManager(test_data, eval_batch_size)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, pos_to_id)
        save_config(config, FLAGS.config_file)
    # NOTE(review): second make_path call kept from the original; it looks
    # redundant -- confirm make_path is idempotent before removing it.
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")

        window_losses = []  # batches since the last periodic step report
        current_epoch = FLAGS.current_epoch
        while current_epoch < FLAGS.max_epoch:
            epoch_losses = []  # every batch of the current epoch
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                window_losses.append(batch_loss)
                epoch_losses.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.debug("iteration:{} step:{}/{}, "
                                 "NER loss:{:>9.6f}".format(
                                     iteration, step % steps_per_epoch,
                                     steps_per_epoch, np.mean(window_losses)))
                    window_losses = []

            # Average over the whole epoch. The reporting window above may be
            # empty or hold only a few trailing batches at this point, which
            # previously produced a misleading (or NaN) epoch loss.
            if epoch_losses:
                epoch_loss = np.mean(epoch_losses)
            else:
                epoch_loss = float("nan")  # no batches were produced

            logger.info(
                "\n\n *******************epoch-{} NER loss:{:>9.6f}************************"
                .format(current_epoch, epoch_loss))

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger, current_epoch,
                           epoch_loss, remark='best_dev')
            evaluate(sess, model, "train", train_dev_manager, id_to_tag, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
            current_epoch += 1
itertools.chain.from_iterable([[w[0] for w in s] for s in dev_sentences + test_sentences])) if not parameters['all_emb'] else None) else: dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower) dico_words_train = dico_words # Create a dictionary and a mapping for words / POS tags / tags if opts.reload == None: if opts.train_true: dico_chars, char_to_id, id_to_char = char_mapping(train_sentences + train_true_sentences) dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences + train_true_sentences) dico_POSs, POS_to_id, id_to_POS = pos_mapping(train_sentences + train_true_sentences) else: dico_chars, char_to_id, id_to_char = char_mapping(train_sentences) dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences) dico_POSs, POS_to_id, id_to_POS = pos_mapping(train_sentences) if opts.reload != None: word_to_id, char_to_id, tag_to_id, POS_to_id = [ {v: k for k, v in x.items()} for x in [model.id_to_word, model.id_to_char, model.id_to_tag, model.id_to_POS] ] id_to_tag = model.id_to_tag id_to_char = model.id_to_char id_to_word = model.id_to_word
dev_sentences = loader.load_sentences(opts.dev, lower, zeros) test_sentences = loader.load_sentences(opts.test, lower, zeros) # 选择标注规范(IOB / IOBES) #update_tag_scheme(train_sentences, 'iobes') #update_tag_scheme(dev_sentences, 'iobes') #update_tag_scheme(test_sentences, 'iobes') # 用训练集建立词/字/NER标签/词性标签的词典和映射 dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower) dico_words_train = dico_words id_to_char = {} if opts.char_dim: dico_chars, char_to_id, id_to_char = char_mapping(train_sentences) dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences) dico_pos_tags, pos_tag_to_id, id_to_pos_tag = pos_mapping(train_sentences) n_tag = len(id_to_tag) n_pos_tag = len(id_to_pos_tag) # Index data if opts.char_dim: train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower) dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, lower) test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, lower) else: train_data = prepare_dataset_(train_sentences, word_to_id, tag_to_id, pos_tag_to_id, lower)