def main_train():
    # load data sets
    # sentences = [[(word11, tag11), ...], [(word21, tag21), ...], ...]
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES);
    # train_sentences and test_sentences are updated in place
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist:
    # build or load the char / target / feature mapping dictionaries
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word;
        # if a pre-trained embedding file is given, build the char dictionary
        # from both the pre-trained vocabulary and the training set
        if FLAGS.pre_emb:
            # count chars in train; returns a frequency dictionary
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # enlarge the training-set char dictionary with pre-trained chars
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        # otherwise build the dictionary from the training set only
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, target_to_id, id_to_target = tag_mapping(train_sentences)
        # create the mapping dictionaries for the other features;
        # all three returned values are dicts
        _f, feature_to_id, id_to_feature = feature_mapping(train_sentences,
                                                           FLAGS.features)

        # persist the char / target / feature mappings
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, target_to_id, id_to_target,
                         feature_to_id, id_to_feature], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, target_to_id, id_to_target, \
                feature_to_id, id_to_feature = pickle.load(f)

    # make paths for storing log and model config if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = create_config_model(FLAGS, char_to_id, target_to_id, feature_to_id)

    logger = get_logger(FLAGS.log_file)
    print_config(config, logger)
    train(config, train_sentences, dev_sentences, test_sentences,
          char_to_id, feature_to_id, target_to_id, id_to_char, id_to_target,
          logger)
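# A minimal sketch of what update_tag_scheme is assumed to do when the IOBES
# schema is selected (the real helper also validates that the input is legal
# IOB first): the last token of an entity becomes E-, a one-token entity S-.
def iob_to_iobes(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else "O"
        if tag.startswith("B-"):
            # one-token entity -> S-; otherwise keep B-
            new_tags.append(tag if nxt.startswith("I-") else "S-" + tag[2:])
        elif tag.startswith("I-"):
            # last token of a multi-token entity -> E-
            new_tags.append(tag if nxt.startswith("I-") else "E-" + tag[2:])
        else:
            new_tags.append(tag)
    return new_tags

# e.g. iob_to_iobes(["B-PER", "I-PER", "O", "B-LOC"])
# ->   ["B-PER", "E-PER", "O", "S-LOC"]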
def load_data(self):
    self.data_path = "data/annotated/" + self.name + ".json"
    make_path(self.data_path)
    if os.path.exists(self.data_path):
        self.ann_data = load_json(self.data_path)
    else:
        self.ann_data = []
    self.raw_data = load_json(config.processed_path)
    self.total_num = len(self.raw_data)
    self.annotated_num = len(self.ann_data)
    self.position = self.annotated_num  # index of the page currently shown
def __init__(self, song_length: int, dim, n_channels: int, batch_size: int, args):
    self.path = "../sdb/data/%s/%s.npz"
    self.song_length = song_length
    self.dimension = dim
    self.n_channels = n_channels
    self.input_shape = np.empty((*self.dimension, self.n_channels)).shape
    self.n_labels = 10 if args.d == 'gtzan' else 50
    self.batch_size = batch_size
    self.model = self.build_model()
    self.model.summary()
    self.workers = multiprocessing.cpu_count()
    print('Using ' + str(self.workers) + ' workers')

    # Callbacks
    self.callbacks = []
    self.callbacks.append(LearningRateTracker())
    self.callbacks.append(
        EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto'))
    if args.logging:
        csv_logger = CSVLogger(filename=utils.make_path(
            args.logging,
            "%s-%s_%s.csv" % (args.d, self.model_name, datetime.now())))
        self.callbacks.append(csv_logger)

    self.gpu = None
    if args.gpu:
        self.gpu = args.gpu
    self.dataset = args.d
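# A minimal usage sketch (assumed, not from the source) of how the model,
# callbacks, and worker count assembled above would typically be wired into
# training. `train_seq` and `val_seq` are hypothetical keras.utils.Sequence
# instances; with newer tf.keras the call would be model.fit(...) instead.
def fit_model(model, train_seq, val_seq, callbacks, workers, epochs=50):
    return model.fit_generator(
        train_seq,
        validation_data=val_seq,
        epochs=epochs,
        workers=workers,            # parallel batch preparation
        use_multiprocessing=True,
        callbacks=callbacks)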
def train_ner():
    clean(FLAGS)
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for storing log and model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(25):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
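# A minimal sketch of the BatchManager pattern used above (assumed behavior,
# inferred from how it is called: sort by sentence length so sentences of
# similar length share a batch, split into fixed-size batches, and yield them;
# the real class also pads each batch to its longest sentence).
import math
import random

class SimpleBatchManager:
    def __init__(self, data, batch_size):
        # sorting by length keeps padding inside each batch small
        data = sorted(data, key=lambda x: len(x[0]))
        self.batch_data = [data[i:i + batch_size]
                           for i in range(0, len(data), batch_size)]
        self.len_data = len(self.batch_data)  # number of batches per epoch

    def iter_batch(self, shuffle=False):
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch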
def train():
    # load data sets
    train_sentences = load_sentences(args.train_file, args.lower, args.zeros)
    dev_sentences = load_sentences(args.dev_file, args.lower, args.zeros)
    test_sentences = load_sentences(args.test_file, args.lower, args.zeros)

    # Use selected tagging scheme (IOB / IOBES);
    # checks and normalizes the tag labels of each data set
    update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    update_tag_scheme(dev_sentences, args.tag_schema)

    # create maps if they do not exist:
    # build the char_to_id, id_to_char, tag_to_id, id_to_tag dictionaries
    # from the data sets and store them as a pickle file
    if not os.path.isfile(args.map_file):
        # create dictionary for word
        if args.pre_emb:
            dico_chars_train = char_mapping(train_sentences, args.lower)[0]
            # augment (enlarge) the char dictionary with the pre-trained
            # embedding vocabulary, then return the char/index mappings
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                args.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, args.lower)

        # Create a dictionary and a mapping for tags;
        # returns the tag/index and intent/index mappings
        tag_to_id, id_to_tag, intent_to_id, id_to_intent = tag_mapping(
            train_sentences)
        with open(args.map_file, "wb") as f:
            pickle.dump([
                char_to_id, id_to_char, tag_to_id, id_to_tag,
                intent_to_id, id_to_intent
            ], f)
    else:
        with open(args.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag, \
                intent_to_id, id_to_intent = pickle.load(f)

    # extract sentence features:
    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 intent_to_id, args.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               intent_to_id, args.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                intent_to_id, args.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # wrap the data into single batches the model can train on
    train_manager = BatchManager(train_data, args.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for storing log and model if they do not exist
    make_path(args)
    if os.path.isfile(args.config_file):
        config = load_config(args.config_file)
    else:
        config = config_model(char_to_id, tag_to_id, intent_to_id)
        save_config(config, args.config_file)

    logger = get_logger(args.log_file)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # number of steps needed for one full pass over the training set
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        # creating the model here is the core of this project
        model = create_model(sess, Model, args.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss_slot = []
        loss_intent = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss_slot, batch_loss_intent = model.run_step(
                    sess, True, batch)
                loss_slot.append(batch_loss_slot)
                loss_intent.append(batch_loss_intent)
                if step % args.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "INTENT loss:{:>9.6f}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss_intent),
                                    np.mean(loss_slot)))
                    loss_slot = []
                    loss_intent = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, args.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
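# A minimal sketch (assumed, not from the source) of how run_step's two losses
# are typically combined in a joint slot-filling / intent-detection model:
# a CRF negative log-likelihood over the per-token slot tags plus a softmax
# cross-entropy over the sentence-level intent (TF1 contrib API, matching the
# tf.ConfigProto / tf.Session style above).
import tensorflow as tf

def joint_loss(slot_logits, slot_targets, seq_lengths,
               intent_logits, intent_targets, intent_weight=1.0):
    # CRF negative log-likelihood for the slot (NER) sequence
    log_lik, _ = tf.contrib.crf.crf_log_likelihood(
        slot_logits, slot_targets, seq_lengths)
    slot_loss = tf.reduce_mean(-log_lik)
    # cross-entropy for the sentence-level intent class
    intent_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=intent_logits, labels=intent_targets))
    return slot_loss + intent_weight * intent_loss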
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # update_tag_scheme(train_sentences, FLAGS.tag_schema)
    # update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        os.makedirs(FLAGS.save_path)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices,
    # padded to max_seq_len
    train_data = prepare_padding_dataset(train_sentences, FLAGS.max_seq_len,
                                         char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_padding_dataset(dev_sentences, FLAGS.max_seq_len,
                                       char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_padding_dataset(test_sentences, FLAGS.max_seq_len,
                                        char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for storing log and model if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join(FLAGS.save_path, "log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = TransformerCRFModel(config, is_training=True)
        sess.run(tf.global_variables_initializer())
        logger.info("start training")
        loss = []
        best_dev_f1 = 0.0
        best_test_f1 = 0.0
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
            train_loss_v = np.round(float(np.mean(loss)), 4)
            loss = []

            # evaluate on dev: decode with the learned CRF transition matrix
            predict_lists = []
            source_tag = []
            for batch in dev_manager.iter_batch(shuffle=False):
                lengths, logits = model.run_step(sess, False, batch)
                _, chars, segs, tags = batch
                transition = model.transition.eval(session=sess)
                pre_seq = model.predict(logits, transition, lengths)
                pre_label = recover_label(pre_seq, lengths, id_to_tag)
                source_label = recover_label(tags, lengths, id_to_tag)
                predict_lists.extend(pre_label)
                source_tag.extend(source_label)
            print('****************************************************')
            acc, p, r, f = get_ner_fmeasure(source_tag, predict_lists,
                                            config["tag_schema"])
            logger.info('epoch:\t{}\ttrain loss:\t{}\t'.format(i + 1, train_loss_v))
            logger.info('dev acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(acc, p, r, f))

            # evaluate on test; reset the buffers so dev predictions
            # do not leak into the test scores
            predict_lists = []
            source_tag = []
            for batch in test_manager.iter_batch(shuffle=False):
                lengths, logits = model.run_step(sess, False, batch)
                _, chars, segs, tags = batch
                transition = model.transition.eval(session=sess)
                pre_seq = model.predict(logits, transition, lengths)
                pre_label = recover_label(pre_seq, lengths, id_to_tag)
                source_label = recover_label(tags, lengths, id_to_tag)
                predict_lists.extend(pre_label)
                source_tag.extend(source_label)
            acc_t, p_t, r_t, f_t = get_ner_fmeasure(source_tag, predict_lists,
                                                    config["tag_schema"])
            logger.info('test acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(
                acc_t, p_t, r_t, f_t))

            if f > best_dev_f1:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                best_dev_f1 = f
                best_test_f1 = f_t
                logger.info('save epoch:\t{} model with best dev f1-score'.format(i + 1))
            print('****************************************************\n\n')
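# A minimal sketch of the CRF Viterbi decoding that model.predict is assumed
# to perform above: given per-token emission scores (logits) and the learned
# transition matrix, recover the highest-scoring tag path for each sequence.
import numpy as np

def viterbi_decode(score, transition):
    """score: [seq_len, n_tags] emissions; transition: [n_tags, n_tags]."""
    seq_len, n_tags = score.shape
    dp = np.zeros((seq_len, n_tags))              # best score ending in tag t
    back = np.zeros((seq_len, n_tags), dtype=int)  # backpointers
    dp[0] = score[0]
    for t in range(1, seq_len):
        # candidate[i, j] = score of the best path ending in i, extended to j
        candidate = dp[t - 1][:, None] + transition + score[t][None, :]
        back[t] = candidate.argmax(axis=0)
        dp[t] = candidate.max(axis=0)
    best = [int(dp[-1].argmax())]
    for t in range(seq_len - 1, 0, -1):
        best.append(int(back[t, best[-1]]))
    return best[::-1]

def predict(logits, transition, lengths):
    # decode each sequence only up to its true (unpadded) length
    return [viterbi_decode(logit[:length], transition)
            for logit, length in zip(logits, lengths)]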