def load_model(self, model_dir: str, model_config: str = "model_config.json"):
    model_config = os.path.join(model_dir, model_config)
    model_config = json.load(open(model_config))
    bert_config = json.load(open(os.path.join(model_dir, "bert_config.json")))
    model = BertNer(bert_config, tf.float32, model_config['num_labels'],
                    model_config['max_seq_length'])
    # run a dummy forward pass so the subclassed model is built before loading weights
    ids = tf.ones((1, 128), dtype=tf.int64)
    _ = model(ids, ids, ids, ids, training=False)
    model.load_weights(os.path.join(model_dir, "model.h5"))
    vocab = os.path.join(model_dir, "vocab.txt")
    tokenizer = FullTokenizer(vocab_file=vocab,
                              do_lower_case=model_config["do_lower"])
    return model, tokenizer, model_config
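# For reference, a sketch of the model_config.json that load_model expects. The
# keys mirror the dict written at the end of training in main() below; every
# value here is an illustrative placeholder, not a real configuration.
EXAMPLE_MODEL_CONFIG = {
    "bert_model": "cased_L-12_H-768_A-12",  # placeholder model name/path
    "do_lower": False,
    "max_seq_length": 128,
    "num_labels": 10,                        # len(label_list) + 1
    "label_map": {"1": "O", "2": "B-PER"},   # truncated example; json.dump turns the int keys into strings
}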
def create_model(self, num_train_step, num_warmup_step):
    """
    Select the model corresponding to the config file and initialize it.
    :return:
    """
    model = BertNer(config=self.config,
                    num_train_step=num_train_step,
                    num_warmup_step=num_warmup_step)
    return model
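# A minimal sketch (illustrative helper, not part of the project) of how the
# num_train_step / num_warmup_step arguments to create_model are typically
# derived; it mirrors the arithmetic used in main() below.
def compute_schedule_steps(num_examples, batch_size, num_epochs, warmup_proportion=0.1):
    num_train_step = int(num_examples / batch_size) * num_epochs
    num_warmup_step = int(warmup_proportion * num_train_step)
    return num_train_step, num_warmup_step

# e.g. 10000 examples, batch size 32, 3 epochs -> (936, 93)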
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files "
                        "(or other data files) for the task.")
    parser.add_argument("--bert_model",
                        default=None,
                        type=str,
                        required=True,
                        help="Bert pre-trained model selected in the list: "
                        "bert-base-cased, bert-large-cased")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions "
                        "and checkpoints will be written.")

    # Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece "
                        "tokenization. Sequences longer than this will be truncated, "
                        "and sequences shorter than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev/test set.")
    parser.add_argument("--eval_on",
                        default="dev",
                        type=str,
                        help="Evaluation set, dev: Development, test: Test")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate "
                        "warmup for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="Random seed for initialization.")

    # Training strategy arguments
    parser.add_argument("--multi_gpu",
                        action='store_true',
                        help="Set this flag to enable multi-gpu training using "
                        "MirroredStrategy; otherwise single-gpu training is used.")
    parser.add_argument("--gpus",
                        default='0',
                        type=str,
                        help="Comma separated list of gpu devices. For single gpu, "
                        "pass the gpu id (default '0'). For multi gpu, if not "
                        "specified, all available gpus will be used.")

    args = parser.parse_args()

    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_train:
        tokenizer = FullTokenizer(os.path.join(args.bert_model, "vocab.txt"),
                                  args.do_lower_case)

    # select the distribution strategy
    if args.multi_gpu:
        if len(args.gpus.split(',')) == 1:
            strategy = tf.distribute.MirroredStrategy()
        else:
            gpus = [f"/gpu:{gpu}" for gpu in args.gpus.split(',')]
            strategy = tf.distribute.MirroredStrategy(devices=gpus)
    else:
        gpu = args.gpus.split(',')[0]
        strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{gpu}")

    train_examples = None
    optimizer = None
    num_train_optimization_steps = 0
    ner = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size) * args.num_train_epochs
        warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)

        # polynomial decay schedule, optionally wrapped with linear warmup
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=args.learning_rate,
            decay_steps=num_train_optimization_steps,
            end_learning_rate=0.0)
        if warmup_steps:
            learning_rate_fn = WarmUp(initial_learning_rate=args.learning_rate,
                                      decay_schedule_fn=learning_rate_fn,
                                      warmup_steps=warmup_steps)
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=args.weight_decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=args.adam_epsilon,
            exclude_from_weight_decay=['layer_norm', 'bias'])

        with strategy.scope():
            ner = BertNer(args.bert_model, tf.float32, num_labels,
                          args.max_seq_length)
            loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
                reduction=tf.keras.losses.Reduction.NONE)

    label_map = {i: label for i, label in enumerate(label_list, 1)}

    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_ids for f in train_features], dtype=np.int32))
        all_input_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_mask for f in train_features], dtype=np.int32))
        all_segment_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.segment_ids for f in train_features], dtype=np.int32))
        all_valid_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.valid_ids for f in train_features], dtype=np.int32))
        all_label_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_mask for f in train_features]))
        all_label_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_id for f in train_features], dtype=np.int32))

        # Dataset using tf.data
        train_data = tf.data.Dataset.zip(
            (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids,
             all_label_ids, all_label_mask))
        shuffled_train_data = train_data.shuffle(
            buffer_size=int(len(train_features) * 0.1),
            seed=args.seed,
            reshuffle_each_iteration=True)
        batched_train_data = shuffled_train_data.batch(args.train_batch_size)
        # Distributed dataset
        dist_dataset = strategy.experimental_distribute_dataset(batched_train_data)
        loss_metric = tf.keras.metrics.Mean()

        epoch_bar = master_bar(range(args.num_train_epochs))
        pb_max_len = math.ceil(
            float(len(train_features)) / float(args.train_batch_size))

        def train_step(input_ids, input_mask, segment_ids, valid_ids, label_ids,
                       label_mask):
            def step_fn(input_ids, input_mask, segment_ids, valid_ids, label_ids,
                        label_mask):
                with tf.GradientTape() as tape:
                    logits = ner(input_ids, input_mask, segment_ids, valid_ids,
                                 training=True)
                    # flatten to (batch * seq_len, ...) and drop masked positions
                    label_mask = tf.reshape(label_mask, (-1,))
                    logits = tf.reshape(logits, (-1, num_labels))
                    logits_masked = tf.boolean_mask(logits, label_mask)
                    label_ids = tf.reshape(label_ids, (-1,))
                    label_ids_masked = tf.boolean_mask(label_ids, label_mask)
                    cross_entropy = loss_fct(label_ids_masked, logits_masked)
                    loss = tf.reduce_sum(cross_entropy) * (1.0 / args.train_batch_size)
                grads = tape.gradient(loss, ner.trainable_variables)
                optimizer.apply_gradients(list(zip(grads, ner.trainable_variables)))
                return cross_entropy

            per_example_losses = strategy.experimental_run_v2(
                step_fn,
                args=(input_ids, input_mask, segment_ids, valid_ids, label_ids,
                      label_mask))
            mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                        per_example_losses, axis=0)
            return mean_loss

        for epoch in epoch_bar:
            with strategy.scope():
                for (input_ids, input_mask, segment_ids, valid_ids, label_ids,
                     label_mask) in progress_bar(dist_dataset,
                                                 total=pb_max_len,
                                                 parent=epoch_bar):
                    loss = train_step(input_ids, input_mask, segment_ids,
                                      valid_ids, label_ids, label_mask)
                    loss_metric(loss)
                    epoch_bar.child.comment = f'loss : {loss_metric.result()}'
            loss_metric.reset_states()

        # model weight save
        ner.save_weights(os.path.join(args.output_dir, "model.h5"))
        # copy vocab to output_dir
        shutil.copyfile(os.path.join(args.bert_model, "vocab.txt"),
                        os.path.join(args.output_dir, "vocab.txt"))
        # copy bert config to output_dir
        shutil.copyfile(os.path.join(args.bert_model, "bert_config.json"),
                        os.path.join(args.output_dir, "bert_config.json"))
        # save label_map and max_seq_length of trained model
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": num_labels,
            "label_map": label_map
        }
        json.dump(model_config,
                  open(os.path.join(args.output_dir, "model_config.json"), "w"),
                  indent=4)

    if args.do_eval:
        # load tokenizer
        tokenizer = FullTokenizer(os.path.join(args.output_dir, "vocab.txt"),
                                  args.do_lower_case)
        # model build hack: run a dummy forward pass so the subclassed model is
        # built before the weights are loaded
        config = json.load(
            open(os.path.join(args.output_dir, "bert_config.json")))
        ner = BertNer(config, tf.float32, num_labels, args.max_seq_length)
        ids = tf.ones((1, 128), dtype=tf.int32)
        _ = ner(ids, ids, ids, ids, training=False)
        ner.load_weights(os.path.join(args.output_dir, "model.h5"))

        # load the development or test set based on args.eval_on
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)

        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_ids for f in eval_features], dtype=np.int32))
        all_input_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_mask for f in eval_features], dtype=np.int32))
        all_segment_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.segment_ids for f in eval_features], dtype=np.int32))
        all_valid_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.valid_ids for f in eval_features], dtype=np.int32))
        all_label_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_id for f in eval_features], dtype=np.int32))

        eval_data = tf.data.Dataset.zip(
            (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids,
             all_label_ids))
        batched_eval_data = eval_data.batch(args.eval_batch_size)

        loss_metric = tf.keras.metrics.Mean()
        epoch_bar = master_bar(range(1))
        pb_max_len = math.ceil(
            float(len(eval_features)) / float(args.eval_batch_size))

        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for epoch in epoch_bar:
            for (input_ids, input_mask, segment_ids, valid_ids,
                 label_ids) in progress_bar(batched_eval_data,
                                            total=pb_max_len,
                                            parent=epoch_bar):
                logits = ner(input_ids, input_mask, segment_ids, valid_ids,
                             training=False)
                logits = tf.argmax(logits, axis=2)
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, m in enumerate(label):
                        if j == 0:
                            # position 0 is [CLS]
                            continue
                        elif label_ids[i][j].numpy() == len(label_map):
                            # the highest label id marks the end of the sentence
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map[label_ids[i][j].numpy()])
                            temp_2.append(label_map[logits[i][j].numpy()])

        report = classification_report(y_true, y_pred, digits=4)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
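# Worked example (standalone, illustrative; not part of the training script):
# how train_step's label_mask drops padded / non-valid positions from the loss.
# The tiny shapes and values below are made up; the model output is imitated
# with a softmax over random scores.
import tensorflow as tf

num_labels = 3
probs = tf.nn.softmax(tf.random.uniform((2, 4, num_labels)), axis=-1)  # (batch=2, seq=4, labels=3)
label_ids = tf.constant([[1, 2, 0, 0], [2, 1, 1, 0]])                  # 0 = padding
label_mask = tf.constant([[True, True, False, False],
                          [True, True, True, False]])

flat_probs = tf.reshape(probs, (-1, num_labels))   # (8, 3)
flat_labels = tf.reshape(label_ids, (-1,))         # (8,)
flat_mask = tf.reshape(label_mask, (-1,))          # (8,)

loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE)
per_token_loss = loss_fct(tf.boolean_mask(flat_labels, flat_mask),
                          tf.boolean_mask(flat_probs, flat_mask))
# only the 5 unmasked positions contribute; train_step then sums this and
# scales by 1 / train_batch_size before taking gradients
print(per_token_loss.shape)  # (5,)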
class Predictor(object):
    def __init__(self, config):
        self.model = None
        self.config = config
        self.output_path = config["output_path"]
        self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
        self.label_to_index = self.load_vocab()
        self.word_vectors = None
        self.sequence_length = self.config["sequence_length"]

        # build the model
        self.create_model()
        # load the computation graph
        self.load_graph()

    def load_vocab(self):
        # load the label-to-index mapping
        with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
            label_to_index = json.load(f)
        return label_to_index

    def padding(self, input_id, input_mask, segment_id):
        """
        Pad the sequences to a fixed length.
        :param input_id:
        :param input_mask:
        :param segment_id:
        :return:
        """
        if len(input_id) < self.sequence_length:
            pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
            pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
            pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
            sequence_len = len(input_id)
        else:
            pad_input_id = input_id[:self.sequence_length]
            pad_input_mask = input_mask[:self.sequence_length]
            pad_segment_id = segment_id[:self.sequence_length]
            sequence_len = self.sequence_length

        return pad_input_id, pad_input_mask, pad_segment_id, sequence_len

    def sentence_to_idx(self, text):
        """
        Convert a segmented sentence into its index representation.
        :return:
        """
        tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path,
                                               do_lower_case=True)

        tokens = []
        for token in text:
            token = tokenizer.tokenize(token)
            tokens.extend(token)

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)
        segment_id = [0] * len(input_id)

        input_id, input_mask, segment_id, sequence_len = self.padding(
            input_id, input_mask, segment_id)

        return [input_id], [input_mask], [segment_id], [sequence_len]

    def load_graph(self):
        """
        Load the computation graph and restore the checkpoint.
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(
                self.config["ckpt_model_path"]))

    def create_model(self):
        """
        Select the model corresponding to the config file and initialize it.
        :return:
        """
        self.model = BertNer(config=self.config, is_training=False)

    def predict(self, text):
        """
        Given a segmented sentence, predict its labels.
        :param text:
        :return:
        """
        input_ids, input_masks, segment_ids, sequence_len = self.sentence_to_idx(text)

        prediction = self.model.infer(
            self.sess,
            dict(input_ids=input_ids,
                 input_masks=input_masks,
                 segment_ids=segment_ids,
                 sequence_len=sequence_len)).tolist()
        print(prediction)
        chunks = get_chunk(prediction, self.label_to_index)
        return chunks
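# Usage sketch (illustrative): the config keys below are the ones Predictor
# actually reads; the paths, the sequence length, and the example sentence are
# placeholders, and the config is assumed to also carry whatever model
# hyperparameters BertNer expects.
if __name__ == "__main__":
    config = {
        "output_path": "output",            # must contain label_to_index.json
        "bert_model_path": "bert_model",    # must contain vocab.txt
        "ckpt_model_path": "ckpt_model",    # directory holding the trained checkpoint
        "sequence_length": 128,
    }
    predictor = Predictor(config)
    # predict() expects an already-segmented sentence (an iterable of tokens)
    chunks = predictor.predict(["我", "在", "北京", "工作"])
    print(chunks)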