def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
    with jit_scope():
        model = modeling.BertModel(config=bert_config,
                                   is_training=False,
                                   input_ids=input_ids,
                                   input_mask=input_mask,
                                   token_type_ids=input_type_ids)

        if mode != tf.estimator.ModeKeys.PREDICT:
            raise ValueError("Only PREDICT mode is supported: %s" % (mode))

        tvars = tf.trainable_variables()
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        all_layers = model.get_all_encoder_layers()

        predictions = {
            "unique_id": unique_ids,
        }
        for (i, layer_index) in enumerate(layer_indexes):
            predictions["layer_output_%d" % i] = all_layers[layer_index]

        # A plain EstimatorSpec (not TPUEstimatorSpec): this graph is only run
        # locally for feature extraction.
        from tensorflow.python.estimator.model_fn import EstimatorSpec
        output_spec = EstimatorSpec(mode=mode, predictions=predictions)
        return output_spec
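# Usage sketch, not part of the original snippet: the free names bert_config,
# init_checkpoint and layer_indexes suggest this model_fn is returned from a
# builder closure (as in BERT's extract_features.py) and driven through
# Estimator.predict. `input_fn` below is a hypothetical input pipeline that
# yields unique_ids/input_ids/input_mask/input_type_ids features; note the
# snippet above only logs the assignment map, so weight restoration is assumed
# to happen elsewhere.
estimator = tf.estimator.Estimator(model_fn=model_fn)
for result in estimator.predict(input_fn, yield_single_examples=True):
    unique_id = int(result["unique_id"])
    layer_0 = result["layer_output_0"]  # [seq_len, hidden_size] of the first requested layer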
def model_gpu(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for the GPU version of Estimator."""
    tf.compat.v1.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.compat.v1.logging.info("  name = %s, shape = %s" %
                                  (name, features[name].shape))
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    (total_loss, per_example_loss, logits, probabilities) = SentimentCLS.create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids,
        label_ids, num_labels, use_one_hot_embeddings)

    tvars = tf.compat.v1.trainable_variables()
    initialized_variable_names = {}
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.compat.v1.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        tf.compat.v1.logging.info("  name = %s, shape = %s%s", var.name,
                                  var.shape, init_string)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(
            total_loss, learning_rate, num_train_steps, num_warmup_steps, False)
        output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                 loss=total_loss,
                                                 train_op=train_op)
    elif mode == tf.estimator.ModeKeys.EVAL:
        def metric_fn(per_example_loss, label_ids, logits, is_real_example):
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            accuracy = tf.compat.v1.metrics.accuracy(labels=label_ids,
                                                     predictions=predictions,
                                                     weights=is_real_example)
            loss = tf.compat.v1.metrics.mean(values=per_example_loss,
                                             weights=is_real_example)
            return {"eval_accuracy": accuracy, "eval_loss": loss}

        metrics = metric_fn(per_example_loss, label_ids, logits, True)
        output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                 loss=total_loss,
                                                 eval_metric_ops=metrics)
    else:
        output_spec = tf.estimator.EstimatorSpec(
            mode=mode, predictions={"probabilities": probabilities})
    return output_spec
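# Usage sketch, not part of the original snippet: model_gpu closes over
# bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps and
# num_warmup_steps, so it can be handed straight to a plain Estimator. The
# input pipeline (file_based_input_fn_builder) is assumed from the standard
# BERT run_classifier.py; paths and sizes are placeholders.
run_config = tf.estimator.RunConfig(model_dir="out/", save_checkpoints_steps=1000)
estimator = tf.estimator.Estimator(model_fn=model_gpu, config=run_config)
train_input_fn = file_based_input_fn_builder(input_file="train.tf_record",
                                             seq_length=128,
                                             is_training=True,
                                             drop_remainder=True)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)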
def main():
    print("start loading the params...")
    print(json.dumps(config, ensure_ascii=False, indent=2))
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.gfile.MakeDirs(config["out"])
    train_examples_len = config["train_examples_len"]
    dev_examples_len = config["dev_examples_len"]
    learning_rate = config["learning_rate"]
    eval_per_step = config["eval_per_step"]
    num_labels = config["num_labels"]
    num_train_steps = math.ceil(train_examples_len / config["train_batch_size"])
    num_dev_steps = math.ceil(dev_examples_len / config["dev_batch_size"])
    num_warmup_steps = math.ceil(num_train_steps * config["num_train_epochs"] *
                                 config["warmup_proportion"])
    print("num_train_steps:{}, num_dev_steps:{}, num_warmup_steps:{}".format(
        num_train_steps, num_dev_steps, num_warmup_steps))
    use_one_hot_embeddings = False
    is_training = True
    use_tpu = False
    seq_len = config["max_seq_len"]
    init_checkpoint = config["init_checkpoint"]
    print("start building the bert model...")

    # Define the input placeholders.
    input_ids = tf.placeholder(tf.int64, shape=[None, seq_len], name='input_ids')
    input_mask = tf.placeholder(tf.int64, shape=[None, seq_len], name='input_mask')
    segment_ids = tf.placeholder(tf.int64, shape=[None, seq_len], name='segment_ids')
    labels = tf.placeholder(tf.int64, shape=[None, seq_len], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    bert_config_ = load_bert_config(config["bert_config"])
    (total_loss, acc, logits, probabilities) = create_model(
        bert_config_, is_training, input_ids, input_mask, segment_ids,
        labels, keep_prob, num_labels, use_one_hot_embeddings)
    train_op = optimization.create_optimizer(
        total_loss, learning_rate,
        num_train_steps * config["num_train_epochs"], num_warmup_steps, False)
    print("start training the bert model...")

    batch_size = config["train_batch_size"]
    dev_batch_size = config["dev_batch_size"]

    init_global = tf.global_variables_initializer()
    # Keep the two most recent checkpoints; exclude the Adam slot variables
    # (adam_v / adam_m) so the saved files stay small.
    saver = tf.train.Saver(
        [v for v in tf.global_variables()
         if 'adam_v' not in v.name and 'adam_m' not in v.name],
        max_to_keep=2)

    with tf.Session() as sess:
        sess.run(init_global)
        print("start loading the pre-trained model")

        if init_checkpoint:
            tvars = tf.trainable_variables()
            print("global_variables", len(tvars))
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            print("initialized_variable_names:", len(initialized_variable_names))
            saver_ = tf.train.Saver(
                [v for v in tvars if v.name in initialized_variable_names])
            saver_.restore(sess, init_checkpoint)
            tvars = tf.global_variables()
            initialized_vars = [
                v for v in tvars if v.name in initialized_variable_names]
            not_initialized_vars = [
                v for v in tvars if v.name not in initialized_variable_names]
            tf.logging.info('--all size %s; not initialized size %s' %
                            (len(tvars), len(not_initialized_vars)))
            if len(not_initialized_vars):
                sess.run(tf.variables_initializer(not_initialized_vars))
            for v in initialized_vars:
                print('--initialized: %s, shape = %s' % (v.name, v.shape))
            for v in not_initialized_vars:
                print('--not initialized: %s, shape = %s' % (v.name, v.shape))
        else:
            sess.run(tf.global_variables_initializer())

        print("********* train start *********")

        def train_step(ids, mask, segment, y, step):
            feed = {input_ids: ids,
                    input_mask: mask,
                    segment_ids: segment,
                    labels: y,
                    keep_prob: 0.9}
            _, out_loss, acc_, p_ = sess.run(
                [train_op, total_loss, acc, probabilities], feed_dict=feed)
            print("step :{}, loss :{}, acc :{}".format(step, out_loss, acc_))
            return out_loss, p_, y

        def dev_step(ids, mask, segment, y):
            feed = {input_ids: ids,
                    input_mask: mask,
                    segment_ids: segment,
                    labels: y,
                    keep_prob: 1.0}
            out_loss, acc_, p_ = sess.run(
                [total_loss, acc, probabilities], feed_dict=feed)
            print("loss :{}, acc :{}".format(out_loss, acc_))
            return out_loss, p_, y

        min_total_loss_dev = 999999
        step = 0
        for epoch in range(config["num_train_epochs"]):
            print("{:*^100s}".format(("epoch-" + str(epoch)).center(20)))
            # Read the training data.
            total_loss_train = 0
            input_ids2, input_mask2, segment_ids2, labels2 = get_input_data(
                config["in_1"], seq_len, batch_size)
            for i in range(num_train_steps):
                step += 1
                ids_train, mask_train, segment_train, y_train = sess.run(
                    [input_ids2, input_mask2, segment_ids2, labels2])
                out_loss, pre, y = train_step(ids_train, mask_train,
                                              segment_train, y_train, step)
                total_loss_train += out_loss

                if step % eval_per_step == 0 and step >= config["eval_start_step"]:
                    total_loss_dev = 0
                    dev_input_ids2, dev_input_mask2, dev_segment_ids2, dev_labels2 = get_input_data(
                        config["in_2"], seq_len, dev_batch_size, False)
                    # Number of dev batches per evaluation pass.
                    for j in range(num_dev_steps):
                        ids_dev, mask_dev, segment_dev, y_dev = sess.run([
                            dev_input_ids2, dev_input_mask2, dev_segment_ids2,
                            dev_labels2])
                        out_loss, pre, y = dev_step(ids_dev, mask_dev,
                                                    segment_dev, y_dev)
                        total_loss_dev += out_loss
                    print("total_loss_dev:{}".format(total_loss_dev))
                    # Only keep the checkpoint if the dev loss improved.
                    if total_loss_dev < min_total_loss_dev:
                        print("save model:\t%f\t>%f" %
                              (min_total_loss_dev, total_loss_dev))
                        min_total_loss_dev = total_loss_dev
                        saver.save(sess, config["out"] + 'bert.ckpt',
                                   global_step=step)
                elif step < config["eval_start_step"] and step % config["auto_save"] == 0:
                    saver.save(sess, config["out"] + 'bert.ckpt', global_step=step)

            print("{:*^100s}".format(("epoch-" + str(epoch) + " report:").center(20)))
            print("total_loss_train:{}".format(total_loss_train))

    # Rebuild the graph without dropout for prediction.
    print("remove dropout in predict")
    tf.reset_default_graph()
    is_training = False
    input_ids = tf.placeholder(tf.int64, shape=[None, seq_len], name='input_ids')
    input_mask = tf.placeholder(tf.int64, shape=[None, seq_len], name='input_mask')
    segment_ids = tf.placeholder(tf.int64, shape=[None, seq_len], name='segment_ids')
    labels = tf.placeholder(tf.int64, shape=[None, seq_len], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    bert_config_ = load_bert_config(config["bert_config"])
    (total_loss, _, logits, probabilities) = create_model(
        bert_config_, is_training, input_ids, input_mask, segment_ids,
        labels, keep_prob, num_labels, use_one_hot_embeddings)
    init_global = tf.global_variables_initializer()
    # Keep only the latest re-saved model.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    try:
        checkpoint = tf.train.get_checkpoint_state(config["out"])
        input_checkpoint = checkpoint.model_checkpoint_path
        print("[INFO] input_checkpoint:", input_checkpoint)
    except Exception as e:
        input_checkpoint = config["out"]
        print("[INFO] Model folder", config["out"], repr(e))

    with tf.Session() as sess:
        sess.run(init_global)
        saver.restore(sess, input_checkpoint)
        saver.save(sess, config["out_1"] + 'bert.ckpt')
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (start_logits, end_logits) = create_model(
        bert_config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        if use_tpu:
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        seq_length = modeling.get_shape_list(input_ids)[1]

        def compute_loss(logits, positions):
            one_hot_positions = tf.one_hot(positions, depth=seq_length,
                                           dtype=tf.float32)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            loss = -tf.reduce_mean(
                tf.reduce_sum(one_hot_positions * log_probs, axis=-1))
            return loss

        start_positions = features["start_positions"]
        end_positions = features["end_positions"]
        start_loss = compute_loss(start_logits, start_positions)
        end_loss = compute_loss(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2.0
        train_op = optimization.create_optimizer(
            total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
        output_spec = tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                      loss=total_loss,
                                                      train_op=train_op,
                                                      scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "unique_ids": unique_ids,
            "start_logits": start_logits,
            "end_logits": end_logits,
        }
        output_spec = tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                      predictions=predictions,
                                                      scaffold_fn=scaffold_fn)
    else:
        raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode))

    return output_spec
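# Usage sketch, not part of the original snippet: as in BERT's run_squad.py,
# this model_fn is meant for tf.contrib.tpu.TPUEstimator, which falls back to
# CPU/GPU when use_tpu=False. model_dir, batch sizes and train_input_fn are
# placeholder assumptions.
run_config = tf.contrib.tpu.RunConfig(model_dir="/tmp/squad_out",
                                      save_checkpoints_steps=1000)
estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                        model_fn=model_fn,
                                        config=run_config,
                                        train_batch_size=32,
                                        predict_batch_size=8)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)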
def optimize_graph(logger=None, verbose=False):
    if not logger:
        logger = set_logger(colored('BERT_VEC', 'yellow'), verbose)
    try:
        # We don't need a GPU for optimizing the graph.
        from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
        tf.gfile.MakeDirs(args.output_dir)
        config_fp = args.config_name
        logger.info('model config: %s' % config_fp)

        # Load the BERT config file.
        with tf.gfile.GFile(config_fp, 'r') as f:
            bert_config = modeling.BertConfig.from_dict(json.load(f))

        logger.info('build graph...')
        # Input placeholders; not sure if they are friendly to XLA.
        input_ids = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_ids')
        input_mask = tf.placeholder(tf.int32, (None, args.max_seq_len), 'input_mask')
        input_type_ids = tf.placeholder(tf.int32, (None, args.max_seq_len),
                                        'input_type_ids')

        jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
        with jit_scope():
            input_tensors = [input_ids, input_mask, input_type_ids]
            model = modeling.BertModel(config=bert_config,
                                       is_training=False,
                                       input_ids=input_ids,
                                       input_mask=input_mask,
                                       token_type_ids=input_type_ids,
                                       use_one_hot_embeddings=False)

            # Collect all trainable variables and map them to the checkpoint.
            tvars = tf.trainable_variables()
            init_checkpoint = args.ckpt_name
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

            with tf.variable_scope("pooling"):
                # If only one layer is requested, take that layer's output directly.
                if len(args.layer_indexes) == 1:
                    encoder_layer = model.all_encoder_layers[args.layer_indexes[0]]
                else:
                    # Otherwise gather the requested layers and concatenate them
                    # along the hidden dimension (shape: 768 * number of layers).
                    all_layers = [
                        model.all_encoder_layers[l] for l in args.layer_indexes
                    ]
                    encoder_layer = tf.concat(all_layers, -1)

                mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
                masked_reduce_mean = lambda x, m: tf.reduce_sum(
                    mul_mask(x, m), axis=1) / (tf.reduce_sum(
                        m, axis=1, keepdims=True) + 1e-10)
                input_mask = tf.cast(input_mask, tf.float32)
                # Sentence-vector pooling: a masked mean over the token
                # embeddings, where input_mask zeroes out padded positions
                # before averaging (the original comment likened this to a
                # convolution with input_mask as the kernel).
                pooled = masked_reduce_mean(encoder_layer, input_mask)
                pooled = tf.identity(pooled, 'final_encodes')

            output_tensors = [pooled]
            tmp_g = tf.get_default_graph().as_graph_def()

        # allow_soft_placement: automatically pick a usable device.
        config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=config) as sess:
            logger.info('load parameters from checkpoint...')
            sess.run(tf.global_variables_initializer())
            logger.info('freeze...')
            tmp_g = tf.graph_util.convert_variables_to_constants(
                sess, tmp_g, [n.name[:-2] for n in output_tensors])
            dtypes = [n.dtype for n in input_tensors]
            logger.info('optimize...')
            tmp_g = optimize_for_inference(
                tmp_g,
                [n.name[:-2] for n in input_tensors],
                [n.name[:-2] for n in output_tensors],
                [dtype.as_datatype_enum for dtype in dtypes],
                False)

        tmp_file = args.graph_file
        logger.info('write graph to a tmp file: %s' % tmp_file)
        with tf.gfile.GFile(tmp_file, 'wb') as f:
            f.write(tmp_g.SerializeToString())
        return tmp_file
    except Exception as e:
        logger.error('fail to optimize the graph!')
        logger.error(e)
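# Sketch of consuming the frozen graph written by optimize_graph; an
# assumption, not part of the original snippet. The tensor names come from the
# placeholders and the tf.identity(..., 'final_encodes') above; 'graph.tmp' is
# a placeholder for args.graph_file, and 128 stands in for args.max_seq_len.
import numpy as np

graph_def = tf.GraphDef()
with tf.gfile.GFile('graph.tmp', 'rb') as f:
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as g:
    tf.import_graph_def(graph_def, name='')
    ids = g.get_tensor_by_name('input_ids:0')
    mask = g.get_tensor_by_name('input_mask:0')
    type_ids = g.get_tensor_by_name('input_type_ids:0')
    encodes = g.get_tensor_by_name('final_encodes:0')

with tf.Session(graph=g) as sess:
    vec = sess.run(encodes, feed_dict={
        ids: np.zeros((1, 128), dtype=np.int32),   # token ids (all [PAD] here)
        mask: np.ones((1, 128), dtype=np.int32),
        type_ids: np.zeros((1, 128), dtype=np.int32)})
    print(vec.shape)  # (1, hidden_size * len(args.layer_indexes))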