def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    output = model(inputs, targets)

    # When in prediction mode, the labels/targets are None, and the model
    # output is the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT, predictions=output)

    logits = output

    # Calculate model loss.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params.label_smoothing, params.vocab_size)
    loss = tf.reduce_sum(xentropy * weights) / tf.reduce_sum(weights)

    if mode == tf.estimator.ModeKeys.EVAL:
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op = get_train_op(loss, params)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
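# A minimal sketch (an assumption, not part of the snippet above) of how a
# model_fn like this is handed to an Estimator. The model_dir and step count
# are illustrative; train_input_fn/eval_input_fn are assumed to yield
# (features, labels) batches of token ids.
def run_training_sketch(model_fn, params, train_input_fn, eval_input_fn,
                        model_dir="/tmp/transformer"):
    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, params=params)
    estimator.train(input_fn=train_input_fn, steps=1000)
    return estimator.evaluate(input_fn=eval_input_fn)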
def get_model(vocab_size_source, vocab_size_target):
    """Builds and returns the Transformer model."""
    transformer = _transformer.Transformer(_config.num_layers,
                                           _config.d_model,
                                           _config.num_heads,
                                           _config.dff,
                                           vocab_size_source + 1,
                                           vocab_size_target + 1,
                                           pe_input=vocab_size_source + 1,
                                           pe_target=vocab_size_target + 1,
                                           rate=_config.dropout_rate)
    return transformer
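# Hedged usage sketch for get_model. The vocabulary sizes are made up; the
# "+ 1" offsets inside get_model presumably reserve one extra id (e.g. for
# padding), which is why callers pass the raw vocabulary sizes here.
vocab_size_source = 8000  # assumed source subword vocabulary size
vocab_size_target = 8000  # assumed target subword vocabulary size
model = get_model(vocab_size_source, vocab_size_target)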
def initialize_network(self):
    de2idx, idx2de = load_de_vocab(self.min_cnt)
    en2idx, idx2en = load_en_vocab(self.min_cnt)
    source_word_count = len(en2idx)
    target_word_count = len(de2idx)
    self.model = transformer.Transformer(self.batch_size, source_word_count,
                                         target_word_count, self.max_len)
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    logits = model(inputs, targets)

    # When in prediction mode, the labels/targets are None, and the model
    # output is the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(logits)
          })

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits
    # is not a problem when computing xentropy, because
    # padded_cross_entropy_loss resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in
    # the targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    if mode == tf.estimator.ModeKeys.EVAL:
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
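# record_scalars is referenced above but not shown. A plausible minimal
# version (an assumption, not the original helper) would simply emit every
# training metric as a TensorBoard scalar summary:
def record_scalars(metric_dict):
    for key, value in metric_dict.items():
        tf.summary.scalar(name=key, tensor=value)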
def create_model(s_ids, t_ids, mode, config):
    eos_id = config.eos_id
    with tf.variable_scope('model'):
        model = transformer.Transformer(config,
                                        mode == tf.estimator.ModeKeys.TRAIN)
        logits = model(s_ids, t_ids, eos_id)

    with tf.variable_scope("loss"):
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, t_ids, config.label_smoothing, config.vocab_size)
        # Compute the weighted mean of the cross entropy losses.
        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    return loss
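# The weighted-mean reduction in create_model averages the per-token loss
# over nonpadding positions only. A tiny numpy sketch of the same reduction,
# with made-up values and assuming padded_cross_entropy_loss already zeroes
# the loss at <PAD> positions:
import numpy as np

xentropy = np.array([[2.0, 1.0, 0.5],
                     [1.5, 0.0, 0.0]])  # per-token loss, zero on <PAD>
weights = np.array([[1.0, 1.0, 1.0],
                    [1.0, 0.0, 0.0]])   # 0.0 marks padding positions
loss = xentropy.sum() / weights.sum()   # 5.0 / 4.0 = 1.25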
def release_model(**kwargs):
    release_dir = kwargs.get("release_dir", './release')
    restore_dir = kwargs.get('restore_dir', './out')
    if not os.path.isdir(release_dir):
        print("Create release dir: {}".format(release_dir))
        os.mkdir(release_dir)
    for file in glob.glob(os.path.join(release_dir, '*')):
        print("Remove previous file: {}".format(file))
        os.remove(file)

    # Model checkpoint and variable files written by the release.
    release_model_file = os.path.join(release_dir, 'model.ckpt')
    release_var_file = os.path.join(release_dir, 'var.pkl')

    # Files to restore from.
    restore_step = kwargs.get('steps')
    if restore_step:
        restore_model_file = os.path.join(
            restore_dir, 'model.ckpt-{}'.format(restore_step))
    else:
        restore_model_file = tf.train.get_checkpoint_state(
            restore_dir).model_checkpoint_path
    restore_var_file = os.path.join(restore_dir, 'options.pkl')

    with open(restore_var_file, 'rb') as f:
        options = pickle.load(f)
    basic_config = config.basic_config()
    basic_config.__dict__.update(options)
    basic_config.beam_size = 2

    g = tf.Graph()
    with g.as_default():
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            input_ids = tf.placeholder(tf.int64, [None, None],
                                       name='input_ids')
            with tf.variable_scope('model'):
                model = transformer.Transformer(basic_config, False)
                out_res = model(input_ids, eos_id=basic_config.eos_id)
                top_decoded_ids = out_res['outputs']
                scores = out_res['scores']
            saver = tf.train.Saver()
            saver.restore(sess, restore_model_file)
            saver.save(sess, release_model_file)
            _vars = {'input_ids': input_ids.name,
                     'decode_ids': top_decoded_ids.name,
                     'scores': scores.name}
            with open(release_var_file, 'wb') as f:
                pickle.dump((_vars, options), f, -1)
    print("Done!")
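# A hedged sketch of reading back a model written by release_model. The
# tensor names come from the _vars dict pickled above; the file layout
# mirrors the release code (saver.save also writes model.ckpt.meta).
def load_released_model(release_dir='./release'):
    with open(os.path.join(release_dir, 'var.pkl'), 'rb') as f:
        _vars, options = pickle.load(f)
    sess = tf.Session()
    saver = tf.train.import_meta_graph(
        os.path.join(release_dir, 'model.ckpt.meta'))
    saver.restore(sess, os.path.join(release_dir, 'model.ckpt'))
    graph = tf.get_default_graph()
    input_ids = graph.get_tensor_by_name(_vars['input_ids'])
    decode_ids = graph.get_tensor_by_name(_vars['decode_ids'])
    scores = graph.get_tensor_by_name(_vars['scores'])
    return sess, input_ids, decode_ids, scores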
def main(unused_argv):
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    if FLAGS.params == "base":
        params = model_params.TransformerBaseParams
    elif FLAGS.params == "big":
        params = model_params.TransformerBigParams
    else:
        raise ValueError("Invalid parameter set defined: %s. "
                         "Expected 'base' or 'big'." % FLAGS.params)

    # Set up estimator and params.
    params.beam_size = _BEAM_SIZE
    params.alpha = _ALPHA
    params.extra_decode_length = _EXTRA_DECODE_LENGTH
    params.frozen_graph = None

    input_shape = [None, None]
    input_tokens = tf.compat.v1.placeholder(tf.int64, input_shape,
                                            name='input_tokens')
    with tf.compat.v1.variable_scope("model"):
        model = transformer.Transformer(params, False)
        output = model(input_tokens)

    # Restore variables from checkpoint.
    sess = tf.compat.v1.Session()
    latest_model = tf.train.latest_checkpoint(FLAGS.model_dir)
    saver = tf.compat.v1.train.Saver()
    saver.restore(sess, latest_model)

    # Freeze the graph.
    graph_def = sess.graph.as_graph_def()
    output_names = [
        'model/Transformer/strided_slice_15',
        'model/Transformer/strided_slice_16'
    ]
    graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
        sess, graph_def, output_names)
    print("pb_path is", FLAGS.pb_path)
    with tf.compat.v1.gfile.GFile(FLAGS.pb_path, 'wb') as pb_file:
        pb_file.write(graph_def.SerializeToString())
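# Companion sketch: loading the frozen graph written by main() and running a
# translation. The output tensor name mirrors output_names above; token_ids
# is an illustrative [batch, length] array of int64 ids.
def run_frozen_graph(pb_path, token_ids):
    graph_def = tf.compat.v1.GraphDef()
    with tf.compat.v1.gfile.GFile(pb_path, 'rb') as f:
        graph_def.ParseFromString(f.read())
    with tf.compat.v1.Session() as sess:
        tf.import_graph_def(graph_def, name='')
        outputs = sess.graph.get_tensor_by_name(
            'model/Transformer/strided_slice_15:0')
        return sess.run(outputs, {'input_tokens:0': token_ids})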
def train_schedule(train_eval_iterations, single_iteration_train_steps, params,
                   bleu_source=None, bleu_ref=None, bleu_threshold=None):
    """Train and evaluate the model.

    :param train_eval_iterations: Number of times to repeat the train-eval
        iteration.
    :param single_iteration_train_steps: Number of steps to train in one
        iteration.
    :param bleu_source: File containing text to be translated for BLEU
        calculation.
    :param bleu_ref: File containing reference translations for BLEU
        calculation.
    :param bleu_threshold: Minimum BLEU score before training is stopped.
    """
    print('Training schedule:')
    print('\t1. Train for %d iterations.' % train_eval_iterations)
    print('\t2. Each iteration runs for %d steps.'
          % single_iteration_train_steps)
    print('\t3. Compute BLEU score.')
    # if bleu_threshold is not None:
    #     print("Repeat above steps until the BLEU score reaches",
    #           bleu_threshold)
    #     train_eval_iterations = INF
    # else:
    #     print("Repeat above steps %d times." % train_eval_iterations)

    # Loop training/evaluation/BLEU cycles.
    subtokenizer = tokenizer.Subtokenizer(vocab_file='vocab.ende.32768')
    dataset_train = dataset.TranslationDataset(dir_lang1='wmt32k-train.lang1',
                                               dir_lang2='wmt32k-train.lang2',
                                               subtokenizer=subtokenizer)
    global_step = 0
    best_bleu_score = 0
    net = transformer.Transformer(params=params, train=1)
    net.initialize(init=init.Xavier(), ctx=ctx, force_reinit=True)
    learning_rate = get_learning_rate(params.learning_rate, params.hidden_size,
                                      params.learning_rate_warmup_steps,
                                      global_step)
    optimizer = mx.optimizer.Adam(learning_rate=learning_rate,
                                  beta1=params.optimizer_adam_beta1,
                                  beta2=params.optimizer_adam_beta2,
                                  epsilon=params.optimizer_adam_epsilon)
    trainer = gluon.Trainer(net.collect_params(), optimizer=optimizer)
    bleu_score_file = open('bleu_score_file', 'w+')
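# get_learning_rate is not shown here. A hedged sketch of the standard
# "Attention Is All You Need" warmup schedule it presumably implements:
# scale by hidden_size**-0.5, grow linearly for warmup_steps, then decay
# with the inverse square root of the step.
def get_learning_rate_sketch(base_lr, hidden_size, warmup_steps, step):
    step = max(float(step), 1.0)
    warmup_steps = float(warmup_steps)
    scale = hidden_size ** -0.5
    return base_lr * scale * min(step ** -0.5, step * warmup_steps ** -1.5)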
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model."""
    if params.frozen_graph and mode == tf.estimator.ModeKeys.PREDICT:
        print("Reading frozen graph from pb file", flush=True)
        input_map = {'input_tokens': features}
        output_names = [
            'model/Transformer/strided_slice_15',
            'model/Transformer/strided_slice_16'
        ]
        with tf.io.gfile.GFile(params.frozen_graph, "rb") as f:
            graph_def = tf.compat.v1.GraphDef()
            graph_def.ParseFromString(f.read())
        tf.graph_util.import_graph_def(graph_def, input_map, output_names,
                                       name="")
        output_tensors = [
            tf.compat.v1.get_default_graph().get_tensor_by_name(name + ":0")
            for name in output_names
        ]
        output = {'outputs': output_tensors[0], 'scores': output_tensors[1]}
        return tf.estimator.EstimatorSpec(tf.estimator.ModeKeys.PREDICT,
                                          predictions=output)
    else:
        with tf.compat.v1.variable_scope("model"):
            inputs, targets = features, labels

            # Create model and get output logits.
            model = transformer.Transformer(
                params, mode == tf.estimator.ModeKeys.TRAIN)
            output = model(inputs, targets)

            # When in prediction mode, the labels/targets are None, and the
            # model output is the prediction.
            if mode == tf.estimator.ModeKeys.PREDICT:
                return tf.estimator.EstimatorSpec(
                    tf.estimator.ModeKeys.PREDICT, predictions=output)

            logits = output

            # Calculate model loss.
            xentropy, weights = metrics.padded_cross_entropy_loss(
                logits, targets, params.label_smoothing, params.vocab_size)
            loss = (tf.reduce_sum(input_tensor=xentropy * weights) /
                    tf.reduce_sum(input_tensor=weights))

            if mode == tf.estimator.ModeKeys.EVAL:
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss,
                    predictions={"predictions": logits},
                    eval_metric_ops=metrics.get_eval_metrics(
                        logits, labels, params))
            else:
                train_op = get_train_op(loss, params)
                logging_hook = tf.compat.v1.train.LoggingTensorHook(
                    {"loss": loss}, every_n_iter=FLAGS.print_iter)
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, train_op=train_op,
                    training_hooks=[logging_hook])
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)

    # In predict mode, the call returns a dictionary:
    #   {"outputs": [batch_size, decoded_length],
    #    "scores":  [batch_size] float}
    # Otherwise it returns a float32 tensor with shape
    #   [batch_size, target_length, vocab_size].
    logits = model(inputs, targets)

    # When in prediction mode, the labels/targets are None, and the model
    # output is the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(logits)
          })

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits
    # is not a problem when computing xentropy, because
    # padded_cross_entropy_loss resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in
    # the targets.
    # During training, the loss weights for labels equal to 0 (i.e. <PAD>)
    # are set to 0.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # Host call functions should only have tensors as arguments.
        # This function pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        def metric_fn(logits, labels):
          return metrics.get_eval_metrics(logits, labels, params=params)

        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/"))
      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
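# Because this model_fn declares an export output named "translate", the
# trained Estimator can be exported as a SavedModel for serving. A hedged
# sketch of the receiver fn, assuming inputs are [batch, length] int64
# token ids:
def serving_input_receiver_fn():
    inputs = tf.placeholder(tf.int64, shape=[None, None], name='inputs')
    return tf.estimator.export.ServingInputReceiver(
        features=inputs, receiver_tensors={'inputs': inputs})

# Illustrative export call (path is made up):
# estimator.export_saved_model('/tmp/transformer_export',
#                              serving_input_receiver_fn)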