def get_variable_initializer(hparams): """Get variable initializer from hparams.""" if not hparams.initializer: return None mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_INITIALIZER_GAIN, value=hparams.initializer_gain, hparams=hparams) if not tf.contrib.eager.in_eager_mode(): tf.logging.info("Using variable initializer: %s", hparams.initializer) if hparams.initializer == "orthogonal": return tf.orthogonal_initializer(gain=hparams.initializer_gain) elif hparams.initializer == "uniform": max_val = 0.1 * hparams.initializer_gain return tf.random_uniform_initializer(-max_val, max_val) elif hparams.initializer == "normal_unit_scaling": return tf.variance_scaling_initializer( hparams.initializer_gain, mode="fan_avg", distribution="normal") elif hparams.initializer == "uniform_unit_scaling": return tf.variance_scaling_initializer( hparams.initializer_gain, mode="fan_avg", distribution="uniform") elif hparams.initializer == "xavier": return tf.contrib.layers.xavier_initializer() else: raise ValueError("Unrecognized initializer: %s" % hparams.initializer)
def train(self, max_steps=None): mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP) mlperf_log.transformer_print(key=mlperf_log.TRAIN_EPOCH, value=0) self._estimator.train( self._train_spec.input_fn, hooks=self._train_spec.hooks, max_steps=max_steps or self._train_spec.max_steps)
def generate_files(generator, output_filenames, max_cases=None, cycle_every_n=1): """Generate cases from a generator and save as TFRecord files. Generated cases are transformed to tf.Example protos and saved as TFRecords in sharded files named output_dir/output_name-00..N-of-00..M=num_shards. Args: generator: a generator yielding (string -> int/float/str list) dictionaries. output_filenames: List of output file paths. max_cases: maximum number of cases to get from the generator; if None (default), we use the generator until StopIteration is raised. cycle_every_n: how many cases from the generator to take before switching to the next shard; by default set to 1, switch every case. """ if outputs_exist(output_filenames): tf.logging.info("Skipping generator because outputs files exists at {}" .format(output_filenames)) return tmp_filenames = [fname + ".incomplete" for fname in output_filenames] num_shards = len(output_filenames) # Check if is training or eval, ref: train_data_filenames(). if num_shards > 0: if "-train" in output_filenames[0]: tag = "train" elif "-dev" in output_filenames[0]: tag = "eval" else: tag = "other" writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filenames] counter, shard = 0, 0 for case in generator: if case is None: continue if counter % 100000 == 0: tf.logging.info("Generating case %d." % counter) counter += 1 if max_cases and counter > max_cases: break example = to_example(case) writers[shard].write(example.SerializeToString()) if counter % cycle_every_n == 0: shard = (shard + 1) % num_shards for writer in writers: writer.close() for tmp_name, final_name in zip(tmp_filenames, output_filenames): tf.gfile.Rename(tmp_name, final_name) if num_shards > 0: if tag == "train": mlperf_log.transformer_print( key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES, value=counter) elif tag == "eval": mlperf_log.transformer_print( key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=counter) tf.logging.info("Generated %s Examples", counter)
def generate_dataset_and_shuffle(train_gen, train_paths, dev_gen, dev_paths, shuffle=True): generate_files(train_gen, train_paths) generate_files(dev_gen, dev_paths) mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER) if shuffle: shuffle_dataset(train_paths + dev_paths)
def learning_rate_schedule(hparams): """Learning rate schedule based on hparams.""" mlperf_log.transformer_print(key=mlperf_log.OPT_LR, deferred=True) mlperf_log.transformer_print( key=mlperf_log.OPT_LR_WARMUP_STEPS, value=hparams.learning_rate_warmup_steps) step_num = _global_step(hparams) schedule_string = hparams.learning_rate_schedule names = schedule_string.split("*") names = [name.strip() for name in names if name.strip()] ret = tf.constant(1.0) for name in names: ret *= learning_rate_factor(name, step_num, hparams) return ret
def create_estimator(model_name, hparams, run_config, schedule="train_and_evaluate", decode_hparams=None, use_tpu=False, use_tpu_estimator=False, use_xla=False): """Create a T2T Estimator.""" model_fn = t2t_model.T2TModel.make_estimator_model_fn( model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu) del use_xla if use_tpu or use_tpu_estimator: problem = hparams.problem batch_size = ( problem.tpu_batch_size_per_shard(hparams) * run_config.tpu_config.num_shards) mlperf_log.transformer_print( key=mlperf_log.INPUT_BATCH_SIZE, value=batch_size) if getattr(hparams, "mtf_mode", False): batch_size = problem.tpu_batch_size_per_shard(hparams) predict_batch_size = batch_size if decode_hparams and decode_hparams.batch_size: predict_batch_size = decode_hparams.batch_size if decode_hparams and run_config.tpu_config: decode_hparams.add_hparam("iterations_per_loop", run_config.tpu_config.iterations_per_loop) estimator = tf.contrib.tpu.TPUEstimator( model_fn=model_fn, model_dir=run_config.model_dir, config=run_config, use_tpu=use_tpu, train_batch_size=batch_size, eval_batch_size=batch_size if "eval" in schedule else None, predict_batch_size=predict_batch_size) else: estimator = tf.estimator.Estimator( model_fn=model_fn, model_dir=run_config.model_dir, config=run_config, ) return estimator
def train_eval_and_decode(self): """Does eval and decode after training every eval_freq_in_steps.""" eval_steps = self._hparams.eval_freq_in_steps packed_dataset = "_packed" in self._hparams.problem.name mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP) for i in range(0, self._train_spec.max_steps, eval_steps): mlperf_log.transformer_print( key=mlperf_log.TRAIN_EPOCH, value=i // eval_steps) if packed_dataset and i > 0: problem = registry.problem(self._hparams.problem.name + "_packed") p_hparams = problem.get_hparams(self._hparams) self._hparams.problem = problem self._hparams.problem_hparams = p_hparams self._estimator.train( self._train_spec.input_fn, steps=eval_steps, hooks=self._train_spec.hooks) self._estimator.evaluate( self._eval_spec.input_fn, steps=self._eval_spec.steps, hooks=self._eval_spec.hooks) if packed_dataset: problem = registry.problem( self._hparams.problem.name.replace("_packed", "")) p_hparams = problem.get_hparams(self._hparams) self._hparams.problem = problem self._hparams.problem_hparams = p_hparams mlperf_log.transformer_print(key=mlperf_log.EVAL_START) if self._hparams.mlperf_mode: self._decode_hparams.mlperf_decode_step = i + eval_steps self.decode(dataset_split=tf.estimator.ModeKeys.EVAL) d_hparams = self._decode_hparams if self._hparams.mlperf_mode and d_hparams.mlperf_success: mlperf_log.transformer_print( key=mlperf_log.RUN_STOP, value={"success": "true"}) break d_hparams = self._decode_hparams if self._hparams.mlperf_mode and not d_hparams.mlperf_success: mlperf_log.transformer_print( key=mlperf_log.RUN_STOP, value={"success": "false"})
def continuous_decode_on_eval_data(self): """Decode from dataset on new checkpoint.""" if self._hparams.mlperf_mode: ckpt_generator = next_undecoded_checkpoint(self._hparams.model_dir) else: ckpt_generator = next_checkpoint(self._hparams.model_dir) for ckpt in ckpt_generator: current_step = int(os.path.basename(ckpt).split("-")[1]) tf.logging.info("Decoding step %d" % current_step) # Skip checkpoint 0. if current_step == 0: continue # Decode the latest checkpoint by default. checkpoint_path = None if self._hparams.mlperf_mode: self._decode_hparams.mlperf_decode_step = current_step checkpoint_path = ckpt mlperf_log.transformer_print(key=mlperf_log.EVAL_START) self.decode( dataset_split=tf.estimator.ModeKeys.EVAL, checkpoint_path=checkpoint_path) d_hparams = self._decode_hparams if self._hparams.mlperf_mode and d_hparams.mlperf_success: mlperf_log.transformer_print( key=mlperf_log.RUN_STOP, value={"success": "true"}) break d_hparams = self._decode_hparams if self._hparams.mlperf_mode and not d_hparams.mlperf_success: mlperf_log.transformer_print( key=mlperf_log.RUN_STOP, value={"success": "false"})
def corpus_token_counts( text_filepattern, corpus_max_lines, split_on_newlines=True): """Read the corpus and compute a dictionary of token counts. Args: text_filepattern: A pattern matching one or more files. corpus_max_lines: An integer; maximum total lines to read. split_on_newlines: A boolean. If true, then split files by lines and strip leading and trailing whitespace from each line. Otherwise, treat each file as a single string. Returns: a dictionary mapping token to count. """ counts = collections.Counter() for doc in _read_filepattern( text_filepattern, max_lines=corpus_max_lines, split_on_newlines=split_on_newlines): counts.update(encode(_native_to_unicode(doc))) mlperf_log.transformer_print( key=mlperf_log.PREPROC_VOCAB_SIZE, value=len(counts)) return counts
def continuous_decode_on_eval_data(self): """Decode from dataset on new checkpoint.""" for ckpt in next_checkpoint(self._hparams.model_dir): current_step = int(os.path.basename(ckpt).split("-")[1]) # Skip checkpoint 0. if current_step == 0: continue mlperf_log.transformer_print(key=mlperf_log.EVAL_START) self.decode(dataset_split=tf.estimator.ModeKeys.EVAL) d_hparams = self._decode_hparams if d_hparams.mlperf_mode and d_hparams.mlperf_success: mlperf_log.transformer_print(key=mlperf_log.RUN_STOP, value={"success": "true"}) break d_hparams = self._decode_hparams if d_hparams.mlperf_mode and not d_hparams.mlperf_success: mlperf_log.transformer_print(key=mlperf_log.RUN_STOP, value={"success": "false"})
def main(argv): tf.logging.set_verbosity(tf.logging.INFO) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) # If we just have to print the registry, do that and exit early. maybe_log_registry_and_exit() # Create HParams. if argv: set_hparams_from_args(argv[1:]) if FLAGS.schedule != "run_std_server": hparams = create_hparams() if FLAGS.gpu_automatic_mixed_precision: setattr(hparams, "gpu_automatic_mixed_precision", True) if FLAGS.schedule == "train" or FLAGS.schedule == "train_eval_and_decode": mlperf_log.transformer_print(key=mlperf_log.RUN_START, hparams=hparams) if FLAGS.schedule == "run_std_server": run_std_server() mlperf_log.transformer_print( key=mlperf_log.RUN_SET_RANDOM_SEED, value=FLAGS.random_seed, hparams=hparams) trainer_lib.set_random_seed(FLAGS.random_seed) if FLAGS.cloud_mlengine: cloud_mlengine.launch() return if FLAGS.generate_data: generate_data() if cloud_mlengine.job_dir(): FLAGS.output_dir = cloud_mlengine.job_dir() exp_fn = create_experiment_fn() exp = exp_fn(create_run_config(hparams), hparams) if is_chief(): save_metadata(hparams) execute_schedule(exp) if FLAGS.schedule != "train": mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL, hparams=hparams)
def compute_bleu_summaries(hook_args): """Compute BLEU core summaries using the decoder output. Args: hook_args: DecodeHookArgs namedtuple Returns: A list of tf.Summary values if hook_args.hparams contains the reference file and the translated file. """ decode_hparams = hook_args.decode_hparams estimator = hook_args.estimator current_step = estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP) has_iters = hasattr(decode_hparams, "iterations_per_loop") if current_step and has_iters and decode_hparams.iterations_per_loop: iterations_per_loop = decode_hparams.iterations_per_loop current_epoch = np.asscalar(current_step) // iterations_per_loop else: current_epoch = 0 if (decode_hparams.decode_reference is None or decode_hparams.decode_to_file is None): return None values = [] bleu = 100 * bleu_hook.bleu_wrapper(decode_hparams.decode_reference, decode_hparams.decode_to_file) values.append(tf.Summary.Value(tag="BLEU", simple_value=bleu)) tf.logging.info("%s: BLEU = %6.2f" % (decode_hparams.decode_to_file, bleu)) if decode_hparams.mlperf_mode: mlperf_log.transformer_print(key=mlperf_log.EVAL_TARGET, value=decode_hparams.mlperf_threshold) mlperf_log.transformer_print(key=mlperf_log.EVAL_ACCURACY, value={ "epoch": max(current_epoch - 1, 0), "value": bleu }) mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP) if bleu >= decode_hparams.mlperf_threshold: decode_hparams.set_hparam("mlperf_success", True) return values
def compute_bleu_summaries(hook_args): """Compute BLEU core summaries using the decoder output. Args: hook_args: DecodeHookArgs namedtuple Returns: A list of tf.Summary values if hook_args.hparams contains the reference file and the translated file. """ decode_hparams = hook_args.decode_hparams if (decode_hparams.decode_reference is None or decode_hparams.decode_to_file is None): return None values = [] bleu = 100 * bleu_hook.bleu_wrapper(decode_hparams.decode_reference, decode_hparams.decode_to_file) values.append(tf.Summary.Value(tag="BLEU", simple_value=bleu)) tf.logging.info("%s: BLEU = %6.2f" % (decode_hparams.decode_to_file, bleu)) if hook_args.hparams.mlperf_mode: current_step = decode_hparams.mlperf_decode_step mlperf_log.transformer_print(key=mlperf_log.EVAL_TARGET, value=decode_hparams.mlperf_threshold) mlperf_log.transformer_print( key=mlperf_log.EVAL_ACCURACY, value={ "epoch": max(current_step // decode_hparams.iterations_per_loop - 1, 0), "value": bleu }) mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP) if bleu >= decode_hparams.mlperf_threshold: decode_hparams.set_hparam("mlperf_success", True) return values
def compute_bleu_summaries(hook_args): """Compute BLEU core summaries using the decoder output. Args: hook_args: DecodeHookArgs namedtuple Returns: A list of tf.Summary values if hook_args.hparams contains the reference file and the translated file. """ decode_hparams = hook_args.decode_hparams if (decode_hparams.decode_reference is None or decode_hparams.decode_to_file is None): return None values = [] bleu = 100 * bleu_hook.bleu_wrapper( decode_hparams.decode_reference, decode_hparams.decode_to_file) values.append(tf.Summary.Value(tag="BLEU", simple_value=bleu)) tf.logging.info("%s: BLEU = %6.2f" % (decode_hparams.decode_to_file, bleu)) if hook_args.hparams.mlperf_mode: current_step = decode_hparams.mlperf_decode_step mlperf_log.transformer_print( key=mlperf_log.EVAL_TARGET, value=decode_hparams.mlperf_threshold) mlperf_log.transformer_print( key=mlperf_log.EVAL_ACCURACY, value={ "epoch": max(current_step // decode_hparams.iterations_per_loop - 1, 0), "value": bleu }) mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP) if bleu >= decode_hparams.mlperf_threshold: decode_hparams.set_hparam("mlperf_success", True) return values
def continuous_decode_on_eval_data(self): """Decode from dataset on new checkpoint.""" if self._hparams.mlperf_mode: ckpt_generator = next_undecoded_checkpoint( self._hparams.model_dir, self._decode_hparams.decode_timeout_mins) else: ckpt_generator = next_checkpoint( self._hparams.model_dir, self._decode_hparams.decode_timeout_mins) for ckpt in ckpt_generator: current_step = decoding.get_step_from_ckpt_path(ckpt) tf.logging.info("Decoding step %d" % current_step) # Skip checkpoint 0. if current_step == 0: continue # Decode the latest checkpoint by default. checkpoint_path = None if self._hparams.mlperf_mode: self._decode_hparams.mlperf_decode_step = current_step checkpoint_path = ckpt mlperf_log.transformer_print(key=mlperf_log.EVAL_START) self.decode(dataset_split=tf.estimator.ModeKeys.EVAL, checkpoint_path=checkpoint_path) d_hparams = self._decode_hparams if self._hparams.mlperf_mode and d_hparams.mlperf_success: mlperf_log.transformer_print(key=mlperf_log.RUN_STOP, value={"success": "true"}) break d_hparams = self._decode_hparams if self._hparams.mlperf_mode and not d_hparams.mlperf_success: mlperf_log.transformer_print(key=mlperf_log.RUN_STOP, value={"success": "false"})
def main(argv): tf.logging.set_verbosity(tf.logging.INFO) hparams = create_hparams() if FLAGS.schedule == "train" or FLAGS.schedule == "train_eval_and_decode": mlperf_log.transformer_print(key=mlperf_log.RUN_START, mlperf_mode=hparams.mlperf_mode) if FLAGS.schedule == "run_std_server": run_std_server() mlperf_log.transformer_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=FLAGS.random_seed, mlperf_mode=hparams.mlperf_mode) trainer_lib.set_random_seed(FLAGS.random_seed) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) maybe_log_registry_and_exit() if FLAGS.cloud_mlengine: cloud_mlengine.launch() return if FLAGS.generate_data: generate_data() if cloud_mlengine.job_dir(): FLAGS.output_dir = cloud_mlengine.job_dir() if argv: set_hparams_from_args(argv[1:]) exp_fn = create_experiment_fn() exp = exp_fn(create_run_config(hparams), hparams) if is_chief(): save_metadata(hparams) execute_schedule(exp) if FLAGS.schedule != "train": mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL, mlperf_mode=hparams.mlperf_mode)
def input_fn(self, mode, hparams, data_dir=None, params=None, config=None, force_repeat=False, prevent_repeat=False, dataset_kwargs=None): """Builds input pipeline for problem. Args: mode: tf.estimator.ModeKeys hparams: HParams, model hparams data_dir: str, data directory; if None, will use hparams.data_dir params: dict, may include "batch_size" config: RunConfig; should have the data_parallelism attribute if not using TPU force_repeat: bool, whether to repeat the data even if not training prevent_repeat: bool, whether to not repeat when in training mode. Overrides force_repeat. dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset method when called Returns: (features_dict<str name, Tensor feature>, Tensor targets) """ partition_id, num_partitions = self._dataset_partition(mode, config) is_training = mode == tf.estimator.ModeKeys.TRAIN if config and config.use_tpu: num_threads = 64 else: num_threads = cpu_count() if is_training else 1 if config and hasattr(config, "data_parallelism") and config.data_parallelism: num_shards = config.data_parallelism.n else: num_shards = 1 max_length = self.max_length(hparams) mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH, value=max_length) def tpu_valid_size(example): return data_reader.example_valid_size(example, hparams.min_length, max_length) def gpu_valid_size(example): drop_long_sequences = is_training or hparams.eval_drop_long_sequences return data_reader.example_valid_size( example, hparams.min_length, max_length if drop_long_sequences else 10**9) def define_shapes(example): batch_size = config and config.use_tpu and params["batch_size"] return standardize_shapes(example, batch_size=batch_size) # Read and preprocess data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir) dataset_kwargs = dataset_kwargs or {} dataset_kwargs.update({ "mode": mode, "data_dir": data_dir, "num_threads": num_threads, "hparams": hparams, "partition_id": partition_id, "num_partitions": num_partitions, }) dataset = self.dataset(**dataset_kwargs) if (force_repeat or is_training) and not prevent_repeat: # Repeat and skip a random number of records dataset = dataset.repeat() if is_training and self.skip_random_fraction_when_training: data_files = tf.contrib.slim.parallel_reader.get_data_files( self.filepattern(data_dir, mode)) # In continuous_train_and_eval when switching between train and # eval, this input_fn method gets called multiple times and it # would give you the exact same samples from the last call # (because the Graph seed is set). So this skip gives you some # shuffling. dataset = skip_random_fraction(dataset, data_files[0]) dataset = dataset.map(data_reader.cast_ints_to_int32, num_parallel_calls=num_threads) if self.batch_size_means_tokens: batch_size_means_tokens = True else: if _are_shapes_fully_defined(dataset.output_shapes): batch_size_means_tokens = False else: tf.logging.warning( "Shapes are not fully defined. Assuming batch_size means tokens." ) batch_size_means_tokens = True # Batching if not batch_size_means_tokens: # Batch size means examples per datashard. if config and config.use_tpu: # on TPU, we use params["batch_size"], which specifies the number of # examples across all datashards batch_size = params["batch_size"] dataset = dataset.batch(batch_size, drop_remainder=True) else: batch_size = hparams.batch_size * num_shards dataset = dataset.batch(batch_size) else: # batch_size means tokens per datashard if config and config.use_tpu: dataset = dataset.filter(tpu_valid_size) padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams) # on TPU, we use params["batch_size"], which specifies the number of # examples across all datashards batch_size = params["batch_size"] if hparams.pad_batch: tf.logging.warn( "Padding the batch to ensure that remainder eval batches are " "processed. This may lead to incorrect metrics for " "non-zero-padded features, e.g. images. Use a smaller batch " "size that has no remainder in that case.") dataset = dataset.padded_batch(batch_size, padded_shapes, drop_remainder=False) dataset = dataset.map(functools.partial( pad_batch, batch_multiple=batch_size), num_parallel_calls=num_threads) else: dataset = dataset.padded_batch(batch_size, padded_shapes, drop_remainder=True) else: # On GPU, bucket by length dataset = dataset.filter(gpu_valid_size) batching_scheme = data_reader.hparams_to_batching_scheme( hparams, shard_multiplier=num_shards, length_multiplier=self.get_hparams().batch_size_multiplier) if hparams.use_fixed_batch_size: # Here batch_size really means examples per datashard. batching_scheme["batch_sizes"] = [hparams.batch_size] batching_scheme["boundaries"] = [] dataset = dataset.apply( tf.contrib.data.bucket_by_sequence_length( data_reader.example_length, batching_scheme["boundaries"], batching_scheme["batch_sizes"])) if not is_training: batch_multiple = num_shards if hparams.use_fixed_batch_size: # Make sure the last batch has the same fixed size as the rest. batch_multiple *= hparams.batch_size if batch_multiple > 1: tf.logging.warn( "Padding the batch to ensure that remainder eval batches have " "a batch size divisible by the number of data shards. This may " "lead to incorrect metrics for non-zero-padded features, e.g. " "images. Use a single datashard (i.e. 1 GPU) in that case." ) dataset = dataset.map(functools.partial( pad_batch, batch_multiple=batch_multiple), num_parallel_calls=num_threads) dataset = dataset.map(define_shapes, num_parallel_calls=num_threads) # Add shuffling for training batches. This is necessary along with record # level shuffling in the dataset generation. Record shuffling will shuffle # the examples. However, in some cases, it's possible that the shuffle # buffer size for record shuffling is smaller than the batch size. In such # cases, adding batch shuffling ensures that the data is in random order # during training if (is_training and hasattr(hparams, "batch_shuffle_size") and hparams.batch_shuffle_size): dataset = dataset.shuffle(hparams.batch_shuffle_size) def prepare_for_output(example): if not config or not config.use_tpu: _summarize_features(example, num_shards) if mode == tf.estimator.ModeKeys.PREDICT: example["infer_targets"] = example.pop("targets") return example else: return example, example["targets"] dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads) dataset = dataset.prefetch(2) if mode == tf.estimator.ModeKeys.PREDICT: # This is because of a bug in the Estimator that short-circuits prediction # if it doesn't see a QueueRunner. DummyQueueRunner implements the # minimal expected interface but does nothing. tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, data_reader.DummyQueueRunner()) return dataset
def __init__(self, optimizer_name, lr, hparams, use_tpu=False): # pylint: disable=super-init-not-called tf.logging.info("Using optimizer %s", optimizer_name) mlperf_log.transformer_print(key=mlperf_log.OPT_NAME, value=optimizer_name, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_BETA1, value=hparams.optimizer_adam_beta1, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_BETA2, value=hparams.optimizer_adam_beta2, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_EPSILON, value=hparams.optimizer_adam_epsilon, hparams=hparams) if optimizer_name == "Adam": # We change the default epsilon for Adam. # Using LazyAdam as it's much faster for large vocabulary embeddings. self._opt = tf.contrib.opt.LazyAdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "MultistepAdam": self._opt = multistep_optimizer.MultistepAdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon, n=hparams.optimizer_multistep_accumulate_steps) elif optimizer_name == "Momentum": self._opt = tf.train.MomentumOptimizer( lr, momentum=hparams.optimizer_momentum_momentum, use_nesterov=hparams.optimizer_momentum_nesterov) elif optimizer_name == "YellowFin": self._opt = yellowfin.YellowFinOptimizer( learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) elif optimizer_name == "TrueAdam": self._opt = tf.train.AdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "AdamW": # Openai gpt used weight decay. # Given the internals of AdamW, weight decay dependent on the # learning rate is chosen to match the openai implementation. # The weight decay update to each parameter is applied before the adam # gradients computation, which is different from that described # in the paper and in the openai implementation: # https://arxiv.org/pdf/1711.05101.pdf self._opt = tf.contrib.opt.AdamWOptimizer( 0.01*lr, lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "Adafactor": self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr) else: self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) if _mixed_precision_is_enabled(hparams): if not hparams.mixed_precision_optimizer_loss_scaler: tf.logging.warning("Using mixed precision without a loss scaler will " "likely cause numerical errors.") elif hparams.mixed_precision_optimizer_loss_scaler != "exponential": raise ValueError("Mixed precision training only supports the " "exponential loss scaler") else: tf.logging.info("Using Exponential Update Loss Scaler") manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager( init_loss_scale=2**15, incr_every_n_steps=2000, decr_every_n_nan_or_inf=2, incr_ratio=2, decr_ratio=0.5) self._opt = LossScaleOptimizer(self._opt, manager) self._zero_grads = hparams.optimizer_zero_grads
def train(self, max_steps=None): mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP) self._estimator.train(self._train_spec.input_fn, hooks=self._train_spec.hooks, max_steps=max_steps or self._train_spec.max_steps)
def decode_once(estimator, problem_name, hparams, infer_input_fn, decode_hp, decode_to_file, output_dir, log_results=True, checkpoint_path=None): """Decodes once.""" # Get the predictions as an iterable predictions = estimator.predict(infer_input_fn, checkpoint_path=checkpoint_path) if not log_results: return list(predictions) # Prepare output file writers if decode_to_file passed decode_to_file = decode_to_file or decode_hp.decode_to_file if decode_to_file: output_filepath = _decode_filename(decode_to_file, problem_name, decode_hp) parts = output_filepath.split(".") parts[-1] = "targets" target_filepath = ".".join(parts) parts[-1] = "inputs" input_filepath = ".".join(parts) output_file = tf.gfile.Open(output_filepath, "w") target_file = tf.gfile.Open(target_filepath, "w") input_file = tf.gfile.Open(input_filepath, "w") problem_hparams = hparams.problem_hparams # Inputs vocabulary is set to targets if there are no inputs in the problem, # e.g., for language models where the inputs are just a prefix of targets. has_input = "inputs" in problem_hparams.vocabulary inputs_vocab_key = "inputs" if has_input else "targets" inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key] targets_vocab = problem_hparams.vocabulary["targets"] num_eval_samples = 0 for num_predictions, prediction in enumerate(predictions): num_eval_samples += 1 num_predictions += 1 inputs = prediction.get("inputs") targets = prediction.get("targets") outputs = prediction.get("outputs") # Log predictions decoded_outputs = [] decoded_scores = [] if decode_hp.return_beams: output_beams = np.split(outputs, decode_hp.beam_size, axis=0) scores = None if "scores" in prediction: scores = np.split(prediction["scores"], decode_hp.beam_size, axis=0) for i, beam in enumerate(output_beams): tf.logging.info("BEAM %d:" % i) score = scores and scores[i] decoded = log_decode_results( inputs, beam, problem_name, num_predictions, inputs_vocab, targets_vocab, save_images=decode_hp.save_images, output_dir=output_dir, identity_output=decode_hp.identity_output, targets=targets, log_results=decode_hp.log_results) decoded_outputs.append(decoded) if decode_hp.write_beam_scores: decoded_scores.append(score) else: decoded = log_decode_results( inputs, outputs, problem_name, num_predictions, inputs_vocab, targets_vocab, save_images=decode_hp.save_images, output_dir=output_dir, identity_output=decode_hp.identity_output, targets=targets, log_results=decode_hp.log_results) decoded_outputs.append(decoded) # Write out predictions if decode_to_file passed if decode_to_file: for i, (d_input, d_output, d_target) in enumerate(decoded_outputs): # Skip if all padding if d_input and re.match("^({})+$".format(text_encoder.PAD), d_input): continue beam_score_str = "" if decode_hp.write_beam_scores: beam_score_str = "\t%.2f" % decoded_scores[i] output_file.write( str(d_output) + beam_score_str + decode_hp.delimiter) target_file.write(str(d_target) + decode_hp.delimiter) input_file.write(str(d_input) + decode_hp.delimiter) if (decode_hp.num_samples >= 0 and num_predictions >= decode_hp.num_samples): break mlperf_log.transformer_print(key=mlperf_log.EVAL_SIZE, value=num_eval_samples, hparams=hparams) if decode_to_file: output_file.close() target_file.close() input_file.close()
def __init__(self, optimizer_name, lr, hparams, use_tpu=False): # pylint: disable=super-init-not-called tf.logging.info("Using optimizer %s", optimizer_name) mlperf_log.transformer_print(key=mlperf_log.OPT_NAME, value=optimizer_name, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_BETA1, value=hparams.optimizer_adam_beta1, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_BETA2, value=hparams.optimizer_adam_beta2, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_EPSILON, value=hparams.optimizer_adam_epsilon, hparams=hparams) if optimizer_name == "Adam": # We change the default epsilon for Adam. # Using LazyAdam as it's much faster for large vocabulary embeddings. self._opt = tf.contrib.opt.LazyAdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "MultistepAdam": self._opt = multistep_optimizer.MultistepAdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon, n=hparams.optimizer_multistep_accumulate_steps) elif optimizer_name == "Momentum": self._opt = tf.train.MomentumOptimizer( lr, momentum=hparams.optimizer_momentum_momentum, use_nesterov=hparams.optimizer_momentum_nesterov) elif optimizer_name == "YellowFin": self._opt = yellowfin.YellowFinOptimizer( learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) elif optimizer_name == "TrueAdam": self._opt = tf.train.AdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "AdamW": # Openai gpt used weight decay. # Given the internals of AdamW, weight decay dependent on the # learning rate is chosen to match the openai implementation. # The weight decay update to each parameter is applied before the adam # gradients computation, which is different from that described # in the paper and in the openai implementation: # https://arxiv.org/pdf/1711.05101.pdf self._opt = tf.contrib.opt.AdamWOptimizer( 0.01*lr, lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "Adafactor": self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr) else: self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) self._zero_grads = hparams.optimizer_zero_grads
def transformer_ffn_layer(x, hparams, pad_remover=None): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparameters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] Raises: ValueError: If losses arg is None, but layer generates extra losses. """ ffn_layer = hparams.ffn_layer if ffn_layer != "dense_relu_dense": raise ValueError( "sparse transformer only supports dense_relu_dense ffn.") relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) # In simple convolution mode, use `pad_remover` to speed up processing. mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE, value={ "filter_size": hparams.filter_size, "use_bias": "True", "activation": mlperf_log.RELU }) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE, value={ "hidden_size": hparams.hidden_size, "use_bias": "True", }) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout) if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) initial_sparsity = None if hparams.get("load_masks_from"): initial_sparsity = hparams.get("initial_sparsity") conv_output = sparse_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=relu_dropout_broadcast_dims, sparsity_technique=hparams.get("sparsity_technique"), threshold=hparams.get("log_alpha_threshold"), training=hparams.get("mode") == tf_estimator.ModeKeys.TRAIN, clip_alpha=hparams.get("clip_log_alpha"), initial_sparsity=initial_sparsity) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output
def transformer_decode(decoder_function, decoder_input, encoder_output, encoder_decoder_attention_bias, decoder_self_attention_bias, hparams, attention_weights=None, cache=None, decode_loop_step=None, nonpadding=None, losses=None, **kwargs): """Decode Transformer outputs from encoder representation. Args: decoder_function: the decoder function decoder_input: inputs to bottom of the model. [batch_size, decoder_length, hidden_dim] encoder_output: Encoder representation. [batch_size, input_length, hidden_dim] encoder_decoder_attention_bias: Bias and mask weights for encoder-decoder attention. [batch_size, input_length] decoder_self_attention_bias: Bias and mask weights for decoder self-attention. [batch_size, decoder_length] hparams: hyperparameters for model. attention_weights: weight to store attention to. cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. decode_loop_step: An integer, step number of the decoding loop. Only used for inference on TPU. nonpadding: optional Tensor with shape [batch_size, decoder_length] losses: optional list onto which to append extra training losses **kwargs: additional arguments to pass to decoder_function Returns: Final decoder representation. [batch_size, decoder_length, hidden_dim] """ mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT, value=hparams.layer_prepostprocess_dropout, hparams=hparams) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.layer_prepostprocess_dropout) decoder_output = decoder_function(decoder_input, encoder_output, decoder_self_attention_bias, encoder_decoder_attention_bias, hparams, cache=cache, decode_loop_step=decode_loop_step, nonpadding=nonpadding, save_weights_to=attention_weights, losses=losses, **kwargs) if (common_layers.is_xla_compiled() and hparams.mode == tf.estimator.ModeKeys.TRAIN): # TPU does not react kindly to extra dimensions. # TODO(noam): remove this once TPU is more forgiving of extra dims. return decoder_output else: # Expand since t2t expects 4d tensors. return tf.expand_dims(decoder_output, axis=2)
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None, make_image_summary=True, losses=None): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover(efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: optional list onto which to append extra training losses Returns: y: a Tensors """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=hparams.num_encoder_layers or hparams.num_hidden_layers) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT, value=hparams.attention_dropout) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_ATTENTION_DENSE, value={ "use_bias": "false", "num_heads": hparams.num_heads, "hidden_size": hparams.hidden_size }) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover and not common_layers.is_xla_compiled(): pad_remover = expert_utils.PadRemover(padding) for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d")) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover, conv_padding="SAME", nonpadding_mask=nonpadding, losses=losses) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NORM, value={"hidden_size": hparams.hidden_size}) return common_layers.layer_preprocess(x, hparams)
def transformer_ffn_layer(x, hparams, pad_remover=None, conv_padding="LEFT", nonpadding_mask=None, losses=None, cache=None, decode_loop_step=None, readout_filter_size=0): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparameters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. needed for convolutional layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding. losses: optional list onto which to append extra training losses cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. decode_loop_step: An integer, step number of the decoding loop. Only used for inference on TPU. readout_filter_size: if it's greater than 0, then it will be used instead of filter_size Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] Raises: ValueError: If losses arg is None, but layer generates extra losses. """ ffn_layer = hparams.ffn_layer relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) if ffn_layer == "conv_hidden_relu": # Backwards compatibility ffn_layer = "dense_relu_dense" if ffn_layer == "dense_relu_dense": # In simple convolution mode, use `pad_remover` to speed up processing. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE, value={ "filter_size": hparams.filter_size, "use_bias": "True", "activation": mlperf_log.RELU }) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE, value={ "hidden_size": hparams.hidden_size, "use_bias": "True", }) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout) if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=relu_dropout_broadcast_dims) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif ffn_layer == "conv_relu_conv": return common_layers.conv_relu_conv( x, readout_filter_size or hparams.filter_size, hparams.hidden_size, first_kernel_size=hparams.conv_first_kernel, second_kernel_size=1, padding=conv_padding, nonpadding_mask=nonpadding_mask, dropout=hparams.relu_dropout, cache=cache, decode_loop_step=decode_loop_step) elif ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, readout_filter_size or hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( x, readout_filter_size or hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) elif ffn_layer == "sru": return common_layers.sru(x) elif ffn_layer == "local_moe_tpu": overhead = ( hparams.moe_overhead_train if hparams.mode == tf.estimator.ModeKeys.TRAIN else hparams.moe_overhead_eval) ret, loss = expert_utils.local_moe_tpu( x, hparams.filter_size // 2, hparams.hidden_size, hparams.moe_num_experts, overhead=overhead, loss_coef=hparams.moe_loss_coef) elif ffn_layer == "local_moe": overhead = ( hparams.moe_overhead_train if hparams.mode == tf.estimator.ModeKeys.TRAIN else hparams.moe_overhead_eval) ret, loss = expert_utils.local_moe( x, True, expert_utils.ffn_expert_fn(hparams.hidden_size, [hparams.filter_size], hparams.hidden_size), hparams.moe_num_experts, k=hparams.moe_k, hparams=hparams) losses.append(loss) return ret else: assert ffn_layer == "none" return x
def main(argv): tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.jax: # Setup trax FLAGS dataset = FLAGS.problem model = FLAGS.model data_dir = FLAGS.data_dir output_dir = FLAGS.output_dir config_file = [FLAGS.hparams_set] config = [ "train.train_steps=%d" % FLAGS.train_steps, "train.eval_steps=%d" % FLAGS.eval_steps, "train.eval_frequency=%d" % FLAGS.local_eval_frequency, ] + str(FLAGS.hparams).split(",") # Copied _setup_gin exactly from trax/trainer.py and removed "FLAGS." def _setup_gin(): """Setup gin configuration.""" # Imports for configurables # pylint: disable=g-import-not-at-top,unused-import,g-bad-import-order,reimported,unused-variable from tensor2tensor.trax import inputs as _trax_inputs from tensor2tensor.trax import models as _trax_models from tensor2tensor.trax import optimizers as _trax_opt # pylint: disable=g-import-not-at-top,unused-import,g-bad-import-order,reimported,unused-variable configs = config or [] # Override with --dataset and --model if dataset: configs.append("inputs.dataset_name='%s'" % dataset) configs.append("inputs.data_dir='%s'" % data_dir) configs.append("[email protected]") if model: configs.append("[email protected].%s" % model) gin.parse_config_files_and_bindings(config_file, configs) _setup_gin() trax.train(output_dir=output_dir) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) # If we just have to print the registry, do that and exit early. maybe_log_registry_and_exit() # Create HParams. if argv: set_hparams_from_args(argv[1:]) hparams = create_hparams() if FLAGS.schedule == "train" or FLAGS.schedule == "train_eval_and_decode": mlperf_log.transformer_print(key=mlperf_log.RUN_START, hparams=hparams) if FLAGS.schedule == "run_std_server": run_std_server() mlperf_log.transformer_print( key=mlperf_log.RUN_SET_RANDOM_SEED, value=FLAGS.random_seed, hparams=hparams) trainer_lib.set_random_seed(FLAGS.random_seed) if FLAGS.cloud_mlengine: cloud_mlengine.launch() return if FLAGS.generate_data: generate_data() if cloud_mlengine.job_dir(): FLAGS.output_dir = cloud_mlengine.job_dir() exp_fn = create_experiment_fn() exp = exp_fn(create_run_config(hparams), hparams) if is_chief(): save_metadata(hparams) execute_schedule(exp) if FLAGS.schedule != "train": mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL, hparams=hparams)
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None, make_image_summary=True, losses=None, attn_bias_for_padding=None): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover(efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: optional list onto which to append extra training losses attn_bias_for_padding: Padded attention bias in case a unidirectional encoder is being used where future attention is masked. Returns: y: a Tensors """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=hparams.num_encoder_layers or hparams.num_hidden_layers) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT, value=hparams.attention_dropout) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE, value={ "use_bias": "false", "num_heads": hparams.num_heads, "hidden_size": hparams.hidden_size }) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: attention_bias = encoder_self_attention_bias if attn_bias_for_padding is not None: attention_bias = attn_bias_for_padding padding = common_attention.attention_bias_to_padding( attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover and not common_layers.is_xla_compiled(): pad_remover = expert_utils.PadRemover(padding) for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): if layer < hparams.get("num_area_layers", 0): max_area_width = hparams.get("max_area_width", 1) max_area_height = hparams.get("max_area_height", 1) memory_height = hparams.get("memory_height", 1) else: max_area_width = 1 max_area_height = 1 memory_height = 1 y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d"), activation_dtype=hparams.get("activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32"), hard_attention_k=hparams.get("hard_attention_k", 0), gumbel_noise_weight=hparams.get( "gumbel_noise_weight", 0.0), max_area_width=max_area_width, max_area_height=max_area_height, memory_height=memory_height, area_key_mode=hparams.get("area_key_mode", "none"), area_value_mode=hparams.get("area_value_mode", "none"), training=(hparams.get("mode", tf.estimator.ModeKeys.TRAIN) == tf.estimator.ModeKeys.TRAIN)) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer(common_layers.layer_preprocess( x, hparams), hparams, pad_remover, conv_padding="SAME", nonpadding_mask=nonpadding, losses=losses) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NORM, value={"hidden_size": hparams.hidden_size}) return common_layers.layer_preprocess(x, hparams)
def __init__(self, optimizer_name, lr, hparams, use_tpu=False): # pylint: disable=super-init-not-called tf.logging.info("Using optimizer %s", optimizer_name) mlperf_log.transformer_print(key=mlperf_log.OPT_NAME, value=optimizer_name, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_BETA1, value=hparams.optimizer_adam_beta1, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_BETA2, value=hparams.optimizer_adam_beta2, hparams=hparams) mlperf_log.transformer_print( key=mlperf_log.OPT_HP_ADAM_EPSILON, value=hparams.optimizer_adam_epsilon, hparams=hparams) if optimizer_name == "Adam": # We change the default epsilon for Adam. # Using LazyAdam as it's much faster for large vocabulary embeddings. self._opt = tf.contrib.opt.LazyAdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "MultistepAdam": self._opt = multistep_optimizer.MultistepAdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon, n=hparams.optimizer_multistep_accumulate_steps) elif optimizer_name == "Momentum": self._opt = tf.train.MomentumOptimizer( lr, momentum=hparams.optimizer_momentum_momentum, use_nesterov=hparams.optimizer_momentum_nesterov) elif optimizer_name == "YellowFin": self._opt = yellowfin.YellowFinOptimizer( learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) elif optimizer_name == "TrueAdam": self._opt = tf.train.AdamOptimizer( lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "AdamW": # Openai gpt used weight decay. # Given the internals of AdamW, weight decay dependent on the # learning rate is chosen to match the openai implementation. # The weight decay update to each parameter is applied before the adam # gradients computation, which is different from that described # in the paper and in the openai implementation: # https://arxiv.org/pdf/1711.05101.pdf self._opt = tf.contrib.opt.AdamWOptimizer( 0.01*lr, lr, beta1=hparams.optimizer_adam_beta1, beta2=hparams.optimizer_adam_beta2, epsilon=hparams.optimizer_adam_epsilon) elif optimizer_name == "Adafactor": self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr) else: self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
def dataset(self, mode, data_dir=None, num_threads=None, output_buffer_size=None, shuffle_files=None, hparams=None, preprocess=True, dataset_split=None, shard=None, partition_id=0, num_partitions=1, shuffle_buffer_size=1024, max_records=-1): """Build a Dataset for this problem. Args: mode: tf.estimator.ModeKeys; determines which files to read from. data_dir: directory that contains data files. num_threads: int, number of threads to use for decode and preprocess Dataset.map calls. output_buffer_size: int, how many elements to prefetch at end of pipeline. shuffle_files: whether to shuffle input files. Default behavior (i.e. when shuffle_files=None) is to shuffle if mode == TRAIN. hparams: HParams; hparams to be passed to Problem.preprocess_example and Problem.hparams. If None, will use a default set that is a no-op. preprocess: bool, whether to map the Dataset through Problem.preprocess_example. dataset_split: DatasetSplit, which split to read data from (TRAIN:"-train", EVAL:"-dev", "test":"-test"). Defaults to mode. shard: int, if provided, will only read data from the specified shard. partition_id: integer - which partition of the dataset to read from num_partitions: how many partitions in the dataset shuffle_buffer_size: if shuffle_files is True, this is the buffer size used to shuffle records. max_records: int, number of records to truncate to. Returns: Dataset containing dict<feature name, Tensor>. Raises: ValueError: if num_partitions is greater than the number of data files. """ is_training = mode == tf.estimator.ModeKeys.TRAIN shuffle_files = shuffle_files or shuffle_files is None and is_training dataset_split = dataset_split or mode assert data_dir if hparams is None: hparams = default_model_hparams() if not hasattr(hparams, "data_dir"): hparams.add_hparam("data_dir", data_dir) if not hparams.data_dir: hparams.data_dir = data_dir # Construct the Problem's hparams so that items within it are accessible _ = self.get_hparams(hparams) data_filepattern = self.filepattern(data_dir, dataset_split, shard=shard) tf.logging.info("Reading data files from %s", data_filepattern) data_files = sorted(slim.parallel_reader.get_data_files( data_filepattern)) # Functions used in dataset transforms below. `filenames` can be either a # `tf.string` tensor or `tf.data.Dataset` containing one or more filenames. def _load_records_and_preprocess(filenames): """Reads files from a string tensor or a dataset of filenames.""" # Load records from file(s) with an 8MiB read buffer. dataset = tf.data.TFRecordDataset(filenames, buffer_size=8 * 1024 * 1024) # Decode. dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads) # Preprocess if requested. # Note that preprocessing should happen per-file as order may matter. if preprocess: dataset = self.preprocess(dataset, mode, hparams, interleave=shuffle_files) return dataset if len(data_files) < num_partitions: raise ValueError( "number of data files (%d) must be at least the number of hosts (%d)" % (len(data_files), num_partitions)) data_files = [f for (i, f) in enumerate(data_files) if i % num_partitions == partition_id] tf.logging.info( "partition: %d num_data_files: %d" % (partition_id, len(data_files))) if shuffle_files: mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER) random.shuffle(data_files) dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files)) # Create data-set from files by parsing, pre-processing and interleaving. if shuffle_files: dataset = dataset.apply( tf.data.experimental.parallel_interleave( _load_records_and_preprocess, sloppy=True, cycle_length=8)) else: dataset = _load_records_and_preprocess(dataset) dataset = dataset.map( self.maybe_reverse_and_copy, num_parallel_calls=num_threads) dataset = dataset.take(max_records) ## Shuffle records only for training examples. if shuffle_files and is_training: dataset = dataset.shuffle(shuffle_buffer_size) if hparams.get("pack_dataset", False): dataset = generator_utils.pack_dataset( dataset, hparams.max_length, keys=["inputs", "targets"], use_custom_ops=hparams.get("use_custom_ops", False)) if output_buffer_size: dataset = dataset.prefetch(output_buffer_size) return dataset
def moe_transformer_decoder(decoder_input, encoder_output, decoder_self_attention_bias, encoder_decoder_attention_bias, hparams, cache=None, decode_loop_step=None, name="decoder", save_weights_to=None, make_image_summary=True, layer_collection=None, recurrent_memory_by_layer=None, chunk_number=None): """A stack of transformer layers. Args: decoder_input: a Tensor encoder_output: a Tensor decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention (see common_attention.attention_bias()) hparams: hyperparameters for model cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. decode_loop_step: An integer, step number of the decoding loop. Only used for inference on TPU. name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This is used to mask out padding in convolutional layers. We generally only need this mask for "packed" datasets, because for ordinary datasets, no padding is ever followed by nonpadding. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: optional list onto which to append extra training losses layer_collection: A tensorflow_kfac.LayerCollection. Only used by the KFAC optimizer. Default is None. recurrent_memory_by_layer: Optional dict, mapping layer names to instances of transformer_memory.RecurrentMemory. Default is None. chunk_number: an optional integer Tensor with shape [batch] used to operate the recurrent_memory. Returns: y: a Tensors """ x = decoder_input mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=hparams.num_decoder_layers or hparams.num_hidden_layers) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT, value=hparams.attention_dropout) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_ATTENTION_DENSE, value={ "use_bias": "false", "num_heads": hparams.num_heads, "hidden_size": hparams.hidden_size }) with tf.variable_scope(name): for layer_idx in range(hparams.num_decoder_layers or hparams.num_hidden_layers): x = moe_transformer_decoder_layer( x, decoder_self_attention_bias, layer_idx, hparams, encoder_decoder_attention_bias=encoder_decoder_attention_bias, encoder_output=encoder_output, cache=cache, decode_loop_step=decode_loop_step, save_weights_to=save_weights_to, make_image_summary=make_image_summary, layer_collection=layer_collection, recurrent_memory_by_layer=recurrent_memory_by_layer, chunk_number=chunk_number ) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NORM, value={"hidden_size": hparams.hidden_size}) return common_layers.layer_preprocess( x, hparams)
def mel_perf_transformer_encode(encoder_function, perf_inputs, mel_inputs, target_space, hparams, attention_weights=None, features=None, losses=None, prepare_encoder_fn=None, **kwargs): """Encode transformer inputs. Used for melody & performance autoencoder. Performance is mean-aggregated across time and combined with melody in a variety of different ways. Args: encoder_function: the encoder function perf_inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim] which will be flattened along the two spatial dimensions. mel_inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim] which will be flattened along the two spatial dimensions. target_space: scalar, target space ID. hparams: hyperparameters for model. attention_weights: weight to store attention to. features: optionally pass the entire features dictionary as well. This is needed now for "packed" datasets. losses: optional list onto which to append extra training losses prepare_encoder_fn: optional, alternative to transformer_prepare_encoder. **kwargs: additional arguments to pass to encoder_function Returns: Tuple of: encoder_output: Encoder representation. [batch_size, input_length, hidden_dim] encoder_decoder_attention_bias: Bias and mask weights for encoder-decoder attention. [batch_size, input_length] """ perf_inputs = common_layers.flatten4d3d(perf_inputs) mel_inputs = common_layers.flatten4d3d(mel_inputs) if not prepare_encoder_fn: prepare_encoder_fn = transformer_prepare_encoder perf_encoder_input, perf_self_attention_bias, perf_encdec_attention_bias = ( prepare_encoder_fn(perf_inputs, target_space, hparams, features=features, reuse_target_embedding=tf.AUTO_REUSE)) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT, value=hparams.layer_prepostprocess_dropout, hparams=hparams) perf_encoder_input = tf.nn.dropout( perf_encoder_input, 1.0 - hparams.layer_prepostprocess_dropout) perf_attn_bias_for_padding = None # Otherwise the encoder will just use encoder_self_attention_bias. if hparams.unidirectional_encoder: perf_attn_bias_for_padding = perf_encdec_attention_bias # do the same thing for melody mel_encoder_input, mel_self_attention_bias, mel_encdec_attention_bias = ( prepare_encoder_fn(mel_inputs, target_space, hparams, features=features, reuse_target_embedding=tf.AUTO_REUSE)) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT, value=hparams.layer_prepostprocess_dropout, hparams=hparams) mel_encoder_input = tf.nn.dropout( mel_encoder_input, 1.0 - hparams.layer_prepostprocess_dropout) mel_attn_bias_for_padding = None # Otherwise the encoder will just use encoder_self_attention_bias. if hparams.unidirectional_encoder: mel_attn_bias_for_padding = mel_encdec_attention_bias # use the proper encoder function for perf/melody perf_encoder_output = encoder_function( perf_encoder_input, perf_self_attention_bias, hparams, name="perf_encoder", nonpadding=features_to_nonpadding(features, "inputs"), save_weights_to=attention_weights, make_image_summary=not common_layers.is_xla_compiled(), losses=losses, attn_bias_for_padding=perf_attn_bias_for_padding, **kwargs) # same thing for melody mel_encoder_output = encoder_function( mel_encoder_input, mel_self_attention_bias, hparams, name="mel_encoder", nonpadding=features_to_nonpadding(features, "inputs"), save_weights_to=attention_weights, make_image_summary=not common_layers.is_xla_compiled(), losses=losses, attn_bias_for_padding=mel_attn_bias_for_padding, **kwargs) # concatenate the global mean vector/bias term with the full melody encoding perf_mean_vector = tf.math.reduce_mean(perf_encoder_output, axis=1, keep_dims=True) # different methods of aggregating over the performance + melody vectors! if hparams.aggregation == "sum": # add both mean performance and melody vectors together perf_mean_bias = tf.math.reduce_mean(perf_encdec_attention_bias, axis=-1, keep_dims=True) encoder_output = mel_encoder_output + perf_mean_vector encoder_decoder_attention_bias = mel_encdec_attention_bias + perf_mean_bias elif hparams.aggregation == "concat": # concatenate melody with mean-aggregated performance embedding stop_token = tf.zeros((1, 1, 384)) encoder_output = tf.concat( [mel_encoder_output, stop_token, perf_mean_vector], axis=1) perf_mean_bias = tf.math.reduce_mean(perf_encdec_attention_bias, axis=-1, keep_dims=True) stop_bias = tf.zeros((1, 1, 1, 1)) encoder_decoder_attention_bias = tf.concat( [mel_encdec_attention_bias, stop_bias, perf_mean_bias], axis=-1) elif hparams.aggregation == "tile": # tile performance embedding across each dimension of melody embedding! dynamic_val = tf.shape(mel_encoder_output)[1] shp = tf.convert_to_tensor([1, dynamic_val, 1], dtype=tf.int32) tiled_mean = tf.tile(perf_mean_vector, shp) encoder_output = tf.concat([mel_encoder_output, tiled_mean], axis=-1) encoder_decoder_attention_bias = mel_encdec_attention_bias else: NotImplementedError( "aggregation method must be in [sum, concat, tile].") return encoder_output, encoder_decoder_attention_bias
def generate_files(generator, output_filenames, max_cases=None, cycle_every_n=1): """Generate cases from a generator and save as TFRecord files. Generated cases are transformed to tf.Example protos and saved as TFRecords in sharded files named output_dir/output_name-00..N-of-00..M=num_shards. Args: generator: a generator yielding (string -> int/float/str list) dictionaries. output_filenames: List of output file paths. max_cases: maximum number of cases to get from the generator; if None (default), we use the generator until StopIteration is raised. cycle_every_n: how many cases from the generator to take before switching to the next shard; by default set to 1, switch every case. """ if outputs_exist(output_filenames): tf.logging.info( "Skipping generator because outputs files exists at {}".format( output_filenames)) return tmp_filenames = [fname + ".incomplete" for fname in output_filenames] num_shards = len(output_filenames) # Check if is training or eval, ref: train_data_filenames(). if num_shards > 0: if "-train" in output_filenames[0]: tag = "train" elif "-dev" in output_filenames[0]: tag = "eval" else: tag = "other" writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filenames] counter, shard = 0, 0 for case in generator: if case is None: continue if counter % 100000 == 0: tf.logging.info("Generating case %d." % counter) counter += 1 if max_cases and counter > max_cases: break example = to_example(case) writers[shard].write(example.SerializeToString()) if counter % cycle_every_n == 0: shard = (shard + 1) % num_shards for writer in writers: writer.close() for tmp_name, final_name in zip(tmp_filenames, output_filenames): tf.gfile.Rename(tmp_name, final_name) if num_shards > 0: if tag == "train": mlperf_log.transformer_print( key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES, value=counter) elif tag == "eval": mlperf_log.transformer_print( key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=counter) tf.logging.info("Generated %s Examples", counter)
def perf_transformer_encode(encoder_function, inputs, target_space, hparams, baseline, attention_weights=None, features=None, losses=None, prepare_encoder_fn=None, **kwargs): """Encoding for performance autoencoder, which mean-aggregates across time. Args: encoder_function: the encoder function inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim] which will be flattened along the two spatial dimensions. target_space: scalar, target space ID. hparams: hyperparameters for model. baseline: if True, does not mean-aggregate the encoder output. attention_weights: weight to store attention to. features: optionally pass the entire features dictionary as well. This is needed now for "packed" datasets. losses: optional list onto which to append extra training losses prepare_encoder_fn: optional, alternative to transformer_prepare_encoder. **kwargs: additional arguments to pass to encoder_function Returns: Tuple of: encoder_output: Encoder representation. [batch_size, input_length, hidden_dim] encoder_decoder_attention_bias: Bias and mask weights for encoder-decoder attention. [batch_size, input_length] """ inputs = common_layers.flatten4d3d(inputs) if not prepare_encoder_fn: prepare_encoder_fn = transformer_prepare_encoder encoder_input, self_attention_bias, encoder_decoder_attention_bias = ( prepare_encoder_fn(inputs, target_space, hparams, features=features, reuse_target_embedding=tf.AUTO_REUSE)) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT, value=hparams.layer_prepostprocess_dropout, hparams=hparams) encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.layer_prepostprocess_dropout) attn_bias_for_padding = None # Otherwise the encoder will just use encoder_self_attention_bias. if hparams.unidirectional_encoder: attn_bias_for_padding = encoder_decoder_attention_bias encoder_output = encoder_function( encoder_input, self_attention_bias, hparams, name="encoder", nonpadding=features_to_nonpadding(features, "inputs"), save_weights_to=attention_weights, make_image_summary=not common_layers.is_xla_compiled(), losses=losses, attn_bias_for_padding=attn_bias_for_padding, **kwargs) if not baseline: encoder_output = tf.math.reduce_mean(encoder_output, axis=1, keep_dims=True) encoder_decoder_attention_bias = tf.math.reduce_mean( encoder_decoder_attention_bias, axis=-1, keep_dims=True) return encoder_output, encoder_decoder_attention_bias
def input_fn(dataset, filepattern, skip_random_fraction_when_training, batch_size_means_tokens_param, batch_size_multiplier, max_length, mode, hparams, data_dir=None, params=None, config=None, force_repeat=False, prevent_repeat=False): """Builds input pipeline for problem. Args: dataset: the dataset to make input function from. filepattern: the pattern of files to read from. skip_random_fraction_when_training: whether to skip randomly when training. batch_size_means_tokens_param: whether batch size should mean tokens. batch_size_multiplier: how to multiply batch size when bucketing. max_length: maximum length, mode: tf.estimator.ModeKeys hparams: HParams, model hparams data_dir: str, data directory; if None, will use hparams.data_dir params: dict, may include "batch_size" config: RunConfig; should have the data_parallelism attribute if not using TPU force_repeat: bool, whether to repeat the data even if not training prevent_repeat: bool, whether to not repeat when in training mode. Overrides force_repeat. Returns: (features_dict<str name, Tensor feature>, Tensor targets) """ is_training = mode == tf.estimator.ModeKeys.TRAIN if config and config.use_tpu: num_threads = 64 else: num_threads = cpu_count() if is_training else 1 if config and hasattr(config, "data_parallelism") and config.data_parallelism: num_shards = config.data_parallelism.n else: num_shards = 1 mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH, value=max_length) def tpu_valid_size(example): return example_valid_size(example, hparams.min_length, max_length) def gpu_valid_size(example): drop_long_sequences = is_training or hparams.eval_drop_long_sequences max_validate_length = max_length if drop_long_sequences else 10**9 return example_valid_size(example, hparams.min_length, max_validate_length) def define_shapes(example): batch_size = config and config.use_tpu and params["batch_size"] return standardize_shapes(example, batch_size=batch_size) # Read and preprocess data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir) if (force_repeat or is_training) and not prevent_repeat: # Repeat and skip a random number of records dataset = dataset.repeat() if is_training and skip_random_fraction_when_training: data_files = contrib.slim().parallel_reader.get_data_files(filepattern) # In continuous_train_and_eval when switching between train and # eval, this input_fn method gets called multiple times and it # would give you the exact same samples from the last call # (because the Graph seed is set). So this skip gives you some # shuffling. dataset = skip_random_fraction(dataset, data_files[0]) dataset = dataset.map(cast_ints_to_int32, num_parallel_calls=num_threads) if batch_size_means_tokens_param: batch_size_means_tokens = True else: if _are_shapes_fully_defined(dataset.output_shapes): batch_size_means_tokens = False else: tf.logging.warning( "Shapes are not fully defined. Assuming batch_size means tokens." ) batch_size_means_tokens = True # Batching if not batch_size_means_tokens: # Batch size means examples per datashard. if config and config.use_tpu: # on TPU, we use params["batch_size"], which specifies the number of # examples across all datashards batch_size = params["batch_size"] dataset = dataset.batch(batch_size, drop_remainder=True) else: batch_size = hparams.batch_size * num_shards dataset = dataset.batch(batch_size) else: # batch_size means tokens per datashard if config and config.use_tpu: dataset = dataset.filter(tpu_valid_size) padded_shapes = pad_for_tpu(dataset.output_shapes, hparams, max_length) # on TPU, we use params["batch_size"], which specifies the number of # examples across all datashards batch_size = params["batch_size"] if hparams.pad_batch: tf.logging.warn( "Padding the batch to ensure that remainder eval batches are " "processed. This may lead to incorrect metrics for " "non-zero-padded features, e.g. images. Use a smaller batch " "size that has no remainder in that case.") dataset = dataset.padded_batch(batch_size, padded_shapes, drop_remainder=False) dataset = dataset.map(functools.partial( pad_batch, batch_multiple=batch_size), num_parallel_calls=num_threads) else: dataset = dataset.padded_batch(batch_size, padded_shapes, drop_remainder=True) else: # On GPU, bucket by length dataset = dataset.filter(gpu_valid_size) cur_batching_scheme = hparams_to_batching_scheme( hparams, shard_multiplier=num_shards, length_multiplier=batch_size_multiplier) if hparams.use_fixed_batch_size: # Here batch_size really means examples per datashard. cur_batching_scheme["batch_sizes"] = [hparams.batch_size] cur_batching_scheme["boundaries"] = [] dataset = dataset.apply( tf.data.experimental.bucket_by_sequence_length( example_length, cur_batching_scheme["boundaries"], cur_batching_scheme["batch_sizes"])) if not is_training: batch_multiple = num_shards if hparams.use_fixed_batch_size: # Make sure the last batch has the same fixed size as the rest. batch_multiple *= hparams.batch_size if batch_multiple > 1: tf.logging.warn( "Padding the batch to ensure that remainder eval batches have " "a batch size divisible by the number of data shards. This may " "lead to incorrect metrics for non-zero-padded features, e.g. " "images. Use a single datashard (i.e. 1 GPU) in that case." ) dataset = dataset.map(functools.partial( pad_batch, batch_multiple=batch_multiple), num_parallel_calls=num_threads) dataset = dataset.map(define_shapes, num_parallel_calls=num_threads) # Add shuffling for training batches. This is necessary along with record # level shuffling in the dataset generation. Record shuffling will shuffle # the examples. However, in some cases, it's possible that the shuffle # buffer size for record shuffling is smaller than the batch size. In such # cases, adding batch shuffling ensures that the data is in random order # during training if (is_training and hasattr(hparams, "batch_shuffle_size") and hparams.batch_shuffle_size): dataset = dataset.shuffle(hparams.batch_shuffle_size) # Split batches into chunks if targets are too long. # The new "chunk_number" feature is 0 for the first chunk and goes up then. # Chunks are reversed so the 0th chunk comes first, then the 1st and so on, # so models can attend to them in the order they arrive. The last chunk is # usually the one containing the end of the target sentence (EOS). chunk_length = hparams.get("split_targets_chunk_length", 0) max_chunks = hparams.get("split_targets_max_chunks", 100) if chunk_length > 0: def is_nonzero_chunk(example): """A chunk is zero if all targets are 0s.""" return tf.less(0, tf.reduce_sum(tf.abs(example["targets"]))) def split_on_length(example): """Split a batch of ditcs on length.""" x = example["targets"] # TODO(kitaev): This code breaks if chunk_length * max_chunks < batch_size length_diff = chunk_length * max_chunks - tf.shape(x)[1] padded_x = tf.pad(x, [(0, 0), (0, length_diff), (0, 0), (0, 0)]) chunks = [ padded_x[:, i * chunk_length:(i + 1) * chunk_length, :, :] for i in range(max_chunks - 1) ] chunks.append(padded_x[:, (max_chunks - 1) * chunk_length:, :, :]) new_example = {} # Setting chunk_number to be tf.range(max_chunks) is incompatible with TPU new_example["chunk_number"] = tf.concat([ tf.expand_dims(tf.ones_like(c) * n, axis=0) for n, c in enumerate(chunks) ], axis=0) new_example["targets"] = tf.concat( [tf.expand_dims(c, axis=0) for c in chunks], axis=0) for k in example: if k != "targets": assert k != "chunk_number", ( "Chunking code expects the chunk_number feature name to be " "available") new_example[k] = tf.concat([ tf.expand_dims(example[k], axis=0) for _ in range(max_chunks) ], axis=0) return tf.data.Dataset.from_tensor_slices(new_example) dataset = dataset.flat_map(split_on_length) dataset = dataset.filter(is_nonzero_chunk) # The chunking data pipeline thus far creates batches of examples where all # of the examples have the same chunk number. This can lead to periodic # fluctuations in the loss; for example, when all examples in the batch have # chunk number 0 the loss may be higher than midway through a sequence. # Enabling split_targets_strided_training adjusts the data so that each # batch includes examples at various points within a sequence. if is_training and hparams.split_targets_strided_training: # TODO(kitaev): make sure that shape inference works on GPU, not just TPU. inferred_batch_size = dataset.output_shapes["targets"].as_list()[0] if inferred_batch_size is None: raise ValueError( "Strided training is only implemented when the batch size can be " "inferred statically, for example when training on TPU.") chunk_stride = inferred_batch_size * max( 1, max_chunks // inferred_batch_size) + 1 def collapse_nested_datasets(example): """Converts a dataset of datasets to a dataset of tensor features.""" new_example = {} for k, v in example.items(): v = tf.data.experimental.get_single_element( v.batch(inferred_batch_size, drop_remainder=True)) new_example[k] = v return tf.data.Dataset.from_tensor_slices(new_example) dataset = dataset.unbatch() dataset = dataset.window(inferred_batch_size, inferred_batch_size, chunk_stride) dataset = dataset.flat_map(collapse_nested_datasets) dataset = dataset.batch(inferred_batch_size, drop_remainder=True) def prepare_for_output(example): if not config or not config.use_tpu: _summarize_features(example, num_shards) if mode == tf.estimator.ModeKeys.PREDICT: example["infer_targets"] = example.pop("targets") return example else: return example, example[hparams.get(key="labels_feature_name", default="targets")] dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads) dataset = dataset.prefetch(2) if mode == tf.estimator.ModeKeys.PREDICT: # This is because of a bug in the Estimator that short-circuits prediction # if it doesn't see a QueueRunner. DummyQueueRunner implements the # minimal expected interface but does nothing. tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, DummyQueueRunner()) return dataset
def decode_once(estimator, problem_name, hparams, infer_input_fn, decode_hp, decode_to_file, output_dir, log_results=True, checkpoint_path=None): """Decodes once. Args: estimator: tf.estimator.Estimator instance. Used to generate encoded predictions. problem_name: str. Name of problem. hparams: tf.HParams instance. HParams for model training. infer_input_fn: zero-arg function. Input function for estimator. decode_hp: tf.HParams instance. See decode_hparams() above. decode_to_file: str. Prefix for filenames. Used to generated filenames to which decoded predictions are written. output_dir: str. Output directory. Only used for writing images. log_results: bool. If False, return encoded predictions without any further processing. checkpoint_path: str. Path to load model checkpoint from. If unspecified, Estimator's default is used. Returns: If decode_hp.decode_in_memory is True: List of dicts, one per example. Values are either numpy arrays or decoded strings. If decode_hp.decode_in_memory is False: An empty list. """ # Get the predictions as an iterable predictions = estimator.predict(infer_input_fn, checkpoint_path=checkpoint_path) if not log_results: return list(predictions) # Prepare output file writers if decode_to_file passed decode_to_file = decode_to_file or decode_hp.decode_to_file if decode_to_file: output_filepath = _decode_filename(decode_to_file, problem_name, decode_hp) parts = output_filepath.split(".") parts[-1] = "targets" target_filepath = ".".join(parts) parts[-1] = "inputs" input_filepath = ".".join(parts) output_file = tf.gfile.Open(output_filepath, "w") target_file = tf.gfile.Open(target_filepath, "w") input_file = tf.gfile.Open(input_filepath, "w") problem_hparams = hparams.problem_hparams # Inputs vocabulary is set to targets if there are no inputs in the problem, # e.g., for language models where the inputs are just a prefix of targets. has_input = "inputs" in problem_hparams.vocabulary inputs_vocab_key = "inputs" if has_input else "targets" inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key] targets_vocab = problem_hparams.vocabulary["targets"] num_eval_samples = 0 # all_outputs[i][j] = (input: str, output: str, target: str). Input, # decoded output, and target strings for example i, beam rank j. all_outputs = [] for num_predictions, prediction in enumerate(predictions): num_eval_samples += 1 num_predictions += 1 inputs = prediction.get("inputs") targets = prediction.get("targets") outputs = prediction.get("outputs") # Log predictions decoded_outputs = [] # [(str, str, str)]. See all_outputs above. if decode_hp.decode_in_memory: all_outputs.append(decoded_outputs) decoded_scores = [] if decode_hp.return_beams or decode_hp.multi_targets: output_beams = np.split(outputs, decode_hp.beam_size, axis=0) scores = None if "scores" in prediction: scores = np.split(prediction["scores"], prediction["scores"].shape[0], axis=0) target_i = targets for i, beam in enumerate(output_beams): if decode_hp.multi_targets and decode_hp.return_beams: beam_id = (i % decode_hp.beam_size) target_id = i // (decode_hp.beam_size) target_i = targets[target_id] if not beam_id: tf.logging.info("TARGET %d:" % target_id) tf.logging.info("BEAM %d:" % beam_id) elif decode_hp.multi_targets: target_i = targets[i] tf.logging.info("TARGET %d:" % i) else: tf.logging.info("BEAM %d:" % i) score = scores and scores[i] decoded = log_decode_results( inputs, beam, problem_name, num_predictions, inputs_vocab, targets_vocab, save_images=decode_hp.save_images, output_dir=output_dir, identity_output=decode_hp.identity_output, targets=target_i, log_results=log_results) decoded_outputs.append(decoded) if decode_hp.write_beam_scores: decoded_scores.append(score) else: decoded = log_decode_results( inputs, outputs, problem_name, num_predictions, inputs_vocab, targets_vocab, save_images=decode_hp.save_images, output_dir=output_dir, identity_output=decode_hp.identity_output, targets=targets, log_results=log_results) decoded_outputs.append(decoded) # Write out predictions if decode_to_file passed if decode_to_file: for i, (d_input, d_output, d_target) in enumerate(decoded_outputs): # Skip if all padding if d_input and re.match("^({})+$".format(text_encoder.PAD), d_input): continue beam_score_str = "" if decode_hp.write_beam_scores: beam_score_str = "\t%.2f" % decoded_scores[i] output_file.write(str(d_output) + beam_score_str + decode_hp.delimiter) target_file.write(str(d_target) + decode_hp.delimiter) input_file.write(str(d_input) + decode_hp.delimiter) if (decode_hp.num_samples >= 0 and num_predictions >= decode_hp.num_samples): break mlperf_log.transformer_print(key=mlperf_log.EVAL_SIZE, value=num_eval_samples, hparams=hparams) if decode_to_file: output_file.close() target_file.close() input_file.close() return all_outputs
def create_estimator(model_name, hparams, run_config, schedule="train_and_evaluate", decode_hparams=None, use_tpu=False, use_tpu_estimator=False, use_xla=False, export_saved_model_api_version=1, use_guarantee_const_getter=False): """Create a T2T Estimator.""" model_fn = t2t_model.T2TModel.make_estimator_model_fn( model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu) del use_xla if use_tpu or use_tpu_estimator: from tensorflow.contrib.tpu.python.tpu import tpu_estimator # pylint: disable=g-import-not-at-top problem = hparams.problem batch_size = (problem.tpu_batch_size_per_shard(hparams) * run_config.tpu_config.num_shards) mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE, value=batch_size) if getattr(hparams, "mtf_mode", False): batch_size = problem.tpu_batch_size_per_shard(hparams) predict_batch_size = batch_size if decode_hparams and decode_hparams.batch_size: predict_batch_size = decode_hparams.batch_size if decode_hparams and run_config.tpu_config: decode_hparams.add_hparam( "iterations_per_loop", run_config.tpu_config.iterations_per_loop) if export_saved_model_api_version == 1: api_version_enum_name = tpu_estimator.ExportSavedModelApiVersion.V1 estimator_model_fn = model_fn elif export_saved_model_api_version == 2: api_version_enum_name = tpu_estimator.ExportSavedModelApiVersion.V2 def maybe_use_guarantee_const_getter_model_fn( features, labels, mode, params): """Wrapper model_fn with guarantee_const getter.""" if not use_guarantee_const_getter: return model_fn(features, labels, mode, params) # It marks all weights as constant, which may improves TPU inference # performance because it prevents the weights being transferred to the # TPU. It will increase HBM "program" usage and reduce HBM "arguments" # usage during TPU model serving. def guarantee_const_getter(getter, name, *args, **kwargs): with tf.control_dependencies(None): return tf.guarantee_const( getter(name, *args, **kwargs), name=name + "/GuaranteeConst") @contextlib.contextmanager def guarantee_const_scope(): var_scope = tf.get_variable_scope() prev_custom_getter = var_scope.custom_getter prev_caching_device = var_scope.caching_device var_scope.set_custom_getter(guarantee_const_getter) var_scope.set_caching_device(lambda op: op.device) yield var_scope.set_custom_getter(prev_custom_getter) var_scope.set_caching_device(prev_caching_device) with guarantee_const_scope(): return model_fn(features, labels, mode, params) def tpu_model_fn(features, labels, mode, params): """Wrapper model_fn with tpu.rewrite / TPUPartitionedCall.""" if mode == tf.estimator.ModeKeys.PREDICT and params["use_tpu"]: batch_config = tpu_estimator.BatchConfig( num_batch_threads=2, max_batch_size=predict_batch_size, batch_timeout_micros=60 * 1000, allowed_batch_sizes=[predict_batch_size]) return tpu_estimator.model_fn_inference_on_tpu( maybe_use_guarantee_const_getter_model_fn, features=features, labels=labels, config=None, params=params, batch_config=batch_config) else: return model_fn(features, labels, mode, params) estimator_model_fn = tpu_model_fn else: raise ValueError( "Flag export_saved_model_api_version must be 1 or 2.") estimator = contrib.tpu().TPUEstimator( model_fn=estimator_model_fn, model_dir=run_config.model_dir, config=run_config, use_tpu=use_tpu, train_batch_size=batch_size, eval_batch_size=batch_size if "eval" in schedule else None, predict_batch_size=predict_batch_size, export_saved_model_api_version=api_version_enum_name) else: estimator = tf.estimator.Estimator( model_fn=model_fn, model_dir=run_config.model_dir, config=run_config, ) return estimator
def decode_once(estimator, problem_name, hparams, infer_input_fn, decode_hp, decode_to_file, output_dir, log_results=True, checkpoint_path=None): """Decodes once.""" # Get the predictions as an iterable predictions = estimator.predict(infer_input_fn, checkpoint_path=checkpoint_path) if not log_results: return list(predictions) # Prepare output file writers if decode_to_file passed decode_to_file = decode_to_file or decode_hp.decode_to_file if decode_to_file: output_filepath = _decode_filename(decode_to_file, problem_name, decode_hp) parts = output_filepath.split(".") parts[-1] = "targets" target_filepath = ".".join(parts) parts[-1] = "inputs" input_filepath = ".".join(parts) output_file = tf.gfile.Open(output_filepath, "w") target_file = tf.gfile.Open(target_filepath, "w") input_file = tf.gfile.Open(input_filepath, "w") problem_hparams = hparams.problem_hparams # Inputs vocabulary is set to targets if there are no inputs in the problem, # e.g., for language models where the inputs are just a prefix of targets. has_input = "inputs" in problem_hparams.vocabulary inputs_vocab_key = "inputs" if has_input else "targets" inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key] targets_vocab = problem_hparams.vocabulary["targets"] num_eval_samples = 0 for num_predictions, prediction in enumerate(predictions): num_eval_samples += 1 num_predictions += 1 inputs = prediction.get("inputs") targets = prediction.get("targets") outputs = prediction.get("outputs") # Log predictions decoded_outputs = [] decoded_scores = [] if decode_hp.return_beams: output_beams = np.split(outputs, decode_hp.beam_size, axis=0) scores = None if "scores" in prediction: scores = np.split(prediction["scores"], decode_hp.beam_size, axis=0) for i, beam in enumerate(output_beams): tf.logging.info("BEAM %d:" % i) score = scores and scores[i] decoded = log_decode_results( inputs, beam, problem_name, num_predictions, inputs_vocab, targets_vocab, save_images=decode_hp.save_images, output_dir=output_dir, identity_output=decode_hp.identity_output, targets=targets, log_results=decode_hp.log_results) decoded_outputs.append(decoded) if decode_hp.write_beam_scores: decoded_scores.append(score) else: decoded = log_decode_results( inputs, outputs, problem_name, num_predictions, inputs_vocab, targets_vocab, save_images=decode_hp.save_images, output_dir=output_dir, identity_output=decode_hp.identity_output, targets=targets, log_results=decode_hp.log_results) decoded_outputs.append(decoded) # Write out predictions if decode_to_file passed if decode_to_file: for i, (d_input, d_output, d_target) in enumerate(decoded_outputs): # Skip if all padding if d_input and re.match("^({})+$".format(text_encoder.PAD), d_input): continue beam_score_str = "" if decode_hp.write_beam_scores: beam_score_str = "\t%.2f" % decoded_scores[i] output_file.write(str(d_output) + beam_score_str + decode_hp.delimiter) target_file.write(str(d_target) + decode_hp.delimiter) input_file.write(str(d_input) + decode_hp.delimiter) if (decode_hp.num_samples >= 0 and num_predictions >= decode_hp.num_samples): break mlperf_log.transformer_print(key=mlperf_log.EVAL_SIZE, value=num_eval_samples, hparams=hparams) if decode_to_file: output_file.close() target_file.close() input_file.close()
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None, make_image_summary=True): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover(efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. Returns: y: a Tensors """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=hparams.num_encoder_layers or hparams.num_hidden_layers) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT, value=hparams.attention_dropout) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE, value={ "use_bias": "false", "num_heads": hparams.num_heads, "hidden_size": hparams.hidden_size }) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover and not common_layers.is_xla_compiled(): pad_remover = expert_utils.PadRemover(padding) for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): initial_sparsity = None if hparams.get("load_masks_from"): initial_sparsity = hparams.get("initial_sparsity") with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = sparse_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d"), sparsity_technique=hparams.get("sparsity_technique"), threshold=hparams.get("log_alpha_threshold"), training=hparams.get( "mode") == tf_estimator.ModeKeys.TRAIN, clip_alpha=hparams.get("clip_log_alpha"), initial_sparsity=initial_sparsity, split_heads=hparams.get("split_heads")) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NORM, value={"hidden_size": hparams.hidden_size}) return common_layers.layer_preprocess(x, hparams)
def moe_transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="moe-encoder", nonpadding=None, save_weights_to=None, make_image_summary=True, attn_bias_for_padding=None): """A stack of transformer moe layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover(efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: optional list onto which to append extra training losses attn_bias_for_padding: Padded attention bias in case a unidirectional encoder is being used where future attention is masked. Returns: y: a Tensors """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=hparams.num_encoder_layers or hparams.num_hidden_layers) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT, value=hparams.attention_dropout) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_ATTENTION_DENSE, value={ "use_bias": "false", "num_heads": hparams.num_heads, "hidden_size": hparams.hidden_size }) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: attention_bias = encoder_self_attention_bias if attn_bias_for_padding is not None: attention_bias = attn_bias_for_padding padding = common_attention.attention_bias_to_padding(attention_bias) nonpadding = 1.0 - padding for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): x = moe_transformer_encoder_layer( layer, x, encoder_self_attention_bias, hparams, attention_dropout_broadcast_dims, save_weights_to, make_image_summary) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NORM, value={"hidden_size": hparams.hidden_size}) return common_layers.layer_preprocess(x, hparams)