def init_from_files(vocab_file, files, target_vocab_size, threshold,
                    min_count=None, file_byte_limit=1e6,
                    reserved_tokens=None):
    """Return a Subtokenizer for `vocab_file`, creating the file if missing.

    If `vocab_file` already exists it is reused as-is. Otherwise token counts
    are gathered from `files`, an alphabet and a subtoken list are generated
    from those counts, the resulting vocabulary size is reported to the
    MLPerf log, and the list is saved to `vocab_file`.

    Args:
      vocab_file: String name of vocab file to store subtoken vocabulary.
      files: List of file paths that will be used to generate vocabulary.
      target_vocab_size: target vocabulary size to generate.
      threshold: int threshold of vocabulary size to accept.
      min_count: int minimum number of times a subtoken should appear in the
        files before it is added to the vocabulary. If None, this value is
        found using binary search.
      file_byte_limit: (Default 1e6) Maximum number of bytes of sample text
        that will be drawn from the files.
      reserved_tokens: List of string tokens that are guaranteed to be at the
        beginning of the subtoken vocabulary list. Defaults to
        RESERVED_TOKENS when None.

    Returns:
      Subtokenizer object
    """
    reserved = RESERVED_TOKENS if reserved_tokens is None else reserved_tokens

    # Nothing to build when the vocabulary is already on disk.
    if tf.gfile.Exists(vocab_file):
        tf.logging.info("Vocab file already exists (%s)" % vocab_file)
        return Subtokenizer(vocab_file)

    tf.logging.info("Begin steps to create subtoken vocabulary...")
    counts = _count_tokens(files, file_byte_limit)
    alphabet = _generate_alphabet_dict(counts)
    subtokens = _generate_subtokens_with_target_vocab_size(
        counts, alphabet, target_vocab_size, threshold, min_count, reserved)

    vocab_size = len(subtokens)
    tf.logging.info("Generated vocabulary with %d subtokens." % vocab_size)
    mlperf_log.transformer_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
                                 value=vocab_size)
    _save_vocab_file(vocab_file, subtokens)
    return Subtokenizer(vocab_file)
def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.compat.v1.name_scope("decode"):
        # Prepare inputs to decoder layers by shifting targets, adding
        # positional encoding and applying dropout.
        decoder_inputs = self.embedding_softmax_layer(targets)

        with tf.compat.v1.name_scope("shift_targets"):
            # Shift targets to the right, and remove the last element.
            decoder_inputs = tf.pad(
                tensor=decoder_inputs,
                paddings=[[0, 0], [1, 0], [0, 0]])[:, :-1, :]

        with tf.compat.v1.name_scope("add_pos_encoding"):
            length = tf.shape(input=decoder_inputs)[1]
            poscod = model_utils.get_position_encoding(
                length, self.params.hidden_size)
            decoder_inputs += poscod

        if self.train:
            mlperf_log.transformer_print(
                key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
                value=self.params.layer_postprocess_dropout)
            # Pass the drop probability by keyword. The previous positional
            # `1 - (1 - p)` is read as `keep_prob` by the TF1-signature
            # tf.nn.dropout, which would keep only a fraction p of the units.
            decoder_inputs = tf.nn.dropout(
                decoder_inputs,
                rate=self.params.layer_postprocess_dropout)

        # Run the decoder stack with a self-attention bias built from the
        # target length, then project the result to logits.
        decoder_self_attention_bias = (
            model_utils.get_decoder_self_attention_bias(length))
        outputs = self.decoder_stack(
            decoder_inputs, encoder_outputs, decoder_self_attention_bias,
            attention_bias)
        logits = self.embedding_softmax_layer.linear(outputs)
        return logits
def call(self, x, padding=None):
    """Apply the two dense layers to `x`, optionally skipping padded positions.

    Args:
      x: input tensor. Its first two dimensions are read as batch size and
        length; when `padding` is given it is flattened to
        [-1, self.hidden_size] (assumes the last dimension is hidden_size --
        TODO confirm against callers).
      padding: optional tensor that is flattened and compared against 1e-9;
        positions with a value below that are treated as non-padding. When
        None, no padding removal is performed.

    Returns:
      Output of the second dense layer. When `padding` is given, the result
      is scattered back and reshaped to [batch_size, length, hidden_size].
    """
    with tf.compat.v1.tpu.bfloat16_scope():
        # Retrieve dynamically known shapes.
        batch_size = tf.shape(input=x)[0]
        length = tf.shape(input=x)[1]

        if padding is not None:
            with tf.compat.v1.name_scope("remove_padding"):
                # Flatten padding to [batch_size*length].
                pad_mask = tf.reshape(padding, [-1])
                nonpad_ids = tf.cast(
                    tf.compat.v1.where(pad_mask < 1e-9), dtype=tf.int32)

                # Reshape x to [batch_size*length, hidden_size] and keep only
                # the non-padded rows.
                x = tf.reshape(x, [-1, self.hidden_size])
                x = tf.gather_nd(x, indices=nonpad_ids)

                # Restore the static hidden dimension and add a leading
                # axis so x is 3-dimensional again.
                x.set_shape([None, self.hidden_size])
                x = tf.expand_dims(x, axis=0)

        output = self.filter_dense_layer(x)
        if self.train:
            mlperf_log.transformer_print(
                key=mlperf_log.MODEL_HP_RELU_DROPOUT,
                value=self.relu_dropout)
            # Pass the drop probability by keyword. The previous positional
            # `1 - (1.0 - p)` is read as `keep_prob` by the TF1-signature
            # tf.nn.dropout.
            output = tf.nn.dropout(output, rate=self.relu_dropout)
        output = self.output_dense_layer(output)

        if padding is not None:
            with tf.compat.v1.name_scope("re_add_padding"):
                # Put the computed rows back at their original flat indices
                # (other rows are left at zero by scatter_nd).
                output = tf.squeeze(output, axis=0)
                output = tf.scatter_nd(
                    indices=nonpad_ids,
                    updates=output,
                    shape=[batch_size * length, self.hidden_size])
                output = tf.reshape(
                    output, [batch_size, length, self.hidden_size])
        return output
def get_train_op(loss, params):
    """Build the op that applies LazyAdam updates for `loss`.

    Besides constructing the optimizer and its update op, this logs the
    warmup steps, optimizer name and Adam hyper-parameters to the MLPerf
    log, wraps the learning rate in a deferred MLPerf log op, and records
    the global gradient norm as a summary scalar.

    Args:
      loss: tensor to differentiate with respect to the trainable variables.
      params: object providing learning_rate, hidden_size,
        learning_rate_warmup_steps and the optimizer_adam_* attributes.

    Returns:
      The op returned by the optimizer's apply_gradients (named "train").
    """
    with tf.variable_scope("get_train_op"):
        warmup_steps = params.learning_rate_warmup_steps
        mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                     value=warmup_steps)
        lr = get_learning_rate(
            params.learning_rate, params.hidden_size, warmup_steps)

        # NOTE(review): OPT_LR goes through resnet_print while every other
        # tag in this function uses transformer_print -- confirm intended.
        deferred_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR,
                                              deferred=True)
        lr = tf_mlperf_log.log_deferred(op=lr, log_id=deferred_id,
                                        every_n=100)

        # Report the optimizer configuration before building it.
        beta1 = params.optimizer_adam_beta1
        beta2 = params.optimizer_adam_beta2
        epsilon = params.optimizer_adam_epsilon
        mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                     value=mlperf_log.LAZY_ADAM)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                     value=beta1)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                     value=beta2)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                     value=epsilon)

        # LazyAdamOptimizer comes from TF contrib.
        optimizer = tf.contrib.opt.LazyAdamOptimizer(
            lr, beta1=beta1, beta2=beta2, epsilon=epsilon)

        # Compute gradients for all trainable variables and apply them,
        # advancing the global step.
        grads_and_vars = optimizer.compute_gradients(
            loss, tf.trainable_variables(), colocate_gradients_with_ops=True)
        train_op = optimizer.apply_gradients(
            grads_and_vars,
            global_step=tf.train.get_global_step(),
            name="train")

        # Save gradient norm to Tensorboard.
        grads_only = list(zip(*grads_and_vars))[0]
        tf.summary.scalar("global_norm/gradient_norm",
                          tf.global_norm(grads_only))

        return train_op
def __init__(self, params, train):
    """Create the encoder sublayers and the final layer normalization.

    For each of `params.num_hidden_layers` layers, a self-attention sublayer
    and a feed-forward sublayer are created and each is wrapped in a
    PrePostProcessingWrapper. The layer count is reported to the MLPerf log.

    Args:
      params: object providing num_hidden_layers, hidden_size, num_heads,
        attention_dropout, filter_size and relu_dropout.
      train: flag forwarded unchanged to every sublayer and wrapper.
    """
    super(EncoderStack, self).__init__()
    self.layers = []

    num_layers = params.num_hidden_layers
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=num_layers)

    for _ in range(num_layers):
        # Build both sublayers first, then wrap them in the same order.
        attention = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads,
            params.attention_dropout, train)
        ffn = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size,
            params.relu_dropout, train)
        self.layers.append([
            PrePostProcessingWrapper(sublayer, params, train)
            for sublayer in (attention, ffn)
        ])

    # Final layer normalization uses the Keras implementation with the
    # externally defined `policy` as dtype.
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=0.000001, dtype=policy)
def call(self, x, padding=None):
    """Apply the two dense layers to `x` after flattening it to 2-D.

    Args:
      x: input tensor. Its first two dimensions are read as batch size and
        length, and it is reshaped to [-1, self.hidden_size] (assumes the
        last dimension is hidden_size -- TODO confirm against callers).
      padding: accepted but not used by this implementation.

    Returns:
      Tensor reshaped to [batch_size, length, self.hidden_size].
    """
    # Retrieve dynamically known shapes.
    batch_size = tf.shape(input=x)[0]
    length = tf.shape(input=x)[1]

    with tf.compat.v1.tpu.bfloat16_scope():
        # Reshape to a 2-D tensor.
        x = tf.reshape(x, [-1, self.hidden_size])
        output = self.filter_dense_layer(x)

        if self.train:
            mlperf_log.transformer_print(
                key=mlperf_log.MODEL_HP_RELU_DROPOUT,
                value=self.relu_dropout)
            # Pass the drop probability by keyword. The previous positional
            # `1 - (1.0 - p)` is read as `keep_prob` by the TF1-signature
            # tf.nn.dropout.
            output = tf.nn.dropout(output, rate=self.relu_dropout)

        output = self.output_dense_layer(output)

        # Reshape back to a 3-D tensor.
        output = tf.reshape(output, [batch_size, length, self.hidden_size])
    return output
def __call__(self, inputs, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape
      [batch_size, target_length, vocab_size]
      If target is none, then generate output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
    gain = self.params.initializer_gain
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_INITIALIZER_GAIN,
                                 value=gain)
    # Variance scaling is the default initializer for every variable created
    # inside the "Transformer" scope.
    initializer = tf.compat.v1.variance_scaling_initializer(
        gain, mode="fan_avg", distribution="uniform")

    with tf.compat.v1.variable_scope("Transformer", initializer=initializer):
        # The padding bias is shared by the encoder self-attention and the
        # decoder's attention over encoder outputs.
        attention_bias = model_utils.get_padding_bias(inputs)

        # Map the input symbols to continuous representations.
        encoder_outputs = self.encode(inputs, attention_bias)

        # Without targets there is nothing to score: generate a sequence.
        if targets is None:
            return self.predict(encoder_outputs, attention_bias)

        return self.decode(targets, encoder_outputs, attention_bias)
def __init__(self, args, optimizer):
    """Set up linear warmup followed by inverse-square-root decay.

    Reads `args.lr`, `args.warmup_init_lr` and `args.warmup_updates`,
    reports the warmup length to the MLPerf log, and sets the optimizer's
    learning rate to the initial warmup value.

    Note that `args.warmup_init_lr` is overwritten in place with the target
    learning rate when it is negative.

    Args:
      args: namespace with `lr` (a sequence), `warmup_init_lr` and
        `warmup_updates`.
      optimizer: forwarded to the parent constructor; its `set_lr` is
        called through `self.optimizer`.

    Raises:
      ValueError: if more than one learning rate is given in `args.lr`.
    """
    super().__init__(args, optimizer)

    if len(args.lr) > 1:
        raise ValueError(
            'Cannot use a fixed learning rate schedule with inverse_sqrt.'
            ' Consider --lr-scheduler=fixed instead.'
        )

    num_warmup = args.warmup_updates
    mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                 value=num_warmup)

    peak_lr = args.lr[0]
    if args.warmup_init_lr < 0:
        # A negative initial value means "start at the target rate".
        args.warmup_init_lr = peak_lr
    start_lr = args.warmup_init_lr

    # Per-update increment during the linear warmup phase.
    self.lr_step = (peak_lr - start_lr) / num_warmup

    # Factor used afterwards to decay proportionally to the inverse square
    # root of the update number.
    self.decay_factor = peak_lr * num_warmup**0.5

    # Initial learning rate.
    self.lr = start_lr
    self.optimizer.set_lr(self.lr)
def sequence_beam_search(symbols_to_logits_fn, initial_ids, initial_cache,
                         vocab_size, beam_size, alpha, max_decode_length,
                         eos_id):
    """Search for sequence of subtoken ids with the largest probability.

    The search settings are reported to the MLPerf log before a
    SequenceBeamSearch object is built and run.

    Args:
      symbols_to_logits_fn: A function that takes in ids, index, and cache as
        arguments. The passed in arguments will have shape:
          ids -> [batch_size * beam_size, index]
          index -> [] (scalar)
          cache -> nested dictionary of tensors [batch_size * beam_size, ...]
        The function must return logits and new cache.
          logits -> [batch * beam_size, vocab_size]
          new cache -> same shape/structure as inputted cache
      initial_ids: Starting ids for each batch item.
        int32 tensor with shape [batch_size]
      initial_cache: dict containing starting decoder variables information
      vocab_size: int size of tokens
      beam_size: int number of beams
      alpha: float defining the strength of length normalization
      max_decode_length: maximum length to decoded sequence
      eos_id: int id of eos token, used to determine when a sequence has
        finished

    Returns:
      Top decoded sequences [batch_size, beam_size, max_decode_length]
      sequence scores [batch_size, beam_size]
    """
    # The batch size is taken from the leading dimension of initial_ids.
    batch_size = tf.shape(initial_ids)[0]

    search_settings = {
        "vocab_size": vocab_size,
        "batch_size": batch_size,
        "beam_size": beam_size,
        "alpha": alpha,
        "max_decode_length": max_decode_length,
    }
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_SEQ_BEAM_SEARCH,
                                 value=search_settings)

    searcher = SequenceBeamSearch(
        symbols_to_logits_fn, vocab_size, batch_size, beam_size, alpha,
        max_decode_length, eos_id)
    return searcher.search(initial_ids, initial_cache)
def __init__(self, params, train):
    """Create the encoder sublayers and the final layer normalization.

    Each of the `params.num_hidden_layers` layers consists of a
    self-attention sublayer and a feed-forward sublayer, both wrapped in a
    PrePostProcessingWrapper. The layer count is reported to the MLPerf log.

    Args:
      params: object providing num_hidden_layers, hidden_size, num_heads,
        attention_dropout, filter_size and relu_dropout.
      train: flag forwarded unchanged to every sublayer and wrapper.
    """
    super(EncoderStack, self).__init__()
    self.layers = []

    num_layers = params.num_hidden_layers
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=num_layers)

    for _ in range(num_layers):
        # Sublayer 1: self-attention (attention_layer.SelfAttention).
        attention = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads,
            params.attention_dropout, train)
        # Sublayer 2: feed-forward network (ffn_layer.FeedFowardNetwork).
        ffn = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size,
            params.relu_dropout, train)
        self.layers.append([
            PrePostProcessingWrapper(sublayer, params, train)
            for sublayer in (attention, ffn)
        ])

    # Final layer normalization applied to the stack output.
    self.output_normalization = LayerNormalization(params.hidden_size)
def __init__(self, params, train):
    """Create the decoder sublayers and the final layer normalization.

    Each of the `params.num_hidden_layers` layers consists of a
    self-attention sublayer, an attention sublayer (named
    enc_dec_attention in the original code) and a feed-forward sublayer,
    each wrapped in a PrePostProcessingWrapper. The layer count is reported
    to the MLPerf log.

    Args:
      params: object providing num_hidden_layers, hidden_size, num_heads,
        attention_dropout, filter_size and relu_dropout.
      train: flag forwarded unchanged to every sublayer and wrapper.
    """
    super(DecoderStack, self).__init__()
    self.layers = []

    num_layers = params.num_hidden_layers
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=num_layers)

    for _ in range(num_layers):
        # Sublayer 1: self-attention.
        self_attention = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads,
            params.attention_dropout, train)
        # Sublayer 2: encoder-decoder attention.
        enc_dec_attention = attention_layer.Attention(
            params.hidden_size, params.num_heads,
            params.attention_dropout, train)
        # Sublayer 3: feed-forward network.
        ffn = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size,
            params.relu_dropout, train)
        self.layers.append([
            PrePostProcessingWrapper(sublayer, params, train)
            for sublayer in (self_attention, enc_dec_attention, ffn)
        ])

    # Final layer normalization applied to the stack output.
    self.output_normalization = LayerNormalization(params.hidden_size)
def _read_and_batch_from_files(file_pattern, batch_size, max_length,
                               num_cpu_cores, shuffle, repeat):
    """Create dataset where each item is a dict of "inputs" and "targets".

    Args:
      file_pattern: String used to match the input TFRecord files.
      batch_size: Maximum number of tokens per batch of examples
      max_length: Maximum number of tokens per example
      num_cpu_cores: Number of cpu cores for parallel input processing.
      shuffle: If true, randomizes order of elements.
      repeat: Number of times to repeat the dataset. If None, the dataset is
        repeated forever.

    Returns:
      tf.data.Dataset object containing examples loaded from the files.
    """
    filenames = tf.data.Dataset.list_files(file_pattern)
    if shuffle:
        # Shuffle at the file-name level and record that in the MLPerf log.
        mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
        filenames = filenames.shuffle(buffer_size=_FILE_SHUFFLE_BUFFER)

    # Read files and interleave their records. With sloppy=True (i.e. when
    # shuffling) the order of the examples is non-deterministic.
    # TODO: Look into prefetch_input_elements for performance optimization.
    interleave_records = tf.data.experimental.parallel_interleave(
        _load_records, sloppy=shuffle, cycle_length=num_cpu_cores)
    records = filenames.apply(interleave_records)

    # Parse each tf.Example.
    examples = records.map(_parse_example, num_parallel_calls=num_cpu_cores)

    def _within_max_length(x, y):
        """Keep only examples accepted by _filter_max_length."""
        return _filter_max_length((x, y), max_length)

    examples = examples.filter(_within_max_length)

    # Batch such that each batch has examples of similar length.
    mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE,
                                 value=batch_size)
    mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                 value=max_length)
    batches = _batch_examples(examples, batch_size, max_length)

    # Repeat as requested, then prefetch one element so the next batch is
    # prepared while the current one is consumed.
    return batches.repeat(repeat).prefetch(1)
def main(unused_argv):
    """Obtain training and evaluation data for the Transformer model.

    Runs four steps: fetch the raw files, build (or reuse) the subtoken
    vocabulary, compile the raw files, and encode them into TFRecord files.
    The training TFRecord files are shuffled at the end.

    Args:
      unused_argv: ignored.
    """
    logging = tf.compat.v1.logging
    logging.set_verbosity(logging.INFO)

    for directory in (FLAGS.raw_dir, FLAGS.data_dir):
        make_dir(directory)

    # Get paths of downloaded/extracted training and evaluation files.
    logging.info("Step 1/4: Downloading data from source")
    train_files = get_raw_files(FLAGS.raw_dir, _TRAIN_DATA_SOURCES)
    eval_files = get_raw_files(FLAGS.raw_dir, _EVAL_DATA_SOURCES)

    # Create subtokenizer based on the training files.
    logging.info("Step 2/4: Creating subtokenizer and building vocabulary")
    all_train_files = train_files["inputs"] + train_files["targets"]
    vocab_path = os.path.join(FLAGS.data_dir, _VOCAB_FILE)
    if FLAGS.search:
        # None makes init_from_files search for the minimum count itself.
        min_count = None
    else:
        min_count = _TRAIN_DATA_MIN_COUNT
    subtokenizer = tokenizer.Subtokenizer.init_from_files(
        vocab_path, all_train_files, _TARGET_VOCAB_SIZE, _TARGET_THRESHOLD,
        min_count=min_count)

    logging.info("Step 3/4: Compiling training and evaluation data")
    compiled_train = compile_files(FLAGS.data_dir, train_files, _TRAIN_TAG)
    compiled_eval = compile_files(FLAGS.data_dir, eval_files, _EVAL_TAG)

    # Tokenize and save data as Examples in the TFRecord format.
    logging.info("Step 4/4: Preprocessing and saving data")
    mlperf_log.transformer_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
    train_records = encode_and_save_files(
        subtokenizer, FLAGS.data_dir, compiled_train, _TRAIN_TAG,
        _TRAIN_SHARDS)
    mlperf_log.transformer_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)
    encode_and_save_files(
        subtokenizer, FLAGS.data_dir, compiled_eval, _EVAL_TAG, _EVAL_SHARDS)

    # Only the training shards are shuffled.
    mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
    for record_file in train_records:
        shuffle_records(record_file)
def main(_):
    """Configure and run Transformer training from command-line flags.

    Seeds the Python, TensorFlow and NumPy random generators, selects the
    parameter set, derives the train/eval schedule from the step or epoch
    flags, validates the optional BLEU files, builds the Estimator and hands
    control to train_schedule. Run start/stop markers are written to the
    MLPerf log.

    Raises:
      Exception: if --random_seed is not given.
      ValueError: if --params is neither "base" nor "big", if both
        --train_steps and --train_epochs are set, or if a BLEU file that was
        passed does not exist.
    """
    # Set logging level to INFO to display training progress (logged by the
    # estimator).
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    mlperf_log.transformer_print(key=mlperf_log.RUN_START)

    # Set random seed.
    if FLAGS.random_seed is None:
        raise Exception('No Random seed given')
    print('Setting random seed = ', FLAGS.random_seed)
    seed = FLAGS.random_seed
    mlperf_log.transformer_print(key=mlperf_log.RUN_SET_RANDOM_SEED,
                                 value=seed)
    random.seed(seed)
    tf.compat.v1.set_random_seed(seed)
    numpy.random.seed(seed)

    if FLAGS.params == "base":
        params = model_params.TransformerBaseParams
    elif FLAGS.params == "big":
        params = model_params.TransformerBigParams
    else:
        # The two literals are concatenated, so the separating space has to
        # be part of one of them.
        raise ValueError("Invalid parameter set defined: %s. "
                         "Expected 'base' or 'big.'" % FLAGS.params)

    # Determine training schedule based on flags. Here --train_steps == 0
    # means "not set".
    if FLAGS.train_steps != 0 and FLAGS.train_epochs is not None:
        raise ValueError(
            "Both --train_steps and --train_epochs were set. Only one "
            "may be defined.")
    if FLAGS.train_steps != 0:
        train_eval_iterations = (
            FLAGS.train_steps // FLAGS.steps_between_eval)
        single_iteration_train_steps = FLAGS.steps_between_eval
        single_iteration_train_epochs = None
    else:
        if FLAGS.train_epochs is None:
            FLAGS.train_epochs = DEFAULT_TRAIN_EPOCHS
        train_eval_iterations = (
            FLAGS.train_epochs // FLAGS.epochs_between_eval)
        single_iteration_train_steps = None
        single_iteration_train_epochs = FLAGS.epochs_between_eval

    # Make sure that the BLEU source and ref files exist if both are set.
    if FLAGS.bleu_source is not None and FLAGS.bleu_ref is not None:
        if not tf.io.gfile.exists(FLAGS.bleu_source):
            raise ValueError(
                "BLEU source file %s does not exist" % FLAGS.bleu_source)
        if not tf.io.gfile.exists(FLAGS.bleu_ref):
            # Previously this message also said "source file", which pointed
            # at the wrong flag.
            raise ValueError(
                "BLEU ref file %s does not exist" % FLAGS.bleu_ref)

    # Add flag-defined parameters to params object. Note that `params` is
    # the parameter class itself, so these become class attributes.
    params.data_dir = FLAGS.data_dir
    params.num_cpu_cores = FLAGS.num_cpu_cores
    params.epochs_between_eval = FLAGS.epochs_between_eval
    params.repeat_dataset = single_iteration_train_epochs

    # Add inter_op and intra_op parallelism threads.
    session_config = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
        allow_soft_placement=True)

    if FLAGS.save_checkpoints == "No":
        # Skip summary and checkpoint saving, which takes a long time.
        run_config = tf.estimator.RunConfig(
            session_config=session_config,
            save_summary_steps=None,
            save_checkpoints_secs=None)
    else:
        run_config = tf.estimator.RunConfig(session_config=session_config)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=FLAGS.model_dir, params=params,
        config=run_config)
    train_schedule(
        estimator, train_eval_iterations, single_iteration_train_steps,
        single_iteration_train_epochs, FLAGS.bleu_source, FLAGS.bleu_ref,
        FLAGS.bleu_threshold)

    mlperf_log.transformer_print(key=mlperf_log.RUN_STOP)
    mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL)
def train_schedule(estimator, train_eval_iterations,
                   single_iteration_train_steps=None,
                   single_iteration_train_epochs=None,
                   bleu_source=None, bleu_ref=None, bleu_threshold=None):
    """Train and evaluate model, and optionally compute model's BLEU score.

    **Step vs. Epoch vs. Iteration**

    Steps and epochs are canonical terms used in TensorFlow and general
    machine learning. They are used to describe running a single process
    (train/eval):
      - Step refers to running the process through a single or batch of
        examples.
      - Epoch refers to running the process through an entire dataset.

    E.g. training a dataset with 100 examples. The dataset is divided into 20
    batches with 5 examples per batch. A single training step trains the
    model on one batch. After 20 training steps, the model will have trained
    on every batch in the dataset, or, in other words, one epoch.

    Meanwhile, iteration is used in this implementation to describe running
    multiple processes (training and eval).
      - A single iteration:
        1. trains the model for a specific number of steps or epochs.
        2. evaluates the model (only when FLAGS.do_eval == "Yes").
        3. (if source and ref files are provided) compute BLEU score.

    This function runs through multiple train+eval+bleu iterations. When
    BLEU evaluation is enabled and `bleu_threshold` is given, the iteration
    count is replaced by INF and the loop ends once the uncased BLEU score
    exceeds the threshold.

    Args:
      estimator: tf.Estimator containing model to train.
      train_eval_iterations: Number of times to repeat the train+eval
        iteration.
      single_iteration_train_steps: Number of steps to train in one
        iteration.
      single_iteration_train_epochs: Number of epochs to train in one
        iteration.
      bleu_source: File containing text to be translated for BLEU
        calculation.
      bleu_ref: File containing reference translations for BLEU calculation.
      bleu_threshold: minimum BLEU score before training is stopped.

    Raises:
      ValueError: if both or none of single_iteration_train_steps and
        single_iteration_train_epochs were defined.
    """
    # Ensure that exactly one of single_iteration_train_steps and
    # single_iteration_train_epochs is defined.
    if single_iteration_train_steps is None:
        if single_iteration_train_epochs is None:
            raise ValueError(
                "Exactly one of single_iteration_train_steps or "
                "single_iteration_train_epochs must be defined. "
                "Both were none.")
    else:
        if single_iteration_train_epochs is not None:
            raise ValueError(
                "Exactly one of single_iteration_train_steps or "
                "single_iteration_train_epochs must be defined. "
                "Both were defined.")

    evaluate_bleu = bleu_source is not None and bleu_ref is not None

    # Print out training schedule.
    print("Training schedule:")
    if single_iteration_train_epochs is not None:
        print("\t1. Train for %d epochs." % single_iteration_train_epochs)
    else:
        print("\t1. Train for %d steps." % single_iteration_train_steps)
    print("\t2. Evaluate model.")
    if evaluate_bleu:
        print("\t3. Compute BLEU score.")
        if bleu_threshold is not None:
            print("Repeat above steps until the BLEU score reaches",
                  bleu_threshold)
    if not evaluate_bleu or bleu_threshold is None:
        print("Repeat above steps %d times." % train_eval_iterations)

    bleu_writer = None
    if evaluate_bleu:
        # Set summary writer to log bleu score.
        bleu_writer = tf.compat.v1.summary.FileWriter(
            os.path.join(estimator.model_dir, BLEU_DIR))
        if bleu_threshold is not None:
            # Change loop stopping condition if bleu_threshold is defined.
            train_eval_iterations = INF

    # The writer is closed exactly once in `finally`, so it is also released
    # when the loop finishes normally or an exception escapes. Previously it
    # was closed only on the threshold `break` path.
    try:
        # Loop training/evaluation/bleu cycles.
        mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP)

        # Hooks for printing examples per second, used with estimator.train.
        train_hooks = hooks_helper.get_train_hooks(
            ["ExamplesPerSecondHook"],
            model_dir=FLAGS.model_dir,
            batch_size=estimator.params.batch_size,
            every_n_steps=FLAGS.print_iter,
            warm_steps=20)

        for i in xrange(train_eval_iterations):
            print("Starting iteration", i + 1)

            if single_iteration_train_epochs is not None:
                mlperf_log.transformer_print(
                    key=mlperf_log.TRAIN_EPOCH,
                    value=i * single_iteration_train_epochs + 1)

            # Train the model for single_iteration_train_steps or until the
            # input fn runs out of examples (if single_iteration_train_steps
            # is None).
            estimator.train(dataset.train_input_fn,
                            steps=single_iteration_train_steps,
                            hooks=train_hooks)

            mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
            # To save training time, evaluation can be turned off with
            # FLAGS.do_eval; otherwise it is on.
            # NOTE(review): the BLEU step is treated as part of evaluation
            # here. With do_eval off and a bleu_threshold set, the loop has
            # no stopping condition -- confirm this nesting is intended.
            if FLAGS.do_eval == "Yes":
                eval_results = estimator.evaluate(dataset.eval_input_fn)
                print(
                    "Evaluation results (iter %d/%d):"
                    % (i + 1, train_eval_iterations),
                    eval_results)

                if evaluate_bleu:
                    uncased_score, _ = evaluate_and_log_bleu(
                        estimator, bleu_writer, bleu_source, bleu_ref)
                    if (bleu_threshold is not None
                            and uncased_score > bleu_threshold):
                        break
                    # uncased_score only exists on this branch, so the
                    # accuracy tags are logged here.
                    mlperf_log.transformer_print(
                        key=mlperf_log.EVAL_TARGET, value=bleu_threshold)
                    mlperf_log.transformer_print(
                        key=mlperf_log.EVAL_ACCURACY, value=uncased_score)
            mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP)
    finally:
        if bleu_writer is not None:
            bleu_writer.close()
def get_train_op(loss, params):
    """Build the op that applies Adam updates for `loss`.

    Besides constructing the optimizer and its update op, this logs the
    warmup steps, optimizer name and Adam hyper-parameters to the MLPerf
    log, wraps the learning rate in a deferred MLPerf log op, and records
    the global gradient norm as a summary scalar.

    Args:
      loss: tensor to differentiate with respect to the trainable variables.
      params: object providing learning_rate, hidden_size,
        learning_rate_warmup_steps and the optimizer_adam_* attributes.

    Returns:
      The op returned by the optimizer's apply_gradients (named "train").
    """
    with tf.compat.v1.variable_scope("get_train_op"):
        mlperf_log.transformer_print(
            key=mlperf_log.OPT_LR_WARMUP_STEPS,
            value=params.learning_rate_warmup_steps)
        learning_rate = get_learning_rate(
            params.learning_rate, params.hidden_size,
            params.learning_rate_warmup_steps)
        # NOTE(review): OPT_LR goes through resnet_print while every other
        # tag in this function uses transformer_print -- confirm intended.
        log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR,
                                         deferred=True)
        learning_rate = tf_mlperf_log.log_deferred(
            op=learning_rate, log_id=log_id, every_n=100)

        # Report the optimizer configuration.
        # NOTE(review): the logged name is still LAZY_ADAM although the
        # optimizer built below is the plain Adam optimizer -- confirm
        # which name the log should carry.
        mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                     value=mlperf_log.LAZY_ADAM)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                     value=params.optimizer_adam_beta1)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                     value=params.optimizer_adam_beta2)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                     value=params.optimizer_adam_epsilon)

        # Use the v1 Adam optimizer. The v1 optimizers have no LazyAdam
        # variant (it lived in contrib, which is deprecated).
        optimizer = adam.AdamOptimizer(
            learning_rate,
            beta1=params.optimizer_adam_beta1,
            beta2=params.optimizer_adam_beta2,
            epsilon=params.optimizer_adam_epsilon)

        # Calculate and apply gradients, advancing the global step.
        global_step = tf.compat.v1.train.get_global_step()
        tvars = tf.compat.v1.trainable_variables()
        grads_and_vars = optimizer.compute_gradients(loss, tvars)
        train_op = optimizer.apply_gradients(
            grads_and_vars, global_step=global_step, name="train")

        # Save gradient norm to Tensorboard.
        tf.compat.v1.summary.scalar(
            "global_norm/gradient_norm",
            tf.linalg.global_norm(list(zip(*grads_and_vars))[0]))

        # Alternative: LazyAdam from tensorflow_addons (tfa), which is built
        # on optimizer_v2. Its apply_gradients() does not accept
        # global_step, which leads to warnings about the global step not
        # being updated, so the step is incremented explicitly and grouped
        # with the training op. To use it, replace the v1 optimizer code
        # above with the lines below. Both variants were reported to take
        # about the same time.
        #
        #   optimizer = tfa.optimizers.LazyAdam(
        #       learning_rate,
        #       beta_1=params.optimizer_adam_beta1,
        #       beta_2=params.optimizer_adam_beta2,
        #       epsilon=params.optimizer_adam_epsilon)
        #   global_step = tf.compat.v1.train.get_global_step()
        #   tvars = tf.compat.v1.trainable_variables()
        #   tvars = tvars[0:len(tvars)-1]
        #   gradients = optimizer.get_gradients(loss, tvars)
        #   grads_and_vars = zip(gradients, tvars)
        #   train_op = optimizer.apply_gradients(grads_and_vars)
        #   tf.compat.v1.summary.scalar(
        #       "global_norm/gradient_norm",
        #       tf.compat.v1.linalg.global_norm(list(gradients)))
        #   update_global_step = tf.compat.v1.assign(
        #       global_step, global_step + 1, name="update_global_step")
        #   train_op = tf.compat.v1.group(train_op, [(update_global_step)])

        return train_op
def call(self, x, y, bias, cache=None):
    """Apply attention mechanism to x and y.

    Args:
      x: a tensor with shape [batch_size, length_x, hidden_size]
      y: a tensor with shape [batch_size, length_y, hidden_size]
      bias: attention bias that will be added to the result of the dot
        product.
      cache: (Used during prediction) dictionary with tensors containing
        results of previous attentions. The dictionary must have the items:
            {"k": tensor with shape [batch_size, i, key_channels],
             "v": tensor with shape [batch_size, i, value_channels]}
        where i is the current decoded length. It is updated in place with
        the concatenated keys and values.

    Returns:
      Attention layer output with shape [batch_size, length_x, hidden_size]
    """

    def _cut(cut_op, tensor):
        """Apply a bf16 cut op and restore the tensor's dynamic shape."""
        return tf.reshape(cut_op(tensor), tf.shape(tensor))

    # Project the query from x and the key/value from y with separate
    # learned projections, in preparation for splitting into heads.
    query = self.q_dense_layer(x)
    key = self.k_dense_layer(y)
    value = self.v_dense_layer(y)

    if cache is not None:
        # Prepend the cached keys/values and store the result back so the
        # next decoding step sees the extended sequence.
        key = tf.concat([cache["k"], key], axis=1)
        value = tf.concat([cache["v"], value], axis=1)
        cache["k"] = key
        cache["v"] = value

    # Split q, k, v into heads.
    query = self.split_heads(query)
    key = self.split_heads(key)
    value = self.split_heads(value)

    # Scale the query to prevent the dot product between query and key from
    # growing too large.
    head_depth = self.hidden_size // self.num_heads
    query = query * head_depth**-0.5

    # Dot-product attention; the matmul inputs go through bf16cut_fp and
    # the matmul result through bf16cut_bp.
    query = _cut(bf16cut_fp, query)
    key = _cut(bf16cut_fp, key)
    logits = tf.matmul(query, key, transpose_b=True)
    logits = _cut(bf16cut_bp, logits)
    logits += bias
    weights = tf.nn.softmax(logits, name="attention_weights")

    if self.train:
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
            value=self.attention_dropout)
        weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout)

    # Same bf16 treatment around the weights-times-values matmul.
    weights = _cut(bf16cut_fp, weights)
    value = _cut(bf16cut_fp, value)
    attended = tf.matmul(weights, value)
    attended = _cut(bf16cut_bp, attended)

    # Recombine heads --> [batch_size, length, hidden_size], then run the
    # combined outputs through another linear projection layer.
    attended = self.combine_heads(attended)
    return self.output_dense_layer(attended)
def main(_):
    """Train and evaluate the Transformer model as configured by FLAGS.

    Seeds the Python, TensorFlow and NumPy random generators, selects the
    parameter set, derives the train/eval schedule from either
    --train_steps or --train_epochs, builds a tf.estimator.Estimator and
    hands it to train_schedule. MLPerf RUN_START / RUN_STOP / RUN_FINAL tags
    are emitted around the run.

    Raises:
      Exception: if no random seed was given.
      ValueError: if FLAGS.params is neither "base" nor "big", if both
        --train_steps and --train_epochs are set, or if a BLEU source or
        reference file is configured but does not exist.
    """
    # Set logging level to INFO to display training progress (logged by the
    # estimator).
    tf.logging.set_verbosity(tf.logging.INFO)

    mlperf_log.transformer_print(key=mlperf_log.RUN_START)

    # Set random seed.
    if FLAGS.random_seed is None:
        raise Exception('No Random seed given')
    print('Setting random seed = ', FLAGS.random_seed)
    seed = FLAGS.random_seed
    mlperf_log.transformer_print(key=mlperf_log.RUN_SET_RANDOM_SEED,
                                 value=seed)
    random.seed(seed)
    tf.set_random_seed(seed)
    numpy.random.seed(seed)

    if FLAGS.params == "base":
        params = model_params.TransformerBaseParams
    elif FLAGS.params == "big":
        params = model_params.TransformerBigParams
    else:
        # The two sentences used to run together ("%s.Expected").
        raise ValueError("Invalid parameter set defined: %s. "
                         "Expected 'base' or 'big'." % FLAGS.params)

    # Determine training schedule based on flags.
    if FLAGS.train_steps is not None and FLAGS.train_epochs is not None:
        raise ValueError(
            "Both --train_steps and --train_epochs were set. Only one "
            "may be defined.")
    if FLAGS.train_steps is not None:
        train_eval_iterations = FLAGS.train_steps // FLAGS.steps_between_eval
        single_iteration_train_steps = FLAGS.steps_between_eval
        single_iteration_train_epochs = None
    else:
        if FLAGS.train_epochs is None:
            FLAGS.train_epochs = DEFAULT_TRAIN_EPOCHS
        train_eval_iterations = (
            FLAGS.train_epochs // FLAGS.epochs_between_eval)
        single_iteration_train_steps = None
        single_iteration_train_epochs = FLAGS.epochs_between_eval

    # Make sure that the BLEU source and reference files exist, if both are
    # set.
    if FLAGS.bleu_source is not None and FLAGS.bleu_ref is not None:
        if not tf.gfile.Exists(FLAGS.bleu_source):
            raise ValueError(
                "BLEU source file %s does not exist" % FLAGS.bleu_source)
        if not tf.gfile.Exists(FLAGS.bleu_ref):
            # This message previously said "source" for the reference file.
            raise ValueError(
                "BLEU ref file %s does not exist" % FLAGS.bleu_ref)

    # Add flag-defined parameters to params object. Note that the attributes
    # are set on the params class itself, not on an instance.
    params.data_dir = FLAGS.data_dir
    params.num_cpu_cores = FLAGS.num_cpu_cores
    params.epochs_between_eval = FLAGS.epochs_between_eval
    params.repeat_dataset = single_iteration_train_epochs

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)
    train_schedule(estimator, train_eval_iterations,
                   single_iteration_train_steps,
                   single_iteration_train_epochs, FLAGS.bleu_source,
                   FLAGS.bleu_ref, FLAGS.bleu_threshold)

    mlperf_log.transformer_print(key=mlperf_log.RUN_STOP)
    mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL)
def main(args):
    """Build dictionaries and preprocessed datasets from raw parallel text.

    Steps, in order:
      1. Log the configured vocabulary sizes and create ``args.destdir``.
      2. Build (or load) the source and, unless ``args.only_source`` is set,
         the target dictionary; finalize and save them under ``destdir``.
      3. For the train/valid/test prefixes that are set, write each split in
         ``args.output_format`` ('binary' indexed dataset or 'raw' copy).
      4. If ``args.alignfile`` is set, write for every source token the target
         token it is most frequently aligned with.

    Args:
      args: parsed command-line namespace; the attributes read are the ones
        referenced below (destdir, trainpref, validpref, testpref,
        source_lang, target_lang, srcdict, tgtdict, joined_dictionary, ...).
    """
    print(args)
    transformer_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
                      value={
                          'src': args.nwordssrc,
                          'tgt': args.nwordstgt
                      })
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    def build_dictionary(filenames):
        # Count the tokens of every given file into one fresh dictionary.
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d

    def train_path(lang):
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    def dataset_dest_path(output_prefix, lang, extension):
        base = f'{args.destdir}/{output_prefix}'
        lang_part = (
            f'.{args.source_lang}-{args.target_lang}.{lang}'
            if lang is not None else '')
        return f'{base}{lang_part}.{extension}'

    if args.joined_dictionary:
        assert not args.srcdict, \
            'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, \
            'cannot combine --tgtdict and --joined-dictionary'
        # A set, so a shared train file is only counted once.
        src_dict = build_dictionary({
            train_path(lang)
            for lang in [args.source_lang, args.target_lang]
        })
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, \
                "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, \
                    "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        # With a joined dictionary tgt_dict is src_dict and is already
        # finalized above.
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang):
        # Named `vocab` rather than `dict` so the builtin is not shadowed.
        vocab = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, vocab, consumer)
        # An empty input file yields zero tokens; report 0% instead of
        # dividing by zero.
        ntok = res['ntok']
        pct_replaced = 100 * res['nunk'] / ntok if ntok else 0.0
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], ntok, pct_replaced,
            vocab.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix +
                '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            # Comma-separated prefixes become valid, valid1, valid2, ...
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, \
            "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))

        # freq_map[src_index][tgt_index] -> number of times the pair was
        # aligned.
        freq_map = {}
        with open(args.alignfile, 'r') as align_file, \
                open(src_file_name, 'r') as src_file, \
                open(tgt_file_name, 'r') as tgt_file:
            # NOTE(review): zip_longest pads the shorter files with None, so
            # files of unequal length fail inside the loop body rather than
            # being silently truncated -- confirm that is the intent.
            for a, s, t in zip_longest(align_file, src_file, tgt_file):
                si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                # Each alignment entry has the form "<src_pos>-<tgt_pos>".
                ai = [tuple(pair.split('-')) for pair in a.split()]
                for sai, tai in ai:
                    srcidx = si[int(sai)]
                    tgtidx = ti[int(tai)]
                    if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                        assert srcidx != src_dict.pad()
                        assert srcidx != src_dict.eos()
                        assert tgtidx != tgt_dict.pad()
                        assert tgtidx != tgt_dict.eos()

                        tgt_counts = freq_map.setdefault(srcidx, {})
                        tgt_counts[tgtidx] = tgt_counts.get(tgtidx, 0) + 1

        # Keep, per source token, the most frequently aligned target token.
        align_dict = {
            srcidx: max(tgt_counts, key=tgt_counts.get)
            for srcidx, tgt_counts in freq_map.items()
        }

        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)),
                'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def call(self, x, y, bias, cache=None, encdec_cache=None):
    """Apply attention mechanism to x and y, computing in bfloat16.

    Args:
      x: a tensor with shape [batch_size, length_x, hidden_size]. Cast to
        bfloat16 when it arrives as float32.
      y: a tensor with shape [batch_size, length_y, hidden_size]. Cast to
        bfloat16 when it arrives as float32.
      bias: attention bias that will be added to the result of the dot
        product. Always cast to bfloat16.
      cache: (Used during prediction) dictionary with tensors containing
        results of previous attentions. The dictionary must have the items:
            {"k": tensor with shape [batch_size, i, key_channels],
             "v": tensor with shape [batch_size, i, value_channels]}
        where i is the current decoded length. The entries are overwritten
        with the concatenated keys/values.
      encdec_cache: optional dictionary; when given, its "k" and "v" entries
        are used directly as keys and values and y is not projected.

    Returns:
      Attention layer output with shape [batch_size, length_x, hidden_size]
    """
    # NOTE(review): the exact extent of the two bfloat16_scope blocks could
    # not be recovered from the flattened source; here the projections and
    # cache handling sit in the first scope and everything from the logits
    # to the output projection in the second -- confirm against the original.
    with tf.compat.v1.tpu.bfloat16_scope():
        if x.dtype == tf.float32:
            x = tf.cast(x, tf.bfloat16)
        if y.dtype == tf.float32:
            y = tf.cast(y, tf.bfloat16)

        # Linearly project the query (q), key (k) and value (v) using
        # different learned projections, in preparation of splitting them
        # into multiple heads.
        q = self.q_dense_layer(x)
        if encdec_cache is not None:
            # Reuse the supplied keys/values instead of projecting y.
            k = encdec_cache["k"]
            v = encdec_cache["v"]
        else:
            k = self.k_dense_layer(y)
            v = self.v_dense_layer(y)

        if cache is not None:
            # Combine cached keys and values with new keys and values.
            k = tf.concat([cache["k"], k], axis=1)
            v = tf.concat([cache["v"], v], axis=1)

            # Update cache
            cache["k"] = k
            cache["v"] = v

    # Split q, k, v into heads.
    q = self.split_heads(q)
    k = self.split_heads(k)
    v = self.split_heads(v)

    # Calculate dot product attention. The scaling is applied to the logits
    # through Attention.rsqrtQ rather than to q (presumably depth ** -0.5;
    # verify where rsqrtQ is set).
    with tf.compat.v1.tpu.bfloat16_scope():
        bias = tf.cast(bias, tf.bfloat16)
        logits = tf.matmul(q, k, transpose_b=True)
        logits *= Attention.rsqrtQ
        logits += bias
        weights = tf.nn.softmax(logits, name="attention_weights")
        if self.train:
            mlperf_log.transformer_print(
                key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                value=self.attention_dropout)
            # Pass the drop rate by keyword: the second positional argument
            # means keep_prob in TF1 but rate in TF2, and the previous
            # `1 - (1.0 - p)` form was not bit-equal to the logged value.
            weights = tf.nn.dropout(weights, rate=self.attention_dropout)
        attention_output = tf.matmul(weights, v)

        # Recombine heads --> [batch_size, length, hidden_size]
        attention_output = self.combine_heads(attention_output)

        # Run the combined outputs through another linear projection layer.
        attention_output = self.output_dense_layer(attention_output)
    return attention_output
def Linear(in_features, out_features, bias=True):
    """Create an nn.Linear with Xavier-uniform weights and a zero bias.

    Also logs the MLPerf MODEL_HP_INITIALIZER_GAIN tag with value 1.

    Args:
      in_features: size of each input sample.
      out_features: size of each output sample.
      bias: whether the layer has an additive bias.

    Returns:
      The initialized nn.Linear module.
    """
    m = nn.Linear(in_features, out_features, bias)
    transformer_print(mlperf_log.MODEL_HP_INITIALIZER_GAIN, value=1)
    nn.init.xavier_uniform_(m.weight)
    # With bias=False the module's bias attribute is None, and initializing
    # it unconditionally would raise.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.)
    return m
def _read_and_batch_from_files(file_pattern,
                               batch_size,
                               max_length,
                               num_cpu_cores,
                               shuffle,
                               repeat,
                               has_horovod=False,
                               static_batch=False):
    """Create a batched dataset of (inputs, targets) examples from files.

    Args:
      file_pattern: String used to match the input TFRecord files.
      batch_size: Maximum number of tokens per batch of examples
      max_length: Maximum number of tokens per example
      num_cpu_cores: Number of cpu cores for parallel input processing.
      shuffle: If true, randomizes order of elements.
      repeat: Number of times to repeat the dataset. If None, the dataset is
        repeated forever.
      has_horovod: mark if this instance is running with horovod. The dataset
        is only sharded across horovod workers when `shuffle` is also true.
      static_batch: Whether the batches in the dataset should have static
        shapes. If True (the legacy string "Yes" is accepted as well), the
        input is batched so that every batch has the shape
        [batch_size // max_length, max_length]. Otherwise the input is
        grouped by length, and batched so that batches may have different
        shapes [N, M], where:
          N * M <= batch_size
          M <= max_length
        In general, this setting should be False. Dynamic shapes allow the
        inputs to be grouped so that the number of padding tokens is
        minimized, and helps model training. In cases where the input shape
        must be static (e.g. running on TPU), this setting should be set to
        True.

    Returns:
      tf.data.Dataset object containing examples loaded from the files.
    """
    dataset = tf.data.Dataset.list_files(file_pattern)

    if shuffle:
        # Shuffle filenames
        mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
        dataset = dataset.shuffle(buffer_size=_FILE_SHUFFLE_BUFFER)

    # Read files and interleave results. When training, the order of the
    # examples will be non-deterministic.
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(_load_records,
                                                 sloppy=shuffle,
                                                 cycle_length=num_cpu_cores))

    # Parse each tf.Example into a dictionary
    # TODO: Look into prefetch_input_elements for performance optimization.
    dataset = dataset.map(_parse_example, num_parallel_calls=num_cpu_cores)

    # Remove examples where the input or target length exceeds the maximum
    # length.
    dataset = dataset.filter(lambda x, y: _filter_max_length(
        (x, y), max_length))

    mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE,
                                 value=batch_size)
    mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                 value=max_length)

    # The parameter is documented as a boolean but was only ever compared to
    # the string "Yes", so True silently selected dynamic batching. Accept
    # both spellings; other strings (e.g. "No") still mean dynamic.
    if static_batch is True or static_batch == "Yes":
        dataset = dataset.padded_batch(batch_size // max_length,
                                       ([max_length], [max_length]),
                                       drop_remainder=True)
    else:
        # Group and batch such that each batch has examples of similar
        # length.
        dataset = _batch_examples(dataset, batch_size, max_length)

    dataset = dataset.repeat(repeat)

    # horovod: shard across workers when running multi-instance training.
    # TODO: verify if it is working
    if shuffle and has_horovod:
        import horovod.tensorflow as hvd
        dataset = dataset.shard(hvd.size(), hvd.rank())

    # Prefetch the next element to improve speed of input pipeline.
    dataset = dataset.prefetch(1)
    return dataset
def main(args):
    """Train the model until a stopping criterion is met, logging MLPerf tags.

    Training runs epoch by epoch while the learning rate is at least
    ``args.min_lr``, the epoch count is below ``args.max_epoch``, the number
    of updates is below ``args.max_update`` and the BLEU score is below
    ``args.target_bleu`` (unset limits are treated as infinite). After each
    epoch the model is optionally validated, scored, the learning rate is
    stepped on the first validation loss and a checkpoint is optionally
    saved.

    Args:
      args: parsed command-line namespace.

    Raises:
      NotImplementedError: if CUDA is not available.
    """
    import time

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    from mlperf_compliance.mlperf_log import transformer_print
    # Caches on the host should be cleared before this tag is emitted.
    transformer_print(key=mlperf_log.RUN_CLEAR_CACHES)

    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert torch.distributed.is_initialized()
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

    transformer_print(key=mlperf_log.RUN_START)

    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)
    transformer_print(key=mlperf_log.OPT_NAME, value=args.optimizer)
    transformer_print(key=mlperf_log.OPT_LR, value=args.lr)
    # NOTE(review): eval() executes arbitrary code from the command-line
    # string; ast.literal_eval would be the safe choice if adam_betas is
    # always a literal tuple -- confirm before switching. It is evaluated
    # once here instead of once per tag.
    adam_betas = eval(args.adam_betas)
    transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1, value=adam_betas[0])
    transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2, value=adam_betas[1])
    transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                      value=args.adam_eps)

    # Set CUDA device limit 0x05 to 128 and read it back (presumably
    # cudaLimitMaxL2FetchGranularity -- confirm against the CUDA runtime
    # headers). The return codes are not checked.
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    torch.cuda.cudart().cudaDeviceSetLimit(ctypes.c_int(0x05),
                                           ctypes.c_int(128))
    torch.cuda.cudart().cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))

    torch.manual_seed(args.seed)
    transformer_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    transformer_print(key=mlperf_log.MODEL_HP_SEQ_BEAM_SEARCH,
                      value={
                          'alpha': args.lenpen,
                          'beam_size': args.beam,
                          'extra_decode_length': args.max_len_b,
                          'vocab_size': len(task.target_dictionary)
                      })

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        trainer = FP16Trainer(args, task, model, criterion)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        trainer = Trainer(args, task, model, criterion)
    if (args.online_eval or args.target_bleu) and not args.remove_bpe:
        args.remove_bpe = '@@ '
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    transformer_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.max_tokens)
    transformer_print(key=mlperf_log.INPUT_ORDER)

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = task.dataset('train').get_dummy_batch(
        args.max_tokens, max_positions)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target
    # score
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')

    # Number of completed passes through the loop below.
    ctr = 0

    class DummyEpochBatchIterator:
        """Stand-in exposing only `.epoch`, used before the first real iterator."""

        def __init__(self, epoch=0):
            self.epoch = epoch

    epoch_itr = DummyEpochBatchIterator(0)
    transformer_print(key=mlperf_log.TRAIN_LOOP)

    while (lr >= args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update
           and current_bleu < tgt_bleu):
        transformer_print(key=mlperf_log.TRAIN_EPOCH, value=epoch_itr.epoch)

        start = time.time()
        # NOTE(review): the training iterator is built with
        # args.max_sentences_valid rather than args.max_sentences -- confirm
        # this is intentional.
        epoch_itr = data.EpochBatchIterator(
            dataset=task.dataset(args.train_subset),
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences_valid,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            required_batch_size_multiple=8,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            # Compared by value; `is` on an int literal only works through
            # CPython's small-int cache and warns on Python 3.8+.
            epoch=epoch_itr.epoch if ctr != 0 else 0)
        print("got epoch iterator", time.time() - start)

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # train for one epoch
        start = time.time()
        train(args, trainer, task, epoch_itr)
        print("epoch time ", time.time() - start)

        start = time.time()
        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)

        # Eval BLEU score
        transformer_print(key=mlperf_log.EVAL_START, value=epoch_itr.epoch)
        # Score when online evaluation is requested or a target BLEU was set
        # (tgt_bleu is infinite when no target was given).
        if args.online_eval or tgt_bleu != math.inf:
            current_bleu = score(args, trainer, task, epoch_itr,
                                 args.gen_subset)
            transformer_print(key=mlperf_log.EVAL_ACCURACY,
                              value={
                                  'epoch': epoch_itr.epoch,
                                  'value': current_bleu
                              })
        transformer_print(key=mlperf_log.EVAL_TARGET, value=tgt_bleu)
        transformer_print(key=mlperf_log.EVAL_STOP, value=epoch_itr.epoch)

        # Only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr = ctr + 1
        print("validation and scoring ", time.time() - start)

    train_meter.stop()
    transformer_print(key=mlperf_log.RUN_STOP)
    transformer_print(key=mlperf_log.RUN_FINAL)
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def encode_and_save_files(subtokenizer, data_dir, raw_files, tag,
                          total_shards):
    """Save data from files as encoded Examples in TFrecord format.

    Examples are written round robin across the shards into
    "<shard>.incomplete" files, which are renamed to their final names only
    after every shard has been written and closed.

    Args:
      subtokenizer: Subtokenizer object that will be used to encode the
        strings.
      data_dir: The directory in which to write the examples
      raw_files: A tuple of (input, target) data files. Each line in the
        input and the corresponding line in target file will be saved in a
        tf.Example.
      tag: String that will be added onto the file names.
      total_shards: Number of files to divide the data into.

    Returns:
      List of all files produced.
    """
    # Create a file for each shard.
    filepaths = [
        shard_filename(data_dir, tag, n + 1, total_shards)
        for n in range(total_shards)
    ]

    if all_exist(filepaths):
        tf.logging.info("Files with tag %s already exist." % tag)
        return filepaths

    tf.logging.info("Saving files with tag %s." % tag)
    input_file = raw_files[0]
    target_file = raw_files[1]

    # Write examples to each shard in round robin order.
    tmp_filepaths = [fname + ".incomplete" for fname in filepaths]
    writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths]

    # Number of examples written. The zero-based enumerate index used to be
    # reported directly, under-counting by one for any non-empty input.
    num_examples = 0
    try:
        for index, (input_line, target_line) in enumerate(
                zip(txt_line_iterator(input_file),
                    txt_line_iterator(target_file))):
            if index > 0 and index % 100000 == 0:
                tf.logging.info("\tSaving case %d." % index)
            example = dict_to_example({
                "inputs": subtokenizer.encode(input_line, add_eos=True),
                "targets": subtokenizer.encode(target_line, add_eos=True)
            })
            writers[index % total_shards].write(example.SerializeToString())
            num_examples = index + 1
    finally:
        # Close the writers even if encoding or writing fails, so no file
        # handles leak; the ".incomplete" files are then left unrenamed.
        for writer in writers:
            writer.close()

    for tmp_name, final_name in zip(tmp_filepaths, filepaths):
        tf.gfile.Rename(tmp_name, final_name)

    if tag == _TRAIN_TAG:
        mlperf_log.transformer_print(
            key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES, value=num_examples)
    elif tag == _EVAL_TAG:
        mlperf_log.transformer_print(
            key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=num_examples)

    tf.logging.info("Saved %d Examples", num_examples)
    return filepaths
def _read_and_batch_from_files(file_pattern, batch_size, max_length,
                               num_cpu_cores, shuffle, repeat):
    """Build the input pipeline: list files, read, parse, filter, batch, repeat.

    Args:
      file_pattern: String used to match the input TFRecord files.
      batch_size: Maximum number of tokens per batch of examples.
      max_length: Maximum number of tokens per example.
      num_cpu_cores: Number of cpu cores for parallel input processing.
      shuffle: If true, the file names are shuffled and records are
        interleaved in a non-deterministic ("sloppy") order.
      repeat: Number of times to repeat the dataset. If None, the dataset is
        repeated forever.

    Returns:
      tf.data.Dataset object containing examples loaded from the files.
    """
    # Debug tracing: announce the call, then show the file dataset's type and
    # every matched file name.
    print("SSY _read_and_batch_from_files")
    file_names = tf.data.Dataset.list_files(file_pattern)
    print("dataset type {}".format(type(file_names)))
    # NOTE(review): iterating a dataset and calling .numpy() presumably needs
    # eager execution -- confirm this path is never built in graph mode.
    for file_name in file_names:
        print(file_name.numpy())

    if shuffle:
        # Shuffle the file names (list_files is also able to shuffle).
        mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
        file_names = file_names.shuffle(buffer_size=_FILE_SHUFFLE_BUFFER)

    # Read the files and interleave their records. When training, the order
    # of the examples will be non-deterministic.
    interleave_records = tf.contrib.data.parallel_interleave(
        _load_records, sloppy=shuffle, cycle_length=num_cpu_cores)
    records = file_names.apply(interleave_records)

    # Parse each tf.Example into a dictionary
    # TODO: Look into prefetch_input_elements for performance optimization.
    examples = records.map(_parse_example, num_parallel_calls=num_cpu_cores)

    # Drop examples whose input or target is longer than max_length.
    def _within_max_length(x, y):
        return _filter_max_length((x, y), max_length)

    examples = examples.filter(_within_max_length)

    # Batch such that each batch has examples of similar length.
    mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE,
                                 value=batch_size)
    mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                 value=max_length)
    batches = _batch_examples(examples, batch_size, max_length)
    batches = batches.repeat(repeat)

    # Prefetch the next element to improve speed of input pipeline.
    return batches.prefetch(1)