Example No. 1
    def init_from_files(vocab_file,
                        files,
                        target_vocab_size,
                        threshold,
                        min_count=None,
                        file_byte_limit=1e6,
                        reserved_tokens=None):
        """Create subtoken vocabulary based on files, and save vocab to file.

    Args:
      vocab_file: String name of vocab file to store subtoken vocabulary.
      files: List of file paths that will be used to generate vocabulary.
      target_vocab_size: target vocabulary size to generate.
      threshold: int threshold of vocabulary size to accept.
      min_count: int minimum count to use for generating the vocabulary. The min
        count is the minimum number of times a subtoken should appear in the
        files before it is added to the vocabulary. If set to None, this value
        is found using binary search.
      file_byte_limit: (Default 1e6) Maximum number of bytes of sample text that
        will be drawn from the files.
      reserved_tokens: List of string tokens that are guaranteed to be at the
        beginning of the subtoken vocabulary list.

    Returns:
      Subtokenizer object
    """
        if reserved_tokens is None:
            reserved_tokens = RESERVED_TOKENS

        if tf.gfile.Exists(vocab_file):
            tf.logging.info("Vocab file already exists (%s)" % vocab_file)
        else:
            tf.logging.info("Begin steps to create subtoken vocabulary...")
            token_counts = _count_tokens(files, file_byte_limit)
            alphabet = _generate_alphabet_dict(token_counts)
            subtoken_list = _generate_subtokens_with_target_vocab_size(
                token_counts, alphabet, target_vocab_size, threshold,
                min_count, reserved_tokens)
            tf.logging.info("Generated vocabulary with %d subtokens." %
                            len(subtoken_list))
            mlperf_log.transformer_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
                                         value=len(subtoken_list))
            _save_vocab_file(vocab_file, subtoken_list)
        return Subtokenizer(vocab_file)
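
The _save_vocab_file helper is not shown in this example. A minimal sketch, assuming it simply writes one subtoken per line so the file can later be re-read to rebuild the Subtokenizer (name and quoting are illustrative):

import tensorflow as tf

def _save_vocab_file_sketch(vocab_file, subtoken_list):
    # Hypothetical stand-in for _save_vocab_file; the real helper may quote or
    # escape subtokens differently.
    with tf.io.gfile.GFile(vocab_file, mode="w") as f:
        for subtoken in subtoken_list:
            f.write("%s\n" % subtoken)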
Example No. 2
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
        with tf.compat.v1.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            decoder_inputs = self.embedding_softmax_layer(targets)
            with tf.compat.v1.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(tensor=decoder_inputs,
                                        paddings=[[0, 0], [1, 0],
                                                  [0, 0]])[:, :-1, :]
            with tf.compat.v1.name_scope("add_pos_encoding"):
                length = tf.shape(input=decoder_inputs)[1]
                poscod = model_utils.get_position_encoding(
                    length, self.params.hidden_size)
                decoder_inputs += poscod
            if self.train:
                mlperf_log.transformer_print(
                    key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
                    value=self.params.layer_postprocess_dropout)
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - (1 - self.params.layer_postprocess_dropout))

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            logits = self.embedding_softmax_layer.linear(outputs)
            return logits
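
The shift_targets block above right-shifts the decoder inputs so that position t only sees targets before t. A tiny standalone sketch of the same pad-and-slice trick:

import tensorflow as tf

x = tf.constant([[[1.], [2.], [3.]]])  # [batch=1, target_length=3, hidden=1]
shifted = tf.pad(tensor=x, paddings=[[0, 0], [1, 0], [0, 0]])[:, :-1, :]
# shifted == [[[0.], [1.], [2.]]]: a zero vector is prepended and the last time
# step is dropped, so the decoder predicts token t from tokens < t.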
Example No. 3
    def call(self, x, padding=None):

        with tf.compat.v1.tpu.bfloat16_scope():
            #x = tf.cast(x, tf.bfloat16)
            # Retrieve dynamically known shapes
            batch_size = tf.shape(input=x)[0]
            length = tf.shape(input=x)[1]

            if padding is not None:
                with tf.compat.v1.name_scope("remove_padding"):
                    # Flatten padding to [batch_size*length]
                    pad_mask = tf.reshape(padding, [-1])

                    nonpad_ids = tf.cast(tf.compat.v1.where(pad_mask < 1e-9),
                                         dtype=tf.int32)

                    # Reshape x to [batch_size*length, hidden_size] to remove padding
                    x = tf.reshape(x, [-1, self.hidden_size])
                    x = tf.gather_nd(x, indices=nonpad_ids)

                    # Reshape x from 2 dimensions to 3 dimensions.
                    x.set_shape([None, self.hidden_size])
                    x = tf.expand_dims(x, axis=0)

            output = self.filter_dense_layer(x)
            if self.train:
                mlperf_log.transformer_print(
                    key=mlperf_log.MODEL_HP_RELU_DROPOUT,
                    value=self.relu_dropout)
                output = tf.nn.dropout(output, 1 - (1.0 - self.relu_dropout))
            output = self.output_dense_layer(output)

            if padding is not None:
                with tf.compat.v1.name_scope("re_add_padding"):
                    output = tf.squeeze(output, axis=0)
                    output = tf.scatter_nd(
                        indices=nonpad_ids,
                        updates=output,
                        shape=[batch_size * length, self.hidden_size])
                output = tf.reshape(output,
                                    [batch_size, length, self.hidden_size])
            #output = tf.cast(output, tf.float32)
        return output
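
The remove_padding / re_add_padding logic above can be exercised on a toy tensor. A hedged sketch, assuming a float padding mask where 1.0 marks padded positions:

import tensorflow as tf

x = tf.constant([[1., 1.], [2., 2.], [3., 3.], [4., 4.]])  # [batch*length=4, hidden=2]
pad_mask = tf.constant([0., 1., 0., 1.])                   # 1.0 marks padding
nonpad_ids = tf.cast(tf.where(pad_mask < 1e-9), tf.int32)  # [[0], [2]]
compact = tf.gather_nd(x, indices=nonpad_ids)              # keeps rows 0 and 2
restored = tf.scatter_nd(indices=nonpad_ids, updates=compact, shape=[4, 2])
# restored puts zeros back in the padded rows 1 and 3.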
Example No. 4
def get_train_op(loss, params):
    """Generate training operation that updates variables based on loss."""
    with tf.variable_scope("get_train_op"):
        mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                     value=params.learning_rate_warmup_steps)
        learning_rate = get_learning_rate(params.learning_rate,
                                          params.hidden_size,
                                          params.learning_rate_warmup_steps)
        log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
        learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                                   log_id=log_id,
                                                   every_n=100)

        # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
        # than the TF core Adam optimizer.
        mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                     value=mlperf_log.LAZY_ADAM)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                     value=params.optimizer_adam_beta1)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                     value=params.optimizer_adam_beta2)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                     value=params.optimizer_adam_epsilon)
        optimizer = tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params.optimizer_adam_beta1,
            beta2=params.optimizer_adam_beta2,
            epsilon=params.optimizer_adam_epsilon)

        # Calculate and apply gradients using LazyAdamOptimizer.
        global_step = tf.train.get_global_step()
        tvars = tf.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        train_op = optimizer.apply_gradients(gradients,
                                             global_step=global_step,
                                             name="train")

        # Save gradient norm to Tensorboard
        tf.summary.scalar("global_norm/gradient_norm",
                          tf.global_norm(list(zip(*gradients))[0]))

        return train_op
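
get_learning_rate is not included in this example. A hedged sketch of the schedule it is commonly assumed to implement in the reference Transformer (linear warmup followed by inverse-square-root decay, scaled by 1/sqrt(hidden_size)); the function name and exact form are illustrative:

def noam_learning_rate(base_lr, hidden_size, warmup_steps, step):
    # Sketch only; the actual get_learning_rate may apply the warmup ratio
    # differently.
    step = max(float(step), 1.0)
    return base_lr * hidden_size ** -0.5 * min(step * warmup_steps ** -1.5,
                                               step ** -0.5)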
Example No. 5
  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
        value=params.num_hidden_layers)
    for _ in range(params.num_hidden_layers):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    # Create final layer normalization layer.
    #self.output_normalization = LayerNormalization(params.hidden_size)
    self.output_normalization = tf.keras.layers.LayerNormalization(epsilon=0.000001, dtype=policy)
Example No. 6
    def call(self, x, padding=None):
        # Retrieve dynamically known shapes
        batch_size = tf.shape(input=x)[0]
        length = tf.shape(input=x)[1]

        with tf.compat.v1.tpu.bfloat16_scope():
            # Reshape to 2D tensor
            x = tf.reshape(x, [-1, self.hidden_size])
            output = self.filter_dense_layer(x)
            if self.train:
                mlperf_log.transformer_print(
                    key=mlperf_log.MODEL_HP_RELU_DROPOUT,
                    value=self.relu_dropout)
                output = tf.nn.dropout(output, 1 - (1.0 - self.relu_dropout))
            output = self.output_dense_layer(output)

            # Reshape back to 3D tensor
            output = tf.reshape(output, [batch_size, length, self.hidden_size])

        return output
Example No. 7
    def __call__(self, inputs, targets=None):
        """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If targets is None, then generate the output sequence one token at a time.
        returns a dictionary {
          output: [batch_size, decoded length]
          score: [batch_size, float]}
    """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_INITIALIZER_GAIN,
                                     value=self.params.initializer_gain)
        initializer = tf.compat.v1.variance_scaling_initializer(
            self.params.initializer_gain,
            mode="fan_avg",
            distribution="uniform")
        with tf.compat.v1.variable_scope("Transformer",
                                         initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(inputs)

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias)

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:
                out_seq = self.predict(encoder_outputs, attention_bias)
                return out_seq
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits
Example No. 8
    def __init__(self, args, optimizer):
        super().__init__(args, optimizer)
        if len(args.lr) > 1:
            raise ValueError(
                'Cannot use a fixed learning rate schedule with inverse_sqrt.'
                ' Consider --lr-scheduler=fixed instead.')
        mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                     value=args.warmup_updates)
        warmup_end_lr = args.lr[0]
        if args.warmup_init_lr < 0:
            args.warmup_init_lr = warmup_end_lr

        # linearly warmup for the first args.warmup_updates
        self.lr_step = (warmup_end_lr -
                        args.warmup_init_lr) / args.warmup_updates

        # then, decay prop. to the inverse square root of the update number
        self.decay_factor = warmup_end_lr * args.warmup_updates**0.5

        # initial learning rate
        self.lr = args.warmup_init_lr
        self.optimizer.set_lr(self.lr)
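
Only the constructor is shown above; fairseq-style inverse_sqrt schedulers pair it with a per-update rule. A hedged, standalone sketch of that rule, reusing the lr_step and decay_factor computed in the constructor (function name is illustrative):

def inverse_sqrt_lr_sketch(num_updates, warmup_init_lr, warmup_updates,
                           lr_step, decay_factor):
    # Linear warmup for the first warmup_updates steps, then decay with the
    # inverse square root of the update number.
    if num_updates < warmup_updates:
        return warmup_init_lr + num_updates * lr_step
    return decay_factor * num_updates ** -0.5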
Example No. 9
def sequence_beam_search(symbols_to_logits_fn, initial_ids, initial_cache,
                         vocab_size, beam_size, alpha, max_decode_length,
                         eos_id):
    """Search for sequence of subtoken ids with the largest probability.

  Args:
    symbols_to_logits_fn: A function that takes in ids, index, and cache as
      arguments. The passed in arguments will have shape:
        ids -> [batch_size * beam_size, index]
        index -> [] (scalar)
        cache -> nested dictionary of tensors [batch_size * beam_size, ...]
      The function must return logits and new cache.
        logits -> [batch * beam_size, vocab_size]
        new cache -> same shape/structure as inputted cache
    initial_ids: Starting ids for each batch item.
      int32 tensor with shape [batch_size]
    initial_cache: dict containing starting decoder variables information
    vocab_size: int size of the vocabulary
    beam_size: int number of beams
    alpha: float defining the strength of length normalization
    max_decode_length: maximum length of the decoded sequence
    eos_id: int id of eos token, used to determine when a sequence has finished

  Returns:
    Top decoded sequences [batch_size, beam_size, max_decode_length]
    sequence scores [batch_size, beam_size]
  """
    batch_size = tf.shape(initial_ids)[0]
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_SEQ_BEAM_SEARCH,
                                 value={
                                     "vocab_size": vocab_size,
                                     "batch_size": batch_size,
                                     "beam_size": beam_size,
                                     "alpha": alpha,
                                     "max_decode_length": max_decode_length
                                 })
    sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, batch_size,
                             beam_size, alpha, max_decode_length, eos_id)
    return sbs.search(initial_ids, initial_cache)
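
The alpha argument controls length normalization. A hedged sketch of the GNMT-style length penalty this beam search is assumed to apply when scoring finished sequences (function name is illustrative):

def length_normalized_score(log_prob, length, alpha):
    # Hypothetical helper: alpha = 0 disables the penalty, larger alpha favors
    # longer sequences.
    return log_prob / (((5.0 + length) / 6.0) ** alpha)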
Example No. 10
  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
        value=params.num_hidden_layers)
    # SSY num_hidden_layers is 6 transformer/model/model_params.py
    for _ in range(params.num_hidden_layers):
      # Create sublayers for each layer.
      # only SelfAttention and ffn
      # SSY 2.1  transformer/model/attention_layer.py Dense and matmul
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      # SSY 2.2 transformer/model/ffn_layer.py only Dense
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params.hidden_size)
Example No. 11
  def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
        value=params.num_hidden_layers)
    for _ in range(params.num_hidden_layers):
      # SSY 3.1  transformer/model/attention_layer.py Dense and matmul
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      # SSY 3.2  transformer/model/attention_layer.py Dense and matmul
      enc_dec_attention_layer = attention_layer.Attention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      # SSY 3.3 transformer/model/ffn_layer.py only Dense
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    self.output_normalization = LayerNormalization(params.hidden_size)
Example No. 12
def _read_and_batch_from_files(file_pattern, batch_size, max_length,
                               num_cpu_cores, shuffle, repeat):
    """Create dataset where each item is a dict of "inputs" and "targets".

  Args:
    file_pattern: String used to match the input TFRecord files.
    batch_size: Maximum number of tokens per batch of examples
    max_length: Maximum number of tokens per example
    num_cpu_cores: Number of cpu cores for parallel input processing.
    shuffle: If true, randomizes order of elements.
    repeat: Number of times to repeat the dataset. If None, the dataset is
      repeated forever.

  Returns:
    tf.data.Dataset object containing examples loaded from the files.
  """
    dataset = tf.data.Dataset.list_files(file_pattern)

    if shuffle:
        # Shuffle filenames
        mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
        dataset = dataset.shuffle(buffer_size=_FILE_SHUFFLE_BUFFER)

    # Read files and interleave results. When training, the order of the examples
    # will be non-deterministic.
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(_load_records,
                                                 sloppy=shuffle,
                                                 cycle_length=num_cpu_cores))

    # Parse each tf.Example into a dictionary
    # TODO: Look into prefetch_input_elements for performance optimization.
    dataset = dataset.map(_parse_example, num_parallel_calls=num_cpu_cores)

    # Remove examples where the input or target length exceeds the maximum length.
    dataset = dataset.filter(lambda x, y: _filter_max_length(
        (x, y), max_length))

    # Batch such that each batch has examples of similar length.
    mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE,
                                 value=batch_size)
    mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                 value=max_length)
    dataset = _batch_examples(dataset, batch_size, max_length)
    dataset = dataset.repeat(repeat)

    # Prefetch the next element to improve speed of input pipeline.
    dataset = dataset.prefetch(1)
    return dataset
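
_load_records and _parse_example are assumed helpers here. A minimal sketch of the parser, under the assumption that each tf.Example stores variable-length "inputs" and "targets" lists of subtoken ids (name is illustrative):

import tensorflow as tf

def _parse_example_sketch(serialized_example):
    data_fields = {
        "inputs": tf.io.VarLenFeature(tf.int64),
        "targets": tf.io.VarLenFeature(tf.int64),
    }
    parsed = tf.io.parse_single_example(serialized_example, data_fields)
    inputs = tf.sparse.to_dense(parsed["inputs"])
    targets = tf.sparse.to_dense(parsed["targets"])
    return inputs, targets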
Example No. 13
def main(unused_argv):
    """Obtain training and evaluation data for the Transformer model."""
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    make_dir(FLAGS.raw_dir)
    make_dir(FLAGS.data_dir)

    # Get paths of download/extracted training and evaluation files.
    tf.compat.v1.logging.info("Step 1/4: Downloading data from source")
    train_files = get_raw_files(FLAGS.raw_dir, _TRAIN_DATA_SOURCES)
    eval_files = get_raw_files(FLAGS.raw_dir, _EVAL_DATA_SOURCES)

    # Create subtokenizer based on the training files.
    tf.compat.v1.logging.info(
        "Step 2/4: Creating subtokenizer and building vocabulary")
    train_files_flat = train_files["inputs"] + train_files["targets"]
    vocab_file = os.path.join(FLAGS.data_dir, _VOCAB_FILE)
    subtokenizer = tokenizer.Subtokenizer.init_from_files(
        vocab_file,
        train_files_flat,
        _TARGET_VOCAB_SIZE,
        _TARGET_THRESHOLD,
        min_count=None if FLAGS.search else _TRAIN_DATA_MIN_COUNT)

    tf.compat.v1.logging.info(
        "Step 3/4: Compiling training and evaluation data")
    compiled_train_files = compile_files(FLAGS.data_dir, train_files,
                                         _TRAIN_TAG)
    compiled_eval_files = compile_files(FLAGS.data_dir, eval_files, _EVAL_TAG)

    # Tokenize and save data as Examples in the TFRecord format.
    tf.compat.v1.logging.info("Step 4/4: Preprocessing and saving data")
    mlperf_log.transformer_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
    train_tfrecord_files = encode_and_save_files(subtokenizer, FLAGS.data_dir,
                                                 compiled_train_files,
                                                 _TRAIN_TAG, _TRAIN_SHARDS)
    mlperf_log.transformer_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)
    encode_and_save_files(subtokenizer, FLAGS.data_dir, compiled_eval_files,
                          _EVAL_TAG, _EVAL_SHARDS)

    mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
    for fname in train_tfrecord_files:
        shuffle_records(fname)
Example No. 14
def main(_):
    # Set logging level to INFO to display training progress (logged by the
    # estimator)
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    mlperf_log.transformer_print(key=mlperf_log.RUN_START)

    # Set random seed.
    if FLAGS.random_seed is None:
        raise Exception('No Random seed given')
    print('Setting random seed = ', FLAGS.random_seed)
    seed = FLAGS.random_seed
    mlperf_log.transformer_print(key=mlperf_log.RUN_SET_RANDOM_SEED,
                                 value=seed)
    random.seed(seed)
    tf.compat.v1.set_random_seed(seed)
    numpy.random.seed(seed)

    if FLAGS.params == "base":
        params = model_params.TransformerBaseParams
    elif FLAGS.params == "big":
        params = model_params.TransformerBigParams
    else:
        raise ValueError("Invalid parameter set defined: %s."
                         "Expected 'base' or 'big.'" % FLAGS.params)

    # Determine training schedule based on flags.
    if FLAGS.train_steps != 0 and FLAGS.train_epochs is not None:
        raise ValueError(
            "Both --train_steps and --train_epochs were set. Only one "
            "may be defined.")
    if FLAGS.train_steps != 0:
        train_eval_iterations = FLAGS.train_steps // FLAGS.steps_between_eval
        single_iteration_train_steps = FLAGS.steps_between_eval
        single_iteration_train_epochs = None
    else:
        if FLAGS.train_epochs is None:
            FLAGS.train_epochs = DEFAULT_TRAIN_EPOCHS
        train_eval_iterations = FLAGS.train_epochs // FLAGS.epochs_between_eval
        single_iteration_train_steps = None
        single_iteration_train_epochs = FLAGS.epochs_between_eval

    # Make sure that the BLEU source and ref files exist, if set
    if FLAGS.bleu_source is not None and FLAGS.bleu_ref is not None:
        if not tf.io.gfile.exists(FLAGS.bleu_source):
            raise ValueError("BLEU source file %s does not exist" %
                             FLAGS.bleu_source)
        if not tf.io.gfile.exists(FLAGS.bleu_ref):
            raise ValueError("BLEU source file %s does not exist" %
                             FLAGS.bleu_ref)

    # Add flag-defined parameters to params object
    params.data_dir = FLAGS.data_dir
    params.num_cpu_cores = FLAGS.num_cpu_cores
    params.epochs_between_eval = FLAGS.epochs_between_eval
    params.repeat_dataset = single_iteration_train_epochs
    # Add inter_op and intra_op parallelism thread
    session_config = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
        allow_soft_placement=True)

    if FLAGS.save_checkpoints == "No":
        # To skip the checkpoints saving (which takes long time)
        # added the following run_config
        run_config = tf.estimator.RunConfig(session_config=session_config,
                                            save_summary_steps=None,
                                            save_checkpoints_secs=None)
    else:
        run_config = tf.estimator.RunConfig(session_config=session_config)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params,
                                       config=run_config)
    train_schedule(estimator, train_eval_iterations,
                   single_iteration_train_steps, single_iteration_train_epochs,
                   FLAGS.bleu_source, FLAGS.bleu_ref, FLAGS.bleu_threshold)

    mlperf_log.transformer_print(key=mlperf_log.RUN_STOP)
    mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL)
Example No. 15
def train_schedule(estimator,
                   train_eval_iterations,
                   single_iteration_train_steps=None,
                   single_iteration_train_epochs=None,
                   bleu_source=None,
                   bleu_ref=None,
                   bleu_threshold=None):
    """Train and evaluate model, and optionally compute model's BLEU score.

  **Step vs. Epoch vs. Iteration**

  Steps and epochs are canonical terms used in TensorFlow and general machine
  learning. They are used to describe running a single process (train/eval):
    - Step refers to running the process through a single example or a batch of examples.
    - Epoch refers to running the process through an entire dataset.

  E.g. training a dataset with 100 examples. The dataset is
  divided into 20 batches with 5 examples per batch. A single training step
  trains the model on one batch. After 20 training steps, the model will have
  trained on every batch in the dataset, or, in other words, one epoch.

  Meanwhile, iteration is used in this implementation to describe running
  multiple processes (training and eval).
    - A single iteration:
      1. trains the model for a specific number of steps or epochs.
      2. evaluates the model.
      3. (if source and ref files are provided) compute BLEU score.

  This function runs through multiple train+eval+bleu iterations.

  Args:
    estimator: tf.Estimator containing model to train.
    train_eval_iterations: Number of times to repeat the train+eval iteration.
    single_iteration_train_steps: Number of steps to train in one iteration.
    single_iteration_train_epochs: Number of epochs to train in one iteration.
    bleu_source: File containing text to be translated for BLEU calculation.
    bleu_ref: File containing reference translations for BLEU calculation.
    bleu_threshold: minimum BLEU score before training is stopped.

  Raises:
    ValueError: if both or none of single_iteration_train_steps and
      single_iteration_train_epochs were defined.
  """
    # Ensure that exactly one of single_iteration_train_steps and
    # single_iteration_train_epochs is defined.
    if single_iteration_train_steps is None:
        if single_iteration_train_epochs is None:
            raise ValueError(
                "Exactly one of single_iteration_train_steps or "
                "single_iteration_train_epochs must be defined. Both were none."
            )
    else:
        if single_iteration_train_epochs is not None:
            raise ValueError(
                "Exactly one of single_iteration_train_steps or "
                "single_iteration_train_epochs must be defined. Both were defined."
            )

    evaluate_bleu = bleu_source is not None and bleu_ref is not None

    # Print out training schedule
    print("Training schedule:")
    if single_iteration_train_epochs is not None:
        print("\t1. Train for %d epochs." % single_iteration_train_epochs)
    else:
        print("\t1. Train for %d steps." % single_iteration_train_steps)
    print("\t2. Evaluate model.")
    if evaluate_bleu:
        print("\t3. Compute BLEU score.")
        if bleu_threshold is not None:
            print("Repeat above steps until the BLEU score reaches",
                  bleu_threshold)
    if not evaluate_bleu or bleu_threshold is None:
        print("Repeat above steps %d times." % train_eval_iterations)

    if evaluate_bleu:
        # Set summary writer to log bleu score.
        bleu_writer = tf.compat.v1.summary.FileWriter(
            os.path.join(estimator.model_dir, BLEU_DIR))
        if bleu_threshold is not None:
            # Change loop stopping condition if bleu_threshold is defined.
            train_eval_iterations = INF

    # Loop training/evaluation/bleu cycles
    mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP)
    # Create hooks for printing examples per second, used with estimator.train
    train_hooks = hooks_helper.get_train_hooks(
        ["ExamplesPerSecondHook"],
        model_dir=FLAGS.model_dir,
        batch_size=estimator.params.batch_size,
        every_n_steps=FLAGS.print_iter,
        warm_steps=20)

    for i in xrange(train_eval_iterations):
        print("Starting iteration", i + 1)

        if single_iteration_train_epochs is not None:
            mlperf_log.transformer_print(
                key=mlperf_log.TRAIN_EPOCH,
                value=i * single_iteration_train_epochs + 1)

        # Train the model for single_iteration_train_steps or until the input fn
        # runs out of examples (if single_iteration_train_steps is None).
        estimator.train(dataset.train_input_fn,
                        steps=single_iteration_train_steps,
                        hooks=train_hooks)

        mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
        # To save training time, we can turn off evaluation
        # Otherwise it will be turned on
        if FLAGS.do_eval == "Yes":
            eval_results = estimator.evaluate(dataset.eval_input_fn)
            print(
                "Evaluation results (iter %d/%d):" %
                (i + 1, train_eval_iterations), eval_results)

        if evaluate_bleu:
            uncased_score, _ = evaluate_and_log_bleu(estimator, bleu_writer,
                                                     bleu_source, bleu_ref)
            if bleu_threshold is not None and uncased_score > bleu_threshold:
                bleu_writer.close()
                break
            mlperf_log.transformer_print(key=mlperf_log.EVAL_TARGET,
                                         value=bleu_threshold)
            mlperf_log.transformer_print(key=mlperf_log.EVAL_ACCURACY,
                                         value=uncased_score)
        mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP)
Example No. 16
def get_train_op(loss, params):
    """Generate training operation that updates variables based on loss."""
    with tf.compat.v1.variable_scope("get_train_op"):
        mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                     value=params.learning_rate_warmup_steps)
        learning_rate = get_learning_rate(params.learning_rate,
                                          params.hidden_size,
                                          params.learning_rate_warmup_steps)
        log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
        learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                                   log_id=log_id,
                                                   every_n=100)

        # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
        # than the TF core Adam optimizer.
        mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                     value=mlperf_log.LAZY_ADAM)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                     value=params.optimizer_adam_beta1)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                     value=params.optimizer_adam_beta2)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                     value=params.optimizer_adam_epsilon)
        # Using optimizer v1 (from tensorflow.python.training*).
        # The optimizer v2 version of the code is below.
        # Optimizer v1 does not have a LazyAdam optimizer
        # (it was in contrib, which is now deprecated).
        optimizer = adam.AdamOptimizer(learning_rate,
                                       beta1=params.optimizer_adam_beta1,
                                       beta2=params.optimizer_adam_beta2,
                                       epsilon=params.optimizer_adam_epsilon)

        # Calculate and apply gradients using the Adam optimizer.
        global_step = tf.compat.v1.train.get_global_step()
        tvars = tf.compat.v1.trainable_variables()
        grads_and_vars = optimizer.compute_gradients(loss, tvars)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step,
                                             name="train")
        # Save gradient norm to Tensorboard
        tf.compat.v1.summary.scalar(
            "global_norm/gradient_norm",
            tf.linalg.global_norm(list(zip(*grads_and_vars))[0]))
        # Using the tfa (tensorflow_addons) optimizer, which in turn
        # uses optimizer_v2 (from tf.python.keras.optimizer_v2), raises
        # warnings about the global step not being updated, since
        # global_step is not accepted by the apply_gradients() function of
        # the optimizer_v2 version.
        # Thus the global step is updated manually and grouped with the train op.
        # To activate LazyAdam from tensorflow-addons, enable the
        # following code and remove the optimizer v1 related code above.
        # Currently both optimizer v1 and v2 take about the same time.
        '''                  
    optimizer = tfa.optimizers.LazyAdam(
        learning_rate,
        beta_1=params.optimizer_adam_beta1,
        beta_2=params.optimizer_adam_beta2,
        epsilon=params.optimizer_adam_epsilon)

    # Calculate and apply gradients using LazyAdamOptimizer.
    global_step = tf.compat.v1.train.get_global_step()
    tvars = tf.compat.v1.trainable_variables()
    tvars = tvars[0:len(tvars)-1]
    gradients = optimizer.get_gradients(
        loss, tvars)
    grads_and_vars = zip(gradients, tvars)
    train_op = optimizer.apply_gradients(
        grads_and_vars)
    # Save gradient norm to Tensorboard
    tf.compat.v1.summary.scalar("global_norm/gradient_norm",
                      tf.compat.v1.linalg.global_norm(list(gradients)))
    update_global_step = tf.compat.v1.assign(global_step, global_step + 1, name = "update_global_step")
    train_op = tf.compat.v1.group(train_op, [(update_global_step)])
    '''
        return train_op
Example No. 17
    def call(self, x, y, bias, cache=None):
        """Apply attention mechanism to x and y.

    Args:
      x: a tensor with shape [batch_size, length_x, hidden_size]
      y: a tensor with shape [batch_size, length_y, hidden_size]
      bias: attention bias that will be added to the result of the dot product.
      cache: (Used during prediction) dictionary with tensors containing results
        of previous attentions. The dictionary must have the items:
            {"k": tensor with shape [batch_size, i, key_channels],
             "v": tensor with shape [batch_size, i, value_channels]}
        where i is the current decoded length.

    Returns:
      Attention layer output with shape [batch_size, length_x, hidden_size]
    """
        # Linearly project the query (q), key (k) and value (v) using different
        # learned projections. This is in preparation of splitting them into
        # multiple heads. Multi-head attention uses multiple queries, keys, and
        # values rather than regular attention (which uses a single q, k, v).
        q = self.q_dense_layer(x)
        k = self.k_dense_layer(y)
        v = self.v_dense_layer(y)

        if cache is not None:
            # Combine cached keys and values with new keys and values.
            k = tf.concat([cache["k"], k], axis=1)
            v = tf.concat([cache["v"], v], axis=1)

            # Update cache
            cache["k"] = k
            cache["v"] = v

        # Split q, k, v into heads.
        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)

        # Scale q to prevent the dot product between q and k from growing too large.
        depth = (self.hidden_size // self.num_heads)
        q *= depth**-0.5

        # Calculate dot product attention
        # SSY bf16
        #q = tf.reshape(id_bf16cut_fp(q),tf.shape(q))
        #k = tf.reshape(id_bf16cut_fp(k),tf.shape(k))
        q = tf.reshape(bf16cut_fp(q), tf.shape(q))
        k = tf.reshape(bf16cut_fp(k), tf.shape(k))
        logits = tf.matmul(q, k, transpose_b=True)
        #logits = tf.reshape(id_bf16cut_bp(logits),tf.shape(logits))
        logits = tf.reshape(bf16cut_bp(logits), tf.shape(logits))

        logits += bias
        weights = tf.nn.softmax(logits, name="attention_weights")
        if self.train:
            mlperf_log.transformer_print(
                key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                value=self.attention_dropout)
            weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout)
        # SSY bf16
        #weights = tf.reshape(id_bf16cut_fp(weights),tf.shape(weights))
        #v = tf.reshape(id_bf16cut_fp(v),tf.shape(v))
        weights = tf.reshape(bf16cut_fp(weights), tf.shape(weights))
        v = tf.reshape(bf16cut_fp(v), tf.shape(v))
        attention_output = tf.matmul(weights, v)
        #attention_output = tf.reshape(id_bf16cut_bp(attention_output),tf.shape(attention_output))
        attention_output = tf.reshape(bf16cut_bp(attention_output),
                                      tf.shape(attention_output))

        # Recombine heads --> [batch_size, length, hidden_size]
        attention_output = self.combine_heads(attention_output)

        # Run the combined outputs through another linear projection layer.
        attention_output = self.output_dense_layer(attention_output)
        return attention_output
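
Stripped of the bf16cut casts and head splitting/recombining, the core of the attention computation above is plain scaled dot-product attention. A small self-contained sketch (function name is illustrative):

import tensorflow as tf

def scaled_dot_product_attention_sketch(q, k, v, bias):
    # q, k, v: [batch, heads, length, depth]; bias is added to the logits to
    # mask padding or future positions before the softmax.
    depth = tf.cast(tf.shape(q)[-1], q.dtype)
    logits = tf.matmul(q, k, transpose_b=True) / tf.sqrt(depth)
    weights = tf.nn.softmax(logits + bias)
    return tf.matmul(weights, v)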
Example No. 18
def main(_):
    # Set logging level to INFO to display training progress (logged by the
    # estimator)
    tf.logging.set_verbosity(tf.logging.INFO)

    mlperf_log.transformer_print(key=mlperf_log.RUN_START)

    # Set random seed.
    if FLAGS.random_seed is None:
        raise Exception('No Random seed given')
    print('Setting random seed = ', FLAGS.random_seed)
    seed = FLAGS.random_seed
    mlperf_log.transformer_print(key=mlperf_log.RUN_SET_RANDOM_SEED,
                                 value=seed)
    random.seed(seed)
    tf.set_random_seed(seed)
    numpy.random.seed(seed)

    if FLAGS.params == "base":
        params = model_params.TransformerBaseParams
    elif FLAGS.params == "big":
        params = model_params.TransformerBigParams
    else:
        raise ValueError("Invalid parameter set defined: %s."
                         "Expected 'base' or 'big.'" % FLAGS.params)

    # Determine training schedule based on flags.
    if FLAGS.train_steps is not None and FLAGS.train_epochs is not None:
        raise ValueError(
            "Both --train_steps and --train_epochs were set. Only one "
            "may be defined.")
    if FLAGS.train_steps is not None:
        train_eval_iterations = FLAGS.train_steps // FLAGS.steps_between_eval
        single_iteration_train_steps = FLAGS.steps_between_eval
        single_iteration_train_epochs = None
    else:
        if FLAGS.train_epochs is None:
            FLAGS.train_epochs = DEFAULT_TRAIN_EPOCHS
        train_eval_iterations = FLAGS.train_epochs // FLAGS.epochs_between_eval
        single_iteration_train_steps = None
        single_iteration_train_epochs = FLAGS.epochs_between_eval

    # Make sure that the BLEU source and ref files exist, if set
    if FLAGS.bleu_source is not None and FLAGS.bleu_ref is not None:
        if not tf.gfile.Exists(FLAGS.bleu_source):
            raise ValueError("BLEU source file %s does not exist" %
                             FLAGS.bleu_source)
        if not tf.gfile.Exists(FLAGS.bleu_ref):
            raise ValueError("BLEU source file %s does not exist" %
                             FLAGS.bleu_ref)

    # Add flag-defined parameters to params object
    params.data_dir = FLAGS.data_dir
    params.num_cpu_cores = FLAGS.num_cpu_cores
    params.epochs_between_eval = FLAGS.epochs_between_eval
    params.repeat_dataset = single_iteration_train_epochs

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)
    train_schedule(estimator, train_eval_iterations,
                   single_iteration_train_steps, single_iteration_train_epochs,
                   FLAGS.bleu_source, FLAGS.bleu_ref, FLAGS.bleu_threshold)

    mlperf_log.transformer_print(key=mlperf_log.RUN_STOP)
    mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL)
Example No. 19
def main(args):
    print(args)
    transformer_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
                      value={
                          'src': args.nwordssrc,
                          'tgt': args.nwordstgt
                      })
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    def build_dictionary(filenames):
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d

    def train_path(lang):
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    def dataset_dest_path(output_prefix, lang, extension):
        base = f'{args.destdir}/{output_prefix}'
        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
        return f'{base}{lang_part}.{extension}'

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(
            set([
                train_path(lang)
                for lang in [args.source_lang, args.target_lang]
            ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix +
                '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s,
                                                src_dict,
                                                add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t,
                                                tgt_dict,
                                                add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')),
                                      a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk(
                            ) and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)

        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
Example No. 20
  def call(self, x, y, bias, cache=None, encdec_cache=None):
    """Apply attention mechanism to x and y.

    Args:
      x: a tensor with shape [batch_size, length_x, hidden_size]
      y: a tensor with shape [batch_size, length_y, hidden_size]
      bias: attention bias that will be added to the result of the dot product.
      cache: (Used during prediction) dictionary with tensors containing results
        of previous attentions. The dictionary must have the items:
            {"k": tensor with shape [batch_size, i, key_channels],
             "v": tensor with shape [batch_size, i, value_channels]}
        where i is the current decoded length.
      encdec_cache: (Used during prediction) dictionary with precomputed "k" and
        "v" tensors for the encoder-decoder attention, reused across decode steps.

    Returns:
      Attention layer output with shape [batch_size, length_x, hidden_size]
    """
    # Linearly project the query (q), key (k) and value (v) using different
    # learned projections. This is in preparation of splitting them into
    # multiple heads. Multi-head attention uses multiple queries, keys, and
    # values rather than regular attention (which uses a single q, k, v).
    with tf.compat.v1.tpu.bfloat16_scope():
        if x.dtype == tf.float32:
           x = tf.cast(x, tf.bfloat16)
        if y.dtype == tf.float32:
           y = tf.cast(y, tf.bfloat16)
        q = self.q_dense_layer(x)
        if encdec_cache is not None:
          k = encdec_cache["k"]
          v = encdec_cache["v"]
        else:
          k = self.k_dense_layer(y)
          v = self.v_dense_layer(y)

    if cache is not None:
      # Combine cached keys and values with new keys and values.
      k = tf.concat([cache["k"], k], axis=1)
      v = tf.concat([cache["v"], v], axis=1)

      # Update cache
      cache["k"] = k
      cache["v"] = v

    # Split q, k, v into heads.
    q = self.split_heads(q)
    k = self.split_heads(k)
    v = self.split_heads(v)

    # Scale q to prevent the dot product between q and k from growing too large.
    # Calculate dot product attention
    with tf.compat.v1.tpu.bfloat16_scope():
        bias = tf.cast(bias, tf.bfloat16)
        logits = tf.matmul(q, k, transpose_b=True)
        logits *= Attention.rsqrtQ
        logits += bias
        weights = tf.nn.softmax(logits, name="attention_weights")
        if self.train:
          mlperf_log.transformer_print(
              key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
              value=self.attention_dropout)
          weights = tf.nn.dropout(weights, 1 - (1.0 - self.attention_dropout))
        attention_output = tf.matmul(weights, v)

        # Recombine heads --> [batch_size, length, hidden_size]
        attention_output = self.combine_heads(attention_output)

        # Run the combined outputs through another linear projection layer.
        attention_output = self.output_dense_layer(attention_output)
        return attention_output
Example No. 21
def Linear(in_features, out_features, bias=True):
    m = nn.Linear(in_features, out_features, bias)
    transformer_print(mlperf_log.MODEL_HP_INITIALIZER_GAIN, value=1)
    nn.init.xavier_uniform_(m.weight)
    nn.init.constant_(m.bias, 0.)
    return m
Example No. 22
def _read_and_batch_from_files(file_pattern,
                               batch_size,
                               max_length,
                               num_cpu_cores,
                               shuffle,
                               repeat,
                               has_horovod=False,
                               static_batch=False):
    """Create dataset where each item is a dict of "inputs" and "targets".

  Args:
    file_pattern: String used to match the input TFRecord files.
    batch_size: Maximum number of tokens per batch of examples
    max_length: Maximum number of tokens per example
    num_cpu_cores: Number of cpu cores for parallel input processing.
    shuffle: If true, randomizes order of elements.
    repeat: Number of times to repeat the dataset. If None, the dataset is
      repeated forever.
    has_horovod: whether this instance is running with Horovod.
    static_batch: Whether the batches in the dataset should have static shapes.
      If True, the input is batched so that every batch has the
      shape [batch_size // max_length, max_length]. If False, the input is
      grouped by length, and batched so that batches may have different
      shapes [N, M], where:
        N * M <= batch_size
        M <= max_length
      In general, this setting should be False. Dynamic shapes allow the inputs
      to be grouped so that the number of padding tokens is minimized, and helps
      model training. In cases where the input shape must be static
      (e.g. running on TPU), this setting should be set to True.

  Returns:
    tf.data.Dataset object containing examples loaded from the files.
  """
    dataset = tf.data.Dataset.list_files(file_pattern)

    if shuffle:
        # Shuffle filenames
        mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
        dataset = dataset.shuffle(buffer_size=_FILE_SHUFFLE_BUFFER)

    # Read files and interleave results. When training, the order of the examples
    # will be non-deterministic.
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(_load_records,
                                                 sloppy=shuffle,
                                                 cycle_length=num_cpu_cores))

    # Parse each tf.Example into a dictionary
    # TODO: Look into prefetch_input_elements for performance optimization.
    dataset = dataset.map(_parse_example, num_parallel_calls=num_cpu_cores)

    # Remove examples where the input or target length exceeds the maximum length.
    dataset = dataset.filter(lambda x, y: _filter_max_length(
        (x, y), max_length))

    # Batch such that each batch has examples of similar length.
    mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE,
                                 value=batch_size)
    mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                 value=max_length)
    #dataset = _batch_examples(dataset, batch_size, max_length)
    if static_batch == "Yes":
        dataset = dataset.padded_batch(batch_size // max_length,
                                       ([max_length], [max_length]),
                                       drop_remainder=True)
    else:
        # Group and batch such that each batch has examples of similar length.
        dataset = _batch_examples(dataset, batch_size, max_length)

    dataset = dataset.repeat(repeat)

    # horovod: shard the dataset if multi-instance training is enabled
    # TODO: verify if it is working
    if shuffle and has_horovod:
        import horovod.tensorflow as hvd
        shape = dataset.output_shapes
        dataset = dataset.shard(hvd.size(), hvd.rank())

    # Prefetch the next element to improve speed of input pipeline.
    dataset = dataset.prefetch(1)
    return dataset
Example No. 23
def main(args):
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    from mlperf_compliance.mlperf_log import transformer_print
    transformer_print(
        key=mlperf_log.RUN_CLEAR_CACHES
    )  # before this tag we should clear caches on the host
    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert (torch.distributed.is_initialized())
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()
    transformer_print(key=mlperf_log.RUN_START)
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)
    transformer_print(key=mlperf_log.OPT_NAME, value=args.optimizer)
    transformer_print(key=mlperf_log.OPT_LR, value=args.lr)
    transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                      value=eval(args.adam_betas)[0])
    transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                      value=eval(args.adam_betas)[1])
    transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON, value=args.adam_eps)
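    # The two cudart calls below appear to raise CUDA device limit 0x05
    # (cudaLimitMaxL2FetchGranularity) to 128 bytes and read it back; neither
    # return code is checked.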
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    result = torch.cuda.cudart().cudaDeviceSetLimit(ctypes.c_int(0x05),
                                                    ctypes.c_int(128))
    result = torch.cuda.cudart().cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
    torch.manual_seed(args.seed)
    transformer_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)
    transformer_print(key=mlperf_log.MODEL_HP_SEQ_BEAM_SEARCH,
                      value={
                          'alpha': args.lenpen,
                          'beam_size': args.beam,
                          'extra_decode_length': args.max_len_b,
                          'vocab_size': len(task.target_dictionary)
                      })

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        trainer = FP16Trainer(args, task, model, criterion)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        trainer = Trainer(args, task, model, criterion)
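    # '@@ ' is the BPE continuation marker; strip it from generated text during
    # online BLEU evaluation unless a remove_bpe symbol is already configured.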
    if (args.online_eval or args.target_bleu) and not args.remove_bpe:
        args.remove_bpe = '@@ '
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))
    transformer_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.max_tokens)
    transformer_print(key=mlperf_log.INPUT_ORDER)
    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = task.dataset('train').get_dummy_batch(
        args.max_tokens, max_positions)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')

    ctr = 0

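    # Placeholder iterator so the first pass through the training loop starts
    # at epoch 0; it is replaced by a real data.EpochBatchIterator in the loop.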
    class DummyEpochBatchIterator:
        def __init__(self, epoch=0):
            self.epoch = epoch

    epoch_itr = DummyEpochBatchIterator(0)
    import time

    transformer_print(key=mlperf_log.TRAIN_LOOP)
    while (lr >= args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update
           and current_bleu < tgt_bleu):
        transformer_print(key=mlperf_log.TRAIN_EPOCH, value=epoch_itr.epoch)
        start = time.time()
        epoch_itr = data.EpochBatchIterator(
            dataset=task.dataset(args.train_subset),
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences_valid,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            required_batch_size_multiple=8,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            epoch=epoch_itr.epoch if ctr != 0 else 0)
        print("got epoch iterator", time.time() - start)

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # train for one epoch
        start = time.time()
        train(args, trainer, task, epoch_itr)
        print("epoch time ", time.time() - start)

        start = time.time()

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)

        # Eval BLEU score
        transformer_print(key=mlperf_log.EVAL_START, value=epoch_itr.epoch)
        if args.online_eval or tgt_bleu is not math.inf:
            current_bleu = score(args, trainer, task, epoch_itr,
                                 args.gen_subset)
            transformer_print(key=mlperf_log.EVAL_ACCURACY,
                              value={
                                  'epoch': epoch_itr.epoch,
                                  'value': current_bleu
                              })
            transformer_print(key=mlperf_log.EVAL_TARGET, value=tgt_bleu)
        transformer_print(key=mlperf_log.EVAL_STOP, value=epoch_itr.epoch)

        # Only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr += 1
        print("validation and scoring ", time.time() - start)

    train_meter.stop()
    transformer_print(key=mlperf_log.RUN_STOP)
    transformer_print(key=mlperf_log.RUN_FINAL)
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
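
The while loop above stops as soon as any one of four limits is reached. The following is a small self-contained restatement of that stopping test; the function name and flat argument list are illustrative, not part of the original code.

import math


def should_keep_training(lr, min_lr, epoch, max_epoch, num_updates, max_update,
                         current_bleu, target_bleu):
    """Continue training only while every configured limit still holds.

    max_epoch, max_update and target_bleu default to math.inf when unset, so a
    limit that was never configured can never stop training.
    """
    return (lr >= min_lr and epoch < max_epoch
            and num_updates < max_update and current_bleu < target_bleu)


# Example: with no target BLEU configured, only lr, epoch and update limits apply.
assert should_keep_training(lr=1e-4, min_lr=1e-9, epoch=3, max_epoch=math.inf,
                            num_updates=5000, max_update=math.inf,
                            current_bleu=0.0, target_bleu=math.inf)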
Exemplo n.º 24
0
def encode_and_save_files(subtokenizer, data_dir, raw_files, tag,
                          total_shards):
    """Save data from files as encoded Examples in TFrecord format.

    Args:
      subtokenizer: Subtokenizer object that will be used to encode the strings.
      data_dir: The directory in which to write the examples.
      raw_files: A tuple of (input, target) data files. Each line in the input
        file and the corresponding line in the target file are saved in a
        tf.Example.
      tag: String that will be added onto the file names.
      total_shards: Number of files to divide the data into.

    Returns:
      List of all files produced.
    """
    # Create a file for each shard.
    filepaths = [
        shard_filename(data_dir, tag, n + 1, total_shards)
        for n in range(total_shards)
    ]

    if all_exist(filepaths):
        tf.logging.info("Files with tag %s already exist." % tag)
        return filepaths

    tf.logging.info("Saving files with tag %s." % tag)
    input_file = raw_files[0]
    target_file = raw_files[1]

    # Write examples to each shard in round robin order.
    tmp_filepaths = [fname + ".incomplete" for fname in filepaths]
    writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
    counter, shard = 0, 0
    for counter, (input_line, target_line) in enumerate(
            zip(txt_line_iterator(input_file),
                txt_line_iterator(target_file))):
        if counter > 0 and counter % 100000 == 0:
            tf.logging.info("\tSaving case %d." % counter)
        example = dict_to_example({
            "inputs":
            subtokenizer.encode(input_line, add_eos=True),
            "targets":
            subtokenizer.encode(target_line, add_eos=True)
        })
        writers[shard].write(example.SerializeToString())
        shard = (shard + 1) % total_shards
    for writer in writers:
        writer.close()

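    # Rename the temporary shards to their final names only after all writers
    # are closed, so partially written shards are never mistaken for complete
    # ones.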
    for tmp_name, final_name in zip(tmp_filepaths, filepaths):
        tf.gfile.Rename(tmp_name, final_name)

    if tag == _TRAIN_TAG:
        mlperf_log.transformer_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                                     value=counter)
    elif tag == _EVAL_TAG:
        mlperf_log.transformer_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                                     value=counter)

    tf.logging.info("Saved %d Examples", counter)
    return filepaths
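
The helpers `shard_filename`, `txt_line_iterator`, `dict_to_example` and `all_exist` are used above but not shown in this example. A plausible minimal sketch follows; the filename prefix and exact shard-name pattern are assumptions.

import os

import tensorflow as tf

_PREFIX = "wmt32k"  # assumed dataset prefix


def shard_filename(path, tag, shard_num, total_shards):
    """Create a shard filename, e.g. wmt32k-train-00001-of-00100."""
    return os.path.join(
        path, "%s-%s-%.5d-of-%.5d" % (_PREFIX, tag, shard_num, total_shards))


def txt_line_iterator(path):
    """Yield one stripped line of the file at a time."""
    with tf.gfile.Open(path) as f:
        for line in f:
            yield line.strip()


def dict_to_example(dictionary):
    """Convert a dict of int lists into a tf.Example with int64 features."""
    features = {
        k: tf.train.Feature(int64_list=tf.train.Int64List(value=v))
        for k, v in dictionary.items()
    }
    return tf.train.Example(features=tf.train.Features(feature=features))


def all_exist(filepaths):
    """Return True only if every file in the list already exists."""
    return all(tf.gfile.Exists(fname) for fname in filepaths)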
Exemplo n.º 25
0
def _read_and_batch_from_files(file_pattern, batch_size, max_length,
                               num_cpu_cores, shuffle, repeat):
    """Create dataset where each item is a dict of "inputs" and "targets".

  Args:
    file_pattern: String used to match the input TFRecord files.
    batch_size: Maximum number of tokens per batch of examples
    max_length: Maximum number of tokens per example
    num_cpu_cores: Number of cpu cores for parallel input processing.
    shuffle: If true, randomizes order of elements.
    repeat: Number of times to repeat the dataset. If None, the dataset is
      repeated forever.

  Returns:
    tf.data.Dataset object containing examples loaded from the files.
  """
    # Debug trace: announce entry into this function.
    print("SSY _read_and_batch_from_files")
    # list_files returns a dataset of the filenames matching file_pattern
    # (a DatasetV1Adapter in this TF version).
    dataset = tf.data.Dataset.list_files(file_pattern)
    # Debug trace: print the dataset type and every matched filename. Iterating
    # the dataset like this only works when eager execution is enabled.
    print("dataset type {}".format(type(dataset)))
    for elem in dataset:
        print(elem.numpy())

    if shuffle:
        # Shuffle filenames. (list_files above can also shuffle; doing it
        # explicitly here makes the shuffle buffer size explicit.)
        mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
        dataset = dataset.shuffle(buffer_size=_FILE_SHUFFLE_BUFFER)

    # Read files and interleave results. When training, the order of the examples
    # will be non-deterministic.
    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(_load_records,
                                            sloppy=shuffle,
                                            cycle_length=num_cpu_cores))

    # Parse each tf.Example into a dictionary
    # TODO: Look into prefetch_input_elements for performance optimization.
    dataset = dataset.map(_parse_example, num_parallel_calls=num_cpu_cores)

    # Remove examples where the input or target length exceeds the maximum length.
    dataset = dataset.filter(lambda x, y: _filter_max_length(
        (x, y), max_length))

    # Batch such that each batch has examples of similar length.
    mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE,
                                 value=batch_size)
    mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                 value=max_length)
    dataset = _batch_examples(dataset, batch_size, max_length)
    dataset = dataset.repeat(repeat)

    # Prefetch the next element to improve speed of input pipeline.
    dataset = dataset.prefetch(1)
    return dataset
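
Both `_read_and_batch_from_files` variants rely on `_load_records`, `_parse_example` and `_filter_max_length`, which are not reproduced here. A minimal sketch follows, assuming variable-length int64 "inputs"/"targets" features as written by `encode_and_save_files`; the read-buffer constant is an assumption.

import tensorflow as tf

_READ_RECORD_BUFFER = 8 * 1000 * 1000  # assumed read buffer size in bytes


def _load_records(filename):
    """Read TFRecords from a single file."""
    return tf.data.TFRecordDataset(filename, buffer_size=_READ_RECORD_BUFFER)


def _parse_example(serialized_example):
    """Parse one serialized tf.Example into dense (inputs, targets) tensors."""
    data_fields = {
        "inputs": tf.io.VarLenFeature(tf.int64),
        "targets": tf.io.VarLenFeature(tf.int64),
    }
    parsed = tf.io.parse_single_example(serialized_example, data_fields)
    return (tf.sparse.to_dense(parsed["inputs"]),
            tf.sparse.to_dense(parsed["targets"]))


def _filter_max_length(example, max_length=256):
    """Keep examples whose input and target both fit within max_length tokens."""
    return tf.logical_and(tf.size(example[0]) <= max_length,
                          tf.size(example[1]) <= max_length)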