Example #1
def generate_dataset_and_shuffle(train_gen,
                                 train_paths,
                                 dev_gen,
                                 dev_paths,
                                 shuffle=True):
    generate_files(train_gen, train_paths)
    generate_files(dev_gen, dev_paths)
    mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
    if shuffle:
        shuffle_dataset(train_paths + dev_paths)
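A hedged usage sketch for this helper: the import path, the shard-name helpers, and the toy generators below are assumptions for illustration, not part of the example above.

# Hypothetical usage sketch (assumed import path, filename helpers, toy generators).
from tensor2tensor.data_generators import generator_utils


def _toy_gen(n):
    # Yield (string -> int list) feature dictionaries, as generate_files expects.
    for i in range(n):
        yield {"inputs": [i + 1, i + 2], "targets": [i + 3]}


train_paths = generator_utils.train_data_filenames("my_problem", "/tmp/data", 10)
dev_paths = generator_utils.dev_data_filenames("my_problem", "/tmp/data", 1)
generator_utils.generate_dataset_and_shuffle(
    _toy_gen(1000), train_paths, _toy_gen(100), dev_paths, shuffle=True)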
Example #2
    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        if dataset_split == problem.DatasetSplit.TRAIN:
            mlperf_log.transformer_print(
                key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
        elif dataset_split == problem.DatasetSplit.EVAL:
            mlperf_log.transformer_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)

        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
        encoder = self.get_or_create_vocab(data_dir, tmp_dir)
        return text2text_generate_encoded(generator,
                                          encoder,
                                          has_inputs=self.has_inputs)
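This method is normally not called directly; a registered Text2TextProblem reaches it through generate_data. A minimal sketch, assuming the tensor2tensor registry and a placeholder problem name and directories:

# Hypothetical usage sketch; the problem name and directories are placeholders.
from tensor2tensor.utils import registry

problem = registry.problem("translate_ende_wmt32k")
# generate_data drives generate_encoded_samples for each dataset split.
problem.generate_data("/tmp/t2t_data", "/tmp/t2t_tmp")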
Example #3
def corpus_token_counts(text_filepattern,
                        corpus_max_lines,
                        split_on_newlines=True):
    """Read the corpus and compute a dictionary of token counts.

  Args:
    text_filepattern: A pattern matching one or more files.
    corpus_max_lines: An integer; maximum total lines to read.
    split_on_newlines: A boolean. If true, then split files by lines and strip
        leading and trailing whitespace from each line. Otherwise, treat each
        file as a single string.

  Returns:
    a dictionary mapping token to count.
  """
    counts = collections.Counter()
    for doc in _read_filepattern(text_filepattern,
                                 max_lines=corpus_max_lines,
                                 split_on_newlines=split_on_newlines):
        counts.update(encode(_native_to_unicode(doc)))

    mlperf_log.transformer_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
                                 value=len(counts))
    return counts
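A short usage sketch, assuming the function lives in tensor2tensor.data_generators.tokenizer; the corpus pattern and line limit are placeholders:

# Hypothetical usage sketch; the module path and file pattern are assumptions.
from tensor2tensor.data_generators import tokenizer

token_counts = tokenizer.corpus_token_counts(
    "/tmp/corpus/*.txt",        # pattern matching one or more text files
    corpus_max_lines=1000000,   # read at most this many lines in total
    split_on_newlines=True)     # treat each stripped line as a document
print("%d distinct tokens" % len(token_counts))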
Example #4
def generate_files(generator,
                   output_filenames,
                   max_cases=None,
                   cycle_every_n=1):
    """Generate cases from a generator and save as TFRecord files.

  Generated cases are transformed to tf.Example protos and saved as TFRecords
  in sharded files named output_dir/output_name-00..N-of-00..M, where M is num_shards.

  Args:
    generator: a generator yielding (string -> int/float/str list) dictionaries.
    output_filenames: List of output file paths.
    max_cases: maximum number of cases to get from the generator;
      if None (default), we use the generator until StopIteration is raised.
    cycle_every_n: how many cases from the generator to take before
      switching to the next shard; by default set to 1, switch every case.
  """
    if outputs_exist(output_filenames):
        tf.logging.info(
            "Skipping generator because output files exist at {}".format(
                output_filenames))
        return
    tmp_filenames = [fname + ".incomplete" for fname in output_filenames]
    num_shards = len(output_filenames)
    # Check whether these are training or eval files; ref: train_data_filenames().
    if num_shards > 0:
        if "-train" in output_filenames[0]:
            tag = "train"
        elif "-dev" in output_filenames[0]:
            tag = "eval"
        else:
            tag = "other"

    writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filenames]
    counter, shard = 0, 0
    for case in generator:
        if case is None:
            continue
        if counter % 100000 == 0:
            tf.logging.info("Generating case %d." % counter)
        counter += 1
        if max_cases and counter > max_cases:
            break
        example = to_example(case)
        writers[shard].write(example.SerializeToString())
        if counter % cycle_every_n == 0:
            shard = (shard + 1) % num_shards

    for writer in writers:
        writer.close()

    for tmp_name, final_name in zip(tmp_filenames, output_filenames):
        tf.gfile.Rename(tmp_name, final_name)

    if num_shards > 0:
        if tag == "train":
            mlperf_log.transformer_print(
                key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES, value=counter)
        elif tag == "eval":
            mlperf_log.transformer_print(
                key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=counter)

    tf.logging.info("Generated %s Examples", counter)
Example #5
    def dataset(self,
                mode,
                data_dir=None,
                num_threads=None,
                output_buffer_size=None,
                shuffle_files=None,
                hparams=None,
                preprocess=True,
                dataset_split=None,
                shard=None,
                partition_id=0,
                num_partitions=1,
                shuffle_buffer_size=1024,
                max_records=-1):
        """Build a Dataset for this problem.

        Args:
          mode: tf.estimator.ModeKeys; determines which files to read from.
          data_dir: directory that contains data files.
          num_threads: int, number of threads to use for decode and preprocess
            Dataset.map calls.
          output_buffer_size: int, how many elements to prefetch at end of pipeline.
          shuffle_files: whether to shuffle input files. Default behavior (i.e. when
            shuffle_files=None) is to shuffle if mode == TRAIN.
          hparams: HParams; hparams to be passed to
            Problem.preprocess_example and Problem.hparams. If None, will use a
            default set that is a no-op.
          preprocess: bool, whether to map the Dataset through
            Problem.preprocess_example.
          dataset_split: DatasetSplit, which split to read data
            from (TRAIN:"-train", EVAL:"-dev", TEST:"-test"). Defaults to mode.
          shard: int, if provided, will only read data from the specified shard.
          partition_id: integer - which partition of the dataset to read from
          num_partitions: how many partitions in the dataset
          shuffle_buffer_size: if shuffle_files is True, this is the buffer size
            used to shuffle records.
          max_records: int, number of records to truncate to.

        Returns:
          Dataset containing dict<feature name, Tensor>.

        Raises:
          ValueError: if num_partitions is greater than the number of data files.
        """
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        shuffle_files = shuffle_files or shuffle_files is None and is_training

        dataset_split = dataset_split or mode
        assert data_dir

        if hparams is None:
            hparams = default_model_hparams()

        if not hasattr(hparams, "data_dir"):
            hparams.add_hparam("data_dir", data_dir)
        if not hparams.data_dir:
            hparams.data_dir = data_dir
        # Construct the Problem's hparams so that items within it are accessible
        _ = self.get_hparams(hparams)

        data_filepattern = self.filepattern(data_dir, dataset_split, shard=shard)
        tf.logging.info("Reading data files from %s", data_filepattern)
        data_files = sorted(tf.contrib.slim.parallel_reader.get_data_files(
            data_filepattern))

        # Functions used in dataset transforms below. `filenames` can be either a
        # `tf.string` tensor or `tf.data.Dataset` containing one or more filenames.
        def _load_records_and_preprocess(filenames):
            """Reads files from a string tensor or a dataset of filenames."""
            # Load records from file(s) with an 8MiB read buffer.
            dataset = tf.data.TFRecordDataset(filenames, buffer_size=8 * 1024 * 1024)
            # Decode.
            dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads)
            # Preprocess if requested.
            # Note that preprocessing should happen per-file as order may matter.
            if preprocess:
                dataset = self.preprocess(dataset, mode, hparams,
                                          interleave=shuffle_files)
            return dataset

        if len(data_files) < num_partitions:
            raise ValueError(
                "number of data files (%d) must be at least the number of hosts (%d)"
                % (len(data_files), num_partitions))
        data_files = [f for (i, f) in enumerate(data_files)
                      if i % num_partitions == partition_id]
        tf.logging.info(
            "partition: %d num_data_files: %d" % (partition_id, len(data_files)))
        if shuffle_files:
            mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
            random.shuffle(data_files)

        dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files))
        # Create data-set from files by parsing, pre-processing and interleaving.
        if shuffle_files:
            dataset = dataset.apply(
                tf.data.experimental.parallel_interleave(
                    _load_records_and_preprocess, sloppy=True, cycle_length=8))
        else:
            dataset = _load_records_and_preprocess(dataset)

        dataset = dataset.map(
            self.maybe_reverse_and_copy, num_parallel_calls=num_threads)
        dataset = dataset.take(max_records)

        # Shuffle records only for training examples.
        if shuffle_files and is_training:
            dataset = dataset.shuffle(shuffle_buffer_size)
        if output_buffer_size:
            dataset = dataset.prefetch(output_buffer_size)

        return dataset
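A sketch of how the resulting Dataset is typically obtained and consumed; the registered problem name and data_dir are placeholders:

# Hypothetical usage sketch; the problem name and data_dir are placeholders.
import tensorflow as tf
from tensor2tensor.utils import registry

problem = registry.problem("translate_ende_wmt32k")
train_dataset = problem.dataset(
    tf.estimator.ModeKeys.TRAIN,
    data_dir="/tmp/t2t_data",
    shuffle_files=True,
    output_buffer_size=32)
features = train_dataset.make_one_shot_iterator().get_next()  # dict of Tensors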
Example #6
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None,
                          losses=None,
                          cache=None,
                          decode_loop_step=None,
                          readout_filter_size=0):
    """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward. This
      can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      needed for convolutional layers with "SAME" padding.
      Contains 1.0 in positions corresponding to nonpadding.
    losses: optional list onto which to append extra training losses
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop.
        Only used for inference on TPU.
    readout_filter_size: if it's greater than 0, then it will be used instead of
      filter_size


  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]

  Raises:
    ValueError: If losses arg is None, but layer generates extra losses.
  """
    ffn_layer = hparams.ffn_layer
    relu_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "relu_dropout_broadcast_dims", "")))
    if ffn_layer == "conv_hidden_relu":
        # Backwards compatibility
        ffn_layer = "dense_relu_dense"
    if ffn_layer == "dense_relu_dense":
        # In simple convolution mode, use `pad_remover` to speed up processing.
        mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE,
                                     value={
                                         "filter_size": hparams.filter_size,
                                         "use_bias": "True",
                                         "activation": mlperf_log.RELU
                                     })
        mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE,
                                     value={
                                         "hidden_size": hparams.hidden_size,
                                         "use_bias": "True",
                                     })
        mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_RELU_DROPOUT,
                                     value=hparams.relu_dropout)
        if pad_remover:
            original_shape = common_layers.shape_list(x)
            # Collapse `x` across examples, and remove padding positions.
            x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
            x = tf.expand_dims(pad_remover.remove(x), axis=0)
        conv_output = common_layers.dense_relu_dense(
            x,
            hparams.filter_size,
            hparams.hidden_size,
            dropout=hparams.relu_dropout,
            dropout_broadcast_dims=relu_dropout_broadcast_dims)
        if pad_remover:
            # Restore `conv_output` to the original shape of `x`, including padding.
            conv_output = tf.reshape(
                pad_remover.restore(tf.squeeze(conv_output, axis=0)),
                original_shape)
        return conv_output
    elif ffn_layer == "conv_relu_conv":
        return common_layers.conv_relu_conv(
            x,
            readout_filter_size or hparams.filter_size,
            hparams.hidden_size,
            first_kernel_size=hparams.conv_first_kernel,
            second_kernel_size=1,
            padding=conv_padding,
            nonpadding_mask=nonpadding_mask,
            dropout=hparams.relu_dropout,
            cache=cache,
            decode_loop_step=decode_loop_step)
    elif ffn_layer == "parameter_attention":
        return common_attention.parameter_attention(
            x, hparams.parameter_attention_key_channels or hparams.hidden_size,
            hparams.parameter_attention_value_channels or hparams.hidden_size,
            hparams.hidden_size, readout_filter_size or hparams.filter_size,
            hparams.num_heads, hparams.attention_dropout)
    elif ffn_layer == "conv_hidden_relu_with_sepconv":
        return common_layers.conv_hidden_relu(x,
                                              readout_filter_size
                                              or hparams.filter_size,
                                              hparams.hidden_size,
                                              kernel_size=(3, 1),
                                              second_kernel_size=(31, 1),
                                              padding="LEFT",
                                              dropout=hparams.relu_dropout)
    elif ffn_layer == "sru":
        return common_layers.sru(x)
    elif ffn_layer == "local_moe_tpu":
        overhead = hparams.moe_overhead_eval
        if hparams.mode == tf.estimator.ModeKeys.TRAIN:
            overhead = hparams.moe_overhead_train
        ret, loss = expert_utils.local_moe_tpu(x,
                                               hparams.filter_size // 2,
                                               hparams.hidden_size,
                                               hparams.moe_num_experts,
                                               overhead=overhead,
                                               loss_coef=hparams.moe_loss_coef)
        # Mirror the local_moe branch below: record the auxiliary loss and return.
        losses.append(loss)
        return ret
    elif ffn_layer == "local_moe":
        overhead = hparams.moe_overhead_eval
        if hparams.mode == tf.estimator.ModeKeys.TRAIN:
            overhead = hparams.moe_overhead_train
        ret, loss = expert_utils.local_moe(x,
                                           True,
                                           expert_utils.ffn_expert_fn(
                                               hparams.hidden_size,
                                               [hparams.filter_size],
                                               hparams.hidden_size),
                                           hparams.moe_num_experts,
                                           k=hparams.moe_k,
                                           hparams=hparams)
        losses.append(loss)
        return ret
    else:
        assert ffn_layer == "none"
        return x
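A minimal sketch of applying this feed-forward layer to a dummy activation tensor, assuming transformer_base() hyperparameters (default ffn_layer "dense_relu_dense") and that the function is exposed from tensor2tensor.models.transformer:

# Hypothetical usage sketch; the hparams set and tensor shapes are illustrative.
import tensorflow as tf
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()      # ffn_layer == "dense_relu_dense"
x = tf.zeros([8, 20, hparams.hidden_size])    # [batch_size, length, hidden_size]
losses = []
y = transformer.transformer_ffn_layer(x, hparams, losses=losses)
# y keeps the input shape: [8, 20, hparams.hidden_size]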
Example #7
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None,
                        attn_bias_for_padding=None):
    """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.

  Returns:
    y: a Tensor
  """
    x = encoder_input
    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                                 value=hparams.num_encoder_layers
                                 or hparams.num_hidden_layers)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                                 value=hparams.attention_dropout)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
                                 value={
                                     "use_bias": "false",
                                     "num_heads": hparams.num_heads,
                                     "hidden_size": hparams.hidden_size
                                 })

    with tf.variable_scope(name):
        if nonpadding is not None:
            padding = 1.0 - nonpadding
        else:
            attention_bias = encoder_self_attention_bias
            if attn_bias_for_padding is not None:
                attention_bias = attn_bias_for_padding
            padding = common_attention.attention_bias_to_padding(
                attention_bias)
            nonpadding = 1.0 - padding
        pad_remover = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            pad_remover = expert_utils.PadRemover(padding)
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = common_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        encoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        max_relative_position=hparams.max_relative_position,
                        heads_share_relative_embedding=(
                            hparams.heads_share_relative_embedding),
                        add_relative_to_values=hparams.add_relative_to_values,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        vars_3d=hparams.get("attention_variables_3d"),
                        activation_dtype=hparams.get("activation_dtype",
                                                     "float32"),
                        weight_dtype=hparams.get("weight_dtype", "float32"))
                    x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(common_layers.layer_preprocess(
                        x, hparams),
                                              hparams,
                                              pad_remover,
                                              conv_padding="SAME",
                                              nonpadding_mask=nonpadding,
                                              losses=losses)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_NORM,
            value={"hidden_size": hparams.hidden_size})
        return common_layers.layer_preprocess(x, hparams)
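Finally, a sketch of running the encoder stack on dummy inputs, with the self-attention bias derived from a padding mask via common_attention; the shapes, hyperparameters, and module paths are illustrative assumptions:

# Hypothetical usage sketch; shapes, hparams, and module paths are illustrative.
import tensorflow as tf
from tensor2tensor.layers import common_attention
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
encoder_input = tf.zeros([8, 20, hparams.hidden_size])
nonpadding = tf.ones([8, 20])  # 1.0 at real tokens, 0.0 at padding positions
encoder_self_attention_bias = common_attention.attention_bias_ignore_padding(
    1.0 - nonpadding)
encoder_output = transformer.transformer_encoder(
    encoder_input,
    encoder_self_attention_bias,
    hparams,
    nonpadding=nonpadding)
# encoder_output: [8, 20, hparams.hidden_size]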