Example #1
    def create_and_batch_tfds(self, ds: Dataset, mode,
                              args=None, num_replicas_in_sync=1) -> tf.data.Dataset:
        """ Creates a dataset according to the `mode`.

        Args:
            args: A dict containing dataset arguments.
            ds: A neurst.data.datasets.Dataset object.
            mode: A ModeKeys indicating the running mode.
            num_replicas_in_sync: The number of GPUs or other workers. We will generate global
                batches, and each global batch is evenly divisible by the number of replicas.

        Returns:
            A tf.data.Dataset or an INFER_DATA tuple.
        """
        if args is None:
            args = self._args
        else:
            args = deep_merge_dict(self._args, args)
        src_eos = tf.constant(self._src_data_pipeline.meta["eos_id"], dtype=tf.int64)
        trg_eos = tf.constant(self._trg_data_pipeline.meta["eos_id"], dtype=tf.int64)

        assert isinstance(ds, AbstractParallelDataset), (
            "The dataset for SeqToSeq task must inherit AbstractParallelDataset.")

        dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                           map_output_dtypes=self.inputs_signature(mode)[0],
                           auto_shard=(mode == compat.ModeKeys.TRAIN),
                           shuffle=(mode == compat.ModeKeys.TRAIN))

        if mode == compat.ModeKeys.INFER:
            logging.info("Creating test dataset.")
            test_dataset = dataset_utils.batch_sequential_dataset(
                dataset=dataset.cache(),
                batch_size=args["batch_size"],
                padding_values={"feature": src_eos},
                num_replicas_in_sync=num_replicas_in_sync,
                drop_remainder=False)

            return test_dataset
        elif mode == compat.ModeKeys.EVAL:
            logging.info("Creating evaluation dataset.")
            return dataset_utils.batch_sequential_dataset(
                dataset.cache(),
                batch_size=args["batch_size"],
                padding_values={"feature": src_eos, "label": trg_eos},
                num_replicas_in_sync=num_replicas_in_sync,
                drop_remainder=False)
        else:
            logging.info("Creating training dataset.")
            if args["cache_dataset"]:
                dataset = dataset.cache()
            dataset = dataset_utils.batch_sequential_dataset(
                dataset,
                padding_values={"feature": src_eos, "label": trg_eos},
                batch_size=args["batch_size"],
                batch_size_per_gpu=args["batch_size_per_gpu"],
                batch_by_tokens=args["batch_by_tokens"],
                shuffle_buffer=args["shuffle_buffer"],
                data_max_lengths={"feature": args["max_src_len"], "label": args["max_trg_len"]},
                drop_remainder=True,
                num_replicas_in_sync=num_replicas_in_sync)
            return dataset
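
The INFER and EVAL branches above delegate padding and replica-aware batching to dataset_utils.batch_sequential_dataset. As a minimal self-contained sketch of the underlying padding step with plain tf.data (toy data and a toy eos id, not NeurST's API):

import tensorflow as tf

# Toy eos id standing in for self._src_data_pipeline.meta["eos_id"].
EOS = tf.constant(7, dtype=tf.int64)

examples = [
    {"feature": [1, 2, 3], "label": [4, 5]},
    {"feature": [1], "label": [4, 5, 6]},
]
ds = tf.data.Dataset.from_generator(
    lambda: iter(examples),
    output_signature={
        "feature": tf.TensorSpec([None], tf.int64),
        "label": tf.TensorSpec([None], tf.int64),
    })

# Pad each batch to its longest sequence, filling with EOS.
batched = ds.padded_batch(
    2,
    padded_shapes={"feature": [None], "label": [None]},
    padding_values={"feature": EOS, "label": EOS},
    drop_remainder=False)

for batch in batched:
    print(batch["feature"].numpy())  # [[1 2 3] [1 7 7]]
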
Example #2
    def create_and_batch_tfds(self,
                              ds: Dataset,
                              mode,
                              args=None,
                              num_replicas_in_sync=1) -> tf.data.Dataset:
        """ Creates a dataset according to the `mode`.

        Args:
            args: A dict containing dataset arguments.
            ds: A neurst.data.datasets.Dataset object.
            mode: A ModeKeys indicating the running mode.
            num_replicas_in_sync: The number of GPUs or other workers. We will generate global
                batches, and each global batch is evenly divisible by the number of replicas.

        Returns:
            A tf.data.Dataset.
        """
        if args is None:
            args = self._args
        else:
            args = deep_merge_dict(self._args, args, local_overwrite=False)
        pad = tf.constant(self._data_pipeline.meta["pad_id"], dtype=tf.int64)
        dataset = ds.build(map_func=self.get_data_preprocess_fn(
            mode, ds.status, args),
                           map_output_dtypes=self.inputs_signature(mode)[0],
                           auto_shard=(mode == compat.ModeKeys.TRAIN),
                           shuffle=(mode == compat.ModeKeys.TRAIN))

        if mode == compat.ModeKeys.INFER:
            raise NotImplementedError
            # logging.info("Creating test dataset.")
            # return dataset.cache().padded_batch(
            #     dataset_utils.adjust_batch_size(args["batch_size"],
            #                                     num_replicas_in_sync=num_replicas_in_sync),
            #     padded_shapes={"tokens": [None]},
            #     padding_values={"tokens": pad},
            #     drop_remainder=False)
        elif mode == compat.ModeKeys.EVAL:
            logging.info("Creating evaluation dataset.")
            return dataset.cache().padded_batch(
                dataset_utils.adjust_batch_size(
                    args["batch_size"],
                    num_replicas_in_sync=num_replicas_in_sync),
                padded_shapes={"tokens": [None]},
                padding_values={"tokens": pad},
                drop_remainder=False)
        else:
            level = args.get("gpu_efficient_level", None)
            logging.info(
                f"Creating training dataset with GPU efficient level={level}.")
            dataset = dataset_utils.clean_dataset_by_length(
                dataset, {"tokens": args["max_len"]})
            if args["cache_dataset"]:
                dataset = dataset.cache()
            if args["shuffle_buffer"]:
                dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
            padding_values = {"tokens": pad}
            if args["max_len"] is None:
                raise RuntimeError("Must provide `max_len` for training.")
            max_len = minimal_multiple(args["max_len"],
                                       EFFICIENT_MULTIPLIER[level])
            batch_size = dataset_utils.adjust_batch_size(
                args["batch_size"],
                args["batch_size_per_gpu"],
                num_replicas_in_sync=num_replicas_in_sync,
                verbose=False)
            if level == GPU_EFFICIENT_LEVEL.LEVEL5:  # static batch
                _batch_size = batch_size
                if args["batch_by_tokens"]:
                    _batch_size = _batch_size // max_len
                logging.info(
                    f"Batching dataset with fixed shape: batch={_batch_size} x {max_len}.")
                return dataset.padded_batch(
                    _batch_size // num_replicas_in_sync * num_replicas_in_sync,
                    padded_shapes={"tokens": [max_len]},
                    padding_values=padding_values,
                    drop_remainder=True)
            else:
                bucket_boundaries = [
                    EFFICIENT_MULTIPLIER[level] * i
                    for i in range(1, max_len // EFFICIENT_MULTIPLIER[level] +
                                   1)
                ]
                if bucket_boundaries[-1] < max_len:
                    bucket_boundaries.append(
                        minimal_multiple(bucket_boundaries[-1] + 1,
                                         EFFICIENT_MULTIPLIER[level]))
                bucket_boundaries = {"tokens": bucket_boundaries}
                bucket_batch_sizes = dataset_utils.adjust_batch_size(
                    batch_size,
                    bucket_boundaries=(bucket_boundaries
                                       if args["batch_by_tokens"] else None),
                    boundaries_reduce_to_length_fn=lambda x: max(tf.nest.flatten(x)),
                    num_replicas_in_sync=num_replicas_in_sync)
                if level != GPU_EFFICIENT_LEVEL.LEVEL0:
                    if isinstance(bucket_batch_sizes, list):
                        bucket_batch_sizes = [
                            int(
                                maximum_lower_multiple(
                                    x // num_replicas_in_sync,
                                    EFFICIENT_MULTIPLIER[level]) *
                                num_replicas_in_sync)
                            for x in bucket_batch_sizes
                        ]
                    else:
                        bucket_batch_sizes = int(
                            maximum_lower_multiple(
                                bucket_batch_sizes // num_replicas_in_sync,
                                EFFICIENT_MULTIPLIER[level]) *
                            num_replicas_in_sync)
                return dataset_utils.batch_examples_by_token(
                    dataset,
                    bucket_boundaries=bucket_boundaries,
                    bucket_batch_sizes=bucket_batch_sizes,
                    padding_values=padding_values,
                    example_length_func=lambda x: {k: tf.size(v) for k, v in x.items()})
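
Below LEVEL5, examples are grouped into length buckets so each batch holds roughly a constant number of tokens. A minimal self-contained sketch of that idea using tf.data.experimental.bucket_by_sequence_length (toy boundaries and batch sizes, not NeurST's batch_examples_by_token):

import tensorflow as tf

PAD = tf.constant(0, dtype=tf.int64)  # toy pad id

ds = tf.data.Dataset.from_generator(
    lambda: iter([[1, 2], [3, 4, 5, 6], [7], [8, 9, 10]]),
    output_signature=tf.TensorSpec([None], tf.int64))
ds = ds.map(lambda x: {"tokens": x})

# Shorter sequences get larger batch sizes, keeping tokens-per-batch roughly constant.
bucketed = ds.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=lambda ex: tf.size(ex["tokens"]),
        bucket_boundaries=[3, 5],        # upper bounds on sequence length
        bucket_batch_sizes=[4, 2, 1],    # one entry per bucket (len(boundaries) + 1)
        padding_values={"tokens": PAD}))

for batch in bucketed:
    print(batch["tokens"].shape)
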
Example #3
    def create_and_batch_tfds(self,
                              ds: Dataset,
                              mode,
                              args=None,
                              num_replicas_in_sync=1) -> tf.data.Dataset:
        """ Creates a dataset according to the `mode`.

        Args:
            args: A dict containing dataset arguments.
            ds: A neurst.data.datasets.Dataset object.
            mode: A ModeKeys indicating the running mode.
            num_replicas_in_sync: The number of GPUs or other workers. We will generate global
                batches, and each global batch is evenly divisible by the number of replicas.

        Returns:
            A tf.data.Dataset.
        """
        if args is None:
            args = self._args
        else:
            args = deep_merge_dict(self._args, args, local_overwrite=False)
        src_eos = tf.constant(self._src_data_pipeline.meta["eos_id"],
                              dtype=tf.int64)
        trg_eos = tf.constant(self._trg_data_pipeline.meta["eos_id"],
                              dtype=tf.int64)

        assert isinstance(ds, AbstractParallelDataset), (
            "The dataset for SeqToSeq task must inherit AbstractParallelDataset."
        )

        dataset = ds.build(map_func=self.get_data_preprocess_fn(
            mode, ds.status, args),
                           map_output_dtypes=self.inputs_signature(mode)[0],
                           auto_shard=(mode == compat.ModeKeys.TRAIN),
                           shuffle=(mode == compat.ModeKeys.TRAIN))

        if mode == compat.ModeKeys.INFER:
            logging.info("Creating test dataset.")
            return dataset.cache().padded_batch(
                dataset_utils.adjust_batch_size(
                    args["batch_size"],
                    num_replicas_in_sync=num_replicas_in_sync),
                padded_shapes={"feature": [None]},
                padding_values={"feature": src_eos},
                drop_remainder=False)
        elif mode == compat.ModeKeys.EVAL:
            logging.info("Creating evaluation dataset.")
            return dataset.cache().padded_batch(
                dataset_utils.adjust_batch_size(
                    args["batch_size"],
                    num_replicas_in_sync=num_replicas_in_sync),
                padded_shapes={
                    "feature": [None],
                    "label": [None]
                },
                padding_values={
                    "feature": src_eos,
                    "label": trg_eos
                },
                drop_remainder=False)
        else:
            logging.info("Creating training dataset.")
            dataset = dataset_utils.clean_dataset_by_length(
                dataset, {
                    "feature": args["max_src_len"],
                    "label": args["max_trg_len"]
                })
            if args["cache_dataset"]:
                dataset = dataset.cache()
            if args["shuffle_buffer"]:
                dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
            padding_values = {"feature": src_eos, "label": trg_eos}
            if args["max_src_len"] is None:
                raise RuntimeError("Must provide `max_src_len` for training.")
            if args["max_trg_len"] is None:
                raise RuntimeError("Must provide `max_trg_len` for training.")
            src_bucket_boundaries, trg_bucket_boundaries = dataset_utils.associated_bucket_boundaries(
                dataset_utils.create_batch_bucket_boundaries(
                    args["max_src_len"]),
                dataset_utils.create_batch_bucket_boundaries(
                    args["max_trg_len"]))

            bucket_boundaries = {
                "feature": src_bucket_boundaries,
                "label": trg_bucket_boundaries
            }
            bucket_batch_sizes = dataset_utils.adjust_batch_size(
                args["batch_size"],
                args["batch_size_per_gpu"],
                bucket_boundaries=(bucket_boundaries
                                   if args["batch_by_tokens"] else None),
                boundaries_reduce_to_length_fn=lambda x: max(tf.nest.flatten(x)),
                num_replicas_in_sync=num_replicas_in_sync)
            return dataset_utils.batch_examples_by_token(
                dataset,
                bucket_boundaries=bucket_boundaries,
                bucket_batch_sizes=bucket_batch_sizes,
                padding_values=padding_values,
                example_length_func=lambda x: {k: tf.size(v) for k, v in x.items()})
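
Throughout these branches, adjust_batch_size makes the global batch size evenly divisible across replicas (see the num_replicas_in_sync docstring). The rounding itself is simple; a hypothetical stand-alone helper, not NeurST's implementation:

def make_divisible(global_batch_size: int, num_replicas_in_sync: int) -> int:
    """Round a global batch size down to a multiple of the replica count."""
    return global_batch_size // num_replicas_in_sync * num_replicas_in_sync

assert make_divisible(100, 8) == 96  # 8 replicas x 12 examples each
assert make_divisible(64, 8) == 64   # already divisible
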
Example #4
def main(processor_id,
         num_processors,
         num_output_shards,
         output_range_begin,
         output_range_end,
         output_template,
         dataset: Dataset,
         progressbar=False,
         task=None):
    assert 0 <= output_range_begin < output_range_end <= num_output_shards
    assert 0 <= processor_id < num_processors
    logging.info(f"Shards: {output_range_begin} to {output_range_end}")
    if not tf.io.gfile.exists(os.path.dirname(output_template)):
        tf.io.gfile.makedirs(os.path.dirname(output_template))

    file_paths = [
        output_template % (s, num_output_shards)
        for s in range(output_range_begin, output_range_end)
    ]
    tmp_file_paths = [f + ".incomplete" for f in file_paths]
    recordio_writers = [tf.io.TFRecordWriter(_x) for _x in tmp_file_paths]

    map_func = None
    if task is not None:
        map_func = task.get_data_preprocess_fn(ModeKeys.TRAIN, dataset.status)

    feature_type_dict = None
    i = 0
    if progressbar:
        from tqdm import tqdm
        iterator = tqdm(dataset.build_iterator(map_func=map_func,
                                               shard_id=processor_id,
                                               total_shards=num_processors)(),
                        total=dataset.num_samples // num_processors)
    else:
        iterator = dataset.build_iterator(map_func=map_func,
                                          shard_id=processor_id,
                                          total_shards=num_processors)()
    for example in iterator:  # lazily pre-process
        if feature_type_dict is None:
            feature_type_dict = dict()
            for name, data in example.items():
                data_type = type(numpy.array(data).flatten().tolist()[0])
                assert data_type in [int, float, str, bytes], \
                    "Unsupported feature type: {}".format(data_type)
                feature_type_dict[name] = data_type
        feature_dict = {}
        for name, data in example.items():
            feature_dict[name] = _format_tf_feature(data,
                                                    feature_type_dict[name])
        writer = recordio_writers[random.randint(0, len(recordio_writers) - 1)]
        writer.write(
            tf.train.Example(features=tf.train.Features(
                feature=feature_dict)).SerializeToString())
        i += 1
    logging.info(f"Total processed {i} samples.")
    for recordio_writer in recordio_writers:
        recordio_writer.close()
    for tmp_f, f in zip(tmp_file_paths, file_paths):
        tf.io.gfile.rename(tmp_f, f, overwrite=True)
    logging.info(
        "===================== Examine feature types =====================")
    for x in tf.data.TFRecordDataset(file_paths).take(1):
        example = tf.train.Example()
        example.ParseFromString(x.numpy())
        logging.info("{")
        for name in example.features.feature:
            if len(example.features.feature[name].bytes_list.value) > 0:
                logging.info(f"    \"{name}\": bytes (str)")
            elif len(example.features.feature[name].int64_list.value) > 0:
                logging.info(f"    \"{name}\": int64")
            elif len(example.features.feature[name].float_list.value) > 0:
                logging.info(f"    \"{name}\": float32")
        logging.info("}")
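
The writer above serializes each example via a module-level _format_tf_feature helper and verifies feature types by re-reading one record. A minimal self-contained round trip through tf.train.Example showing the same mechanics (toy path and feature name, not the module's helper):

import tensorflow as tf

path = "/tmp/demo.tfrecord"
# Write one record with a single int64 feature.
with tf.io.TFRecordWriter(path) as writer:
    feature = {"tokens": tf.train.Feature(
        int64_list=tf.train.Int64List(value=[1, 2, 3]))}
    writer.write(tf.train.Example(
        features=tf.train.Features(feature=feature)).SerializeToString())

# Read it back and inspect the typed value lists, as the loop above does.
for raw in tf.data.TFRecordDataset([path]).take(1):
    example = tf.train.Example()
    example.ParseFromString(raw.numpy())
    print(list(example.features.feature["tokens"].int64_list.value))  # [1, 2, 3]
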
Example #5
    def create_and_batch_tfds(self,
                              ds: Dataset,
                              mode,
                              args=None,
                              num_replicas_in_sync=1) -> tf.data.Dataset:
        """ Creates a dataset according to the `mode`.

        Args:
            args: A dict containing dataset arguments.
            ds: A neurst.data.datasets.Dataset object.
            mode: A ModeKeys indicating the running mode.
            num_replicas_in_sync: The number of GPUs or other workers. We will generate global
                batches, and each global batch is evenly divisible by the number of replicas.

        Returns:
            A tf.data.Dataset.
        """
        if args is None:
            args = self._args
        else:
            args = deep_merge_dict(self._args, args, local_overwrite=False)
        float_zero = tf.constant(0, dtype=tf.float32)
        int_zero = tf.constant(0, dtype=tf.int64)
        trg_eos = tf.constant(self._trg_data_pipeline.meta["eos_id"],
                              dtype=tf.int64)

        dataset = ds.build(map_func=self.get_data_preprocess_fn(
            mode, ds.status, args),
                           map_output_dtypes=self.inputs_signature(mode)[0],
                           auto_shard=(mode == compat.ModeKeys.TRAIN),
                           shuffle=(mode == compat.ModeKeys.TRAIN))

        if mode == compat.ModeKeys.INFER:
            logging.info("Creating test dataset.")
            return dataset.cache().padded_batch(
                dataset_utils.adjust_batch_size(
                    args["batch_size"],
                    num_replicas_in_sync=num_replicas_in_sync),
                padded_shapes={
                    "audio": [None],
                    "audio_length": []
                },
                padding_values={
                    "audio": float_zero,
                    "audio_length": int_zero
                },
                drop_remainder=False)

        elif mode == compat.ModeKeys.EVAL:
            logging.info("Creating evaluation dataset.")
            return dataset.cache().padded_batch(
                dataset_utils.adjust_batch_size(
                    args["batch_size"],
                    num_replicas_in_sync=num_replicas_in_sync),
                padded_shapes={
                    "audio": [None],
                    "audio_length": [],
                    "transcript": [None]
                },
                padding_values={
                    "audio": float_zero,
                    "audio_length": int_zero,
                    "transcript": trg_eos
                },
                drop_remainder=False)
        else:
            logging.info("Creating training dataset.")
            dataset = dataset_utils.clean_dataset_by_length(
                dataset, {
                    "audio": args["max_src_len"] * self._audio_feature_dim *
                    self._audio_feature_channels,
                    "audio_length": -1,
                    "transcript": args["max_trg_len"]
                })
            if args["cache_dataset"]:
                dataset = dataset.cache()
            if args["shuffle_buffer"]:
                dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
            padding_values = {
                "audio": float_zero,
                "audio_length": int_zero,
                "transcript": trg_eos
            }
            if args["max_src_len"] is None:
                raise RuntimeError(
                    "`max_src_len` for SpeechToText task must be provided.")
            if args["max_trg_len"] is None:
                raise RuntimeError(
                    "`max_trg_len` for SpeechToText task must be provided.")
            max_src_len = args["max_src_len"]
            max_trg_len = minimal_multiple(args["max_trg_len"], 8)
            audio_bucket_boundaries = create_audio_bucket_boundaries(
                max_src_len, args["min_src_bucket_boundary"])
            audio_bucket_boundaries[-1] = minimal_multiple(
                audio_bucket_boundaries[-1], 8)
            batch_size = dataset_utils.adjust_batch_size(
                args["batch_size"],
                args["batch_size_per_gpu"],
                num_replicas_in_sync=num_replicas_in_sync,
                verbose=False)
            batch_size_per_gpu = batch_size // num_replicas_in_sync
            assert batch_size_per_gpu > max_src_len, (
                f"batch size per GPU ({batch_size_per_gpu}) must be greater than "
                f"`max_src_len`={max_src_len}")
            if args["disable_batch_efficiency"]:
                bucket_batch_sizes = [
                    int(batch_size_per_gpu // bound * num_replicas_in_sync)
                    for bound in audio_bucket_boundaries
                ]
            else:
                bucket_batch_sizes = [
                    int(
                        minimal_multiple(batch_size_per_gpu // bound, 8) *
                        num_replicas_in_sync)
                    for bound in audio_bucket_boundaries
                ]
            frame_transcript_ratio = args[
                "experimental_frame_transcript_ratio"]
            if frame_transcript_ratio is None:
                logging.warning(
                    "We recommend pre-scanning the dataset to estimate the ratio: "
                    "frame length / transcript length.")
            else:
                trans_bucket_boundaries = [
                    int(bound /
                        (frame_transcript_ratio + i *
                         (max_src_len / max_trg_len - frame_transcript_ratio) /
                         len(audio_bucket_boundaries)))
                    for i, bound in enumerate(audio_bucket_boundaries)
                ]
                trans_bucket_boundaries = [
                    minimal_multiple(min(i, max_trg_len), 8)
                    for i in trans_bucket_boundaries
                ]
                num_buckets = len(trans_bucket_boundaries)
                true_trans_bucket_boundaries = []
                num_input_shapes = 0
                for idx, tbound in enumerate(trans_bucket_boundaries):
                    max_trans_len = [
                        tbound,
                        trans_bucket_boundaries[min(idx + 1, num_buckets - 1)]
                    ]
                    num_input_shapes += len(set(max_trans_len))
                    true_trans_bucket_boundaries.append(max_trans_len)
                logging.info(
                    f"There are {num_input_shapes} input shapes to be compiled:"
                )
                for idx, (batc, bound, tbound) in enumerate(
                        zip(bucket_batch_sizes, audio_bucket_boundaries,
                            true_trans_bucket_boundaries)):
                    logging.info(f"   - batch={batc}, maximum-frames={bound}, "
                                 f"maximum-transcript-length={set(tbound)}")
                true_trans_bucket_boundaries = tf.constant(
                    true_trans_bucket_boundaries, dtype=tf.int32)
                true_audio_bucket_boundaries = tf.transpose(
                    tf.constant([audio_bucket_boundaries] * 2, dtype=tf.int32))

            bucket_batch_sizes = tf.constant(bucket_batch_sizes,
                                             dtype=tf.int64)
            audio_bucket_boundaries = tf.constant(audio_bucket_boundaries,
                                                  dtype=tf.int32)

            def example_to_bucket_id(examples):
                """Return int64 bucket id for this example, calculated based on length."""
                if frame_transcript_ratio is None:
                    conditions_c = tf.less_equal(
                        tf.cast(examples["audio_length"], tf.int32),
                        audio_bucket_boundaries)
                    return tf.reduce_min(tf.where(conditions_c))
                conditions_c = tf.logical_and(
                    tf.less_equal(tf.cast(examples["audio_length"], tf.int32),
                                  true_audio_bucket_boundaries),
                    tf.less_equal(tf.size(examples["transcript"]),
                                  true_trans_bucket_boundaries))
                minimum_match = tf.where(conditions_c)[0]
                return minimum_match[0] * num_buckets + minimum_match[1]

            def window_size_fn(bucket_id):
                """Return number of examples to be grouped when given a bucket id."""
                if frame_transcript_ratio is None:
                    return bucket_batch_sizes[bucket_id]
                return bucket_batch_sizes[bucket_id // num_buckets]

            def batching_fn(bucket_id, grouped_dataset):
                """Batch and add padding to a dataset of elements with similar lengths."""
                bucket_batch_size = window_size_fn(bucket_id)

                # Batch the dataset and add padding so that all input sequences in the
                # examples have the same length, and all target sequences have the same
                # lengths as well. Resulting lengths of inputs and targets can differ.
                return grouped_dataset.padded_batch(
                    bucket_batch_size,
                    padded_shapes={
                        "audio":
                        ([(audio_bucket_boundaries[bucket_id]
                           if frame_transcript_ratio is None else
                           audio_bucket_boundaries[bucket_id // num_buckets]) *
                          self._audio_feature_dim *
                          self._audio_feature_channels]),
                        "audio_length": [],
                        "transcript":
                        ([None] if frame_transcript_ratio is None else [
                            true_trans_bucket_boundaries[
                                bucket_id // num_buckets][bucket_id %
                                                          num_buckets]
                        ])
                    },
                    padding_values=padding_values,
                    drop_remainder=True)

            return dataset.apply(
                tf.data.experimental.group_by_window(
                    key_func=example_to_bucket_id,
                    reduce_func=batching_fn,
                    window_size=None,
                    window_size_func=window_size_fn))
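
The final group_by_window call is the primitive doing the bucketed batching: key_func assigns each example to a bucket, and reduce_func pads and batches each bucket's window. A minimal self-contained sketch of the same pattern with a toy parity key (illustrative only, not the audio bucketing above):

import tensorflow as tf

ds = tf.data.Dataset.from_generator(
    lambda: iter([[1], [2, 3], [4, 5, 6], [7, 8]]),
    output_signature=tf.TensorSpec([None], tf.int32))

def key_func(x):
    # Bucket id must be an int64 scalar: 0 for even lengths, 1 for odd.
    return tf.cast(tf.size(x) % 2, tf.int64)

def reduce_func(key, window):
    # Pad-and-batch each window of similarly keyed examples.
    return window.padded_batch(2, padded_shapes=[None])

grouped = ds.apply(
    tf.data.experimental.group_by_window(
        key_func=key_func, reduce_func=reduce_func, window_size=2))

for batch in grouped:
    print(batch.numpy())
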