Example #1
    def _prepare_split(self, split_generator, pipeline):
        beam = lazy_imports_lib.lazy_imports.apache_beam

        if not tf.io.gfile.exists(self._data_dir):
            tf.io.gfile.makedirs(self._data_dir)

        split_info = split_generator.split_info
        output_prefix = naming.filename_prefix_for_split(
            self.name, split_info.name)
        output_prefix = os.path.join(self._data_dir, output_prefix)

        # Note: We need to wrap the pipeline in a PTransform to avoid re-using the
        # same label names for each split
        @beam.ptransform_fn
        def _build_pcollection(pipeline):
            """PTransformation which build a single split."""
            # Encode the PCollection
            pcoll_examples = self._build_pcollection(
                pipeline, **split_generator.gen_kwargs)
            pcoll_examples |= "Encode" >> beam.Map(
                self.info.features.encode_example)

            # Write the example to disk
            return self._file_format_adapter.write_from_pcollection(
                pcoll_examples,
                file_path_prefix=output_prefix,
                num_shards=split_info.num_shards,
            )

        # Add the PCollection to the pipeline
        _ = pipeline | split_info.name >> _build_pcollection()  # pylint: disable=no-value-for-parameter
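
The _prepare_split above calls back into the subclass's _build_pcollection and expects a PCollection of plain feature dictionaries (the Encode step applies encode_example to each element directly). Below is a minimal subclass sketch, assuming the older API where SplitGenerator still takes num_shards; the dataset name, feature spec, and num_examples kwarg are illustrative assumptions, not part of the snippet above.

import apache_beam as beam
import tensorflow as tf
import tensorflow_datasets as tfds


class DummyBeamDataset(tfds.core.BeamBasedBuilder):
    """Hypothetical builder whose examples come from a Beam pipeline."""

    VERSION = tfds.core.Version("0.0.1")

    def _info(self):
        return tfds.core.DatasetInfo(
            builder=self,
            features=tfds.features.FeaturesDict({"value": tf.int64}),
        )

    def _split_generators(self, dl_manager):
        del dl_manager  # Nothing to download for this toy dataset.
        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                num_shards=1,
                gen_kwargs=dict(num_examples=100),
            ),
        ]

    def _build_pcollection(self, pipeline, num_examples):
        # Returns a PCollection of feature dicts; _prepare_split then encodes
        # each dict with self.info.features.encode_example and writes it out.
        return (
            pipeline
            | beam.Create(range(num_examples))
            | beam.Map(lambda i: {"value": i})
        )
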
Example #2
    def _download_and_prepare(self, dl_manager, download_config):
        # Create the Beam pipeline and forward it to _prepare_split
        beam = lazy_imports_lib.lazy_imports.apache_beam

        if not download_config.beam_runner and not download_config.beam_options:
            raise ValueError(
                "Trying to generate a dataset using Apache Beam, yet no Beam Runner "
                "or PipelineOptions() has been provided. Please pass a "
                "tfds.download.DownloadConfig(beam_runner=...) object to the "
                "builder.download_and_prepare(download_config=...) method")

        # Use a single pipeline for all splits
        with beam.Pipeline(
                runner=download_config.beam_runner,
                options=download_config.beam_options,
        ) as pipeline:
            # TODO(tfds): Should eventually try to add support to
            # download_config.max_examples_per_split
            super(BeamBasedBuilder, self)._download_and_prepare(
                dl_manager,
                pipeline=pipeline,
            )

        # Update the number of shards for splits where liquid sharding was used.
        split_dict = self.info.splits
        for split_info in split_dict.values():
            if not split_info.num_shards:
                output_prefix = naming.filename_prefix_for_split(
                    self.name, split_info.name)
                output_prefix = os.path.join(self._data_dir, output_prefix)
                split_info.num_shards = len(
                    tf.io.gfile.glob(output_prefix + "*"))
        self.info.update_splits_if_different(split_dict)
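
The error raised above implies that the caller must supply a Beam runner or PipelineOptions through the download config before generation starts. A minimal call-site sketch, using a hypothetical registered dataset name "my_beam_dataset" and the in-process DirectRunner:

import apache_beam as beam
import tensorflow_datasets as tfds

builder = tfds.builder("my_beam_dataset")  # hypothetical dataset name
dl_config = tfds.download.DownloadConfig(
    # DirectRunner works for small datasets; large ones need a distributed runner.
    beam_runner=beam.runners.DirectRunner(),
)
builder.download_and_prepare(download_config=dl_config)
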
Example #3
    def _prepare_split(self, split_generator, pipeline):
        beam = lazy_imports_lib.lazy_imports.apache_beam

        if not tf.io.gfile.exists(self._data_dir):
            tf.io.gfile.makedirs(self._data_dir)

        split_name = split_generator.split_info.name
        output_prefix = naming.filename_prefix_for_split(self.name, split_name)
        output_prefix = os.path.join(self._data_dir, output_prefix)

        # To write examples to disk:
        fname = "{}-{}.tfrecord".format(self.name, split_name)
        fpath = os.path.join(self._data_dir, fname)
        beam_writer = tfrecords_writer.BeamWriter(self._example_specs,
                                                  fpath,
                                                  hash_salt=split_name)
        self._beam_writers[split_name] = beam_writer

        encode_example = self.info.features.encode_example

        # Note: We need to wrap the pipeline in a PTransform to avoid re-using the
        # same label names for each split
        @beam.ptransform_fn
        def _build_pcollection(pipeline):
            """PTransformation which build a single split."""
            # Encode the PCollection
            pcoll_examples = self._build_pcollection(
                pipeline, **split_generator.gen_kwargs)
            pcoll_examples |= "Encode" >> beam.Map(
                lambda key_ex: (key_ex[0], encode_example(key_ex[1])))
            return beam_writer.write_from_pcollection(pcoll_examples)

        # Add the PCollection to the pipeline
        _ = pipeline | split_name >> _build_pcollection()  # pylint: disable=no-value-for-parameter
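
Unlike Example #1, the Encode step here unpacks (key, example) tuples, so the subclass's _build_pcollection is expected to emit keyed examples; the key is presumably what BeamWriter hashes, together with hash_salt, to shuffle and shard the split deterministically. A hypothetical keyed counterpart of the earlier sketch:

    def _build_pcollection(self, pipeline, num_examples):
        # Sketch only: emits (key, example) pairs, matching the lambda above
        # that unpacks key_ex[0] (the key) and key_ex[1] (the feature dict).
        return (
            pipeline
            | beam.Create(range(num_examples))
            | beam.Map(lambda i: (str(i), {"value": i}))
        )
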
Example #4
    def _prepare_split(self, split_generator, pipeline):
        beam = lazy_imports.lazy_imports.apache_beam

        if not tf.io.gfile.exists(self._data_dir):
            tf.io.gfile.makedirs(self._data_dir)

        if len(split_generator.split_info_list) > 1:
            # Could support shared-split PCollections, either with the same behavior
            # as GeneratorBasedBuilder, or by having each value be a
            # ('split_name', example_value) tuple? Or should we return three branched
            # PCollections? Should ask for user feedback.
            raise NotImplementedError(
                "Shared-split PCollections not supported yet.")

        split_info = split_generator.split_info_list[0]
        output_prefix = naming.filename_prefix_for_split(
            self.name, split_info.name)
        output_prefix = os.path.join(self._data_dir, output_prefix)

        # Note: We need to wrap the pipeline in a PTransform to avoid re-using the
        # same label names for each split
        @beam.ptransform_fn
        def _build_pcollection(pipeline):
            """PTransformation which build a single split."""
            # Encode the PCollection
            pcoll_examples = self._build_pcollection(
                pipeline, **split_generator.gen_kwargs)
            pcoll_examples |= "Encode" >> beam.Map(
                self.info.features.encode_example)

            # Write the example to disk
            return self._file_format_adapter.write_from_pcollection(
                pcoll_examples,
                file_path_prefix=output_prefix,
                num_shards=split_info.num_shards,
            )

        # Add the PCollection to the pipeline
        _ = pipeline | split_info.name >> _build_pcollection()  # pylint: disable=no-value-for-parameter
Example #5
    def test_filename_prefix_for_split(self, prefix, expected):
        split = splits.Split.TRAIN
        self.assertEqual(expected, naming.filename_prefix_for_split(prefix, split))
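
This test only checks the mapping from a builder name and split to a filename prefix. Assuming the conventional TFDS layout, which also appears in Example #3 where the full filename is "{name}-{split}.tfrecord", the prefix should simply be "<name>-<split>". A small sketch of the assumed behavior:

from tensorflow_datasets.core import naming

# Assumed expected value, consistent with the "{}-{}.tfrecord" filename built
# in Example #3; not taken from the test's actual parameters.
assert naming.filename_prefix_for_split("mnist", "train") == "mnist-train"
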