def _prepare_split(self, split_generator, pipeline):
  beam = lazy_imports_lib.lazy_imports.apache_beam

  if not tf.io.gfile.exists(self._data_dir):
    tf.io.gfile.makedirs(self._data_dir)

  split_info = split_generator.split_info
  output_prefix = naming.filename_prefix_for_split(
      self.name, split_info.name)
  output_prefix = os.path.join(self._data_dir, output_prefix)

  # Note: We need to wrap the pipeline in a PTransform to avoid re-using the
  # same label names for each split.
  @beam.ptransform_fn
  def _build_pcollection(pipeline):
    """PTransform which builds a single split."""
    # Encode the PCollection.
    pcoll_examples = self._build_pcollection(
        pipeline, **split_generator.gen_kwargs)
    pcoll_examples |= "Encode" >> beam.Map(self.info.features.encode_example)

    # Write the examples to disk.
    return self._file_format_adapter.write_from_pcollection(
        pcoll_examples,
        file_path_prefix=output_prefix,
        num_shards=split_info.num_shards,
    )

  # Add the PCollection to the pipeline.
  _ = pipeline | split_info.name >> _build_pcollection()  # pylint: disable=no-value-for-parameter
def _download_and_prepare(self, dl_manager, download_config):
  # Create the Beam pipeline and forward it to _prepare_split.
  beam = lazy_imports_lib.lazy_imports.apache_beam

  if not download_config.beam_runner and not download_config.beam_options:
    raise ValueError(
        "Trying to generate a dataset using Apache Beam, yet no Beam Runner "
        "or PipelineOptions() has been provided. Please pass a "
        "tfds.download.DownloadConfig(beam_runner=...) object to the "
        "builder.download_and_prepare(download_config=...) method")

  # Use a single pipeline for all splits.
  with beam.Pipeline(
      runner=download_config.beam_runner,
      options=download_config.beam_options,
  ) as pipeline:
    # TODO(tfds): Should eventually add support for
    # download_config.max_examples_per_split.
    super(BeamBasedBuilder, self)._download_and_prepare(
        dl_manager,
        pipeline=pipeline,
    )

  # Update the number of shards for splits where liquid sharding was used.
  split_dict = self.info.splits
  for split_info in split_dict.values():
    if not split_info.num_shards:
      output_prefix = naming.filename_prefix_for_split(
          self.name, split_info.name)
      output_prefix = os.path.join(self._data_dir, output_prefix)
      split_info.num_shards = len(tf.io.gfile.glob(output_prefix + "*"))
  self.info.update_splits_if_different(split_dict)
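# A minimal usage sketch, not from the original source: the ValueError above
# instructs callers to pass tfds.download.DownloadConfig(beam_runner=...) to
# builder.download_and_prepare(download_config=...). `MyDataset` is a
# hypothetical BeamBasedBuilder subclass used only for illustration.
import apache_beam as beam
import tensorflow_datasets as tfds

builder = MyDataset(data_dir="/tmp/my_dataset")  # hypothetical builder class
dl_config = tfds.download.DownloadConfig(
    beam_runner=beam.runners.DirectRunner(),  # runs the pipeline locally
)
builder.download_and_prepare(download_config=dl_config)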
def _prepare_split(self, split_generator, pipeline):
  beam = lazy_imports_lib.lazy_imports.apache_beam

  if not tf.io.gfile.exists(self._data_dir):
    tf.io.gfile.makedirs(self._data_dir)

  split_name = split_generator.split_info.name
  output_prefix = naming.filename_prefix_for_split(self.name, split_name)
  output_prefix = os.path.join(self._data_dir, output_prefix)

  # Writer which writes the examples to disk.
  fname = "{}-{}.tfrecord".format(self.name, split_name)
  fpath = os.path.join(self._data_dir, fname)
  beam_writer = tfrecords_writer.BeamWriter(
      self._example_specs, fpath, hash_salt=split_name)
  self._beam_writers[split_name] = beam_writer

  encode_example = self.info.features.encode_example

  # Note: We need to wrap the pipeline in a PTransform to avoid re-using the
  # same label names for each split.
  @beam.ptransform_fn
  def _build_pcollection(pipeline):
    """PTransform which builds a single split."""
    # Encode the (key, example) pairs of the PCollection.
    pcoll_examples = self._build_pcollection(
        pipeline, **split_generator.gen_kwargs)
    pcoll_examples |= "Encode" >> beam.Map(
        lambda key_ex: (key_ex[0], encode_example(key_ex[1])))
    return beam_writer.write_from_pcollection(pcoll_examples)

  # Add the PCollection to the pipeline.
  _ = pipeline | split_name >> _build_pcollection()  # pylint: disable=no-value-for-parameter
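# A minimal sketch of the subclass hook that `_prepare_split` calls above; the
# (key, example) contract comes from the "Encode" beam.Map, which unpacks
# key_ex[0] / key_ex[1] pairs. The class name, the `filepaths` kwarg, and the
# parsing logic are assumptions; `_info` and `_split_generators` are omitted
# for brevity.
import apache_beam as beam
import tensorflow_datasets as tfds

class MyBeamDataset(tfds.core.BeamBasedBuilder):  # hypothetical subclass

  def _build_pcollection(self, pipeline, filepaths):
    """Returns a PCollection of (key, example_dict) pairs for one split."""

    def _parse_file(filepath):
      # Hypothetical parsing step: emit one example per input file,
      # keyed by the file path so sharding stays deterministic.
      yield filepath, {"image": filepath, "label": 0}

    return (
        pipeline
        | "Create" >> beam.Create(filepaths)
        | "Parse" >> beam.FlatMap(_parse_file)
    )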
def _prepare_split(self, split_generator, pipeline):
  beam = lazy_imports.lazy_imports.apache_beam

  if not tf.io.gfile.exists(self._data_dir):
    tf.io.gfile.makedirs(self._data_dir)

  if len(split_generator.split_info_list) > 1:
    # Could support shared-split PCollections, either with the same behavior
    # as GeneratorBasedBuilder, or by having each value be a
    # ('split_name', example_value) tuple. Or should we return three branched
    # PCollections? Should ask for user feedback.
    raise NotImplementedError(
        "Shared-split PCollections not supported yet.")
  split_info = split_generator.split_info_list[0]

  output_prefix = naming.filename_prefix_for_split(
      self.name, split_info.name)
  output_prefix = os.path.join(self._data_dir, output_prefix)

  # Note: We need to wrap the pipeline in a PTransform to avoid re-using the
  # same label names for each split.
  @beam.ptransform_fn
  def _build_pcollection(pipeline):
    """PTransform which builds a single split."""
    # Encode the PCollection.
    pcoll_examples = self._build_pcollection(
        pipeline, **split_generator.gen_kwargs)
    pcoll_examples |= "Encode" >> beam.Map(self.info.features.encode_example)

    # Write the examples to disk.
    return self._file_format_adapter.write_from_pcollection(
        pcoll_examples,
        file_path_prefix=output_prefix,
        num_shards=split_info.num_shards,
    )

  # Add the PCollection to the pipeline.
  _ = pipeline | split_info.name >> _build_pcollection()  # pylint: disable=no-value-for-parameter
def test_filename_prefix_for_split(self, prefix, expected):
  split = splits.Split.TRAIN
  self.assertEqual(expected, naming.filename_prefix_for_split(prefix, split))
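# A hedged sketch (assumption, not from the source) of how the test above
# might be driven: its (prefix, expected) signature suggests absl's
# parameterized test decorator. The "<name>-<split>" expectation below is
# inferred from the "{}-{}".format(self.name, split_name) pattern used in
# _prepare_split; the enclosing test case class is hypothetical.
from absl.testing import parameterized

class NamingTest(parameterized.TestCase):  # hypothetical enclosing test case

  @parameterized.parameters(
      ("mnist", "mnist-train"),  # assumed prefix format: "<name>-<split>"
  )
  def test_filename_prefix_for_split(self, prefix, expected):
    split = splits.Split.TRAIN
    self.assertEqual(expected, naming.filename_prefix_for_split(prefix, split))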