Example #1
def _compute_split_statistics_beam(
    *,
    split_files: _SplitFilesDict,
    out_dir: epath.PathLike,
    filename_template: naming.ShardedFileTemplate,
) -> List[split_lib.SplitInfo]:
    """Compute statistics."""
    out_dir = epath.Path(out_dir)

    assert out_dir.exists(), f'{out_dir} does not exist'

    beam = lazy_imports_lib.lazy_imports.apache_beam

    # Launch the beam pipeline computation
    runner = None
    # Create the global pipeline object common for all splits
    # Disable type_hint as it doesn't work with typing.Protocol
    beam_options = beam.options.pipeline_options.PipelineOptions()
    beam_options.view_as(
        beam.options.pipeline_options.TypeOptions).pipeline_type_check = False
    with beam.Pipeline(runner=runner, options=beam_options) as pipeline:
        for split_name, file_infos in split_files.items():
            _ = pipeline | split_name >> _process_split(  # pylint: disable=no-value-for-parameter
                filename_template=filename_template,
                out_dir=out_dir,
                file_infos=file_infos,  # pytype: disable=missing-parameter
            )

    # After the files have been computed
    return [
        _split_info_from_path(
            filename_template.replace(data_dir=out_dir, split=split))
        for split in split_files
    ]
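
The function above follows a common Beam pattern: one shared pipeline, pipeline type checking disabled, and one uniquely labeled branch per split. A minimal, self-contained sketch of that pattern, where the `split_files` contents and the counting transforms are illustrative stand-ins for `_process_split`:

import apache_beam as beam
from apache_beam.options import pipeline_options

# Illustrative inputs standing in for the `split_files` mapping above.
split_files = {'train': ['a', 'b', 'c'], 'test': ['d']}

# Disable pipeline type checking, as in the function above.
options = pipeline_options.PipelineOptions()
options.view_as(pipeline_options.TypeOptions).pipeline_type_check = False

with beam.Pipeline(options=options) as pipeline:
    for split_name, files in split_files.items():
        # The split name is part of each label so every branch stays unique.
        _ = (
            pipeline
            | f'create_{split_name}' >> beam.Create(files)
            | f'count_{split_name}' >> beam.combiners.Count.Globally()
            | f'log_{split_name}' >> beam.Map(print)
        )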
Example #2
    def copy(
        self,
        dst: epath.PathLike,
        overwrite: bool = False,
    ) -> epath.Path:
        """Copy the current file to the given destination.

    Args:
      dst: Target file. It can be any PathLike compatible path (e.g. `gs://...`)
      overwrite: Whether the file should be overwritten or not

    Returns:
      The new created file.

    Raises:
      FileExistsError: If `overwrite` is false and destination exists.
    """
        dst = epath.Path(dst)
        if not overwrite and dst.exists():
            raise FileExistsError(
                f'Cannot copy {self}. Destination {dst} exists.')
        # Otherwise, copy src to dst
        dst.write_bytes(self.read_bytes())
        return dst
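
The same guarded-copy logic as a standalone helper, shown as a hedged sketch using only `etils.epath`; the name `copy_file` and the example paths are illustrative:

from etils import epath


def copy_file(
    src: epath.PathLike,
    dst: epath.PathLike,
    overwrite: bool = False,
) -> epath.Path:
    """Copy `src` to `dst`, refusing to overwrite an existing file unless asked."""
    src = epath.Path(src)
    dst = epath.Path(dst)
    if not overwrite and dst.exists():
        raise FileExistsError(f'Cannot copy {src}. Destination {dst} exists.')
    dst.write_bytes(src.read_bytes())  # Works for local paths and e.g. `gs://...` URIs.
    return dst


# copy_file('/tmp/src.txt', 'gs://some-bucket/dst.txt', overwrite=True)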
Example #3
def compute_url_info(
    path: epath.PathLike,
    checksum_cls=hashlib.sha256,
) -> UrlInfo:
    """Locally compute size, checksums of the given file."""
    path = epath.Path(path)

    checksum = checksum_cls()
    size = 0
    with path.open('rb') as f:
        while True:
            block = f.read(io.DEFAULT_BUFFER_SIZE)
            size += len(block)
            if not block:
                break
            checksum.update(block)

    return UrlInfo(
        checksum=checksum.hexdigest(),  # base64 digest would have been better.
        size=utils.Size(size),
        filename=path.name,
    )
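
A self-contained sketch of the same streaming-checksum pattern using only the standard library, returning a `(hex_digest, size)` tuple instead of the internal `UrlInfo`; the name `file_digest_and_size` is illustrative:

import hashlib
import io
import pathlib
from typing import Tuple


def file_digest_and_size(
    path: pathlib.Path,
    checksum_cls=hashlib.sha256,
) -> Tuple[str, int]:
    """Read the file in `io.DEFAULT_BUFFER_SIZE` blocks to bound memory usage."""
    checksum = checksum_cls()
    size = 0
    with path.open('rb') as f:
        while block := f.read(io.DEFAULT_BUFFER_SIZE):
            checksum.update(block)
            size += len(block)
    return checksum.hexdigest(), size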
Example #4
@contextlib.contextmanager
def mock_cwd(path: epath.PathLike) -> Iterator[None]:
    """Mock the current directory."""
    path = pathlib.Path(path)
    assert path.exists() and path.is_dir()  # Check given path is valid cwd dir
    with mock.patch('os.getcwd', return_value=os.fspath(path)):
        yield
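
A hedged usage sketch of the context manager in a test, assuming the `contextlib.contextmanager` decorator shown above; the temporary directory is illustrative:

import os
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    with mock_cwd(tmp_dir):
        # Code relying on `os.getcwd()` now sees the mocked directory.
        assert os.getcwd() == tmp_dir
# Outside the block the real working directory is visible again.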
Example #5
def write_metadata(
    *,
    data_dir: epath.PathLike,
    features: features_lib.feature.FeatureConnectorArg,
    split_infos: Union[None, epath.PathLike, List[split_lib.SplitInfo]] = None,
    version: Union[None, str, utils.Version] = None,
    check_data: bool = True,
    **ds_info_kwargs,
) -> None:
    """Add metadata required to load with TFDS.

  See documentation for usage:
  https://www.tensorflow.org/datasets/external_tfrecord

  Args:
    data_dir: Dataset path on which to save the metadata
    features: dict of `tfds.features.FeatureConnector` matching the proto specs.
    split_infos: Can be either:  * A path to the pre-computed split info values
      ( the `out_dir` kwarg of `tfds.folder_dataset.compute_split_info`) * A
      list of `tfds.core.SplitInfo` (returned value of
      `tfds.folder_dataset.compute_split_info`) * `None` to auto-compute the
      split info.
    version: Optional dataset version (auto-infer by default, or fallback to
      1.0.0)
    check_data: If True, perform additional check to validate the data in
      data_dir is valid
    **ds_info_kwargs: Additional metadata forwarded to `tfds.core.DatasetInfo` (
      description, homepage,...). Will appear in the doc.
  """
    features = features_lib.features_dict.to_feature(features)
    data_dir = epath.Path(data_dir)
    # Extract the tf-record filenames
    tfrecord_files = [
        f for f in data_dir.iterdir() if naming.FilenameInfo.is_valid(f.name)
    ]
    if not tfrecord_files:
        raise ValueError(
            f'Could not find tf-record (or compatible format) in {data_dir}. '
            'Make sure to follow the pattern: '
            '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`')

    file_infos = [naming.FilenameInfo.from_str(f.name) for f in tfrecord_files]

    # Use a set with tuple-unpacking syntax to ensure all names are consistent
    snake_name, = {f.dataset_name for f in file_infos}
    camel_name = naming.snake_to_camelcase(snake_name)
    filetype_suffix, = {f.filetype_suffix for f in file_infos}
    file_format = file_adapters.file_format_from_suffix(filetype_suffix)

    cls = types.new_class(
        camel_name,
        bases=(_WriteBuilder, ),
        kwds=dict(skip_registration=True),
        exec_body=None,
    )

    if version is None:  # Automatically detect the version
        if utils.Version.is_valid(data_dir.name):
            version = data_dir.name
        else:
            version = '1.0.0'
    cls.VERSION = utils.Version(version)

    # Create a dummy builder (use a non-existent folder to make sure
    # dataset_info.json is not restored)
    builder = cls(file_format=file_format, data_dir='/tmp/non-existent-dir/')

    # Create the metadata
    ds_info = dataset_info.DatasetInfo(
        builder=builder,
        features=features,
        **ds_info_kwargs,
    )
    ds_info.set_file_format(file_format)

    # Add the split infos
    split_dict = _load_splits(
        data_dir=data_dir,
        split_infos=split_infos,
        file_infos=file_infos,
        filetype_suffix=filetype_suffix,
        builder=builder,
    )
    ds_info.set_splits(split_dict)

    # Save all metadata (dataset_info.json, features.json,...)
    ds_info.write_to_directory(data_dir)

    # Make sure that the data can be loaded (feature connectors match the
    # actual specs)
    if check_data:
        utils.print_notebook(
            'Metadata written. Testing by reading first example. '
            'Set check_data=False to skip.')
        builder = read_only_builder.builder_from_directory(data_dir)
        split_name = next(iter(builder.info.splits))
        _, = builder.as_dataset(
            split=f'{split_name}[:1]')  # Load the first example
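
A hedged end-to-end usage sketch following the external_tfrecord guide linked in the docstring; the paths, feature spec, and description are illustrative:

import tensorflow_datasets as tfds

# Directory containing e.g. `my_dataset-train.tfrecord-00000-of-00001`.
data_dir = '/path/to/my_dataset/1.0.0'

features = tfds.features.FeaturesDict({
    'image': tfds.features.Image(shape=(None, None, 3)),
    'label': tfds.features.ClassLabel(names=['cat', 'dog']),
})

# `split_infos=None` lets `write_metadata` auto-compute the split info.
tfds.folder_dataset.write_metadata(
    data_dir=data_dir,
    features=features,
    split_infos=None,
    description='Illustrative dataset description.',
)

# Once the metadata is written, the folder loads like any other TFDS dataset.
builder = tfds.builder_from_directory(data_dir)
ds = builder.as_dataset(split='train')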