예제 #1
0
def main(_):
  """Check every registered checksum URL and report those not returning OK."""
  # Legacy datasets: checksum URLs registered globally.
  urls = set(tfds.core.download.checksums.get_all_url_infos().keys())

  # Dataset-as-folder datasets.
  # Could keep track of the dataset name, so the report clearly indicates
  # which dataset should be updated.
  for name in tfds.list_builders(with_community_datasets=False):
    info = tfds.builder_cls(name).url_infos
    if info:
      urls.update(info.keys())

  sorted_urls = sorted(urls)

  # Fetch all status codes concurrently (network-bound, so many workers).
  with futures.ThreadPoolExecutor(max_workers=100) as executor:
    status_codes = list(executor.map(_get_status_code, sorted_urls))

  print('\n************ Summary ************\n')
  # Collect every URL whose status code is not HTTP 200.
  bad_urls = [
      (url, code)
      for url, code in zip(sorted_urls, status_codes)
      if code != requests.codes.ok
  ]
  for url, code in bad_urls:
    print(f'{url} - status code: {code}')
  print(f'{len(bad_urls)} URLs had issues')
예제 #2
0
def _get_builder_cls(
    ds_to_build: str,
) -> Tuple[Type[tfds.core.DatasetBuilder], Dict[str, str]]:
    """Resolve the dataset CLI argument to a builder class.

    Args:
      ds_to_build: Dataset argument — either a path to a `.py` dataset script,
        or a registered `name/config:version` string.

    Returns:
      builder_cls: The dataset class to download and prepare.
      kwargs: Builder keyword arguments parsed from the name (empty when the
        dataset was loaded from a script path).
    """
    # 1st case: Requested dataset is a path to `.py` script
    path = _search_script_path(ds_to_build)
    if path is None:
        # 2nd case: Dataset is registered through imports.
        # Extract `name/config:version`
        name, builder_kwargs = tfds.core.naming.parse_builder_name_kwargs(
            ds_to_build)
        builder_cls = tfds.builder_cls(str(name))
        logging.info(
            f'Loading dataset {ds_to_build} from imports: {builder_cls.__module__}'
        )
        return builder_cls, typing.cast(Dict[str, str], builder_kwargs)

    logging.info(f'Loading dataset {ds_to_build} from path: {path}')
    # Dynamically load user dataset script.
    with tfds.core.utils.add_sys_path(path.parent):
        builder_cls = tfds.core.community.builder_cls_from_module(path.stem)
    return builder_cls, {}
예제 #3
0
def _load_builder_from_code(name: str) -> BuilderToDocument:
    """Load the builder, config,... to document."""
    builder_cls = tfds.builder_cls(name)
    section = _get_section(builder_cls)

    if not builder_cls.BUILDER_CONFIGS:  # Builder without configs
        return BuilderToDocument(
            section=section,
            namespace=None,
            builder=builder_cls(),  # pytype: disable=not-instantiable
            config_builders=[],
        )

    # Builder with configs: instantiate one builder per config, in parallel.
    with futures.ThreadPoolExecutor(
            max_workers=_WORKER_COUNT_CONFIGS) as tpool:
        config_builders = list(
            tpool.map(
                lambda config: tfds.builder(builder_cls.name, config=config),
                builder_cls.BUILDER_CONFIGS,
            ))
    # The first config is documented as the "default" builder.
    return BuilderToDocument(
        section=section,
        namespace=None,
        builder=config_builders[0],
        config_builders=config_builders,
    )
예제 #4
0
def _document_single_builder_inner(
    name: str,
    visu_doc_util: doc_utils.VisualizationDocUtil,
    df_doc_util: doc_utils.DataframeDocUtil,
    nightly_doc_util: doc_utils.NightlyDocUtil,
) -> Optional[BuilderDocumentation]:
  """Builds the documentation for a single builder, with or without configs."""
  builder_cls = tfds.builder_cls(name)
  section = _get_section(builder_cls)

  tqdm.tqdm.write(f'Document builder {name}...')
  builder, config_builders = _load_builder(builder_cls)

  markdown = dataset_markdown_builder.get_markdown_string(
      builder=builder,
      config_builders=config_builders,
      visu_doc_util=visu_doc_util,
      df_doc_util=df_doc_util,
      nightly_doc_util=nightly_doc_util,
  )
  return BuilderDocumentation(
      name=name,
      content=markdown,
      section=section,
      is_manual=bool(builder_cls.MANUAL_DOWNLOAD_INSTRUCTIONS),
      # Nightly flag only applies when a nightly doc util was provided.
      is_nightly=bool(
          nightly_doc_util and nightly_doc_util.is_builder_nightly(name)
      ),
  )
예제 #5
0
def _make_builders(
    args: argparse.Namespace,
    ds_to_build: str,
) -> Iterator[tfds.core.DatasetBuilder]:
  """Yields builders to generate."""
  # TODO(tfds): Infer the dataset format.
  # And make sure --record_checksums works.
  # * From file (`.py`), dataset-as-folder (`my_dataset/`):
  # * Nothing (use current directory)
  # * From module `tensorflow_datasets.text.my_dataset`
  # * Community datasets: `namespace/my_dataset`

  # No dataset selected would mean "use current directory" — unsupported.
  if not ds_to_build:
    raise NotImplementedError('No datasets provided not supported yet.')

  # Parse `name/config:version` into a name plus builder keyword arguments.
  name, kwargs = tfds.core.load.dataset_name_and_kwargs_from_name_str(
      ds_to_build)
  builder_cls = tfds.builder_cls(name)

  # `--experimental_latest_version` overrides the version, but conflicts
  # with an explicit `:version` suffix in the dataset name.
  if args.experimental_latest_version:
    if 'version' in kwargs:
      raise ValueError(
          "Can't have both `--experimental_latest` and version set (`:1.0.0`)"
      )
    kwargs['version'] = 'experimental_latest'

  # Resolve the config from the name suffix and the CLI flags.
  kwargs['config'] = _get_config_name(
      builder_cls=builder_cls,
      config_kwarg=kwargs.get('config'),
      config_name=args.config,
      config_idx=args.config_idx,
  )

  build_one = functools.partial(
      _make_builder,
      builder_cls,
      overwrite=args.overwrite,
      data_dir=args.data_dir,
      **kwargs,
  )

  if builder_cls.BUILDER_CONFIGS and kwargs['config'] is None:
    # No config requested: generate every declared config.
    for config in builder_cls.BUILDER_CONFIGS:
      yield build_one(config=config.name)
  else:
    # Generate only the requested dataset/config.
    yield build_one()
예제 #6
0
def _collect_path_to_url_infos(
) -> Dict[tfds.core.ReadWritePath, Dict[Url, checksums.UrlInfo]]:
    """Collect checksums paths to url_infos."""
    # Legacy checksums files registered globally.
    paths = list(checksums._checksum_paths().values())  # pylint: disable=protected-access

    # Dataset-as-folder checksums files (only when present on disk).
    for builder_name in tfds.list_builders():
        candidate = tfds.builder_cls(builder_name)._checksums_path  # pylint: disable=protected-access
        if candidate.exists():
            paths.append(candidate)

    # Normalize to writable paths before loading, so keys can be written back.
    write_paths = [tfds.core.utils.to_write_path(p) for p in paths]
    return {
        path: typing.cast(Dict[Url, checksums.UrlInfo],
                          checksums.load_url_infos(path))
        for path in write_paths
    }
예제 #7
0
def refactor_dataset(ds_name: str) -> None:
  """Refactor a single dataset."""
  builder_cls = tfds.builder_cls(ds_name)
  code_info = BuilderCodeInfo.from_builder_cls(builder_cls)

  print(f'Refactoring {code_info.name} in {code_info.dst}')

  # Eventually cleanup previous refactoring.
  dst = code_info.dst
  if dst.exists():
    print(f'Cleanup existing {dst}')
    shutil.rmtree(dst)
  dst.mkdir()

  # Copy all files and folders into the destination, in order.
  for move_step in (
      _add_init_file,
      _mv_code,
      _mv_code_test,
      _mv_checksums,
      _mv_fake_data_dir,
      _mv_create_fake_data,
  ):
    move_step(code_info)