def _make_download_manager(self, download_dir, download_config):
  """Creates a new download manager object."""
  download_dir = download_dir or os.path.join(self._data_dir_root, "downloads")
  extract_dir = (download_config.extract_dir or
                 os.path.join(download_dir, "extracted"))

  # Use manual_dir only if MANUAL_DOWNLOAD_INSTRUCTIONS are set.
  if self.MANUAL_DOWNLOAD_INSTRUCTIONS:
    manual_dir = (download_config.manual_dir or
                  os.path.join(download_dir, "manual"))
  else:
    manual_dir = None

  return download.DownloadManager(
      dataset_name=self.name,
      download_dir=download_dir,
      extract_dir=extract_dir,
      manual_dir=manual_dir,
      manual_dir_instructions=utils.dedent(self.MANUAL_DOWNLOAD_INSTRUCTIONS),
      force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
      force_extraction=(download_config.download_mode == FORCE_REDOWNLOAD),
      force_checksums_validation=download_config.force_checksums_validation,
      register_checksums=download_config.register_checksums,
  )
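# Hedged usage sketch (not part of the library): `_make_download_manager` is
# normally invoked for you by `download_and_prepare`, with the public
# `tfds.download.DownloadConfig` fields mapping onto the constructor
# arguments above. Assuming a standard `tensorflow_datasets` install:
import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(
        # FORCE_REDOWNLOAD sets both `force_download` and `force_extraction`
        # in the manager created above.
        download_mode=tfds.GenerateMode.FORCE_REDOWNLOAD,
    )
)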
def _make_download_manager(self, download_dir, download_config):
  """Creates a new download manager object."""
  download_dir = (
      download_dir or os.path.join(self._data_dir_root, "downloads")
  )
  extract_dir = (
      download_config.extract_dir or os.path.join(download_dir, "extracted")
  )
  manual_dir = (
      download_config.manual_dir or os.path.join(download_dir, "manual")
  )

  if download_config.register_checksums:
    # Note: An error is raised here if the user tries to record checksums
    # from within a `zipapp`.
    register_checksums_path = utils.to_write_path(self._checksums_path)
  else:
    register_checksums_path = None

  return download.DownloadManager(
      download_dir=download_dir,
      extract_dir=extract_dir,
      manual_dir=manual_dir,
      url_infos=self.url_infos,
      manual_dir_instructions=utils.dedent(self.MANUAL_DOWNLOAD_INSTRUCTIONS),
      force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
      force_extraction=(download_config.download_mode == FORCE_REDOWNLOAD),
      force_checksums_validation=download_config.force_checksums_validation,
      register_checksums=download_config.register_checksums,
      register_checksums_path=register_checksums_path,
      verify_ssl=download_config.verify_ssl,
      dataset_name=self.name,
  )
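# Sketch of how the newer arguments surface through the public API. The
# `register_checksums` and `verify_ssl` fields below are real
# `tfds.download.DownloadConfig` options; the dataset name is illustrative:
import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(
        # Record download sizes/checksums instead of validating them; the
        # builder then derives a writable `register_checksums_path` itself.
        register_checksums=True,
        verify_ssl=True,
    )
)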
def beam_pipeline(self) -> 'beam.Pipeline':
  """Instantiates and returns the Apache Beam pipeline.

  Calling this function starts the Apache Beam mode.

  Returns:
    pipeline: The beam pipeline
  """
  if not self._in_contextmanager:
    raise AssertionError(
        'beam_pipeline has to be created from within `SplitBuilder` '
        'contextmanager.'
    )
  beam = lazy_imports_lib.lazy_imports.apache_beam

  # On Colab, stderr isn't displayed by default, so using `print`.
  print_fn = print if utils.is_notebook() else logging.warning
  if not self._beam_runner and not self._beam_options:
    msg = utils.dedent(
        """
        **************************** WARNING *********************************
        Warning: The dataset you're trying to generate is using Apache Beam,
        yet no `beam_runner` or `beam_options` was explicitly provided.

        Some Beam datasets take weeks to generate, so they are usually not
        suited for single-machine generation. Please have a look at the
        instructions to set up distributed generation:
        https://www.tensorflow.org/datasets/beam_datasets#generating_a_beam_dataset
        **********************************************************************
        """
    )
    print_fn(msg)

  beam_options = (
      self._beam_options or beam.options.pipeline_options.PipelineOptions()
  )
  # Beam type checking assumes a transform's multiple outputs are of the same
  # type, which is not our case. Plus it doesn't handle all types correctly,
  # so we are better off without it.
  beam_options.view_as(
      beam.options.pipeline_options.TypeOptions
  ).pipeline_type_check = False
  # Create the global pipeline object common for all splits.
  pipeline = beam.Pipeline(runner=self._beam_runner, options=beam_options)
  self._beam_pipeline = pipeline.__enter__()
  return self._beam_pipeline
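# Sketch: supplying Beam options through `DownloadConfig` so the pipeline
# above is built with explicit options instead of emitting the warning.
# Assumes `apache-beam` is installed; the dataset name is just an example of
# a Beam-based dataset, and the DirectRunner flags are illustrative:
import apache_beam as beam
import tensorflow_datasets as tfds

beam_options = beam.options.pipeline_options.PipelineOptions(
    flags=["--direct_num_workers=4", "--direct_running_mode=multi_processing"]
)
builder = tfds.builder("wikipedia/20200301.en")
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(beam_options=beam_options)
)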
def __init__(self,
             builder,
             description=None,
             features=None,
             supervised_keys=None,
             homepage=None,
             citation=None,
             metadata=None,
             redistribution_info=None):
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder`, dataset builder for this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: `tuple` of `(input_key, target_key)`, Specifies the
      input feature and the label for supervised learning, if applicable for
      the dataset. The keys correspond to the feature names to select in
      `info.features`. When calling `tfds.core.DatasetBuilder.as_dataset()`
      with `as_supervised=True`, the `tf.data.Dataset` object will yield the
      (input, target) defined here.
    homepage: `str`, optional, the homepage for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    redistribution_info: `dict`, optional, information needed for
      redistribution, as specified in `dataset_info_pb2.RedistributionInfo`.
      The content of the `license` subfield will automatically be written to
      a LICENSE file stored with the dataset.
  """
  self._builder = builder

  self._info_proto = dataset_info_pb2.DatasetInfo(
      name=builder.name,
      description=utils.dedent(description),
      version=str(builder._version),  # pylint: disable=protected-access
      citation=utils.dedent(citation),
      redistribution_info=dataset_info_pb2.RedistributionInfo(
          license=utils.dedent(redistribution_info.pop("license")),
          **redistribution_info) if redistribution_info else None)

  if homepage:
    self._info_proto.location.urls[:] = [homepage]

  if features:
    if not isinstance(features, top_level_feature.TopLevelFeature):
      raise ValueError(
          "DatasetInfo.features only supports FeaturesDict or Sequence at "
          "the top-level. Got {}".format(features))
    features._set_top_level()  # pylint: disable=protected-access
  self._features = features
  self._splits = splits_lib.SplitDict(self._builder.name)
  if supervised_keys is not None:
    assert isinstance(supervised_keys, tuple)
    assert len(supervised_keys) == 2
    self._info_proto.supervised_keys.input = supervised_keys[0]
    self._info_proto.supervised_keys.output = supervised_keys[1]

  if metadata and not isinstance(metadata, Metadata):
    raise ValueError(
        "Metadata should be a `tfds.core.Metadata` instance. Received "
        "{}".format(metadata))
  self._metadata = metadata

  # Is this object initialized with both the static and the dynamic data?
  self._fully_initialized = False
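# A minimal sketch of the standard way this constructor is used: a builder's
# `_info()` method returns a `DatasetInfo` describing its features. The class
# name, shapes, and URL are illustrative, and the other abstract methods
# (`_split_generators`, `_generate_examples`) are omitted:
import tensorflow_datasets as tfds

class MyDataset(tfds.core.GeneratorBasedBuilder):
  VERSION = tfds.core.Version("1.0.0")

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        description="An illustrative dataset of labeled images.",
        features=tfds.features.FeaturesDict({
            "image": tfds.features.Image(shape=(28, 28, 1)),
            "label": tfds.features.ClassLabel(num_classes=10),
        }),
        # Keys must name entries of `features`; this enables
        # `as_dataset(as_supervised=True)` to yield (image, label) tuples.
        supervised_keys=("image", "label"),
        homepage="https://example.com/my-dataset",
    )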
def __init__(
    self,
    *,
    download_dir: epath.PathLike,
    extract_dir: Optional[epath.PathLike] = None,
    manual_dir: Optional[epath.PathLike] = None,
    manual_dir_instructions: Optional[str] = None,
    url_infos: Optional[Dict[str, checksums.UrlInfo]] = None,
    dataset_name: Optional[str] = None,
    force_download: bool = False,
    force_extraction: bool = False,
    force_checksums_validation: bool = False,
    register_checksums: bool = False,
    register_checksums_path: Optional[epath.PathLike] = None,
    verify_ssl: bool = True,
):
  """Download manager constructor.

  Args:
    download_dir: Path to directory where downloads are stored.
    extract_dir: Path to directory where artifacts are extracted.
    manual_dir: Path to manually downloaded/extracted data directory.
    manual_dir_instructions: Human readable instructions on how to prepare
      contents of the manual_dir for this dataset.
    url_infos: Urls info for the checksums.
    dataset_name: Name of the dataset this instance will be used for. If
      provided, downloads will record which dataset they were used for.
    force_download: If True, always [re]download.
    force_extraction: If True, always [re]extract.
    force_checksums_validation: If True, raise an error if a URL does not
      have a registered checksum.
    register_checksums: If True, downloaded checksums aren't checked, but
      are stored into a file.
    register_checksums_path: Path where to save checksums. Must be set if
      register_checksums is True.
    verify_ssl: `bool`, defaults to True. If True, verify the certificate
      when downloading the dataset.

  Raises:
    FileNotFoundError: Raised if the register_checksums_path does not exist.
  """
  if register_checksums:
    if not register_checksums_path:
      raise ValueError(
          'When register_checksums=True, register_checksums_path should be '
          'set.'
      )
    register_checksums_path = epath.Path(register_checksums_path)
    if not register_checksums_path.exists():
      # Create the file here to make sure the user has write access before
      # starting any downloads.
      register_checksums_path.touch()
    else:
      # Make sure the user has write access before downloading any files
      # (e.g. TFDS installed by admin).
      register_checksums_path.write_text(register_checksums_path.read_text())

  download_dir = epath.Path(download_dir).expanduser()
  if extract_dir:
    extract_dir = epath.Path(extract_dir).expanduser()
  else:
    extract_dir = download_dir / 'extracted'
  if manual_dir:
    manual_dir = epath.Path(manual_dir).expanduser()

  self._download_dir: epath.Path = download_dir
  self._extract_dir: epath.Path = extract_dir
  self._manual_dir: Optional[epath.Path] = manual_dir  # pytype: disable=annotation-type-mismatch  # attribute-variable-annotations
  self._manual_dir_instructions = utils.dedent(manual_dir_instructions)
  self._download_dir.mkdir(parents=True, exist_ok=True)
  self._extract_dir.mkdir(parents=True, exist_ok=True)

  self._force_download = force_download
  self._force_extraction = force_extraction
  self._force_checksums_validation = force_checksums_validation
  self._register_checksums = register_checksums
  self._register_checksums_path = register_checksums_path
  self._verify_ssl = verify_ssl
  self._dataset_name = dataset_name

  # All known URLs: {url: UrlInfo(size=, checksum=)}
  self._url_infos = checksums.get_all_url_infos()
  if url_infos is not None:
    self._url_infos.update(url_infos)

  # To record what is being used: {url: UrlInfo(size, checksum, filename)}
  self._recorded_url_infos: Dict[str, checksums.UrlInfo] = {}

  # These attributes are lazy-initialized since they must be cleared when
  # this object is pickled for Beam. They are then recreated on each worker.
  self.__downloader = None
  self.__extractor = None
  # Executor to avoid blocking other download/extractions when running I/O
  # operations (reading/renaming the downloaded file).
  # Only use a single thread, as the read/rename ops are locked by the
  # `build_synchronize_decorator`.
  # Note: This thread is in addition to the download and extraction
  # executors' threads.
  self._executor = concurrent.futures.ThreadPoolExecutor(1)
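# Hedged usage sketch: constructing a `DownloadManager` directly, outside of
# a builder. This is internal API, so treat it as illustrative only; the URL
# and download directory are placeholders.
from tensorflow_datasets.core import download

dl_manager = download.DownloadManager(download_dir="/tmp/tfds_downloads")
# `download_and_extract` takes a URL (or a nested structure of URLs) and
# returns the matching extracted path(s).
extracted_path = dl_manager.download_and_extract(
    "https://example.com/data.zip"  # placeholder URL
)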
def __init__(
    self,
    *,
    builder: Union[DatasetIdentity, Any],
    description: Optional[str] = None,
    features: Optional[feature_lib.FeatureConnector] = None,
    supervised_keys: Optional[SupervisedKeysType] = None,
    disable_shuffling: bool = False,
    homepage: Optional[str] = None,
    citation: Optional[str] = None,
    metadata: Optional[Metadata] = None,
    license: Optional[str] = None,  # pylint: disable=redefined-builtin
    redistribution_info: Optional[Dict[str, str]] = None,
    split_dict: Optional[splits_lib.SplitDict] = None):
  # pyformat: disable
  """Constructs DatasetInfo.

  Args:
    builder: `DatasetBuilder` or `DatasetIdentity`. The dataset builder or
      identity will be used to populate this info.
    description: `str`, description of this dataset.
    features: `tfds.features.FeaturesDict`, Information on the feature dict
      of the `tf.data.Dataset()` object from the `builder.as_dataset()`
      method.
    supervised_keys: Specifies the input structure for supervised learning,
      if applicable for the dataset, used with "as_supervised". The keys
      correspond to the feature names to select in `info.features`. When
      calling `tfds.core.DatasetBuilder.as_dataset()` with
      `as_supervised=True`, the `tf.data.Dataset` object will yield the
      structure defined by the keys passed here, instead of that defined by
      the `features` argument. Typically this is a `(input_key, target_key)`
      tuple, and the dataset yields a tuple of `(input, target)` tensors. To
      yield a more complex structure, pass a tuple of `tf.nest` compatible
      structures of feature keys. The resulting `Dataset` will yield
      structures with each key replaced by the corresponding tensor. For
      example, passing a triple of keys would return a dataset that yields
      `(feature, target, sample_weights)` triples for keras. Using
      `supervised_keys=({'a':'a','b':'b'}, 'c')` would create a dataset
      yielding a tuple with a dictionary of features in the `features`
      position. Note that selecting features in nested
      `tfds.features.FeaturesDict` objects is not supported.
    disable_shuffling: `bool`, if True, the examples will not be shuffled.
    homepage: `str`, optional, the homepage for this dataset.
    citation: `str`, optional, the citation to use for this dataset.
    metadata: `tfds.core.Metadata`, additional object which will be
      stored/restored with the dataset. This allows for storing additional
      information with the dataset.
    license: license of the dataset.
    redistribution_info: information needed for redistribution, as specified
      in `dataset_info_pb2.RedistributionInfo`. The content of the `license`
      subfield will automatically be written to a LICENSE file stored with
      the dataset.
    split_dict: information about the splits in this dataset.
""" # pyformat: enable self._builder_or_identity = builder if isinstance(builder, DatasetIdentity): self._identity = builder else: self._identity = DatasetIdentity.from_builder(builder) self._info_proto = dataset_info_pb2.DatasetInfo( name=self._identity.name, description=utils.dedent(description), version=str(self._identity.version), release_notes=self._identity.release_notes, disable_shuffling=disable_shuffling, config_name=self._identity.config_name, config_description=self._identity.config_description, citation=utils.dedent(citation), module_name=self._identity.module_name, redistribution_info=dataset_info_pb2.RedistributionInfo( license=utils.dedent(license or redistribution_info.pop("license")), **redistribution_info) if redistribution_info else None) if homepage: self._info_proto.location.urls[:] = [homepage] if features: if not isinstance(features, top_level_feature.TopLevelFeature): raise ValueError( "DatasetInfo.features only supports FeaturesDict or Sequence at " "the top-level. Got {}".format(features)) self._features = features self._splits = splits_lib.SplitDict([]) if split_dict: self.set_splits(split_dict) if supervised_keys is not None: self._info_proto.supervised_keys.CopyFrom( _supervised_keys_to_proto(supervised_keys)) if metadata and not isinstance(metadata, Metadata): raise ValueError( "Metadata should be a `tfds.core.Metadata` instance. Received " "{}".format(metadata)) self._metadata = metadata # Is this object initialized with both the static and the dynamic data? self._fully_initialized = False