def load(tag):
    """
    Load a dataset from repository using given tag

    Args:
        tag: string using {username}/{dataset} format or file system, s3://, gcs://

    Returns:
        The object returned by ``load_v0(tag)`` when that call succeeds,
        otherwise ``Dataset(tag)``.

    Raises:
        DaskModuleNotInstalledException: if ``load_v0`` raises ``ImportError``.
        HubDatasetNotFoundException: propagated unchanged from ``load_v0``.

    Notes
    ------
    It will try to load using the old (v0.x) format first and fall back to
    the newer version if that attempt fails for any other reason.
    """
    try:
        ds = load_v0(tag)
    except ImportError as err:
        # Keep the original ImportError attached as the cause for debugging.
        raise DaskModuleNotInstalledException from err
    except HubDatasetNotFoundException:
        raise
    except Exception:
        # Deliberate best-effort: any other v0 failure means "not a v0
        # dataset", so fall through to the v1 loader. Record the reason at
        # debug level instead of discarding it silently.
        logger.debug(
            "Loading %r with the v0.x loader failed; falling back to v1.x",
            tag,
            exc_info=True,
        )
    else:
        logger.warning(
            "Deprecated Warning: Given dataset is using deprecated format v0.x. Please convert to v1.x version upon availability."
        )
        return ds
    return Dataset(tag)
def load(tag, creds=None, session_creds=True) -> Dataset:
    """Load a dataset from repository using given url and credentials (optional)

    Reads ``{path}/meta.json`` and builds one lazily loaded dask array per
    tensor listed in it. Chunk files are read from
    ``{path}/{tensor}/{i}.npy``, where ``i`` is the index of the first sample
    in the chunk.

    Args:
        tag: dataset location, passed to ``_load_fs_and_path``.
        creds: optional credentials, passed to ``_load_fs_and_path``.
        session_creds: passed to ``_load_fs_and_path`` as a keyword argument.

    Returns:
        A ``Dataset`` mapping each tensor name to a ``Tensor``, with
        ``metainfo`` taken from the ``"metainfo"`` key of the metadata
        (``None`` when absent).

    Raises:
        DatasetNotFound: if the resolved path does not exist.
        AssertionError: if a tensor listed in the metadata has no directory.
    """
    fs, path = _load_fs_and_path(tag, creds, session_creds=session_creds)
    fs: fsspec.AbstractFileSystem = fs
    if not fs.exists(path):
        from hub.exceptions import DatasetNotFound

        raise DatasetNotFound(tag)

    with fs.open(f"{path}/meta.json", "r") as f:
        ds_meta = json.loads(f.read())

    tensors_meta = ds_meta["tensors"]
    for name in tensors_meta:
        # Raised explicitly (not via `assert`) so the check still runs under
        # `python -O`; the exception type and message are unchanged.
        if not fs.exists(f"{path}/{name}"):
            raise AssertionError(
                f"Tensor {name} of {tag} dataset does not exist"
            )

    len_ = ds_meta["len"]
    metainfo = ds_meta.get("metainfo")

    if len_ == 0:
        logger.warning("The dataset is empty (has 0 samples)")
        # NOTE(review): the empty placeholder is always uint8 rather than
        # tmeta["dtype"] -- confirm this is intended.
        return Dataset(
            {
                name: Tensor(
                    tmeta,
                    dask.array.from_array(
                        np.empty(
                            shape=(0,) + tuple(tmeta["shape"][1:]),
                            dtype="uint8",
                        ),
                    ),
                )
                for name, tmeta in tensors_meta.items()
            },
            metainfo=metainfo,
        )

    tensors = {}
    for name, tmeta in tensors_meta.items():
        # added reverse compatibility for previous versions
        chunksize = tmeta.setdefault("chunksize", 1)
        codec = codec_from_name(tmeta.get("dcompress"))
        sample_shape = tuple(tmeta["shape"][1:])
        chunks = [
            dask.array.from_delayed(
                dask.delayed(_numpy_load)(fs, f"{path}/{name}/{i}.npy", codec),
                # The last chunk may hold fewer than `chunksize` samples.
                shape=(min(chunksize, len_ - i),) + sample_shape,
                dtype=tmeta["dtype"],
            )
            for i in range(0, len_, chunksize)
        ]
        tensors[name] = Tensor(tmeta, _dask_concat(chunks))

    return Dataset(tensors, metainfo=metainfo)
def commit(self):
    """Deprecated alias of flush().

    Logs a deprecation warning and then calls ``self.flush()``.
    """
    deprecation_notice = "commit() is deprecated. Use flush() instead"
    logger.warning(deprecation_notice)
    self.flush()