def __init__(
    self,
    filepath: str,
    bucket_name: str,
    s3fs_args: Dict[str, Any] = None,
    credentials: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
) -> None:
    """Creates a new instance of ``MatplotlibS3Writer``.

    Args:
        filepath: Key path to matplot object file(s).
        bucket_name: Name of the bucket without "s3://" prefix.
        s3fs_args: Arguments for ``S3FileSystem``. See
            https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem
        credentials: Arguments for ``client_kwargs``. If needed
            ``aws_access_key_id`` and ``aws_secret_access_key``
            are provided here.
        save_args: Save args passed to `plt.savefig`. See
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    deprecation_warning(self.__class__.__name__)
    # Deep-copy every user-supplied mapping so that later mutation of the
    # caller's dicts (or of our attributes) cannot leak across objects.
    # The original copied credentials/s3fs_args but aliased save_args.
    _credentials = copy.deepcopy(credentials) or {}
    self._s3fs_args = copy.deepcopy(s3fs_args) or {}
    self._save_args = copy.deepcopy(save_args) or {}
    self._filepath = Path(filepath)
    # NOTE(review): wrapping the bucket name in Path looks odd, but it is
    # kept as-is for backward compatibility — confirm downstream usage
    # before changing it to a plain str.
    self._bucket_name = Path(bucket_name)
    self._s3 = S3FileSystem(client_kwargs=_credentials, **self._s3fs_args)
def __init__(
    self,
    filepath: str,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Create a ``JSONLocalDataSet`` bound to a single local JSON file.

    Args:
        filepath: Path to a local json file.
        load_args: Options forwarded to ``json.load``. See
            https://docs.python.org/3/library/json.html for details.
            All defaults are preserved.
        save_args: Options forwarded to ``json.dump``. See
            https://docs.python.org/3/library/json.html for details.
            All defaults are preserved.
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
    """
    deprecation_warning(self.__class__.__name__)
    super().__init__(Path(filepath), version)
    # Start from deep copies of the class defaults, then layer the
    # user-supplied overrides on top.
    self._load_args = {**copy.deepcopy(self.DEFAULT_LOAD_ARGS), **(load_args or {})}
    self._save_args = {**copy.deepcopy(self.DEFAULT_SAVE_ARGS), **(save_args or {})}
def __init__(
    self,
    filepath: str,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Create a ``NetworkXLocalDataSet`` for a NetworkX graph JSON file.

    Args:
        filepath: The path to the NetworkX graph JSON file.
        load_args: Options forwarded to ``networkx.node_link_graph``. See
            https://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html
        save_args: Options forwarded to ``networkx.node_link_data``. See
            https://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
    """
    deprecation_warning(self.__class__.__name__)
    # Pure pass-through: the parent class owns all argument handling.
    super().__init__(
        filepath=Path(filepath),
        version=version,
        load_args=load_args,
        save_args=save_args,
    )
def __init__(self, filepath: str, load_args: Dict[str, Any] = None, version: Version = None) -> None:
    """Create a ``FeatherLocalDataSet`` bound to a single feather file.

    Args:
        filepath: Path to a feather file.
        load_args: Feather options for loading feather files. All
            available arguments are listed at:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html#pandas.read_feather
            All defaults are preserved.
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
    """
    deprecation_warning(self.__class__.__name__)
    super().__init__(Path(filepath), version)
    # There are no library-level defaults for feather loading, so simply
    # take a shallow copy of whatever the user supplied.
    self._load_args = dict(load_args) if load_args is not None else {}  # type: Dict[str, Any]
def __init__(
    self,
    file_url: str,
    file_path: str = None,
    auth: Optional[Union[Tuple[str], AuthBase]] = None,
    load_args: Optional[Dict[str, Any]] = None,
    force_download: bool = False,
) -> None:
    """Creates a new instance of ``CSVHTTPDataSet`` pointing to a concrete
    csv file over HTTP(S).

    Args:
        file_url: A URL to fetch the CSV file.
        file_path: Local file path for the dataset; presumably used as a
            download target/cache for the fetched file — TODO confirm
            against the load/save implementation.
        auth: Anything ``requests.get`` accepts. Normally it's either
            ``('login', 'password')``, or ``AuthBase`` instance for more
            complex cases.
        load_args: Pandas options for loading csv files.
            Here you can find all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
            All defaults are preserved.
        force_download: Presumably forces re-fetching from ``file_url``
            even when a local copy exists — TODO confirm.
    """
    deprecation_warning(self.__class__.__name__)
    super().__init__()
    self._file_url = file_url
    self._file_path = file_path
    self._auth_backend = auth
    # Deep-copy so later mutation of the caller's dict cannot leak in.
    self._load_args = copy.deepcopy(load_args or {})
    self._force_download = force_download
def __init__(
    self,
    filepath: str,
    bucket_name: str = None,
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
    project: str = None,
    gcsfs_args: Dict[str, Any] = None,
) -> None:
    """Create a ``JSONGCSDataSet`` bound to a single JSON file on GCS.

    Args:
        filepath: Path to a JSON file. May contain the full path in Google
            Cloud Storage including bucket and protocol, e.g.
            `gcs://bucket-name/path/to/file.json`.
        bucket_name: GCS bucket name. Must be specified **only** if not
            present in ``filepath``.
        credentials: Credentials to access the GCS bucket such as
            ``client_email`` and ``token_uri``, or ``refresh_token``,
            ``client_secret``, ``client_id``.
        load_args: Pandas options forwarded to ``pandas.read_json``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
            All defaults are preserved.
        save_args: Pandas options forwarded to ``DataFrame.to_json``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
            All defaults are preserved, but "index", which is set to False.
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
        project: The GCP project ID, as per:
            https://cloud.google.com/resource-manager/docs/creating-managing-projects
        gcsfs_args: Extra arguments to pass into ``GCSFileSystem``. See
            https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem
    """
    deprecation_warning(self.__class__.__name__)
    # Copy the user-supplied mappings so the caller's dicts stay untouched.
    creds = deepcopy(credentials) or {}
    fs_args = deepcopy(gcsfs_args) or {}
    gcs = gcsfs.GCSFileSystem(project=project, token=creds, **fs_args)
    # Normalise the object key: strip any "gcs://" prefix, then prepend
    # the bucket when it was given separately.
    key = gcs._strip_protocol(filepath)
    if bucket_name:
        key = "{}/{}".format(bucket_name, key)
    super().__init__(
        filepath=PurePosixPath(key),
        version=version,
        exists_function=gcs.exists,
        glob_function=gcs.glob,
        load_args=load_args,
        save_args=save_args,
    )
    self._gcs = gcs
def __init__(
    self,
    filepath: str,
    bucket_name: str = None,
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
    s3fs_args: Dict[str, Any] = None,
) -> None:
    """Create a ``CSVS3DataSet`` bound to a single csv file on S3.

    Args:
        filepath: Path to a csv file. May contain the full path in S3
            including bucket and protocol, e.g.
            `s3://bucket-name/path/to/file.csv`.
        bucket_name: S3 bucket name. Must be specified **only** if not
            present in ``filepath``.
        credentials: Credentials to access the S3 bucket, such as
            ``aws_access_key_id``, ``aws_secret_access_key``.
        load_args: Pandas options forwarded to ``pandas.read_csv``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
            All defaults are preserved.
        save_args: Pandas options forwarded to ``DataFrame.to_csv``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
            All defaults are preserved, but "index", which is set to False.
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
        s3fs_args: S3FileSystem options. See
            https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem
    """
    deprecation_warning(self.__class__.__name__)
    # Copy the user-supplied mappings so the caller's dicts stay untouched.
    creds = copy.deepcopy(credentials) or {}
    fs_args = copy.deepcopy(s3fs_args) or {}
    s3 = S3FileSystem(client_kwargs=creds, **fs_args)
    # Normalise the object key: strip any "s3://" prefix, then prepend the
    # bucket when it was given separately.
    key = s3._strip_protocol(filepath)
    if bucket_name:
        key = "{}/{}".format(bucket_name, key)
    super().__init__(
        PurePosixPath(key),
        version,
        exists_function=s3.exists,
        glob_function=s3.glob,
    )
    # Class defaults first, user overrides second.
    self._load_args = {**copy.deepcopy(self.DEFAULT_LOAD_ARGS), **(load_args or {})}
    self._save_args = {**copy.deepcopy(self.DEFAULT_SAVE_ARGS), **(save_args or {})}
    self._s3 = s3
def __init__(
    self,
    filepath: str,
    bucket_name: str = None,
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
    s3fs_args: Dict[str, Any] = None,
) -> None:
    """Create a ``ParquetS3DataSet`` bound to a parquet file on S3.

    Args:
        filepath: Path to a parquet file, parquet collection or the
            directory of a multipart parquet. May contain the full path in
            S3 including bucket and protocol, e.g.
            `s3://bucket-name/path/to/file.parquet`.
        bucket_name: S3 bucket name. Must be specified **only** if not
            present in ``filepath``.
        credentials: Credentials to access the S3 bucket, such as
            ``aws_access_key_id``, ``aws_secret_access_key``.
        load_args: Additional loading options `pyarrow`:
            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
            or `fastparquet`:
            https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile.to_pandas
        save_args: Additional saving options for `pyarrow`:
            https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas
            or `fastparquet`:
            https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
        s3fs_args: S3FileSystem options. See
            https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem
    """
    deprecation_warning(self.__class__.__name__)
    # Copy the user-supplied mappings so the caller's dicts stay untouched.
    creds = copy.deepcopy(credentials) or {}
    fs_args = copy.deepcopy(s3fs_args) or {}
    s3 = S3FileSystem(client_kwargs=creds, **fs_args)
    # Normalise the object key: strip any "s3://" prefix, then prepend the
    # bucket when it was given separately.
    key = s3._strip_protocol(filepath)
    if bucket_name:
        key = "{}/{}".format(bucket_name, key)
    super().__init__(
        filepath=PurePosixPath(key),
        version=version,
        exists_function=s3.exists,
        glob_function=s3.glob,
        load_args=load_args,
        save_args=save_args,
    )
    self._s3 = s3
def __init__(
    self,
    filepath: str,
    bucket_name: str = None,
    credentials: Dict[str, Any] = None,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
    project: str = None,
    gcsfs_args: Dict[str, Any] = None,
) -> None:
    """Create a ``ParquetGCSDataSet`` bound to a Parquet file on GCS.

    Args:
        filepath: Path to a Parquet file. May contain the full path in
            Google Cloud Storage including bucket and protocol, e.g.
            ``gcs://bucket-name/path/to/file.parquet``.
        bucket_name: GCS bucket name. Must be specified **only** if not
            present in ``filepath``.
        credentials: Credentials to access the GCS bucket. Authentication
            is performed by gcsfs according to
            https://gcsfs.readthedocs.io/en/latest/#credentials
        load_args: Pandas options forwarded to ``pandas.read_parquet``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_parquet.html
            All defaults are preserved.
        save_args: Additional saving options for
            `pyarrow.parquet.write_table`. See
            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
        project: The GCP project. If not specified, then the default is
            inferred by a remote request.
            https://cloud.google.com/resource-manager/docs/creating-managing-projects
        gcsfs_args: Extra arguments to pass into ``GCSFileSystem``. See
            https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem
    """
    deprecation_warning(self.__class__.__name__)
    # Copy the user-supplied mappings so the caller's dicts stay untouched.
    creds = deepcopy(credentials) or {}
    fs_args = deepcopy(gcsfs_args) or {}
    gcs = gcsfs.GCSFileSystem(project=project, token=creds, **fs_args)
    # Normalise the object key: strip any "gcs://" prefix, then prepend
    # the bucket when it was given separately.
    key = gcs._strip_protocol(filepath)
    if bucket_name:
        key = "{}/{}".format(bucket_name, key)
    super().__init__(
        filepath=PurePosixPath(key),
        version=version,
        exists_function=gcs.exists,
        glob_function=gcs.glob,
        load_args=load_args,
        save_args=save_args,
    )
    self._gcs = gcs
def __init__(
    self,
    filepath: str,
    engine: str = "xlsxwriter",
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Creates a new instance of ``ExcelLocalDataSet`` pointing to a
    concrete filepath.

    Args:
        filepath: path to an Excel file.
        engine: The engine used to write to excel files. The default
            engine is 'xlsxwriter'.
        load_args: Pandas options for loading Excel files.
            Here you can find all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
            The default_load_arg engine is 'xlrd', all others preserved.
        save_args: Pandas options for saving Excel files.
            Here you can find all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html
            All defaults are preserved, but "index", which is set to False.
            If you would like to specify options for the `ExcelWriter`,
            you can include them under "writer" key. Here you can
            find all available arguments:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html
        version: If specified, should be an instance of
            ``kedro.io.core.Version``. If its ``load`` attribute is
            None, the latest version will be loaded. If its ``save``
            attribute is None, save version will be autogenerated.
    """
    deprecation_warning(self.__class__.__name__)
    super().__init__(Path(filepath), version)

    # Handle default load and save arguments
    self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
    if load_args is not None:
        self._load_args.update(load_args)
    self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
    self._writer_args = {"engine": engine}  # type: Dict[str, Any]
    if save_args is not None:
        # Operate on a copy: the previous code popped "writer" straight
        # out of the caller's dict, silently mutating reused configuration.
        save_args = dict(save_args)
        writer_args = save_args.pop("writer", {})  # type: Dict[str, Any]
        self._writer_args.update(writer_args)
        self._save_args.update(save_args)
def __init__(
    self,
    filepath: str,
    engine: str = "auto",
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Create a ``ParquetLocalDataSet`` bound to a local parquet target.

    Args:
        filepath: Path to a parquet file or a metadata file of a multipart
            parquet collection or the directory of a multipart parquet.
        engine: The engine to use, one of: `auto`, `fastparquet`,
            `pyarrow`. If `auto`, then the default behavior is to try
            `pyarrow`, falling back to `fastparquet` if `pyarrow` is
            unavailable.
        load_args: Additional loading options `pyarrow`:
            https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
            or `fastparquet`:
            https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile.to_pandas
        save_args: Additional saving options for `pyarrow`:
            https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas
            or `fastparquet`:
            https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
    """
    deprecation_warning(self.__class__.__name__)
    super().__init__(Path(filepath), version)
    self._engine = engine
    # Class defaults first, user overrides second.
    self._load_args = {**copy.deepcopy(self.DEFAULT_LOAD_ARGS), **(load_args or {})}
    self._save_args = {**copy.deepcopy(self.DEFAULT_SAVE_ARGS), **(save_args or {})}
def __init__(
    self,
    filepath: str,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
) -> None:
    """Create a ``MatplotlibLocalWriter`` for a local matplot target.

    Args:
        filepath: Path to a matplot object file.
        load_args: Currently ignored as loading is not supported.
        save_args: Save args passed to `plt.savefig`. See
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    deprecation_warning(self.__class__.__name__)
    self._filepath = Path(filepath)
    # Fall back to a fresh empty dict when no args were supplied.
    self._load_args = load_args or {}
    self._save_args = save_args or {}
def __init__(
    self,
    filepath: str,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Create a ``CSVLocalDataSet`` bound to a single local csv file.

    Args:
        filepath: path to a csv file.
        load_args: Pandas options forwarded to ``pandas.read_csv``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
            All defaults are preserved.
        save_args: Pandas options forwarded to ``DataFrame.to_csv``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
            All defaults are preserved, but "index", which is set to False.
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".

    Raises:
        ValueError: If 'filepath' looks like a remote path.
    """
    deprecation_warning(self.__class__.__name__)
    super().__init__(Path(filepath), version)
    # This dataset only supports the local filesystem — reject anything
    # that looks like a remote URI.
    if is_remote_path(filepath):
        raise ValueError(
            "{} seems to be a remote file, which is not supported by {}".format(
                filepath, self.__class__.__name__
            )
        )
    # Class defaults first, user overrides second.
    self._load_args = {**copy.deepcopy(self.DEFAULT_LOAD_ARGS), **(load_args or {})}
    self._save_args = {**copy.deepcopy(self.DEFAULT_SAVE_ARGS), **(save_args or {})}
def __init__(
    self,
    filepath: str,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Create a ``YAMLLocalDataset`` bound to a single local yaml file.

    Args:
        filepath: path to a local yaml file.
        save_args: Options forwarded to ``yaml.dump``. See
            https://pyyaml.org/wiki/PyYAMLDocumentation for details.
            ``{"default_flow_style": False}`` in default.
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
    """
    deprecation_warning(self.__class__.__name__)
    # The parent class owns all argument handling.
    super().__init__(filepath=Path(filepath), save_args=save_args, version=version)
def __init__(
    self,
    filepath: str,
    load_args: Optional[Dict[str, Any]] = None,
    save_args: Optional[Dict[str, Any]] = None,
) -> None:
    """Create a ``BioSequenceLocalDataSet`` bound to a local sequence file.

    Args:
        filepath: path to sequence file
        load_args: Options for loading sequence files. Supported file
            formats are listed at: https://biopython.org/wiki/SeqIO
        save_args: args supported by Biopython are 'handle' and 'format'.
            Handle by default is equal to ``filepath``.
    """
    deprecation_warning(self.__class__.__name__)
    # Record the path, then hand the load/save options to the parent.
    self._filepath = filepath
    super().__init__(load_args, save_args)
def __init__(
    self,
    filepath: str,
    key: str,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Create an ``HDFLocalDataSet`` bound to a single local hdf file.

    Args:
        filepath: Path to an hdf file.
        key: Identifier to the group in the HDF store.
        load_args: Pandas options forwarded to ``pandas.read_hdf``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_hdf.html
            All defaults are preserved.
        save_args: Pandas options forwarded to ``DataFrame.to_hdf``. See
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_hdf.html
            All defaults are preserved.
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".
    """
    deprecation_warning(self.__class__.__name__)
    super().__init__(Path(filepath), version)
    self._key = key
    # Class defaults first, user overrides second.
    self._load_args = {**copy.deepcopy(self.DEFAULT_LOAD_ARGS), **(load_args or {})}
    self._save_args = {**copy.deepcopy(self.DEFAULT_SAVE_ARGS), **(save_args or {})}
def __init__(
    self,
    filepath: str,
    backend: str = "pickle",
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Create a ``PickleLocalDataSet`` bound to a single local pkl file.

    ``PickleLocalDataSet`` can serialise objects to disk with either of
    two backends:
    pickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump
    joblib.dump: https://pythonhosted.org/joblib/generated/joblib.dump.html
    and deserialise them with the matching loaders:
    pickle.load: https://docs.python.org/3/library/pickle.html#pickle.load
    joblib.load: https://pythonhosted.org/joblib/generated/joblib.load.html
    Joblib tends to exhibit better performance when objects store NumPy
    arrays:
    http://gael-varoquaux.info/programming/new_low-overhead_persistence_in_joblib_for_big_data.html.

    Args:
        filepath: path to a pkl file.
        backend: backend to use, must be one of ['pickle', 'joblib'].
        load_args: Options for loading pickle files. Refer to the help
            file of ``pickle.load`` or ``joblib.load`` for options.
        save_args: Options for saving pickle files. Refer to the help
            file of ``pickle.dump`` or ``joblib.dump`` for options.
        version: ``kedro.io.core.Version`` instance; a ``load`` of None
            means "load the latest version", a ``save`` of None means
            "autogenerate the save version".

    Raises:
        ValueError: If 'backend' is not one of ['pickle', 'joblib'].
        ImportError: If 'backend' could not be imported.
    """
    deprecation_warning(self.__class__.__name__)
    super().__init__(Path(filepath), version)
    # Validate the backend choice up front, before touching any state.
    if backend not in ["pickle", "joblib"]:
        raise ValueError(
            "backend should be one of ['pickle', 'joblib'], got %s" % backend
        )
    # joblib is an optional dependency; it is None when the import failed.
    if backend == "joblib" and joblib is None:
        raise ImportError(
            "selected backend 'joblib' could not be "
            "imported. Make sure it is installed."
        )
    self._backend = backend
    # Class defaults first, user overrides second.
    self._load_args = {**copy.deepcopy(self.DEFAULT_LOAD_ARGS), **(load_args or {})}
    self._save_args = {**copy.deepcopy(self.DEFAULT_SAVE_ARGS), **(save_args or {})}