def __init__(
    self, database: str, table: str, write_mode: str, table_pk: List[str] = None
) -> None:
    """Creates a new instance of ``SparkHiveDataSet``.

    Args:
        database: The name of the hive database.
        table: The name of the table within the database.
        write_mode: ``insert``, ``upsert`` or ``overwrite`` are supported.
        table_pk: If performing an upsert, this identifies the primary key columns
            used to resolve preexisting data. Is required for
            ``write_mode="upsert"``.

    Raises:
        DataSetError: Invalid configuration supplied.
    """
    valid_write_modes = ["insert", "upsert", "overwrite"]
    if write_mode not in valid_write_modes:
        valid_modes = ", ".join(valid_write_modes)
        raise DataSetError(
            f"Invalid `write_mode` provided: {write_mode}. "
            f"`write_mode` must be one of: {valid_modes}"
        )
    if write_mode == "upsert" and not table_pk:
        raise DataSetError("`table_pk` must be set to utilise `upsert` write mode")

    self._write_mode = write_mode
    self._table_pk = table_pk or []
    self._database = database
    self._table = table
    self._stage_table = "_temp_" + table

    # self._table_columns is set up in _save() to speed up initialization
    self._table_columns = []  # type: List[str]
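# A minimal usage sketch for the constructor above, assuming the class is
# exposed as kedro.extras.datasets.spark.SparkHiveDataSet; the database,
# table, and primary-key names are hypothetical.
from kedro.extras.datasets.spark import SparkHiveDataSet

hive_ds = SparkHiveDataSet(
    database="staging",        # hypothetical Hive database
    table="customers",         # hypothetical table name
    write_mode="upsert",
    table_pk=["customer_id"],  # required because write_mode="upsert"
)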
def __init__(
    self,
    table_name: str,
    credentials: Dict[str, Any],
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
) -> None:
    """Creates a new ``SQLTableDataSet``.

    Args:
        table_name: The table name to load or save data to. It overwrites name
            in ``save_args`` and ``table_name`` parameters in ``load_args``.
        credentials: A dictionary with a ``SQLAlchemy`` connection string.
            Users are supposed to provide the connection string 'con'
            through credentials. It overwrites the `con` parameter in
            ``load_args`` and ``save_args`` in case it is provided. To find
            all supported connection string formats, see here:
            https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
        load_args: Provided to underlying pandas ``read_sql_table``
            function along with the connection string.
            To find all supported arguments, see here:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html
            To find all supported connection string formats, see here:
            https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
        save_args: Provided to underlying pandas ``to_sql`` function along
            with the connection string.
            To find all supported arguments, see here:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html
            To find all supported connection string formats, see here:
            https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
            It has ``index=False`` in the default parameters.

    Raises:
        DataSetError: When either ``table_name`` or ``con`` is empty.
    """
    if not table_name:
        raise DataSetError("`table_name` argument cannot be empty.")

    if not (credentials and "con" in credentials and credentials["con"]):
        raise DataSetError(
            "`con` argument cannot be empty. Please "
            "provide a SQLAlchemy connection string."
        )

    # Handle default load and save arguments
    self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
    if load_args is not None:
        self._load_args.update(load_args)
    self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
    if save_args is not None:
        self._save_args.update(save_args)

    self._load_args["table_name"] = table_name
    self._save_args["name"] = table_name

    self._load_args["con"] = self._save_args["con"] = credentials["con"]
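# Hedged usage sketch for the constructor above, assuming the class is
# exposed as kedro.extras.datasets.pandas.SQLTableDataSet; the SQLite path
# is a hypothetical example connection string.
from kedro.extras.datasets.pandas import SQLTableDataSet

table_ds = SQLTableDataSet(
    table_name="shuttles",
    credentials={"con": "sqlite:///kedro.db"},
    save_args={"if_exists": "replace"},  # passed through to pandas to_sql
)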
def _get_missing_module_error(import_error: ImportError) -> DataSetError:
    missing_module_instruction = _find_known_drivers(import_error)

    if missing_module_instruction is None:
        return DataSetError(
            "{}Loading failed with error:\n\n{}".format(
                DRIVER_ERROR_MESSAGE, str(import_error)
            )
        )

    return DataSetError(
        "{}{}".format(DRIVER_ERROR_MESSAGE, missing_module_instruction)
    )
def _get_missing_module_error(import_error: ImportError) -> DataSetError:
    missing_module_instruction = _find_known_drivers(import_error)

    if missing_module_instruction is None:
        return DataSetError(
            f"{DRIVER_ERROR_MESSAGE}Loading failed with error:\n\n{str(import_error)}"
        )

    return DataSetError(f"{DRIVER_ERROR_MESSAGE}{missing_module_instruction}")
def _execute_request(self) -> requests.Response:
    try:
        response = requests.request(**self._request_args)
        response.raise_for_status()
    except requests.exceptions.HTTPError as exc:
        raise DataSetError("Failed to fetch data", exc) from exc
    except OSError as exc:
        raise DataSetError("Failed to connect to the remote server") from exc

    return response
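# Standalone sketch of the error-mapping pattern above: HTTP-level failures
# (surfaced by raise_for_status) become DataSetError, and lower-level
# connection failures are caught as OSError and wrapped too. The DataSetError
# class and URL here are stand-ins, not the library's own.
import requests

class DataSetError(Exception):
    pass

def fetch(url: str) -> requests.Response:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # turns 4xx/5xx responses into HTTPError
    except requests.exceptions.HTTPError as exc:
        raise DataSetError("Failed to fetch data") from exc
    except OSError as exc:  # requests' ConnectionError subclasses OSError
        raise DataSetError("Failed to connect to the remote server") from exc
    return response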
def __init__(
    self,
    table_name: str,
    credentials: Dict[str, Any],
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
) -> None:
    """Creates a new ``SQLTableDataSet``.

    Args:
        table_name: The table name to load or save data to. It overwrites name
            in ``save_args`` and ``table_name`` parameters in ``load_args``.
        credentials: A dictionary with a ``SQLAlchemy`` connection string.
            Users are supposed to provide the connection string 'con'
            through credentials. It overwrites the con parameter in
            ``load_args`` and ``save_args`` in case it is provided.
        load_args: Provided to underlying pandas ``read_sql_table``
            function along with the connection string.
            To find all supported arguments, see here:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html
        save_args: Provided to underlying pandas ``to_sql`` function along
            with the connection string.
            To find all supported arguments, see here:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html
            It has ``index=False`` in the default parameters.

    Raises:
        DataSetError: When either ``table_name`` or ``con`` is empty.
    """
    if not table_name:
        raise DataSetError("`table_name` argument cannot be empty.")

    if not (credentials and "con" in credentials and credentials["con"]):
        raise DataSetError(
            "`con` argument cannot be empty. Please "
            "provide a SQLAlchemy connection string."
        )

    default_save_args = {"index": False}
    default_load_args = {}

    self._load_args = (
        {**default_load_args, **load_args}
        if load_args is not None
        else default_load_args
    )
    self._save_args = (
        {**default_save_args, **save_args}
        if save_args is not None
        else default_save_args
    )

    self._load_args["table_name"] = table_name
    self._save_args["name"] = table_name

    self._load_args["con"] = self._save_args["con"] = credentials["con"]
def __init__(
    self,
    database: str,
    table: str,
    write_mode: str,
    table_pk: List[str] = None,
    layer: str = None,
) -> None:
    """Creates a new instance of ``SparkHiveDataSet``.

    Args:
        database: The name of the hive database.
        table: The name of the table within the database.
        write_mode: ``insert``, ``upsert`` or ``overwrite`` are supported.
        table_pk: If performing an upsert, this identifies the primary key columns
            used to resolve preexisting data. Is required for
            ``write_mode="upsert"``.
        layer: The data layer according to the data engineering convention:
            https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#what-is-data-engineering-convention

    Raises:
        DataSetError: Invalid configuration supplied.
    """
    self._database = database
    self._table = table
    self._layer = layer
    self._stage_table = "_temp_" + table
    self._valid_write_modes = ["insert", "upsert", "overwrite"]

    if write_mode not in self._valid_write_modes:
        raise DataSetError(
            "Invalid write_mode provided: {invalid}. "
            "Write_mode must be one of {valid}".format(
                invalid=write_mode, valid=self._valid_write_modes
            )
        )
    self._write_mode = write_mode

    if self._write_mode == "upsert" and not table_pk:
        raise DataSetError("table_pk must be set to utilise upsert write mode")
    self._table_pk = table_pk

    self._table_columns = self._load().columns if self._exists() else None

    if (
        self._table_pk
        and self._exists()
        and set(self._table_pk) - set(self._table_columns)
    ):
        raise DataSetError(
            "Columns [{colnames}] selected as primary key(s) not found in "
            "table {database}.{table}".format(
                colnames=", ".join(
                    sorted(set(self._table_pk) - set(self._table_columns))
                ),
                database=self._database,
                table=self._table,
            )
        )
def __init__(
    self,
    filepath: str,
    flavor: str,
    pyfunc_workflow: Optional[str] = None,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    version: Version = None,
) -> None:
    """Initialize the Kedro MlflowModelDataSet.

    Parameters are passed from the Data Catalog.

    During save, the model is first logged to MLflow.
    During load, the model is pulled from the MLflow run with `run_id`.

    Args:
        filepath (str): Path to store the dataset locally.
        flavor (str): Built-in or custom MLflow model flavor module.
            Must be Python-importable.
        pyfunc_workflow (str, optional): Either `python_model` or `loader_module`.
            See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows.
        load_args (Dict[str, Any], optional): Arguments to the `load_model`
            function of the specified `flavor`. Defaults to {}.
        save_args (Dict[str, Any], optional): Arguments to the `log_model`
            function of the specified `flavor`. Defaults to {}.
        version (Version, optional): Specific version to load.

    Raises:
        DataSetError: When the passed `flavor` does not exist.
    """
    super().__init__(Path(filepath), version)
    self._flavor = flavor
    self._pyfunc_workflow = pyfunc_workflow
    self._logging_activated = True  # by default, it should be True!

    if flavor == "mlflow.pyfunc" and pyfunc_workflow not in (
        "python_model",
        "loader_module",
    ):
        raise DataSetError(
            "PyFunc models require specifying `pyfunc_workflow` "
            "(set to either `python_model` or `loader_module`)"
        )

    self._load_args = load_args or {}
    self._save_args = save_args or {}

    try:
        # Access the property to trigger an ImportError early if the
        # requested flavor module cannot be imported.
        self._mlflow_model_module
    except ImportError as err:
        raise DataSetError(err) from err
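# Hedged usage sketch for the MLflow model dataset above; the import path is
# an assumption (kedro-mlflow has moved this class between modules), and the
# filepath/flavor values are hypothetical. The keyword arguments are the ones
# declared in the constructor above.
from kedro_mlflow.io import MlflowModelDataSet  # assumed import path

model_ds = MlflowModelDataSet(
    filepath="data/06_models/model",
    flavor="mlflow.sklearn",  # any importable MLflow flavor module
)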
def _execute_request(self):
    try:
        response = requests.get(self._fileurl, auth=self._auth_backend)
        response.raise_for_status()
    except requests.exceptions.HTTPError as exc:
        # pylint: disable=no-member
        if exc.response.status_code == requests.codes.NOT_FOUND:
            raise DataSetNotFoundError(
                "The server returned 404 for {}".format(self._fileurl)
            ) from exc
        raise DataSetError("Failed to fetch data") from exc
    except socket.error as exc:
        raise DataSetError("Failed to connect to the remote server") from exc

    return response
def __init__(
    self,
    load: Optional[Callable[[], Any]],
    save: Optional[Callable[[Any], None]],
    exists: Optional[Callable[[], bool]] = None,
):
    """Creates a new instance of ``LambdaDataSet`` with references to the
    required input/output data set methods.

    Args:
        load: Method to load data from a data set.
        save: Method to save data to a data set.
        exists: Method to check whether output data already exists.
            If None, no exists method is added.

    Raises:
        DataSetError: If load and/or save is specified, but is not a Callable.
    """
    if load is not None and not callable(load):
        raise DataSetError(
            "`load` function for LambdaDataSet must be a Callable. "
            "Object of type `{}` provided instead.".format(load.__class__.__name__)
        )
    if save is not None and not callable(save):
        raise DataSetError(
            "`save` function for LambdaDataSet must be a Callable. "
            "Object of type `{}` provided instead.".format(save.__class__.__name__)
        )

    self.__load = load
    self.__save = save
    if exists:
        self._exists = exists

    def _exists_with_error_handling():
        # wrapper around the exists method for error handling
        try:
            logging.getLogger(__name__).debug(
                "Checking whether target of %s exists", str(self)
            )
            return self._exists()
        except Exception as exc:
            message = "Failed during exists check for data set {}.\n{}".format(
                str(self), str(exc)
            )
            raise DataSetError(message) from exc

    self.exists = _exists_with_error_handling
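# Hedged usage sketch for LambdaDataSet, assuming it is importable from
# kedro.io; the CSV path and the pandas round-trip are hypothetical.
import pandas as pd
from kedro.io import LambdaDataSet

csv_ds = LambdaDataSet(
    load=lambda: pd.read_csv("data.csv"),
    save=lambda df: df.to_csv("data.csv", index=False),
)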
def __init__(
    self,
    sql: str,
    credentials: Dict[str, Any],
    load_args: Dict[str, Any] = None,
    layer: str = None,
) -> None:
    """Creates a new ``SQLQueryDataSet``.

    Args:
        sql: The sql query statement.
        credentials: A dictionary with a ``SQLAlchemy`` connection string.
            Users are supposed to provide the connection string 'con'
            through credentials. It overwrites the `con` parameter in
            ``load_args`` and ``save_args`` in case it is provided. To find
            all supported connection string formats, see here:
            https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
        load_args: Provided to underlying pandas ``read_sql_query``
            function along with the connection string.
            To find all supported arguments, see here:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html
            To find all supported connection string formats, see here:
            https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
        layer: The data layer according to the data engineering convention:
            https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#what-is-data-engineering-convention

    Raises:
        DataSetError: When either the ``sql`` or ``con`` parameter is empty.
    """
    if not sql:
        raise DataSetError(
            "`sql` argument cannot be empty. Please provide a sql query"
        )

    if not (credentials and "con" in credentials and credentials["con"]):
        raise DataSetError(
            "`con` argument cannot be empty. Please "
            "provide a SQLAlchemy connection string."
        )

    default_load_args = {}  # type: Dict[str, Any]

    self._load_args = (
        {**default_load_args, **load_args}
        if load_args is not None
        else default_load_args
    )

    self._layer = layer
    self._load_args["sql"] = sql
    self._load_args["con"] = credentials["con"]
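# Hedged usage sketch for the query dataset above, assuming the import path
# kedro.extras.datasets.pandas.SQLQueryDataSet; the query and connection
# string are hypothetical.
from kedro.extras.datasets.pandas import SQLQueryDataSet

query_ds = SQLQueryDataSet(
    sql="SELECT id, name FROM shuttles WHERE active = 1",
    credentials={"con": "sqlite:///kedro.db"},
)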
def _get_sql_alchemy_missing_error() -> DataSetError:
    return DataSetError(
        "The SQL dialect in your connection is not supported by "
        "SQLAlchemy. Please refer to "
        "https://docs.sqlalchemy.org/en/13/core/engines.html#supported-databases "
        "for more information."
    )
def _parse_checkpoint_config(
    self, checkpoint_config: Union[str, Dict[str, Any], None]
) -> Dict[str, Any]:
    checkpoint_config = deepcopy(checkpoint_config)
    if isinstance(checkpoint_config, str):
        checkpoint_config = {"force_checkpoint": checkpoint_config}
    checkpoint_config = checkpoint_config or {}

    for key in {VERSION_KEY, VERSIONED_FLAG_KEY} & checkpoint_config.keys():
        raise DataSetError(
            "`{}` does not support versioning of the checkpoint. "
            "Please remove `{}` key from the checkpoint definition.".format(
                self.__class__.__name__, key
            )
        )

    default_checkpoint_path = self._sep.join(
        [self._path.rstrip(self._sep), self.DEFAULT_CHECKPOINT_FILENAME]
    )
    default_config = {
        "type": self.DEFAULT_CHECKPOINT_TYPE,
        self._filepath_arg: default_checkpoint_path,
    }
    if self._credentials:
        default_config[CREDENTIALS_KEY] = deepcopy(self._credentials)

    if CREDENTIALS_KEY in default_config.keys() & checkpoint_config.keys():
        self._logger.warning(
            "Top-level credentials will not propagate into the checkpoint since "
            "credentials were explicitly defined in the checkpoint config."
        )

    return {**default_config, **checkpoint_config}
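# The parser above accepts either a bare string (treated as a forced
# checkpoint value) or a full config dict. A hedged sketch of both shapes,
# with hypothetical values:
checkpoint_as_string = "2020-01-01/data.csv"  # becomes {"force_checkpoint": ...}

checkpoint_as_dict = {
    "type": "pandas.CSVDataSet",               # overrides DEFAULT_CHECKPOINT_TYPE
    "filepath": "s3://bucket/raw/CHECKPOINT",  # overrides the default path
    # "versioned": True  # would raise DataSetError: checkpoints are unversioned
}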
def _load(self) -> DataFrame:
    if not self._exists():
        raise DataSetError(
            f"Requested table not found: {self._database}.{self._table}"
        )
    return self._get_spark().sql(
        f"select * from {self._database}.{self._table}"  # nosec
    )
def _save(self, data) -> None:
    if self._mutlifile_mode:
        if not os.path.isdir(self._filepath):
            os.makedirs(self._filepath)

        if isinstance(data, list):
            for index, plot in enumerate(data):
                plot.savefig(
                    os.path.join(self._filepath, str(index) + ".png"),
                    **self._save_args,
                )
        elif isinstance(data, dict):
            for plot_name, plot in data.items():
                plot.savefig(
                    os.path.join(self._filepath, plot_name + ".png"),
                    **self._save_args,
                )
        else:
            plot_type = type(data)
            raise DataSetError(
                "multiFile is True but data is neither a dict nor a list. "
                "Received type: {}".format(plot_type)
            )
    else:
        data.savefig(self._filepath, **self._save_args)
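# Hedged usage sketch for the matplotlib save logic above: in multi-file
# mode, dict keys become <name>.png files. The writer construction below is
# commented out because the exact constructor keyword is an assumption.
import matplotlib.pyplot as plt

plots = {}
for colour in ("red", "green", "blue"):
    fig = plt.figure()
    plt.plot([1, 2, 3], color=colour)
    plots[colour] = fig
# writer = MatplotlibWriter(filepath="output/plots", mutlifile_mode=True)  # hypothetical
# writer.save(plots)  # would write red.png, green.png, blue.png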
def _load(self) -> Any:
    if not self.__load:
        raise DataSetError(
            "Cannot load data set. No `load` function "
            "provided when LambdaDataSet was created."
        )
    return self.__load()
def _copy_with_mode(data: Any, copy_mode: str) -> Any:
    """Returns the copied data using the copy mode specified.
    If no copy mode is provided, then it is inferred based on the type of the data.

    Args:
        data: The data to copy.
        copy_mode: The copy mode to use, one of "deepcopy", "copy" and "assign".

    Raises:
        DataSetError: If copy_mode is specified, but isn't valid
            (i.e: not one of deepcopy, copy, assign).

    Returns:
        The data copied according to the specified copy mode.
    """
    if copy_mode == "deepcopy":
        copied_data = copy.deepcopy(data)
    elif copy_mode == "copy":
        copied_data = data.copy()
    elif copy_mode == "assign":
        copied_data = data
    else:
        raise DataSetError(
            f"Invalid copy mode: {copy_mode}. "
            f"Possible values are: deepcopy, copy, assign."
        )

    return copied_data
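# A quick standalone illustration of the three copy modes above, using a
# pandas DataFrame; the aliasing behaviour is the point.
import copy
import pandas as pd

original = pd.DataFrame({"a": [1, 2]})

deep = copy.deepcopy(original)  # "deepcopy": fully independent object
shallow = original.copy()       # "copy": a new object via the data's own copy()
alias = original                # "assign": same object, no copy at all

assert alias is original
assert shallow is not original and deep is not original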
def _save(self, data: Any) -> None:
    if not self.__save:
        raise DataSetError(
            "Cannot save to data set. No `save` function "
            "provided when LambdaDataSet was created."
        )
    self.__save(data)
def _execute_request(self) -> requests.Response:
    try:
        login_password = self.credentials
        connection_url = self._request_args["url"]
        token_url = urlunsplit(
            (
                urlsplit(connection_url).scheme,
                urlsplit(connection_url).netloc,
                "token",
                "",
                "",
            )
        )
        token_data = json.loads(
            requests.post(url=token_url, data=login_password, verify=True).content
        )
        response = requests.request(
            **self._request_args,
            headers={
                "Authorization": " ".join(
                    (token_data["token_type"], token_data["access_token"])
                )
            },
        )
        response.raise_for_status()
    except requests.exceptions.HTTPError as exc:
        raise DataSetError("Failed to fetch data", exc) from exc
    except socket.error as exc:
        raise DataSetError("Failed to connect to the remote server") from exc

    return response
def _load(self) -> DataFrame: if not self._exists(): raise DataSetError( "requested table not found: {database}.{table}".format( database=self._database, table=self._table)) return self._get_spark().sql( "select * from {database}.{table}".format( # nosec database=self._database, table=self._table))
def _load(self) -> Any:
    if self._data is _EMPTY:
        raise DataSetError("Data for MemoryDataSet has not been saved yet.")

    copy_mode = self._copy_mode or _infer_copy_mode(self._data)
    data = _copy_with_mode(self._data, copy_mode=copy_mode)

    return data
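# Hedged usage sketch for the MemoryDataSet load logic above, assuming the
# kedro.io.MemoryDataSet API; copy_mode values mirror _copy_with_mode.
import pandas as pd
from kedro.io import MemoryDataSet

mem_ds = MemoryDataSet(data=pd.DataFrame({"a": [1]}), copy_mode="assign")
df = mem_ds.load()  # with "assign", this is the same object that was saved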
def _save(self, data: pd.DataFrame) -> None:
    # pylint: disable=abstract-class-instantiated
    try:
        if self._exists():
            self._save_args["header"] = False
        data.to_csv(str(self._filepath), **self._save_args)
    except FileNotFoundError as exc:
        raise DataSetError(
            f"`{self._filepath}` CSV file not found. "
            f"The file cannot be opened in append mode."
        ) from exc
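# Standalone sketch of the append-with-header-suppression idea above: only
# write the header when the target file does not exist yet. The path is
# hypothetical.
import os
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
target = "log.csv"
df.to_csv(target, mode="a", header=not os.path.exists(target), index=False)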
def __init__(
    self, sql: str, credentials: Dict[str, Any], load_args: Dict[str, Any] = None
) -> None:
    """Creates a new ``SQLQueryDataSet``.

    Args:
        sql: The sql query statement.
        credentials: A dictionary with a ``SQLAlchemy`` connection string.
            Users are supposed to provide the connection string 'con'
            through credentials. It overwrites the con parameter in
            ``load_args`` and ``save_args`` in case it is provided.
        load_args: Provided to underlying pandas ``read_sql_query``
            function along with the connection string.
            To find all supported arguments, see here:
            https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html

    Raises:
        DataSetError: When either the ``sql`` or ``con`` parameter is empty.
    """
    if not sql:
        raise DataSetError(
            "`sql` argument cannot be empty. Please provide a sql query"
        )
    if not (credentials and "con" in credentials and credentials["con"]):
        raise DataSetError(
            "`con` argument cannot be empty. Please "
            "provide a SQLAlchemy connection string."
        )

    default_load_args = {}  # type: Dict[str, Any]

    self._load_args = (
        {**default_load_args, **load_args}
        if load_args is not None
        else default_load_args
    )

    self._load_args["sql"] = sql
    self._load_args["con"] = credentials["con"]
def _validate_location(self):
    save_location = self._save_args.get("location")
    load_location = self._load_args.get("location")

    if save_location != load_location:
        raise DataSetError(
            "`load_args['location']` is different from `save_args['location']`. "
            "The `location` defines where BigQuery data is stored, therefore has "
            "to be the same for save and load args. "
            "Details: https://cloud.google.com/bigquery/docs/locations"
        )
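# Hedged usage sketch for the location check above, assuming the dataset is
# kedro.extras.datasets.pandas.GBQTableDataSet and GCP credentials are
# available; project/dataset/table names are hypothetical. `location` must
# match between load_args and save_args.
from kedro.extras.datasets.pandas import GBQTableDataSet

gbq_ds = GBQTableDataSet(
    dataset="analytics",
    table_name="shuttles",
    project="my-gcp-project",
    load_args={"location": "EU"},
    save_args={"location": "EU"},  # a mismatch here raises DataSetError
)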
def _validate_save(self, data: DataFrame):
    hive_dtypes = set(self._load().dtypes)
    data_dtypes = set(data.dtypes)

    if data_dtypes != hive_dtypes:
        new_cols = data_dtypes - hive_dtypes
        missing_cols = hive_dtypes - data_dtypes
        raise DataSetError(
            f"Dataset does not match hive table schema.\n"
            f"Present on insert only: {sorted(new_cols)}\n"
            f"Present on schema only: {sorted(missing_cols)}"
        )
def _save(self, data: pd.DataFrame) -> None:
    # pylint: disable=abstract-class-instantiated
    try:
        with pd.ExcelWriter(str(self._filepath), **self._writer_args) as writer:
            data.to_excel(writer, **self._save_args)
    except FileNotFoundError as exc:
        raise DataSetError(
            f"`{self._filepath}` Excel file not found. The file cannot be "
            f"opened in append mode."
        ) from exc
def _exists_with_error_handling():
    # wrapper around the exists method for error handling
    try:
        logging.getLogger(__name__).debug(
            "Checking whether target of %s exists", str(self)
        )
        return self._exists()
    except Exception as exc:
        message = "Failed during exists check for data set {}.\n{}".format(
            str(self), str(exc)
        )
        raise DataSetError(message) from exc
def _handle_delta_format(self) -> None:
    supported_modes = {"append", "overwrite", "error", "errorifexists", "ignore"}
    write_mode = self._save_args.get("mode")

    if (
        write_mode
        and self._file_format == "delta"
        and write_mode not in supported_modes
    ):
        raise DataSetError(
            f"It is not possible to perform `save()` for file format `delta` "
            f"with mode `{write_mode}` on `SparkDataSet`. "
            f"Please use `spark.DeltaTableDataSet` instead."
        )
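# Hedged sketch of the rule above: with file_format="delta" on a
# SparkDataSet-style dataset, only these write modes pass validation.
supported = {"append", "overwrite", "error", "errorifexists", "ignore"}
save_args = {"mode": "overwrite"}  # accepted for delta
# save_args = {"mode": "merge"}    # would raise DataSetError; the message
#                                  # points to spark.DeltaTableDataSet instead
assert save_args["mode"] in supported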
def _load(self) -> Any:
    if self._data is None:
        raise DataSetError("Data for MemoryDataSet has not been saved yet.")

    if isinstance(self._data, (pd.DataFrame, np.ndarray)):
        data = self._data.copy()
    elif type(self._data).__name__ == "DataFrame":
        data = self._data
    else:
        data = copy.deepcopy(self._data)

    return data
def _save(self, data: Any) -> None:
    save_path = get_filepath_str(self._get_save_path(), self._protocol)

    with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
        try:
            self.BACKENDS[self._backend].dump(data, fs_file, **self._save_args)
        except Exception as exc:
            raise DataSetError(
                "{} was not serialized due to: {}".format(
                    str(data.__class__), str(exc)
                )
            ) from exc

    self._invalidate_cache()
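# Hedged usage sketch for the backend-driven save above, assuming a
# PickleDataSet-style class where BACKENDS maps names such as "pickle" or
# "joblib" to modules exposing dump/load; the filepath is hypothetical.
from kedro.extras.datasets.pickle import PickleDataSet

pickle_ds = PickleDataSet(filepath="data/model.pkl", backend="pickle")
pickle_ds.save({"weights": [0.1, 0.2]})  # dispatches to BACKENDS["pickle"].dump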