Example #1
    def __init__(
        self, database: str, table: str, write_mode: str, table_pk: List[str] = None
    ) -> None:
        """Creates a new instance of ``SparkHiveDataSet``.

        Args:
            database: The name of the hive database.
            table: The name of the table within the database.
            write_mode: ``insert``, ``upsert`` or ``overwrite`` are supported.
            table_pk: If performing an upsert, this identifies the primary key columns used to
                resolve preexisting data. Required when ``write_mode="upsert"``.

        Raises:
            DataSetError: Invalid configuration supplied
        """
        valid_write_modes = ["insert", "upsert", "overwrite"]
        if write_mode not in valid_write_modes:
            valid_modes = ", ".join(valid_write_modes)
            raise DataSetError(
                f"Invalid `write_mode` provided: {write_mode}. "
                f"`write_mode` must be one of: {valid_modes}"
            )
        if write_mode == "upsert" and not table_pk:
            raise DataSetError("`table_pk` must be set to utilise `upsert` read mode")

        self._write_mode = write_mode
        self._table_pk = table_pk or []
        self._database = database
        self._table = table
        self._stage_table = "_temp_" + table

        # self._table_columns is set up in _save() to speed up initialization
        self._table_columns = []  # type: List[str]
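For context, a minimal usage sketch of the constructor above; the database, table, and primary-key names are invented placeholders:

data_set = SparkHiveDataSet(
    database="staging",
    table="orders",
    write_mode="upsert",
    table_pk=["order_id"],  # required because write_mode is "upsert"
)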
Example #2
    def __init__(
        self,
        table_name: str,
        credentials: Dict[str, Any],
        load_args: Dict[str, Any] = None,
        save_args: Dict[str, Any] = None,
    ) -> None:
        """Creates a new ``SQLTableDataSet``.

        Args:
            table_name: The table name to load or save data to. It
                overwrites the ``name`` parameter in ``save_args`` and the
                ``table_name`` parameter in ``load_args``.
            credentials: A dictionary with a ``SQLAlchemy`` connection string.
                Users are expected to provide the connection string under
                the 'con' key. It overwrites the ``con`` parameter in
                ``load_args`` and ``save_args`` if it is provided. To find
                all supported connection string formats, see here:
                https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
            load_args: Provided to underlying pandas ``read_sql_table``
                function along with the connection string.
                To find all supported arguments, see here:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html
                To find all supported connection string formats, see here:
                https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
            save_args: Provided to underlying pandas ``to_sql`` function along
                with the connection string.
                To find all supported arguments, see here:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html
                To find all supported connection string formats, see here:
                https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
                It has ``index=False`` in the default parameters.

        Raises:
            DataSetError: When either ``table_name`` or ``con`` is empty.
        """

        if not table_name:
            raise DataSetError("`table_name` argument cannot be empty.")

        if not (credentials and "con" in credentials and credentials["con"]):
            raise DataSetError(
                "`con` argument cannot be empty. Please "
                "provide a SQLAlchemy connection string."
            )

        # Handle default load and save arguments
        self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
        if load_args is not None:
            self._load_args.update(load_args)
        self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
        if save_args is not None:
            self._save_args.update(save_args)

        self._load_args["table_name"] = table_name
        self._save_args["name"] = table_name

        self._load_args["con"] = self._save_args["con"] = credentials["con"]
Example #3
def _get_missing_module_error(import_error: ImportError) -> DataSetError:
    missing_module_instruction = _find_known_drivers(import_error)

    if missing_module_instruction is None:
        return DataSetError("{}Loading failed with error:\n\n{}".format(
            DRIVER_ERROR_MESSAGE, str(import_error)))

    return DataSetError("{}{}".format(DRIVER_ERROR_MESSAGE,
                                      missing_module_instruction))
Example #4
def _get_missing_module_error(import_error: ImportError) -> DataSetError:
    missing_module_instruction = _find_known_drivers(import_error)

    if missing_module_instruction is None:
        return DataSetError(
            f"{DRIVER_ERROR_MESSAGE}Loading failed with error:\n\n{str(import_error)}"
        )

    return DataSetError(f"{DRIVER_ERROR_MESSAGE}{missing_module_instruction}")
Example #5
    def _execute_request(self) -> requests.Response:
        try:
            response = requests.request(**self._request_args)
            response.raise_for_status()
        except requests.exceptions.HTTPError as exc:
            raise DataSetError("Failed to fetch data", exc) from exc
        except OSError as exc:
            raise DataSetError("Failed to connect to the remote server") from exc

        return response
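A sketch of the shape `self._request_args` plausibly takes (an assumption; the example never shows it): plain keyword arguments for requests.request().

request_args = {"method": "GET", "url": "https://example.com/data.csv"}
response = requests.request(**request_args)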
Example #6
    def __init__(
        self,
        table_name: str,
        credentials: Dict[str, Any],
        load_args: Dict[str, Any] = None,
        save_args: Dict[str, Any] = None,
    ) -> None:
        """Creates a new ``SQLTableDataSet``.

        Args:
            table_name: The table name to load or save data to. It
                overwrites the ``name`` parameter in ``save_args`` and the
                ``table_name`` parameter in ``load_args``.
            credentials: A dictionary with a ``SQLAlchemy`` connection string.
                Users are expected to provide the connection string under
                the 'con' key. It overwrites the ``con`` parameter in
                ``load_args`` and ``save_args`` if it is provided.
            load_args: Provided to underlying pandas ``read_sql_table``
                function along with the connection string.
                To find all supported arguments, see here:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html
            save_args: Provided to underlying pandas ``to_sql`` function along
                with the connection string.
                To find all supported arguments, see here:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html
                It has ``index=False`` in the default parameters.

        Raises:
            DataSetError: When either ``table_name`` or ``con`` is empty.

        """

        if not table_name:
            raise DataSetError("`table_name` argument cannot be empty.")

        if not (credentials and "con" in credentials and credentials["con"]):
            raise DataSetError("`con` argument cannot be empty. Please "
                               "provide a SQLAlchemy connection string.")

        default_save_args = {"index": False}
        default_load_args = {}

        self._load_args = ({
            **default_load_args,
            **load_args
        } if load_args is not None else default_load_args)
        self._save_args = ({
            **default_save_args,
            **save_args
        } if save_args is not None else default_save_args)

        self._load_args["table_name"] = table_name
        self._save_args["name"] = table_name

        self._load_args["con"] = self._save_args["con"] = credentials["con"]
Example #7
    def __init__(
        self,
        database: str,
        table: str,
        write_mode: str,
        table_pk: List[str] = None,
        layer: str = None,
    ) -> None:
        """Creates a new instance of ``SparkHiveDataSet``.

        Args:
            database: The name of the hive database.
            table: The name of the table within the database.
            write_mode: ``insert``, ``upsert`` or ``overwrite`` are supported.
            table_pk: If performing an upsert, this identifies the primary key columns used to
                resolve preexisting data. Required when ``write_mode="upsert"``.
            layer: The data layer according to the data engineering convention:
                https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#what-is-data-engineering-convention

        Raises:
            DataSetError: Invalid configuration supplied
        """
        self._database = database
        self._table = table
        self._layer = layer
        self._stage_table = "_temp_" + table
        self._valid_write_modes = ["insert", "upsert", "overwrite"]
        if write_mode not in self._valid_write_modes:
            raise DataSetError(
                "Invalid `write_mode` provided: {invalid}. "
                "`write_mode` must be one of {valid}".format(
                    invalid=write_mode, valid=self._valid_write_modes
                )
            )
        self._write_mode = write_mode
        if self._write_mode == "upsert" and not table_pk:
            raise DataSetError("table_pk must be set to utilise upsert read mode")
        self._table_pk = table_pk

        self._table_columns = self._load().columns if self._exists() else None

        if (
            self._table_pk
            and self._exists()
            and set(self._table_pk) - set(self._table_columns)
        ):
            raise DataSetError(
                "columns [{colnames}] selected as PK not found in table {database}.{table}".format(
                    colnames=", ".join(
                        sorted(set(self._table_pk) - set(self._table_columns))
                    ),
                    database=self._database,
                    table=self._table,
                )
            )
Example #8
    def __init__(
        self,
        filepath: str,
        flavor: str,
        pyfunc_workflow: Optional[str] = None,
        load_args: Dict[str, Any] = None,
        save_args: Dict[str, Any] = None,
        version: Version = None,
    ) -> None:
        """Initialize the Kedro MlflowModelDataSet.

        Parameters are passed from the Data Catalog.

        During save, the model is first logged to MLflow.
        During load, the model is pulled from MLflow run with `run_id`.

        Args:
            filepath (str): Path to store the dataset locally.
            flavor (str): Built-in or custom MLflow model flavor module.
                Must be Python-importable.
            pyfunc_workflow (str, optional): Either `python_model` or `loader_module`.
                See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows.
            load_args (Dict[str, Any], optional): Arguments to `load_model`
                function from specified `flavor`. Defaults to {}.
            save_args (Dict[str, Any], optional): Arguments to `log_model`
                function from specified `flavor`. Defaults to {}.
            version (Version, optional): Specific version to load.

        Raises:
            DataSetError: When passed `flavor` does not exist.
        """

        super().__init__(Path(filepath), version)

        self._flavor = flavor
        self._pyfunc_workflow = pyfunc_workflow
        self._logging_activated = True  # by default, it should be True!

        if flavor == "mlflow.pyfunc" and pyfunc_workflow not in (
                "python_model",
                "loader_module",
        ):
            raise DataSetError(
                "PyFunc models require specifying `pyfunc_workflow` "
                "(set to either `python_model` or `loader_module`)")

        self._load_args = load_args or {}
        self._save_args = save_args or {}

        try:
            # accessing this attribute imports the flavor module, so a missing
            # dependency surfaces here rather than at save/load time
            self._mlflow_model_module  # pylint: disable=pointless-statement
        except ImportError as err:
            raise DataSetError(err) from err
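A minimal usage sketch for the constructor above; the class name is taken from the docstring, and the filepath, flavor, and save_args values are placeholders:

data_set = MlflowModelDataSet(
    filepath="data/06_models/model",
    flavor="mlflow.sklearn",
    save_args={"registered_model_name": "my_model"},  # forwarded to log_model
)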
Example #9
    def _execute_request(self):
        try:
            response = requests.get(self._fileurl, auth=self._auth_backend)
            response.raise_for_status()
        except requests.exceptions.HTTPError as exc:
            # pylint: disable=no-member
            if exc.response.status_code == requests.codes.NOT_FOUND:
                raise DataSetNotFoundError(
                    "The server returned 404 for {}".format(self._fileurl)
                ) from exc
            raise DataSetError("Failed to fetch data") from exc
        except socket.error as exc:
            raise DataSetError("Failed to connect to the remote server") from exc

        return response
Example #10
    def __init__(
        self,
        load: Optional[Callable[[], Any]],
        save: Optional[Callable[[Any], None]],
        exists: Optional[Callable[[], bool]] = None,
    ):
        """Creates a new instance of ``LambdaDataSet`` with references to the
        required input/output data set methods.

        Args:
            load: Method to load data from a data set.
            save: Method to save data to a data set.
            exists: Method to check whether output data already exists.
                If None, no exists method is added.

        Raises:
            DataSetError: If load and/or save is specified, but is not a Callable.

        """

        if load is not None and not callable(load):
            raise DataSetError(
                "`load` function for LambdaDataSet must be a Callable. "
                "Object of type `{}` provided instead.".format(
                    load.__class__.__name__))
        if save is not None and not callable(save):
            raise DataSetError(
                "`save` function for LambdaDataSet must be a Callable. "
                "Object of type `{}` provided instead.".format(
                    save.__class__.__name__))

        self.__load = load
        self.__save = save

        if exists:
            self._exists = exists

            def _exists_with_error_handling():
                # wrapper around exists method for error handling
                try:
                    logging.getLogger(__name__).debug(
                        "Checking whether target of %s exists", str(self))
                    return self._exists()
                except Exception as exc:
                    message = "Failed during exists check for data set {}.\n{}".format(
                        str(self), str(exc))
                    raise DataSetError(message) from exc

            self.exists = _exists_with_error_handling
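A minimal sketch of wiring callables into the constructor above; the file path is a placeholder:

import pandas as pd

data_set = LambdaDataSet(
    load=lambda: pd.read_csv("data.csv"),
    save=lambda df: df.to_csv("data.csv", index=False),
)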
Example #11
    def __init__(
        self,
        sql: str,
        credentials: Dict[str, Any],
        load_args: Dict[str, Any] = None,
        layer: str = None,
    ) -> None:
        """Creates a new ``SQLQueryDataSet``.

        Args:
            sql: The sql query statement.
            credentials: A dictionary with a ``SQLAlchemy`` connection string.
                Users are expected to provide the connection string under
                the 'con' key. It overwrites the ``con`` parameter in
                ``load_args`` and ``save_args`` if it is provided. To find
                all supported connection string formats, see here:
                https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
            load_args: Provided to underlying pandas ``read_sql_query``
                function along with the connection string.
                To find all supported arguments, see here:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html
                To find all supported connection string formats, see here:
                https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
            layer: The data layer according to the data engineering convention:
                https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#what-is-data-engineering-convention

        Raises:
            DataSetError: When either the ``sql`` or ``con`` parameter is empty.
        """

        if not sql:
            raise DataSetError(
                "`sql` argument cannot be empty. Please provide a sql query")

        if not (credentials and "con" in credentials and credentials["con"]):
            raise DataSetError("`con` argument cannot be empty. Please "
                               "provide a SQLAlchemy connection string.")

        default_load_args = {}  # type: Dict[str, Any]

        self._load_args = ({
            **default_load_args,
            **load_args
        } if load_args is not None else default_load_args)

        self._layer = layer
        self._load_args["sql"] = sql
        self._load_args["con"] = credentials["con"]
Example #12
def _get_sql_alchemy_missing_error() -> DataSetError:
    return DataSetError(
        "The SQL dialect in your connection is not supported by "
        "SQLAlchemy. Please refer to "
        "https://docs.sqlalchemy.org/en/13/core/engines.html#supported-databases "
        "for more information."
    )
Example #13
    def _parse_checkpoint_config(
        self, checkpoint_config: Union[str, Dict[str, Any], None]
    ) -> Dict[str, Any]:
        checkpoint_config = deepcopy(checkpoint_config)
        if isinstance(checkpoint_config, str):
            checkpoint_config = {"force_checkpoint": checkpoint_config}
        checkpoint_config = checkpoint_config or {}

        for key in {VERSION_KEY, VERSIONED_FLAG_KEY} & checkpoint_config.keys():
            raise DataSetError(
                "`{}` does not support versioning of the checkpoint. "
                "Please remove `{}` key from the checkpoint definition.".format(
                    self.__class__.__name__, key
                )
            )

        default_checkpoint_path = self._sep.join(
            [self._path.rstrip(self._sep), self.DEFAULT_CHECKPOINT_FILENAME])
        default_config = {
            "type": self.DEFAULT_CHECKPOINT_TYPE,
            self._filepath_arg: default_checkpoint_path,
        }
        if self._credentials:
            default_config[CREDENTIALS_KEY] = deepcopy(self._credentials)

        if CREDENTIALS_KEY in default_config.keys() & checkpoint_config.keys():
            self._logger.warning(
                "Top-level credentials will not propagate into the checkpoint since "
                "credentials were explicitly defined in the checkpoint config."
            )

        return {**default_config, **checkpoint_config}
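Two shapes the parser above accepts, sketched with invented values (and assuming the dataset's filepath argument is named "filepath"): a bare string becomes {"force_checkpoint": ...}, while a dict is merged over the defaults.

checkpoint_as_string = "2020-01-01T00.00.00"
checkpoint_as_dict = {
    "type": "pandas.CSVDataSet",  # overrides DEFAULT_CHECKPOINT_TYPE
    "filepath": "s3://bucket/raw/_checkpoint",
}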
Example #14
    def _load(self) -> DataFrame:
        if not self._exists():
            raise DataSetError(
                f"Requested table not found: {self._database}.{self._table}"
            )
        return self._get_spark().sql(
            f"select * from {self._database}.{self._table}"  # nosec
        )
Example #15
    def _save(self, data) -> None:

        if self._mutlifile_mode:

            if not os.path.isdir(self._filepath):
                os.makedirs(self._filepath)

            if isinstance(data, list):
                for index, plot in enumerate(data):
                    plot.savefig(
                        os.path.join(self._filepath, str(index) + '.png'), **self._save_args
                    )

            elif isinstance(data, dict):
                for plot_name, plot in data.items():
                    plot.savefig(
                        os.path.join(self._filepath, plot_name + '.png'), **self._save_args
                    )

            else:
                raise DataSetError(
                    "multiFile is True but data type is neither dict nor "
                    "list; got {} instead".format(type(data))
                )

        else:
            data.savefig(self._filepath, **self._save_args)
Example #16
    def _load(self) -> Any:
        if not self.__load:
            raise DataSetError(
                "Cannot load data set. No `load` function "
                "provided when LambdaDataSet was created."
            )
        return self.__load()
Example #17
def _copy_with_mode(data: Any, copy_mode: str) -> Any:
    """Returns the copied data using the copy mode specified.
    If no copy mode is provided, then it is inferred based on the type of the data.

    Args:
        data: The data to copy.
        copy_mode: The copy mode to use, one of "deepcopy", "copy" and "assign".

    Raises:
        DataSetError: If copy_mode is specified but isn't valid
            (i.e. not one of deepcopy, copy, assign).

    Returns:
        The data copied according to the specified copy mode.
    """
    if copy_mode == "deepcopy":
        copied_data = copy.deepcopy(data)
    elif copy_mode == "copy":
        copied_data = data.copy()
    elif copy_mode == "assign":
        copied_data = data
    else:
        raise DataSetError(f"Invalid copy mode: {copy_mode}. "
                           f"Possible values are: deepcopy, copy, assign.")

    return copied_data
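A quick illustration of the three modes on a list (values chosen here for demonstration):

original = [1, 2, 3]
assert _copy_with_mode(original, "assign") is original        # same object
assert _copy_with_mode(original, "copy") == original          # shallow copy via .copy()
assert _copy_with_mode(original, "deepcopy") is not original  # recursive copy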
Example #18
    def _save(self, data: Any) -> None:
        if not self.__save:
            raise DataSetError(
                "Cannot save to data set. No `save` function "
                "provided when LambdaDataSet was created."
            )
        self.__save(data)
Example #19
    def _execute_request(self) -> requests.Response:
        try:
            login_password = self.credentials
            connection_url = self._request_args["url"]
            split_url = urlsplit(connection_url)
            token_url = urlunsplit(
                (split_url.scheme, split_url.netloc, "token", "", "")
            )
            token_data = json.loads(
                requests.post(url=token_url, data=login_password, verify=True).content
            )

            response = requests.request(
                **self._request_args,
                headers={
                    "Authorization": " ".join(
                        (token_data["token_type"], token_data["access_token"])
                    )
                },
            )
            response.raise_for_status()
        except requests.exceptions.HTTPError as exc:
            raise DataSetError("Failed to fetch data", exc) from exc
        except socket.error as exc:
            raise DataSetError("Failed to connect to the remote server") from exc

        return response
Example #20
    def _load(self) -> DataFrame:
        if not self._exists():
            raise DataSetError(
                "requested table not found: {database}.{table}".format(
                    database=self._database, table=self._table
                )
            )
        return self._get_spark().sql(
            "select * from {database}.{table}".format(  # nosec
                database=self._database, table=self._table
            )
        )
Example #21
    def _load(self) -> Any:
        if self._data is _EMPTY:
            raise DataSetError(
                "Data for MemoryDataSet has not been saved yet.")

        copy_mode = self._copy_mode or _infer_copy_mode(self._data)
        data = _copy_with_mode(self._data, copy_mode=copy_mode)
        return data
Example #22
    def _save(self, data: pd.DataFrame) -> None:
        # pylint: disable=abstract-class-instantiated
        try:
            if self._exists():
                self._save_args["header"] = False
            data.to_csv(str(self._filepath), **self._save_args)
        except FileNotFoundError as exc:
            raise DataSetError(
                f"`{self._filepath}` CSV file not found. The file cannot be "
                f"opened in append mode."
            ) from exc
Example #23
    def __init__(
        self, sql: str, credentials: Dict[str, Any], load_args: Dict[str, Any] = None
    ) -> None:
        """Creates a new ``SQLQueryDataSet``.

        Args:
            sql: The sql query statement.
            credentials: A dictionary with a ``SQLAlchemy`` connection string.
                Users are expected to provide the connection string under
                the 'con' key. It overwrites the ``con`` parameter in
                ``load_args`` and ``save_args`` if it is provided.
            load_args: Provided to underlying pandas ``read_sql_query``
                function along with the connection string.
                To find all supported arguments, see here:
                https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html

        Raises:
            DataSetError: When either the ``sql`` or ``con`` parameter is empty.

        """

        if not sql:
            raise DataSetError(
                "`sql` argument cannot be empty. Please provide a sql query"
            )

        if not (credentials and "con" in credentials and credentials["con"]):
            raise DataSetError(
                "`con` argument cannot be empty. Please "
                "provide a SQLAlchemy connection string."
            )

        default_load_args = {}  # type: Dict[str, Any]

        self._load_args = (
            {**default_load_args, **load_args}
            if load_args is not None
            else default_load_args
        )

        self._load_args["sql"] = sql

        self._load_args["con"] = credentials["con"]
Example #24
    def _validate_location(self):
        save_location = self._save_args.get("location")
        load_location = self._load_args.get("location")

        if save_location != load_location:
            raise DataSetError(
                "`load_args['location']` is different from `save_args['location']`. "
                "The `location` defines where BigQuery data is stored, therefore has "
                "to be the same for save and load args. "
                "Details: https://cloud.google.com/bigquery/docs/locations")
Example #25
    def _validate_save(self, data: DataFrame):
        hive_dtypes = set(self._load().dtypes)
        data_dtypes = set(data.dtypes)
        if data_dtypes != hive_dtypes:
            new_cols = data_dtypes - hive_dtypes
            missing_cols = hive_dtypes - data_dtypes
            raise DataSetError(
                f"Dataset does not match hive table schema.\n"
                f"Present on insert only: {sorted(new_cols)}\n"
                f"Present on schema only: {sorted(missing_cols)}"
            )
Example #26
    def _save(self, data: pd.DataFrame) -> None:
        # pylint: disable=abstract-class-instantiated
        try:
            with pd.ExcelWriter(str(self._filepath), **self._writer_args) as writer:
                data.to_excel(writer, **self._save_args)
        except FileNotFoundError as exc:
            raise DataSetError(
                f"`{self._filepath}` Excel file not found. The file cannot be "
                f"opened in append mode."
            ) from exc
Example #27
    def _exists_with_error_handling():
        # wrapper around exists method for error handling
        try:
            logging.getLogger(__name__).debug(
                "Checking whether target of %s exists", str(self)
            )
            return self._exists()
        except Exception as exc:
            message = "Failed during exists check for data set {}.\n{}".format(
                str(self), str(exc)
            )
            raise DataSetError(message) from exc
Example #28
    def _handle_delta_format(self) -> None:
        supported_modes = {"append", "overwrite", "error", "errorifexists", "ignore"}
        write_mode = self._save_args.get("mode")
        if (
            write_mode
            and self._file_format == "delta"
            and write_mode not in supported_modes
        ):
            raise DataSetError(
                f"It is not possible to perform `save()` for file format `delta` "
                f"with mode `{write_mode}` on `SparkDataSet`. "
                f"Please use `spark.DeltaTableDataSet` instead."
            )
Example #29
    def _load(self) -> Any:
        if self._data is None:
            raise DataSetError("Data for MemoryDataSet has not been saved yet.")
        if isinstance(self._data, (pd.DataFrame, np.ndarray)):
            data = self._data.copy()
        elif type(self._data).__name__ == "DataFrame":
            # a non-pandas DataFrame (e.g. Spark) is returned by assignment
            data = self._data
        else:
            data = copy.deepcopy(self._data)
        return data
Example #30
    def _save(self, data: Any) -> None:
        save_path = get_filepath_str(self._get_save_path(), self._protocol)

        with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
            try:
                self.BACKENDS[self._backend].dump(data, fs_file,
                                                  **self._save_args)
            except Exception as exc:
                raise DataSetError("{} was not serialized due to: {}".format(
                    str(data.__class__), str(exc))) from exc

        self._invalidate_cache()
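A plausible shape for the BACKENDS registry used above (an assumption; the mapping itself is not shown): serialization modules that expose a dump(obj, file, **kwargs) interface.

import pickle

BACKENDS = {"pickle": pickle}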