Example #1
    def validate_schema(self, data, **kwargs):
        # If a class_name begins with the dollar sign ("$"), then it is assumed to be a variable name to be substituted.
        if data["class_name"][0] == "$":
            return
        if ("default_regex" in data) and not (data["class_name"] in [
                "InferredAssetFilesystemDataConnector",
                "ConfiguredAssetFilesystemDataConnector",
                "InferredAssetS3DataConnector",
                "ConfiguredAssetS3DataConnector",
        ]):
            raise ge_exceptions.InvalidConfigError(
                f"""Your current configuration uses one or more keys in a data connector that are required only by a
subclass of the FilePathDataConnector class (your data connector is "{data['class_name']}").  Please update your
configuration to continue.
                """)
        if ("glob_directive" in data) and not (data["class_name"] in [
                "InferredAssetFilesystemDataConnector",
                "ConfiguredAssetFilesystemDataConnector",
        ]):
            raise ge_exceptions.InvalidConfigError(
                f"""Your current configuration uses one or more keys in a data connector that are required only by a
filesystem-type data connector (your data connector is "{data['class_name']}").  Please update your
configuration to continue.
                """)
        if ("bucket" in data or "prefix" in data or "delimiter" in data
                or "max_keys" in data) and not (data["class_name"] in [
                    "InferredAssetS3DataConnector",
                    "ConfiguredAssetS3DataConnector",
                ]):
            raise ge_exceptions.InvalidConfigError(
                f"""Your current configuration uses one or more keys in a data connector that are required only by an
S3-type data connector (your data connector is "{data['class_name']}").  Please update your configuration to
continue.
                """)
        if ("data_asset_name_prefix" in data
                or "data_asset_name_suffix" in data
                or "include_schema_name" in data or "splitter_method" in data
                or "splitter_kwargs" in data or "sampling_method" in data
                or "sampling_kwargs" in data or "excluded_tables" in data
                or "included_tables" in data or "skip_inapplicable_tables"
                in data) and not (data["class_name"] in [
                    "InferredAssetSqlDataConnector",
                    "ConfiguredAssetSqlDataConnector",
                ]):
            raise ge_exceptions.InvalidConfigError(
                f"""Your current configuration uses one or more keys in a data connector that are required only by an
SQL-type data connector (your data connector is "{data['class_name']}").  Please update your configuration to
continue.
                """)
Example #2
    def __init__(self,
                 credentials,
                 table_name,
                 key_columns,
                 fixed_length_key=True):
        super().__init__(fixed_length_key=fixed_length_key)
        if not sqlalchemy:
            raise ge_exceptions.DataContextError(
                "ModuleNotFoundError: No module named 'sqlalchemy'")

        if not self.fixed_length_key:
            raise ge_exceptions.InvalidConfigError(
                "DatabaseStoreBackend requires use of a fixed-length-key")

        drivername = credentials.pop("drivername")
        options = URL(drivername, **credentials)
        self.engine = create_engine(options)

        meta = MetaData()
        self.key_columns = key_columns
        # Dynamically construct a SQLAlchemy table with the name and column names we'll use
        cols = []
        for column in key_columns:
            if column == "value":
                raise ge_exceptions.InvalidConfigError(
                    "'value' cannot be used as a key_element name")
            cols.append(Column(column, String, primary_key=True))
        cols.append(Column("value", String))
        try:
            table = Table(table_name,
                          meta,
                          autoload=True,
                          autoload_with=self.engine)
            # We do a "light" check: if the columns' names match, we will proceed, otherwise, create the table
            if {str(col.name).lower() for col in table.columns} != (
                set(key_columns) | {"value"}
            ):
                raise ge_exceptions.StoreBackendError(
                    f"Unable to use table {table_name}: it exists, but does not have the expected schema."
                )
        except NoSuchTableError:
            table = Table(table_name, meta, *cols)
            try:
                meta.create_all(self.engine)
            except SQLAlchemyError as e:
                raise ge_exceptions.StoreBackendError(
                    f"Unable to connect to table {table_name} because of an error. It is possible your table needs to be migrated to a new schema.  SqlAlchemyError: {str(e)}"
                )
        self._table = table
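A hedged instantiation sketch (all credential values and names below are placeholders): "drivername" is popped from the credentials dict and the remaining keys feed SQLAlchemy's URL.

# Hypothetical usage; each key column becomes a String primary-key column, plus a "value" column.
backend = DatabaseStoreBackend(
    credentials={
        "drivername": "postgresql",
        "username": "ge_user",
        "password": "***",
        "host": "localhost",
        "port": "5432",
        "database": "metadata",
    },
    table_name="ge_store",
    key_columns=["expectation_suite_name", "run_id"],
)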
Example #3
    def __init__(
        self,
        name,
        data_context,
        target_store,
        source_store_name,
        custom_styles_directory=None,
        show_how_to_buttons=True,
        run_id_filter=None,
        validation_results_limit=None,
        renderer=None,
        view=None,
        data_context_id=None,
        **kwargs,
    ):
        self.name = name
        self.source_store = data_context.stores[source_store_name]
        self.target_store = target_store
        self.run_id_filter = run_id_filter
        self.validation_results_limit = validation_results_limit
        self.data_context_id = data_context_id
        self.show_how_to_buttons = show_how_to_buttons

        if renderer is None:
            raise exceptions.InvalidConfigError(
                "SiteSectionBuilder requires a renderer configuration with a class_name key."
            )
        module_name = (
            renderer.get("module_name") or "great_expectations.render.renderer"
        )
        self.renderer_class = instantiate_class_from_config(
            config=renderer,
            runtime_environment={"data_context": data_context},
            config_defaults={"module_name": module_name},
        )
        if not self.renderer_class:
            raise exceptions.ClassInstantiationError(
                module_name=module_name,
                package_name=None,
                class_name=renderer["class_name"],
            )

        module_name = "great_expectations.render.view"
        if view is None:
            view = {
                "module_name": module_name,
                "class_name": "DefaultJinjaPageView",
            }
        module_name = view.get("module_name") or module_name
        self.view_class = instantiate_class_from_config(
            config=view,
            runtime_environment={"custom_styles_directory": custom_styles_directory},
            config_defaults={"module_name": module_name},
        )
        if not self.view_class:
            raise exceptions.ClassInstantiationError(
                module_name=view["module_name"],
                package_name=None,
                class_name=view["class_name"],
            )
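A sketch of the two config dicts this constructor consumes (class names here are illustrative); when "module_name" is omitted, the defaults wired in above are used.

renderer_config = {"class_name": "ValidationResultsPageRenderer"}  # resolved in great_expectations.render.renderer
view_config = {"class_name": "DefaultJinjaPageView"}  # same value the constructor falls back to when view is None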
Example #4
    def __init__(
        self,
        credentials,
        queries=None,
        store_backend=None,
        runtime_environment=None,
        store_name=None,
    ) -> None:
        if not sqlalchemy:
            raise ge_exceptions.DataContextError(
                "sqlalchemy module not found, but is required for "
                "SqlAlchemyQueryStore"
            )
        super().__init__(
            store_backend=store_backend,
            runtime_environment=runtime_environment,
            store_name=store_name,
        )
        if queries:
            # If queries are defined in configuration, then we load them into an InMemoryStoreBackend
            try:
                assert isinstance(
                    queries, dict
                ), "SqlAlchemyQueryStore queries must be defined as a dictionary"
                assert (
                    store_backend is None
                    or store_backend["class_name"] == "InMemoryStoreBackend"
                ), (
                    "If queries are provided in configuration, then store_backend must be empty or an "
                    "InMemoryStoreBackend"
                )
                for k, v in queries.items():
                    self._store_backend.set(tuple([k]), v)

            except (AssertionError, KeyError) as e:
                raise ge_exceptions.InvalidConfigError(str(e))

        if "engine" in credentials:
            self.engine = credentials["engine"]
        elif "url" in credentials:
            self.engine = create_engine(credentials["url"])
        elif "connection_string" in credentials:
            self.engine = create_engine(credentials["connection_string"])
        else:
            drivername = credentials.pop("drivername")
            options = URL(drivername, **credentials)
            self.engine = create_engine(options)

        # Gather the call arguments of the present function (include the "module_name" and add the "class_name"), filter
        # out the Falsy values, and set the instance "_config" variable equal to the resulting dictionary.
        self._config = {
            "credentials": credentials,
            "queries": queries,
            "store_backend": store_backend,
            "runtime_environment": runtime_environment,
            "store_name": store_name,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        filter_properties_dict(properties=self._config, clean_falsy=True, inplace=True)
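A hedged construction sketch (connection string and query text are placeholders); each entry of queries is stored in the in-memory backend under a one-element tuple key.

store = SqlAlchemyQueryStore(
    credentials={"connection_string": "sqlite:///metadata.db"},
    queries={"row_count": "SELECT COUNT(*) FROM events"},
    store_name="my_query_store",
)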
Example #5
    def add_profiler(
        config: RuleBasedProfilerConfig,
        data_context: "DataContext",  # noqa: F821
        profiler_store: ProfilerStore,
        ge_cloud_id: Optional[str] = None,
    ) -> "RuleBasedProfiler":
        if not RuleBasedProfiler._check_validity_of_batch_requests_in_config(
                config=config):
            raise ge_exceptions.InvalidConfigError(
                f'batch_data found in batch_request cannot be saved to ProfilerStore "{profiler_store.store_name}"'
            )

        # Chetan - 20220204 - DataContext to be removed once it can be decoupled from RBP
        new_profiler: "RuleBasedProfiler" = instantiate_class_from_config(
            config=config.to_json_dict(),
            runtime_environment={
                "data_context": data_context,
            },
            config_defaults={
                "module_name": "great_expectations.rule_based_profiler",
                "class_name": "RuleBasedProfiler",
            },
        )

        key: Union[GeCloudIdentifier, ConfigurationIdentifier]
        if ge_cloud_id:
            key = GeCloudIdentifier(resource_type="contract",
                                    ge_cloud_id=ge_cloud_id)
        else:
            key = ConfigurationIdentifier(configuration_key=config.name, )

        profiler_store.set(key=key, value=config)

        return new_profiler
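The storage key branches on whether a Cloud id is supplied; a sketch of both shapes (the id and name below are placeholders):

key = GeCloudIdentifier(resource_type="contract", ge_cloud_id="00000000-0000-0000-0000-000000000000")
key = ConfigurationIdentifier(configuration_key="my_profiler")  # used when no ge_cloud_id is given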
Example #6
    def _validate_mssql_limit_param(n: Union[str, int]) -> None:
        """Validate that the mssql limit param is passed as an int or a string representation of an int.

        Args:
            n: mssql limit parameter.

        Returns:
            None
        """
        if not isinstance(n, (str, int)):
            raise ge_exceptions.InvalidConfigError(
                "Please specify your sampling kwargs 'n' parameter as a string or int."
            )
        if isinstance(n, str) and not n.isdigit():
            raise ge_exceptions.InvalidConfigError(
                "If specifying your sampling kwargs 'n' parameter as a string please ensure it is "
                "parseable as an integer.")
Example #7
    def data_context_id(self, data_context_id):
        try:
            uuid.UUID(data_context_id)
        except ValueError:
            raise ge_exceptions.InvalidConfigError(
                "data_context_id must be a valid uuid")
        self._data_context_id = data_context_id
        self._explicit_id = True
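A sketch of the setter in use (the surrounding object, here called context, is assumed):

context.data_context_id = "3c8f1a2e-91b4-4d3b-9a5e-6f7a8b9c0d1e"  # any parseable UUID string is accepted
context.data_context_id = "not-a-uuid"  # raises InvalidConfigError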
Example #8
    def _validate_date_parts(
            date_parts: Union[List[DatePart], List[str]]) -> None:
        """Validate that date parts exist and are of the correct type.

        Args:
            date_parts: DatePart instances or str.

        Returns:
            None, this method raises exceptions if the config is invalid.
        """
        if len(date_parts) == 0:
            raise ge_exceptions.InvalidConfigError(
                "date_parts are required when using split_on_date_parts.")
        if not all(isinstance(dp, (DatePart, str)) for dp in date_parts):
            raise ge_exceptions.InvalidConfigError(
                "date_parts should be of type DatePart or str.")
Example #9
    def validate_schema(self, data, **kwargs):
        # If a class_name begins with the dollar sign ("$"), then it is assumed to be a variable name to be substituted.
        if data["class_name"][0] == "$":
            return
        if "connection_string" in data and not (
                data["class_name"] == "SqlAlchemyExecutionEngine"):
            raise ge_exceptions.InvalidConfigError(
                f"""Your current configuration uses the "connection_string" key in an execution engine, but only
SqlAlchemyExecutionEngine supports this attribute (your execution engine is "{data['class_name']}").  Please update your
configuration to continue.
                """)
        if "spark_config" in data and not (data["class_name"]
                                           == "SparkDFExecutionEngine"):
            raise ge_exceptions.InvalidConfigError(
                f"""Your current configuration uses the "spark_config" key in an execution engine, but only
SparkDFExecutionEngine supports this attribute (your execution engine is "{data['class_name']}").  Please update your
configuration to continue.
                """)
Example #10
    def validate_schema(self, data, **kwargs):
        if "generators" in data:
            raise ge_exceptions.InvalidConfigError(
                'Your current configuration uses the "generators" key in a datasource, but in version 0.10 of '
                'GE, that key is renamed to "batch_kwargs_generators". Please update your configuration to continue.'
            )
        # If a class_name begins with the dollar sign ("$"), then it is assumed to be a variable name to be substituted.
        if data["class_name"][0] == "$":
            return
        if ("connection_string" in data or "credentials" in data
                or "introspection" in data
                or "tables" in data) and not (data["class_name"] in [
                    "SqlAlchemyDatasource",
                    "SimpleSqlalchemyDatasource",
                ]):
            raise ge_exceptions.InvalidConfigError(
                f"""Your current configuration uses one or more keys in a data source that are required only by a
SQLAlchemy data source (your data source is "{data['class_name']}").  Please update your configuration to continue.
                """)
Example #11
    def _verify_all_strings_are_valid_date_parts(date_part_strings: List[str]) -> None:
        """Verify date part strings by trying to load as DatePart instances.

        Args:
            date_part_strings: A list of strings that should correspond to DatePart.

        Returns:
            None, raises an exception if unable to convert.
        """
        try:
            [DatePart(date_part_string) for date_part_string in date_part_strings]
        except ValueError as e:
            raise ge_exceptions.InvalidConfigError(
                f"{e}. Please specify only strings that are supported in DatePart: {[dp.value for dp in DatePart]}"
            )
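Illustrative behavior, assuming DatePart values are lowercase names such as "year" and "month":

_verify_all_strings_are_valid_date_parts(["year", "month"])  # ok: both convert to DatePart
_verify_all_strings_are_valid_date_parts(["months"])         # raises InvalidConfigError listing the supported values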
Example #12
    def __init__(
            self,
            name,
            data_context,
            target_store,
            source_store_name,
            custom_styles_directory=None,
            run_id_filter=None,
            validation_results_limit=None,
            renderer=None,
            view=None,
    ):
        self.name = name
        self.source_store = data_context.stores[source_store_name]
        self.target_store = target_store
        self.run_id_filter = run_id_filter
        self.validation_results_limit = validation_results_limit

        if renderer is None:
            raise exceptions.InvalidConfigError(
                "SiteSectionBuilder requires a renderer configuration with a class_name key."
            )
        self.renderer_class = instantiate_class_from_config(
            config=renderer,
            runtime_environment={
                "data_context": data_context
            },
            config_defaults={
                "module_name": "great_expectations.render.renderer"
            }
        )
        if view is None:
            view = {
                "module_name": "great_expectations.render.view",
                "class_name": "DefaultJinjaPageView",
            }

        self.view_class = instantiate_class_from_config(
            config=view,
            runtime_environment={
                "custom_styles_directory": custom_styles_directory
            },
            config_defaults={
                "module_name": "great_expectations.render.view"
            }
        )
Example #13
    def _get_method_name_for_get_data_for_batch_identifiers_method(
            self, splitter_method_name: str):
        """Get the matching method name to get the data for batch identifiers from the input splitter method name.

        Args:
            splitter_method_name: Configured splitter name.

        Returns:
            Name of the corresponding method to get data for building batch identifiers.
        """
        processed_splitter_method_name: str = self._get_splitter_method_name(
            splitter_method_name)
        try:
            return self.SPLITTER_METHOD_TO_GET_UNIQUE_BATCH_IDENTIFIERS_METHOD_MAPPING[
                processed_splitter_method_name]
        except KeyError:
            raise ge_exceptions.InvalidConfigError(
                f"Please provide a supported splitter method name; you provided: {splitter_method_name}"
            )
Example #14
    def __init__(
        self,
        credentials,
        queries=None,
        store_backend=None,
        runtime_environment=None,
        store_name=None,
    ):
        if not sqlalchemy:
            raise ge_exceptions.DataContextError(
                "sqlalchemy module not found, but is required for "
                "SqlAlchemyQueryStore")
        super().__init__(
            store_backend=store_backend,
            runtime_environment=runtime_environment,
            store_name=store_name,
        )
        if queries:
            # If queries are defined in configuration, then we load them into an InMemoryStoreBackend
            try:
                assert isinstance(
                    queries, dict
                ), "SqlAlchemyQueryStore queries must be defined as a dictionary"
                assert (
                    store_backend is None
                    or store_backend["class_name"] == "InMemoryStoreBackend"
                ), ("If queries are provided in configuration, then store_backend must be empty or an "
                    "InMemoryStoreBackend")
                for k, v in queries.items():
                    self._store_backend.set(tuple([k]), v)

            except (AssertionError, KeyError) as e:
                raise ge_exceptions.InvalidConfigError(str(e))

        if "engine" in credentials:
            self.engine = credentials["engine"]
        elif "url" in credentials:
            self.engine = create_engine(credentials["url"])
        else:
            drivername = credentials.pop("drivername")
            options = URL(drivername, **credentials)
            self.engine = create_engine(options)
Example #15
    def __init__(self,
                 name,
                 data_context,
                 target_store,
                 source_store_name,
                 custom_styles_directory=None,
                 # NOTE: Consider allowing specification of ANY element (or combination of elements) within an ID key?
                 run_id_filter=None,
                 renderer=None,
                 view=None,
    ):
        self.name = name
        self.source_store = data_context.stores[source_store_name]
        self.target_store = target_store
        self.run_id_filter = run_id_filter

        if renderer is None:
            raise exceptions.InvalidConfigError(
                "SiteSectionBuilder requires a renderer configuration with a class_name key."
            )
        self.renderer_class = instantiate_class_from_config(
            config=renderer,
            runtime_config={},
            config_defaults={
                "module_name": "great_expectations.render.renderer"
            }
        )
        if view is None:
            view = {
                "module_name": "great_expectations.render.view",
                "class_name": "DefaultJinjaPageView",
            }

        self.view_class = instantiate_class_from_config(
            config=view,
            runtime_config={
                "custom_styles_directory": custom_styles_directory
            },
            config_defaults={
                "module_name": "great_expectations.render.view"
            }
        )
Example #16
def add_checkpoint(
    data_context: "DataContext",  # noqa: F821
    checkpoint_store: CheckpointStore,
    checkpoint_store_name: str,
    ge_cloud_mode: bool,
    name: str,
    config_version: Optional[Union[int, float]] = None,
    template_name: Optional[str] = None,
    module_name: Optional[str] = None,
    class_name: Optional[str] = None,
    run_name_template: Optional[str] = None,
    expectation_suite_name: Optional[str] = None,
    batch_request: Optional[dict] = None,
    action_list: Optional[List[dict]] = None,
    evaluation_parameters: Optional[dict] = None,
    runtime_configuration: Optional[dict] = None,
    validations: Optional[List[dict]] = None,
    profilers: Optional[List[dict]] = None,
    # Next two fields are for LegacyCheckpoint configuration
    validation_operator_name: Optional[str] = None,
    batches: Optional[List[dict]] = None,
    # the following four arguments are used by SimpleCheckpoint
    site_names: Optional[Union[str, List[str]]] = None,
    slack_webhook: Optional[str] = None,
    notify_on: Optional[str] = None,
    notify_with: Optional[Union[str, List[str]]] = None,
    ge_cloud_id: Optional[str] = None,
    expectation_suite_ge_cloud_id: Optional[str] = None,
) -> Union[Checkpoint, LegacyCheckpoint]:
    checkpoint_config: Union[CheckpointConfig, dict]

    # These checks protect against typed objects (BatchRequest and/or RuntimeBatchRequest) encountered in arguments.
    batch_request = get_batch_request_as_dict(batch_request=batch_request)
    validations = get_validations_with_batch_request_as_dict(validations=validations)

    # DataFrames shouldn't be saved to CheckpointStore
    if batch_request_contains_batch_data(batch_request=batch_request):
        raise ge_exceptions.InvalidConfigError(
            f'batch_data found in batch_request cannot be saved to CheckpointStore "{checkpoint_store_name}"'
        )

    if batch_request_in_validations_contains_batch_data(validations=validations):
        raise ge_exceptions.InvalidConfigError(
            f'batch_data found in validations cannot be saved to CheckpointStore "{checkpoint_store_name}"'
        )

    checkpoint_config = {
        "name": name,
        "config_version": config_version,
        "template_name": template_name,
        "module_name": module_name,
        "class_name": class_name,
        "run_name_template": run_name_template,
        "expectation_suite_name": expectation_suite_name,
        "batch_request": batch_request,
        "action_list": action_list,
        "evaluation_parameters": evaluation_parameters,
        "runtime_configuration": runtime_configuration,
        "validations": validations,
        "profilers": profilers,
        # Next two fields are for LegacyCheckpoint configuration
        "validation_operator_name": validation_operator_name,
        "batches": batches,
        # the following four keys are used by SimpleCheckpoint
        "site_names": site_names,
        "slack_webhook": slack_webhook,
        "notify_on": notify_on,
        "notify_with": notify_with,
        "ge_cloud_id": ge_cloud_id,
        "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id,
    }

    checkpoint_config = deep_filter_properties_iterable(
        properties=checkpoint_config,
        clean_falsy=True,
    )

    new_checkpoint: Union[
        Checkpoint, SimpleCheckpoint, LegacyCheckpoint
    ] = instantiate_class_from_config(
        config=checkpoint_config,
        runtime_environment={
            "data_context": data_context,
        },
        config_defaults={
            "module_name": "great_expectations.checkpoint",
        },
    )

    if ge_cloud_mode:
        key: GeCloudIdentifier = GeCloudIdentifier(
            resource_type="contract", ge_cloud_id=ge_cloud_id
        )
    else:
        key: ConfigurationIdentifier = ConfigurationIdentifier(
            configuration_key=name,
        )

    checkpoint_config = new_checkpoint.get_config()

    checkpoint_ref = checkpoint_store.set(key=key, value=checkpoint_config)
    if isinstance(checkpoint_ref, GeCloudIdAwareRef):
        ge_cloud_id = checkpoint_ref.ge_cloud_id
        new_checkpoint.ge_cloud_id = uuid.UUID(ge_cloud_id)

    return new_checkpoint
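A hedged call sketch (the context, store, and all names are placeholders); falsy arguments are filtered out of the config before the checkpoint is instantiated and persisted:

checkpoint = add_checkpoint(
    data_context=context,                       # hypothetical DataContext
    checkpoint_store=context.checkpoint_store,  # assumed accessor for the store
    checkpoint_store_name="checkpoint_store",
    ge_cloud_mode=False,
    name="my_checkpoint",
    config_version=1.0,
    class_name="Checkpoint",
    validations=[{"batch_request": batch_request, "expectation_suite_name": "my_suite"}],  # batch_request: a dict built elsewhere
)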
Example #17
    def __init__(
        self,
        table_name,
        key_columns,
        fixed_length_key=True,
        credentials=None,
        url=None,
        connection_string=None,
        engine=None,
        store_name=None,
        suppress_store_backend_id=False,
        manually_initialize_store_backend_id: str = "",
        **kwargs,
    ):
        super().__init__(
            fixed_length_key=fixed_length_key,
            suppress_store_backend_id=suppress_store_backend_id,
            manually_initialize_store_backend_id=manually_initialize_store_backend_id,
            store_name=store_name,
        )
        if not sa:
            raise ge_exceptions.DataContextError(
                "ModuleNotFoundError: No module named 'sqlalchemy'")

        if not self.fixed_length_key:
            raise ge_exceptions.InvalidConfigError(
                "DatabaseStoreBackend requires use of a fixed-length-key")

        self._schema_name = None
        self._credentials = credentials
        self._connection_string = connection_string
        self._url = url

        if engine is not None:
            if credentials is not None:
                logger.warning(
                    "Both credentials and engine were provided during initialization of DatabaseStoreBackend. "
                    "Ignoring credentials.")
            self.engine = engine
        elif credentials is not None:
            self.engine = self._build_engine(credentials=credentials, **kwargs)
        elif connection_string is not None:
            self.engine = sa.create_engine(connection_string, **kwargs)
        elif url is not None:
            self.drivername = urlparse(url).scheme
            self.engine = sa.create_engine(url, **kwargs)
        else:
            raise ge_exceptions.InvalidConfigError(
                "Credentials, url, connection_string, or an engine are required for a DatabaseStoreBackend."
            )

        meta = MetaData(schema=self._schema_name)
        self.key_columns = key_columns
        # Dynamically construct a SQLAlchemy table with the name and column names we'll use
        cols = []
        for column in key_columns:
            if column == "value":
                raise ge_exceptions.InvalidConfigError(
                    "'value' cannot be used as a key_element name")
            cols.append(Column(column, String, primary_key=True))
        cols.append(Column("value", String))
        try:
            table = Table(table_name,
                          meta,
                          autoload=True,
                          autoload_with=self.engine)
            # We do a "light" check: if the columns' names match, we will proceed, otherwise, create the table
            if {str(col.name).lower() for col in table.columns} != (
                set(key_columns) | {"value"}
            ):
                raise ge_exceptions.StoreBackendError(
                    f"Unable to use table {table_name}: it exists, but does not have the expected schema."
                )
        except NoSuchTableError:
            table = Table(table_name, meta, *cols)
            try:
                if self._schema_name:
                    self.engine.execute(
                        f"CREATE SCHEMA IF NOT EXISTS {self._schema_name};")
                meta.create_all(self.engine)
            except SQLAlchemyError as e:
                raise ge_exceptions.StoreBackendError(
                    f"Unable to connect to table {table_name} because of an error. It is possible your table needs to be migrated to a new schema.  SqlAlchemyError: {str(e)}"
                )
        self._table = table
        # Initialize with store_backend_id
        self._store_backend_id = None
        self._store_backend_id = self.store_backend_id

        # Gather the call arguments of the present function (include the "module_name" and add the "class_name"), filter
        # out the Falsy values, and set the instance "_config" variable equal to the resulting dictionary.
        self._config = {
            "table_name": table_name,
            "key_columns": key_columns,
            "fixed_length_key": fixed_length_key,
            "credentials": credentials,
            "url": url,
            "connection_string": connection_string,
            "engine": engine,
            "store_name": store_name,
            "suppress_store_backend_id": suppress_store_backend_id,
            "manually_initialize_store_backend_id":
            manually_initialize_store_backend_id,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        self._config.update(kwargs)
        filter_properties_dict(properties=self._config,
                               clean_falsy=True,
                               inplace=True)
Example #18
    def validate_schema(self, data, **kwargs):
        if "generators" in data:
            raise ge_exceptions.InvalidConfigError(
                "Your current configuration uses the 'generators' key in a datasource, but in version 0.10 of "
                "GE, that key is renamed to 'batch_kwargs_generators'. Please update your config to continue."
            )
Example #19
    def sample_using_limit(
        self,
        execution_engine: "SqlAlchemyExecutionEngine",  # noqa: F821
        batch_spec: BatchSpec,
        where_clause: Optional[Selectable] = None,
    ) -> Union[str, BinaryExpression, BooleanClauseList]:
        """Sample using a limit with configuration provided via the batch_spec.

        Note: where_clause needs to be included at this stage, since SQLAlchemy's semantics
        for LIMIT differ from those of ordinary WHERE clauses.

        An engine is also required in order to determine the dialect, since certain
        databases require different handling.

        Args:
            execution_engine: Engine used to connect to the database.
            batch_spec: Batch specification describing the batch of interest.
            where_clause: Optional clause used in WHERE clause. Typically generated by a splitter.

        Returns:
            A query as a string or sqlalchemy object.
        """

        # Split clause should be permissive of all values if not supplied.
        if not where_clause:
            where_clause = True

        table_name: str = batch_spec["table_name"]

        # SQLAlchemy's semantics for LIMIT differ from those of normal WHERE clauses,
        # so the business logic for building the query needs to be different.
        dialect: str = execution_engine.engine.dialect.name.lower()
        if dialect == "oracle":
            # TODO: AJB 20220429 WARNING THIS oracle dialect METHOD IS NOT COVERED BY TESTS
            # limit doesn't compile properly for oracle so we will append rownum to query string later
            raw_query: Selectable = (
                sa.select("*")
                .select_from(
                    sa.table(table_name, schema=batch_spec.get("schema_name", None))
                )
                .where(where_clause)
            )
            query: str = str(
                raw_query.compile(
                    execution_engine.engine, compile_kwargs={"literal_binds": True}
                )
            )
            query += "\nAND ROWNUM <= %d" % batch_spec["sampling_kwargs"]["n"]
            return query
        elif dialect == "mssql":
            # TODO: AJB 20220429 WARNING THIS mssql dialect METHOD IS NOT COVERED BY TESTS
            selectable_query: Selectable = (
                sa.select("*")
                .select_from(
                    sa.table(table_name, schema=batch_spec.get("schema_name", None))
                )
                .where(where_clause)
                .limit(batch_spec["sampling_kwargs"]["n"])
            )
            string_of_query: str = str(
                selectable_query.compile(
                    execution_engine.engine, compile_kwargs={"literal_binds": True}
                )
            )
            # TODO: AJB 20220504 REMOVE THIS HACK!
            # This hack is here because the limit parameter is not substituted during query.compile()
            n: Union[str, int] = batch_spec["sampling_kwargs"]["n"]
            if not isinstance(n, (str, int)):
                raise ge_exceptions.InvalidConfigError(
                    "Please specify your sampling kwargs 'n' parameter as a string or int."
                )
            if isinstance(n, str) and not n.isdigit():
                raise ge_exceptions.InvalidConfigError(
                    "If specifying your sampling kwargs 'n' parameter as a string please ensure it is "
                    "parseable as an integer.")
            string_of_query = string_of_query.replace("?", str(n))
            return string_of_query
        else:
            return (
                sa.select("*")
                .select_from(
                    sa.table(table_name, schema=batch_spec.get("schema_name", None))
                )
                .where(where_clause)
                .limit(batch_spec["sampling_kwargs"]["n"])
            )