def sample_using_random(
    self,
    execution_engine: "SqlAlchemyExecutionEngine",  # noqa: F821
    batch_spec: BatchSpec,
    where_clause: Optional[Selectable] = None,
) -> Selectable:
    """Sample a random fraction of rows, configured via the batch_spec.

    The matching rows are counted first (this executes a COUNT query against
    the database), then a query is built that orders the matching rows
    randomly and limits the result to ``round(p * row_count)`` rows.

    Note: where_clause needs to be included at this stage since we use the
    where clause to determine the total number of rows to use in determining
    the rows returned in the sample fraction.

    Args:
        execution_engine: Engine used to connect to the database.
        batch_spec: Batch specification describing the batch of interest.
            Reads ``table_name``, the optional ``schema_name`` and
            ``sampling_kwargs["p"]`` (the fraction of rows to return; a falsy
            value such as None or 0 is treated as 1.0).
        where_clause: Optional clause used in WHERE clause. Typically generated
            by a splitter. When omitted, no filter is applied.

    Returns:
        Sqlalchemy selectable.
    """
    # TODO: AJB 20220429 WARNING THIS METHOD IS NOT COVERED BY TESTS
    table_name: str = batch_spec["table_name"]
    table = sa.table(table_name, schema=batch_spec.get("schema_name", None))

    count_query = sa.select([sa.func.count()]).select_from(table)
    sample_query = sa.select("*").select_from(table)

    # Only filter when a clause was actually supplied: handing None to
    # .where() does not mean "no filter" (see the SQLAlchemy where() docs on
    # how None is coerced), so the default argument must skip the WHERE.
    if where_clause is not None:
        count_query = count_query.where(where_clause)
        sample_query = sample_query.where(where_clause)

    num_rows: int = execution_engine.engine.execute(count_query).scalar()
    p: float = batch_spec["sampling_kwargs"]["p"] or 1.0
    sample_size: int = round(p * num_rows)

    return sample_query.order_by(sa.func.random()).limit(sample_size)
def verify_batch_spec_sampling_kwargs_exists(self, batch_spec: BatchSpec) -> None:
    """Ensure the batch_spec carries a ``sampling_kwargs`` entry.

    Args:
        batch_spec: Batch specification expected to hold ``sampling_kwargs``.

    Returns:
        None

    Raises:
        SamplerError: If ``sampling_kwargs`` is missing or set to None.
    """
    sampling_kwargs = batch_spec.get("sampling_kwargs")
    if sampling_kwargs is not None:
        # Present (even if empty): nothing to complain about.
        return

    raise ge_exceptions.SamplerError(
        "Please make sure to provide sampling_kwargs in addition to your sampling_method."
    )
def sample_using_limit(
    self,
    execution_engine: "SqlAlchemyExecutionEngine",  # noqa: F821
    batch_spec: BatchSpec,
    where_clause: Optional[Selectable] = None,
) -> Union[str, BinaryExpression, BooleanClauseList]:
    """Build a row-limited query, configured via the batch_spec.

    Note: where_clause needs to be included at this stage since SqlAlchemy's
    semantics for LIMIT are different than normal WHERE clauses. The engine is
    also required so the dialect can be inspected, since certain databases
    need different handling.

    Args:
        execution_engine: Engine used to connect to the database.
        batch_spec: Batch specification describing the batch of interest.
            Reads ``table_name``, the optional ``schema_name`` and
            ``sampling_kwargs["n"]`` (the row limit).
        where_clause: Optional clause used in WHERE clause. Typically generated
            by a splitter.

    Returns:
        A query as a string (oracle and mssql dialects) or sqlalchemy object
        (every other dialect).
    """
    # Without a split clause, filter on a condition that admits every row.
    if where_clause is None:
        where_clause = (
            sa.text("1 = 1")
            if execution_engine.dialect_name == "sqlite"
            else sa.true()
        )

    table_name: str = batch_spec["table_name"]

    # SQLalchemy's semantics for LIMIT are different than normal WHERE clauses,
    # so the business logic for building the query needs to be different.
    dialect_name: str = execution_engine.dialect_name

    # Every dialect starts from the same filtered SELECT; only the way the
    # row limit is attached differs.
    filtered_rows: Selectable = (
        sa.select("*")
        .select_from(
            sa.table(table_name, schema=batch_spec.get("schema_name", None))
        )
        .where(where_clause)
    )

    if dialect_name == GESqlDialect.ORACLE.value:
        # TODO: AJB 20220429 WARNING THIS oracle dialect METHOD IS NOT COVERED BY TESTS
        # limit doesn't compile properly for oracle so we append rownum to the
        # rendered query string instead.
        oracle_sql: str = str(
            filtered_rows.compile(
                dialect=execution_engine.dialect,
                compile_kwargs={"literal_binds": True},
            )
        )
        oracle_sql += "\nAND ROWNUM <= %d" % batch_spec["sampling_kwargs"]["n"]
        return oracle_sql

    if dialect_name == GESqlDialect.MSSQL.value:
        # Note that this code path exists because the limit parameter is not
        # getting rendered successfully in the resulting mssql query.
        limited_rows: Selectable = filtered_rows.limit(
            batch_spec["sampling_kwargs"]["n"]
        )
        mssql_sql: str = str(
            limited_rows.compile(
                dialect=execution_engine.dialect,
                compile_kwargs={"literal_binds": True},
            )
        )
        n: Union[str, int] = batch_spec["sampling_kwargs"]["n"]
        self._validate_mssql_limit_param(n)
        # The limit parameter is not substituted during compile(), so the
        # leftover "?" placeholder is filled in by hand.
        return mssql_sql.replace("?", str(n))

    return filtered_rows.limit(batch_spec["sampling_kwargs"]["n"])
def sample_using_limit(
    self,
    execution_engine: "SqlAlchemyExecutionEngine",  # noqa: F821
    batch_spec: BatchSpec,
    where_clause: Optional[Selectable] = None,
) -> Union[str, BinaryExpression, BooleanClauseList]:
    """Sample using a limit with configuration provided via the batch_spec.

    Note: where_clause needs to be included at this stage since SqlAlchemy's
    semantics for LIMIT are different than normal WHERE clauses. Also this
    requires an engine to find the dialect since certain databases require
    different handling.

    Args:
        execution_engine: Engine used to connect to the database.
        batch_spec: Batch specification describing the batch of interest.
            Reads ``table_name``, the optional ``schema_name`` and
            ``sampling_kwargs["n"]`` (the row limit).
        where_clause: Optional clause used in WHERE clause. Typically generated
            by a splitter.

    Returns:
        A query as a string (oracle and mssql dialects) or sqlalchemy object
        (every other dialect).

    Raises:
        InvalidConfigError: On the mssql dialect, when ``n`` is neither an int
            nor a string made only of digits.
    """
    dialect: str = execution_engine.engine.dialect.name.lower()

    # Split clause should be permissive of all values if not supplied.
    # Compare against None explicitly: truth-testing a SQLAlchemy clause is
    # not a reliable way to detect "no clause supplied" and could discard a
    # real filter or raise.
    if where_clause is None:
        where_clause = sa.text("1 = 1") if dialect == "sqlite" else sa.true()

    table_name: str = batch_spec["table_name"]

    # SQLalchemy's semantics for LIMIT are different than normal WHERE clauses,
    # so the business logic for building the query needs to be different.
    base_query: Selectable = (
        sa.select("*")
        .select_from(
            sa.table(table_name, schema=batch_spec.get("schema_name", None))
        )
        .where(where_clause)
    )

    if dialect == "oracle":
        # TODO: AJB 20220429 WARNING THIS oracle dialect METHOD IS NOT COVERED BY TESTS
        # limit doesn't compile properly for oracle so we will append rownum to query string later
        query: str = str(
            base_query.compile(
                # Compile against the SQLAlchemy dialect itself rather than
                # handing compile() the execution-engine wrapper.
                dialect=execution_engine.engine.dialect,
                compile_kwargs={"literal_binds": True},
            )
        )
        query += "\nAND ROWNUM <= %d" % batch_spec["sampling_kwargs"]["n"]
        return query

    if dialect == "mssql":
        # TODO: AJB 20220429 WARNING THIS mssql dialect METHOD IS NOT COVERED BY TESTS
        n: Union[str, int] = batch_spec["sampling_kwargs"]["n"]

        # Validate before n reaches .limit() so that a bad value produces the
        # configuration error below instead of whatever SQLAlchemy raises.
        if not isinstance(n, (str, int)):
            raise ge_exceptions.InvalidConfigError(
                "Please specify your sampling kwargs 'n' parameter as a string or int."
            )
        if isinstance(n, str) and not n.isdigit():
            raise ge_exceptions.InvalidConfigError(
                "If specifying your sampling kwargs 'n' parameter as a string please ensure it is "
                "parseable as an integer."
            )

        string_of_query: str = str(
            base_query.limit(n).compile(
                dialect=execution_engine.engine.dialect,
                compile_kwargs={"literal_binds": True},
            )
        )
        # TODO: AJB 20220504 REMOVE THIS HACK!
        # This hack is here because the limit parameter is not substituted during query.compile()
        # NOTE(review): this replaces every "?" in the rendered SQL, so a
        # literal value containing "?" in the where clause would be altered too.
        string_of_query = string_of_query.replace("?", str(n))
        return string_of_query

    return base_query.limit(batch_spec["sampling_kwargs"]["n"])