    def get_batch_data_and_markers(
            self, batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:
        selectable = self._build_selectable_from_batch_spec(
            batch_spec=batch_spec)

        if "bigquery_temp_table" in batch_spec:
            temp_table_name = batch_spec.get("bigquery_temp_table")
        else:
            temp_table_name = None

        source_table_name = batch_spec.get("table_name", None)
        source_schema_name = batch_spec.get("schema_name", None)

        batch_data = SqlAlchemyBatchData(
            execution_engine=self,
            selectable=selectable,
            temp_table_name=temp_table_name,
            create_temp_table=batch_spec.get("create_temp_table",
                                             self._create_temp_table),
            source_table_name=source_table_name,
            source_schema_name=source_schema_name,
        )
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        return batch_data, batch_markers
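For orientation, here is a minimal usage sketch in the spirit of the tests further down; the connection string and table name are assumptions for illustration, not values from the original:

# Minimal usage sketch (connection string and table name are hypothetical):
engine = SqlAlchemyExecutionEngine(connection_string="sqlite:///example.db")
batch_data, batch_markers = engine.get_batch_data_and_markers(
    batch_spec=BatchSpec({"table_name": "example_table"})
)
# batch_markers carries a UTC load timestamp, e.g. {"ge_load_time": "20240101T120000.000000Z"}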
Example #2
def test_to_make_sure_splitter_and_sampler_methods_are_optional(
    test_cases_for_sql_data_connector_sqlite_execution_engine, ):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec({
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "sampling_method": "_sample_using_mod",
            "sampling_kwargs": {
                "column_name": "id",
                "mod": 10,
                "value": 8,
            },
        }))

    assert len(batch_data.head(fetch_all=True)) == 12

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec({
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
        }))

    assert len(batch_data.head(fetch_all=True)) == 120

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec({
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
        }))

    assert len(batch_data.head(fetch_all=True)) == 120
Example #3
    def _build_selectable_from_batch_spec(
        self, batch_spec: BatchSpec
    ) -> Union[Selectable, str]:
        if "splitter_method" in batch_spec:
            splitter_fn: Callable = self._get_splitter_method(
                splitter_method_name=batch_spec["splitter_method"]
            )
            split_clause = splitter_fn(
                batch_identifiers=batch_spec["batch_identifiers"],
                **batch_spec["splitter_kwargs"],
            )

        else:
            split_clause = True

        table_name: str = batch_spec["table_name"]
        sampling_method: Optional[str] = batch_spec.get("sampling_method")
        if sampling_method is not None:
            if sampling_method in [
                "_sample_using_limit",
                "sample_using_limit",
                "_sample_using_random",
                "sample_using_random",
            ]:
                sampler_fn = self._data_sampler.get_sampler_method(sampling_method)
                return sampler_fn(
                    execution_engine=self,
                    batch_spec=batch_spec,
                    where_clause=split_clause,
                )
            else:
                sampler_fn = self._data_sampler.get_sampler_method(sampling_method)
                return (
                    sa.select("*")
                    .select_from(
                        sa.table(table_name, schema=batch_spec.get("schema_name", None))
                    )
                    .where(
                        sa.and_(
                            split_clause,
                            sampler_fn(batch_spec),
                        )
                    )
                )
        return (
            sa.select("*")
            .select_from(
                sa.table(table_name, schema=batch_spec.get("schema_name", None))
            )
            .where(split_clause)
        )
Example #4
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": "_sample_using_limit",
                "sampling_kwargs": {"n": 20},
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 20

    assert not validator.expect_column_values_to_be_in_set(
        "date", value_set=["2020-01-02"]
    ).success
Example #5
def test_batch__str__method():
    batch = Batch(
        data=None,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            batch_identifiers=IDDict({}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
    print(batch.__str__())

    assert (batch.__str__() == """{
  "data": "None",
  "batch_request": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name"
  },
  "batch_definition": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name",
    "batch_identifiers": {}
  },
  "batch_spec": "{'path': '/some/path/some.file'}",
  "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}"
}""")
    def build_batch_spec(self, batch_definition: BatchDefinition):
        data_asset_name = batch_definition.data_asset_name
        batch_spec = BatchSpec({
            "table_name": data_asset_name,
            "partition_definition": batch_definition.partition_definition,
            **self.data_assets[data_asset_name],
        })

        return batch_spec
Example #7
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        batch_data: DataFrame

        # We need to build batch_markers to accompany the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            batch_spec.batch_data = "SparkDataFrame"
        elif isinstance(batch_spec, (PathBatchSpec, S3BatchSpec)):
            reader_method: str = batch_spec.get("reader_method")
            reader_options: dict = batch_spec.get("reader_options") or {}
            path: str = batch_spec.get("path") or batch_spec.get("s3")
            try:
                reader = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError("""
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """)
        else:
            raise BatchSpecError("""
                Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
                """)

        batch_data = self._apply_splitting_and_sampling_methods(
            batch_spec, batch_data)
        typed_batch_data = SparkDFBatchData(batch_data)

        return typed_batch_data, batch_markers
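A hedged sketch of exercising the PathBatchSpec branch above; the file path and reader options are invented for illustration:

# Illustrative only: drives the PathBatchSpec branch (path and options are assumptions).
engine = SparkDFExecutionEngine()
batch_data, batch_markers = engine.get_batch_data_and_markers(
    batch_spec=PathBatchSpec(
        path="/tmp/example.csv",
        reader_method="csv",
        reader_options={"header": True},
    )
)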
Example #8
def test_instantiation_via_connection_string(sa, test_db_connection_string):
    my_execution_engine = SqlAlchemyExecutionEngine(
        connection_string=test_db_connection_string)
    assert my_execution_engine.connection_string == test_db_connection_string
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url is None

    my_execution_engine.get_batch_data_and_markers(
        BatchSpec(
            table_name="main.table_1",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": 5},
        ))
Example #9
    def get_batch_data_and_markers(
            self, batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:
        if not isinstance(
                batch_spec,
            (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)):
            raise InvalidBatchSpecError(
                f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or
        RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received).
                        """)

        batch_data: Optional[SqlAlchemyBatchData] = None
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        source_schema_name: Optional[str] = batch_spec.get("schema_name", None)
        source_table_name: Optional[str] = batch_spec.get("table_name", None)

        temp_table_schema_name: Optional[str] = batch_spec.get(
            "temp_table_schema_name")
        temp_table_name: Optional[str] = batch_spec.get("bigquery_temp_table")

        create_temp_table: bool = batch_spec.get("create_temp_table",
                                                 self._create_temp_table)

        if isinstance(batch_spec, RuntimeQueryBatchSpec):
            # query != None is already checked when RuntimeQueryBatchSpec is instantiated
            query: str = batch_spec.query

            batch_spec.query = "SQLQuery"
            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                query=query,
                temp_table_schema_name=temp_table_schema_name,
                temp_table_name=temp_table_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )
        elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec):
            if self.engine.dialect.name.lower() == "oracle":
                selectable: str = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec)
            else:
                selectable: Selectable = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec)

            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                selectable=selectable,
                temp_table_name=temp_table_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )

        return batch_data, batch_markers
Example #10
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine, ):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec({
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_limit",
            "sampling_kwargs": {
                "n": 20
            },
        }))
    assert len(batch_data.head(fetch_all=True)) == 20
Example #11
def test_instantiation_via_url(sa):
    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets",
                     "test_cases_for_sql_data_connector.db"),
    )
    my_execution_engine = SqlAlchemyExecutionEngine(url="sqlite:///" + db_file)
    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url.endswith("test_cases_for_sql_data_connector.db")

    my_execution_engine.get_batch_data_and_markers(
        BatchSpec(
            table_name="table_partitioned_by_date_column__A",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": 5},
        ))
Example #12
def test_sampling_method__random(
    test_cases_for_sql_data_connector_sqlite_execution_engine, ):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec({
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_random",
            "sampling_kwargs": {
                "p": 1.0
            },
        }))

    # random.seed() is no good here: the random number generator is in the database, not python
    # assert len(batch_data.head(fetch_all=True)) == 63
    pass
Example #13
def test_sampling_method__a_list(
    test_cases_for_sql_data_connector_sqlite_execution_engine, ):
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec({
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_a_list",
            "sampling_kwargs": {
                "column_name": "id",
                "value_list": [10, 20, 30, 40],
            },
        }))
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 4
Example #14
    def build_batch_spec(self, batch_definition: BatchDefinition):
        """
        Build BatchSpec from batch_definition with the following components:
            1. data_asset_name from batch_definition
            2. partition_definition from batch_definition
            3. data_asset from data_connector

        Args:
            batch_definition (BatchDefinition): to be used to build batch_spec

        Returns:
            BatchSpec built from batch_definition
        """
        data_asset_name = batch_definition.data_asset_name
        batch_spec = BatchSpec({
            "table_name": data_asset_name,
            "partition_definition": batch_definition.partition_definition,
            **self.data_assets[data_asset_name],
        })
        return batch_spec
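To make the merge described in the docstring concrete, here is a hedged, self-contained sketch of the dictionary composition this method performs; the per-asset options are invented:

# Hedged sketch of the dict merge above (asset options are invented for illustration):
data_assets = {
    "my_table": {"splitter_method": "_split_on_whole_table", "splitter_kwargs": {}}
}
batch_spec = BatchSpec({
    "table_name": "my_table",
    "partition_definition": {},
    **data_assets["my_table"],
})
# table_name and partition_definition come from the batch_definition;
# the remaining keys come from the data connector's per-asset configuration.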
Example #15
    def get_batch_data_and_markers(
        self, batch_spec: BatchSpec
    ) -> Tuple[Any, BatchMarkers]:
        if not isinstance(
            batch_spec, (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)
        ):
            raise InvalidBatchSpecError(
                f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or
        RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received).
                        """
            )

        batch_data: Optional[SqlAlchemyBatchData] = None
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        source_schema_name: Optional[str] = batch_spec.get("schema_name", None)
        source_table_name: Optional[str] = batch_spec.get("table_name", None)

        temp_table_schema_name: Optional[str] = batch_spec.get("temp_table_schema_name")

        if batch_spec.get("bigquery_temp_table"):
            # deprecated-v0.15.3
            warnings.warn(
                "BigQuery tables that are created as the result of a query are no longer created as "
                "permanent tables. Thus, a named permanent table through the `bigquery_temp_table`"
                "parameter is not required. The `bigquery_temp_table` parameter is deprecated as of"
                "v0.15.3 and will be removed in v0.18.",
                DeprecationWarning,
            )

        create_temp_table: bool = batch_spec.get(
            "create_temp_table", self._create_temp_table
        )

        if isinstance(batch_spec, RuntimeQueryBatchSpec):
            # query != None is already checked when RuntimeQueryBatchSpec is instantiated
            query: str = batch_spec.query

            batch_spec.query = "SQLQuery"
            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                query=query,
                temp_table_schema_name=temp_table_schema_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )
        elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec):
            if self.engine.dialect.name.lower() == "oracle":
                selectable: str = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec
                )
            else:
                selectable: Selectable = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec
                )

            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                selectable=selectable,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )

        return batch_data, batch_markers
Example #16
def test_instantiation_with_and_without_temp_table(sqlite_view_engine, sa):
    print(get_sqlite_temp_table_names(sqlite_view_engine))
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1
    assert get_sqlite_temp_table_names(sqlite_view_engine) == {
        "test_temp_view"
    }

    engine = SqlAlchemyExecutionEngine(engine=sqlite_view_engine)
    # When the SqlAlchemyBatchData object is based on a table, a new temp table is NOT created, even if create_temp_table=True
    SqlAlchemyBatchData(
        execution_engine=engine,
        table_name="test_table",
        create_temp_table=True,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1

    selectable = sa.select("*").select_from(sa.text("main.test_table"))

    # If create_temp_table=False, a new temp table should NOT be created
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=selectable,
        create_temp_table=False,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1

    # If create_temp_table=True, a new temp table should be created
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=selectable,
        create_temp_table=True,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 2

    # If create_temp_table=True, a new temp table should be created
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=selectable,
        # create_temp_table defaults to True
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 3

    # testing whether schema is supported
    selectable = sa.select("*").select_from(
        sa.table(name="test_table", schema="main"))
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=selectable,
        # create_temp_table defaults to True
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 4

    # test schema with execution engine
    # TODO : Will20210222 Add tests for specifying schema with non-sqlite backend that actually supports new schema creation
    my_batch_spec = BatchSpec(
        **{
            "table_name": "test_table",
            "partition_definition": {},
            "schema_name": "main",
        })
    res = engine.get_batch_data_and_markers(batch_spec=my_batch_spec)
    assert len(res) == 2
Example #17
    def _build_selectable_from_batch_spec(
        self, batch_spec: BatchSpec
    ) -> Union[Selectable, str]:
        table_name: str = batch_spec["table_name"]
        if "splitter_method" in batch_spec:
            splitter_fn = getattr(self, batch_spec["splitter_method"])
            split_clause = splitter_fn(
                table_name=table_name,
                batch_identifiers=batch_spec["batch_identifiers"],
                **batch_spec["splitter_kwargs"],
            )
        else:
            split_clause = True

        if "sampling_method" in batch_spec:
            if batch_spec["sampling_method"] == "_sample_using_limit":
                # SQLAlchemy's semantics for LIMIT differ from those of normal WHERE clauses,
                # so the business logic for building the query needs to be different.
                if self.engine.dialect.name.lower() == "oracle":
                    # LIMIT doesn't compile properly for Oracle, so we append ROWNUM to the query string later
                    raw_query = (
                        sa.select("*")
                        .select_from(
                            sa.table(table_name, schema=batch_spec.get("schema_name", None))
                        )
                        .where(split_clause)
                    )
                    query = str(
                        raw_query.compile(
                            self.engine, compile_kwargs={"literal_binds": True}
                        )
                    )
                    query += "\nAND ROWNUM <= %d" % batch_spec["sampling_kwargs"]["n"]
                    return query
                else:
                    return (
                        sa.select("*")
                        .select_from(
                            sa.table(table_name, schema=batch_spec.get("schema_name", None))
                        )
                        .where(split_clause)
                        .limit(batch_spec["sampling_kwargs"]["n"])
                    )
            elif batch_spec["sampling_method"] == "_sample_using_random":
                num_rows: int = self.engine.execute(
                    sa.select([sa.func.count()])
                    .select_from(
                        sa.table(table_name, schema=batch_spec.get("schema_name", None))
                    )
                    .where(split_clause)
                ).scalar()
                p: float = batch_spec["sampling_kwargs"]["p"] or 1.0
                sample_size: int = round(p * num_rows)
                return (
                    sa.select("*")
                    .select_from(
                        sa.table(table_name, schema=batch_spec.get("schema_name", None))
                    )
                    .where(split_clause)
                    .order_by(sa.func.random())
                    .limit(sample_size)
                )
            else:
                sampler_fn = getattr(self, batch_spec["sampling_method"])
                return (
                    sa.select("*")
                    .select_from(
                        sa.table(table_name, schema=batch_spec.get("schema_name", None))
                    )
                    .where(
                        sa.and_(
                            split_clause,
                            sampler_fn(**batch_spec["sampling_kwargs"]),
                        )
                    )
                )
        return (
            sa.select("*")
            .select_from(
                sa.table(table_name, schema=batch_spec.get("schema_name", None))
            )
            .where(split_clause)
        )
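To make the LIMIT-versus-ROWNUM branch above concrete, a hedged, self-contained sketch of what the generic branch compiles to; SQLite stands in as the dialect, and the table name and rendered SQL are illustrative:

# Hedged sketch: what the non-Oracle limit branch produces (names are illustrative).
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
stmt = sa.select("*").select_from(sa.table("my_table")).where(sa.true()).limit(20)
print(stmt.compile(engine, compile_kwargs={"literal_binds": True}))
# Renders roughly: SELECT * FROM my_table WHERE 1 = 1 LIMIT 20
# The Oracle branch instead returns the compiled SELECT as a plain string with
# "\nAND ROWNUM <= 20" appended, because .limit() does not compile for that dialect here.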