def get_batch_data_and_markers(
            self, batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:
        selectable = self._build_selectable_from_batch_spec(
            batch_spec=batch_spec)

        if "bigquery_temp_table" in batch_spec:
            temp_table_name = batch_spec.get("bigquery_temp_table")
        else:
            temp_table_name = None

        source_table_name = batch_spec.get("table_name", None)
        source_schema_name = batch_spec.get("schema_name", None)

        batch_data = SqlAlchemyBatchData(
            execution_engine=self,
            selectable=selectable,
            temp_table_name=temp_table_name,
            create_temp_table=batch_spec.get("create_temp_table",
                                             self._create_temp_table),
            source_table_name=source_table_name,
            source_schema_name=source_schema_name,
        )
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        return batch_data, batch_markers
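A minimal usage sketch for the method above, not part of the original snippet: it assumes a SqlAlchemyExecutionEngine built from a placeholder SQLite connection string and an existing table named my_table.

# Hypothetical usage; the connection string and table name are placeholders.
from great_expectations.core.batch_spec import SqlAlchemyDatasourceBatchSpec
from great_expectations.execution_engine import SqlAlchemyExecutionEngine

engine = SqlAlchemyExecutionEngine(connection_string="sqlite:///my_data.db")
batch_spec = SqlAlchemyDatasourceBatchSpec(
    table_name="my_table",
    create_temp_table=False,  # skip temp-table creation for this sketch
)
batch_data, batch_markers = engine.get_batch_data_and_markers(batch_spec=batch_spec)
# batch_data wraps a SELECT over my_table; batch_markers records the UTC load time.
print(batch_markers["ge_load_time"])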
Example #2
    def get_batch_data_and_markers(
            self, batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:
        if not isinstance(
                batch_spec,
            (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)):
            raise InvalidBatchSpecError(
                f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or
        RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received).
                        """)

        batch_data: Optional[SqlAlchemyBatchData] = None
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        source_schema_name: Optional[str] = batch_spec.get("schema_name", None)
        source_table_name: Optional[str] = batch_spec.get("table_name", None)

        temp_table_schema_name: Optional[str] = batch_spec.get(
            "temp_table_schema_name")
        temp_table_name: Optional[str] = batch_spec.get("bigquery_temp_table")

        create_temp_table: bool = batch_spec.get("create_temp_table",
                                                 self._create_temp_table)

        if isinstance(batch_spec, RuntimeQueryBatchSpec):
            # query != None is already checked when RuntimeQueryBatchSpec is instantiated
            query: str = batch_spec.query

            batch_spec.query = "SQLQuery"
            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                query=query,
                temp_table_schema_name=temp_table_schema_name,
                temp_table_name=temp_table_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )
        elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec):
            if self.engine.dialect.name.lower() == "oracle":
                selectable: str = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec)
            else:
                selectable: Selectable = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec)

            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                selectable=selectable,
                temp_table_name=temp_table_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )

        return batch_data, batch_markers
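A hedged sketch of the RuntimeQueryBatchSpec branch above, not part of the original snippet; the connection string, query, and table are placeholders, and my_table is assumed to exist so the temporary table can be materialized.

# Hypothetical usage of the RuntimeQueryBatchSpec branch; names are placeholders.
from great_expectations.core.batch_spec import RuntimeQueryBatchSpec
from great_expectations.execution_engine import SqlAlchemyExecutionEngine

engine = SqlAlchemyExecutionEngine(connection_string="sqlite:///my_data.db")
batch_spec = RuntimeQueryBatchSpec(query="SELECT * FROM my_table WHERE amount > 0")
batch_data, batch_markers = engine.get_batch_data_and_markers(batch_spec=batch_spec)
# The raw query is handed to SqlAlchemyBatchData, and the batch_spec is scrubbed
# to the placeholder string "SQLQuery" so the query text is not retained on it.
assert batch_spec.query == "SQLQuery"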
Example #3
    def _build_selectable_from_batch_spec(
        self, batch_spec: BatchSpec
    ) -> Union[Selectable, str]:
        if "splitter_method" in batch_spec:
            splitter_fn: Callable = self._get_splitter_method(
                splitter_method_name=batch_spec["splitter_method"]
            )
            split_clause = splitter_fn(
                batch_identifiers=batch_spec["batch_identifiers"],
                **batch_spec["splitter_kwargs"],
            )

        else:
            split_clause = True

        table_name: str = batch_spec["table_name"]
        sampling_method: Optional[str] = batch_spec.get("sampling_method")
        if sampling_method is not None:
            if sampling_method in [
                "_sample_using_limit",
                "sample_using_limit",
                "_sample_using_random",
                "sample_using_random",
            ]:
                sampler_fn = self._data_sampler.get_sampler_method(sampling_method)
                return sampler_fn(
                    execution_engine=self,
                    batch_spec=batch_spec,
                    where_clause=split_clause,
                )
            else:
                sampler_fn = self._data_sampler.get_sampler_method(sampling_method)
                return (
                    sa.select("*")
                    .select_from(
                        sa.table(table_name, schema=batch_spec.get("schema_name", None))
                    )
                    .where(
                        sa.and_(
                            split_clause,
                            sampler_fn(batch_spec),
                        )
                    )
                )
        return (
            sa.select("*")
            .select_from(
                sa.table(table_name, schema=batch_spec.get("schema_name", None))
            )
            .where(split_clause)
        )
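A small sketch, not part of the original snippet, that compiles the Selectable returned by the no-splitter/no-sampler path above into SQL text; the engine and table name are placeholders, and no database round trip is needed to build the statement.

# Hypothetical usage: build and inspect the selectable without executing it.
from great_expectations.core.batch_spec import SqlAlchemyDatasourceBatchSpec
from great_expectations.execution_engine import SqlAlchemyExecutionEngine

engine = SqlAlchemyExecutionEngine(connection_string="sqlite:///my_data.db")
batch_spec = SqlAlchemyDatasourceBatchSpec(table_name="my_table")
selectable = engine._build_selectable_from_batch_spec(batch_spec=batch_spec)
print(selectable.compile(compile_kwargs={"literal_binds": True}))
# Roughly: SELECT * FROM my_table WHERE true  (split_clause defaults to True)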
Example #4
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        batch_data: DataFrame

        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            batch_spec.batch_data = "SparkDataFrame"
        elif isinstance(batch_spec, (PathBatchSpec, S3BatchSpec)):
            reader_method: str = batch_spec.get("reader_method")
            reader_options: dict = batch_spec.get("reader_options") or {}
            path: str = batch_spec.get("path") or batch_spec.get("s3")
            try:
                # spark.read.options(**reader_options) returns a configured DataFrameReader;
                # _get_reader_fn resolves the format-specific read method on it
                reader_options = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader_options,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError("""
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """)
        else:
            raise BatchSpecError("""
                Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
                """)

        batch_data = self._apply_splitting_and_sampling_methods(
            batch_spec, batch_data)
        typed_batch_data = SparkDFBatchData(batch_data)

        return typed_batch_data, batch_markers
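A hedged sketch of the RuntimeDataBatchSpec branch above, not part of the original snippet; it assumes pyspark is installed and uses a small in-memory DataFrame as placeholder data.

# Hypothetical usage: hand an in-memory Spark DataFrame to the engine.
from pyspark.sql import SparkSession
from great_expectations.core.batch_spec import RuntimeDataBatchSpec
from great_expectations.execution_engine import SparkDFExecutionEngine

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

engine = SparkDFExecutionEngine()
batch_spec = RuntimeDataBatchSpec(batch_data=df)
typed_batch_data, batch_markers = engine.get_batch_data_and_markers(batch_spec=batch_spec)
# typed_batch_data is a SparkDFBatchData wrapping df; batch_spec.batch_data is
# replaced by the placeholder string "SparkDataFrame".
print(typed_batch_data.dataframe.count())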
Example #5
    def get_batch_data_and_markers(
        self, batch_spec: BatchSpec
    ) -> Tuple[Any, BatchMarkers]:
        if not isinstance(
            batch_spec, (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)
        ):
            raise InvalidBatchSpecError(
                f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or
        RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received).
                        """
            )

        batch_data: Optional[SqlAlchemyBatchData] = None
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        source_schema_name: Optional[str] = batch_spec.get("schema_name", None)
        source_table_name: Optional[str] = batch_spec.get("table_name", None)

        temp_table_schema_name: Optional[str] = batch_spec.get("temp_table_schema_name")

        if batch_spec.get("bigquery_temp_table"):
            # deprecated-v0.15.3
            warnings.warn(
                "BigQuery tables that are created as the result of a query are no longer created as "
                "permanent tables. Thus, a named permanent table through the `bigquery_temp_table`"
                "parameter is not required. The `bigquery_temp_table` parameter is deprecated as of"
                "v0.15.3 and will be removed in v0.18.",
                DeprecationWarning,
            )

        create_temp_table: bool = batch_spec.get(
            "create_temp_table", self._create_temp_table
        )

        if isinstance(batch_spec, RuntimeQueryBatchSpec):
            # query != None is already checked when RuntimeQueryBatchSpec is instantiated
            query: str = batch_spec.query

            batch_spec.query = "SQLQuery"
            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                query=query,
                temp_table_schema_name=temp_table_schema_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )
        elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec):
            if self.engine.dialect.name.lower() == "oracle":
                selectable: str = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec
                )
            else:
                selectable: Selectable = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec
                )

            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                selectable=selectable,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )

        return batch_data, batch_markers
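A sketch of the deprecation branch above, not part of the original snippet: passing the deprecated bigquery_temp_table key still works but only emits a DeprecationWarning, since the name is no longer forwarded to SqlAlchemyBatchData. The connection string and table name are placeholders.

# Hypothetical usage showing the bigquery_temp_table deprecation warning.
import warnings

from great_expectations.core.batch_spec import SqlAlchemyDatasourceBatchSpec
from great_expectations.execution_engine import SqlAlchemyExecutionEngine

engine = SqlAlchemyExecutionEngine(connection_string="sqlite:///my_data.db")
batch_spec = SqlAlchemyDatasourceBatchSpec(
    table_name="my_table",
    bigquery_temp_table="ge_tmp_12345",  # deprecated as of v0.15.3
    create_temp_table=False,
)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    batch_data, batch_markers = engine.get_batch_data_and_markers(batch_spec=batch_spec)
assert any(issubclass(w.category, DeprecationWarning) for w in caught)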
Example #6
    def _build_selectable_from_batch_spec(
            self, batch_spec: BatchSpec) -> Union[Selectable, str]:
        table_name: str = batch_spec["table_name"]
        if "splitter_method" in batch_spec:
            splitter_fn = getattr(self, batch_spec["splitter_method"])
            split_clause = splitter_fn(
                table_name=table_name,
                batch_identifiers=batch_spec["batch_identifiers"],
                **batch_spec["splitter_kwargs"],
            )

        else:
            split_clause = True

        if "sampling_method" in batch_spec:
            if batch_spec["sampling_method"] == "_sample_using_limit":
                # SQLalchemy's semantics for LIMIT are different than normal WHERE clauses,
                # so the business logic for building the query needs to be different.
                if self.engine.dialect.name.lower() == "oracle":
                    # limit doesn't compile properly for oracle so we will append rownum to query string later
                    raw_query = (sa.select("*").select_from(
                        sa.table(
                            table_name,
                            schema=batch_spec.get("schema_name",
                                                  None))).where(split_clause))
                    query = str(
                        raw_query.compile(
                            self.engine,
                            compile_kwargs={"literal_binds": True}))
                    query += "\nAND ROWNUM <= %d" % batch_spec[
                        "sampling_kwargs"]["n"]
                    return query
                else:
                    return (sa.select("*").select_from(
                        sa.table(table_name,
                                 schema=batch_spec.get(
                                     "schema_name",
                                     None))).where(split_clause).limit(
                                         batch_spec["sampling_kwargs"]["n"]))
            elif batch_spec["sampling_method"] == "_sample_using_random":
                num_rows: int = self.engine.execute(
                    sa.select([sa.func.count()]).select_from(
                        sa.table(table_name,
                                 schema=batch_spec.get(
                                     "schema_name",
                                     None))).where(split_clause)).scalar()
                p: float = batch_spec["sampling_kwargs"]["p"] or 1.0
                sample_size: int = round(p * num_rows)
                return (sa.select("*").select_from(
                    sa.table(table_name,
                             schema=batch_spec.get(
                                 "schema_name",
                                 None))).where(split_clause).order_by(
                                     sa.func.random()).limit(sample_size))
            else:
                sampler_fn = getattr(self, batch_spec["sampling_method"])
                return (sa.select("*").select_from(
                    sa.table(
                        table_name,
                        schema=batch_spec.get("schema_name", None))).where(
                            sa.and_(
                                split_clause,
                                sampler_fn(**batch_spec["sampling_kwargs"]),
                            )))
        return (sa.select("*").select_from(
            sa.table(table_name,
                     schema=batch_spec.get("schema_name",
                                           None))).where(split_clause))
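A small sketch, not part of the original snippet, showing the _sample_using_limit branch above on a non-Oracle dialect; the engine and table name are placeholders and the statement is only compiled, not executed.

# Hypothetical usage: a limit-sampled selectable compiled to SQL text.
from great_expectations.core.batch_spec import SqlAlchemyDatasourceBatchSpec
from great_expectations.execution_engine import SqlAlchemyExecutionEngine

engine = SqlAlchemyExecutionEngine(connection_string="sqlite:///my_data.db")
batch_spec = SqlAlchemyDatasourceBatchSpec(
    table_name="my_table",
    sampling_method="_sample_using_limit",
    sampling_kwargs={"n": 10},
)
selectable = engine._build_selectable_from_batch_spec(batch_spec=batch_spec)
print(selectable.compile(compile_kwargs={"literal_binds": True}))
# Roughly: SELECT * FROM my_table WHERE true LIMIT 10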