def _ge_validation_fn(context, dataset):
    data_context = context.resources.ge_data_context

    validator_kwargs = {
        "datasource_name": datasource_name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name or datasource_name,
        "runtime_parameters": {runtime_method_type: dataset},
        "batch_identifiers": batch_identifiers,
        "expectation_suite_name": suite_name,
        **extra_kwargs,
    }
    validator = data_context.get_validator(**validator_kwargs)

    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = validator.validate(run_id=run_id)

    validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)
    rendered_document_content_list = validation_results_page_renderer.render(
        validation_results=results
    )
    md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))

    meta_stats = MetadataEntry.md(md_str=md_str, label="Expectation Results")
    yield ExpectationResult(
        success=bool(results["success"]),
        metadata_entries=[meta_stats],
    )
    yield Output(results.to_json_dict())
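# The function above closes over names (datasource_name, data_connector_name, data_asset_name,
# suite_name, runtime_method_type, batch_identifiers, extra_kwargs) that are bound by an
# enclosing factory. A minimal sketch of what such a factory could look like -- the signature
# and defaults here are assumptions, not the original source:
def make_ge_validation_fn(
    datasource_name,
    data_connector_name,
    data_asset_name,
    suite_name,
    runtime_method_type="batch_data",
    batch_identifiers=None,
    extra_kwargs=None,
):
    batch_identifiers = batch_identifiers or {}
    extra_kwargs = extra_kwargs or {}

    def _ge_validation_fn(context, dataset):
        ...  # body as above, closing over the arguments bound here

    return _ge_validation_fn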
def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
    schema, table = DB_SCHEMA, context.asset_key.path[-1]

    time_window = context.asset_partitions_time_window if context.has_asset_partitions else None
    with connect_snowflake(config=self._config, schema=schema) as con:
        con.execute(self._get_cleanup_statement(table, schema, time_window))

    if isinstance(obj, SparkDataFrame):
        yield from self._handle_spark_output(obj, schema, table)
    elif isinstance(obj, PandasDataFrame):
        yield from self._handle_pandas_output(obj, schema, table)
    elif obj is None:  # dbt has already written the table; just record metadata
        config = dict(SHARED_SNOWFLAKE_CONF)
        config["schema"] = DB_SCHEMA
        with connect_snowflake(config=config) as con:
            df = read_sql(f"SELECT * FROM {context.name} LIMIT 5", con=con)
            num_rows = con.execute(f"SELECT COUNT(*) FROM {context.name}").fetchone()

        yield MetadataEntry.md(df.to_markdown(), "Data sample")
        yield MetadataEntry.int(int(num_rows[0]), "Rows")
    else:
        raise Exception(
            "SnowflakeIOManager only supports pandas DataFrames and Spark DataFrames"
        )

    yield MetadataEntry.text(
        self._get_select_statement(table, schema, None, time_window),
        "Query",
    )
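# _get_cleanup_statement is referenced above but not shown. A hypothetical version, assuming
# partitioned tables carry a TIME column aligned with the asset's partition window (the column
# name and filtering logic are assumptions):
def _get_cleanup_statement(self, table: str, schema: str, time_window) -> str:
    if time_window is None:
        return f"DELETE FROM {schema}.{table}"
    start, end = time_window
    return f"DELETE FROM {schema}.{table} WHERE TIME >= '{start}' AND TIME < '{end}'"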
def _handle_spark_output(self, df: SparkDataFrame, schema: str, table: str):
    options = {
        "sfURL": f"{self._config['account']}.snowflakecomputing.com",
        "sfUser": self._config["user"],
        "sfPassword": self._config["password"],
        "sfDatabase": self._config["database"],
        "sfSchema": schema,
        "sfWarehouse": self._config["warehouse"],
        "dbtable": table,
    }

    yield MetadataEntry.md(spark_columns_to_markdown(df.schema), "DataFrame columns")

    df.write.format("net.snowflake.spark.snowflake").options(**options).mode("append").save()
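# spark_columns_to_markdown is used above but not defined here. One possible implementation,
# rendering each schema field's name and type as a markdown table (an assumption about what
# the real helper does):
from pyspark.sql.types import StructType

def spark_columns_to_markdown(schema: StructType) -> str:
    header = "| Name | Type |\n| ---- | ---- |"
    rows = "\n".join(f"| {f.name} | {f.dataType.simpleString()} |" for f in schema.fields)
    return f"{header}\n{rows}"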
def handle_output(self, context, obj: pd.DataFrame):
    """This saves the dataframe as a CSV."""
    fpath = self._get_fs_path(context.asset_key)
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    obj.to_csv(fpath)
    with open(fpath + ".version", "w") as f:
        f.write(context.version if context.version else "None")

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.path(fpath, "Path")
    yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
    yield MetadataEntry.text(context.version, "Resolved version")
    yield MetadataEntry.table_schema(self.get_schema(context.dagster_type), "Schema")
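# The original only shows handle_output; a plausible load_input counterpart for this
# CSV-backed IO manager might read the file back into a DataFrame (a sketch, assuming the
# same _get_fs_path helper and that the index was written by to_csv above):
def load_input(self, context) -> pd.DataFrame:
    fpath = self._get_fs_path(context.asset_key)
    return pd.read_csv(fpath, index_col=0)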
def _get_metadata(self, result: Dict[str, Any]) -> List[MetadataEntry]:
    """
    Here, we run queries against our output Snowflake database tables to add additional
    context to our asset materializations.
    """
    table_name = result["unique_id"].split(".")[-1]
    with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
        n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
        sample_rows = pandas.read_sql_query(
            f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
        )
    return super()._get_metadata(result) + [
        MetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
        MetadataEntry.md(sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"),
    ]
def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
    from snowflake import connector  # pylint: disable=no-name-in-module

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

    connector.paramstyle = "pyformat"
    with connect_snowflake(config=self._config, schema=schema) as con:
        with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
        with_uppercase_cols.to_sql(
            table,
            con=con,
            if_exists="append",
            index=False,
            method=pd_writer,
        )
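# pandas_columns_to_markdown is used above but not defined here. A hypothetical counterpart
# to the Spark schema helper: list the DataFrame's column names and dtypes as a markdown
# table (an assumption about what the real helper does):
def pandas_columns_to_markdown(df: PandasDataFrame) -> str:
    header = "| Name | Type |\n| ---- | ---- |"
    rows = "\n".join(f"| {col} | {dtype} |" for col, dtype in df.dtypes.items())
    return f"{header}\n{rows}"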
def _ge_validation_fn(context, dataset):
    data_context = context.resources.ge_data_context
    if validation_operator_name is not None:
        validation_operator = validation_operator_name
    else:
        data_context.add_validation_operator(
            "ephemeral_validation",
            {"class_name": "ActionListValidationOperator", "action_list": []},
        )
        validation_operator = "ephemeral_validation"

    suite = data_context.get_expectation_suite(suite_name)
    final_batch_kwargs = batch_kwargs or {"dataset": dataset}
    if "datasource" in final_batch_kwargs:
        context.log.warning(
            "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "
            f"parameter of the {decorator_name} factory instead."
        )
    final_batch_kwargs["datasource"] = datasource_name
    batch = data_context.get_batch(final_batch_kwargs, suite)

    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        validation_operator, assets_to_validate=[batch], run_id=run_id
    )
    res = convert_to_json_serializable(results.list_validation_results())[0]

    validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)
    rendered_document_content_list = (
        validation_results_page_renderer.render_validation_operator_result(results)
    )
    md_str = " ".join(DefaultMarkdownPageView().render(rendered_document_content_list))

    meta_stats = MetadataEntry.md(md_str=md_str, label="Expectation Results")
    yield ExpectationResult(
        success=res["success"],
        metadata_entries=[meta_stats],
    )
    yield Output(res)