def _ge_validation_fn(context, dataset):
    data_context = context.resources.ge_data_context

    validator_kwargs = {
        "datasource_name": datasource_name,
        "data_connector_name": data_connector_name,
        # Prefer the explicit asset name; fall back to the datasource name.
        # (The original `datasource_name or data_asset_name` would have made
        # `data_asset_name` dead code whenever a datasource name was set.)
        "data_asset_name": data_asset_name or datasource_name,
        "runtime_parameters": {runtime_method_type: dataset},
        "batch_identifiers": batch_identifiers,
        "expectation_suite_name": suite_name,
        **extra_kwargs,
    }
    validator = data_context.get_validator(**validator_kwargs)

    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = validator.validate(run_id=run_id)

    validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)
    rendered_document_content_list = validation_results_page_renderer.render(
        validation_results=results
    )
    md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))

    meta_stats = EventMetadataEntry.md(md_str=md_str, label="Expectation Results")
    yield ExpectationResult(
        success=bool(results["success"]),
        metadata_entries=[meta_stats],
    )
    yield Output(results.to_json_dict())

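# The function above reads `datasource_name`, `data_connector_name`,
# `data_asset_name`, `runtime_method_type`, `batch_identifiers`, `suite_name`,
# and `extra_kwargs` from an enclosing scope. A minimal sketch of such an
# enclosing factory, assuming those values are plain factory parameters
# (the factory name and defaults here are hypothetical, not from the source):
def make_ge_validation_fn(
    datasource_name,
    data_connector_name,
    data_asset_name,
    suite_name,
    batch_identifiers,
    runtime_method_type="batch_data",
    extra_kwargs=None,
):
    extra_kwargs = extra_kwargs or {}

    def _ge_validation_fn(context, dataset):
        ...  # body as above, capturing the factory parameters by closure

    return _ge_validation_fn
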
def event_metadata_entries(metadata_entry_datas):
    if not metadata_entry_datas:
        return

    for metadata_entry_data in metadata_entry_datas:
        typename = metadata_entry_data['__typename']
        label = metadata_entry_data['label']
        description = metadata_entry_data.get('description')
        if typename == 'EventPathMetadataEntry':
            yield EventMetadataEntry.path(
                label=label, description=description, path=metadata_entry_data['path']
            )
        elif typename == 'EventJsonMetadataEntry':
            yield EventMetadataEntry.json(
                label=label,
                description=description,
                data=seven.json.loads(metadata_entry_data.get('jsonString', '')),
            )
        elif typename == 'EventMarkdownMetadataEntry':
            yield EventMetadataEntry.md(
                label=label, description=description, md_str=metadata_entry_data.get('md_str', '')
            )
        elif typename == 'EventTextMetadataEntry':
            yield EventMetadataEntry.text(
                label=label, description=description, text=metadata_entry_data['text']
            )
        elif typename == 'EventUrlMetadataEntry':
            yield EventMetadataEntry.url(
                label=label, description=description, url=metadata_entry_data['url']
            )
        else:
            check.not_implemented('TODO for type {}'.format(typename))

def ge_validation_solid(context, dataset):
    data_context = context.resources.ge_data_context
    if validation_operator_name is not None:
        validation_operator = validation_operator_name
    else:
        data_context.add_validation_operator(
            "ephemeral_validation",
            {"class_name": "ActionListValidationOperator", "action_list": []},
        )
        validation_operator = "ephemeral_validation"

    suite = data_context.get_expectation_suite(suite_name)
    final_batch_kwargs = batch_kwargs or {"dataset": dataset}
    # Check the merged kwargs, not `batch_kwargs`, which may be None.
    if "datasource" in final_batch_kwargs:
        context.log.warning(
            "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "
            "parameter of the solid factory instead."
        )
    final_batch_kwargs["datasource"] = datasource_name
    batch = data_context.get_batch(final_batch_kwargs, suite)

    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        validation_operator, assets_to_validate=[batch], run_id=run_id
    )
    res = convert_to_json_serializable(results.list_validation_results())[0]
    md_str = render_multiple_validation_result_pages_markdown(
        validation_operator_result=results,
        run_info_at_end=True,
    )
    meta_stats = EventMetadataEntry.md(md_str=md_str, label="Expectation Results")
    yield ExpectationResult(
        success=res["success"],
        metadata_entries=[meta_stats],
    )
    yield Output(res)

def _best_n_actions(_, df):
    df = df.nlargest(n, "score")
    return Output(
        df,
        metadata_entries=[
            EventMetadataEntry.md(df.to_markdown(), "data"),
        ],
    )

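# `_best_n_actions` closes over `n` from an enclosing scope. A hedged sketch of
# the enclosing factory plus one instantiation, assuming Dagster 0.x import
# paths (the factory and variable names are illustrative, not from the source;
# in real use the returned function would be wrapped with @solid or @op):
from dagster import EventMetadataEntry, Output

def best_n_actions_factory(n):
    def _best_n_actions(_, df):
        df = df.nlargest(n, "score")
        return Output(
            df,
            metadata_entries=[EventMetadataEntry.md(df.to_markdown(), "data")],
        )

    return _best_n_actions

best_5_actions = best_n_actions_factory(5)
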
def _handle_spark_output(self, df: SparkDataFrame, schema: str, table: str):
    options = {
        "sfURL": f"{self._config['account']}.snowflakecomputing.com",
        "sfUser": self._config["user"],
        "sfPassword": self._config["password"],
        "sfDatabase": self._config["database"],
        "sfSchema": schema,
        "sfWarehouse": self._config["warehouse"],
        "dbtable": table,
    }
    yield EventMetadataEntry.md(spark_columns_to_markdown(df.schema), "DataFrame columns")

    df.write.format("net.snowflake.spark.snowflake").options(**options).mode("append").save()

def many_table_materializations(_context):
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), 'r') as f:
        md_str = f.read()

    for table in raw_tables:
        yield Materialization(
            label='table_info',
            metadata_entries=[
                EventMetadataEntry.text(text=table, label='table_name'),
                EventMetadataEntry.fspath(path='/path/to/{}'.format(table), label='table_path'),
                EventMetadataEntry.json(data={'name': table}, label='table_data'),
                EventMetadataEntry.url(
                    url='https://bigty.pe/{}'.format(table), label='table_name_big'
                ),
                EventMetadataEntry.md(md_str=md_str, label='table_blurb'),
            ],
        )

def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
    from snowflake import connector  # pylint: disable=no-name-in-module

    yield EventMetadataEntry.int(obj.shape[0], "Rows")
    yield EventMetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

    connector.paramstyle = "pyformat"
    with connect_snowflake(config=self._config, schema=schema) as con:
        with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
        with_uppercase_cols.to_sql(
            table,
            con=con,
            if_exists="append",
            index=False,
            method=pd_writer,
        )

def _handle_dataframe_output(self, context: OutputContext, obj: DataFrame):
    from snowflake import connector  # pylint: disable=no-name-in-module

    yield EventMetadataEntry.int(obj.shape[0], "Rows")
    yield EventMetadataEntry.md(columns_to_markdown(obj), "DataFrame columns")

    connector.paramstyle = "pyformat"
    schema, table = context.metadata["table"].split(".")
    with connect_snowflake(config=context.resource_config, schema=schema) as con:
        with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
        with_uppercase_cols.to_sql(
            table,
            con=con,
            if_exists="replace",
            index=False,
            method=pd_writer,
        )

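# The `_handle_spark_output` and `_handle_pandas_output` methods above look
# like helpers on a Dagster IO manager. A hedged sketch of how such an IO
# manager might dispatch on the output type (the class name and wiring are
# assumptions, not the source; `PandasDataFrame` and `SparkDataFrame` are the
# aliases used by the snippets above):
from dagster import IOManager

class SnowflakeIOManager(IOManager):
    def handle_output(self, context, obj):
        schema, table = context.metadata["table"].split(".")
        if isinstance(obj, PandasDataFrame):
            # EventMetadataEntry values yielded here surface in Dagit
            # alongside the handled output.
            yield from self._handle_pandas_output(obj, schema, table)
        elif isinstance(obj, SparkDataFrame):
            yield from self._handle_spark_output(obj, schema, table)

    def load_input(self, context):
        raise NotImplementedError  # omitted in this sketch
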
def many_table_materializations(_context):
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), "r") as f:
        md_str = f.read()

    for table in raw_tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata_entries=[
                EventMetadataEntry.text(text=table, label="table_name"),
                EventMetadataEntry.fspath(path="/path/to/{}".format(table), label="table_path"),
                EventMetadataEntry.json(data={"name": table}, label="table_data"),
                EventMetadataEntry.url(
                    url="https://bigty.pe/{}".format(table), label="table_name_big"
                ),
                EventMetadataEntry.md(md_str=md_str, label="table_blurb"),
            ],
        )

def event_metadata_entries(metadata_entry_datas):
    if not metadata_entry_datas:
        return

    for metadata_entry_data in metadata_entry_datas:
        typename = metadata_entry_data["__typename"]
        label = metadata_entry_data["label"]
        description = metadata_entry_data.get("description")
        if typename == "EventPathMetadataEntry":
            yield EventMetadataEntry.path(
                label=label, description=description, path=metadata_entry_data["path"]
            )
        elif typename == "EventJsonMetadataEntry":
            yield EventMetadataEntry.json(
                label=label,
                description=description,
                data=seven.json.loads(metadata_entry_data.get("jsonString", "")),
            )
        elif typename == "EventMarkdownMetadataEntry":
            yield EventMetadataEntry.md(
                label=label, description=description, md_str=metadata_entry_data.get("md_str", "")
            )
        elif typename == "EventTextMetadataEntry":
            yield EventMetadataEntry.text(
                label=label, description=description, text=metadata_entry_data["text"]
            )
        elif typename == "EventUrlMetadataEntry":
            yield EventMetadataEntry.url(
                label=label, description=description, url=metadata_entry_data["url"]
            )
        elif typename == "EventPythonArtifactMetadataEntry":
            yield EventMetadataEntry(
                label=label,
                description=description,
                entry_data=PythonArtifactMetadataEntryData(
                    metadata_entry_data["module"], metadata_entry_data["name"]
                ),
            )
        else:
            check.not_implemented("TODO for type {}".format(typename))

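# Illustrative usage of the converter above. The input dicts mirror the shape
# the function itself expects (keyed by GraphQL `__typename`); the concrete
# values here are made up for the example:
entries = list(
    event_metadata_entries(
        [
            {"__typename": "EventTextMetadataEntry", "label": "table_name", "text": "users"},
            {"__typename": "EventUrlMetadataEntry", "label": "docs", "url": "https://example.com"},
        ]
    )
)
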
def _get_metadata(self, result: Dict[str, Any]) -> List[EventMetadataEntry]:
    """
    Here, we run queries against our output Snowflake database tables to add additional
    context to our asset materializations.
    """
    table_name = result["unique_id"].split(".")[-1]
    with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
        n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
        sample_rows = pandas.read_sql_query(
            f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
        )
    return super()._get_metadata(result) + [
        EventMetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
        EventMetadataEntry.md(sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"),
    ]

def ge_validation_solid(context, pandas_df):
    data_context = context.resources.ge_data_context
    if validation_operator_name is not None:
        validation_operator = validation_operator_name
    else:
        data_context.add_validation_operator(
            "ephemeral_validation",
            {"class_name": "ActionListValidationOperator", "action_list": []},
        )
        validation_operator = "ephemeral_validation"

    suite = data_context.get_expectation_suite(suite_name)
    batch_kwargs = {
        "dataset": pandas_df,
        "datasource": datasource_name,
    }
    batch = data_context.get_batch(batch_kwargs, suite)

    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        validation_operator, assets_to_validate=[batch], run_id=run_id
    )
    res = convert_to_json_serializable(results.list_validation_results())[0]
    md_str = render_multiple_validation_result_pages_markdown(
        validation_operator_result=results,
        run_info_at_end=True,
    )
    meta_stats = EventMetadataEntry.md(md_str=md_str, label="Expectation Results")
    yield ExpectationResult(
        success=res["success"],
        metadata_entries=[meta_stats],
    )
    yield Output(res)

def metadata_for_actions(df):
    return [
        EventMetadataEntry.int(int(df["score"].min()), "min score"),
        EventMetadataEntry.int(int(df["score"].max()), "max score"),
        EventMetadataEntry.md(df[:5].to_markdown(), "sample rows"),
    ]

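# Hedged usage sketch: attaching the computed entries to a yielded Output
# inside a solid/op body (the surrounding function is hypothetical, and this
# assumes a Dagster version where Output accepts `metadata_entries`, as the
# `_best_n_actions` snippet above already does):
def _score_actions(_, df):
    yield Output(df, metadata_entries=metadata_for_actions(df))
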