def _ge_validation_fn(context, dataset):
    """Validate ``dataset`` with a Great Expectations validator.

    Obtains a validator from the ``ge_data_context`` resource, runs it, and
    renders the validation results to markdown.

    Yields:
        An ``ExpectationResult`` whose ``success`` mirrors the validation
        outcome and whose metadata carries the rendered markdown report,
        followed by an ``Output`` wrapping ``results.to_json_dict()``.
    """
    ge_context = context.resources.ge_data_context

    validator = ge_context.get_validator(
        **{
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            # NOTE(review): data_asset_name is only used when datasource_name
            # is falsy -- confirm this fallback order is intended.
            "data_asset_name": datasource_name or data_asset_name,
            # The dataset is passed under the configured runtime key.
            "runtime_parameters": {runtime_method_type: dataset},
            "batch_identifiers": batch_identifiers,
            "expectation_suite_name": suite_name,
            **extra_kwargs,
        }
    )

    validation_results = validator.validate(
        run_id={
            "run_name": datasource_name + " run",
            "run_time": datetime.datetime.utcnow(),
        }
    )

    # Render the results into a single markdown string for the metadata entry.
    renderer = ValidationResultsPageRenderer(run_info_at_end=True)
    rendered_content = renderer.render(validation_results=validation_results)
    markdown_report = "".join(DefaultMarkdownPageView().render(rendered_content))

    report_entry = MetadataEntry(
        "Expectation Results",
        value=MetadataValue.md(markdown_report),
    )
    yield ExpectationResult(
        success=bool(validation_results["success"]),
        metadata_entries=[report_entry],
    )
    yield Output(validation_results.to_json_dict())
def asset_metadata(_context, model_info):
    """Build metadata for the table described by ``model_info``.

    Connects to Snowflake with the shared configuration, overriding the schema
    with ``model_info["schema"]``, then reads a five-row sample and the total
    row count of the table named ``model_info["name"]``.

    Returns:
        A dict with a markdown "Data sample" and the table's "Rows" count.
    """
    snowflake_config = dict(SHARED_SNOWFLAKE_CONF, schema=model_info["schema"])

    with connect_snowflake(config=snowflake_config) as connection:
        table_name = model_info["name"]
        sample = pd.read_sql(f"SELECT * FROM {table_name} LIMIT 5", con=connection)
        count_row = connection.execute(
            f"SELECT COUNT(*) FROM {table_name}"
        ).fetchone()

    return {
        "Data sample": MetadataValue.md(sample.to_markdown()),
        "Rows": count_row[0],
    }
def _ge_validation_fn(context, dataset):
    """Validate ``dataset`` through a Great Expectations validation operator.

    Uses ``validation_operator_name`` when one was configured; otherwise an
    action-less ``"ephemeral_validation"`` operator is registered on the data
    context and used instead. The batch is built from ``batch_kwargs`` (or from
    ``dataset`` when no batch kwargs were supplied), always bound to
    ``datasource_name``.

    Yields:
        An ``ExpectationResult`` carrying the success flag and the rendered
        markdown report, followed by an ``Output`` with the first
        JSON-serializable validation result.
    """
    data_context = context.resources.ge_data_context

    if validation_operator_name is not None:
        validation_operator = validation_operator_name
    else:
        data_context.add_validation_operator(
            "ephemeral_validation",
            {"class_name": "ActionListValidationOperator", "action_list": []},
        )
        validation_operator = "ephemeral_validation"

    suite = data_context.get_expectation_suite(suite_name)

    # Copy the configured kwargs: they are shared by every invocation, and
    # writing "datasource" into the shared dict would make later runs emit the
    # warning below even though the user never supplied that field.
    final_batch_kwargs = dict(batch_kwargs) if batch_kwargs else {"dataset": dataset}

    # Check the working copy so a missing (None) batch_kwargs cannot raise.
    if "datasource" in final_batch_kwargs:
        context.log.warning(
            "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "
            f"parameter of the {decorator_name} factory instead."
        )
    final_batch_kwargs["datasource"] = datasource_name

    batch = data_context.get_batch(final_batch_kwargs, suite)
    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        validation_operator,
        assets_to_validate=[batch],
        run_id=run_id,
    )
    # Only one batch was submitted, so only the first result is reported.
    res = convert_to_json_serializable(results.list_validation_results())[0]

    validation_results_page_renderer = ValidationResultsPageRenderer(
        run_info_at_end=True
    )
    rendered_document_content_list = (
        validation_results_page_renderer.render_validation_operator_result(results)
    )
    md_str = " ".join(
        DefaultMarkdownPageView().render(rendered_document_content_list)
    )

    meta_stats = MetadataEntry(
        "Expectation Results",
        value=MetadataValue.md(md_str),
    )
    yield ExpectationResult(
        success=res["success"],
        metadata_entries=[meta_stats],
    )
    yield Output(res)
def build_component_top_stories(model: TruncatedSVD,
                                user_story_matrix: IndexedCooMatrix,
                                story_titles: DataFrame):
    """
    For every component of the collaborative filtering model, look up the
    titles of the stories with the largest weights in that component.

    Yields a single Output holding a DataFrame with one row per
    (component_index, title) pair, plus a markdown rendering as metadata.
    """
    top_k = 10
    titles_by_id = story_titles.set_index("id")

    component_indices = []
    titles = []
    for component_index, weights in enumerate(model.components_):
        # argsort is ascending: take the last top_k entries and reverse them
        # so the highest-weighted story comes first.
        best_columns = weights.argsort()[-top_k:][::-1]
        best_story_ids = user_story_matrix.col_index[best_columns]
        best_titles = list(titles_by_id.loc[best_story_ids]["title"])

        component_indices.extend([component_index] * len(best_titles))
        titles.extend(best_titles)

    component_top_stories = DataFrame(
        {
            "component_index": Series(component_indices),
            "title": Series(titles),
        }
    )

    markdown_summary = MetadataValue.md(
        top_components_to_markdown(component_top_stories)
    )
    yield Output(
        component_top_stories,
        metadata={"Top component top stories": markdown_summary},
    )
def many_table_materializations(_context):
    """Yield one ``AssetMaterialization`` per entry in ``raw_tables``.

    Every materialization uses the asset key ``"table_info"`` and carries a
    mix of metadata value kinds (text, path, dict, URL, markdown, a large
    integer and a NaN float). The markdown blurb is read from the example
    markdown file that sits next to this module.
    """
    markdown_path = file_relative_path(__file__, MARKDOWN_EXAMPLE)
    with open(markdown_path, "r") as markdown_file:
        md_str = markdown_file.read()

    for table in raw_tables:
        table_metadata = {
            "table_name": table,
            "table_path": MetadataValue.path(f"/path/to/{table}"),
            "table_data": {"name": table},
            "table_name_big": MetadataValue.url(f"https://bigty.pe/{table}"),
            "table_blurb": MetadataValue.md(md_str),
            "big_int": 29119888133298982934829348,
            "float_nan": float("nan"),
        }
        yield AssetMaterialization(asset_key="table_info", metadata=table_metadata)
def metadata_for_actions(df):
    """Summarise an actions frame as a metadata dict.

    Returns the minimum and maximum of the ``score`` column as ints, together
    with a markdown rendering of the first five rows.
    """
    scores = df["score"]
    lowest_score = int(scores.min())
    highest_score = int(scores.max())
    preview_markdown = df[:5].to_markdown()

    return {
        "min_score": lowest_score,
        "max_score": highest_score,
        "sample rows": MetadataValue.md(preview_markdown),
    }
def daily_top_action(_, df1, df2):
    """Return the single highest-scoring row across ``df1`` and ``df2``.

    The result is wrapped in an ``Output`` whose metadata holds a markdown
    rendering of that row.
    """
    combined = pd.concat([df1, df2])
    top_row = combined.nlargest(1, "score")
    row_markdown = MetadataValue.md(top_row.to_markdown())
    return Output(top_row, metadata={"data": row_markdown})
def _best_n_actions(_, df):
    """Keep the ``n`` rows of ``df`` with the largest ``score``.

    The selected rows are returned in an ``Output`` whose metadata holds their
    markdown rendering.
    """
    best_rows = df.nlargest(n, "score")
    rows_markdown = MetadataValue.md(best_rows.to_markdown())
    return Output(best_rows, metadata={"data": rows_markdown})