def _id_range_for_time(start, end, hn_client):
    start = datetime.timestamp(
        datetime.strptime(start, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
    )
    end = datetime.timestamp(
        datetime.strptime(end, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
    )

    def _get_item_timestamp(item_id):
        item = hn_client.fetch_item_by_id(item_id)
        return item["time"]

    max_item_id = hn_client.fetch_max_item_id()

    # declared by resource to allow testability against snapshot
    min_item_id = hn_client.min_item_id()

    start_id = binary_search_nearest_left(_get_item_timestamp, min_item_id, max_item_id, start)
    end_id = binary_search_nearest_right(_get_item_timestamp, min_item_id, max_item_id, end)

    start_timestamp = str(datetime.fromtimestamp(_get_item_timestamp(start_id), tz=timezone.utc))
    end_timestamp = str(datetime.fromtimestamp(_get_item_timestamp(end_id), tz=timezone.utc))

    metadata_entries = [
        EventMetadataEntry.int(value=max_item_id, label="max_item_id"),
        EventMetadataEntry.int(value=start_id, label="start_id"),
        EventMetadataEntry.int(value=end_id, label="end_id"),
        EventMetadataEntry.int(value=end_id - start_id, label="items"),
        EventMetadataEntry.text(text=start_timestamp, label="start_timestamp"),
        EventMetadataEntry.text(text=end_timestamp, label="end_timestamp"),
    ]

    id_range = (start_id, end_id)
    return id_range, metadata_entries
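# The function above relies on binary_search_nearest_left / binary_search_nearest_right,
# which are not shown here. Below is a minimal sketch of what such helpers could look
# like (an assumption for illustration, not the project's actual implementation),
# assuming get_value(item_id) is non-decreasing in item_id:
def binary_search_nearest_left(get_value, start, end, target):
    # smallest id in [start, end] whose value is >= target
    while start < end:
        mid = (start + end) // 2
        if get_value(mid) < target:
            start = mid + 1
        else:
            end = mid
    return start


def binary_search_nearest_right(get_value, start, end, target):
    # largest id in [start, end] whose value is <= target
    while start < end:
        mid = (start + end + 1) // 2
        if get_value(mid) > target:
            end = mid - 1
        else:
            start = mid
    return start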
def handle_output(self, context, obj):
    table_name = context.name
    write_dataframe_to_table(name=table_name, dataframe=obj)

    # attach these to the Handled Output event
    yield EventMetadataEntry.int(len(obj), label="number of rows")
    yield EventMetadataEntry.text(table_name, label="table name")
def handle_output(self, context, obj): file_path = os.path.join("my_base_dir", context.step_key, context.name) obj.to_csv(file_path) yield EventMetadataEntry.int(obj.shape[0], label="number of rows") yield EventMetadataEntry.float(obj["some_column"].mean(), "some_column mean")
def materialize(_): yield AssetMaterialization( asset_key="all_types", description="a materialization with all metadata types", metadata_entries=[ EventMetadataEntry.text("text is cool", "text"), EventMetadataEntry.url("https://bigty.pe/neato", "url"), EventMetadataEntry.fspath("/tmp/awesome", "path"), EventMetadataEntry.json({"is_dope": True}, "json"), EventMetadataEntry.python_artifact(EventMetadataEntry, "python class"), EventMetadataEntry.python_artifact(file_relative_path, "python function"), EventMetadataEntry.float(1.2, "float"), EventMetadataEntry.int(1, "int"), EventMetadataEntry.float(float("nan"), "float NaN"), EventMetadataEntry.int(LONG_INT, "long int"), EventMetadataEntry.pipeline_run("fake_run_id", "pipeline run"), EventMetadataEntry.asset(AssetKey("my_asset"), "my asset"), EventMetadataEntry.table( label="table", records=[ TableRecord(foo=1, bar=2), TableRecord(foo=3, bar=4), ], ), EventMetadataEntry.table_schema( label="table_schema", schema=TableSchema( columns=[ TableColumn( name="foo", type="integer", constraints=TableColumnConstraints( unique=True), ), TableColumn(name="bar", type="string"), ], constraints=TableConstraints(other=["some constraint" ], ), ), ), ], ) yield Output(None)
def handle_output(self, context, obj): key = context.metadata["key"] bucket = context.resource_config["bucket"] context.log.debug("about to pickle object") pickled_obj = pickle.dumps(obj) yield EventMetadataEntry.int(len(pickled_obj), "Bytes") client = s3_client() context.log.debug("created S3 client") client.put_object(Bucket=bucket, Key=key, Body=pickled_obj)
def materialize_one(_):
    yield AssetMaterialization(
        asset_key=asset_key,
        metadata_entries=[
            EventMetadataEntry.text("hello", "text"),
            EventMetadataEntry.json({"hello": "world"}, "json"),
            EventMetadataEntry.float(1.0, "one_float"),
            EventMetadataEntry.int(1, "one_int"),
        ],
    )
    yield Output(1)
def materialize(_): yield AssetMaterialization( asset_key="all_types", description="a materialization with all metadata types", metadata_entries=[ EventMetadataEntry.text("text is cool", "text"), EventMetadataEntry.url("https://bigty.pe/neato", "url"), EventMetadataEntry.fspath("/tmp/awesome", "path"), EventMetadataEntry.json({"is_dope": True}, "json"), EventMetadataEntry.python_artifact(EventMetadataEntry, "python class"), EventMetadataEntry.python_artifact(file_relative_path, "python function"), EventMetadataEntry.float(1.2, "float"), EventMetadataEntry.int(1, "int"), EventMetadataEntry.float(float("nan"), "float NaN"), EventMetadataEntry.int(LONG_INT, "long int"), ], ) yield Output(None)
def my_metadata_output(context):
    df = get_some_data()
    yield Output(
        df,
        metadata_entries=[
            EventMetadataEntry.text("Text-based metadata for this event", label="text_metadata"),
            EventMetadataEntry.url("http://mycoolsite.com/url_for_my_data", label="dashboard_url"),
            EventMetadataEntry.int(len(df), "row count"),
            EventMetadataEntry.float(calculate_bytes(df), "size (bytes)"),
        ],
    )
def my_metadata_expectation_solid(context, df):
    df = do_some_transform(df)
    yield ExpectationResult(
        success=len(df) > 0,
        description="ensure dataframe has rows",
        metadata_entries=[
            EventMetadataEntry.text("Text-based metadata for this event", label="text_metadata"),
            EventMetadataEntry.url("http://mycoolsite.com/url_for_my_data", label="dashboard_url"),
            EventMetadataEntry.int(len(df), "row count"),
            EventMetadataEntry.float(calculate_bytes(df), "size (bytes)"),
        ],
    )
    yield Output(df)
def handle_output(self, context, obj): file_path = os.path.join(["my_base_dir", context.step_key, context.output_name]) obj.to_csv(file_path) yield AssetMaterialization( asset_key=AssetKey(file_path), description="Persisted result to storage.", metadata_entries=[ EventMetadataEntry.int(obj.shape[0], label="number of rows"), EventMetadataEntry.float(obj["some_column"].mean(), "some_column mean"), ], )
def positive_num_check(_, value):
    # return a failed TypeCheck with metadata for non-positive values, True otherwise
    if value <= 0:
        return TypeCheck(
            success=False,
            description=(
                "Numbers cannot be 0 or negative, got {value} for PositiveNumber type"
            ).format(value=value),
            metadata_entries=[EventMetadataEntry.int(value, "The input number")],
        )
    else:
        return True
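# Hypothetical usage sketch (not part of the original snippet): a check function like
# positive_num_check is typically attached to a DagsterType via type_check_fn, so that
# the returned TypeCheck and its metadata entries appear on the type-check event.
from dagster import DagsterType

PositiveNumber = DagsterType(name="PositiveNumber", type_check_fn=positive_num_check)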
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    path = self._get_path(context)
    if isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        obj.to_parquet(path=path)
    elif isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")

    yield EventMetadataEntry.int(value=row_count, label="row_count")
    yield EventMetadataEntry.path(path=path, label="path")
def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
    from snowflake import connector  # pylint: disable=no-name-in-module

    yield EventMetadataEntry.int(obj.shape[0], "Rows")
    yield EventMetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

    connector.paramstyle = "pyformat"
    with connect_snowflake(config=self._config, schema=schema) as con:
        with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
        with_uppercase_cols.to_sql(
            table,
            con=con,
            if_exists="append",
            index=False,
            method=pd_writer,
        )
def _handle_dataframe_output(self, context: OutputContext, obj: DataFrame):
    from snowflake import connector  # pylint: disable=no-name-in-module

    yield EventMetadataEntry.int(obj.shape[0], "Rows")
    yield EventMetadataEntry.md(columns_to_markdown(obj), "DataFrame columns")

    connector.paramstyle = "pyformat"
    schema, table = context.metadata["table"].split(".")
    with connect_snowflake(config=context.resource_config, schema=schema) as con:
        with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
        with_uppercase_cols.to_sql(
            table,
            con=con,
            if_exists="replace",
            index=False,
            method=pd_writer,
        )
def many_table_materializations(_context): with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), "r") as f: md_str = f.read() for table in raw_tables: yield AssetMaterialization( asset_key="table_info", metadata_entries=[ EventMetadataEntry.text(text=table, label="table_name"), EventMetadataEntry.fspath(path="/path/to/{}".format(table), label="table_path"), EventMetadataEntry.json(data={"name": table}, label="table_data"), EventMetadataEntry.url( url="https://bigty.pe/{}".format(table), label="table_name_big"), EventMetadataEntry.md(md_str=md_str, label="table_blurb"), EventMetadataEntry.int(29119888133298982934829348, label="big_int"), EventMetadataEntry.float(float("nan"), label="float_nan"), ], )
def _get_metadata(self, result: Dict[str, Any]) -> List[EventMetadataEntry]:
    """
    Here, we run queries against our output Snowflake database tables to add
    additional context to our asset materializations.
    """
    table_name = result["unique_id"].split(".")[-1]
    with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
        n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
        sample_rows = pandas.read_sql_query(
            f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
        )
    return super()._get_metadata(result) + [
        EventMetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
        EventMetadataEntry.md(
            sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"
        ),
    ]
def metadata_for_actions(df):
    return [
        EventMetadataEntry.int(int(df["score"].min()), "min score"),
        EventMetadataEntry.int(int(df["score"].max()), "max score"),
        EventMetadataEntry.md(df[:5].to_markdown(), "sample rows"),
    ]
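# Hypothetical usage sketch (the asset key and surrounding function are assumptions):
# the helper above returns a list that can be passed directly as metadata_entries on
# an event such as AssetMaterialization.
def store_actions(context, df):
    yield AssetMaterialization(
        asset_key="comment_actions",
        metadata_entries=metadata_for_actions(df),
    )
    yield Output(df)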
def handle_output(self, context, obj):
    super().handle_output(context, obj)
    # can pretend this actually came from a library call
    yield EventMetadataEntry.int(len(obj), "num rows written to db")