def join_q2_data(
    context,
    april_data,
    may_data,
    june_data,
    master_cord_data,
):
    dfs = {"april": april_data, "may": may_data, "june": june_data}

    # Check that every monthly DataFrame carries the airport sequence IDs
    # that the join below depends on.
    missing_things = []
    for required_column in ["DestAirportSeqID", "OriginAirportSeqID"]:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({"month": month, "missing_column": required_column})

    yield ExpectationResult(
        success=not bool(missing_things),
        label="airport_ids_present",
        description="Sequence IDs present in incoming monthly flight data.",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"missing_columns": missing_things})
        ],
    )

    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        label="flight_data_same_shape",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"columns": april_data.columns})
        ],
    )

    # Union the three months into a single Q2 DataFrame, then downsample it
    # to the configured percentage.
    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config["subsample_pct"] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView("q2_data")

    # Register two prefixed copies of the master cord data so it can be
    # joined once for destinations and once for origins.
    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "DEST_")
    dest_prefixed_master_cord_data.createOrReplaceTempView("dest_cord_data")

    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "ORIGIN_")
    origin_prefixed_master_cord_data.createOrReplaceTempView("origin_cord_data")

    full_data = context.resources.pyspark.spark_session.sql(
        """
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data
            ON q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        """
    )

    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
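# NOTE: `do_prefix_column_names` and `rename_spark_dataframe_columns` are
# referenced above but not defined in this snippet. A minimal sketch of what
# they are assumed to do (rename every column of a PySpark DataFrame):
def rename_spark_dataframe_columns(data_frame, fn):
    # toDF(*names) returns a new DataFrame with the given column names
    return data_frame.toDF(*[fn(c) for c in data_frame.columns])


def do_prefix_column_names(df, prefix):
    # prefix every column so master_cord_data can be joined twice
    # (as DEST_* and as ORIGIN_*) without name collisions
    return rename_spark_dataframe_columns(df, lambda c: "{}{}".format(prefix, c))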
def __init__(self, invalid_line_nos: List[int]):
    check.list_param(invalid_line_nos, "invalid_line_nos", int)
    line_nos_str = ", ".join(map(str, invalid_line_nos))
    description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"
    metadata_entries = [
        MetadataEntry.json({"line_nos": invalid_line_nos}, "Invalid CLI Output Line Numbers")
    ]
    super().__init__(description, metadata_entries)
    self.invalid_line_nos = invalid_line_nos
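# NOTE: a hypothetical usage sketch, not part of the original snippet. The
# class name `DagsterDbtCliUnexpectedOutputError` and the parsing loop are
# assumptions: collect the 1-indexed numbers of any CLI output lines that
# fail to parse as JSON, then raise with exactly those line numbers.
import json


def parse_cli_output_or_raise(raw_lines: List[str]) -> List[dict]:
    logs, invalid_line_nos = [], []
    for line_no, line in enumerate(raw_lines, start=1):
        try:
            logs.append(json.loads(line))
        except json.JSONDecodeError:
            invalid_line_nos.append(line_no)
    if invalid_line_nos:
        raise DagsterDbtCliUnexpectedOutputError(invalid_line_nos)
    return logs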
def df_type_check(_, value):
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            # string cast columns since they may be things like datetime
            MetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )
def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            MetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),
            # string cast columns since they may be things like datetime
            MetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )
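# NOTE: a minimal sketch of how a check function like `df_type_check` is
# assumed to be attached to a type, via Dagster's `DagsterType` API; the type
# name and description here are assumptions, not taken from the original.
from dagster import DagsterType

DataFrameType = DagsterType(
    name="PandasDataFrame",
    description="A pandas DataFrame, with row count and columns as metadata.",
    type_check_fn=df_type_check,
)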
def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):
    metadata_entries = [
        MetadataEntry.json(
            {"logs": logs},
            label="Parsed CLI Output (JSON)",
        ),
        MetadataEntry.text(
            DagsterDbtCliRuntimeError.stitch_messages(logs),
            label="Parsed CLI Output (JSON) Message Attributes",
        ),
        MetadataEntry.text(
            raw_output,
            label="Raw CLI Output",
        ),
    ]
    super().__init__(description, metadata_entries)
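# NOTE: `DagsterDbtCliRuntimeError.stitch_messages` is referenced above but
# not shown. A minimal sketch of what it is assumed to do (it sits inside the
# same class as the `__init__` above): join the `message` attribute of each
# parsed log line into a single text blob.
@staticmethod
def stitch_messages(logs: List[Dict[str, Any]]) -> str:
    return "\n".join(
        log["message"].strip("\n")
        for log in logs
        if isinstance(log.get("message"), str)
    )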
def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry.text("text is cool", "text"),
            MetadataEntry.url("https://bigty.pe/neato", "url"),
            MetadataEntry.fspath("/tmp/awesome", "path"),
            MetadataEntry.json({"is_dope": True}, "json"),
            MetadataEntry.python_artifact(MetadataEntry, "python class"),
            MetadataEntry.python_artifact(file_relative_path, "python function"),
            MetadataEntry.float(1.2, "float"),
            MetadataEntry.int(1, "int"),
            MetadataEntry.float(float("nan"), "float NaN"),
            MetadataEntry.int(LONG_INT, "long int"),
            MetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            MetadataEntry.asset(AssetKey("my_asset"), "my asset"),
            MetadataEntry.table(
                label="table",
                records=[
                    TableRecord(foo=1, bar=2),
                    TableRecord(foo=3, bar=4),
                ],
            ),
            MetadataEntry.table_schema(
                label="table_schema",
                schema=TableSchema(
                    columns=[
                        TableColumn(
                            name="foo",
                            type="integer",
                            constraints=TableColumnConstraints(unique=True),
                        ),
                        TableColumn(name="bar", type="string"),
                    ],
                    constraints=TableConstraints(other=["some constraint"]),
                ),
            ),
        ],
    )
    yield Output(None)
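# NOTE: a sketch of the imports and constants that `materialize` above and
# `backcompat_materialize` below assume. The names are assumed to be top-level
# exports of the dagster version this suite targets, and LONG_INT is an
# assumed stand-in for an integer wider than 32 bits, not the suite's value.
from dagster import (
    AssetKey,
    AssetMaterialization,
    Materialization,
    MetadataEntry,
    Output,
    TableColumn,
    TableColumnConstraints,
    TableConstraints,
    TableRecord,
    TableSchema,
    file_relative_path,
)

LONG_INT = 2**42  # anything outside the signed 32-bit range works here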
def _dagster_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(
            success=False,
            description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                type_name=type(value).__name__
            ),
        )

    # Run whichever validators were supplied, keyed so failures can be
    # reported per constraint group.
    individual_result_dict = {}
    if dataframe_validator is not None:
        individual_result_dict["dataframe"] = dataframe_validator.validate(value)
    if columns_validator is not None:
        individual_result_dict["columns"] = columns_validator.validate(value)
    if columns_aggregate_validator is not None:
        individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(
            value
        )

    typechecks_succeeded = True
    metadata = []
    overall_description = "Failed Constraints: {}"
    constraint_clauses = []
    for key, result in individual_result_dict.items():
        result_val = result.success
        if result_val:
            continue
        typechecks_succeeded = typechecks_succeeded and result_val
        result_dict = result.metadata_entries[0].entry_data.data
        metadata.append(
            MetadataEntry.json(
                result_dict,
                "{}-constraint-metadata".format(key),
            )
        )
        constraint_clauses.append("{} failing constraints, {}".format(key, result.description))

    # sorting by label yields a deterministic order:
    # column-aggregates, then columns, then dataframe
    return TypeCheck(
        success=typechecks_succeeded,
        description=overall_description.format(constraint_clauses),
        metadata_entries=sorted(metadata, key=lambda x: x.label),
    )
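# NOTE: `dataframe_validator`, `columns_validator`, and
# `columns_aggregate_validator` are free variables in `_dagster_type_check`,
# so it is assumed to be a closure produced by a factory along these lines (a
# sketch modeled on dagster-pandas's `create_structured_dataframe_type`, not
# copied from it):
from dagster import DagsterType


def create_structured_dataframe_type(
    name,
    description=None,
    columns_validator=None,
    columns_aggregate_validator=None,
    dataframe_validator=None,
):
    def _dagster_type_check(_, value):
        ...  # body as defined above

    return DagsterType(
        name=name,
        description=description,
        type_check_fn=_dagster_type_check,
    )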
def backcompat_materialize(_):
    yield Materialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry.text("text is cool", "text"),
            MetadataEntry.url("https://bigty.pe/neato", "url"),
            MetadataEntry.fspath("/tmp/awesome", "path"),
            MetadataEntry.json({"is_dope": True}, "json"),
            MetadataEntry.python_artifact(MetadataEntry, "python class"),
            MetadataEntry.python_artifact(file_relative_path, "python function"),
            MetadataEntry.float(1.2, "float"),
            MetadataEntry.int(1, "int"),
            MetadataEntry.float(float("nan"), "float NaN"),
            MetadataEntry.int(LONG_INT, "long int"),
            MetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            MetadataEntry.asset(AssetKey("my_asset"), "my asset"),
        ],
    )
    yield Output(None)