Example No. 1
def join_q2_data(
    context,
    april_data,
    may_data,
    june_data,
    master_cord_data,
):

    dfs = {"april": april_data, "may": may_data, "june": june_data}

    # Verify that both airport sequence ID columns are present in each month's data.
    missing_things = []

    for required_column in ["DestAirportSeqID", "OriginAirportSeqID"]:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({"month": month, "missing_column": required_column})

    yield ExpectationResult(
        success=not bool(missing_things),
        label="airport_ids_present",
        description="Sequence IDs present in incoming monthly flight data.",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"missing_columns": missing_things})
        ],
    )

    # All three months should share the same schema before they are unioned.
    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        label="flight_data_same_shape",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"columns": april_data.columns})
        ],
    )

    # Union the three monthly DataFrames into a single Q2 DataFrame, then
    # subsample it at the configured percentage.
    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config["subsample_pct"] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView("q2_data")

    # Register prefixed copies of the master coordinate data so the same table
    # can be joined once for destination airports and once for origin airports
    # without column-name collisions.
    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "DEST_")
    dest_prefixed_master_cord_data.createOrReplaceTempView("dest_cord_data")

    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "ORIGIN_")
    origin_prefixed_master_cord_data.createOrReplaceTempView("origin_cord_data")

    full_data = context.resources.pyspark.spark_session.sql(
        """
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        """
    )

    # Normalize column names to lowercase on the way out.
    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
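
The two renaming helpers used above are not shown on this page. A minimal sketch of what they might look like, assuming plain PySpark column renames (the real implementations may differ):

def do_prefix_column_names(df, prefix):
    # Sketch (assumption): prepend the given prefix to every column name.
    return df.toDF(*["{}{}".format(prefix, c) for c in df.columns])

def rename_spark_dataframe_columns(df, fn):
    # Sketch (assumption): apply fn to every column name.
    return df.toDF(*[fn(c) for c in df.columns])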
Example No. 2
def __init__(self, invalid_line_nos: List[int]):
    check.list_param(invalid_line_nos, "invalid_line_nos", int)
    line_nos_str = ", ".join(map(str, invalid_line_nos))
    description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"
    metadata_entries = [
        MetadataEntry.json({"line_nos": invalid_line_nos}, "Invalid CLI Output Line Numbers")
    ]
    super().__init__(description, metadata_entries)
    self.invalid_line_nos = invalid_line_nos
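
For context, a hypothetical caller: this constructor belongs to an error raised when dbt CLI output contains lines that are not valid JSON. Both the caller below and the class name are assumptions, not shown in the snippet:

import json

def check_cli_output(raw_lines):
    # Hypothetical: collect the line numbers that fail to parse as JSON,
    # then raise the error whose constructor is shown above.
    invalid_line_nos = []
    for i, line in enumerate(raw_lines):
        try:
            json.loads(line)
        except json.JSONDecodeError:
            invalid_line_nos.append(i)
    if invalid_line_nos:
        raise DagsterDbtCliUnexpectedOutputError(invalid_line_nos)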
Example No. 3
def df_type_check(_, value):
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            # string cast columns since they may be things like datetime
            MetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )
Example No. 4
def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            MetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),
            # string cast columns since they may be things like datetime
            MetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )
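
Examples No. 3 and No. 4 follow the same pattern: a type check function that can be wrapped in a DagsterType and attached to solid/op inputs and outputs. A minimal sketch of the registration (the type name here is arbitrary):

from dagster import DagsterType

# Wrap the check function above so the framework runs it on every value
# annotated with this type.
PandasDataFrameType = DagsterType(
    name="PandasDataFrameType",
    type_check_fn=df_type_check,
    description="A pandas DataFrame, with row count and columns reported as metadata.",
)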
Example No. 5
def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):
    metadata_entries = [
        MetadataEntry.json(
            {"logs": logs},
            label="Parsed CLI Output (JSON)",
        ),
        MetadataEntry.text(
            DagsterDbtCliRuntimeError.stitch_messages(logs),
            label="Parsed CLI Output (JSON) Message Attributes",
        ),
        MetadataEntry.text(
            raw_output,
            label="Raw CLI Output",
        ),
    ]
    super().__init__(description, metadata_entries)
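
Like Example No. 2, this constructor forwards description and metadata_entries to its parent. A plausible base, assuming both dbt errors build on dagster's Failure (this hierarchy is an assumption, not shown in the snippets):

from dagster import Failure

class DagsterDbtError(Failure):
    # Assumed base class: Failure already accepts (description, metadata_entries),
    # so subclasses only need to assemble the entries before delegating.
    pass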
Example No. 6
def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry.text("text is cool", "text"),
            MetadataEntry.url("https://bigty.pe/neato", "url"),
            MetadataEntry.fspath("/tmp/awesome", "path"),
            MetadataEntry.json({"is_dope": True}, "json"),
            MetadataEntry.python_artifact(MetadataEntry, "python class"),
            MetadataEntry.python_artifact(file_relative_path, "python function"),
            MetadataEntry.float(1.2, "float"),
            MetadataEntry.int(1, "int"),
            MetadataEntry.float(float("nan"), "float NaN"),
            MetadataEntry.int(LONG_INT, "long int"),
            MetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            MetadataEntry.asset(AssetKey("my_asset"), "my asset"),
            MetadataEntry.table(
                label="table",
                records=[
                    TableRecord(foo=1, bar=2),
                    TableRecord(foo=3, bar=4),
                ],
            ),
            MetadataEntry.table_schema(
                label="table_schema",
                schema=TableSchema(
                    columns=[
                        TableColumn(
                            name="foo",
                            type="integer",
                            constraints=TableColumnConstraints(unique=True),
                        ),
                        TableColumn(name="bar", type="string"),
                    ],
                    constraints=TableConstraints(other=["some constraint"]),
                ),
            ),
        ],
    )
    yield Output(None)
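
Two names in this snippet come from elsewhere in the source module. Plausible stand-ins so the example runs on its own (the exact values are assumptions):

from dagster.utils import file_relative_path  # real helper; used above only as a sample artifact

LONG_INT = 2 ** 40  # stand-in: any integer too wide for 32 bits exercises the "long int" entry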
Example No. 7
    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                    type_name=type(value).__name__
                ),
            )
        individual_result_dict = {}

        if dataframe_validator is not None:
            individual_result_dict["dataframe"] = dataframe_validator.validate(value)
        if columns_validator is not None:
            individual_result_dict["columns"] = columns_validator.validate(value)

        if columns_aggregate_validator is not None:
            individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(
                value
            )

        typechecks_succeeded = True
        metadata = []
        overall_description = "Failed Constraints: {}"
        constraint_clauses = []
        for key, result in individual_result_dict.items():
            if result.success:
                continue
            # Only failing validation results reach this point.
            typechecks_succeeded = False
            result_dict = result.metadata_entries[0].entry_data.data
            metadata.append(
                MetadataEntry.json(
                    result_dict,
                    "{}-constraint-metadata".format(key),
                )
            )
            constraint_clauses.append("{} failing constraints, {}".format(key, result.description))
        # returns aggregates, then column, then dataframe
        return TypeCheck(
            success=typechecks_succeeded,
            description=overall_description.format("; ".join(constraint_clauses)),
            metadata_entries=sorted(metadata, key=lambda x: x.label),
        )
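
_dagster_type_check closes over three optional validators, so it is presumably nested inside a factory. A sketch of what that enclosing factory might look like, with the check body elided (the factory name and signature are assumptions):

from dagster import DagsterType, TypeCheck

def create_structured_dataframe_type(
    name,
    description=None,
    columns_validator=None,
    columns_aggregate_validator=None,
    dataframe_validator=None,
):
    def _dagster_type_check(_, value):
        # Body as shown above; this sketch trivially succeeds.
        return TypeCheck(success=True)

    return DagsterType(
        name=name,
        type_check_fn=_dagster_type_check,
        description=description,
    )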
Example No. 8
def backcompat_materialize(_):
    yield Materialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry.text("text is cool", "text"),
            MetadataEntry.url("https://bigty.pe/neato", "url"),
            MetadataEntry.fspath("/tmp/awesome", "path"),
            MetadataEntry.json({"is_dope": True}, "json"),
            MetadataEntry.python_artifact(MetadataEntry, "python class"),
            MetadataEntry.python_artifact(file_relative_path, "python function"),
            MetadataEntry.float(1.2, "float"),
            MetadataEntry.int(1, "int"),
            MetadataEntry.float(float("nan"), "float NaN"),
            MetadataEntry.int(LONG_INT, "long int"),
            MetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            MetadataEntry.asset(AssetKey("my_asset"), "my asset"),
        ],
    )
    yield Output(None)