Example No. 1
    def execute(self):
        pipeline = self.recon_pipeline
        with DagsterInstance.from_ref(self.instance_ref) as instance:
            start_termination_thread(self.term_event)
            execution_plan = create_execution_plan(
                pipeline=pipeline,
                run_config=self.run_config,
                mode=self.pipeline_run.mode,
                step_keys_to_execute=[self.step_key],
                known_state=self.known_state,
            )

            yield instance.report_engine_event(
                "Executing step {} in subprocess".format(self.step_key),
                self.pipeline_run,
                EngineEventData(
                    [
                        MetadataEntry.text(str(os.getpid()), "pid"),
                        MetadataEntry.text(self.step_key, "step_key"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                MultiprocessExecutor,
                self.step_key,
            )

            yield from execute_plan_iterator(
                execution_plan,
                pipeline,
                self.pipeline_run,
                run_config=self.run_config,
                retry_mode=self.retry_mode.for_inner_plan(),
                instance=instance,
            )
Example No. 2
def _timing_to_metadata(timings: List[Dict[str, Any]]) -> List[MetadataEntry]:
    metadata = []
    for timing in timings:
        if timing["name"] == "execute":
            desc = "Execution"
        elif timing["name"] == "compile":
            desc = "Compilation"
        else:
            continue

        started_at = dateutil.parser.isoparse(timing["started_at"])
        completed_at = dateutil.parser.isoparse(timing["completed_at"])
        duration = completed_at - started_at
        metadata.extend(
            [
                MetadataEntry.text(
                    text=started_at.isoformat(timespec="seconds"), label=f"{desc} Started At"
                ),
                MetadataEntry.text(
                    text=completed_at.isoformat(timespec="seconds"), label=f"{desc} Completed At"
                ),
                MetadataEntry.float(value=duration.total_seconds(), label=f"{desc} Duration"),
            ]
        )
    return metadata
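A minimal usage sketch for `_timing_to_metadata` above. The timing dict is hypothetical but mirrors the fields the function reads (`name`, `started_at`, `completed_at`); real dbt output contains additional keys.

timings = [
    {
        "name": "execute",
        "started_at": "2022-01-01T00:00:00Z",
        "completed_at": "2022-01-01T00:00:02Z",
    }
]
# Produces three entries: "Execution Started At", "Execution Completed At",
# and "Execution Duration" (2.0 seconds).
entries = _timing_to_metadata(timings)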
Example No. 3
    def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
        schema, table = DB_SCHEMA, context.asset_key.path[-1]

        time_window = context.asset_partitions_time_window if context.has_asset_partitions else None
        with connect_snowflake(config=self._config, schema=schema) as con:
            con.execute(self._get_cleanup_statement(table, schema, time_window))

        if isinstance(obj, SparkDataFrame):
            yield from self._handle_spark_output(obj, schema, table)
        elif isinstance(obj, PandasDataFrame):
            yield from self._handle_pandas_output(obj, schema, table)
        elif obj is None:  # dbt
            config = dict(SHARED_SNOWFLAKE_CONF)
            config["schema"] = DB_SCHEMA
            with connect_snowflake(config=config) as con:
                df = read_sql(f"SELECT * FROM {context.name} LIMIT 5", con=con)
                num_rows = con.execute(f"SELECT COUNT(*) FROM {context.name}").fetchone()[0]

            yield MetadataEntry.md(df.to_markdown(), "Data sample")
            yield MetadataEntry.int(num_rows, "Rows")
        else:
            raise Exception(
                "SnowflakeIOManager only supports pandas DataFrames and spark DataFrames"
            )

        yield MetadataEntry.text(
            self._get_select_statement(
                table,
                schema,
                None,
                time_window,
            ),
            "Query",
        )
Example No. 4
    def handle_output(self, context, obj):
        table_name = context.name
        write_dataframe_to_table(name=table_name, dataframe=obj)

        # attach these to the Handled Output event
        yield MetadataEntry.int(len(obj), label="number of rows")
        yield MetadataEntry.text(table_name, label="table name")
Example No. 5
def join_q2_data(
    context,
    april_data,
    may_data,
    june_data,
    master_cord_data,
):

    dfs = {"april": april_data, "may": may_data, "june": june_data}

    missing_things = []

    for required_column in ["DestAirportSeqID", "OriginAirportSeqID"]:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({"month": month, "missing_column": required_column})

    yield ExpectationResult(
        success=not bool(missing_things),
        label="airport_ids_present",
        description="Sequence IDs present in incoming monthly flight data.",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"missing_columns": missing_things})
        ],
    )

    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        label="flight_data_same_shape",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"columns": april_data.columns})
        ],
    )

    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config["subsample_pct"] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView("q2_data")

    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "DEST_")
    dest_prefixed_master_cord_data.createOrReplaceTempView("dest_cord_data")

    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "ORIGIN_")
    origin_prefixed_master_cord_data.createOrReplaceTempView("origin_cord_data")

    full_data = context.resources.pyspark.spark_session.sql(
        """
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        """
    )

    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
Example No. 6
    def handle_output(self, context, obj):
        file_path = os.path.join("my_base_dir", context.step_key, context.name)

        obj.to_csv(file_path)

        yield MetadataEntry.int(obj.shape[0], label="number of rows")
        yield MetadataEntry.float(obj["some_column"].mean(),
                                  "some_column mean")
Example No. 7
def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            MetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),
            # string cast columns since they may be things like datetime
            MetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )
Example No. 8
def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            MetadataEntry("row_count", value=str(len(value))),
            # string cast columns since they may be things like datetime
            MetadataEntry("metadata", value={"columns": list(map(str, value.columns))}),
        ],
    )
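A short sketch, under the assumption that `df_type_check` above is meant to back a custom Dagster type, showing how such a type-check function is typically attached to a `DagsterType` (the type name is illustrative; Example No. 16 below uses the same pattern):

from dagster import DagsterType

# Hypothetical wrapper type; df_type_check is the function defined above.
PandasDataFrameType = DagsterType(
    name="PandasDataFrameType",
    type_check_fn=df_type_check,
    description="A pandas DataFrame, with row count and column names attached as metadata.",
)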
Example No. 9
def _node_result_to_metadata(node_result: Dict[str, Any]) -> List[MetadataEntry]:
    return [
        MetadataEntry.text(
            text=node_result["config"]["materialized"],
            label="Materialization Strategy",
        ),
        MetadataEntry.text(text=node_result["database"], label="Database"),
        MetadataEntry.text(text=node_result["schema"], label="Schema"),
        MetadataEntry.text(text=node_result["alias"], label="Alias"),
        MetadataEntry.text(text=node_result["description"], label="Description"),
    ]
Example No. 10
    def _launch_k8s_job_with_args(self, job_name, args, run):
        container_context = self.get_container_context_for_run(run)

        pod_name = job_name

        pipeline_origin = run.pipeline_code_origin
        user_defined_k8s_config = get_user_defined_k8s_config(
            frozentags(run.tags))
        repository_origin = pipeline_origin.repository_origin

        job_config = container_context.get_k8s_job_config(
            job_image=repository_origin.container_image, run_launcher=self)

        self._instance.add_run_tags(
            run.run_id,
            {DOCKER_IMAGE_TAG: job_config.job_image},
        )

        job = construct_dagster_k8s_job(
            job_config=job_config,
            args=args,
            job_name=job_name,
            pod_name=pod_name,
            component="run_worker",
            user_defined_k8s_config=user_defined_k8s_config,
            labels={
                "dagster/job": pipeline_origin.pipeline_name,
                "dagster/run-id": run.run_id,
            },
        )

        self._instance.report_engine_event(
            "Creating Kubernetes run worker job",
            run,
            EngineEventData([
                MetadataEntry("Kubernetes Job name", value=job_name),
                MetadataEntry("Kubernetes Namespace",
                              value=container_context.namespace),
                MetadataEntry("Run ID", value=run.run_id),
            ]),
            cls=self.__class__,
        )

        self._batch_api.create_namespaced_job(
            body=job, namespace=container_context.namespace)
        self._instance.report_engine_event(
            "Kubernetes run worker job created",
            run,
            cls=self.__class__,
        )
Example No. 11
def result_to_materialization(
    result: Dict[str, Any], asset_key_prefix: List[str] = None, docs_url: str = None
) -> Optional[AssetMaterialization]:
    """
    This is a hacky solution that attempts to consolidate parsing of the many potential formats
    that dbt can provide its results in. It is known to work for CLI output from dbt 0.18+ and for
    RPC responses from a similar time period, but because the RPC response schema is neither
    documented nor enforced, this can easily become out of date.
    """

    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    # status comes from set of fields rather than "status"
    if "fail" in result:
        success = not result.get("fail") and not result.get("skip") and not result.get("error")
    else:
        success = result["status"] == "success"

    if not success:
        return None

    # all versions represent timing the same way
    metadata = [
        MetadataEntry.float(value=result["execution_time"], label="Execution Time (seconds)")
    ] + _timing_to_metadata(result["timing"])

    # working with a response that contains the node block (RPC and CLI 0.18.x)
    if "node" in result:

        unique_id = result["node"]["unique_id"]
        metadata += _node_result_to_metadata(result["node"])
    else:
        unique_id = result["unique_id"]

    id_prefix = unique_id.split(".")

    # only generate materializations for models
    if id_prefix[0] != "model":
        return None

    if docs_url:
        metadata = [
            MetadataEntry.url(url=f"{docs_url}#!/model/{unique_id}", label="docs_url")
        ] + metadata

    return AssetMaterialization(
        description=f"dbt node: {unique_id}",
        metadata_entries=metadata,
        asset_key=asset_key_prefix + id_prefix,
    )
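A hypothetical invocation of `result_to_materialization` above. The result dict contains only the fields the function reads; a real dbt result has many more keys.

sample_result = {
    "status": "success",
    "execution_time": 1.42,
    "timing": [
        {
            "name": "execute",
            "started_at": "2022-01-01T00:00:00Z",
            "completed_at": "2022-01-01T00:00:01Z",
        }
    ],
    "unique_id": "model.my_project.my_model",
}

# Returns an AssetMaterialization whose asset key is ["dbt", "model", "my_project", "my_model"]
# and whose metadata includes the execution time plus the timing entries.
materialization = result_to_materialization(sample_result, asset_key_prefix=["dbt"])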
Example No. 12
def load_data_to_database_from_spark(context, data_frame):
    context.resources.db_info.load_table(data_frame, context.solid_config["table_name"])

    table_name = context.solid_config["table_name"]
    yield AssetMaterialization(
        asset_key="table:{table_name}".format(table_name=table_name),
        description=(
            "Persisted table {table_name} in database configured in the db_info resource."
        ).format(table_name=table_name),
        metadata_entries=[
            MetadataEntry.text(label="Host", text=context.resources.db_info.host),
            MetadataEntry.text(label="Db", text=context.resources.db_info.db_name),
        ],
    )
    yield Output(value=table_name, output_name="table_name")
Example No. 13
    def handle_output(
        self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
    ):

        path = self._get_path(context)
        if isinstance(obj, pandas.DataFrame):
            row_count = len(obj)
            obj.to_parquet(path=path, index=False)
        elif isinstance(obj, pyspark.sql.DataFrame):
            row_count = obj.count()
            obj.write.parquet(path=path, mode="overwrite")
        else:
            raise Exception(f"Outputs of type {type(obj)} not supported.")
        yield MetadataEntry.int(value=row_count, label="row_count")
        yield MetadataEntry.path(path=path, label="path")
Example No. 14
def raise_for_rpc_error(context: SolidExecutionContext,
                        resp: Response) -> None:
    error = resp.json().get("error")
    if error is not None:
        if error["code"] in [
                DBTErrors.project_currently_compiling_error.value,
                DBTErrors.runtime_error.value,
                DBTErrors.server_error.value,
        ]:
            context.log.warning(error["message"])
            raise RetryRequested(max_retries=5, seconds_to_wait=30)
        elif error["code"] == DBTErrors.project_compile_failure_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]),
                                       label="RPC Error Code"),
                    MetadataEntry.text(text=error["data"]["cause"]["message"],
                                       label="RPC Error Cause"),
                ],
            )
        elif error["code"] == DBTErrors.rpc_process_killed_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]),
                                       label="RPC Error Code"),
                    MetadataEntry.text(text=str(error["data"]["signum"]),
                                       label="RPC Signum"),
                    MetadataEntry.text(text=error["data"]["message"],
                                       label="RPC Error Message"),
                ],
            )
        elif error["code"] == DBTErrors.rpc_timeout_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]),
                                       label="RPC Error Code"),
                    MetadataEntry.text(text=str(error["data"]["timeout"]),
                                       label="RPC Timeout"),
                    MetadataEntry.text(text=error["data"]["message"],
                                       label="RPC Error Message"),
                ],
            )
        else:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]),
                                       label="RPC Error Code"),
                ],
            )
Example No. 15
    def handle_output(self, context: OutputContext,
                      obj: Union[PandasDataFrame, SparkDataFrame]):
        schema, table = context.metadata["table"].split(".")

        partition_bounds = (context.resources.partition_bounds
                            if context.metadata.get("partitioned") is True else
                            None)
        with connect_snowflake(config=self._config, schema=schema) as con:
            con.execute(
                self._get_cleanup_statement(table, schema, partition_bounds))

        if isinstance(obj, SparkDataFrame):
            yield from self._handle_spark_output(obj, schema, table)
        elif isinstance(obj, PandasDataFrame):
            yield from self._handle_pandas_output(obj, schema, table)
        else:
            raise Exception(
                "SnowflakeIOManager only supports pandas DataFrames and spark DataFrames"
            )

        yield MetadataEntry.text(
            self._get_select_statement(table, schema,
                                       context.metadata.get("columns"),
                                       partition_bounds),
            "Query",
        )
Example No. 16
def test_raise_on_error_true_type_check_returns_unsuccessful_type_check():
    FalsyType = DagsterType(
        name="FalsyType",
        type_check_fn=lambda _, _val: TypeCheck(
            success=False,
            metadata_entries=[MetadataEntry.text("foo", "bar", "baz")]),
    )

    @solid(output_defs=[OutputDefinition(FalsyType)])
    def foo_solid(_):
        return 1

    @pipeline
    def foo_pipeline():
        foo_solid()

    with pytest.raises(DagsterTypeCheckDidNotPass) as e:
        execute_pipeline(foo_pipeline)
    assert e.value.metadata_entries[0].label == "bar"
    assert e.value.metadata_entries[0].entry_data.text == "foo"
    assert e.value.metadata_entries[0].description == "baz"
    assert isinstance(e.value.dagster_type, DagsterType)

    pipeline_result = execute_pipeline(foo_pipeline, raise_on_error=False)
    assert not pipeline_result.success
    assert [
        event.event_type_value for event in pipeline_result.step_event_list
    ] == [
        DagsterEventType.STEP_START.value,
        DagsterEventType.STEP_OUTPUT.value,
        DagsterEventType.STEP_FAILURE.value,
    ]
    for event in pipeline_result.step_event_list:
        if event.event_type_value == DagsterEventType.STEP_FAILURE.value:
            assert event.event_specific_data.error.cls_name == "DagsterTypeCheckDidNotPass"
Example No. 17
    def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):
        metadata_entries = [
            MetadataEntry.json(
                {"logs": logs},
                label="Parsed CLI Output (JSON)",
            ),
            MetadataEntry.text(
                DagsterDbtCliRuntimeError.stitch_messages(logs),
                label="Parsed CLI Output (JSON) Message Attributes",
            ),
            MetadataEntry.text(
                raw_output,
                label="Raw CLI Output",
            ),
        ]
        super().__init__(description, metadata_entries)
Example No. 18
    def _get_metadata(self, result: Dict[str, Any]) -> List[MetadataEntry]:
        """
        Here, we run queries against our output Snowflake database tables to add additional context
        to our asset materializations.
        """

        table_name = result["unique_id"].split(".")[-1]
        with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
            n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
            sample_rows = pandas.read_sql_query(
                f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
            )
        return super()._get_metadata(result) + [
            MetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
            MetadataEntry.md(sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"),
        ]
Example No. 19
    def handle_output(self, context, obj: pd.DataFrame):
        """This saves the dataframe as a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        obj.to_csv(fpath)
        with open(fpath + ".version", "w") as f:
            f.write(context.version if context.version else "None")

        yield MetadataEntry.int(obj.shape[0], "Rows")
        yield MetadataEntry.path(fpath, "Path")
        yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
        yield MetadataEntry.text(context.version, "Resolved version")
        yield MetadataEntry.table_schema(
            self.get_schema(context.dagster_type),
            "Schema",
        )
Example No. 20
    def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
        from snowflake import connector  # pylint: disable=no-name-in-module

        yield MetadataEntry.int(obj.shape[0], "Rows")
        yield MetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

        connector.paramstyle = "pyformat"
        with connect_snowflake(config=self._config, schema=schema) as con:
            with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
            with_uppercase_cols.to_sql(
                table,
                con=con,
                if_exists="append",
                index=False,
                method=pd_writer,
            )
Example No. 21
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    target_key = context.solid_config.get("file_key", s3_coordinate["key"].split("/")[-1])

    file_cache = context.resources.file_cache

    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(
                Bucket=s3_coordinate["bucket"], Key=s3_coordinate["key"], Filename=tmp_file
            )

            context.log.info("File downloaded to {}".format(tmp_file))

            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info("File handle written at : {}".format(target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[MetadataEntry.path(path=target_file_handle.path_desc, label=target_key)],
    )
    yield Output(target_file_handle)
Example No. 22
def test_explicit_failure():
    with tempfile.TemporaryDirectory() as tmpdir:
        run_config = {
            "resources": {
                "step_launcher": {
                    "config": {
                        "scratch_dir": tmpdir
                    }
                },
                "io_manager": {
                    "config": {
                        "base_dir": tmpdir
                    }
                },
            }
        }
        with instance_for_test() as instance:
            run = execute_pipeline(
                pipeline=reconstructable(_define_failure_job),
                run_config=run_config,
                instance=instance,
                raise_on_error=False,
            )
            fd = run.result_for_solid("retry_op").failure_data
            assert fd.user_failure_data.description == "some failure description"
            assert fd.user_failure_data.metadata_entries == [
                MetadataEntry.float(label="foo", value=1.23)
            ]
Example No. 23
    def _ge_validation_fn(context, dataset):
        data_context = context.resources.ge_data_context
        validator_kwargs = {
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            "data_asset_name": datasource_name or data_asset_name,
            "runtime_parameters": {
                runtime_method_type: dataset
            },
            "batch_identifiers": batch_identifiers,
            "expectation_suite_name": suite_name,
            **extra_kwargs,
        }
        validator = data_context.get_validator(**validator_kwargs)

        run_id = {
            "run_name": datasource_name + " run",
            "run_time": datetime.datetime.utcnow(),
        }
        results = validator.validate(run_id=run_id)

        validation_results_page_renderer = ValidationResultsPageRenderer(
            run_info_at_end=True)
        rendered_document_content_list = validation_results_page_renderer.render(
            validation_results=results)
        md_str = "".join(
            DefaultMarkdownPageView().render(rendered_document_content_list))

        meta_stats = MetadataEntry("Expectation Results",
                                   value=MetadataValue.md(md_str))
        yield ExpectationResult(
            success=bool(results["success"]),
            metadata_entries=[meta_stats],
        )
        yield Output(results.to_json_dict())
Example No. 24
def pandera_schema_to_dagster_type(
    schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],
) -> DagsterType:
    """
    Convert a Pandera dataframe schema to a `DagsterType`.

    The generated Dagster type will be given an automatically generated `name`. The schema's `title`
    property, `name` property, or class name (in that order) will be used. If neither `title` nor
    `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.

    Additional metadata is also extracted from the Pandera schema and attached to the returned
    `DagsterType` in a `MetadataEntry` object. The extracted metadata includes:

    - Descriptions on the schema and constituent columns and checks.
    - Data types for each column.
    - String representations of all column-wise checks.
    - String representations of all row-wise (i.e. "wide") checks.

    The returned `DagsterType` type will call the Pandera schema's `validate()` method in its type
    check function. Validation is done in `lazy` mode, i.e. pandera will attempt to validate all
    values in the dataframe, rather than stopping on the first error.

    If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:

    - `num_failures` total number of validation errors.
    - `failure_sample` a table containing up to the first 10 validation errors.

    Args:
        schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]): The pandera schema to convert.

    Returns:
        DagsterType: Dagster Type constructed from the Pandera schema.

    """
    if not (
        isinstance(schema, pa.DataFrameSchema)
        or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))
    ):
        raise TypeError(
            "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"
        )

    name = _extract_name_from_pandera_schema(schema)
    norm_schema = (
        schema.to_schema()  # type: ignore[attr-defined]
        if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)
        else schema
    )
    tschema = _pandera_schema_to_table_schema(norm_schema)
    type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)

    return DagsterType(
        type_check_fn=type_check_fn,
        name=name,
        description=norm_schema.description,
        metadata_entries=[
            MetadataEntry("schema", value=tschema),
        ],
    )
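A minimal sketch of calling `pandera_schema_to_dagster_type` with a hand-built pandera schema; the schema, its name, and its columns are illustrative only.

import pandera as pa

trips_schema = pa.DataFrameSchema(
    {
        "trip_id": pa.Column(int, checks=pa.Check.ge(0)),
        "distance_km": pa.Column(float, checks=pa.Check.gt(0.0)),
    },
    name="TripsSchema",
)

# The returned DagsterType validates dataframes against trips_schema in lazy mode,
# as described in the docstring above.
TripsDagsterType = pandera_schema_to_dagster_type(trips_schema)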
Example No. 25
    def handle_output(self, context: OutputContext,
                      obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]):
        path = self._get_path(context)
        if "://" not in self._base_path:
            os.makedirs(os.path.dirname(path), exist_ok=True)

        if isinstance(obj, pandas.DataFrame):
            row_count = len(obj)
            context.log.info(f"Row count: {row_count}")
            obj.to_parquet(path=path, index=False)
        elif isinstance(obj, pyspark.sql.DataFrame):
            row_count = obj.count()
            obj.write.parquet(path=path, mode="overwrite")
        else:
            raise Exception(f"Outputs of type {type(obj)} not supported.")
        yield MetadataEntry.int(value=row_count, label="row_count")
        yield MetadataEntry.path(path=path, label="path")
Example No. 26
            def handle_output(self, context, obj):
                keys = tuple(context.get_output_identifier())
                self.values[keys] = obj

                context.add_output_metadata({"foo": "bar"})
                yield MetadataEntry("baz", value="baz")
                context.add_output_metadata({"bar": "bar"})
                yield materialization  # `materialization` is assumed to be defined in the enclosing scope
Example No. 27
    def __init__(self, invalid_line_nos: List[int]):
        check.list_param(invalid_line_nos, "invalid_line_nos", int)
        line_nos_str = ", ".join(map(str, invalid_line_nos))
        description = f"dbt CLI emitted unexpected output on lines {line_nos_str}"
        metadata_entries = [
            MetadataEntry.json({"line_nos": invalid_line_nos}, "Invalid CLI Output Line Numbers")
        ]
        super().__init__(description, metadata_entries)
        self.invalid_line_nos = invalid_line_nos
Example No. 28
    def should_fail(_):
        raise Failure(
            description="Foolure",
            metadata_entries=[
                MetadataEntry.text(label="label", text="text", description="description")
            ],
        )
Example No. 29
def df_type_check(_, value):
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            # string cast columns since they may be things like datetime
            MetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )
Example No. 30
    def handle_output(self, context, obj):
        key = context.asset_key.path[-1]
        bucket = context.resource_config["bucket"]

        context.log.debug("about to pickle object")
        pickled_obj = pickle.dumps(obj)
        yield MetadataEntry.int(len(pickled_obj), "Bytes")
        client = s3_client()
        context.log.debug("created S3 client")
        client.put_object(Bucket=bucket, Key=key, Body=pickled_obj)