def materialization_and_expectation(_context):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            EventMetadataEntry.text("text is cool", "text"),
            EventMetadataEntry.url("https://bigty.pe/neato", "url"),
            EventMetadataEntry.fspath("/tmp/awesome", "path"),
            EventMetadataEntry.json({"is_dope": True}, "json"),
        ],
    )
    yield ExpectationResult(success=True, label="row_count", description="passed")
    yield ExpectationResult(True)
    yield Output(True)

def dbt_cli_test(context) -> DbtCliStatsResult:
    """This solid executes ``dbt test`` via the dbt CLI."""
    logs, raw_output, return_code = execute_dbt(
        context.solid_config["dbt_executable"],
        command=("test",),
        flags_dict=passthrough_flags_only(
            context.solid_config,
            ("data", "schema", "fail-fast", "threads", "models", "exclude"),
        ),
        log=context.log,
        warn_error=context.solid_config["warn-error"],
        ignore_handled_error=context.solid_config["ignore_handled_error"],
    )
    run_results = get_run_results(logs)

    yield AssetMaterialization(
        asset_key="dbt_cli_test-shell_output",  # TODO: Perhaps derive asset key from CLI flags?
        description="The output of a shell execution of `dbt test`.",
        metadata_entries=[
            EventMetadataEntry.float(
                label="return_code",
                value=float(return_code),
                description="The return code of a shell execution of `dbt test`.",
            ),
            EventMetadataEntry.json(
                label="run_results",
                data=run_results,
                description="The summarized results of a shell execution of `dbt test`.",
            ),
            EventMetadataEntry.text(
                label="raw_output",
                text=raw_output,
                description="The raw output of a shell execution of `dbt test`.",
            ),
        ],
    )
    yield Output(
        DbtCliStatsResult(
            logs=logs, raw_output=raw_output, return_code=return_code, **run_results
        )
    )

def _materialization_event_record(run_id, asset_key):
    return DagsterEventRecord(
        None,
        "",
        "debug",
        "",
        run_id,
        time.time() - 25,
        step_key="my_step_key",
        pipeline_name="my_pipeline",
        dagster_event=DagsterEvent(
            DagsterEventType.STEP_MATERIALIZATION.value,
            "my_pipeline",
            step_key="my_step_key",
            event_specific_data=StepMaterializationData(AssetMaterialization(asset_key=asset_key)),
        ),
    )

def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            EventMetadataEntry.text("text is cool", "text"),
            EventMetadataEntry.url("https://bigty.pe/neato", "url"),
            EventMetadataEntry.fspath("/tmp/awesome", "path"),
            EventMetadataEntry.json({"is_dope": True}, "json"),
            EventMetadataEntry.python_artifact(EventMetadataEntry, "python class"),
            EventMetadataEntry.python_artifact(file_relative_path, "python function"),
            EventMetadataEntry.float(1.2, "float"),
        ],
    )
    yield Output(None)

def dataframe_materializer(_context, config, pandas_df):
    check.inst_param(pandas_df, "pandas_df", pd.DataFrame)
    file_type, file_options = list(config.items())[0]

    if file_type == "csv":
        path = file_options["path"]
        pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        pandas_df.to_parquet(file_options["path"])
    elif file_type == "table":
        pandas_df.to_csv(file_options["path"], sep="\t", index=False)
    elif file_type == "pickle":
        pandas_df.to_pickle(file_options["path"])
    else:
        check.failed("Unsupported file_type {file_type}".format(file_type=file_type))

    return AssetMaterialization.file(file_options["path"])

def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config['Bucket']
    key = context.solid_config['Key']

    with context.file_manager.read(file_handle, 'rb') as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)

    s3_file_handle = S3FileHandle(bucket, key)

    yield AssetMaterialization(
        asset_key=s3_file_handle.s3_path,
        metadata_entries=[EventMetadataEntry.path(s3_file_handle.s3_path, label=last_key(key))],
    )

    yield Output(value=s3_file_handle, output_name='s3_file_handle')

def materialize(_):
    yield AssetMaterialization(
        asset_key='all_types',
        description='a materialization with all metadata types',
        metadata_entries=[
            EventMetadataEntry.text('text is cool', 'text'),
            EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
            EventMetadataEntry.fspath('/tmp/awesome', 'path'),
            EventMetadataEntry.json({'is_dope': True}, 'json'),
            EventMetadataEntry.python_artifact(EventMetadataEntry, 'python class'),
            EventMetadataEntry.python_artifact(file_relative_path, 'python function'),
            EventMetadataEntry.float(1.2, 'float'),
        ],
    )
    yield Output(None)

def dbt_cli_compile(context) -> DbtCliResult:
    """This solid executes ``dbt compile`` via the dbt CLI."""
    logs, raw_output, return_code = execute_dbt(
        context.solid_config["dbt_executable"],
        command=("compile",),
        flags_dict=passthrough_flags_only(
            context.solid_config,
            (
                "parse-only",
                "threads",
                "no-version-check",
                "models",
                "exclude",
                "selector",
                "state",
                "full-refresh",
            ),
        ),
        log=context.log,
        warn_error=context.solid_config["warn-error"],
        ignore_handled_error=context.solid_config["ignore_handled_error"],
    )

    yield AssetMaterialization(
        asset_key="dbt_cli_compile-shell_output",  # TODO: Perhaps derive asset key from CLI flags?
        description="The output of a shell execution of `dbt compile`.",
        metadata_entries=[
            EventMetadataEntry.float(
                label="return_code",
                value=float(return_code),
                description="The return code of a shell execution of `dbt compile`.",
            ),
            EventMetadataEntry.text(
                label="raw_output",
                text=raw_output,
                description="The raw output of a shell execution of `dbt compile`.",
            ),
        ],
    )
    yield Output(DbtCliResult(logs=logs, raw_output=raw_output, return_code=return_code))

def dataframe_materializer(_context, config, dask_df):
    check.inst_param(dask_df, "dask_df", dd.DataFrame)

    if "to" in config:
        to_specs = config["to"]  # https://github.com/dagster-io/dagster/issues/2872
    else:
        to_specs = {
            to_type: to_options
            for to_type, to_options in config.items()
            if to_type in DataFrameToTypes
        }
        for key in to_specs.keys():
            warnings.warn(
                "Specifying {key}: is deprecated. Use to:{key}: instead.".format(key=key)
            )

    for to_type, to_options in to_specs.items():

        if to_type not in DataFrameToTypes:
            check.failed("Unsupported to_type {to_type}".format(to_type=to_type))

        # Get the metadata entry for the to_type in order to know which method
        # to call and whether it uses path as the first argument. And, make
        # to_options mutable if we need to pop off a path argument.
        to_meta = DataFrameToTypes[to_type]
        to_options = dict(to_options)

        # Get the to function and prepare its arguments.
        to_function = to_meta["function"]
        to_path = to_options.pop("path") if to_meta.get("is_path_based", False) else None
        to_args = [to_path] if to_path else []
        to_kwargs = to_options

        # Get the Dask client from the dask resource, if available.
        client_context = (
            _context.resources.dask.client.as_current()
            if hasattr(_context.resources, "dask")
            else contextlib.suppress()
        )
        with client_context:
            to_function(dask_df, *to_args, **to_kwargs)

        if to_path:
            yield AssetMaterialization.file(to_path)

def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            EventMetadataEntry.text("text is cool", "text"),
            EventMetadataEntry.url("https://bigty.pe/neato", "url"),
            EventMetadataEntry.fspath("/tmp/awesome", "path"),
            EventMetadataEntry.json({"is_dope": True}, "json"),
            EventMetadataEntry.python_artifact(EventMetadataEntry, "python class"),
            EventMetadataEntry.python_artifact(file_relative_path, "python function"),
            EventMetadataEntry.float(1.2, "float"),
            EventMetadataEntry.int(1, "int"),
            EventMetadataEntry.float(float("nan"), "float NaN"),
            EventMetadataEntry.int(LONG_INT, "long int"),
            EventMetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            EventMetadataEntry.asset(AssetKey("my_asset"), "my asset"),
            EventMetadataEntry.table(
                label="table",
                records=[
                    TableRecord(foo=1, bar=2),
                    TableRecord(foo=3, bar=4),
                ],
            ),
            EventMetadataEntry.table_schema(
                label="table_schema",
                schema=TableSchema(
                    columns=[
                        TableColumn(
                            name="foo",
                            type="integer",
                            constraints=TableColumnConstraints(unique=True),
                        ),
                        TableColumn(name="bar", type="string"),
                    ],
                    constraints=TableConstraints(other=["some constraint"]),
                ),
            ),
        ],
    )
    yield Output(None)

def load_data_to_database_from_spark(context, data_frame: DataFrame):
    context.resources.db_info.load_table(data_frame, context.solid_config['table_name'])
    table_name = context.solid_config['table_name']
    yield AssetMaterialization(
        asset_key='table:{table_name}'.format(table_name=table_name),
        description=(
            'Persisted table {table_name} in database configured in the db_info resource.'
        ).format(table_name=table_name),
        metadata_entries=[
            EventMetadataEntry.text(label='Host', text=context.resources.db_info.host),
            EventMetadataEntry.text(label='Db', text=context.resources.db_info.db_name),
        ],
    )
    yield Output(value=table_name, output_name='table_name')

def many_table_materializations(_context):
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), 'r') as f:
        md_str = f.read()
        for table in raw_tables:
            yield AssetMaterialization(
                asset_key='table_info',
                metadata_entries=[
                    EventMetadataEntry.text(text=table, label='table_name'),
                    EventMetadataEntry.fspath(path='/path/to/{}'.format(table), label='table_path'),
                    EventMetadataEntry.json(data={'name': table}, label='table_data'),
                    EventMetadataEntry.url(
                        url='https://bigty.pe/{}'.format(table), label='table_name_big'
                    ),
                    EventMetadataEntry.md(md_str=md_str, label='table_blurb'),
                ],
            )

def dbt_cli_snapshot_freshness(context) -> Dict:
    """This solid executes ``dbt source snapshot-freshness`` via the dbt CLI."""
    cli_output = execute_cli(
        context.solid_config["dbt_executable"],
        command=("source", "snapshot-freshness"),
        flags_dict=passthrough_flags_only(context.solid_config, ("select", "output", "threads")),
        log=context.log,
        warn_error=context.solid_config["warn-error"],
        ignore_handled_error=context.solid_config["ignore_handled_error"],
    )

    yield AssetMaterialization(
        asset_key="dbt_source_snapshot-freshness_cli_output",
        description="Output from the CLI execution of `dbt source snapshot-freshness`.",
        metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],
    )

    yield Output(cli_output)

def dbt_cli_run_operation(context) -> Dict:
    """This solid executes ``dbt run-operation`` via the dbt CLI."""
    cli_output = execute_cli(
        context.solid_config["dbt_executable"],
        command=("run-operation", context.solid_config["macro"]),
        flags_dict=passthrough_flags_only(context.solid_config, ("args",)),
        log=context.log,
        warn_error=context.solid_config["warn-error"],
        ignore_handled_error=context.solid_config["ignore_handled_error"],
    )

    yield AssetMaterialization(
        asset_key="dbt_run_operation_cli_output",
        description="Output from the CLI execution of `dbt run-operation`.",
        metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],
    )

    yield Output(cli_output)

def many_table_materializations(_context):
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), "r") as f:
        md_str = f.read()
        for table in raw_tables:
            yield AssetMaterialization(
                asset_key="table_info",
                metadata_entries=[
                    EventMetadataEntry.text(text=table, label="table_name"),
                    EventMetadataEntry.fspath(path="/path/to/{}".format(table), label="table_path"),
                    EventMetadataEntry.json(data={"name": table}, label="table_data"),
                    EventMetadataEntry.url(
                        url="https://bigty.pe/{}".format(table), label="table_name_big"
                    ),
                    EventMetadataEntry.md(md_str=md_str, label="table_blurb"),
                ],
            )

def dataframe_materializer(_context, config, pandas_df):
    check.inst_param(pandas_df, 'pandas_df', pd.DataFrame)
    file_type, file_options = list(config.items())[0]

    if file_type == 'csv':
        path = file_options['path']
        pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        pandas_df.to_parquet(file_options['path'])
    elif file_type == 'table':
        pandas_df.to_csv(file_options['path'], sep='\t', index=False)
    else:
        check.failed('Unsupported file_type {file_type}'.format(file_type=file_type))

    return AssetMaterialization.file(file_options['path'])

def my_metadata_materialization_solid(context):
    df = read_df()
    remote_storage_path = persist_to_storage(df)
    yield AssetMaterialization(
        asset_key="my_dataset",
        description="Persisted result to storage",
        metadata={
            "text_metadata": "Text-based metadata for this event",
            "path": EventMetadata.path(remote_storage_path),
            "dashboard_url": EventMetadata.url("http://mycoolsite.com/url_for_my_data"),
            "size (bytes)": calculate_bytes(df),
        },
    )
    yield Output(remote_storage_path)

def load_data_to_database_from_spark(context, data_frame):
    context.resources.db_info.load_table(data_frame, context.solid_config["table_name"])
    table_name = context.solid_config["table_name"]
    yield AssetMaterialization(
        asset_key="table:{table_name}".format(table_name=table_name),
        description=(
            "Persisted table {table_name} in database configured in the db_info resource."
        ).format(table_name=table_name),
        metadata_entries=[
            EventMetadataEntry.text(label="Host", text=context.resources.db_info.host),
            EventMetadataEntry.text(label="Db", text=context.resources.db_info.db_name),
        ],
    )
    yield Output(value=table_name, output_name="table_name")

def test_access_partition_keys_from_context_only_one_asset_partitioned():
    upstream_partitions_def = StaticPartitionsDefinition(["a", "b", "c"])

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            if context.op_def.name == "upstream_asset":
                assert context.asset_partition_key == "b"
            elif context.op_def.name in ["downstream_asset", "double_downstream_asset"]:
                assert not context.has_asset_partitions
                with pytest.raises(Exception):  # TODO: better error message
                    assert context.asset_partition_key_range
            else:
                assert False

        def load_input(self, context):
            assert not context.has_asset_partitions

    @asset(partitions_def=upstream_partitions_def)
    def upstream_asset(context):
        assert context.output_asset_partition_key() == "b"

    @asset
    def downstream_asset(upstream_asset):
        assert upstream_asset is None

    @asset
    def double_downstream_asset(downstream_asset):
        assert downstream_asset is None

    my_job = build_assets_job(
        "my_job",
        assets=[upstream_asset, downstream_asset, double_downstream_asset],
        resource_defs={"io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())},
    )
    result = my_job.execute_in_process(partition_key="b")
    assert result.asset_materializations_for_node("upstream_asset") == [
        AssetMaterialization(asset_key=AssetKey(["upstream_asset"]), partition="b")
    ]

def observes_dataset_op(context):
    df = read_df()
    remote_storage_path = persist_to_storage(df)
    context.log_event(
        AssetObservation(
            asset_key="my_dataset",
            metadata={
                "text_metadata": "Text-based metadata for this event",
                "path": EventMetadata.path(remote_storage_path),
                "dashboard_url": EventMetadata.url("http://mycoolsite.com/url_for_my_data"),
                "size (bytes)": calculate_bytes(df),
            },
        )
    )
    context.log_event(AssetMaterialization(asset_key="my_dataset"))
    return remote_storage_path

def dbt_cli_snapshot(context) -> Dict:
    """This solid executes ``dbt snapshot`` via the dbt CLI."""
    cli_output = execute_cli(
        context.solid_config["dbt_executable"],
        command=("snapshot",),
        flags_dict=passthrough_flags_only(context.solid_config, ("threads", "models", "exclude")),
        log=context.log,
        warn_error=context.solid_config["warn-error"],
        ignore_handled_error=context.solid_config["ignore_handled_error"],
    )

    if context.solid_config["yield_materializations"]:
        yield AssetMaterialization(
            asset_key="dbt_snapshot_cli_output",
            description="Output from the CLI execution of `dbt snapshot`.",
            metadata_entries=[EventMetadataEntry.json(cli_output, label="CLI Output")],
        )

    yield Output(cli_output)

def materialize_gdelt_mining_asset(context, gdelt_mined_events_filename):
    # Extract which file we're materializing
    filename = gdelt_mined_events_filename.splitlines()[-1]

    # Fetch the CSV file from S3 and load it into a pandas dataframe
    s3 = boto3.resource('s3')
    obj = s3.Object('discursus-io', filename)
    df_gdelt_events = pd.read_csv(StringIO(obj.get()['Body'].read().decode('utf-8')), sep='\t')

    # Materialize asset
    yield AssetMaterialization(
        asset_key="gdelt_events",
        description="List of events mined on GDELT",
        metadata={
            "path": "s3://discursus-io/" + filename,
            "rows": df_gdelt_events.index.size,
        },
    )
    yield Output(df_gdelt_events)

def materialize(_):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            EventMetadataEntry.text("text is cool", "text"),
            EventMetadataEntry.url("https://bigty.pe/neato", "url"),
            EventMetadataEntry.fspath("/tmp/awesome", "path"),
            EventMetadataEntry.json({"is_dope": True}, "json"),
            EventMetadataEntry.python_artifact(EventMetadataEntry, "python class"),
            EventMetadataEntry.python_artifact(file_relative_path, "python function"),
            EventMetadataEntry.float(1.2, "float"),
            EventMetadataEntry.int(1, "int"),
            EventMetadataEntry.float(float("nan"), "float NaN"),
            EventMetadataEntry.int(LONG_INT, "long int"),
            EventMetadataEntry.pipeline_run("fake_run_id", "pipeline run"),
            EventMetadataEntry.asset(AssetKey("my_asset"), "my asset"),
        ],
    )
    yield Output(None)

def dataframe_materializer(_context, config, dask_df):
    check.inst_param(dask_df, "dask_df", dd.DataFrame)
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        dask_df.to_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        dask_df.to_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        dask_df.to_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        dask_df.to_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql":
        dask_df.to_sql(**file_options)
    else:
        check.failed("Unsupported file_type {file_type}".format(file_type=file_type))

    return AssetMaterialization.file(path)

def made_op(context):
    partition_date = datetime.strptime(context.op_config["partition"], DEFAULT_DATE_FORMAT)
    if data_size_fn:
        data_size = data_size_fn(partition_date)
        sleep_time = sleep_factor * data_size
        time.sleep(sleep_time)

    rand = random()
    if error_rate and rand < error_rate:
        raise IntentionalRandomFailure(f"random {rand} < error rate {error_rate}")

    if asset_key:
        metadata = {"Data size (bytes)": data_size} if data_size_fn else None
        yield AssetMaterialization(
            asset_key=asset_key,
            metadata=metadata,
            partition=context.op_config.get("partition"),
        )

def dataframe_materializer(_context, config, dask_df):
    check.inst_param(dask_df, 'dask_df', dd.DataFrame)
    file_type, file_options = list(config.items())[0]
    path = file_options.get('path')

    if file_type == 'csv':
        dask_df.to_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        dask_df.to_parquet(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'hdf':
        dask_df.to_hdf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'json':
        dask_df.to_json(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'sql':
        dask_df.to_sql(**file_options)
    else:
        check.failed('Unsupported file_type {file_type}'.format(file_type=file_type))

    return AssetMaterialization.file(path)

def compare_calories(context, cereals, least_hot, least_cold):
    cereals_df = pd.DataFrame(cereals)

    def get_calories(name):
        return cereals_df[cereals_df["name"] == name]["calories"].iloc[0]

    cereal_choice = (
        least_hot if get_calories(least_hot) > get_calories(least_cold) else least_cold
    )
    context.log.info(
        f"Compare the calories of hot and cold cereals: {cereal_choice} is healthier"
    )
    yield AssetMaterialization(
        asset_key="cereal_choice",
        description="Which cereal is healthiest",
        metadata_entries=[EventMetadataEntry.text(cereal_choice, "Cereal Choice")],
    )
    yield Output(cereal_choice)

def download_zipfile_from_url(context, file_name: str, base_url: str):
    url = "/".join([base_url, file_name])
    # mount dirs onto volume
    target = os.path.join(context.resources.volume, file_name)
    if not os.path.exists(target):
        _download_zipfile_from_url(
            url,
            target,
            context.solid_config["chunk_size"],
        )
    yield AssetMaterialization(
        asset_key=file_name,
        metadata_entries=[
            EventMetadataEntry.text(url, "zipfile url source"),
            EventMetadataEntry.text(target, "zipfile filepath"),
            EventMetadataEntry.text(str(os.path.getsize(target)), "size of zipfile (bytes)"),
        ],
    )
    yield Output(target)

def read_file(context):
    relative_filename = context.op_config["filename"]
    directory = context.op_config["directory"]
    filename = os.path.join(directory, relative_filename)
    try:
        fstats = os.stat(filename)
        context.log.info("Found file {}".format(relative_filename))
        yield AssetMaterialization(
            asset_key=AssetKey(["log_file", relative_filename]),
            metadata={
                "path": MetadataValue.path(filename),
                "File status": {
                    "size": fstats.st_size,
                    "ctime": fstats.st_ctime,
                    "mtime": fstats.st_mtime,
                },
            },
        )
        yield Output(relative_filename)
    except FileNotFoundError:
        context.log.error("No file found: {}".format(relative_filename))

def upload_pickled_object_to_gcs_bucket(context, value: Any, bucket_name: str, file_name: str):
    gcs_bucket = context.resources.gcs_client.get_bucket(bucket_name)
    key = "{}-{}".format(file_name, uuid.uuid4())
    with tempfile.TemporaryFile("w+b") as fp:
        pickle.dump(value, fp, PICKLE_PROTOCOL)
        # Done because you can't upload the contents of a file outside the context manager if it's a tempfile.
        fp.seek(0)
        gcs_bucket.blob(key).upload_from_file(fp)

    gcs_url = "gs://{bucket_name}/{key}".format(bucket_name=bucket_name, key=key)

    yield AssetMaterialization(
        asset_key=gcs_url,
        description="Serialized object to Google Cloud Storage Bucket",
        metadata_entries=[
            EventMetadataEntry.text(gcs_url, "google cloud storage URI"),
        ],
    )

    yield Output(value)