def div_2(_, x: Float) -> float:
    return x / 2


@solid
def concat(_, x: String, y: str) -> str:
    return x + y


@solid
def wait(_) -> Nothing:
    time.sleep(0.2)
    return


@solid(input_defs=[InputDefinition("ready", dagster_type=Nothing)])
def done(_) -> str:
    return "done"


@pipeline
def nothing_pipeline():
    done(wait())


@solid
def wait_int(_) -> Int:
    time.sleep(0.2)
    return 1
# pylint: disable=unused-argument

from dagster import InputDefinition, ModeDefinition, input_manager, pipeline, solid


def read_dataframe_from_table(**_kwargs):
    pass


# start_marker
@solid(input_defs=[InputDefinition("dataframe", manager_key="my_root_manager")])
def my_solid(_, dataframe):
    """Do some stuff"""


@input_manager
def table1_loader(_):
    return read_dataframe_from_table(name="table1")


@pipeline(mode_defs=[ModeDefinition(resource_defs={"my_root_manager": table1_loader})])
def my_pipeline():
    my_solid()


# end_marker
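# Hedged usage sketch (not part of the snippet above): because "dataframe" is
# sourced by the "my_root_manager" resource rather than by an upstream solid,
# the pipeline runs with no inputs section in the run config. The __main__
# guard is an assumption for local experimentation.
if __name__ == "__main__":
    from dagster import execute_pipeline

    result = execute_pipeline(my_pipeline)
    assert result.success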
    return {'solids': solids_config}


def create_sum_table():
    def transform(_context, inputs):
        num_csv = inputs['num_csv']
        check.inst_param(num_csv, 'num_csv', pd.DataFrame)
        num_csv['sum'] = num_csv['num1'] + num_csv['num2']
        return num_csv

    return _dataframe_solid(
        name='sum_table',
        input_defs=[InputDefinition('num_csv', DataFrame)],
        compute_fn=transform,
    )


@lambda_solid(
    input_defs=[InputDefinition('num_csv', DataFrame)],
    output_def=OutputDefinition(DataFrame),
)
def sum_table(num_csv):
    check.inst_param(num_csv, 'num_csv', pd.DataFrame)
    num_csv['sum'] = num_csv['num1'] + num_csv['num2']
    return num_csv


@lambda_solid(
    input_defs=[InputDefinition('sum_df', DataFrame)],
    output_def=OutputDefinition(DataFrame),
)
def sum_sq_table(sum_df):
    sum_df['sum_squared'] = sum_df['sum'] * sum_df['sum']
    return sum_df


@lambda_solid(
def create_sum_table():
    def transform(_context, inputs):
        num_csv = inputs['num_csv']
        check.inst_param(num_csv, 'num_csv', pd.DataFrame)
        num_csv['sum'] = num_csv['num1'] + num_csv['num2']
        return num_csv

    return _dataframe_solid(
        name='sum_table',
        inputs=[InputDefinition('num_csv', dagster_pd.DataFrame)],
        transform_fn=transform,
    )


@lambda_solid(
    inputs=[InputDefinition('num_csv', dagster_pd.DataFrame)],
    output=OutputDefinition(dagster_pd.DataFrame),
)
def sum_table(num_csv):
    check.inst_param(num_csv, 'num_csv', pd.DataFrame)
    num_csv['sum'] = num_csv['num1'] + num_csv['num2']
    return num_csv


@lambda_solid(
    inputs=[InputDefinition('sum_df', dagster_pd.DataFrame)],
    output=OutputDefinition(dagster_pd.DataFrame),
)
def sum_sq_table(sum_df):
    sum_df['sum_squared'] = sum_df['sum'] * sum_df['sum']
    return sum_df
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(handle, crashy_pipeline, pipeline_run, raise_on_error=False)
    execution_manager.join()
    assert pipeline_run.status == PipelineRunStatus.FAILURE
    last_log = pipeline_run.all_logs()[-1]
    print(last_log.message)
    assert last_log.message.startswith(
        'Exception: Pipeline execution process for {run_id} unexpectedly exited\n'.format(
            run_id=run_id
        )
    )


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x['sum'] = x['num1'] + x['num2']
    return PoorMansDataFrame(sum_df)


@lambda_solid(
    input_defs=[InputDefinition('sum_df', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def error_solid(sum_df):  # pylint: disable=W0613
    raise Exception('foo')
def LR_solid():
    return dagstermill.define_dagstermill_solid(
        'linear_regression',
        nb_test_path('tutorial_LR'),
        input_defs=[InputDefinition(name='df', dagster_type=DataFrame)],
    )
def test_pipeline():
    return simple()


@pipeline(mode_defs=celery_mode_defs)
def test_serial_pipeline():
    return add_one(simple())


@solid(output_defs=[OutputDefinition(name='value_one'), OutputDefinition(name='value_two')])
def emit_values(_context):
    yield Output(1, 'value_one')
    yield Output(2, 'value_two')


@lambda_solid(input_defs=[InputDefinition('num_one'), InputDefinition('num_two')])
def subtract(num_one, num_two):
    return num_one - num_two


@pipeline(mode_defs=celery_mode_defs)
def test_diamond_pipeline():
    value_one, value_two = emit_values()
    return subtract(
        num_one=add_one(num=value_one), num_two=add_one.alias('renamed')(num=value_two)
    )


@pipeline(mode_defs=celery_mode_defs)
def test_parallel_pipeline():
    value = simple()
    for i in range(10):
        add_one.alias('add_one_' + str(i))(value)
    Float,
    InputDefinition,
    Int,
    Output,
    RunConfig,
    String,
    composite_solid,
    execute_pipeline,
    lambda_solid,
    pipeline,
    solid,
)


# have to use "pipe" solid since "result_for_solid" doesn't work with composite mappings
@lambda_solid(input_defs=[InputDefinition('input_str')])
def pipe(input_str):
    return input_str


@solid(config_field=Field(String, is_optional=True))
def scalar_config_solid(context):
    yield Output(context.solid_config)


@composite_solid(
    config={'override_str': Field(String)},
    config_fn=lambda _, cfg: {'scalar_config_solid': {'config': cfg['override_str']}},
def test_wrap_all_config_one_input():
    @solid(
        config={'config_field_a': Field(String), 'config_field_b': Field(String)},
        input_defs=[InputDefinition('input_a', String), InputDefinition('input_b', String)],
    )
    def basic(context, input_a, input_b):
        res = '.'.join(
            [
                context.solid_config['config_field_a'],
                context.solid_config['config_field_b'],
                input_a,
                input_b,
            ]
        )
        yield Output(res)

    @composite_solid(
        input_defs=[InputDefinition('input_a', String)],
        config_fn=lambda _, cfg: {
            'basic': {
                'config': {
                    'config_field_a': cfg['config_field_a'],
                    'config_field_b': cfg['config_field_b'],
                },
                'inputs': {'input_b': {'value': 'set_input_b'}},
            }
        },
        config={'config_field_a': Field(String), 'config_field_b': Field(String)},
    )
    def wrap_all_config_one_input(input_a):
        return basic(input_a)

    @pipeline(name='config_mapping')
    def config_mapping_pipeline():
        return pipe(wrap_all_config_one_input())

    result = execute_pipeline(
        config_mapping_pipeline,
        {
            'solids': {
                'wrap_all_config_one_input': {
                    'config': {'config_field_a': 'override_a', 'config_field_b': 'override_b'},
                    'inputs': {'input_a': {'value': 'set_input_a'}},
                }
            }
        },
    )
    assert result.success
    assert (
        result.result_for_solid('pipe').output_value()
        == 'override_a.override_b.set_input_a.set_input_b'
    )

    with pytest.raises(DagsterInvalidConfigError) as exc_info:
        result = execute_pipeline(
            config_mapping_pipeline,
            {
                'solids': {
                    'wrap_all_config_one_input': {
                        'config': {'config_field_a': 1234, 'config_field_b': 'override_b'},
                        'inputs': {'input_a': {'value': 'set_input_a'}},
                    }
                }
            },
        )
    assert len(exc_info.value.errors) == 1
    assert exc_info.value.errors[0].message == (
        'Value at path root:solids:wrap_all_config_one_input:config:config_field_a is not valid. '
        'Expected "String"'
    )

    with pytest.raises(DagsterInvalidConfigError) as exc_info:
        result = execute_pipeline(
            config_mapping_pipeline,
            {
                'solids': {
                    'wrap_all_config_one_input': {
                        'config': {'config_field_a': 'override_a', 'config_field_b': 'override_b'},
                        'inputs': {'input_a': {'value': 1234}},
                    }
                }
            },
        )
    assert len(exc_info.value.errors) == 1
    assert exc_info.value.errors[0].message == (
        'Value at path root:solids:wrap_all_config_one_input:inputs:input_a:value is not valid. '
        'Expected "String"'
    )
def test_pd_df_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    test_df = pd.DataFrame({"num1": [1, 3], "num2": [2, 4]})

    create_op = bq_create_dataset.alias("create_op")
    load_op = import_df_to_bq.alias("load_op")
    query_op = bq_op_for_queries(["SELECT num1, num2 FROM %s" % table]).alias("query_op")
    delete_op = bq_delete_dataset.alias("delete_op")

    @op(input_defs=[InputDefinition("success", Nothing)], output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    @job(resource_defs={"bigquery": bigquery_resource})
    def bq_circle_of_life():
        delete_op(query_op(load_op(return_df(create_op()))))

    result = bq_circle_of_life.execute_in_process(
        run_config={
            "ops": {
                "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
                "load_op": {"config": {"destination": table}},
                "delete_op": {"config": {"dataset": dataset, "delete_contents": True}},
            }
        }
    )
    assert result.success

    values = result.output_for_node("query_op")
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {"pyarrow": None, "fastparquet": None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            bq_circle_of_life.execute_in_process(
                run_config={
                    "ops": {
                        "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
                        "load_op": {"config": {"destination": table}},
                        "delete_op": {"config": {"dataset": dataset, "delete_contents": True}},
                    }
                }
            )
        assert (
            "loading data to BigQuery from pandas DataFrames requires either pyarrow or"
            " fastparquet to be installed" in str(exc_info.value.user_exception)
        )

    @job(resource_defs={"bigquery": bigquery_resource})
    def cleanup_bq():
        delete_op()

    result = cleanup_bq.execute_in_process(
        run_config={
            "ops": {"delete_op": {"config": {"dataset": dataset, "delete_contents": True}}}
        }
    )
    assert result.success

    assert not dataset_exists(dataset)
def test_gcs_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    create_op = bq_create_dataset.alias("create_op")
    query_op = bq_op_for_queries(
        [
            "SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1"
            % table
        ]
    ).alias("query_op")
    delete_op = bq_delete_dataset.alias("delete_op")

    @op(
        input_defs=[InputDefinition("success", Nothing)],
        output_defs=[OutputDefinition(List[str])],
    )
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        return ["gs://cloud-samples-data/bigquery/us-states/us-states.csv"]

    @job(resource_defs={"bigquery": bigquery_resource})
    def bq_from_gcs():
        delete_op(query_op(import_gcs_paths_to_bq(return_gcs_uri(create_op()))))

    result = bq_from_gcs.execute_in_process(
        run_config={
            "ops": {
                "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
                "import_gcs_paths_to_bq": {
                    "config": {
                        "destination": table,
                        "load_job_config": {
                            "autodetect": True,
                            "skip_leading_rows": 1,
                            "source_format": "CSV",
                            "write_disposition": "WRITE_TRUNCATE",
                        },
                    }
                },
                "delete_op": {"config": {"dataset": dataset, "delete_contents": True}},
            }
        }
    )
    assert result.success

    values = result.output_for_node("query_op")
    assert values[0].to_dict() == {"string_field_0": {0: "Alabama"}, "string_field_1": {0: "AL"}}

    assert not dataset_exists(dataset)
def create_databricks_job_solid(
    name="databricks_job",
    num_inputs=1,
    description=None,
    required_resource_keys=frozenset(["databricks_client"]),
):
    """Creates a solid that launches a Databricks job.

    As config, the solid accepts a blob of the form described in Databricks' job API:
    https://docs.databricks.com/dev-tools/api/latest/jobs.html.

    Returns:
        SolidDefinition: A solid definition.
    """
    check.str_param(name, "name")
    check.opt_str_param(description, "description")
    check.int_param(num_inputs, "num_inputs")
    check.set_param(required_resource_keys, "required_resource_keys", of_type=str)

    input_defs = [InputDefinition("input_" + str(i), Nothing) for i in range(num_inputs)]

    @solid(
        name=name,
        description=description,
        config_schema={
            "job": Field(
                Permissive(),
                description="Databricks job run configuration, in the form described in "
                "Databricks' job API: https://docs.databricks.com/dev-tools/api/latest/jobs.html",
            ),
            "poll_interval_sec": Field(
                float,
                description="Check whether the job is done at this interval.",
                default_value=10,
            ),
            "max_wait_time_sec": Field(
                float,
                description="If the job is not complete after this length of time, raise an error.",
                default_value=(24 * 60 * 60),
            ),
        },
        input_defs=input_defs,
        output_defs=[OutputDefinition(Nothing)],
        required_resource_keys=required_resource_keys,
        tags={"kind": "databricks"},
    )
    def databricks_solid(context):
        job_config = context.solid_config["job"]
        databricks_client = context.resources.databricks_client
        run_id = databricks_client.submit_run(**job_config)

        context.log.info(
            "Launched Databricks job with run id {run_id}. UI: {url}. Waiting for run to "
            "complete...".format(
                run_id=run_id, url=create_ui_url(databricks_client, context.solid_config)
            )
        )
        wait_for_run_to_complete(
            databricks_client,
            context.log,
            run_id,
            context.solid_config["poll_interval_sec"],
            context.solid_config["max_wait_time_sec"],
        )

    return databricks_solid
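# Hedged usage sketch (an assumption, not part of this module): wiring the
# factory's output into a pipeline. `databricks_client` below refers to the
# resource shipped with dagster-databricks; the job blob and polling settings
# are supplied through run config under the solid's name.
#
# from dagster import ModeDefinition, pipeline
# from dagster_databricks import databricks_client
#
# run_databricks_job = create_databricks_job_solid(name="run_databricks_job", num_inputs=0)
#
# @pipeline(mode_defs=[ModeDefinition(resource_defs={"databricks_client": databricks_client})])
# def example_databricks_pipeline():
#     run_databricks_job()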
def define_test_all_scalars_pipeline():
    @lambda_solid(input_defs=[InputDefinition('num', Int)])
    def take_int(num):
        return num

    @lambda_solid(output_def=OutputDefinition(Int))
    def produce_int():
        return 2

    @lambda_solid(input_defs=[InputDefinition('string', String)])
    def take_string(string):
        return string

    @lambda_solid(output_def=OutputDefinition(String))
    def produce_string():
        return 'foo'

    @lambda_solid(input_defs=[InputDefinition('path', Path)])
    def take_path(path):
        return path

    @lambda_solid(output_def=OutputDefinition(Path))
    def produce_path():
        return '/path/to/foo'

    @lambda_solid(input_defs=[InputDefinition('float_number', Float)])
    def take_float(float_number):
        return float_number

    @lambda_solid(output_def=OutputDefinition(Float))
    def produce_float():
        return 3.14

    @lambda_solid(input_defs=[InputDefinition('bool_value', Bool)])
    def take_bool(bool_value):
        return bool_value

    @lambda_solid(output_def=OutputDefinition(Bool))
    def produce_bool():
        return True

    @lambda_solid(input_defs=[InputDefinition('any_value', Any)])
    def take_any(any_value):
        return any_value

    @lambda_solid(output_def=OutputDefinition(Any))
    def produce_any():
        return True

    @lambda_solid(input_defs=[InputDefinition('string_list', List[String])])
    def take_string_list(string_list):
        return string_list

    @lambda_solid(input_defs=[InputDefinition('nullable_string', Optional[String])])
    def take_nullable_string(nullable_string):
        return nullable_string

    return PipelineDefinition(
        name='test_all_scalars_pipeline',
        solid_defs=[
            produce_any,
            produce_bool,
            produce_float,
            produce_int,
            produce_path,
            produce_string,
            take_any,
            take_bool,
            take_float,
            take_int,
            take_nullable_string,
            take_path,
            take_string,
            take_string_list,
        ],
    )
    lambda_solid,
    solid,
    pipeline,
)


def builder(graph):
    return graph.add_one(graph.return_one())


@lambda_solid
def return_one():
    return 1


@lambda_solid(input_defs=[InputDefinition('num')])
def add_one(num):
    return num + 1


def test_basic_use_case():
    pipeline_def = PipelineDefinition(
        name='basic',
        solid_defs=[return_one, add_one],
        dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
    )

    assert execute_pipeline(pipeline_def).result_for_solid('add_one').output_value() == 2


def test_basic_use_case_with_dsl():
def test_solid_def():
    @lambda_solid
    def produce_string():
        return 'foo'

    @solid(
        inputs=[InputDefinition('input_one', types.String)],
        outputs=[OutputDefinition(types.Any)],
        config_field=Field(Dict({'another_field': Field(types.Int)})),
    )
    def solid_one(_context, input_one):
        raise Exception('should not execute')

    pipeline_def = PipelineDefinition(
        solids=[produce_string, solid_one],
        dependencies={'solid_one': {'input_one': DependencyDefinition('produce_string')}},
    )

    assert len(pipeline_def.solids[0].output_handles()) == 1

    assert isinstance(pipeline_def.solid_named('solid_one'), Solid)

    solid_one_solid = pipeline_def.solid_named('solid_one')

    assert solid_one_solid.has_input('input_one')
    assert isinstance(solid_one_solid.input_def_named('input_one'), InputDefinition)

    assert len(solid_one_solid.input_defs) == 1
    assert len(solid_one_solid.output_defs) == 1

    assert str(solid_one_solid.input_handle('input_one')) == (
        'SolidInputHandle(definition_name="\'solid_one\'", input_name="\'input_one\'", '
        'solid_name="\'solid_one\'")'
    )
    assert repr(solid_one_solid.input_handle('input_one')) == (
        'SolidInputHandle(definition_name="\'solid_one\'", input_name="\'input_one\'", '
        'solid_name="\'solid_one\'")'
    )
    assert str(solid_one_solid.output_handle('result')) == (
        'SolidOutputHandle(definition_name="\'solid_one\'", output_name="\'result\'", '
        'solid_name="\'solid_one\'")'
    )
    assert repr(solid_one_solid.output_handle('result')) == (
        'SolidOutputHandle(definition_name="\'solid_one\'", output_name="\'result\'", '
        'solid_name="\'solid_one\'")'
    )

    assert solid_one_solid.output_handle('result') == SolidOutputHandle(
        solid_one_solid, solid_one_solid.output_defs[0]
    )

    assert len(pipeline_def.dependency_structure.deps_of_solid_with_input('solid_one')) == 1
    assert len(pipeline_def.dependency_structure.depended_by_of_solid('produce_string')) == 1
    assert len(pipeline_def.dependency_structure.input_handles()) == 1
    assert len(pipeline_def.dependency_structure.items()) == 1
def test_wrap_all_config_and_inputs():
    @solid(
        config={'config_field_a': Field(String), 'config_field_b': Field(String)},
        input_defs=[InputDefinition('input_a', String), InputDefinition('input_b', String)],
    )
    def basic(context, input_a, input_b):
        res = '.'.join(
            [
                context.solid_config['config_field_a'],
                context.solid_config['config_field_b'],
                input_a,
                input_b,
            ]
        )
        yield Output(res)

    @composite_solid(
        config_fn=lambda _, cfg: {
            'basic': {
                'config': {
                    'config_field_a': cfg['config_field_a'],
                    'config_field_b': cfg['config_field_b'],
                },
                'inputs': {
                    'input_a': {'value': 'override_input_a'},
                    'input_b': {'value': 'override_input_b'},
                },
            }
        },
        config={'config_field_a': Field(String), 'config_field_b': Field(String)},
    )
    def wrap_all():
        return basic()

    @pipeline(name='config_mapping')
    def config_mapping_pipeline():
        return pipe(wrap_all())

    result = execute_pipeline(
        config_mapping_pipeline,
        {
            'solids': {
                'wrap_all': {
                    'config': {'config_field_a': 'override_a', 'config_field_b': 'override_b'}
                }
            }
        },
    )

    assert result.success
    assert (
        result.result_for_solid('pipe').output_value()
        == 'override_a.override_b.override_input_a.override_input_b'
    )

    with pytest.raises(DagsterInvalidConfigError) as exc_info:
        result = execute_pipeline(
            config_mapping_pipeline,
            {
                'solids': {
                    'wrap_all': {
                        'config': {
                            'config_field_a': 'override_a',
                            'this_key_doesnt_exist': 'override_b',
                        }
                    }
                }
            },
        )

    assert len(exc_info.value.errors) == 2
    assert exc_info.value.errors[0].message == (
        'Field "this_key_doesnt_exist" is not defined at path root:solids:wrap_all:config '
        'Expected: "{ config_field_a: String config_field_b: String }"'
    )
    assert (
        exc_info.value.errors[1].message
        == 'Missing required field "config_field_b" at path root:solids:wrap_all:config '
        'Available Fields: "[\'config_field_a\', \'config_field_b\']".'
    )
def test_input_manager_with_retries():
    _count = {"total": 0}

    @root_input_manager
    def should_succeed_after_retries(_):
        if _count["total"] < 2:
            _count["total"] += 1
            raise RetryRequested(max_retries=3)
        return "foo"

    @root_input_manager
    def should_retry(_):
        raise RetryRequested(max_retries=3)

    @solid(
        input_defs=[
            InputDefinition("solid_input", root_manager_key="should_succeed_after_retries")
        ]
    )
    def take_input_1(_, solid_input):
        return solid_input

    @solid(input_defs=[InputDefinition("solid_input", root_manager_key="should_retry")])
    def take_input_2(_, solid_input):
        return solid_input

    @solid
    def take_input_3(_, _input1, _input2):
        assert False, "should not be called"

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "should_succeed_after_retries": should_succeed_after_retries,
                    "should_retry": should_retry,
                }
            )
        ]
    )
    def simple():
        take_input_3(take_input_2(), take_input_1())

    with tempfile.TemporaryDirectory() as tmpdir_path:
        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple, instance=instance, raise_on_error=False)

        step_stats = instance.get_run_step_stats(result.run_id)
        assert len(step_stats) == 2

        step_stats_1 = instance.get_run_step_stats(result.run_id, step_keys=["take_input_1"])
        assert len(step_stats_1) == 1
        step_stat_1 = step_stats_1[0]
        assert step_stat_1.status.value == "SUCCESS"
        assert step_stat_1.attempts == 3

        step_stats_2 = instance.get_run_step_stats(result.run_id, step_keys=["take_input_2"])
        assert len(step_stats_2) == 1
        step_stat_2 = step_stats_2[0]
        assert step_stat_2.status.value == "FAILURE"
        assert step_stat_2.attempts == 4

        step_stats_3 = instance.get_run_step_stats(result.run_id, step_keys=["take_input_3"])
        assert len(step_stats_3) == 0
    OutputDefinition,
    Path,
    PipelineDefinition,
    solid,
)
from dagster_pyspark import spark_session_resource, SparkRDD


def parseNeighbors(urls):
    """Parses a urls pair string into urls pair."""
    parts = re.split(r'\s+', urls)
    return parts[0], parts[1]


@solid(
    inputs=[InputDefinition('pagerank_data', Path)],
    outputs=[OutputDefinition(SparkRDD)],
)
def parse_pagerank_data_step_five(context, pagerank_data):
    lines = context.resources.spark.read.text(pagerank_data).rdd.map(lambda r: r[0])
    return lines.map(parseNeighbors)


@solid(
    inputs=[InputDefinition('urls', SparkRDD)],
    outputs=[OutputDefinition(SparkRDD)],
)
def compute_links_step_five(_context, urls):
    return urls.distinct().groupByKey().cache()


def computeContribs(urls, rank):
    """Calculates URL contributions to the rank of other URLs."""
def RF_solid():
    return dagstermill.define_dagstermill_solid(
        'random_forest_regression',
        nb_test_path('tutorial_RF'),
        input_defs=[InputDefinition(name='df', dagster_type=DataFrame)],
    )
def test_depends_on_adls2_resource_file_manager(storage_account, file_system):
    bar_bytes = "bar".encode()

    @solid(
        output_defs=[OutputDefinition(ADLS2FileHandle)],
        required_resource_keys={"file_manager"},
    )
    def emit_file(context):
        return context.resources.file_manager.write_data(bar_bytes)

    @solid(
        input_defs=[InputDefinition("file_handle", ADLS2FileHandle)],
        required_resource_keys={"file_manager"},
    )
    def accept_file(context, file_handle):
        local_path = context.resources.file_manager.copy_handle_to_local_temp(file_handle)
        assert isinstance(local_path, str)
        assert open(local_path, "rb").read() == bar_bytes

    adls2_fake_resource = FakeADLS2Resource(storage_account)
    adls2_fake_file_manager = ADLS2FileManager(
        adls2_client=adls2_fake_resource.adls2_client,
        file_system=file_system,
        prefix="some-prefix",
    )

    @pipeline(
        mode_defs=[
            ModeDefinition(
                intermediate_storage_defs=adls2_plus_default_intermediate_storage_defs,
                resource_defs={
                    "adls2": ResourceDefinition.hardcoded_resource(adls2_fake_resource),
                    "file_manager": ResourceDefinition.hardcoded_resource(adls2_fake_file_manager),
                },
            )
        ]
    )
    def adls2_file_manager_test():
        accept_file(emit_file())

    result = execute_pipeline(
        adls2_file_manager_test,
        run_config={
            "intermediate_storage": {"adls2": {"config": {"adls2_file_system": file_system}}}
        },
    )

    assert result.success

    keys_in_bucket = set(adls2_fake_resource.adls2_client.file_systems[file_system].keys())

    for step_key, output_name in [("emit_file", "result"), ("accept_file", "result")]:
        keys_in_bucket.remove(create_adls2_key(result.run_id, step_key, output_name))

    assert len(keys_in_bucket) == 1

    file_key = list(keys_in_bucket)[0]
    comps = file_key.split("/")

    assert "/".join(comps[:-1]) == "some-prefix"

    assert uuid.UUID(comps[-1])
    return DagsterGraphQLContext(
        instance=DagsterInstance.ephemeral(),
        locations=[InProcessRepositoryLocation(create_main_recon_repo())],
    )


def main_repo_location_name():
    return '<<in_process>>'


def main_repo_name():
    return 'test_repo'


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x['sum'] = int(x['num1']) + int(x['num2'])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition('sum_df', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)
def test_depends_on_adls2_resource_intermediates(storage_account, file_system):
    @solid(
        input_defs=[InputDefinition("num_one", Int), InputDefinition("num_two", Int)],
        output_defs=[OutputDefinition(Int)],
    )
    def add_numbers(_, num_one, num_two):
        return num_one + num_two

    adls2_fake_resource = FakeADLS2Resource(storage_account)

    @pipeline(
        mode_defs=[
            ModeDefinition(
                intermediate_storage_defs=adls2_plus_default_intermediate_storage_defs,
                resource_defs={
                    "adls2": ResourceDefinition.hardcoded_resource(adls2_fake_resource)
                },
            )
        ]
    )
    def adls2_internal_pipeline():
        return add_numbers()

    result = execute_pipeline(
        adls2_internal_pipeline,
        run_config={
            "solids": {
                "add_numbers": {"inputs": {"num_one": {"value": 2}, "num_two": {"value": 4}}}
            },
            "intermediate_storage": {"adls2": {"config": {"adls2_file_system": file_system}}},
        },
    )

    assert result.success

    assert result.result_for_solid("add_numbers").output_value() == 6

    assert file_system in adls2_fake_resource.adls2_client.file_systems

    keys = set()
    for step_key, output_name in [("add_numbers", "result")]:
        keys.add(create_adls2_key(result.run_id, step_key, output_name))

    assert set(adls2_fake_resource.adls2_client.file_systems[file_system].keys()) == keys
    for chunk in response.iter_content(chunk_size=chunk_size):
        if chunk:
            output_fp.write(chunk)


def _download_zipfile_from_url(url: str, target: str, chunk_size=8192) -> str:
    with requests.get(url, stream=True) as response, open(target, 'wb+') as output_fp:
        response.raise_for_status()
        _write_chunks_to_fp(response, output_fp, chunk_size)
    return target


@solid(
    input_defs=[InputDefinition('file_name', str), InputDefinition('base_url', str)],
    output_defs=[OutputDefinition(str)],
    config={'chunk_size': Field(int, is_required=False, default_value=8192)},
    required_resource_keys={'volume'},
)
def download_zipfile_from_url(context, file_name: str, base_url: str) -> str:
    # mount dirs onto volume
    target = os.path.join(context.resources.volume, file_name)
    if not os.path.exists(target):
        _download_zipfile_from_url(
            "/".join([base_url, file_name]),
            target,
            context.solid_config['chunk_size'],
        )
@lambda_solid
def return_one():
    return 1


@lambda_solid
def return_two():
    return 2


@lambda_solid
def return_three():
    return 3


@lambda_solid(input_defs=[InputDefinition("num")])
def add_one(num):
    return num + 1


def test_basic_use_case():
    pipeline_def = PipelineDefinition(
        name="basic",
        solid_defs=[return_one, add_one],
        dependencies={"add_one": {"num": DependencyDefinition("return_one")}},
    )

    assert execute_pipeline(pipeline_def).result_for_solid("add_one").output_value() == 2
from dagster import InputDefinition, List, OutputDefinition, pipeline, repository, solid


@solid(output_defs=[OutputDefinition(int)])
def return_one(_):
    return 1


@solid(input_defs=[InputDefinition("nums", List[int])], output_defs=[OutputDefinition(int)])
def sum_fan_in(_, nums):
    return sum(nums)


@pipeline
def fan_in_pipeline():
    fan_outs = []
    for i in range(0, 10):
        fan_outs.append(return_one.alias("return_one_{}".format(i))())
    sum_fan_in(fan_outs)


@repository
def fan_in_pipeline_repository():
    return [fan_in_pipeline]
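# Hedged usage note (an assumption for local runs, not part of the repository
# definition above): the ten aliased copies of return_one fan in to
# sum_fan_in, so executing the pipeline should yield 10.
if __name__ == "__main__":
    from dagster import execute_pipeline

    result = execute_pipeline(fan_in_pipeline)
    assert result.result_for_solid("sum_fan_in").output_value() == 10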
)
from dagster.utils import file_relative_path


@solid
def pandas_yielder(_):
    return read_csv("./basic.csv")


@solid(required_resource_keys={"pyspark"})
def pyspark_yielder(context):
    return (
        context.resources.pyspark.spark_session.read.format("csv")
        .options(header="true", inferSchema="true")
        .load("./basic.csv")
    )


@solid(input_defs=[InputDefinition(name="res")])
def reyielder(_context, res):
    yield Output((res["statistics"], res["results"]))


@pipeline(
    mode_defs=[ModeDefinition("basic", resource_defs={"ge_data_context": ge_data_context})],
)
def hello_world_pandas_pipeline():
    return reyielder(ge_validation_solid_factory("getest", "basic.warning")(pandas_yielder()))
def create_solid_with_deps(name, *solid_deps):
    inputs = [InputDefinition(solid_dep.name) for solid_dep in solid_deps]

    return SolidDefinition(
        name=name,
        inputs=inputs,
        compute_fn=_compute_fn,
        outputs=[OutputDefinition()],
    )
            working_directory=None,
            location_name=main_repo_location_name(),
        ),
    ) as workspace_process_context:
        yield workspace_process_context.create_request_context()


@contextmanager
def get_main_external_repo(instance):
    with get_main_workspace(instance) as workspace:
        location = workspace.get_repository_location(main_repo_location_name())
        yield location.get_repository(main_repo_name())


@lambda_solid(
    input_defs=[InputDefinition("num", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x["sum"] = int(x["num1"]) + int(x["num2"])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition("sum_df", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)
def return_one(context):
    return 1


# start_composite_solid_example_marker
@solid
def add_one(_, number: int):
    return number + 1


@solid
def multiply_by_three(_, number: int):
    return number * 3


@composite_solid(input_defs=[InputDefinition("number", int)])
def add_one_times_three_solid(number):
    return multiply_by_three(add_one(number))


# end_composite_solid_example_marker


# start_composite_solid_config_marker
@solid(config_schema={"n": int})
def add_n(context, number: int):
    return number + context.solid_config["n"]


@solid(config_schema={"m": int})
            )
        ],
        compute_fn=compute_fn,
        description=description,
        metadata={'kind': 'sql', 'sql': sql_statement},
    )


@solid(
    name='unzip_file',
    inputs=[
        InputDefinition('archive_file', Bytes, description='The archive to unzip'),
        InputDefinition('archive_member', String, description='The archive member to extract.'),
    ],
    description='Extracts an archive member from a zip archive.',
    outputs=[OutputDefinition(Bytes, description='The unzipped archive member.')],
)
def unzip_file(_context, archive_file, archive_member):
    with zipfile.ZipFile(archive_file) as zip_ref:
        return BytesIO(zip_ref.open(archive_member).read())