def test_spew_pipeline():
    """Smoke test: the log-spewing pipeline runs to successful completion."""
    assert execute_pipeline(log_spew).success
@solid
def load_cereals(context):
    """Read the cereal CSV (two directories above this file) into row dicts."""
    csv_path = os.path.join(os.path.dirname(__file__), '../../cereal.csv')
    with open(csv_path, 'r') as fd:
        cereals = [row for row in csv.DictReader(fd)]
    context.log.info(
        'Found {n_cereals} cereals'.format(n_cereals=len(cereals)))
    return cereals


@solid
def sort_by_calories(context, cereals):
    """Log the least- and most-caloric cereals among the input rows."""
    sorted_cereals = sorted(cereals, key=lambda cereal: cereal['calories'])
    context.log.info('Least caloric cereal: {least_caloric}'.format(
        least_caloric=sorted_cereals[0]['name']))
    context.log.info('Most caloric cereal: {most_caloric}'.format(
        most_caloric=sorted_cereals[-1]['name']))


@pipeline
def serial_pipeline():
    # Two-step serial dependency: load, then sort.
    sort_by_calories(load_cereals())


if __name__ == '__main__':
    result = execute_pipeline(serial_pipeline)
    assert result.success
def train_model(context, df):
    """Log the incoming dataframe, then train and return a model from it."""
    context.log.info("{}".format(df))
    model = train(df)
    return model


@pipeline(mode_defs=[
    ModeDefinition("test", resource_defs={"fs_asset_store": fs_asset_store}),
    ModeDefinition("local", resource_defs={"fs_asset_store": local_asset_store}),
])
def model_pipeline():
    train_model(parse_df(call_api()))


@repository
def builtin_default_repo():
    return [model_pipeline]


if __name__ == "__main__":
    instance = DagsterInstance.ephemeral()
    result = execute_pipeline(model_pipeline, mode="local", instance=instance)
    # Re-run everything from parse_df onward, reusing the parent run's outputs.
    result_a = reexecute_pipeline(
        model_pipeline,
        parent_run_id=result.run_id,
        mode="local",
        instance=instance,
        step_selection=["parse_df*"],
    )
def test_pipeline_wrapping_types():
    """Optional/List wrapping round-trips through solid config for several shapes."""
    @lambda_solid(
        input_defs=[InputDefinition('value', Optional[List[Optional[String]]])],
        output_def=OutputDefinition(Optional[List[Optional[String]]]),
    )
    def double_string_for_all(value):
        # None / empty list passes straight through; otherwise double each
        # non-None element and keep None entries in place.
        if not value:
            return value
        return [None if item is None else item + item for item in value]

    @pipeline
    def wrapping_test():
        double_string_for_all()

    # Same four input configurations as before, exercised in the same order.
    for value_config in (None, [], [{'value': 'foo'}], [{'value': 'bar'}, None]):
        assert execute_pipeline(
            wrapping_test,
            environment_dict={
                'solids': {
                    'double_string_for_all': {'inputs': {'value': value_config}}
                }
            },
        ).success
) @solid def sort_cold_cereals_by_calories(context, cereals): sorted_cereals = sorted(cereals, key=lambda cereal: cereal['calories']) context.log.info( 'Least caloric cold cereal: {least_caloric}'.format( least_caloric=sorted_cereals[0]['name'] ) ) @pipeline def multiple_outputs_pipeline(): hot_cereals, cold_cereals = split_cereals(read_csv()) sort_hot_cereals_by_calories(hot_cereals) sort_cold_cereals_by_calories(cold_cereals) if __name__ == '__main__': environment_dict = { 'solids': { 'read_csv': {'inputs': {'csv_path': {'value': 'cereal.csv'}}} } } result = execute_pipeline( multiple_outputs_pipeline, environment_dict=environment_dict ) assert result.success
def test_template_task_dag():
    """Airflow DAG with a templated BashOperator maps to Dagster steps whose
    captured stdout contains the rendered commands and their outputs."""
    dag = DAG(
        dag_id="dag",
        default_args=default_args,
        schedule_interval=None,
    )

    t1 = BashOperator(
        task_id="print_hello",
        bash_command="echo hello dagsir",
        dag=dag,
    )
    t2 = BashOperator(
        task_id="sleep",
        bash_command="sleep 2",
        dag=dag,
    )

    # NOTE(review): original whitespace of this template was mangled in the
    # source; reconstructed to agree with the rendered-command assertions below.
    templated_command = """
    {% for i in range(5) %}
        echo '{{ ds }}'
        echo '{{ macros.ds_add(ds, 7)}}'
        echo '{{ params.my_param }}'
    {% endfor %}
    """
    t3 = BashOperator(
        task_id="templated",
        depends_on_past=False,
        bash_command=templated_command,
        params={"my_param": "Parameter I passed in"},
        dag=dag,
    )

    # pylint: disable=pointless-statement
    t1 >> [t2, t3]

    with instance_for_test() as instance:
        manager = instance.compute_log_manager

        execution_date = get_current_datetime_in_utc()
        execution_date_add_one_week = execution_date + datetime.timedelta(days=7)
        execution_date_iso = execution_date.strftime("%Y-%m-%d")
        execution_date_add_one_week_iso = execution_date_add_one_week.strftime(
            "%Y-%m-%d")

        result = execute_pipeline(
            make_dagster_pipeline_from_airflow_dag(
                dag=dag, tags={AIRFLOW_EXECUTION_DATE_STR: execution_date_iso}),
            instance=instance,
        )

        compute_steps = [
            event.step_key
            for event in result.step_event_list
            if event.event_type == DagsterEventType.STEP_START
        ]
        assert compute_steps == [
            "airflow_print_hello",
            "airflow_sleep",
            "airflow_templated",
        ]

        for step_key in compute_steps:
            compute_io_path = manager.get_local_path(
                result.run_id, step_key, ComputeIOType.STDOUT)
            assert os.path.exists(compute_io_path)
            with open(compute_io_path, "r") as stdout_file:
                file_contents = normalize_file_content(stdout_file.read())

            if step_key == "airflow_print_hello":
                assert file_contents.count(
                    "INFO - Running command: echo hello dagsir\n") == 1
                assert file_contents.count(
                    "INFO - Command exited with return code 0") == 1
            elif step_key == "airflow_sleep":
                assert file_contents.count(
                    "INFO - Running command: sleep 2\n") == 1
                assert file_contents.count("INFO - Output:\n") == 1
                assert file_contents.count(
                    "INFO - Command exited with return code 0") == 1
            elif step_key == "airflow_templated":
                assert (file_contents.count(
                    "INFO - Running command: \n    \n        "
                    "echo '{execution_date_iso}'\n        "
                    "echo '{execution_date_add_one_week_iso}'\n        "
                    "echo 'Parameter I passed in'\n    \n        "
                    "echo '{execution_date_iso}'\n        "
                    "echo '{execution_date_add_one_week_iso}'\n        "
                    "echo 'Parameter I passed in'\n    \n        "
                    "echo '{execution_date_iso}'\n        "
                    "echo '{execution_date_add_one_week_iso}'\n        "
                    "echo 'Parameter I passed in'\n    \n        "
                    "echo '{execution_date_iso}'\n        "
                    "echo '{execution_date_add_one_week_iso}'\n        "
                    "echo 'Parameter I passed in'\n    \n        "
                    "echo '{execution_date_iso}'\n        "
                    "echo '{execution_date_add_one_week_iso}'\n        "
                    "echo 'Parameter I passed in'\n    \n    \n".format(
                        execution_date_iso=execution_date_iso,
                        execution_date_add_one_week_iso=
                        execution_date_add_one_week_iso,
                    )) == 1)
                assert (file_contents.count(
                    "INFO - {execution_date_iso}\n".format(
                        execution_date_iso=execution_date_iso)) == 5)
                assert (file_contents.count(
                    "INFO - {execution_date_add_one_week_iso}\n".format(
                        execution_date_add_one_week_iso=
                        execution_date_add_one_week_iso)) == 5)
                assert file_contents.count(
                    "INFO - Parameter I passed in\n") == 5
                assert file_contents.count(
                    "INFO - Command exited with return code 0") == 1
def test_create_pipeline_with_empty_solids_list():
    """A pipeline containing no solids still executes successfully."""
    @pipeline
    def empty_pipe():
        pass

    assert execute_pipeline(empty_pipe).success
def test_diamond_simple_execution():
    """The diamond pipeline succeeds and its adder solid emits 11."""
    result = execute_pipeline(define_diamond_pipeline())
    assert result.success
    assert result.result_for_solid("adder").output_value() == 11
context.log.info(struct_to_string(df)) model = train(df) return model @pipeline( mode_defs=[ ModeDefinition("test", resource_defs={ "default_fs_asset_store": default_filesystem_asset_store }), ModeDefinition( "local", resource_defs={"default_fs_asset_store": local_asset_store}), ], preset_defs=[ PresetDefinition("local", run_config={"storage": { "filesystem": {} }}, mode="local"), ], ) def model_pipeline(): train_model(parse_df(call_api())) if __name__ == "__main__": result = execute_pipeline(model_pipeline, preset="local")
]) def modes_pipeline(): normalize_calories(read_csv()) if __name__ == '__main__': environment_dict = { 'solids': { 'read_csv': { 'inputs': { 'csv_path': { 'value': 'cereal.csv' } } } }, 'resources': { 'warehouse': { 'config': { 'conn_str': ':memory:' } } }, } result = execute_pipeline( modes_pipeline, environment_dict=environment_dict, run_config=RunConfig(mode='unittest'), ) assert result.success
def test_error_pipeline():
    """A failing pipeline reports failure instead of raising when
    raise_on_error=False."""
    pipe = define_error_pipeline()
    result = execute_pipeline(pipe, raise_on_error=False)
    assert not result.success
def test_resource_pipeline_with_config():
    """Configuring resource R1 to 2 makes solid 'one' output 3."""
    run_config = {"resources": {"R1": {"config": 2}}}
    result = execute_pipeline(resource_pipeline, run_config=run_config)
    assert result.result_for_solid("one").output_value() == 3
def test_resource_pipeline_no_config():
    """Without resource config, solid 'one' outputs its default-based value 2."""
    result = execute_pipeline(resource_pipeline)
    assert result.result_for_solid("one").output_value() == 2
def test_hammer_pipeline():
    """Smoke test: the hammer pipeline runs to successful completion."""
    assert execute_pipeline(hammer_pipeline).success
@solid
def error_message(info):
    """Emit a single error-level log line."""
    info.log.error('An error occurred.')


def define_execution_context_pipeline_step_one():
    # Unnamed pipeline: Dagster assigns a default name.
    return PipelineDefinition(solids=[debug_message, error_message])


def define_execution_context_pipeline_step_two():
    # Same pipeline, now explicitly named.
    return PipelineDefinition(
        name='execution_context_pipeline',
        solids=[debug_message, error_message],
    )


def define_execution_context_pipeline_step_three():
    # Identical to step two; kept as a separate tutorial step.
    return PipelineDefinition(
        name='execution_context_pipeline',
        solids=[debug_message, error_message],
    )


if __name__ == '__main__':
    execute_pipeline(
        define_execution_context_pipeline_step_three(),
        {'context': {'default': {'config': {'log_level': 'DEBUG'}}}},
    )
def test_metadata():
    """Smoke test: my_pipeline executes without raising."""
    execute_pipeline(my_pipeline)
def test_works_in_memory():
    """The passing pipeline succeeds when fed the bundled CSV fixture."""
    num_csv = file_relative_path(__file__, 'data/num.csv')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': num_csv}}}
    }
    assert execute_pipeline(passing_pipeline, environment_dict).success
def test_hook_accumulation():
    """Hooks attached at pipeline, composite, and solid level all fire, each
    against exactly the steps in its own scope."""
    called_hook_to_step_keys = defaultdict(set)

    def _make_hook(name):
        # Factory for an event-list hook that records which step it ran on.
        @event_list_hook(name=name)
        def _hook(context, _):
            called_hook_to_step_keys[context.hook_def.name].add(context.step_key)
            return HookExecutionResult(name)

        return _hook

    pipeline_hook = _make_hook("pipeline_hook")
    solid_1_hook = _make_hook("solid_1_hook")
    composite_1_hook = _make_hook("composite_1_hook")

    @solid
    def solid_1(_):
        return 1

    @solid
    def solid_2(_, num):
        return num

    @solid
    def solid_3(_):
        return 1

    @composite_solid
    def composite_1():
        return solid_2(solid_1.with_hooks({solid_1_hook})())

    @composite_solid
    def composite_2():
        solid_3()
        return composite_1.with_hooks({composite_1_hook})()

    @pipeline_hook
    @pipeline
    def a_pipeline():
        composite_2()

    result = execute_pipeline(a_pipeline)
    assert result.success

    # Hooks are gathered from every level and invoked with the right steps.
    assert called_hook_to_step_keys == {
        "pipeline_hook": {
            "composite_2.composite_1.solid_1",
            "composite_2.composite_1.solid_2",
            "composite_2.solid_3",
        },
        "solid_1_hook": {"composite_2.composite_1.solid_1"},
        "composite_1_hook": {
            "composite_2.composite_1.solid_1",
            "composite_2.composite_1.solid_2",
        },
    }
def sort_by_calories(context, cereals: LessSimpleDataFrame):
    """Log the least- and most-caloric cereals among the input rows."""
    sorted_cereals = sorted(cereals, key=lambda cereal: cereal['calories'])
    context.log.info(
        'Least caloric cereal: {least_caloric}'.format(
            least_caloric=sorted_cereals[0]['name']
        )
    )
    context.log.info(
        'Most caloric cereal: {most_caloric}'.format(
            most_caloric=sorted_cereals[-1]['name']
        )
    )


@pipeline
def custom_type_pipeline():
    sort_by_calories()


if __name__ == '__main__':
    execute_pipeline(
        custom_type_pipeline,
        {
            'solids': {
                'sort_by_calories': {
                    'inputs': {'cereals': {'csv': 'cereal.csv'}}
                }
            }
        },
    )
if __name__ == "__main__":
    # Config addresses solids nested inside the load_cereals composite.
    run_config = {
        "solids": {
            "load_cereals": {
                "solids": {
                    "read_cereals": {
                        "inputs": {"csv_path": {"value": "cereal.csv"}}
                    },
                    "read_manufacturers": {
                        "config": {"delimiter": ";"},
                        "inputs": {"csv_path": {"value": "manufacturers.csv"}},
                    },
                }
            }
        }
    }
    result = execute_pipeline(composite_solids_pipeline, run_config=run_config)
    assert result.success
def test_empty_pipeline_execution():
    """An inline pipeline with no solid defs executes successfully."""
    result = execute_pipeline(PipelineDefinition(solid_defs=[]))
    assert result.success
@solid(required_resource_keys={'R2'})
def two(_):
    return 1


@solid(required_resource_keys={'R1', 'R2', 'R3'})
def one_and_two_and_three(_):
    return 1


@pipeline(mode_defs=[ModeDefinition(resource_defs=lots_of_resources)])
def resource_pipeline():
    # Four independent solids exercising overlapping resource requirements.
    all_resources()
    one()
    two()
    one_and_two_and_three()


if __name__ == '__main__':
    # Multiprocess execution requires a reconstructable pipeline and
    # filesystem storage.
    result = execute_pipeline(
        reconstructable(resource_pipeline),
        run_config={
            'storage': {'filesystem': {}},
            'execution': {'multiprocessing': {}},
        },
    )
@solid
def sort_by_calories(context, cereals: SimpleDataFrame):
    """Log the least- and most-caloric cereals among the input rows."""
    sorted_cereals = sorted(cereals, key=lambda cereal: cereal['calories'])
    context.log.info('Least caloric cereal: {least_caloric}'.format(
        least_caloric=sorted_cereals[0]['name']))
    context.log.info('Most caloric cereal: {most_caloric}'.format(
        most_caloric=sorted_cereals[-1]['name']))


@pipeline
def custom_type_pipeline():
    sort_by_calories(read_csv())


if __name__ == '__main__':
    environment_dict = {
        'solids': {
            'read_csv': {'inputs': {'csv_path': {'value': 'cereal.csv'}}}
        }
    }
    result = execute_pipeline(
        custom_type_pipeline, environment_dict=environment_dict)
    assert result.success
def execute_solid(
    solid_def,
    mode_def=None,
    input_values=None,
    environment_dict=None,
    run_config=None,
    raise_on_error=True,
):
    '''Execute a single solid in an ephemeral pipeline.

    Intended to support unit tests. Input values may be passed directly, and no pipeline need
    be specified -- an ephemeral pipeline will be constructed.

    Args:
        solid_def (SolidDefinition): The solid to execute.
        mode_def (Optional[ModeDefinition]): The mode within which to execute the solid. Use this
            if, e.g., custom resources, loggers, or executors are desired.
        input_values (Optional[Dict[str, Any]]): A dict of input names to input values, used to
            pass inputs to the solid directly. You may also use the ``environment_dict`` to
            configure any inputs that are configurable.
        environment_dict (Optional[dict]): The environment configuration that parameterizes this
            execution, as a dict.
        run_config (Optional[RunConfig]): Optionally specifies additional config options for
            pipeline execution.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in test.

    Returns:
        Union[CompositeSolidExecutionResult, SolidExecutionResult]: The result of executing the
        solid.
    '''
    check.inst_param(solid_def, 'solid_def', ISolidDefinition)
    check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
    input_values = check.opt_dict_param(input_values, 'input_values', key_type=str)

    solid_defs = [solid_def]

    def create_value_solid(input_name, input_value):
        # Wrap a literal input value in a solid so it can feed the target
        # solid via a normal dependency edge.
        @lambda_solid(name=input_name)
        def input_solid():
            return input_value

        return input_solid

    dependencies = defaultdict(dict)

    for input_name, input_value in input_values.items():
        dependencies[solid_def.name][input_name] = DependencyDefinition(input_name)
        solid_defs.append(create_value_solid(input_name, input_value))

    result = execute_pipeline(
        PipelineDefinition(
            name='ephemeral_{}_solid_pipeline'.format(solid_def.name),
            solid_defs=solid_defs,
            dependencies=dependencies,
            mode_defs=[mode_def] if mode_def else None,
        ),
        environment_dict=environment_dict,
        run_config=run_config,
        raise_on_error=raise_on_error,
    )
    return result.result_for_handle(solid_def.name)
most_caloric=sorted_cereals[-1]["name"])) @pipeline def custom_type_pipeline(): sort_by_calories() if __name__ == "__main__": execute_pipeline( custom_type_pipeline, { "solids": { "sort_by_calories": { "inputs": { "cereals": { "csv": "cereal.csv" } } } } }, ) # start_custom_types_test_marker_0 def test_less_simple_data_frame(): assert check_dagster_type(LessSimpleDataFrame, [{ "foo": 1 }, { "foo": 2 }]).success
def test_config_with_and_without_config():
    """A composite's config_fn maps its prefix to the inner solid, both when
    the prefix is given explicitly and when the default applies."""
    @solid(config={'prefix': Field(str, is_optional=True, default_value='_')})
    def prefix_value(context, v):
        return '{prefix}{v}'.format(prefix=context.solid_config["prefix"], v=v)

    @composite_solid(
        config_fn=lambda _, cfg: {
            'prefix_value': {'config': {'prefix': cfg['prefix']}}
        },
        config={'prefix': Field(str, is_optional=True, default_value='_id_')},
    )
    def prefix_id(val):
        return prefix_value(val)

    @solid
    def print_value(_, v):
        return str(v)

    @pipeline
    def config_issue_pipeline():
        v = prefix_id()
        print_value(v)

    result = execute_pipeline(
        config_issue_pipeline,
        {
            'solids': {
                'prefix_id': {
                    'config': {'prefix': '_customprefix_'},
                    'inputs': {'val': {'value': "12345"}},
                }
            }
        },
    )

    result_using_default = execute_pipeline(
        config_issue_pipeline,
        {
            'solids': {
                'prefix_id': {
                    'config': {},
                    'inputs': {'val': {'value': "12345"}},
                }
            }
        },
    )

    assert result.success
    assert result.result_for_solid(
        'print_value').output_value() == '_customprefix_12345'

    assert result_using_default.success
    assert result_using_default.result_for_solid(
        'print_value').output_value() == '_id_12345'
def _execute_pipeline_with_subset(pipeline, environment_dict, solid_subset):
    """Execute only the given solid subset of *pipeline*."""
    sub_pipeline = pipeline.build_sub_pipeline(solid_subset)
    return execute_pipeline(sub_pipeline, environment_dict=environment_dict)
def test_datadog_resource(
    event,
    gauge,
    increment,
    decrement,
    histogram,
    distribution,
    statsd_set,
    service_check,
    timed,
    timing,
):
    """Every datadog resource method forwards its arguments to the mocked
    datadog client."""
    @solid
    def datadog_solid(context):
        assert context.resources.datadog

        dd = context.resources.datadog

        # event
        dd.event('Man down!', 'This server needs assistance.')
        event.assert_called_with('Man down!', 'This server needs assistance.')

        # gauge
        dd.gauge('users.online', 1001, tags=["protocol:http"])
        gauge.assert_called_with('users.online', 1001, tags=["protocol:http"])

        # increment
        dd.increment('page.views')
        increment.assert_called_with('page.views')

        # decrement
        dd.decrement('page.views')
        decrement.assert_called_with('page.views')

        dd.histogram('album.photo.count', 26, tags=["gender:female"])
        histogram.assert_called_with(
            'album.photo.count', 26, tags=["gender:female"])

        dd.distribution('album.photo.count', 26, tags=["color:blue"])
        distribution.assert_called_with(
            'album.photo.count', 26, tags=["color:blue"])

        dd.set('visitors.uniques', 999, tags=["browser:ie"])
        statsd_set.assert_called_with(
            'visitors.uniques', 999, tags=["browser:ie"])

        dd.service_check('svc.check_name', dd.WARNING)
        service_check.assert_called_with('svc.check_name', dd.WARNING)

        dd.timing("query.response.time", 1234)
        timing.assert_called_with("query.response.time", 1234)

        @dd.timed('run_fn')
        def run_fn():
            pass

        run_fn()
        timed.assert_called_with('run_fn')

    pipeline = PipelineDefinition(
        name='test_datadog_resource',
        solid_defs=[datadog_solid],
        mode_defs=[ModeDefinition(resource_defs={'datadog': datadog_resource})],
    )

    result = execute_pipeline(
        pipeline,
        {
            'resources': {
                'datadog': {
                    'config': {'api_key': 'NOT_USED', 'app_key': 'NOT_USED'}
                }
            }
        },
    )
    assert result.success
def test_pipeline_step_key_subset_execution():
    """Re-executing only add_two reuses add_one's intermediate, and an
    unknown step selection raises DagsterInvalidSubsetError."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})

    result = execute_pipeline(
        pipeline_def, run_config=run_config, instance=instance)
    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    ## re-execute add_two
    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two.compute"],
        instance=instance,
    )
    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    # Parent-run intermediates are unchanged.
    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    # Only add_two actually re-ran.
    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")

    with pytest.raises(
            DagsterInvalidSubsetError,
            match="No qualified steps to execute found for step_selection"):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope.compute"],
            instance=instance,
        )
def test_sleepy_pipeline():
    """Smoke test: the sleepy pipeline runs to successful completion."""
    assert execute_pipeline(sleepy_pipeline).success