Example No. 1
def start_pipeline_execution(graphene_info, execution_params,
                             reexecution_config):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.inst_param(execution_params, 'execution_params', ExecutionParams)
    check.opt_inst_param(reexecution_config, 'reexecution_config',
                         ReexecutionConfig)

    instance = graphene_info.context.instance

    dauphin_pipeline = get_dauphin_pipeline_from_selector(
        graphene_info, execution_params.selector)

    get_validated_config(
        graphene_info,
        dauphin_pipeline,
        environment_dict=execution_params.environment_dict,
        mode=execution_params.mode,
    )

    execution_plan = create_execution_plan(
        dauphin_pipeline.get_dagster_pipeline(),
        execution_params.environment_dict,
        run_config=RunConfig(mode=execution_params.mode),
    )

    _check_start_pipeline_execution_errors(graphene_info, execution_params,
                                           execution_plan, reexecution_config)

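    # Persist the run record up front; reuse the client-supplied run_id from execution metadata, or mint a new one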
    run = instance.create_run(
        PipelineRun(
            pipeline_name=dauphin_pipeline.get_dagster_pipeline().name,
            run_id=execution_params.execution_metadata.run_id if
            execution_params.execution_metadata.run_id else make_new_run_id(),
            selector=execution_params.selector,
            environment_dict=execution_params.environment_dict,
            mode=execution_params.mode,
            reexecution_config=reexecution_config,
            step_keys_to_execute=execution_params.step_keys,
            status=PipelineRunStatus.NOT_STARTED,
        ))

    graphene_info.context.execution_manager.execute_pipeline(
        graphene_info.context.get_handle(),
        dauphin_pipeline.get_dagster_pipeline(),
        run,
        raise_on_error=graphene_info.context.raise_on_error,
        instance=instance,
    )

    return graphene_info.schema.type_named('StartPipelineExecutionSuccess')(
        run=graphene_info.schema.type_named('PipelineRun')(run))
Example No. 2
def create_valid_pipeline_run(graphene_info, external_pipeline, execution_params):
    if execution_params.mode is None and len(external_pipeline.available_modes) > 1:
        raise UserFacingGraphQLError(
            GrapheneNoModeProvidedError(external_pipeline.name, external_pipeline.available_modes)
        )
    elif execution_params.mode is None and len(external_pipeline.available_modes) == 1:
        mode = external_pipeline.available_modes[0]

    else:
        mode = execution_params.mode

    ensure_valid_config(external_pipeline, mode, execution_params.run_config)

    step_keys_to_execute, known_state = compute_step_keys_to_execute(
        graphene_info, execution_params
    )

    external_execution_plan = get_external_execution_plan_or_raise(
        graphene_info=graphene_info,
        external_pipeline=external_pipeline,
        mode=mode,
        run_config=execution_params.run_config,
        step_keys_to_execute=step_keys_to_execute,
        known_state=known_state,
    )
    tags = merge_dicts(external_pipeline.tags, execution_params.execution_metadata.tags)

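    # Build the run against the external pipeline and execution-plan snapshots, minting a run id if none was supplied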
    pipeline_run = graphene_info.context.instance.create_run(
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        pipeline_name=execution_params.selector.pipeline_name,
        run_id=execution_params.execution_metadata.run_id
        if execution_params.execution_metadata.run_id
        else make_new_run_id(),
        solid_selection=execution_params.selector.solid_selection,
        solids_to_execute=frozenset(execution_params.selector.solid_selection)
        if execution_params.selector.solid_selection
        else None,
        run_config=execution_params.run_config,
        mode=mode,
        step_keys_to_execute=step_keys_to_execute,
        tags=tags,
        root_run_id=execution_params.execution_metadata.root_run_id,
        parent_run_id=execution_params.execution_metadata.parent_run_id,
        status=PipelineRunStatus.NOT_STARTED,
        external_pipeline_origin=external_pipeline.get_external_origin(),
        pipeline_code_origin=external_pipeline.get_python_origin(),
    )

    return pipeline_run
Example No. 3
def test_listen_notify_filter_run_event(conn_string):
    event_log_storage = PostgresEventLogStorage.create_clean_storage(
        conn_string)

    @solid
    def return_one(_):
        return 1

    def _solids():
        return_one()

    run_id_one = make_new_run_id()
    run_id_two = make_new_run_id()

    # only watch one of the runs
    event_list = []
    event_log_storage.event_watcher.watch_run(run_id_two, 0, event_list.append)

    try:
        events_one, _result_one = synthesize_events(_solids, run_id=run_id_one)
        for event in events_one:
            event_log_storage.store_event(event)

        events_two, _result_two = synthesize_events(_solids, run_id=run_id_two)
        for event in events_two:
            event_log_storage.store_event(event)

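        # Poll until the watcher callback has collected all events for run_id_two, or the timeout expires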
        start = time.time()
        while len(event_list) < len(
                events_two) and time.time() - start < TEST_TIMEOUT:
            pass

        assert len(event_list) == len(events_two)
        # uncomment when https://github.com/dagster-io/dagster/issues/3368 is resolved with structured event
        # assert all([isinstance(event, DagsterEventRecord) for event in event_list])

    finally:
        del event_log_storage
Example No. 4
def test_s3_intermediate_storage(mock_s3_bucket):
    run_id = make_new_run_id()
    run_id_2 = make_new_run_id()

    intermediate_storage = S3IntermediateStorage(run_id=run_id,
                                                 s3_bucket=mock_s3_bucket.name)
    assert intermediate_storage.root == "/".join(
        ["dagster", "storage", run_id])

    intermediate_storage_2 = S3IntermediateStorage(
        run_id=run_id_2, s3_bucket=mock_s3_bucket.name)
    assert intermediate_storage_2.root == "/".join(
        ["dagster", "storage", run_id_2])

    try:
        with yield_empty_pipeline_context(run_id=run_id) as context:

            intermediate_storage.set_intermediate(context, RuntimeBool,
                                                  StepOutputHandle("true"),
                                                  True)

            assert intermediate_storage.has_intermediate(
                context, StepOutputHandle("true"))
            assert (intermediate_storage.get_intermediate(
                context, RuntimeBool, StepOutputHandle("true")).obj is True)
            assert intermediate_storage.uri_for_paths(["true"]).startswith("s3://")

            intermediate_storage_2.copy_intermediate_from_run(
                context, run_id, StepOutputHandle("true"))
            assert intermediate_storage_2.has_intermediate(
                context, StepOutputHandle("true"))
            assert (intermediate_storage_2.get_intermediate(
                context, RuntimeBool, StepOutputHandle("true")).obj is True)
    finally:
        intermediate_storage.rm_intermediate(context, StepOutputHandle("true"))
        intermediate_storage_2.rm_intermediate(context,
                                               StepOutputHandle("true"))
Example No. 5
def test_s3_intermediate_storage(s3_bucket):
    run_id = make_new_run_id()
    run_id_2 = make_new_run_id()

    intermediate_storage = S3IntermediateStorage(run_id=run_id,
                                                 s3_bucket=s3_bucket)
    assert intermediate_storage.root == '/'.join(
        ['dagster', 'storage', run_id])

    intermediate_storage_2 = S3IntermediateStorage(run_id=run_id_2,
                                                   s3_bucket=s3_bucket)
    assert intermediate_storage_2.root == '/'.join(
        ['dagster', 'storage', run_id_2])

    try:
        with yield_empty_pipeline_context(run_id=run_id) as context:

            intermediate_storage.set_intermediate(context, RuntimeBool,
                                                  StepOutputHandle('true'),
                                                  True)

            assert intermediate_storage.has_intermediate(
                context, StepOutputHandle('true'))
            assert (intermediate_storage.get_intermediate(
                context, RuntimeBool, StepOutputHandle('true')).obj is True)
            assert intermediate_storage.uri_for_paths(['true']).startswith('s3://')

            intermediate_storage_2.copy_intermediate_from_run(
                context, run_id, StepOutputHandle('true'))
            assert intermediate_storage_2.has_intermediate(
                context, StepOutputHandle('true'))
            assert (intermediate_storage_2.get_intermediate(
                context, RuntimeBool, StepOutputHandle('true')).obj is True)
    finally:
        intermediate_storage.rm_intermediate(context, StepOutputHandle('true'))
        intermediate_storage_2.rm_intermediate(context,
                                               StepOutputHandle('true'))
Example No. 6
def test_using_file_system_for_subplan_missing_input():
    pipeline = define_inty_pipeline()
    environment_dict = {'storage': {'filesystem': {}}}

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)
    pipeline_run = PipelineRun.create_empty_run(pipeline.name, make_new_run_id())

    with pytest.raises(DagsterStepOutputNotFoundError):
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            DagsterInstance.ephemeral(),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
        )
Example No. 7
    def test_delete_with_tags(self, storage):
        assert storage
        run_id = make_new_run_id()
        storage.add_run(
            TestRunStorage.build_run(
                run_id=run_id,
                pipeline_name="some_pipeline",
                tags={run_id: run_id},
            ))
        assert len(storage.get_runs()) == 1
        assert run_id in [key for key, value in storage.get_run_tags()]
        storage.delete_run(run_id)
        assert list(storage.get_runs()) == []
        assert run_id not in [key for key, value in storage.get_run_tags()]
Example No. 8
def test_running():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'passing_pipeline')
    environment_dict = {
        'solids': {
            'sum_solid': {
                'inputs': {
                    'num': script_relative_path('data/num.csv')
                }
            }
        }
    }
    selector = ExecutionSelector('csv_hello_world')

    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=passing_pipeline.name,
            run_id=run_id,
            selector=selector,
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        ))
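    # Launch the run in a subprocess and block on join() before asserting on status and logs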
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(handle,
                                       passing_pipeline,
                                       pipeline_run,
                                       instance,
                                       raise_on_error=False)
    execution_manager.join()
    assert instance.get_run(run_id).status == PipelineRunStatus.SUCCESS
    events = instance.all_logs(run_id)
    assert events

    process_start_events = get_events_of_type(
        events, DagsterEventType.PIPELINE_PROCESS_START)
    assert len(process_start_events) == 1

    process_started_events = get_events_of_type(
        events, DagsterEventType.PIPELINE_PROCESS_STARTED)
    assert len(process_started_events) == 1

    process_exited_events = get_events_of_type(
        events, DagsterEventType.PIPELINE_PROCESS_EXITED)
    assert len(process_exited_events) == 1
Example No. 9
def test_execution_plan_reexecution_with_in_memory():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = make_new_run_id()
    environment_dict = {'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}}
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )

    assert result.success

    ## re-execute add_two

    new_run_id = make_new_run_id()

    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name,
        run_id=new_run_id,
        environment_dict=environment_dict,
        mode='default',
        previous_run_id=result.run_id,
    )

    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=pipeline_run
    )

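    # Subset re-execution of add_two should raise: the required upstream intermediate is not available from the in-memory run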
    with pytest.raises(DagsterInvariantViolationError):
        execute_plan(
            execution_plan.build_subset_plan(['add_two.compute']),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        )
Example No. 10
def test_fan_out_should_skip_step():
    @solid(output_defs=[
        OutputDefinition(Int, "out_1", is_required=False),
        OutputDefinition(Int, "out_2", is_required=False),
        OutputDefinition(Int, "out_3", is_required=False),
    ])
    def foo(_):
        yield Output(1, "out_1")

    @solid
    def bar(_, input_arg):
        return input_arg

    @pipeline
    def optional_outputs():
        foo_res = foo()
        # pylint: disable=no-member
        bar.alias("bar_1")(input_arg=foo_res.out_1)
        bar.alias("bar_2")(input_arg=foo_res.out_2)
        bar.alias("bar_3")(input_arg=foo_res.out_3)

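    # Execute only "foo", which yields out_1 but not out_2/out_3, so bar_2 and bar_3 should be skipped downstream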
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name="optional_outputs",
                               run_id=make_new_run_id())
    execute_plan(
        create_execution_plan(optional_outputs, step_keys_to_execute=["foo"]),
        InMemoryPipeline(optional_outputs),
        instance,
        pipeline_run,
    )

    assert not should_skip_step(
        create_execution_plan(optional_outputs, step_keys_to_execute=["bar_1"]),
        instance,
        pipeline_run.run_id,
    )
    assert should_skip_step(
        create_execution_plan(optional_outputs,
                              step_keys_to_execute=["bar_2"]),
        instance,
        pipeline_run.run_id,
    )
    assert should_skip_step(
        create_execution_plan(optional_outputs,
                              step_keys_to_execute=["bar_3"]),
        instance,
        pipeline_run.run_id,
    )
Example No. 11
    def test_fetch_run_filter(self, storage):
        assert storage
        one = make_new_run_id()
        two = make_new_run_id()

        storage.add_run(
            TestRunStorage.build_run(
                run_id=one,
                pipeline_name="some_pipeline",
                status=PipelineRunStatus.SUCCESS,
            ))
        storage.add_run(
            TestRunStorage.build_run(
                run_id=two,
                pipeline_name="some_pipeline",
                status=PipelineRunStatus.SUCCESS,
            ), )

        assert len(storage.get_runs()) == 2

        some_runs = storage.get_runs(PipelineRunsFilter(run_ids=[one, two]))
        count = storage.get_runs_count(PipelineRunsFilter(run_ids=[one, two]))
        assert len(some_runs) == 2
        assert count == 2
Example No. 12
def test_file_system_intermediate_store():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        intermediate_store.set_object(True, context, RuntimeBool, ['true'])
        assert intermediate_store.has_object(context, ['true'])
        assert intermediate_store.get_object(context, RuntimeBool, ['true']).obj is True
        assert intermediate_store.uri_for_paths(['true']).startswith('file:///')
        assert intermediate_store.rm_object(context, ['true']) is None
        assert intermediate_store.rm_object(context, ['true']) is None
        assert intermediate_store.rm_object(context, ['dslkfhjsdflkjfs']) is None
Example No. 13
def pipeline_run_from_execution_params(execution_params):
    check.inst_param(execution_params, 'execution_params', ExecutionParams)

    return PipelineRun(
        pipeline_name=execution_params.selector.name,
        run_id=execution_params.execution_metadata.run_id
        if execution_params.execution_metadata.run_id else make_new_run_id(),
        selector=execution_params.selector,
        environment_dict=execution_params.environment_dict,
        mode=execution_params.mode,
        step_keys_to_execute=execution_params.step_keys,
        tags=execution_params.execution_metadata.tags,
        status=PipelineRunStatus.NOT_STARTED,
        previous_run_id=execution_params.previous_run_id,
    )
Example No. 14
    def test_fetch_by_snapshot_id(self, storage):
        assert storage
        pipeline_def_a = PipelineDefinition(name="some_pipeline", solid_defs=[])
        pipeline_def_b = PipelineDefinition(name="some_other_pipeline", solid_defs=[])
        pipeline_snapshot_a = pipeline_def_a.get_pipeline_snapshot()
        pipeline_snapshot_b = pipeline_def_b.get_pipeline_snapshot()
        pipeline_snapshot_a_id = create_pipeline_snapshot_id(pipeline_snapshot_a)
        pipeline_snapshot_b_id = create_pipeline_snapshot_id(pipeline_snapshot_b)

        assert storage.add_pipeline_snapshot(pipeline_snapshot_a) == pipeline_snapshot_a_id
        assert storage.add_pipeline_snapshot(pipeline_snapshot_b) == pipeline_snapshot_b_id

        one = make_new_run_id()
        two = make_new_run_id()
        storage.add_run(
            TestRunStorage.build_run(
                run_id=one,
                pipeline_name="some_pipeline",
                pipeline_snapshot_id=pipeline_snapshot_a_id,
            )
        )
        storage.add_run(
            TestRunStorage.build_run(
                run_id=two,
                pipeline_name="some_other_pipeline",
                pipeline_snapshot_id=pipeline_snapshot_b_id,
            )
        )
        assert len(storage.get_runs()) == 2
        runs_a = storage.get_runs(PipelineRunsFilter(snapshot_id=pipeline_snapshot_a_id))
        assert len(runs_a) == 1
        assert runs_a[0].run_id == one

        runs_b = storage.get_runs(PipelineRunsFilter(snapshot_id=pipeline_snapshot_b_id))
        assert len(runs_b) == 1
        assert runs_b[0].run_id == two
Example No. 15
def in_pipeline_manager(
    pipeline_name='hello_world_pipeline',
    solid_handle=SolidHandle('hello_world', 'hello_world', None),
    handle_kwargs=None,
    mode=None,
    **kwargs
):
    manager = Manager()

    run_id = make_new_run_id()
    instance = DagsterInstance.local_temp()
    marshal_dir = tempfile.mkdtemp()

    if not handle_kwargs:
        handle_kwargs = {
            'pipeline_name': pipeline_name,
            'module_name': 'dagstermill.examples.repository',
            'fn_name': 'define_hello_world_pipeline',
        }

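    # Pack the PipelineRun into a plain dict so it can be passed to the manager when reconstituting the context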
    pipeline_run_dict = pack_value(
        PipelineRun(
            pipeline_name=pipeline_name,
            run_id=run_id,
            mode=mode or 'default',
            environment_dict=None,
            selector=None,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )

    try:
        with safe_tempfile_path() as output_log_file_path:
            context_dict = {
                'pipeline_run_dict': pipeline_run_dict,
                'solid_handle_kwargs': solid_handle._asdict(),
                'handle_kwargs': handle_kwargs,
                'marshal_dir': marshal_dir,
                'environment_dict': {},
                'output_log_path': output_log_file_path,
                'instance_ref_dict': pack_value(instance.get_ref()),
            }

            manager.reconstitute_pipeline_context(**dict(context_dict, **kwargs))
            yield manager
    finally:
        shutil.rmtree(marshal_dir)
Example No. 16
def test_file_system_intermediate_store_composite_types_with_custom_serializer_for_inner_type():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:

        intermediate_store.set_object(
            ['foo', 'bar'], context, resolve_dagster_type(List[LowercaseString]), ['list']
        )
        assert intermediate_store.has_object(context, ['list'])
        assert intermediate_store.get_object(
            context, resolve_dagster_type(List[Bool]), ['list']
        ).obj == ['foo', 'bar']
Example No. 17
def test_gcs_intermediate_store_composite_types_with_custom_serializer_for_inner_type(gcs_bucket):
    run_id = make_new_run_id()

    intermediate_store = GCSIntermediateStore(run_id=run_id, gcs_bucket=gcs_bucket)
    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_store.set_object(
                ['foo', 'bar'], context, resolve_dagster_type(List[LowercaseString]), ['list'],
            )
            assert intermediate_store.has_object(context, ['list'])
            assert intermediate_store.get_object(
                context, resolve_dagster_type(List[Bool]), ['list']
            ).obj == ['foo', 'bar']

        finally:
            intermediate_store.rm_object(context, ['list'])  # clean up the key written above
Example No. 18
def test_file_system_intermediate_store_composite_types():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()

    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(instance=instance, run_id=run_id) as context:
        intermediate_store.set_object(
            [True, False], context, resolve_dagster_type(List[Bool]), ['bool']
        )
        assert intermediate_store.has_object(context, ['bool'])
        assert intermediate_store.get_object(
            context, resolve_dagster_type(List[Bool]), ['bool']
        ).obj == [True, False]
Example No. 19
def test_error_dag_python():  # pylint: disable=redefined-outer-name
    pipeline_name = 'demo_error_pipeline'
    recon_repo = ReconstructableRepository.for_module('test_pipelines.repo', pipeline_name)
    environments_path = test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, 'env_filesystem.yaml'),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)
    execution_date = timezone.utcnow()

    dag, tasks = make_airflow_dag_for_recon_repo(recon_repo, pipeline_name, run_config)

    with pytest.raises(AirflowException) as exc_info:
        execute_tasks_in_dag(dag, tasks, run_id=make_new_run_id(), execution_date=execution_date)

    assert 'Exception: Unusual error' in str(exc_info.value)
Example No. 20
    def test_synchronously_execute_run_within_hosted_user_process_not_found(
            self, graphql_context):
        run_id = make_new_run_id()
        result = execute_dagster_graphql(
            graphql_context,
            EXECUTE_RUN_IN_PROCESS_MUTATION,
            variables={
                "runId": run_id,
                "repositoryLocationName": main_repo_location_name(),
                "repositoryName": main_repo_name(),
            },
        )

        assert result.data
        assert result.data["executeRunInProcess"][
            "__typename"] == "PipelineRunNotFoundError"
Example No. 21
    def __new__(
        cls, run_id=None, tags=None, step_keys_to_execute=None, mode=None, previous_run_id=None,
    ):

        check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

        tags = check.opt_dict_param(tags, 'tags', key_type=str)

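        # Default to a freshly generated run id when the caller does not provide one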
        return super(RunConfig, cls).__new__(
            cls,
            run_id=check.str_param(run_id, 'run_id') if run_id else make_new_run_id(),
            tags=tags,
            step_keys_to_execute=step_keys_to_execute,
            mode=check.opt_str_param(mode, 'mode'),
            previous_run_id=check.opt_str_param(previous_run_id, 'previous_run_id'),
        )
Example No. 22
def test_s3_intermediate_store_with_composite_type_storage_plugin(s3_bucket):
    run_id = make_new_run_id()

    intermediate_store = S3IntermediateStore(
        run_id=run_id,
        s3_bucket=s3_bucket,
        type_storage_plugin_registry=TypeStoragePluginRegistry([
            (RuntimeString, FancyStringS3TypeStoragePlugin)
        ]),
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(['hello'], context,
                                         resolve_dagster_type(List[String]),
                                         ['obj_name'])
Example No. 23
def test_file_system_intermediate_store_with_custom_serializer():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:

        intermediate_store.set_object('foo', context, LowercaseString, ['foo'])

        with open(os.path.join(intermediate_store.root, 'foo'), 'rb') as fd:
            assert fd.read().decode('utf-8') == 'FOO'

        assert intermediate_store.has_object(context, ['foo'])
        assert intermediate_store.get_object(context, LowercaseString, ['foo']).obj == 'foo'
Example No. 24
def pipeline_run_args_from_execution_params(execution_params, step_keys_to_execute=None):
    check.inst_param(execution_params, 'execution_params', ExecutionParams)
    return dict(
        pipeline_name=execution_params.selector.name,
        run_id=execution_params.execution_metadata.run_id
        if execution_params.execution_metadata.run_id
        else make_new_run_id(),
        selector=execution_params.selector,
        environment_dict=execution_params.environment_dict,
        mode=execution_params.mode,
        step_keys_to_execute=step_keys_to_execute or execution_params.step_keys,
        tags=execution_params.execution_metadata.tags,
        root_run_id=execution_params.execution_metadata.root_run_id,
        parent_run_id=execution_params.execution_metadata.parent_run_id,
        status=PipelineRunStatus.NOT_STARTED,
    )
Example No. 25
    def test_basic_storage(self, storage):
        assert storage
        run_id = make_new_run_id()
        added = storage.add_run(
            TestRunStorage.build_run(run_id=run_id, pipeline_name='some_pipeline')
        )
        assert added
        runs = storage.get_runs()
        assert len(runs) == 1
        run = runs[0]
        assert run.run_id == run_id
        assert run.pipeline_name == 'some_pipeline'
        assert storage.has_run(run_id)
        fetched_run = storage.get_run_by_id(run_id)
        assert fetched_run.run_id == run_id
        assert fetched_run.pipeline_name == 'some_pipeline'
Example No. 26
    def test_synchronously_execute_run_within_hosted_user_process_not_found(
            self, graphql_context):
        run_id = make_new_run_id()
        result = execute_dagster_graphql(
            graphql_context,
            EXECUTE_RUN_IN_PROCESS_QUERY,
            variables={
                'runId': run_id,
                'repositoryLocationName': main_repo_location_name(),
                'repositoryName': main_repo_name(),
            },
        )

        assert result.data
        assert result.data['executeRunInProcess'][
            '__typename'] == 'PipelineRunNotFoundError'
Example No. 27
def test_spark_data_frame_serialization_file_system_file_handle(spark_config):
    @solid
    def nonce(_):
        return LocalFileHandle(file_relative_path(__file__, 'data/test.csv'))

    @pipeline(mode_defs=[spark_mode])
    def spark_df_test_pipeline():

        ingest_csv_file_handle_to_spark(nonce())

    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id)

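    # Run with filesystem storage so the resulting Spark DataFrame is persisted under the intermediate store root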
    result = execute_pipeline(
        spark_df_test_pipeline,
        run_config=RunConfig(run_id=run_id, mode='spark'),
        environment_dict={
            'storage': {
                'filesystem': {}
            },
            'resources': {
                'spark': {
                    'config': {
                        'spark_conf': spark_config
                    }
                }
            },
        },
        instance=instance,
    )

    assert result.success
    result_dir = os.path.join(
        intermediate_store.root,
        'intermediates',
        'ingest_csv_file_handle_to_spark.compute',
        'result',
    )

    assert '_SUCCESS' in os.listdir(result_dir)

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(result_dir)
    assert isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert df.head()[0] == '1'
Example No. 28
    def test_run_record_stats(self, storage):
        assert storage

        self._skip_in_memory(storage)

        run_id = make_new_run_id()
        run_to_add = TestRunStorage.build_run(pipeline_name="pipeline_name",
                                              run_id=run_id)

        storage.add_run(run_to_add)

        run_record = storage.get_run_records(
            PipelineRunsFilter(run_ids=[run_id]))[0]

        assert run_record.start_time is None
        assert run_record.end_time is None

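        # A PIPELINE_START event should populate start_time on the run record; end_time stays unset until the run finishes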
        storage.handle_run_event(
            run_id,
            DagsterEvent(
                message="a message",
                event_type_value=DagsterEventType.PIPELINE_START.value,
                pipeline_name="pipeline_name",
            ),
        )

        run_record = storage.get_run_records(
            PipelineRunsFilter(run_ids=[run_id]))[0]

        assert run_record.start_time is not None
        assert run_record.end_time is None

        storage.handle_run_event(
            run_id,
            DagsterEvent(
                message="a message",
                event_type_value=DagsterEventType.PIPELINE_SUCCESS.value,
                pipeline_name="pipeline_name",
            ),
        )

        run_record = storage.get_run_records(
            PipelineRunsFilter(run_ids=[run_id]))[0]

        assert run_record.start_time is not None
        assert run_record.end_time is not None
        assert run_record.end_time >= run_record.start_time
Example No. 29
def start_pipeline_execution(graphene_info, execution_params,
                             reexecution_config):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.inst_param(execution_params, 'execution_params', ExecutionParams)
    check.opt_inst_param(reexecution_config, 'reexecution_config',
                         ReexecutionConfig)

    pipeline_run_storage = graphene_info.context.pipeline_runs

    dauphin_pipeline = get_dauphin_pipeline_from_selector(
        graphene_info, execution_params.selector)

    execution_plan = create_execution_plan(
        dauphin_pipeline.get_dagster_pipeline(),
        get_validated_config(
            graphene_info,
            dauphin_pipeline,
            environment_dict=execution_params.environment_dict,
            mode=execution_params.mode,
        ).value,
        mode=execution_params.mode,
    )

    _check_start_pipeline_execution_errors(graphene_info, execution_params,
                                           execution_plan, reexecution_config)

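    # Create and register the run through pipeline run storage, minting a run id if the client supplied none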
    run = pipeline_run_storage.create_run(
        run_id=execution_params.execution_metadata.run_id
        if execution_params.execution_metadata.run_id else make_new_run_id(),
        selector=execution_params.selector,
        env_config=execution_params.environment_dict,
        mode=execution_params.mode,
        execution_plan=execution_plan,
        reexecution_config=reexecution_config,
        step_keys_to_execute=execution_params.step_keys,
    )
    pipeline_run_storage.add_run(run)

    graphene_info.context.execution_manager.execute_pipeline(
        graphene_info.context.get_handle(),
        dauphin_pipeline.get_dagster_pipeline(),
        run,
        raise_on_error=graphene_info.context.raise_on_error,
    )

    return graphene_info.schema.type_named('StartPipelineExecutionSuccess')(
        run=graphene_info.schema.type_named('PipelineRun')(run))
Example No. 30
    def _prepare_message(self, orig_message, message_props):
        check.str_param(orig_message, 'orig_message')
        check.dict_param(message_props, 'message_props')

        # These are todos to further align with the Python logging API
        check.invariant(
            'extra' not in message_props, 'do not allow until explicit support is handled'
        )
        check.invariant(
            'exc_info' not in message_props, 'do not allow until explicit support is handled'
        )

        # Reserved keys in the message_props -- these are system generated.
        check.invariant('orig_message' not in message_props, 'orig_message reserved value')
        check.invariant('message' not in message_props, 'message reserved value')
        check.invariant('log_message_id' not in message_props, 'log_message_id reserved value')
        check.invariant('log_timestamp' not in message_props, 'log_timestamp reserved value')

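        # make_new_run_id is used here only as a convenient unique identifier for this log message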
        log_message_id = make_new_run_id()

        log_timestamp = datetime.datetime.utcnow().isoformat()

        synth_props = {
            'orig_message': orig_message,
            'log_message_id': log_message_id,
            'log_timestamp': log_timestamp,
            'run_id': self.run_id,
        }

        # We first generate all props for the purpose of producing the semi-structured
        # log message via _kv_message
        all_props = dict(
            itertools.chain(synth_props.items(), self.logging_tags.items(), message_props.items())
        )

        # So here we use the arbitrary key DAGSTER_META_KEY to store a dictionary of
        # all the meta information that dagster injects into the log message.
        # The python logging module, in its infinite wisdom, actually takes all the
        # keys in extra and unconditionally smashes them into the internal dictionary
        # of the logging.LogRecord class. We used a reserved key here to avoid naming
        # collisions with internal variables of the LogRecord class.
        # See __init__.py:363 (makeLogRecord) in the python 3.6 logging module source
        # for the gory details.
        return (
            construct_log_string(synth_props, self.logging_tags, message_props),
            {DAGSTER_META_KEY: all_props},
        )