Example #1
def _build_sub_pipeline(pipeline_def, solid_names):
    '''
    Build a pipeline which is a subset of another pipeline.
    Only includes the solids which are in solid_names.
    '''

    from dagster.core.definitions.handle import ExecutionTargetHandle

    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
    check.list_param(solid_names, 'solid_names', of_type=str)

    solid_name_set = set(solid_names)
    solids = list(map(pipeline_def.solid_named, solid_names))
    deps = {_dep_key_of(solid): {} for solid in solids}

    for solid in solids:
        for input_handle in solid.input_handles():
            if pipeline_def.dependency_structure.has_singular_dep(input_handle):
                output_handle = pipeline_def.dependency_structure.get_singular_dep(input_handle)
                if output_handle.solid.name in solid_name_set:
                    deps[_dep_key_of(solid)][input_handle.input_def.name] = DependencyDefinition(
                        solid=output_handle.solid.name,
                        output=output_handle.output_def.name,
                    )
            elif pipeline_def.dependency_structure.has_multi_deps(input_handle):
                output_handles = pipeline_def.dependency_structure.get_multi_deps(input_handle)
                deps[_dep_key_of(solid)][input_handle.input_def.name] = MultiDependencyDefinition([
                    DependencyDefinition(
                        solid=output_handle.solid.name,
                        output=output_handle.output_def.name,
                    )
                    for output_handle in output_handles
                    if output_handle.solid.name in solid_name_set
                ])

    sub_pipeline_def = PipelineDefinition(
        name=pipeline_def.name,  # should we change the name for subsetted pipeline?
        solid_defs=list({solid.definition for solid in solids}),
        mode_defs=pipeline_def.mode_definitions,
        dependencies=deps,
        _parent_pipeline_def=pipeline_def,
    )
    handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)
    if handle:
        ExecutionTargetHandle.cache_handle(sub_pipeline_def,
                                           handle,
                                           solid_names=solid_names)

    return sub_pipeline_def
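
A hedged usage sketch for the helper above; the pipeline object and solid names here are placeholders for illustration, not taken from the source:

# Hypothetical usage: build a sub-pipeline containing only two named solids.
# 'full_pipeline_def', 'load_data', and 'clean_data' are placeholder names.
sub_pipeline_def = _build_sub_pipeline(full_pipeline_def, ['load_data', 'clean_data'])

# The subset keeps the parent's name and mode definitions; only dependencies
# whose upstream solid is also in the subset are preserved.
assert sub_pipeline_def.name == full_pipeline_def.name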
Example #2
def multiprocess_executor(init_context):
    '''The default multiprocess executor.

    This simple multiprocess executor is available by default on any :py:class:`ModeDefinition`
    that does not provide custom executors. To select the multiprocess executor, include a fragment
    such as the following in your config:

    .. code-block:: yaml

        execution:
          multiprocess:
            max_concurrent: 4
    
    The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run
    concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of
    :py:func:`python:multiprocessing.cpu_count`.

    '''
    from dagster.core.definitions.handle import ExecutionTargetHandle
    from dagster.core.engine.init import InitExecutorContext

    check.inst_param(init_context, 'init_context', InitExecutorContext)

    handle, _ = ExecutionTargetHandle.get_handle(init_context.pipeline_def)
    return MultiprocessExecutorConfig(
        handle=handle,
        max_concurrent=init_context.executor_config['max_concurrent'])
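
The docstring above gives the YAML fragment for selecting this executor. A minimal sketch of the same selection passed programmatically, mirroring that fragment rather than a verified config schema; the pipeline object and the 'storage' entry are assumptions for illustration:

# Hedged sketch: selecting the multiprocess executor from Python by mirroring the
# YAML fragment in the docstring. 'my_pipeline_def' and the 'storage' entry are
# illustrative assumptions, not taken from the source.
from dagster import execute_pipeline

result = execute_pipeline(
    pipeline=my_pipeline_def,
    environment_dict={
        'execution': {'multiprocess': {'max_concurrent': 4}},
        'storage': {'filesystem': {}},
    },
)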
Example #3
def create_context_creation_data(pipeline_def, environment_dict, pipeline_run,
                                 instance, execution_plan):
    environment_config = EnvironmentConfig.build(pipeline_def,
                                                 environment_dict,
                                                 pipeline_run)

    mode_def = pipeline_def.get_mode_definition(pipeline_run.mode)
    system_storage_def = system_storage_def_from_config(
        mode_def, environment_config)
    executor_def = executor_def_from_config(mode_def, environment_config)

    execution_target_handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)

    return ContextCreationData(
        pipeline_def=pipeline_def,
        environment_config=environment_config,
        pipeline_run=pipeline_run,
        mode_def=mode_def,
        system_storage_def=system_storage_def,
        execution_target_handle=execution_target_handle,
        executor_def=executor_def,
        instance=instance,
        resource_keys_to_init=get_required_resource_keys_to_init(
            execution_plan, system_storage_def),
    )
Example #4
def multiprocess_executor(init_context):
    '''The default multiprocess executor.

    This simple multiprocess executor is available by default on any :py:class:`ModeDefinition`
    that does not provide custom executors. To select the multiprocess executor, include a fragment
    such as the following in your config:

    .. code-block:: yaml

        execution:
          multiprocess:
            max_concurrent: 4

    The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run
    concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of
    :py:func:`python:multiprocessing.cpu_count`.

    Execution priority can be configured using the ``dagster/priority`` tag via solid metadata,
    where the higher the number the higher the priority. 0 is the default and both positive
    and negative numbers can be used.
    '''
    from dagster.core.definitions.handle import ExecutionTargetHandle
    from dagster.core.engine.init import InitExecutorContext

    check.inst_param(init_context, 'init_context', InitExecutorContext)

    check_cross_process_constraints(init_context)

    handle, _ = ExecutionTargetHandle.get_handle(init_context.pipeline_def)
    return MultiprocessExecutorConfig(
        handle=handle,
        max_concurrent=init_context.executor_config['max_concurrent'],
        retries=Retries.from_config(init_context.executor_config['retries']),
    )
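
The docstring's note about ``dagster/priority`` refers to per-solid tags. One plausible way to attach such a tag, assuming the ``tags`` argument of the solid decorator accepts it; the solid body is a placeholder:

# Hedged sketch: giving a solid a higher execution priority via its tags, as the
# docstring above describes. The solid body is a placeholder for illustration.
from dagster import solid


@solid(tags={'dagster/priority': 3})
def high_priority_solid(context):
    context.log.info('scheduled ahead of lower-priority steps when slots are contested')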
Example #5
def initialize_step_context(scratch_dir):
    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_basic_pipeline).build_pipeline_definition()

    pipeline_run = PipelineRun(
        pipeline_name='foo_pipeline',
        run_id=str(uuid.uuid4()),
        environment_dict=make_environment_dict(scratch_dir, 'external'),
        mode='external',
    )

    plan = create_execution_plan(pipeline_def,
                                 pipeline_run.environment_dict,
                                 mode='external')

    initialization_manager = pipeline_initialization_manager(
        plan,
        pipeline_run.environment_dict,
        pipeline_run,
        DagsterInstance.ephemeral(),
    )
    for _ in initialization_manager.generate_setup_events():
        pass
    pipeline_context = initialization_manager.get_object()

    active_execution = plan.start(retries=Retries(RetryMode.DISABLED))
    step = active_execution.get_next_step()
    step_context = pipeline_context.for_step(step)
    return step_context
Example #6
def multiprocess_executor(init_context):
    from dagster.core.definitions.handle import ExecutionTargetHandle
    from dagster.core.engine.init import InitExecutorContext

    check.inst_param(init_context, 'init_context', InitExecutorContext)

    handle, _ = ExecutionTargetHandle.get_handle(init_context.pipeline_def)
    return MultiprocessExecutorConfig(
        handle=handle,
        max_concurrent=init_context.executor_config['max_concurrent'])
Example #7
def test_pipeline(mode):
    with seven.TemporaryDirectory() as tmpdir:
        pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
            define_basic_pipeline).build_pipeline_definition()
        result = execute_pipeline(
            pipeline=pipeline_def,
            mode=mode,
            environment_dict=make_environment_dict(tmpdir, mode),
        )
        assert result.result_for_solid('return_two').output_value() == 2
        assert result.result_for_solid('add_one').output_value() == 3
Example #8
def _check_pipeline_has_target_handle(pipeline_def):
    from dagster.core.definitions.handle import ExecutionTargetHandle

    handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)
    if not handle:
        raise DagsterUnmetExecutorRequirementsError(
            'You have attempted to use an executor that uses multiple processes with the pipeline "{name}" '
            'that can not be re-hydrated. Pipelines must be loaded in a way that allows dagster to reconstruct '
            'them in a new process. This means: \n'
            '  * using the file, module, or repository.yaml arguments of dagit/dagster-graphql/dagster\n'
            '  * constructing an ExecutionTargetHandle directly\n'.format(
                name=pipeline_def.name))
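
The error text lists the ways a pipeline can be made re-hydratable. The pattern used elsewhere in this listing (Examples #5 and #7) is to build the definition through an ExecutionTargetHandle, which records enough information to reconstruct the pipeline in a child process:

# Re-hydratable construction, as used in Examples #5 and #7: the handle records
# how to rebuild the pipeline, so multiprocess executors can reload it.
from dagster.core.definitions.handle import ExecutionTargetHandle

pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
    define_basic_pipeline).build_pipeline_definition()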
Example #9
def test_launcher_requests_retry():
    mode = 'request_retry'
    with seven.TemporaryDirectory() as tmpdir:
        pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
            define_basic_pipeline).build_pipeline_definition()
        result = execute_pipeline(
            pipeline=pipeline_def,
            mode=mode,
            environment_dict=make_environment_dict(tmpdir, mode),
        )
        assert result.result_for_solid('return_two').output_value() == 2
        assert result.result_for_solid('add_one').output_value() == 3
        for step_key, events in result.events_by_step_key.items():
            if step_key:
                event_types = [event.event_type for event in events]
                assert DagsterEventType.STEP_UP_FOR_RETRY in event_types
                assert DagsterEventType.STEP_RESTARTED in event_types
Example #10
def create_context_creation_data(pipeline_def, environment_dict, run_config, instance):
    environment_config = create_environment_config(pipeline_def, environment_dict, run_config)

    mode_def = pipeline_def.get_mode_definition(run_config.mode)
    system_storage_def = system_storage_def_from_config(mode_def, environment_config)
    executor_def = executor_def_from_config(mode_def, environment_config)

    execution_target_handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)
    return ContextCreationData(
        pipeline_def=pipeline_def,
        environment_config=environment_config,
        run_config=run_config,
        mode_def=mode_def,
        system_storage_def=system_storage_def,
        execution_target_handle=execution_target_handle,
        executor_def=executor_def,
        instance=instance,
    )
Example #11
def test_pyspark_emr(mock_wait, mock_get_step_events):
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {
                'AvailabilityZone': 'us-west-1a'
            },
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_do_nothing_pipe).build_pipeline_definition()
    result = execute_pipeline(
        pipeline=pipeline_def,
        mode='prod',
        environment_dict={
            'resources': {
                'pyspark_step_launcher': {
                    'config': deep_merge_dicts(
                        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG,
                        {'cluster_id': cluster_id},
                    ),
                }
            },
        },
    )
    assert result.success
    mock_wait.assert_called_once()
    mock_get_step_events.assert_called_once()
Example #12
def create_context_creation_data(pipeline_def, environment_dict, run_config):
    environment_config = create_environment_config(pipeline_def,
                                                   environment_dict,
                                                   run_config)

    mode_def = pipeline_def.get_mode_definition(run_config.mode)
    system_storage_def = system_storage_def_from_config(
        mode_def, environment_config)

    check_persistent_storage_requirement(pipeline_def, system_storage_def,
                                         run_config)

    return ContextCreationData(
        pipeline_def=pipeline_def,
        environment_config=environment_config,
        run_config=run_config,
        mode_def=mode_def,
        system_storage_def=system_storage_def,
        execution_target_handle=ExecutionTargetHandle.get_handle(pipeline_def),
    )
Example #13
def multiprocess_executor(init_context):
    '''The default multiprocess executor.

    This simple multiprocess executor is available by default on any :py:class:`ModeDefinition`
    that does not provide custom executors. To select the multiprocess executor, include a fragment
    such as the following in your config:

    .. code-block:: yaml

        execution:
          multiprocess:
            max_concurrent: 4

    The ``max_concurrent`` arg is optional and tells the execution engine how many processes may run
    concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return value of
    :py:func:`python:multiprocessing.cpu_count`.

    Execution priority can be configured using the ``dagster/priority`` tag via solid metadata,
    where the higher the number the higher the priority. 0 is the default and both positive
    and negative numbers can be used.
    '''
    from dagster.core.definitions.handle import ExecutionTargetHandle
    from dagster.core.engine.init import InitExecutorContext

    check.inst_param(init_context, 'init_context', InitExecutorContext)

    check_cross_process_constraints(init_context)

    # ExecutionTargetHandle.get_handle returns an ExecutionTargetHandleCacheEntry, which is a tuple
    # (handle, solid_subset). Right now we are throwing away the solid_subset that we store in the
    # cache -- this is fragile and we should fix this with
    # https://github.com/dagster-io/dagster/issues/2115 and friends so there are not multiple
    # sources of truth for the solid subset
    handle, _ = ExecutionTargetHandle.get_handle(init_context.pipeline_def)
    return MultiprocessExecutorConfig(
        handle=handle,
        max_concurrent=init_context.executor_config['max_concurrent'],
        retries=Retries.from_config(init_context.executor_config['retries']),
    )
Example #14
def test_do_it_live_emr():
    sync_code()

    # Retrieving the pipeline this way stores pipeline definition in the ExecutionTargetHandle
    # cache, where it can be retrieved and sent to the remote cluster at launch time.
    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_pyspark_pipe).build_pipeline_definition()

    result = execute_pipeline(
        pipeline_def,
        mode='prod',
        environment_dict={
            'solids': {
                'blah': {
                    'config': {
                        'foo': 'a string',
                        'bar': 123
                    }
                }
            },
            'resources': {
                'pyspark_step_launcher': {
                    'config': BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG
                },
            },
            'storage': {
                's3': {
                    'config': {
                        's3_bucket': S3_BUCKET,
                        's3_prefix': 'test_pyspark'
                    }
                }
            },
        },
    )
    assert result.success