def test_using_s3_for_subplan(s3_bucket):
    pipeline = define_inty_pipeline()

    environment_dict = {'storage': {'s3': {'s3_bucket': s3_bucket}}}

    execution_plan = create_execution_plan(pipeline,
                                           environment_dict=environment_dict)

    assert execution_plan.get_step_by_key('return_one.transform')

    step_keys = ['return_one.transform']

    run_id = str(uuid.uuid4())

    try:
        return_one_step_events = list(
            execute_plan(
                execution_plan,
                environment_dict=environment_dict,
                run_config=RunConfig(run_id=run_id),
                step_keys_to_execute=step_keys,
            ))

        assert get_step_output(return_one_step_events, 'return_one.transform')
        with yield_pipeline_execution_context(
                pipeline, environment_dict,
                RunConfig(run_id=run_id)) as context:
            assert has_s3_intermediate(context, s3_bucket, run_id,
                                       'return_one.transform')
            assert get_s3_intermediate(context, s3_bucket, run_id,
                                       'return_one.transform', Int) == 1

        add_one_step_events = list(
            execute_plan(
                execution_plan,
                environment_dict=environment_dict,
                run_config=RunConfig(run_id=run_id),
                step_keys_to_execute=['add_one.transform'],
            ))

        assert get_step_output(add_one_step_events, 'add_one.transform')
        with yield_pipeline_execution_context(
                pipeline, environment_dict,
                RunConfig(run_id=run_id)) as context:
            assert has_s3_intermediate(context, s3_bucket, run_id,
                                       'add_one.transform')
            assert get_s3_intermediate(context, s3_bucket, run_id,
                                       'add_one.transform', Int) == 2
    finally:
        with yield_pipeline_execution_context(
                pipeline, environment_dict,
                RunConfig(run_id=run_id)) as context:
            rm_s3_intermediate(context, s3_bucket, run_id,
                               'return_one.transform')
            rm_s3_intermediate(context, s3_bucket, run_id, 'add_one.transform')
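For context, the S3 intermediate helpers used above (has_s3_intermediate, get_s3_intermediate, rm_s3_intermediate) are defined elsewhere. Below is a minimal sketch of what they might look like with boto3, assuming intermediates are keyed under the dagster/runs/<run_id>/files prefix asserted for S3ObjectStore in Example #9 and that values use the default pickle serialization; the key layout and signatures are assumptions, not the library's actual implementation.

import pickle

import boto3


def _intermediate_key(run_id, step_key):
    # assumed layout, mirroring the object store root asserted in Example #9
    return '/'.join(
        ['dagster', 'runs', run_id, 'files', 'intermediates', step_key, 'result'])


def has_s3_intermediate(context, s3_bucket, run_id, step_key):
    s3 = boto3.client('s3')
    try:
        s3.head_object(Bucket=s3_bucket, Key=_intermediate_key(run_id, step_key))
        return True
    except s3.exceptions.ClientError:
        return False


def get_s3_intermediate(context, s3_bucket, run_id, step_key, runtime_type):
    # context and runtime_type are unused here; the real helper presumably
    # dispatches on the type's serialization strategy instead of raw pickle
    body = boto3.client('s3').get_object(
        Bucket=s3_bucket,
        Key=_intermediate_key(run_id, step_key))['Body'].read()
    return pickle.loads(body)


def rm_s3_intermediate(context, s3_bucket, run_id, step_key):
    boto3.client('s3').delete_object(
        Bucket=s3_bucket, Key=_intermediate_key(run_id, step_key))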
Example #2
def test_file_system_object_store_composite_types_with_custom_serializer_for_inner_type(
):
    run_id = str(uuid.uuid4())

    object_store = FileSystemObjectStore(run_id=run_id)
    assert object_store.root == os.path.join(seven.get_system_temp_directory(),
                                             'dagster', 'runs', run_id,
                                             'files')

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        try:
            object_store.set_object(
                ['foo', 'bar'],
                context,
                resolve_to_runtime_type(List_(LowercaseString)).inst(),
                ['list'],
            )
            assert object_store.has_object(context, ['list'])
            assert object_store.get_object(
                context,
                resolve_to_runtime_type(List_(LowercaseString)).inst(),
                ['list']) == ['foo', 'bar']

        finally:
            try:
                shutil.rmtree(object_store.root)
            except seven.FileNotFoundError:
                pass
Example #3
def test_file_system_object_store_composite_types():
    run_id = str(uuid.uuid4())

    object_store = FileSystemObjectStore(run_id=run_id)
    assert object_store.root == os.path.join(seven.get_system_temp_directory(),
                                             'dagster', 'runs', run_id,
                                             'files')

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        try:
            object_store.set_object([True, False], context,
                                    resolve_to_runtime_type(
                                        List_(Bool_)).inst(), ['bool'])
            assert object_store.has_object(context, ['bool'])
            assert object_store.get_object(
                context,
                resolve_to_runtime_type(List_(Bool_)).inst(),
                ['bool']) == [True, False]

        finally:
            try:
                shutil.rmtree(object_store.root)
            except seven.FileNotFoundError:
                pass
Example #4
def test_file_system_object_store_with_base_dir():
    run_id = str(uuid.uuid4())

    try:
        tempdir = tempfile.mkdtemp()

        object_store = FileSystemObjectStore(run_id=run_id, base_dir=tempdir)
        assert object_store.root == os.path.join(tempdir, 'dagster', 'runs',
                                                 run_id, 'files')

        with yield_pipeline_execution_context(PipelineDefinition(
            []), {}, RunConfig(run_id=run_id)) as context:
            try:
                object_store.set_object(True, context, Bool.inst(), ['true'])
                assert object_store.has_object(context, ['true'])
                assert object_store.get_object(context, Bool.inst(),
                                               ['true']) is True

            finally:
                try:
                    shutil.rmtree(object_store.root)
                except seven.FileNotFoundError:
                    pass
    finally:
        try:
            shutil.rmtree(tempdir)
        except seven.FileNotFoundError:
            pass
Example #5
def test_serialize_deserialize():
    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig()) as context:
        with tempfile.NamedTemporaryFile() as fd:
            serialize_to_file(context, PickleSerializationStrategy(), 'foo',
                              fd.name)
            assert deserialize_from_file(context,
                                         PickleSerializationStrategy(),
                                         fd.name) == 'foo'
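serialize_to_file and deserialize_from_file simply route a value through a serialization strategy. A hedged sketch of the pattern the test exercises, with method and function signatures inferred from the call sites above rather than taken from the library:

import pickle


class PickleSerializationStrategy:
    # sketch of the strategy interface; dagster ships its own implementation
    def serialize_value(self, context, value, write_file_obj):
        pickle.dump(value, write_file_obj)

    def deserialize_value(self, context, read_file_obj):
        return pickle.load(read_file_obj)


def serialize_to_file(context, serialization_strategy, value, write_path):
    with open(write_path, 'wb') as write_obj:
        serialization_strategy.serialize_value(context, value, write_obj)


def deserialize_from_file(context, serialization_strategy, read_path):
    with open(read_path, 'rb') as read_obj:
        return serialization_strategy.deserialize_value(context, read_obj)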
Example #6
def test_file_system_object_store_with_composite_type_storage_plugin():
    run_id = str(uuid.uuid4())

    object_store = FileSystemObjectStore(
        run_id=run_id,
        types_to_register={
            String.inst(): FancyStringFilesystemTypeStoragePlugin
        })

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        with pytest.raises(check.NotImplementedCheckError):
            object_store.set_value(['hello'], context,
                                   resolve_to_runtime_type(List_(String_)),
                                   ['obj_name'])

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        with pytest.raises(check.NotImplementedCheckError):
            object_store.set_value(['hello'], context,
                                   resolve_to_runtime_type(Nullable_(String_)),
                                   ['obj_name'])

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        with pytest.raises(check.NotImplementedCheckError):
            object_store.set_value(['hello'], context,
                                   resolve_to_runtime_type(
                                       List_(Nullable_(String_))),
                                   ['obj_name'])

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        with pytest.raises(check.NotImplementedCheckError):
            object_store.set_value(['hello'], context,
                                   resolve_to_runtime_type(
                                       Nullable_(List_(String_))),
                                   ['obj_name'])
Example #7
    def define_out_of_pipeline_context(self, context_config):
        pipeline_def = PipelineDefinition([], name='Ephemeral Notebook Pipeline')

        # BUG: if the context cleans up after itself (e.g. closes a db connection or
        # similar), this cleanup runs *before* we return. We are going to have to
        # manage the context manager manually (without a `with` block) to make this work.
        # See https://github.com/dagster-io/dagster/issues/796
        with yield_pipeline_execution_context(
            pipeline_def,
            {} if context_config is None else {'context': context_config},
            ExecutionMetadata(run_id=''),
        ) as pipeline_context:
            self.context = DagstermillInNotebookExecutionContext(pipeline_context)
        return self.context
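One plausible shape for the manual management the comment calls for is to hold the context manager open and defer teardown until the notebook shuts down; the sketch below is an assumption about the eventual fix, not code from issue #796:

# Sketch: enter the generator-backed context manager by hand so resource
# teardown does not run before the returned context is used.
cm = yield_pipeline_execution_context(
    pipeline_def, {}, ExecutionMetadata(run_id=''))
pipeline_context = cm.__enter__()
# ... use pipeline_context; later, at shutdown:
# cm.__exit__(None, None, None)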
Example #8
def test_s3_object_store_with_composite_type_storage_plugin():
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    object_store = S3ObjectStore(
        run_id=run_id,
        s3_bucket='dagster-airflow-scratch',
        types_to_register={String.inst(): FancyStringS3TypeStoragePlugin},
    )

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        with pytest.raises(check.NotImplementedCheckError):
            object_store.set_value(['hello'], context,
                                   resolve_to_runtime_type(List_(String_)),
                                   ['obj_name'])
Example #9
def test_s3_object_store():
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    object_store = S3ObjectStore(run_id=run_id,
                                 s3_bucket='dagster-airflow-scratch')
    assert object_store.root == '/'.join(['dagster', 'runs', run_id, 'files'])

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        try:
            object_store.set_object(True, context, Bool.inst(), ['true'])

            assert object_store.has_object(context, ['true'])
            assert object_store.get_object(context, Bool.inst(),
                                           ['true']) is True
            assert object_store.url_for_paths(['true']).startswith('s3://')

        finally:
            object_store.rm_object(context, ['true'])
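The url_for_paths assertion implies the store composes its bucket and root into an s3:// URL. A sketch of that contract (the real method body is not shown on this page, so treat this as an assumption):

def url_for_paths(bucket, root, paths):
    # e.g. ('my-bucket', 'dagster/runs/<run_id>/files', ['true'])
    #   -> 's3://my-bucket/dagster/runs/<run_id>/files/true'
    return 's3://' + '/'.join([bucket, root] + list(paths))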
Example #10
    def execute(self):
        from dagster.core.execution import yield_pipeline_execution_context

        check.inst(self.run_config.executor_config, MultiprocessExecutorConfig)
        pipeline = self.run_config.executor_config.pipeline_fn()

        with yield_pipeline_execution_context(
                pipeline, self.environment_dict,
                self.run_config.with_tags(
                    pid=str(os.getpid()))) as pipeline_context:

            execution_plan = create_execution_plan_core(
                pipeline_context.pipeline_def,
                pipeline_context.environment_config)

            for step_event in start_inprocess_executor(
                    pipeline_context,
                    execution_plan,
                    pipeline_context.intermediates_manager,
                    step_keys_to_execute=[self.step_key],
            ):
                yield step_event
Example #11
def test_file_system_object_store_with_custom_serializer():
    run_id = str(uuid.uuid4())

    object_store = FileSystemObjectStore(run_id=run_id)

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        try:
            object_store.set_object('foo', context, LowercaseString.inst(),
                                    ['foo'])

            with open(os.path.join(object_store.root, 'foo'), 'rb') as fd:
                assert fd.read().decode('utf-8') == 'FOO'

            assert object_store.has_object(context, ['foo'])
            assert object_store.get_object(context, LowercaseString.inst(),
                                           ['foo']) == 'foo'
        finally:
            try:
                shutil.rmtree(object_store.root)
            except seven.FileNotFoundError:
                pass
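The assertions pin down what the custom strategy does: bytes on disk are uppercased ('FOO') while reads come back lowercased ('foo'). Since LowercaseString itself is not defined on this page, here is a minimal reconstruction under a hypothetical name:

class UppercaseOnDiskSerializationStrategy:
    # hypothetical; inferred from the round-trip behavior asserted above
    def serialize_value(self, context, value, write_file_obj):
        write_file_obj.write(value.upper().encode('utf-8'))

    def deserialize_value(self, context, read_file_obj):
        return read_file_obj.read().decode('utf-8').lower()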
Example #12
def test_s3_object_store_with_type_storage_plugin():
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    object_store = S3ObjectStore(
        run_id=run_id,
        s3_bucket='dagster-airflow-scratch',
        types_to_register={String.inst(): FancyStringS3TypeStoragePlugin},
    )

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        try:
            object_store.set_value('hello', context, String.inst(),
                                   ['obj_name'])

            assert object_store.has_object(context, ['obj_name'])
            assert object_store.get_value(context, String.inst(),
                                          ['obj_name']) == 'hello'

        finally:
            object_store.rm_object(context, ['obj_name'])
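A type storage plugin intercepts reads and writes for one specific runtime type, which is why composite types wrapping a plugged-in type raise NotImplementedCheckError in Examples #6 and #8. A hedged sketch of the plugin shape, with classmethod signatures assumed from the set_object/get_object pattern above:

class FancyStringS3TypeStoragePluginSketch:
    # hypothetical reconstruction; stores the value under an extra path
    # segment purely to demonstrate a custom layout
    @classmethod
    def set_object(cls, object_store, obj, context, runtime_type, paths):
        return object_store.set_object(obj, context, runtime_type,
                                       paths + ['fancy'])

    @classmethod
    def get_object(cls, object_store, context, runtime_type, paths):
        return object_store.get_object(context, runtime_type,
                                       paths + ['fancy'])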
Example #13
def test_s3_object_store_composite_types_with_custom_serializer_for_inner_type(
):
    run_id = str(uuid.uuid4())

    object_store = S3ObjectStore(run_id=run_id,
                                 s3_bucket='dagster-airflow-scratch')
    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        try:
            object_store.set_object(
                ['foo', 'bar'],
                context,
                resolve_to_runtime_type(List_(LowercaseString)).inst(),
                ['list'],
            )
            assert object_store.has_object(context, ['list'])
            assert object_store.get_object(
                context,
                resolve_to_runtime_type(List_(LowercaseString)).inst(),
                ['list']) == ['foo', 'bar']

        finally:
            object_store.rm_object(context, ['list'])
Example #14
def test_s3_object_store_with_custom_serializer():
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    object_store = S3ObjectStore(run_id=run_id,
                                 s3_bucket='dagster-airflow-scratch')

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        try:
            object_store.set_object('foo', context, LowercaseString.inst(),
                                    ['foo'])

            assert (object_store.s3.get_object(
                Bucket=object_store.bucket,
                Key='/'.join([object_store.root] +
                             ['foo']))['Body'].read().decode('utf-8') == 'FOO')

            assert object_store.has_object(context, ['foo'])
            assert object_store.get_object(context, LowercaseString.inst(),
                                           ['foo']) == 'foo'
        finally:
            object_store.rm_object(context, ['foo'])
Example #15
    def populate_context(
        self,
        run_id,
        solid_def_name,
        pipeline_def_name,
        marshal_dir,
        environment_dict,
        output_log_path,
    ):
        check.dict_param(environment_dict, 'environment_dict')
        self.populated_by_papermill = True
        check.invariant(
            self.repository_def is not None,
            desc='When running Dagstermill notebook in pipeline, '
            'must register a repository within notebook by calling '
            '"dm.register_repository(repository_def)"',
        )
        self.pipeline_def = self.repository_def.get_pipeline(pipeline_def_name)
        check.invariant(self.pipeline_def.has_solid_def(solid_def_name))
        self.solid_def = self.pipeline_def.solid_def_named(solid_def_name)

        self.marshal_dir = marshal_dir
        loggers = None
        if output_log_path != 0:
            event_logger = construct_json_event_logger(output_log_path)
            loggers = [event_logger]
        # do not include event_callback in ExecutionMetadata,
        # since that is taken care of by the side channel established by event_logger
        execution_metadata = ExecutionMetadata(run_id, loggers=loggers)
        # See block comment above referencing this issue
        # See https://github.com/dagster-io/dagster/issues/796
        with yield_pipeline_execution_context(
            self.pipeline_def, environment_dict, execution_metadata
        ) as pipeline_context:
            self.context = DagstermillInNotebookExecutionContext(pipeline_context)

        return self.context
Example #16
def test_file_system_object_store():
    run_id = str(uuid.uuid4())

    object_store = FileSystemObjectStore(run_id=run_id)
    assert object_store.root == os.path.join(seven.get_system_temp_directory(),
                                             'dagster', 'runs', run_id,
                                             'files')

    with yield_pipeline_execution_context(PipelineDefinition([]), {},
                                          RunConfig(run_id=run_id)) as context:
        try:
            object_store.set_object(True, context, Bool.inst(), ['true'])
            assert object_store.has_object(context, ['true'])
            assert object_store.get_object(context, Bool.inst(),
                                           ['true']) is True
            assert object_store.url_for_paths(['true']).startswith('file:///')
            assert object_store.rm_object(context, ['true']) is None
            assert object_store.rm_object(context, ['true']) is None
            assert object_store.rm_object(context, ['dslkfhjsdflkjfs']) is None
        finally:
            try:
                shutil.rmtree(object_store.root)
            except seven.FileNotFoundError:
                pass
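The three rm_object asserts establish its contract: removal returns None and is a no-op for keys that do not exist. A filesystem sketch of that behavior (the join of root and paths is an assumption):

import os
import shutil


def rm_object(root, paths):
    target = os.path.join(root, *paths)
    if os.path.isdir(target):
        shutil.rmtree(target)
    elif os.path.isfile(target):
        os.unlink(target)
    # missing targets are silently ignored, matching the asserts above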
Example #17
    def populate_context(
        self,
        run_id,
        solid_def_name,
        pipeline_def_name,
        marshal_dir,
        environment_dict,
        output_log_path,
        input_name_type_dict,
        output_name_type_dict,
    ):
        check.dict_param(environment_dict, 'environment_dict')
        self.populated_by_papermill = True
        self.solid_def_name = solid_def_name
        self.marshal_dir = marshal_dir

        if self.repository_def is None:
            self.pipeline_def = PipelineDefinition(
                [], name='Dummy Pipeline (No Repo Registration)')
            self.input_name_type_dict = dict_to_enum(input_name_type_dict)
            self.output_name_type_dict = dict_to_enum(output_name_type_dict)
            for _, runtime_type_enum in self.input_name_type_dict.items():
                if runtime_type_enum == SerializableRuntimeType.NONE:
                    raise DagstermillError(
                        'If Dagstermill solids have inputs that require serialization strategies '
                        'that are not pickling, then you must register a repository within '
                        'notebook by calling dm.register_repository(repository_def)'
                    )
            for _, runtime_type_enum in self.output_name_type_dict.items():
                if runtime_type_enum == SerializableRuntimeType.NONE:
                    raise DagstermillError(
                        'If Dagstermill solids have outputs that require serialization strategies '
                        'that are not pickling, then you must register a repository within '
                        'notebook by calling dm.register_repository(repository_def).'
                    )
            with yield_pipeline_execution_context(
                    self.pipeline_def, {},
                    RunConfig(run_id=run_id)) as pipeline_context:
                self.context = DagstermillInNotebookExecutionContext(
                    pipeline_context)
        else:
            self.pipeline_def = self.repository_def.get_pipeline(
                pipeline_def_name)
            check.invariant(self.pipeline_def.has_solid_def(solid_def_name))
            self.solid_def = self.pipeline_def.solid_def_named(solid_def_name)

            loggers = None
            if output_log_path != 0:  # there is no output log
                event_logger = construct_json_event_logger(output_log_path)
                loggers = [event_logger]
            # do not include event_callback in ExecutionMetadata,
            # since that is taken care of by the side channel established by event_logger
            run_config = RunConfig(run_id, loggers=loggers)
            # See block comment above referencing this issue
            # See https://github.com/dagster-io/dagster/issues/796
            with yield_pipeline_execution_context(
                    self.pipeline_def, environment_dict,
                    run_config) as pipeline_context:
                self.context = DagstermillInNotebookExecutionContext(
                    pipeline_context)

        return self.context
Example #18
def run_test_pipeline(pipeline):
    execution_metadata = ExecutionMetadata(run_id=str(uuid.uuid4()))
    with yield_pipeline_execution_context(pipeline, TEST_ENVIRONMENT,
                                          execution_metadata) as context:
        execution_plan = create_execution_plan_core(context)
        return execute_plan(context, execution_plan)