Example #1
from dagster import Field, Int, ModeDefinition, PresetDefinition, pipeline, resource

# Assumed import line and `adder_resource` definition: both are referenced
# below but truncated out of the snippet; the adder mirrors the multiplier.
@resource(config=Field(Int))
def adder_resource(init_context):
    return lambda x: x + init_context.resource_config


@resource(config=Field(Int))
def multer_resource(init_context):
    return lambda x: x * init_context.resource_config


@resource(config={'num_one': Field(Int), 'num_two': Field(Int)})
def double_adder_resource(init_context):
    return (lambda x: x + init_context.resource_config['num_one'] +
            init_context.resource_config['num_two'])


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='add_mode',
            resource_defs={'op': adder_resource},
            description='Mode that adds things',
        ),
        ModeDefinition(
            name='mult_mode',
            resource_defs={'op': multer_resource},
            description='Mode that multiplies things',
        ),
        ModeDefinition(
            name='double_adder',
            resource_defs={'op': double_adder_resource},
            description='Mode that adds two numbers to thing',
        ),
    ],
    preset_defs=[PresetDefinition.from_files("add", mode="add_mode")],
)
def multi_mode_pipeline():  # hypothetical name; the decorated function is truncated in the source
    pass
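
A hedged execution sketch for the multi-mode pattern above, assuming the hypothetical pipeline name (in older dagster versions the config kwarg is `environment_dict` rather than `run_config`):

from dagster import execute_pipeline

# 'op' resolves to multer_resource in this mode; 3 becomes resource_config.
result = execute_pipeline(
    multi_mode_pipeline,
    mode='mult_mode',
    run_config={'resources': {'op': {'config': 3}}},
)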
Example #2
# Reconstructed opening (the decorator and signature are truncated in the
# source): the solid name comes from the pipeline wiring below, and the body
# shows it consumes `links` and `ranks` RDDs; `add` is `operator.add`.
@solid(
    input_defs=[
        InputDefinition(name='links', dagster_type=SparkRDD),
        InputDefinition(name='ranks', dagster_type=SparkRDD),
    ],
    config={'iterations': Field(Int)},
)
def calculate_ranks_step_five(context, links, ranks):
    iterations = context.solid_config['iterations']
    for iteration in range(iterations):
        # Calculates URL contributions to the rank of other URLs.
        contribs = links.join(
            ranks).flatMap(lambda url_urls_rank: computeContribs(
                url_urls_rank[1][0], url_urls_rank[1][1]))

        # Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(add).mapValues(
            lambda rank: rank * 0.85 + 0.15)
        context.log.info('Completed iteration {}'.format(iteration))

    return ranks


@solid(input_defs=[InputDefinition(name='ranks', dagster_type=SparkRDD)])
def log_ranks_step_five(context, ranks):
    for (link, rank) in ranks.collect():
        context.log.info("%s has rank: %s." % (link, rank))

    return ranks.collect()


@pipeline(
    mode_defs=[ModeDefinition(resource_defs={'spark': pyspark_resource})])
def pyspark_pagerank_step_five():
    log_ranks_step_five(
        calculate_ranks_step_five(
            compute_links_step_five(parse_pagerank_data_step_five())))
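
The steps above rely on the classic Spark PageRank helper `computeContribs`, truncated out of the snippet; its standard sketch:

def computeContribs(urls, rank):
    # Distribute a url's rank evenly among the urls it links to.
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)
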
Example #3

def test_two_modes():
    pipeline_def = PipelineDefinition(
        name='TwoModePipelines',
        solids=[],
        mode_definitions=[
            ModeDefinition(
                'mode_one',
                resources={
                    'value':
                    dummy_resource(
                        Field(Dict({'mode_one_field': Field(String)})))
                },
            ),
            ModeDefinition(
                'mode_two',
                resources={
                    'value':
                    dummy_resource(Field(Dict({'mode_two_field': Field(Int)})))
                },
            ),
        ],
    )

    assert scaffold_pipeline_config(pipeline_def, mode='mode_one') == {
        'resources': {
            'value': {
                'config': {
                    'mode_one_field': ''
                }
            }
        }
    }

    assert scaffold_pipeline_config(pipeline_def,
                                    mode='mode_one',
                                    skip_optional=False) == {
                                        'loggers': {
                                            'console': {
                                                'config': {
                                                    'log_level': '',
                                                    'name': ''
                                                }
                                            }
                                        },
                                        'solids': {},
                                        'expectations': {
                                            'evaluate': True
                                        },
                                        'storage': {
                                            'in_memory': {},
                                            'filesystem': {
                                                'base_dir': ''
                                            },
                                            's3': {
                                                's3_bucket': ''
                                            }
                                        },
                                        'execution': {},
                                        'resources': {
                                            'value': {
                                                'config': {
                                                    'mode_one_field': ''
                                                }
                                            }
                                        },
                                    }

    assert scaffold_pipeline_config(pipeline_def, mode='mode_two') == {
        'resources': {
            'value': {
                'config': {
                    'mode_two_field': 0
                }
            }
        }
    }

    assert scaffold_pipeline_config(pipeline_def,
                                    mode='mode_two',
                                    skip_optional=False) == {
                                        'solids': {},
                                        'expectations': {
                                            'evaluate': True
                                        },
                                        'storage': {
                                            'in_memory': {},
                                            'filesystem': {
                                                'base_dir': ''
                                            },
                                            's3': {
                                                's3_bucket': ''
                                            }
                                        },
                                        'execution': {},
                                        'resources': {
                                            'value': {
                                                'config': {
                                                    'mode_two_field': 0
                                                }
                                            }
                                        },
                                        'loggers': {
                                            'console': {
                                                'config': {
                                                    'log_level': '',
                                                    'name': ''
                                                }
                                            }
                                        },
                                    }
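
`dummy_resource` and `scaffold_pipeline_config` are imported in the original test module; a plausible shape for the helper, as an assumption:

def dummy_resource(config_field=None):
    return ResourceDefinition(lambda: None, config_field=config_field)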
Example #4
# Truncated above: the enclosing solid and its `query_job_config` are defined
# earlier in the original file. A plausible opening (assumption):
@solid(required_resource_keys={'bigquery'})
def explore_visits_by_hour(context):
    sql = '''
   SELECT FORMAT_DATETIME("%F %H:00:00", DATETIME(TIMESTAMP_SECONDS(CAST(timestamp AS INT64)))) AS ts,
          COUNT(1) AS num_visits
     FROM events.events
    WHERE url = '/explore'
 GROUP BY ts
 ORDER BY ts ASC
'''
    context.resources.bigquery.query(sql, job_config=query_job_config)


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='default',
            resource_defs={
                'bigquery': bigquery_resource,
                'dataproc': dataproc_resource
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            'default',
            pkg_resource_defs=[
                ('dagster_examples.gcp_data_platform.environments',
                 'resources_pipeline.yaml'),
            ],
        )
    ],
)
def gcp_data_platform():
    ...  # body truncated in the source
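
A sketch of running this pipeline through its preset, for illustration (assumes the packaged YAML resolves at import time):

from dagster import execute_pipeline

result = execute_pipeline(gcp_data_platform, preset='default')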
Example #5
# Truncated import: these solid names come from the example's own module
# (in dagster's airline demo, `from .solids import (...)`).
from .solids import (
    load_data_to_database_from_spark,
    process_sfo_weather_data,
    q2_sfo_outbound_flights,
    s3_to_df,
    s3_to_dw_table,
    sfo_delays_by_destination,
    tickets_with_destination,
    westbound_delays,
)

test_mode = ModeDefinition(
    name='test',
    resource_defs={
        'spark': spark_session_local,
        'db_info': redshift_db_info_resource,
        'tempfile': tempfile_resource,
        's3': s3_resource,
        'file_cache': fs_file_cache,
    },
    system_storage_defs=s3_plus_default_storage_defs,
)

local_mode = ModeDefinition(
    name='local',
    resource_defs={
        'spark': spark_session_local,
        's3': s3_resource,
        'db_info': postgres_db_info_resource,
        'tempfile': tempfile_resource,
        'file_cache': fs_file_cache,
    },
)  # closing paren added; the original snippet is truncated here
Example #6
                "inputs": {"table_name": "weather"},
            },
            "upload_training_set_to_gcs": {
                "inputs": {"bucket_name": "dagster-scratch-ccdfe1e", "file_name": "training_data",}
            },
        },
    }


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='testing',
            resource_defs={
                'postgres_db': postgres_db_info_resource,
                'gcs_client': testing_client,
                'volume': mount,
            },
            description='Mode to be used during testing. Allows us to clean up test artifacts without interfering with local artifacts.',
        ),
    ],
)
def generate_test_training_set_pipeline():
    upload_training_set_to_gcs = upload_pickled_object_to_gcs_bucket.alias(
        'upload_training_set_to_gcs'
    )
    return upload_training_set_to_gcs(
        produce_training_set(
            transform_into_traffic_dataset(produce_trip_dataset()), produce_weather_dataset(),
        )
    )
Example #7

# Truncated above: `lots_of_resources` (the resource_defs dict used below) and
# the `all_resources` solid are defined earlier in the original file.
@solid(required_resource_keys={'R1'})
def one(context):
    return 1 + context.resources.R1


@solid(required_resource_keys={'R2'})
def two(_):
    return 1


@solid(required_resource_keys={'R1', 'R2', 'R3'})
def one_and_two_and_three(_):
    return 1


@pipeline(mode_defs=[ModeDefinition(resource_defs=lots_of_resources)])
def resource_pipeline():
    all_resources()
    one()
    two()
    one_and_two_and_three()


if __name__ == '__main__':
    result = execute_pipeline(
        ExecutionTargetHandle.for_pipeline_fn(resource_pipeline).build_pipeline_definition(),
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocessing': {}}},
    )
Example #8
import os

# Assumed import; `fs_intermediate_storage` and the helper solids
# (`take_string_1`, `create_string_1`, ...) are truncated out above.
@solid(
    input_defs=[
        InputDefinition("_string_input_1", String),
        InputDefinition("_string_input_2", String),
    ],
    version="take_string_two_inputs_version",
    config_schema={
        "input_str": Field(String),
        "base_dir": Field(String)
    },
)
def take_string_two_inputs(context, _string_input_1, _string_input_2):
    yield Output(
        context.solid_config["input_str"],
        address=os.path.join(
            context.solid_config["base_dir"],
            "intermediates/take_string_two_inputs.compute/result"),
    )


@pipeline(mode_defs=[
    ModeDefinition("only_mode",
                   intermediate_storage_defs=[fs_intermediate_storage])
])
def basic_pipeline():
    take_string_two_inputs(
        _string_input_1=take_string_1(create_string_1()),
        _string_input_2=take_string_2(create_string_2()),
    )
Example #9
def test_optional_and_required_context():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        solids=[],
        mode_definitions=[
            ModeDefinition(
                name='mixed',
                resources={
                    'optional_resource':
                    ResourceDefinition(
                        lambda: None,
                        config_field=Field(dagster_type=Dict(
                            fields={
                                'optional_field': Field(String,
                                                        is_optional=True)
                            })),
                    ),
                    'required_resource':
                    ResourceDefinition(
                        lambda: None,
                        config_field=Field(dagster_type=Dict(
                            fields={'required_field': Field(String)})),
                    ),
                },
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)
    assert env_type.fields['solids'].is_optional

    assert env_type.fields['execution'].is_optional
    assert env_type.fields['expectations'].is_optional

    assert nested_field(env_type, 'resources').is_required
    assert nested_field(env_type, 'resources', 'optional_resource').is_optional
    assert nested_field(env_type, 'resources', 'optional_resource',
                        'config').is_optional
    assert nested_field(env_type, 'resources', 'optional_resource', 'config',
                        'optional_field').is_optional

    assert nested_field(env_type, 'resources', 'required_resource').is_required
    assert nested_field(env_type, 'resources', 'required_resource',
                        'config').is_required
    assert nested_field(env_type, 'resources', 'required_resource', 'config',
                        'required_field').is_required

    env_obj = EnvironmentConfig.from_dict(
        throwing_evaluate_config_value(
            env_type, {
                'resources': {
                    'required_resource': {
                        'config': {
                            'required_field': 'foo'
                        }
                    }
                }
            }))

    assert env_obj.resources == {
        'optional_resource': {
            'config': {}
        },
        'required_resource': {
            'config': {
                'required_field': 'foo'
            }
        },
    }
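
The assertions lean on a `nested_field` helper from dagster's test utilities; a sketch of its likely shape, as an assumption:

def nested_field(config_type, *field_names):
    # Walk a config type's fields by name, returning the innermost Field.
    field = config_type.fields[field_names[0]]
    for name in field_names[1:]:
        field = field.config_type.fields[name]
    return field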
Example #10
def define_pipeline():
    @pipeline(mode_defs=[ModeDefinition(resource_defs={'a': resource_a})])
    def spew_pipeline():
        spew(spew(spawn()))

    return spew_pipeline
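
`resource_a`, `spew`, and `spawn` are defined elsewhere in the original module; a hypothetical `resource_a` for context:

from dagster import resource

@resource
def resource_a(_):
    return 'A'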
Example #11
import pytest

from dagster import ModeDefinition, execute_pipeline, pipeline, solid
# Assumed import path for the dagster-aws Cloudwatch integration:
from dagster_aws.cloudwatch.loggers import cloudwatch_logger

from .conftest import AWS_REGION, TEST_CLOUDWATCH_LOG_GROUP_NAME, TEST_CLOUDWATCH_LOG_STREAM_NAME

TEN_MINUTES_MS = 10 * 60 * 1000  # in milliseconds
NUM_POLL_ATTEMPTS = 5


@solid
def hello_cloudwatch(context):
    context.log.info('Hello, Cloudwatch!')
    context.log.error('This is an error')


@pipeline(mode_defs=[ModeDefinition(logger_defs={'cloudwatch': cloudwatch_logger})])
def hello_cloudwatch_pipeline():
    hello_cloudwatch()


def test_cloudwatch_logging_bad_log_group_name():
    with pytest.raises(
        Exception,
        match='Failed to initialize Cloudwatch logger: Could not find log group with name foo',
    ):
        execute_pipeline(
            hello_cloudwatch_pipeline,
            {
                'loggers': {
                    'cloudwatch': {
                        'config': {
                            # Truncated in the source; presumably at least the
                            # bad group name from the match string above:
                            'log_group_name': 'foo',
                        }
                    }
                }
            },
        )
Example #12
import tempfile
from unittest import mock

# Assumed imports (exact paths vary by dagster version): `ModeDefinition` and
# `ResourceDefinition` from dagster, `execute_solid` from dagster's test utils,
# and `fs_file_cache` / `cache_file_from_s3` from the file-cache integration.


def test_cache_file_from_s3_overwrite():
    with tempfile.TemporaryDirectory() as temp_dir:
        s3_session_one = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    "file_cache": fs_file_cache,
                    "s3": ResourceDefinition.hardcoded_resource(
                        s3_session_one),
                }),
            run_config={
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {
                            "s3_coordinate": {
                                "bucket": "some-bucket",
                                "key": "some-key"
                            }
                        }
                    }
                },
                "resources": {
                    "file_cache": {
                        "config": {
                            "target_folder": temp_dir,
                            "overwrite": True
                        }
                    }
                },
            },
        )

        # assert the download occurred
        assert s3_session_one.download_file.call_count == 1

        s3_session_two = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    "file_cache": fs_file_cache,
                    "s3": ResourceDefinition.hardcoded_resource(
                        s3_session_two),
                }),
            run_config={
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {
                            "s3_coordinate": {
                                "bucket": "some-bucket",
                                "key": "some-key"
                            }
                        }
                    }
                },
                "resources": {
                    "file_cache": {
                        "config": {
                            "target_folder": temp_dir,
                            "overwrite": True
                        }
                    }
                },
            },
        )

        # assert the download did not occur because file is already there
        assert s3_session_two.download_file.call_count == 0
Example #13
import os

import pytest

# Truncated import list: these names come from the top-level dagster package;
# `ModeDefinition` (used below) is presumably among the elided entries.
from dagster import (
    ModeDefinition,
    Output,
    OutputDefinition,
    PipelineExecutionResult,
    SolidExecutionResult,
    default_executors,
    execute_pipeline,
    lambda_solid,
    pipeline,
    seven,
    solid,
)
from dagster.core.instance import DagsterInstance
from dagster.core.test_utils import nesting_composite_pipeline

from dagster_celery import celery_executor

celery_mode_defs = [
    ModeDefinition(executor_defs=default_executors + [celery_executor])
]

BUILDKITE = os.getenv('BUILDKITE')
skip_ci = pytest.mark.skipif(
    bool(BUILDKITE),
    reason=
    'Tests hang forever on buildkite for reasons we don\'t currently understand',
)


@solid
def simple(_):
    return 1
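
A minimal pipeline wired to the celery mode defs above, in the pattern these tests use (sketch):

@pipeline(mode_defs=celery_mode_defs)
def simple_pipeline():
    simple()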

Example #14
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        handle=None,
        run_config=None,
        solid_handle=None,
    ):
        '''Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        '''
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
        check.inst_param(run_config, 'run_config', RunConfig)
        check.inst_param(handle, 'handle', ExecutionTargetHandle)
        check.inst_param(solid_handle, 'solid_handle', SolidHandle)

        pipeline_def = check.inst_param(
            handle.build_pipeline_definition(),
            'pipeline_def (from handle {handle_dict})'.format(handle_dict=handle.data._asdict()),
            PipelineDefinition,
        )

        solid_def = pipeline_def.get_solid(solid_handle)

        mode_def = pipeline_def.get_mode_definition(run_config.mode)
        shim_mode_def = ModeDefinition(
            name=mode_def.name,
            logger_defs=dict(
                mode_def.loggers, dagstermill=construct_sqlite_logger(output_log_path)
            ),
            resource_defs=mode_def.resource_defs,
        )

        pipeline_def = PipelineDefinition(
            pipeline_def.solid_defs,
            name=pipeline_def.name,
            description=pipeline_def.description,
            dependencies=pipeline_def.dependencies,
            mode_defs=[shim_mode_def],
            preset_defs=pipeline_def.preset_defs,
        )

        if 'loggers' not in environment_dict:
            environment_dict['loggers'] = {'dagstermill': {}}

        if 'dagstermill' not in environment_dict['loggers']:
            environment_dict['loggers']['dagstermill'] = {}

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        with scoped_pipeline_context(
            self.pipeline_def,
            environment_dict,
            run_config,
            scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:
            self.context = DagstermillExecutionContext(pipeline_context)

        return self.context
Example #15
    def get_context(self, solid_config=None, mode_def=None, run_config=None):
        """Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            run_config(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        """
        check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
        run_config = check.opt_dict_param(run_config,
                                          "run_config",
                                          key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
                self.context, DagstermillRuntimeExecutionContext):
            return self.context

        if not mode_def:
            mode_def = ModeDefinition(
                logger_defs={"dagstermill": colored_console_logger})
            run_config["loggers"] = {"dagstermill": {}}

        solid_def = SolidDefinition(
            name="this_solid",
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description=
            "Ephemeral solid constructed by dagstermill.get_context()",
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def],
            mode_defs=[mode_def],
            name="ephemeral_dagstermill_pipeline")

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=mode_def.name,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline = pipeline_def

        execution_plan = create_execution_plan(self.pipeline,
                                               run_config,
                                               mode=mode_def.name)
        with scoped_pipeline_context(
                execution_plan,
                run_config,
                pipeline_run,
                DagsterInstance.ephemeral(),
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context
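
Per the docstring, `get_context` belongs in a notebook's ``parameters`` cell; a hypothetical interactive usage:

import dagstermill

context = dagstermill.get_context(solid_config={'date': '2019-01-01'})
context.log.info('solid_config: {}'.format(context.solid_config))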
Example #16
    # Truncated above: the enclosing solid unpacks a gzipped events file;
    # `safe_isfile`, `mkdir_p`, `outfile`, and `gzip_file` come from that scope.
    if not safe_isfile(outfile):
        mkdir_p(output_folder)

        with gzip.open(gzip_file, "rb") as f_in, open(outfile, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    return [path_prefix]


@pipeline(
    mode_defs=[
        ModeDefinition(
            name="default",
            resource_defs={
                "s3": s3_resource,
                "snowflake": snowflake_resource,
                "spark": spark_resource,
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            "default",
            pkg_resource_defs=[
                ("dagster_examples.event_pipeline_demo.environments",
                 "default.yaml"),
            ],
        )
    ],
)
def event_ingest_pipeline():
    ...  # body truncated in the source