Example #1
from dagster.core.execution.context_creation_pipeline import pipeline_initialization_manager
from dagster.core.execution.plan.execute_step import core_dagster_event_sequence_for_step
from dagster.core.execution.retries import Retries
from dagster.core.instance import DagsterInstance
from dagster.core.storage.file_manager import LocalFileHandle

PICKLED_EVENTS_FILE_NAME = 'events.pkl'
PICKLED_STEP_RUN_REF_FILE_NAME = 'step_run_ref.pkl'


@resource(
    config_schema={
        'scratch_dir': Field(
            StringSource,
            description='Directory used to pass files between the plan process and step process.',
        ),
    },
)
def local_external_step_launcher(context):
    return LocalExternalStepLauncher(**context.resource_config)


class LocalExternalStepLauncher(StepLauncher):
    '''Launches each step in its own local process, outside the plan process.'''
    def __init__(self, scratch_dir):
        self.scratch_dir = check.str_param(scratch_dir, 'scratch_dir')

    def launch_step(self, step_context, prior_attempts_count):
        step_run_ref = step_context_to_step_run_ref(step_context,
                                                    prior_attempts_count)
Example #2
def _multiple_required_fields_config_permissive_dict():
    return Field(Permissive({"field_one": Field(String), "field_two": Field(String)}))
Example #3
def test_noop_config():
    assert Field(Any)
Example #4
def test_compute_fields_hash():
    assert isinstance(_hash({"some_int": Field(int)}), str)
Example #5
def _single_optional_string_config_dict():
    return convert_potential_field({"optional_field": Field(String, is_required=False)})
Example #6
from .decorators import pyspark_solid
from .resources import (
    PySparkResourceDefinition,
    pyspark_resource,
    spark_session_from_config,
    spark_session_resource,
)


@input_selector_schema(
    Selector({
        'csv':
        Field(
            Dict({
                'path': Field(Path),
                'sep': Field(String, is_optional=True),
                'header': Field(Bool, is_optional=True),
            }))
    }))
def load_rdd(context, file_type, file_options):
    if file_type == 'csv':
        return context.resources.spark.read.csv(
            file_options['path'], sep=file_options.get('sep')).rdd
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


@output_selector_schema(
    Selector({
        'csv':
        Field(
Example #7
def test_construct_different_selectors():
    int_selector = Selector(fields={"an_int": Field(int)})
    string_selector = Selector(fields={"a_string": Field(str)})

    assert int_selector is not string_selector
    assert int_selector.key != string_selector.key
Example #8
    InputDefinition,
    Int,
    ModeDefinition,
    RepositoryDefinition,
    String,
    lambda_solid,
    pipeline,
    solid,
)

from dagster_aws.s3.resources import s3_resource
from dagster_aws.s3.system_storage import s3_plus_default_storage_defs


@solid(input_defs=[InputDefinition('word', String)],
       config={'factor': Field(Int)})
def multiply_the_word(context, word):
    return word * context.solid_config['factor']


@lambda_solid(input_defs=[InputDefinition('word')])
def count_letters(word):
    counts = defaultdict(int)
    for letter in word:
        counts[letter] += 1
    return dict(counts)


@lambda_solid()
def error_solid():
    raise Exception('Unusual error')
Example #9
        logger = S3Logger(context.log.debug, bucket, key, target_file,
                          int(headers['ContentLength']))
        session.download_file(Bucket=bucket,
                              Key=key,
                              Filename=target_file,
                              Callback=logger)
    return target_file


# This should be ported to use FileHandle-based solids.
# See https://github.com/dagster-io/dagster/issues/1476
@solid(
    name='download_from_s3_to_file',
    config={
        'bucket':
        Field(String, description='S3 bucket name'),
        'key':
        Field(String, description='S3 key name'),
        'target_folder':
        Field(Path,
              description=(
                  'Specifies the path at which to download the object.')),
        'skip_if_present':
        Field(Bool, is_required=False, default_value=False),
    },
    description='Downloads an object from S3 to a file.',
    output_defs=[
        OutputDefinition(FileExistsAtPath,
                         description='The path to the downloaded object.')
    ],
    required_resource_keys={'s3'},
Example #10
def test_config_with_and_without_config():
    @solid(config_schema={
        "prefix": Field(str, is_required=False, default_value="_")
    })
    def prefix_value(context, v):
        return "{prefix}{v}".format(prefix=context.solid_config["prefix"], v=v)

    @composite_solid(
        config_fn=lambda cfg:
        {"prefix_value": {
            "config": {
                "prefix": cfg["prefix"]
            }
        }},
        config_schema={
            "prefix": Field(str, is_required=False, default_value="_id_")
        },
    )
    def prefix_id(val):
        return prefix_value(val)

    @solid
    def print_value(_, v):
        return str(v)

    @pipeline
    def config_issue_pipeline():
        v = prefix_id()
        print_value(v)

    result = execute_pipeline(
        config_issue_pipeline,
        {
            "solids": {
                "prefix_id": {
                    "config": {
                        "prefix": "_customprefix_"
                    },
                    "inputs": {
                        "val": {
                            "value": "12345"
                        }
                    },
                }
            }
        },
    )

    assert result.success
    assert result.result_for_solid(
        "print_value").output_value() == "_customprefix_12345"

    result_using_default = execute_pipeline(
        config_issue_pipeline,
        {
            "solids": {
                "prefix_id": {
                    "config": {},
                    "inputs": {
                        "val": {
                            "value": "12345"
                        }
                    }
                }
            }
        },
    )

    assert result_using_default.success
    assert result_using_default.result_for_solid(
        "print_value").output_value() == "_id_12345"
Example #11
# pylint: disable=no-value-for-parameter

import collections

from dagster import Field, Int, lambda_solid, solid, pipeline, as_dagster_type

Counter = as_dagster_type(collections.Counter)


@solid(config={'factor': Field(Int)})
def multiply_the_word(context, word: str) -> str:
    return word * context.solid_config['factor']


@lambda_solid
def count_letters(word: str) -> Counter:
    return collections.Counter(word)


@pipeline
def configuration_schema_pipeline():
    return count_letters(multiply_the_word())
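
The Field(Int) config declared on multiply_the_word is supplied under that solid's 'config' key at execution time, alongside a value for its 'word' input. A minimal invocation sketch, following the positional run-config style of Example #10 (the factor and word values here are illustrative):

from dagster import execute_pipeline

result = execute_pipeline(
    configuration_schema_pipeline,
    {
        'solids': {
            'multiply_the_word': {
                # Hypothetical values: repeat the word twice.
                'config': {'factor': 2},
                'inputs': {'word': {'value': 'hello'}},
            }
        }
    },
)
assert result.success  # count_letters receives 'hellohello'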
Example #12
def test_map_shape_complex():
    # Long form
    assert _validate(
        Field(Map(str, Shape({
            "name": Field(str),
            "number": Field(int)
        }))),
        {
            "foo": {
                "name": "test_name",
                "number": 5,
            },
            "bar": {
                "name": "other_name",
                "number": 10,
            },
        },
    ) == {
        "foo": {
            "name": "test_name",
            "number": 5,
        },
        "bar": {
            "name": "other_name",
            "number": 10,
        },
    }

    # Short form
    assert _validate(
        Field({
            str: {
                "name": Field(str),
                "number": Field(int),
            },
        }),
        {
            "foo": {
                "name": "test_name",
                "number": 5,
            },
            "bar": {
                "name": "other_name",
                "number": 10,
            },
        },
    ) == {
        "foo": {
            "name": "test_name",
            "number": 5,
        },
        "bar": {
            "name": "other_name",
            "number": 10,
        },
    }

    with pytest.raises(AssertionError):
        _validate(
            Field(Map(str, Shape({
                "name": Field(str),
                "number": Field(int)
            }))),
            {
                "foo": {
                    "name": "test_name",
                    "number": "not_a_number",
                },
                "bar": {
                    "name": "other_name",
                    "number": 10,
                },
            },
        )

    with pytest.raises(AssertionError):
        _validate(
            Field(Map(str, Shape({
                "name": Field(str),
                "number": Field(int)
            }))),
            {
                "foo": {
                    "name": "test_name",
                    "number": 15,
                },
                "baz": "not_a_shape",
            },
        )
Example #13
def test_permissive_defaults():
    @solid(config_schema=Permissive({"four": Field(int, default_value=4)}))
    def perm_with_defaults(context):
        assert context.solid_config["four"] == 4

    assert execute_solid(perm_with_defaults).success
Example #14
from dagster.utils import safe_tempfile_path

try:
    import _thread as thread
except ImportError:
    import thread


def _send_kbd_int(temp_files):
    while not all([os.path.exists(temp_file) for temp_file in temp_files]):
        time.sleep(0.1)

    thread.interrupt_main()


@solid(config={'tempfile': Field(String)})
def write_a_file(context):
    with open(context.solid_config['tempfile'], 'w') as ff:
        ff.write('yup')

    while True:
        time.sleep(0.1)


@pipeline
def write_files_pipeline():
    write_a_file.alias('write_1')()
    write_a_file.alias('write_2')()
    write_a_file.alias('write_3')()
    write_a_file.alias('write_4')()
Example #15
def resource_init(init_context):
    if init_context.resource_config['throw_on_resource_init']:
        raise Exception('throwing from in resource_fn')
    return ErrorableResource()


def define_errorable_resource():
    return ResourceDefinition(resource_fn=resource_init,
                              config_field=Field(
                                  Dict({'throw_on_resource_init':
                                        Field(Bool)})))


solid_throw_config = Field(
    Dict(fields={
        'throw_in_solid': Field(Bool),
        'return_wrong_type': Field(Bool)
    }))


@solid(name='emit_num',
       output_defs=[OutputDefinition(Int)],
       config_field=solid_throw_config)
def emit_num(context):
    if context.solid_config['throw_in_solid']:
        raise Exception('throwing from in the solid')

    if context.solid_config['return_wrong_type']:
        return 'wow'

    return 13
Example #16
from dagster.config.field_utils import Selector
from dagster.core.types.config_schema import input_selector_schema, output_selector_schema

CONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}


def dict_without_keys(ddict, *keys):
    return {key: value for key, value in ddict.items() if key not in set(keys)}


@output_selector_schema(
    Selector(
        {
            'csv': {
                'path': StringSource,
                'sep': Field(
                    StringSource, is_required=False, default_value=','),
            },
            'parquet': {
                'path': StringSource
            },
            'table': {
                'path': StringSource
            },
        }, ))
def dataframe_output_schema(_context, file_type, file_options, pandas_df):
    check.str_param(file_type, 'file_type')
    check.dict_param(file_options, 'file_options')
    check.inst_param(pandas_df, 'pandas_df', pd.DataFrame)

    if file_type == 'csv':
        path = file_options['path']
Example #17
def define_errorable_resource():
    return ResourceDefinition(resource_fn=resource_init,
                              config_field=Field(
                                  Dict({'throw_on_resource_init':
                                        Field(Bool)})))
Example #18
    if not isinstance(value, list):
        return False

    fields = [field for field in value[0].keys()]

    for i in range(len(value)):
        row = value[i]
        if not isinstance(row, dict):
            return False
        row_fields = [field for field in row.keys()]
        if fields != row_fields:
            return False
    return True


@input_hydration_config(Selector({'csv': Field(String)}))
def less_simple_data_frame_input_hydration_config(context, selector):
    with open(selector['csv'], 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return lines


if typing.TYPE_CHECKING:
    LessSimpleDataFrame = list
else:
    LessSimpleDataFrame = DagsterType(
        name='LessSimpleDataFrame',
        description=
        'A more sophisticated data frame that type checks its structure.',
Example #19
def test_hash_diff():

    assert _hash({"some_int": Field(int)}) != _hash(
        {"another_int": Field(int)})

    assert _hash({"same_name": Field(int)}) != _hash({"same_name": Field(str)})

    assert _hash({"same_name": Field(int)}) != _hash(
        {"same_name": Field(int, is_required=False)})

    assert _hash({"same_name": Field(int)}) != _hash(
        {"same_name": Field(int, is_required=False, default_value=2)})

    assert _hash({"same_name": Field(int, is_required=False)}) != _hash(
        {"same_name": Field(int, is_required=False, default_value=2)})

    assert _hash({"same_name": Field(int)}) != _hash(
        {"same_name": Field(int, description="desc")})
Example #20
    EventMetadataEntry,
    Field,
    Materialization,
    Selector,
    String,
    execute_pipeline,
    input_hydration_config,
    output_materialization_config,
    pipeline,
    seven,
    solid,
    usable_as_dagster_type,
)


@input_hydration_config(Selector({'csv': Field(String)}))
def less_simple_data_frame_input_hydration_config(context, selector):
    with open(selector['csv'], 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return LessSimpleDataFrame(lines)


@output_materialization_config({
    'csv':
    Field(
        {
            'path': String,
            'sep': Field(String, is_required=False, default_value=','),
        },
Example #21
def test_kitchen_sink():
    big_dict_1 = Shape({
        "field_one":
        Field(int, default_value=2, is_required=False),
        "field_two":
        Field(
            Shape({
                "nested_field_one":
                Field(bool),
                "nested_selector":
                Field(
                    Selector({
                        "int_field_in_selector":
                        Field(int),
                        "permissive_dict_in_selector":
                        Field(Permissive()),
                        "permissive_dict_with_fields_in_selector":
                        Field(Permissive({"string_field": Field(str)})),
                    })),
            })),
    })

    big_dict_2 = Shape({
        "field_one":
        Field(int, default_value=2, is_required=False),
        "field_two":
        Field(
            Shape(
                fields={
                    "nested_field_one":
                    Field(bool),
                    "nested_selector":
                    Field(
                        Selector(
                            fields={
                                "permissive_dict_in_selector":
                                Field(Permissive()),
                                "int_field_in_selector":
                                Field(int),
                                "permissive_dict_with_fields_in_selector":
                                Field(
                                    Permissive(
                                        fields={"string_field": Field(str)})),
                            })),
                })),
    })

    assert big_dict_1 is big_dict_2
    assert big_dict_1.key == big_dict_2.key

    # differs way down in tree
    big_dict_3 = Shape({
        "field_one":
        Field(int, default_value=2, is_required=False),
        "field_two":
        Field(
            Shape(
                fields={
                    "nested_field_one":
                    Field(bool),
                    "nested_selector":
                    Field(
                        Selector(
                            fields={
                                "permissive_dict_in_selector":
                                Field(Permissive()),
                                "int_field_in_selector":
                                Field(int),
                                "permissive_dict_with_fields_in_selector":
                                Field(
                                    Permissive(
                                        fields={"int_field": Field(int)})),
                            })),
                })),
    })

    assert big_dict_1 is not big_dict_3
    assert big_dict_1.key != big_dict_3.key
Example #22
def bash_command_solid(bash_command, name=None, output_encoding=None):
    '''Execute a Bash command.
    '''
    check.str_param(bash_command, 'bash_command')
    name = check.opt_str_param(name, 'name', default='bash_solid')
    output_encoding = check.opt_str_param(output_encoding, 'output_encoding', default='utf-8')

    @solid(
        name=name,
        config={
            'output_logging': Field(
                Enum(
                    'OutputType',
                    [
                        EnumValue('STREAM', description='Stream script stdout/stderr.'),
                        EnumValue(
                            'BUFFER',
                            description='Buffer bash script stdout/stderr, then log upon completion.',
                        ),
                        EnumValue('NONE', description='No logging'),
                    ],
                ),
                is_required=False,
                default_value='STREAM',
            ),
            'env': Field(
                Permissive(),
                description='Environment variables to pass to the child process; if not provided, '
                'the current process environment will be passed.',
                is_required=False,
                default_value=None,
            ),
        },
    )
    def _bash_solid(context):
        '''This logic is ported from the Airflow BashOperator implementation.

        https://github.com/apache/airflow/blob/master/airflow/operators/bash_operator.py
        '''

        def log_info_msg(log_str):
            context.log.info('[bash][{name}] '.format(name=name) + log_str)

        tmp_path = seven.get_system_temp_directory()
        log_info_msg('using temporary directory: %s' % tmp_path)

        env = (
            context.solid_config['env']
            if context.solid_config['env'] is not None
            else os.environ.copy()
        )

        with NamedTemporaryFile(dir=tmp_path, prefix=name) as tmp_file:
            tmp_file.write(bytes(bash_command.encode('utf-8')))
            tmp_file.flush()
            script_location = os.path.abspath(tmp_file.name)
            log_info_msg('Temporary script location: {location}'.format(location=script_location))

            def pre_exec():
                # Restore default signal disposition and invoke setsid
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            log_info_msg('Running command: {command}'.format(command=bash_command))

            # pylint: disable=subprocess-popen-preexec-fn
            sub_process = Popen(
                ['bash', tmp_file.name],
                stdout=PIPE,
                stderr=STDOUT,
                cwd=tmp_path,
                env=env,
                preexec_fn=pre_exec,
            )

            # Stream back logs as they are emitted
            if context.solid_config['output_logging'] == 'STREAM':
                line = ''
                for raw_line in iter(sub_process.stdout.readline, b''):
                    line = raw_line.decode(output_encoding).rstrip()
                    log_info_msg(line)

            sub_process.wait()

            # Collect and buffer all logs, then emit
            if context.solid_config['output_logging'] == 'BUFFER':
                line = ''
                for raw_line in iter(sub_process.stdout.readline, b''):
                    line += raw_line.decode(output_encoding)
                log_info_msg(line)

            # no logging in this case
            elif context.solid_config['output_logging'] == 'NONE':
                pass

            log_info_msg(
                'Command exited with return code {retcode}'.format(retcode=sub_process.returncode)
            )

            if sub_process.returncode:
                raise Failure(description='[bash][{name}] Bash command failed'.format(name=name))

        return line

    return _bash_solid
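
bash_command_solid is a solid factory, so a typical pattern is to build the solid at definition time and wire it into a pipeline. A minimal sketch, assuming the @pipeline and execute_pipeline APIs used in the other examples (the command string and names are illustrative):

from dagster import execute_pipeline, pipeline

echo_solid = bash_command_solid('echo "hello from bash"', name='echo_solid')


@pipeline
def hypothetical_bash_pipeline():
    echo_solid()


# With the default config (output_logging='STREAM', env=None), stdout is
# streamed to the logs and the last line of output is returned by the solid.
result = execute_pipeline(hypothetical_bash_pipeline)
assert result.success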
Example #23
def test_construct_different_dicts():
    int_dict = Shape(fields={"an_int": Field(int)})
    string_dict = Shape(fields={"a_string": Field(str)})

    assert int_dict is not string_dict
    assert int_dict.key != string_dict.key
Example #24
from slack import WebClient

from dagster import Field, StringSource, resource


@resource(
    {
        "token":
        Field(
            StringSource,
            description=
            """To configure access to the Slack API, you'll need an access
                    token provisioned with access to your Slack workspace.

                    Tokens are typically either user tokens or bot tokens. For programmatic posting
                    to Slack from this resource, you probably want to provision and use a bot token.

                    More in the Slack API documentation here: https://api.slack.com/docs/token-types
                    """,
        )
    },
    description="This resource is for connecting to Slack",
)
def slack_resource(context):
    """This resource is for connecting to Slack.

    By configuring this Slack resource, you can post messages to Slack from any Dagster solid:

    Examples:

    .. code-block:: python
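
A minimal sketch of wiring slack_resource into a pipeline and posting a message from a solid, assuming the mode-based resource API used elsewhere in these examples (the channel, message text, and environment variable name are illustrative):

from dagster import ModeDefinition, execute_pipeline, pipeline, solid


@solid(required_resource_keys={"slack"})
def post_hello_message(context):
    # slack_resource returns a slack.WebClient; chat_postMessage is its
    # standard method for posting a message to a channel.
    context.resources.slack.chat_postMessage(channel="#noise", text="Hello, Slack!")


@pipeline(mode_defs=[ModeDefinition(resource_defs={"slack": slack_resource})])
def hypothetical_slack_pipeline():
    post_hello_message()


# StringSource allows the token to be read from an environment variable.
execute_pipeline(
    hypothetical_slack_pipeline,
    {"resources": {"slack": {"config": {"token": {"env": "SLACK_TOKEN"}}}}},
)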
Example #25
def _single_optional_string_field_config_dict_with_default():
    optional_field_def = Field(String, is_required=False, default_value="some_default")
    return convert_potential_field({"optional_field": optional_field_def})
Example #26
        if self._has_object(key):
            logging.warning(
                "Removing existing ADLS2 key: {key}".format(key=key))
            self._rm_object(key)

        pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL)

        file = self.file_system_client.create_file(key)
        with file.acquire_lease(self.lease_duration) as lease:
            file.upload_data(pickled_obj, lease=lease, overwrite=True)


@object_manager(
    config_schema={
        "adls2_file_system":
        Field(StringSource, description="ADLS Gen2 file system name"),
        "adls2_prefix":
        Field(StringSource, is_required=False, default_value="dagster"),
    },
    required_resource_keys={"adls2"},
)
def adls2_object_manager(init_context):
    """Persistent object manager using Azure Data Lake Storage Gen2 for storage.

    Suitable for object storage for distributed executors, so long as
    each execution node has network connectivity and credentials for ADLS and
    the backing container.

    Attach this resource definition to a :py:class:`~dagster.ModeDefinition`
    in order to make it available to your pipeline:
Example #27
def _nested_optional_config_with_no_default():
    return convert_potential_field({"nested": {"int_field": Field(Int, is_required=False)}})
Example #28
class SlackToFile:
    def __init__(self, output_path):
        self.chat = ChatToFile(output_path)


class ChatToFile:
    def __init__(self, output_path):
        self.output_path = output_path

    def post_message(self, channel, text):
        with open(self.output_path, 'a') as f:
            f.write('%s -- %s\n' % (channel, text))


@resource(Field(Dict({'output_path': Field(String)})))
def slack_to_file_resource(context):
    return SlackToFile(context.resource_config['output_path'])


@pipeline(mode_definitions=[
    ModeDefinition(name='production', resources={'slack': slack_resource}),
    ModeDefinition(name='local', resources={'slack': slack_to_file_resource}),
])
def resources_pipeline():
    post_hello_message()


if __name__ == '__main__':
    execute_pipeline(
        resources_pipeline,
Example #29
def test_default_arg():
    config_field = convert_potential_field(
        {"int_field": Field(Int, default_value=2, is_required=False)}
    )

    assert_config_value_success(config_field.config_type, {}, {"int_field": 2})
Example #30
from dagster.core.events import DagsterEvent
from dagster.core.execution.api import create_execution_plan
from dagster.core.execution.context.system import SystemStepExecutionContext
from dagster.core.execution.context_creation_pipeline import PlanExecutionContextManager
from dagster.core.execution.plan.execute_step import core_dagster_event_sequence_for_step
from dagster.core.instance import DagsterInstance
from dagster.core.storage.file_manager import LocalFileHandle, LocalFileManager

PICKLED_EVENTS_FILE_NAME = "events.pkl"
PICKLED_STEP_RUN_REF_FILE_NAME = "step_run_ref.pkl"


@resource(
    config_schema={
        "scratch_dir": Field(
            StringSource,
            description="Directory used to pass files between the plan process and step process.",
        ),
    },
)
def local_external_step_launcher(context):
    return LocalExternalStepLauncher(**context.resource_config)


class LocalExternalStepLauncher(StepLauncher):
    """Launches each step in its own local process, outside the plan process."""

    def __init__(self, scratch_dir: str):
        self.scratch_dir = check.str_param(scratch_dir, "scratch_dir")

    def launch_step(
        self, step_context: SystemStepExecutionContext, prior_attempts_count: int