Example #1
            {'pypi': {'package': 'databricks-api'}},
            {'pypi': {'package': 'pytest'}},
        ],
    },
    'storage': {
        's3': {
            'secret_scope': 'dagster-databricks-tests',
            'access_key_key': 'aws-access-key',
            'secret_key_key': 'aws-secret-key',
        }
    },
}


@solid(
    output_defs=[OutputDefinition(DataFrame)],
    required_resource_keys={'pyspark_step_launcher', 'pyspark'},
)
def make_df_solid(context):
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='John', age=19), Row(name='Jennifer', age=29), Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema)


@solid(
    name='blah',
    description='this is a test',
    config_schema={'foo': str, 'bar': int},
    input_defs=[InputDefinition('people', DataFrame)],
    output_defs=[OutputDefinition(DataFrame)],
    required_resource_keys={'pyspark_step_launcher'},
Example #2
def clean_data_solid():
    return dm.define_dagstermill_solid('clean_data',
                                       nb_test_path('clean_data'),
                                       outputs=[OutputDefinition(DataFrame)])
Example #3
def define_hello_world_with_output():
    return dm.define_dagstermill_solid('hello_world_output',
                                       nb_test_path('hello_world_output'), [],
                                       [OutputDefinition()])
Example #4
def define_errorable_resource():
    return ResourceDefinition(resource_fn=resource_init,
                              config_field=Field(
                                  Dict({'throw_on_resource_init':
                                        Field(Bool)})))


solid_throw_config = Field(
    Dict(fields={
        'throw_in_solid': Field(Bool),
        'return_wrong_type': Field(Bool)
    }))


@solid(name='emit_num',
       output_defs=[OutputDefinition(Int)],
       config_field=solid_throw_config)
def emit_num(context):
    if context.solid_config['throw_in_solid']:
        raise Exception('throwing from in the solid')

    if context.solid_config['return_wrong_type']:
        return 'wow'

    return 13


@solid(
    name='num_to_str',
    input_defs=[InputDefinition('num', Int)],
    output_defs=[OutputDefinition(String)],
Example #5
from dagster import (
    DependencyDefinition,
    InputDefinition,
    OutputDefinition,
    PipelineDefinition,
    SolidInstance,
    execute_solid,
    execute_solids,
    lambda_solid,
    Int,
)


@lambda_solid(
    inputs=[InputDefinition('num1', Int),
            InputDefinition('num2', Int)],
    output=OutputDefinition(Int),
)
def adder(num1, num2):
    return num1 + num2


@lambda_solid(
    inputs=[InputDefinition('num1', Int),
            InputDefinition('num2', Int)],
    output=OutputDefinition(Int),
)
def multer(num1, num2):
    return num1 * num2


def define_part_fourteen_step_one_pipeline():
Example #6
        instance=DagsterInstance.ephemeral(),
        locations=[InProcessRepositoryLocation(create_main_recon_repo())],
    )


def main_repo_location_name():
    return '<<in_process>>'


def main_repo_name():
    return 'test_repo'


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x['sum'] = int(x['num1']) + int(x['num2'])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition('sum_df', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)
    for x in sum_sq_df:
Example #7
def add_one(_, num):
    return num + 1


@pipeline(mode_defs=celery_mode_defs)
def test_pipeline():
    return simple()


@pipeline(mode_defs=celery_mode_defs)
def test_serial_pipeline():
    return add_one(simple())


@solid(output_defs=[
    OutputDefinition(name="value_one"),
    OutputDefinition(name="value_two")
])
def emit_values(_context):
    yield Output(1, "value_one")
    yield Output(2, "value_two")


@lambda_solid(
    input_defs=[InputDefinition("num_one"),
                InputDefinition("num_two")])
def subtract(num_one, num_two):
    return num_one - num_two


@pipeline(mode_defs=celery_mode_defs)
Example #8
def sql_solid(name,
              select_statement,
              materialization_strategy,
              table_name=None,
              inputs=None):
    '''Return a new solid that executes and materializes a SQL select statement.

    Args:
        name (str): The name of the new solid.
        select_statement (str): The select statement to execute.
        materialization_strategy (str): Must be 'table', the only currently supported
            materialization strategy. If 'table', the kwarg `table_name` must also be passed.
    Kwargs:
        table_name (str): The name of the new table to create, if the materialization strategy
            is 'table'. Default: None.
        inputs (list[InputDefinition]): Inputs, if any, for the new solid. Default: None.

    Returns:
        SolidDefinition:
            The new SQL solid.
    '''
    inputs = check.opt_list_param(inputs, 'inputs', InputDefinition)

    materialization_strategy_output_types = {  # pylint:disable=C0103
        'table': SqlTableName,
        # 'view': String,
        # 'query': SqlAlchemyQueryType,
        # 'subquery': SqlAlchemySubqueryType,
        # 'result_proxy': SqlAlchemyResultProxyType,
        # could also materialize as a Pandas table, as a Spark table, as an intermediate file, etc.
    }

    if materialization_strategy not in materialization_strategy_output_types:
        raise Exception(
            'Invalid materialization strategy {materialization_strategy}, must '
            'be one of {materialization_strategies}'.format(
                materialization_strategy=materialization_strategy,
                materialization_strategies=str(
                    list(materialization_strategy_output_types.keys())),
            ))

    if materialization_strategy == 'table':
        if table_name is None:
            raise Exception(
                'Missing table_name: required for materialization strategy \'table\''
            )

    output_description = (
        'The string name of the new table created by the solid'
        if materialization_strategy == 'table' else
        'The materialized SQL statement. If the materialization_strategy is '
        '\'table\', this is the string name of the new table created by the solid.'
    )

    description = '''This solid executes the following SQL statement:
    {select_statement}'''.format(select_statement=select_statement)

    # n.b., we will eventually want to make this resources key configurable
    sql_statement = (
        'drop table if exists {table_name};\n'
        'create table {table_name} as {select_statement};').format(
            table_name=table_name, select_statement=select_statement)

    def transform_fn(info, _inputs):
        '''Inner function defining the new solid.

        Args:
            info (ExpectationExecutionInfo): Must expose a `db_info` resource whose `engine`
                (e.g., a SQLAlchemy engine) can execute raw SQL against a database.

        Returns:
            str:
                The table name of the newly materialized SQL select statement.
        '''

        info.context.info('Executing sql statement:\n{sql_statement}'.format(
            sql_statement=sql_statement))
        info.context.resources.db_info.engine.execute(text(sql_statement))
        yield Result(value=table_name, output_name='result')

    return SolidDefinition(
        name=name,
        inputs=inputs,
        outputs=[
            OutputDefinition(
                materialization_strategy_output_types[
                    materialization_strategy],
                description=output_description,
            )
        ],
        transform_fn=transform_fn,
        description=description,
        metadata={
            'kind': 'sql',
            'sql': sql_statement
        },
    )
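

# A hedged usage sketch: the solid name, select statement, and table name
# below are illustrative only, showing how the `sql_solid` factory above
# would typically be invoked.
sum_table_solid = sql_solid(
    'sum_table',
    'select num1, num2, num1 + num2 as sum from raw_table',
    'table',
    table_name='sum_table',
)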
Example #9
# pylint: disable=unused-argument

from dagster import Failure, InputDefinition, Output, OutputDefinition, pipeline, solid

conditional = True


@solid(output_defs=[OutputDefinition(int, "a", is_required=False)])
def my_solid(context):
    if conditional:
        yield Output(1, "a")


@solid(output_defs=[
    OutputDefinition(int, "a", is_required=False),
    OutputDefinition(int, "b", is_required=False),
])
def branching_solid(context):
    if conditional:
        yield Output(1, "a")
    else:
        yield Output(2, "b")


@solid(input_defs=[InputDefinition("inp", int)])
def path_1(context, inp):
    pass


@solid(input_defs=[InputDefinition("inp", int)])
def path_2(context, inp):
    pass
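

# A hedged wiring sketch (not part of the snippet): each optional output of
# `branching_solid` feeds its own path, so only one branch executes per run.
@pipeline
def branching_pipeline():
    a, b = branching_solid()
    path_1(a)
    path_2(b)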
Example #10
@solid
def add_one(_, num):
    return num + 1


@pipeline(mode_defs=celery_mode_defs)
def test_pipeline():
    return simple()


@pipeline(mode_defs=celery_mode_defs)
def test_serial_pipeline():
    return add_one(simple())


@solid(output_defs=[OutputDefinition(name="value_one"), OutputDefinition(name="value_two")])
def emit_values(_context):
    yield Output(1, "value_one")
    yield Output(2, "value_two")


@lambda_solid(input_defs=[InputDefinition("num_one"), InputDefinition("num_two")])
def subtract(num_one, num_two):
    return num_one - num_two


@pipeline(mode_defs=celery_mode_defs)
def test_diamond_pipeline():
    value_one, value_two = emit_values()
    return subtract(num_one=add_one(num=value_one), num_two=add_one.alias("renamed")(num=value_two))
Example #11
        ],
        transform_fn=transform_fn,
        description=description,
        metadata={
            'kind': 'sql',
            'sql': sql_statement
        },
    )


@solid(
    name='thunk',
    config_field=Field(String, description='The string value to output.'),
    description=
    'No-op solid that simply outputs its single string config value.',
    outputs=[OutputDefinition(String, description='The string passed in as config.')],
)
def thunk(info):
    '''Output the config value.

    Especially useful when constructing DAGs with root nodes that take inputs which might in
    other dags come from upstream solids.

    Args:
        info (ExpectationExecutionInfo)

    Returns:
        str:
            The config value passed to the solid.
    '''
    return info.config
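

# A hedged config sketch: 'some_value' is a placeholder showing how the single
# string config value for `thunk` would be supplied in an execution environment.
environment_dict = {'solids': {'thunk': {'config': 'some_value'}}}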
Example #12
    def outer():
        @composite_solid(output_defs=[OutputDefinition()])
        def inner():
            return add_one(return_one())

        add_one(inner())
Example #13
    lambda_solid,
    pipeline,
    repository,
    solid,
)
from dagster.core.definitions.decorators.hook import event_list_hook, success_hook
from dagster.core.definitions.events import HookExecutionResult
from dagster.core.errors import DagsterInvalidDefinitionError, DagsterInvariantViolationError
from dagster.core.execution.api import create_execution_plan


def builder(graph):
    return graph.add_one(graph.return_one())


@lambda_solid(output_def=OutputDefinition(Int))
def echo(blah):
    return blah


@lambda_solid
def return_one():
    return 1


@lambda_solid
def return_two():
    return 2


@lambda_solid
Example #14
def test_dataframe_outputs(file_type, read, other):
    df = create_pyspark_df()

    @solid(output_defs=[
        OutputDefinition(dagster_type=DagsterPySparkDataFrame, name="df")
    ])
    def return_df(_):
        return df

    with get_temp_dir() as temp_path:
        shutil.rmtree(temp_path)

        options = {"path": temp_path}
        if other:
            options["format"] = file_type
            file_type = "other"

        result = execute_solid(
            return_df,
            mode_def=ModeDefinition(
                resource_defs={"pyspark": pyspark_resource}),
            run_config={
                "solids": {
                    "return_df": {
                        "outputs": [{
                            "df": {
                                file_type: options
                            }
                        }]
                    }
                }
            },
        )
        assert result.success
        actual = read(options["path"], **dict_without_keys(options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())

        result = execute_solid(
            return_df,
            mode_def=ModeDefinition(
                resource_defs={"pyspark": pyspark_resource}),
            run_config={
                "solids": {
                    "return_df": {
                        "outputs": [{
                            "df": {
                                file_type:
                                dict(
                                    {
                                        "mode": "overwrite",
                                        "compression": "gzip",
                                    },
                                    **options,
                                )
                            }
                        }]
                    }
                }
            },
        )
        assert result.success
        actual = read(options["path"], **dict_without_keys(options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())
Example #15
def test_basic_typing_dictionary_output():
    @lambda_solid(output_def=OutputDefinition(typing.Dict))
    def emit_dict():
        return {"key": "value"}

    assert execute_solid(emit_dict).output_value() == {"key": "value"}
Example #16
def test_pd_df_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    test_df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    create_solid = bq_create_dataset.alias('create_solid')
    load_solid = import_df_to_bq.alias('load_solid')
    query_solid = bq_solid_for_queries(['SELECT num1, num2 FROM %s' % table
                                        ]).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(input_defs=[InputDefinition('success', Nothing)],
           output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    config = {
        'solids': {
            'create_solid': {
                'config': {
                    'dataset': dataset,
                    'exists_ok': True
                }
            },
            'load_solid': {
                'config': {
                    'destination': table
                }
            },
            'delete_solid': {
                'config': {
                    'dataset': dataset,
                    'delete_contents': True
                }
            },
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_df(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').output_value()
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {'pyarrow': None, 'fastparquet': None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            result = execute_pipeline(bq_pipeline, config)
        assert (
            'loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet'
            ' to be installed' in str(exc_info.value.user_exception))
        cleanup_config = {
            'solids': {
                'delete_solid': {
                    'config': {
                        'dataset': dataset,
                        'delete_contents': True
                    }
                }
            }
        }

        @pipeline(mode_defs=bq_modes())
        def cleanup():
            delete_solid()

        assert execute_pipeline(cleanup, cleanup_config).success

    assert not dataset_exists(dataset)
Example #17
    adder_2 = adder.alias('adder_2')
    three = adder_2(a=two, b=one)


@composite_solid
def add_one(num: int):
    adder(num, return_one())


@composite_solid
def add_one_out(num: int) -> int:
    return adder(num, return_one())


@composite_solid(output_defs=[
    OutputDefinition(int, 'a_out'),
    OutputDefinition(int, 'b_out')
])
def add_both(a: int, b: int):
    one = return_one()
    a = adder.alias('adder_a')(a, one)
    b = adder.alias('adder_b')(b, one)
    return {'a_out': a, 'b_out': b}


@solid
def sales_team_path(_):
    pass


@solid
Example #18
def test_gcs_load():
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    create_solid = bq_create_dataset.alias('create_solid')
    query_solid = bq_solid_for_queries([
        'SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1'
        % table
    ]).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    @solid(input_defs=[InputDefinition('success', Nothing)],
           output_defs=[OutputDefinition(List[Path])])
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        return ["gs://cloud-samples-data/bigquery/us-states/us-states.csv"]

    config = {
        'solids': {
            'create_solid': {
                'config': {
                    'dataset': dataset,
                    'exists_ok': True
                }
            },
            'import_gcs_paths_to_bq': {
                'config': {
                    'destination': table,
                    'load_job_config': {
                        'autodetect': True,
                        'skip_leading_rows': 1,
                        'source_format': 'CSV',
                        'write_disposition': 'WRITE_TRUNCATE',
                    },
                }
            },
            'delete_solid': {
                'config': {
                    'dataset': dataset,
                    'delete_contents': True
                }
            },
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(
            query_solid(import_gcs_paths_to_bq(return_gcs_uri(
                create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').output_value()
    assert values[0].to_dict() == {
        'string_field_0': {
            0: 'Alabama'
        },
        'string_field_1': {
            0: 'AL'
        }
    }

    assert not dataset_exists(dataset)
Example #19
def connect():
    pass


def write_dataframe_to_table(**_kwargs):
    pass


def read_dataframe_from_table(**_kwargs):
    pass


# solids_start_marker
@solid(output_defs=[
    OutputDefinition(asset_metadata={
        "schema": "some_schema",
        "table": "some_table"
    })
])
def solid1(_):
    """Return a Pandas DataFrame"""


@solid(output_defs=[
    OutputDefinition(asset_metadata={
        "schema": "other_schema",
        "table": "other_table"
    })
])
def solid2(_, _input_dataframe):
    """Return a Pandas DataFrame"""
Example #20
def test_pd_df_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    test_df = pd.DataFrame({"num1": [1, 3], "num2": [2, 4]})

    create_op = bq_create_dataset.alias("create_op")
    load_op = import_df_to_bq.alias("load_op")
    query_op = bq_op_for_queries(["SELECT num1, num2 FROM %s" % table]).alias("query_op")
    delete_op = bq_delete_dataset.alias("delete_op")

    @op(input_defs=[InputDefinition("success", Nothing)], output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    @job(resource_defs={"bigquery": bigquery_resource})
    def bq_circle_of_life():
        delete_op(query_op(load_op(return_df(create_op()))))

    result = bq_circle_of_life.execute_in_process(
        run_config={
            "ops": {
                "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
                "load_op": {"config": {"destination": table}},
                "delete_op": {"config": {"dataset": dataset, "delete_contents": True}},
            }
        }
    )
    assert result.success

    values = result.output_for_node("query_op")
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {"pyarrow": None, "fastparquet": None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            bq_circle_of_life.execute_in_process(
                run_config={
                    "ops": {
                        "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
                        "load_op": {"config": {"destination": table}},
                        "delete_op": {"config": {"dataset": dataset, "delete_contents": True}},
                    }
                }
            )
        assert (
            "loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet"
            " to be installed" in str(exc_info.value.user_exception)
        )

        @job(resource_defs={"bigquery": bigquery_resource})
        def cleanup_bq():
            delete_op()

        result = cleanup_bq.execute_in_process(
            run_config={
                "ops": {"delete_op": {"config": {"dataset": dataset, "delete_contents": True}}}
            }
        )
        assert result.success

    assert not dataset_exists(dataset)
Example #21
from dagster import Output, OutputDefinition, composite_solid, pipeline, solid


@solid
def do_something():
    pass


@solid(
    output_defs=[OutputDefinition(int, "one"),
                 OutputDefinition(int, "two")])
def return_multi():
    yield Output(1, "one")
    yield Output(2, "two")


@composite_solid(
    output_defs=[OutputDefinition(int, "one"),
                 OutputDefinition(int, "two")])
def do_two_things():
    do_something()
    one, two = return_multi()
    return {"one": one, "two": two}


@solid
def do_yet_more(arg1, arg2):
    assert arg1 == 1
    assert arg2 == 2
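

# A hedged wiring sketch: one way the two outputs of `do_two_things` might be
# consumed downstream; the pipeline name is illustrative.
@pipeline
def do_two_things_pipeline():
    one, two = do_two_things()
    do_yet_more(one, two)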

Example #22
            default_value=None,
            is_required=False,
            description='Working directory in which to execute shell script',
        ),
    }


@solid(
    name='shell_solid',
    description=
    ('This solid executes a shell command it receives as input.\n\n'
     'This solid is suitable for uses where the command to execute is generated dynamically by '
     'upstream solids. If you know the command to execute at pipeline construction time, '
     'consider `shell_command_solid` instead.'),
    input_defs=[InputDefinition('shell_command', str)],
    output_defs=[OutputDefinition(str, 'result')],
    config_schema=shell_solid_config(),
)
def shell_solid(context, shell_command):
    '''This solid executes a shell command it receives as input.

    This solid is suitable for uses where the command to execute is generated dynamically by
    upstream solids. If you know the command to execute at pipeline construction time, consider
    `shell_command_solid` instead.
    '''
    output, return_code = execute(shell_command=shell_command,
                                  log=context.log,
                                  **context.solid_config)

    if return_code:
        raise Failure(
            description=
            'Shell command execution failed with output: {output}'.format(
                output=output))

    return output
Example #23
    cereals_df = pandas.read_csv(csv_file_path)
    with context.resources.db.connect() as conn:
        conn.execute("drop table if exists cereals cascade")
        cereals_df.to_sql(name="cereals", con=conn)


@solid(config_schema={"channels": Array(str)}, required_resource_keys={"slack"})
def post_plot_to_slack(context, plot_path):
    context.resources.slack.files_upload(
        channels=",".join(context.solid_config["channels"]), file=plot_path
    )


@solid(required_resource_keys={"dbt"}, input_defs=[InputDefinition("after", Nothing)])
def run_cereals_models(context) -> DbtCliOutput:
    return context.resources.dbt.run()


@solid(required_resource_keys={"dbt"}, input_defs=[InputDefinition("after", Nothing)])
def test_cereals_models(context) -> DbtCliOutput:
    return context.resources.dbt.test()


analyze_cereals = define_dagstermill_solid(
    "analyze_cereals",
    file_relative_path(__file__, "notebooks/Analyze_Cereals.ipynb"),
    input_defs=[InputDefinition("run_results", dagster_type=DbtCliOutput)],
    output_defs=[OutputDefinition(str)],
    required_resource_keys={"db"},
)
Example #24
def create_shell_script_solid(shell_script_path,
                              name='create_shell_script_solid',
                              input_defs=None,
                              **kwargs):
    '''This function is a factory which constructs a solid that will execute a shell command read
    from a script file.

    Any kwargs passed to this function will be passed along to the underlying :func:`@solid
    <dagster.solid>` decorator. However, note that overriding ``config`` or ``output_defs`` is not
    supported.

    You might consider using :func:`@composite_solid <dagster.composite_solid>` to wrap this solid
    in the cases where you'd like to configure the shell solid with different config fields.


    Examples:

    .. literalinclude:: ../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_script_solid.py
       :language: python


    Args:
        shell_script_path (str): The script file to execute.
        name (str, optional): The name of this solid. Defaults to "create_shell_script_solid".
        input_defs (List[InputDefinition], optional): input definitions for the solid. Defaults to
            a single Nothing input.

    Raises:
        Failure: Raised when the shell command returns a non-zero exit code.

    Returns:
        SolidDefinition: The constructed solid definition.
    '''
    check.str_param(shell_script_path, 'shell_script_path')
    name = check.str_param(name, 'name')
    check.opt_list_param(input_defs, 'input_defs', of_type=InputDefinition)

    if 'output_defs' in kwargs:
        raise TypeError(
            'Overriding output_defs for shell solid is not supported.')

    if 'config' in kwargs:
        raise TypeError('Overriding config for shell solid is not supported.')

    @solid(name=name,
           description=kwargs.pop('description',
                                  'A solid to invoke a shell command.'),
           input_defs=input_defs or [InputDefinition('start', Nothing)],
           output_defs=[OutputDefinition(str, 'result')],
           config_schema=shell_solid_config(),
           **kwargs)
    def _shell_script_solid(context):
        output, return_code = execute_script_file(
            shell_script_path=shell_script_path,
            log=context.log,
            **context.solid_config)

        if return_code:
            raise Failure(
                description=
                'Shell command execution failed with output: {output}'.format(
                    output=output))

        return output

    return _shell_script_solid
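

# A minimal usage sketch, assuming a `hello_world.sh` script alongside this
# file; the script path and solid name here are illustrative only.
from dagster.utils import file_relative_path

hello_world_script_solid = create_shell_script_solid(
    file_relative_path(__file__, 'hello_world.sh'), name='hello_world_script_solid')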
Example #25
def test_basic_solid_dict_int_int_output():
    @lambda_solid(output_def=OutputDefinition(Dict[int, int]))
    def emit_dict_int_int():
        return {1: 1}

    assert execute_solid(emit_dict_int_int).output_value() == {1: 1}
Example #26
def create_shell_command_solid(
    shell_command,
    name,
    description=None,
    required_resource_keys=None,
    tags=None,
):
    '''This function is a factory that constructs solids to execute a shell command.

    Note that you can only use `create_shell_command_solid` if you know the command you'd like to
    execute at pipeline construction time. If you'd like to construct shell commands dynamically
    during pipeline execution and pass them between solids, you should use `shell_solid` instead.

    Examples:

    .. literalinclude:: ../../../../../python_modules/libraries/dagster-shell/dagster_shell_tests/example_shell_command_solid.py
       :language: python


    Args:
        shell_command (str): The shell command that the constructed solid will execute.
        name (str): The name of the constructed solid.
        description (Optional[str]): Human-readable description of this solid.
        required_resource_keys (Optional[Set[str]]): Set of resource handles required by this solid.
            Setting this ensures that resource spin up for the required resources will occur before
            the shell command is executed.
        tags (Optional[Dict[str, Any]]): Arbitrary metadata for the solid. Frameworks may
            expect and require certain metadata to be attached to a solid. Users should generally
            not set metadata directly. Values that are not strings will be json encoded and must meet
            the criteria that `json.loads(json.dumps(value)) == value`.

    Raises:
        Failure: Raised when the shell command returns a non-zero exit code.

    Returns:
        SolidDefinition: The constructed solid definition.
    '''
    check.str_param(shell_command, 'shell_command')
    name = check.str_param(name, 'name')

    @solid(
        name=name,
        description=description,
        input_defs=[InputDefinition('start', Nothing)],
        output_defs=[OutputDefinition(str, 'result')],
        config_schema=shell_solid_config(),
        required_resource_keys=required_resource_keys,
        tags=tags,
    )
    def _shell_solid(context):
        output, return_code = execute(shell_command=shell_command,
                                      log=context.log,
                                      **context.solid_config)

        if return_code:
            raise Failure(
                description=
                'Shell command execution failed with output: {output}'.format(
                    output=output))

        return output

    return _shell_solid
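

# A minimal usage sketch; the command and solid name are illustrative only.
echo_solid = create_shell_command_solid('echo "hello, world!"', name='echo_solid')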
Example #27
def no_repo_reg_solid():
    return dm.define_dagstermill_solid(
        'no_repo_reg',
        nb_test_path('no_repo_reg_error'),
        outputs=[OutputDefinition(name='df', dagster_type=ComplexDagsterType)],
    )
Example #28
def test_dagster_dictionary_output():
    @lambda_solid(output_def=OutputDefinition(dict))
    def emit_dict():
        return {"key": "value"}

    assert execute_solid(emit_dict).output_value() == {"key": "value"}
Example #29
            )
            results.append(
                context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()
            )

        return results

    return _solid


BIGQUERY_LOAD_CONFIG = define_bigquery_load_config()


@solid(
    input_defs=[InputDefinition("paths", List[str])],
    output_defs=[OutputDefinition(Nothing)],
    config_schema=BIGQUERY_LOAD_CONFIG,
    required_resource_keys={"bigquery"},
)
def import_gcs_paths_to_bq(context, paths):
    return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)


@solid(
    input_defs=[InputDefinition("df", DataFrame)],
    output_defs=[OutputDefinition(Nothing)],
    config_schema=BIGQUERY_LOAD_CONFIG,
    required_resource_keys={"bigquery"},
)
def import_df_to_bq(context, df):
    return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)
Example #30
from urllib.request import urlretrieve

from dagster import Field, OutputDefinition, String, solid
from dagster.utils import script_relative_path


@solid(
    name="download_file",
    config_schema={
        "url":
        Field(String, description="The URL from which to download the file"),
        "path":
        Field(String, description="The path to which to download the file"),
    },
    output_defs=[
        OutputDefinition(
            String,
            name="path",
            description="The path to which the file was downloaded")
    ],
    description=
    ("A simple utility solid that downloads a file from a URL to a path using "
     "urllib.request.urlretrieve"),
)
def download_file(context):
    output_path = script_relative_path(context.solid_config["path"])
    urlretrieve(context.solid_config["url"], output_path)
    return output_path
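

# A hedged run-config sketch: the URL and path below are placeholders showing
# how the config_schema of `download_file` would be satisfied.
run_config = {
    "solids": {
        "download_file": {
            "config": {
                "url": "https://example.com/cereal.csv",
                "path": "cereal.csv",
            }
        }
    }
}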