Example #1
from dagster import Float, InputDefinition, Int, OutputDefinition, composite_solid, pipeline, solid


@solid(input_defs=[InputDefinition('num', Int)],
       output_defs=[OutputDefinition(Int)])
def add_one(_, num):
    return num + 1


@solid(input_defs=[InputDefinition('num', Float)],
       output_defs=[OutputDefinition(Float)])
def div_two(_, num):
    return num / 2


@composite_solid(input_defs=[InputDefinition('num', Int)],
                 output_defs=[OutputDefinition(Int)])
def add_two(num):
    return add_one(num=add_one(num))


@composite_solid(input_defs=[InputDefinition('num', Int)],
                 output_defs=[OutputDefinition(Int)])
def add_four(num):
    return add_two(num=add_two(num))


@composite_solid(input_defs=[InputDefinition('num', Float)],
                 output_defs=[OutputDefinition(Float)])
def div_four(num):
    return div_two(num=div_two(num))
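
The example above only defines solids and composites, with no pipeline. Below is a minimal, hedged sketch of how they might be wired up and executed; the pipeline name, run_config layout, and expected values are assumptions about the legacy Dagster API in use, not part of the original example.

from dagster import execute_pipeline


@pipeline
def arithmetic_pipeline():
    # add_one and div_two get their root inputs from run_config below.
    add_four(add_one())
    div_four(div_two())


result = execute_pipeline(
    arithmetic_pipeline,
    run_config={
        'solids': {
            'add_one': {'inputs': {'num': 3}},     # 3 + 1 = 4, then add_four -> 8
            'div_two': {'inputs': {'num': 12.0}},  # 12.0 / 2 = 6.0, then div_four -> 1.5
        }
    },
)
assert result.success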
Example #2
    def _get_config_test_solid(config_key, config_value):
        @solid(inputs=[], outputs=[OutputDefinition()])
        def config_test(info):
            assert info.context.resources == {config_key: config_value}

        return config_test
Example #3
from dagster import InputDefinition, Int, OutputDefinition, lambda_solid, pipeline, repository


@lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
def add_one(num):
    return num + 1


@lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
def mult_two(num):
    return num * 2


@pipeline
def math():
    return mult_two(num=add_one())


@repository
def test_override_repository():
    return [math]
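
A hedged sketch of executing the math pipeline above, supplying the unsatisfied num input of add_one through config (the exact config layout depends on the Dagster version this snippet targets):

from dagster import execute_pipeline

result = execute_pipeline(
    math,
    run_config={'solids': {'add_one': {'inputs': {'num': 3}}}},
)
assert result.success
assert result.result_for_solid('mult_two').output_value() == 8  # (3 + 1) * 2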
Example #4
)


def define_context(raise_on_error=True, log_dir=None, schedule_dir=None):
    return DagsterGraphQLContext(
        handle=ExecutionTargetHandle.for_repo_fn(define_repository),
        pipeline_runs=FilesystemRunStorage(log_dir) if log_dir else InMemoryRunStorage(),
        scheduler=SystemCronScheduler(schedule_dir) if schedule_dir else None,
        execution_manager=SynchronousExecutionManager(),
        raise_on_error=raise_on_error,
    )


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x['sum'] = int(x['num1']) + int(x['num2'])
    return PoorMansDataFrame(sum_df)


@lambda_solid(
    input_defs=[InputDefinition('sum_df', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)
    for x in sum_sq_df:
Example #5
def test_depends_on_adls2_resource_file_manager(storage_account, file_system):
    bar_bytes = b"bar"

    @solid(output_defs=[OutputDefinition(ADLS2FileHandle)],
           required_resource_keys={"file_manager"})
    def emit_file(context):
        return context.resources.file_manager.write_data(bar_bytes)

    @solid(
        input_defs=[InputDefinition("file_handle", ADLS2FileHandle)],
        required_resource_keys={"file_manager"},
    )
    def accept_file(context, file_handle):
        local_path = context.resources.file_manager.copy_handle_to_local_temp(
            file_handle)
        assert isinstance(local_path, str)
        assert open(local_path, "rb").read() == bar_bytes

    adls2_fake_resource = FakeADLS2Resource(storage_account)
    adls2_fake_file_manager = ADLS2FileManager(
        adls2_client=adls2_fake_resource.adls2_client,
        file_system=file_system,
        prefix="some-prefix",
    )

    @pipeline(mode_defs=[
        ModeDefinition(resource_defs={
            "adls2":
            ResourceDefinition.hardcoded_resource(adls2_fake_resource),
            "file_manager":
            ResourceDefinition.hardcoded_resource(adls2_fake_file_manager),
        }, )
    ])
    def adls2_file_manager_test():
        accept_file(emit_file())

    result = execute_pipeline(
        adls2_file_manager_test,
        run_config={
            "resources": {
                "file_manager": {
                    "config": {
                        "adls2_file_system": file_system
                    }
                }
            }
        },
    )

    assert result.success

    keys_in_bucket = set(
        adls2_fake_resource.adls2_client.file_systems[file_system].keys())

    assert len(keys_in_bucket) == 1

    file_key = list(keys_in_bucket)[0]
    comps = file_key.split("/")

    assert "/".join(comps[:-1]) == "some-prefix"

    assert uuid.UUID(comps[-1])
Example #6
from dagster import (
    DependencyDefinition,
    Field,
    InputDefinition,
    MultipleResults,
    OutputDefinition,
    PipelineDefinition,
    Result,
    solid,
    String,
    Int,
)


@solid(outputs=[
    OutputDefinition(dagster_type=Int, name='out_one'),
    OutputDefinition(dagster_type=Int, name='out_two'),
])
def yield_outputs(_context):
    yield Result(23, 'out_one')
    yield Result(45, 'out_two')


@solid(outputs=[
    OutputDefinition(dagster_type=Int, name='out_one'),
    OutputDefinition(dagster_type=Int, name='out_two'),
])
def return_dict_results(_context):
    return MultipleResults.from_dict({'out_one': 23, 'out_two': 45})
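
The imports above include DependencyDefinition and PipelineDefinition, but the snippet never uses them. A hedged sketch of how the named outputs might be consumed downstream follows; consume_one is a hypothetical solid, and the exact PipelineDefinition keyword arguments depend on the old Dagster version this code targets.

@solid(inputs=[InputDefinition('value', Int)], outputs=[OutputDefinition(Int)])
def consume_one(_context, value):
    return value


multiple_outputs_pipeline = PipelineDefinition(
    name='multiple_outputs_pipeline',
    solids=[yield_outputs, consume_one],
    dependencies={
        'consume_one': {
            # Depend only on the 'out_one' output of yield_outputs.
            'value': DependencyDefinition('yield_outputs', output='out_one'),
        },
    },
)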

Example #7
    def load_input(self, context):
        if "table_name" in context.input_config:
            table_name = context.input_config["table_name"]
        else:
            table_name = context.upstream_output.name

        return read_dataframe_from_table(name=table_name)


@object_manager(
    input_config_schema={"table_name": Field(str, is_required=False)})
def my_object_manager(_):
    return MyObjectManager()


@solid(output_defs=[OutputDefinition(manager_key="my_object_manager")])
def solid1(_):
    """Do stuff"""


@solid(
    input_defs=[InputDefinition("dataframe", manager_key="my_object_manager")])
def solid2(_, dataframe):
    """Do stuff"""


@pipeline(mode_defs=[
    ModeDefinition(resource_defs={"my_object_manager": my_object_manager})
])
def my_pipeline():
    solid2(solid1())
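
For illustration, a hedged sketch of run config that would exercise the optional table_name input config declared above (the exact layout can vary by Dagster version, and "table_1" is a made-up table name):

run_config = {
    "solids": {
        "solid2": {
            "inputs": {
                # Overrides the default of using the upstream output's name.
                "dataframe": {"table_name": "table_1"}
            }
        }
    }
}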
Example #8
def replace_values_spark(data_frame, old, new):
    return data_frame.na.replace(old, new)


@solid(required_resource_keys={'pyspark_step_launcher'})
def process_sfo_weather_data(_context,
                             sfo_weather_data: DataFrame) -> DataFrame:
    normalized_sfo_weather_data = replace_values_spark(sfo_weather_data, 'M',
                                                       None)
    return rename_spark_dataframe_columns(normalized_sfo_weather_data,
                                          lambda c: c.lower())


@solid(
    output_defs=[OutputDefinition(name='table_name', dagster_type=String)],
    config_schema={'table_name': String},
    required_resource_keys={'db_info', 'pyspark_step_launcher'},
)
def load_data_to_database_from_spark(context, data_frame: DataFrame):
    context.resources.db_info.load_table(data_frame,
                                         context.solid_config['table_name'])

    table_name = context.solid_config['table_name']
    yield AssetMaterialization(
        asset_key='table:{table_name}'.format(table_name=table_name),
        description=
        ('Persisted table {table_name} in database configured in the db_info resource.'
         ).format(table_name=table_name),
        metadata_entries=[
            EventMetadataEntry.text(label='Host',
Example #9
def sql_solid(name,
              select_statement,
              materialization_strategy,
              table_name=None,
              input_defs=None):
    '''Return a new solid that executes and materializes a SQL select statement.

    Args:
        name (str): The name of the new solid.
        select_statement (str): The select statement to execute.
        materialization_strategy (str): Must be 'table', the only currently supported
            materialization strategy. If 'table', the kwarg `table_name` must also be passed.
    Kwargs:
        table_name (str): The name of the new table to create, if the materialization strategy
            is 'table'. Default: None.
        input_defs (list[InputDefinition]): Inputs, if any, for the new solid. Default: None.

    Returns:
        function:
            The new SQL solid.
    '''
    input_defs = check.opt_list_param(input_defs, 'input_defs',
                                      InputDefinition)

    materialization_strategy_output_types = {  # pylint:disable=C0103
        'table': SqlTableName,
        # 'view': String,
        # 'query': SqlAlchemyQueryType,
        # 'subquery': SqlAlchemySubqueryType,
        # 'result_proxy': SqlAlchemyResultProxyType,
        # could also materialize as a Pandas table, as a Spark table, as an intermediate file, etc.
    }

    if materialization_strategy not in materialization_strategy_output_types:
        raise Exception(
            'Invalid materialization strategy {materialization_strategy}, must '
            'be one of {materialization_strategies}'.format(
                materialization_strategy=materialization_strategy,
                materialization_strategies=str(
                    list(materialization_strategy_output_types.keys())),
            ))

    if materialization_strategy == 'table':
        if table_name is None:
            raise Exception(
                'Missing table_name: required for materialization strategy \'table\''
            )

    output_description = (
        'The string name of the new table created by the solid'
        if materialization_strategy == 'table' else
        'The materialized SQL statement. If the materialization_strategy is '
        '\'table\', this is the string name of the new table created by the solid.'
    )

    description = '''This solid executes the following SQL statement:
    {select_statement}'''.format(select_statement=select_statement)

    # n.b., we will eventually want to make this resources key configurable
    sql_statement = (
        'drop table if exists {table_name};\n'
        'create table {table_name} as {select_statement};').format(
            table_name=table_name, select_statement=select_statement)

    @solid(
        name=name,
        input_defs=input_defs,
        output_defs=[
            OutputDefinition(
                materialization_strategy_output_types[
                    materialization_strategy],
                description=output_description,
            )
        ],
        description=description,
        required_resource_keys={'db_info'},
        tags={
            'kind': 'sql',
            'sql': sql_statement
        },
    )
    def _sql_solid(context, **input_defs):  # pylint: disable=unused-argument
        '''Inner function defining the new solid.

        Args:
            context (SolidExecutionContext): Must expose a `db` resource with an `execute` method,
                like a SQLAlchemy engine, that can execute raw SQL against a database.

        Returns:
            str:
                The table name of the newly materialized SQL select statement.
        '''
        context.log.info('Executing sql statement:\n{sql_statement}'.format(
            sql_statement=sql_statement))
        context.resources.db_info.engine.execute(text(sql_statement))
        yield Output(value=table_name, output_name='result')

    return _sql_solid
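
A hedged usage sketch of the factory above; the solid name, table name, and select statement are made-up illustrations, and the resulting solid still needs a pipeline mode that provides the db_info resource:

sum_num_table = sql_solid(
    name='sum_num_table',
    select_statement='select num1, num2, num1 + num2 as sum from raw_nums',
    materialization_strategy='table',
    table_name='sum_num_table',
)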
Example #10
        "Bucket":
        Field(StringSource,
              description="The name of the bucket to upload to.",
              is_required=True),
        "Key":
        Field(StringSource,
              description="The name of the key to upload to.",
              is_required=True),
    },
    input_defs=[
        InputDefinition("file_handle",
                        FileHandle,
                        description="The file to upload.")
    ],
    output_defs=[
        OutputDefinition(name="s3_file_handle", dagster_type=S3FileHandle)
    ],
    description=
    """Take a file handle and upload it to s3. Returns an S3FileHandle.""",
    required_resource_keys={"s3", "file_manager"},
)
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config["Bucket"]
    key = context.solid_config["Key"]

    with context.resources.file_manager.read(file_handle, "rb") as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)
        s3_file_handle = S3FileHandle(bucket, key)

        yield AssetMaterialization(
            asset_key=s3_file_handle.s3_path,
Example #11
def _create_lakehouse_table_def(
    name,
    lakehouse_fn,
    input_tables=None,
    other_input_defs=None,
    required_resource_keys=None,
    metadata=None,
    description=None,
):
    metadata = check.opt_dict_param(metadata, 'metadata')
    input_tables = check.opt_list_param(input_tables,
                                        'input_tables',
                                        of_type=LakehouseTableInputDefinition)
    other_input_defs = check.opt_list_param(other_input_defs,
                                            'other_input_defs',
                                            of_type=InputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 'required_resource_keys',
                                                 of_type=str)

    table_type = define_python_dagster_type(python_type=ITableHandle,
                                            name=name,
                                            description=description)

    table_type_inst = table_type.inst()

    table_input_dict = {
        input_table.name: input_table
        for input_table in input_tables
    }
    input_defs = input_tables + other_input_defs
    validate_solid_fn('@solid', name, lakehouse_fn, input_defs,
                      [('context', )])

    def _compute(context, inputs):
        '''
        Workhorse function of lakehouse. The inputs are objects that inherit from ITableHandle.
        This compute_fn:
        (1) Iterates over the input tables and asks the lakehouse resource to
            hydrate their contents, or a representation of their contents
            (e.g. a pyspark dataframe), into memory for computation.
        (2) Passes those into the lakehouse table function, which does the actual work.
        (3) Passes the output of the lakehouse function to the lakehouse materialize function.
        (4) Yields a materialization if the lakehouse function returned one.

        There's an argument that the hydrate and materialize functions should return
        a stream of events, but that started to feel like implementing what should
        be a framework feature.
        '''
        check.inst_param(context.resources.lakehouse,
                         'context.resources.lakehouse', Lakehouse)

        # hydrate tables
        hydrated_tables = {}
        other_inputs = {}
        for input_name, value in inputs.items():
            context.log.info(
                'About to hydrate table {input_name} for use in {name}'.format(
                    input_name=input_name, name=name))
            if input_name in table_input_dict:
                table_handle = value
                input_type = table_input_dict[input_name].runtime_type
                hydrated_tables[
                    input_name] = context.resources.lakehouse.hydrate(
                        context,
                        input_type,
                        table_def_of_type(context.pipeline_def,
                                          input_type.name).metadata,
                        table_handle,
                    )
            else:
                other_inputs[input_name] = value

        # call user-provided business logic which operates on the hydrated values
        # (as opposed to the handles)
        computed_output = lakehouse_fn(context, **hydrated_tables,
                                       **other_inputs)

        materialization, output_table_handle = context.resources.lakehouse.materialize(
            context, table_type_inst, metadata, computed_output)

        if materialization:
            yield materialization

        # just pass in a dummy handle for now if the materialize function
        # does not return one
        yield Output(
            output_table_handle if output_table_handle else TableHandle())

    required_resource_keys.add('lakehouse')

    return LakehouseTableDefinition(
        lakehouse_fn=lakehouse_fn,
        name=name,
        input_tables=input_tables,
        input_defs=input_defs,
        output_defs=[OutputDefinition(table_type)],
        compute_fn=_compute,
        required_resource_keys=required_resource_keys,
        metadata=metadata,
        description=description,
    )
Example #12
File: repo.py Project: zuik/dagster
# start_repo_marker_0
from dagster import InputDefinition, List, OutputDefinition, pipeline, repository, solid


@solid(output_defs=[OutputDefinition(int)])
def return_one(_):
    return 1


@solid(input_defs=[InputDefinition("nums", List[int])], output_defs=[OutputDefinition(int)])
def sum_fan_in(_, nums):
    return sum(nums)


@pipeline
def fan_in_pipeline():
    fan_outs = []
    for i in range(0, 10):
        fan_outs.append(return_one.alias("return_one_{}".format(i))())
    sum_fan_in(fan_outs)


@repository
def fan_in_pipeline_repository():
    return [fan_in_pipeline]


# end_repo_marker_0
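
As a quick, hedged sanity check (not part of the original repo.py), executing the fan-in pipeline in process should sum the ten aliased return_one outputs:

from dagster import execute_pipeline

if __name__ == "__main__":
    result = execute_pipeline(fan_in_pipeline)
    assert result.success
    assert result.result_for_solid("sum_fan_in").output_value() == 10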
Example #13
def test_solid_definition_errors():
    with pytest.raises(DagsterInvalidDefinitionError,
                       match='positional vararg'):

        @solid(input_defs=[InputDefinition(name="foo")],
               output_defs=[OutputDefinition()])
        def vargs(context, foo, *args):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")],
               output_defs=[OutputDefinition()])
        def wrong_name(context, bar):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(
            input_defs=[
                InputDefinition(name="foo"),
                InputDefinition(name="bar")
            ],
            output_defs=[OutputDefinition()],
        )
        def wrong_name_2(context, foo):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")],
               output_defs=[OutputDefinition()])
        def no_context(foo):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")],
               output_defs=[OutputDefinition()])
        def extras(_context, foo, bar):
            pass

    @solid(
        input_defs=[InputDefinition(name="foo"),
                    InputDefinition(name="bar")],
        output_defs=[OutputDefinition()],
    )
    def valid_kwargs(context, **kwargs):
        pass

    @solid(
        input_defs=[InputDefinition(name="foo"),
                    InputDefinition(name="bar")],
        output_defs=[OutputDefinition()],
    )
    def valid(context, foo, bar):
        pass

    @solid
    def valid_because_inference(context, foo, bar):
        pass
Example #14
def add_one(_, num):
    return num + 1


@pipeline(mode_defs=celery_mode_defs)
def test_pipeline():
    return simple()


@pipeline(mode_defs=celery_mode_defs)
def test_serial_pipeline():
    return add_one(simple())


@solid(output_defs=[
    OutputDefinition(name="value_one"),
    OutputDefinition(name="value_two")
])
def emit_values(_context):
    yield Output(1, "value_one")
    yield Output(2, "value_two")


@lambda_solid(
    input_defs=[InputDefinition("num_one"),
                InputDefinition("num_two")])
def subtract(num_one, num_two):
    return num_one - num_two


@pipeline(mode_defs=celery_mode_defs)
Example #15
    url = url_prefix + ',{}?exclude={}'.format(
        epoch_date, ','.join(context.solid_config['times_to_exclude']))
    context.log.info("Sending Request. URL is: {}".format(url))
    response = requests.get(url)
    response.raise_for_status()
    raw_weather_data = response.json()['daily']['data'][0]
    raw_weather_data['uuid'] = uuid.uuid4()
    return DataFrame([raw_weather_data])


@solid(
    input_defs=[
        InputDefinition(name='dataframe', dagster_type=RawTripDataFrame)
    ],
    output_defs=[
        OutputDefinition(name='trip_dataframe', dagster_type=TripDataFrame)
    ],
)
def preprocess_trip_dataset(_, dataframe: DataFrame) -> DataFrame:
    dataframe = dataframe[['bike_id', 'start_time',
                           'end_time']].dropna(how='all').reindex()
    dataframe['bike_id'] = dataframe['bike_id'].astype('int64')
    dataframe['start_time'] = to_datetime(dataframe['start_time'])
    dataframe['end_time'] = to_datetime(dataframe['end_time'])
    dataframe['interval_date'] = dataframe['start_time'].apply(
        lambda x: x.date())
    yield Output(dataframe, output_name='trip_dataframe')


@composite_solid(output_defs=[
    OutputDefinition(name='trip_dataframe', dagster_type=TripDataFrame)
Example #16
    context.log.debug(json.dumps(resp.json().get("result"), indent=2))
    rpc_poll_result = DbtRpcPollResult.from_results(resp.json().get("result"))
    if should_yield_materializations:
        for materialization in generate_materializations(
                rpc_poll_result=rpc_poll_result):
            yield materialization
    yield Output(value=rpc_poll_result, output_name="result")


@solid(
    description="A solid to invoke dbt run over RPC.",
    input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],
    output_defs=[
        OutputDefinition(
            name="request_token",
            dagster_type=String,
            description="The request token of the invoked dbt run.",
        )
    ],
    config_schema={
        "models":
        Field(
            config=Noneable(Array(String)),
            default_value=None,
            is_required=False,
            description="The dbt models to run.",
        ),
        "exclude":
        Field(
            config=Noneable(Array(String)),
            default_value=None,
Example #17
def test_pipeline_wrapping_types():
    @lambda_solid(
        input_defs=[
            InputDefinition('value', Optional[List[Optional[String]]])
        ],
        output_def=OutputDefinition(Optional[List[Optional[String]]]),
    )
    def double_string_for_all(value):
        if not value:
            return value

        output = []
        for item in value:
            output.append(None if item is None else item + item)
        return output

    @pipeline
    def wrapping_test():
        double_string_for_all()

    assert execute_pipeline(
        wrapping_test,
        environment_dict={
            'solids': {
                'double_string_for_all': {
                    'inputs': {
                        'value': None
                    }
                }
            }
        },
    ).success

    assert execute_pipeline(
        wrapping_test,
        environment_dict={
            'solids': {
                'double_string_for_all': {
                    'inputs': {
                        'value': []
                    }
                }
            }
        },
    ).success

    assert execute_pipeline(
        wrapping_test,
        environment_dict={
            'solids': {
                'double_string_for_all': {
                    'inputs': {
                        'value': [{
                            'value': 'foo'
                        }]
                    }
                }
            }
        },
    ).success

    assert execute_pipeline(
        wrapping_test,
        environment_dict={
            'solids': {
                'double_string_for_all': {
                    'inputs': {
                        'value': [{
                            'value': 'bar'
                        }, None]
                    }
                }
            }
        },
    ).success
Example #18
def create_dbt_rpc_run_sql_solid(name: str,
                                 output_def: Optional[OutputDefinition] = None,
                                 **kwargs) -> Callable:
    """This function is a factory which constructs a solid that will copy the results of a SQL query run within the context of a dbt project to a DataFrame.

    Any kwargs passed to this function will be passed along to the underlying @solid decorator.
    However, note that overriding config, input_defs, and required_resource_keys is not supported. You might consider using
    @composite_solid to wrap this solid in the cases where you'd like to configure the solid
    with different config fields.

    Args:
        name (str): The name of this solid.
        output_def (OutputDefinition, optional): The OutputDefinition for the solid. This value should always be a representation
            of a pandas DataFrame. If not specified, the solid will default to an OutputDefinition named "df" with a DataFrame dagster type.

    Returns:
        SolidDefinition: Returns the constructed solid definition.
    """
    check.str_param(obj=name, param_name="name")
    check.opt_inst_param(obj=output_def,
                         param_name="output_def",
                         ttype=OutputDefinition)
    check.param_invariant("input_defs" not in kwargs, "input_defs",
                          "Overriding input_defs is not supported.")
    check.param_invariant(
        "required_resource_keys" not in kwargs,
        "required_resource_keys",
        "Overriding required_resource_keys is not supported.",
    )

    @solid(
        name=name,
        description=kwargs.pop(
            "description",
            "A solid to run a SQL query in context of a dbt project over RPC and return the results in a pandas DataFrame.",
        ),
        input_defs=[
            InputDefinition(name="start_after", dagster_type=Nothing),
            InputDefinition(name="sql",
                            description="The SQL query to be run.",
                            dagster_type=String),
        ],
        output_defs=[
            output_def
            or OutputDefinition(name="df",
                                description="The results of the SQL query.",
                                dagster_type=DataFrame)
        ],
        config_schema={
            "name":
            Field(config=String),
            "interval":
            Field(
                config=Int,
                is_required=False,
                default_value=10,
                description=
                "The interval (in seconds) at which to poll the dbt rpc process.",
            ),
            "logs":
            Field(
                config=Bool,
                is_required=False,
                default_value=True,
                description="Whether or not to return logs from the process.",
            ),
        },
        required_resource_keys={"dbt_rpc"},
        tags={"kind": "dbt"},
        **kwargs,
    )
    def _dbt_rpc_run_sql(context, sql: String) -> DataFrame:
        resp = context.resources.dbt_rpc.run_sql(
            sql=sql, name=context.solid_config["name"])
        context.log.debug(resp.text)
        raise_for_rpc_error(context, resp)
        request_token = resp.json().get("result").get("request_token")
        result = dbt_rpc_poll(context, request_token)
        table = result.results[0].table  # pylint: disable=no-member  # TODO
        return pd.DataFrame.from_records(data=table["rows"],
                                         columns=table["column_names"])

    return _dbt_rpc_run_sql
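
A hypothetical call to the factory above; the solid name is illustrative, and the resulting solid still needs a mode providing the dbt_rpc resource plus run config for its "name" field:

run_sql_via_rpc = create_dbt_rpc_run_sql_solid(name="run_sql_via_rpc")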
Example #19
import os
from openpyxl import load_workbook
import re
from google.cloud import storage

import basedosdados as bd
from basedosdados import Table

# Temporary: this function will eventually be incorporated into basedosdados
from repositories.helpers.storage import StoragePlus
from repositories.helpers.io import get_credentials_from_env


@solid(
    output_defs=[
        OutputDefinition(name="filename"),
        OutputDefinition(name="partitions"),
    ],
    required_resource_keys={"timezone_config"},
)
def create_current_datetime_partition(context):
    timezone = context.resources.timezone_config["timezone"]

    capture_time = pendulum.now(timezone)
    date = capture_time.strftime("%Y-%m-%d")
    hour = capture_time.strftime("%H")
    filename = capture_time.strftime("%Y-%m-%d-%H-%M-%S")

    partitions = f"data={date}/hora={hour}"

    yield Output(filename, output_name="filename")
Example #20
def test_open_typing_tuple_output():
    @lambda_solid(output_def=OutputDefinition(Tuple))
    def emit_tuple():
        return (1, 2)

    assert execute_solid(emit_tuple).output_value() == (1, 2)
Example #21
            resource_config['password']),
        config_field=Field(
            Dict({
                'username': Field(String),
                'password': Field(String)
            })),
        description='''This represents some cloud-hosted key value store.
        Username and password must be provided via configuration for this to
        work''',
    )


@solid(
    inputs=[InputDefinition('num_one', Int),
            InputDefinition('num_two', Int)],
    outputs=[OutputDefinition(Int)],
)
def add_ints(context, num_one, num_two):
    sum_ints = num_one + num_two
    context.resources.store.record_value(context.log, 'add', sum_ints)
    return sum_ints


def define_resource_test_pipeline():
    return PipelineDefinition(
        name='resource_test_pipeline',
        solids=[add_ints],
        context_definitions={
            'local':
            PipelineContextDefinition(
                resources={'store': define_in_memory_store_resource()}),
Example #22

CustomTripDataFrame = create_dagster_pandas_dataframe_type(
    name='CustomTripDataFrame',
    columns=[
        PandasColumn('amount_paid',
                     constraints=[
                         ColumnTypeConstraint('int64'),
                         DivisibleByFiveConstraint()
                     ])
    ],
)


@solid(
    output_defs=[
        OutputDefinition(name='custom_trip_dataframe',
                         dagster_type=CustomTripDataFrame)
    ], )
def load_custom_trip_dataframe(_) -> DataFrame:
    return read_csv(
        script_relative_path('./ebike_trips.csv'),
        parse_dates=['start_time', 'end_time'],
        date_parser=lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'),
    )


@pipeline
def custom_column_constraint_pipeline():
    load_custom_trip_dataframe()
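
The definition of DivisibleByFiveConstraint is cut off above this snippet. What follows is a hedged sketch of what such a dagster_pandas column constraint typically looks like; the base-class and exception signatures are assumptions about the dagster_pandas version in use, not the original code.

from dagster_pandas.constraints import ColumnConstraint, ColumnConstraintViolationException


class DivisibleByFiveConstraint(ColumnConstraint):
    def __init__(self):
        message = 'Value must be divisible by 5'
        super(DivisibleByFiveConstraint, self).__init__(
            error_description=message, markdown_description=message)

    def validate(self, dataframe, column_name):
        # Collect rows whose value in this column is not a multiple of five.
        offending = dataframe[dataframe[column_name].apply(lambda x: x % 5 != 0)]
        if not offending.empty:
            raise ColumnConstraintViolationException(
                constraint_name=self.name,
                constraint_description=self.error_description,
                dataframe=dataframe,
                column_name=column_name,
                offending_rows=offending,
            )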
Example #23
CLI_CONFIG_SCHEMA = {**CLI_COMMON_FLAGS_CONFIG_SCHEMA, **CLI_COMMON_OPTIONS_CONFIG_SCHEMA}
CLI_COMMON_FLAGS = set(CLI_COMMON_FLAGS_CONFIG_SCHEMA.keys())


def passthrough_flags_only(solid_config, additional_flags):
    return {
        flag: solid_config[flag]
        for flag in (CLI_COMMON_FLAGS | set(additional_flags))
        if solid_config.get(flag) is not None
    }


@solid(
    description="A solid to invoke dbt run via CLI.",
    input_defs=[InputDefinition(name="start_after", dagster_type=Nothing)],
    output_defs=[OutputDefinition(name="dbt_cli_output", dagster_type=DbtCliOutput)],
    config_schema={
        **CLI_CONFIG_SCHEMA,
        "threads": Field(
            config=Noneable(int),
            default_value=None,
            is_required=False,
            description=(
                "Specify number of threads to use while executing models. Overrides settings "
                "in profiles.yml."
            ),
        ),
        "models": Field(
            config=Noneable([str]),
            default_value=None,
            is_required=False,
Example #24
def test_memoized_plan_inits_resources_once():
    @solid(output_defs=[OutputDefinition(io_manager_key="foo")], version="foo")
    def foo_solid():
        pass

    @solid(output_defs=[OutputDefinition(io_manager_key="bar")], version="bar")
    def bar_solid():
        pass

    foo_capture = []
    bar_capture = []
    resource_dep_capture = []
    default_capture = []

    @io_manager(required_resource_keys={"my_resource"})
    def foo_manager():
        foo_capture.append("entered")
        return VersionedInMemoryIOManager()

    @io_manager(required_resource_keys={"my_resource"})
    def bar_manager():
        bar_capture.append("entered")
        return VersionedInMemoryIOManager()

    @io_manager
    def default_manager():
        default_capture.append("entered")
        return VersionedInMemoryIOManager()

    @resource
    def my_resource():
        resource_dep_capture.append("entered")
        return None

    @pipeline(
        mode_defs=[
            ModeDefinition(
                name="fakemode",
                resource_defs={
                    "foo": foo_manager,
                    "bar": bar_manager,
                    "my_resource": my_resource,
                    "io_manager": default_manager,
                },
            ),
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def wrap_pipeline():
        foo_solid()
        foo_solid.alias("another_foo")()
        bar_solid()
        bar_solid.alias("another_bar")()

    with instance_for_test() as instance:
        create_execution_plan(wrap_pipeline, instance_ref=instance.get_ref())

    assert len(foo_capture) == 1
    assert len(bar_capture) == 1
    assert len(resource_dep_capture) == 1
    assert len(default_capture) == 0
Example #25
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config_schema=None,
    required_resource_keys=None,
    output_notebook=None,
    asset_key_prefix=None,
    description=None,
    tags=None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline resources via the "file_manager" resource key, so, e.g.,
            if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be an
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.
        description (Optional[str]): If set, description used for solid.
        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid.
            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be
            overwritten by the user.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs,
                                      "input_defs",
                                      of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs,
                                       "output_defs",
                                       of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 "required_resource_keys",
                                                 of_type=str)
    if output_notebook is not None:
        required_resource_keys.add("file_manager")
    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]

    asset_key_prefix = check.opt_list_param(asset_key_prefix,
                                            "asset_key_prefix",
                                            of_type=str)

    default_description = f"This solid is backed by the notebook at {notebook_path}"
    description = check.opt_str_param(description,
                                      "description",
                                      default=default_description)

    user_tags = validate_tags(tags)
    if tags is not None:
        check.invariant(
            "notebook_path" not in tags,
            "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",
        )
        check.invariant(
            "kind" not in tags,
            "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster",
        )
    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name,
                                     notebook_path,
                                     output_notebook,
                                     asset_key_prefix=asset_key_prefix),
        output_defs=output_defs +
        ([OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
         if output_notebook else []),
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description=description,
        tags={
            **user_tags,
            **default_tags
        },
    )
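
A hedged usage sketch of the factory above; the notebook path, solid name, and output names are illustrative assumptions rather than anything from the original source:

my_notebook_solid = define_dagstermill_solid(
    name="my_notebook_solid",
    notebook_path="notebooks/my_notebook.ipynb",
    output_defs=[OutputDefinition(int, name="answer")],
    output_notebook="output_notebook",
)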
Example #26
        instance=DagsterInstance.ephemeral(),
        locations=[InProcessRepositoryLocation(create_main_recon_repo())],
    )


def main_repo_location_name():
    return '<<in_process>>'


def main_repo_name():
    return 'test_repo'


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x['sum'] = int(x['num1']) + int(x['num2'])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition('sum_df', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)
    for x in sum_sq_df:
Example #27

@contextmanager
def get_main_external_repo():
    with location_origin_from_python_file(
            python_file=file_relative_path(__file__, "setup.py"),
            attribute=main_repo_name(),
            working_directory=None,
            location_name=main_repo_location_name(),
    ).create_handle() as handle:
        yield handle.create_location().get_repository(main_repo_name())


@lambda_solid(
    input_defs=[InputDefinition("num", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x["sum"] = int(x["num1"]) + int(x["num2"])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition("sum_df", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)
    for x in sum_sq_df:
Example #28
def test_multiple_outputs_only_emit_one():
    def _t_fn(*_args):
        yield Result(output_name='output_one', value='foo')

    solid = SolidDefinition(
        name='multiple_outputs',
        inputs=[],
        outputs=[
            OutputDefinition(name='output_one'),
            OutputDefinition(name='output_two', is_optional=True),
        ],
        compute_fn=_t_fn,
    )

    called = {}

    def _compute_fn_one(*_args, **_kwargs):
        called['one'] = True

    downstream_one = SolidDefinition(
        name='downstream_one',
        inputs=[InputDefinition('some_input')],
        outputs=[],
        compute_fn=_compute_fn_one,
    )

    def _compute_fn_two(*_args, **_kwargs):
        raise Exception('do not call me')

    downstream_two = SolidDefinition(
        name='downstream_two',
        inputs=[InputDefinition('some_input')],
        outputs=[],
        compute_fn=_compute_fn_two,
    )

    pipeline = PipelineDefinition(
        solids=[solid, downstream_one, downstream_two],
        dependencies={
            'downstream_one': {'some_input': DependencyDefinition(solid.name, output='output_one')},
            'downstream_two': {'some_input': DependencyDefinition(solid.name, output='output_two')},
        },
    )

    result = execute_pipeline(pipeline)
    assert result.success

    assert called['one']
    solid_result = result.result_for_solid('multiple_outputs')
    assert set(solid_result.transformed_values.keys()) == set(['output_one'])

    with pytest.raises(
        DagsterInvariantViolationError, match='not_defined not defined in solid multiple_outputs'
    ):
        solid_result.transformed_value('not_defined')

    with pytest.raises(DagsterInvariantViolationError, match='Did not find result output_two'):
        solid_result.transformed_value('output_two')

    with pytest.raises(
        DagsterInvariantViolationError,
        match='Try to get result for solid not_present in <<unnamed>>. No such solid.',
    ):
        result.result_for_solid('not_present')

    assert result.result_for_solid('downstream_two').skipped
Example #29
            )
            results.append(
                context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()
            )

        return results

    return bq_solid


BIGQUERY_LOAD_CONFIG = define_bigquery_load_config()


@solid(
    input_defs=[InputDefinition('paths', List[str])],
    output_defs=[OutputDefinition(Nothing)],
    config=BIGQUERY_LOAD_CONFIG,
    required_resource_keys={'bigquery'},
)
def import_gcs_paths_to_bq(context, paths):
    return _execute_load_in_source(context, paths, BigQueryLoadSource.GCS)


@solid(
    input_defs=[InputDefinition('df', DataFrame)],
    output_defs=[OutputDefinition(Nothing)],
    config=BIGQUERY_LOAD_CONFIG,
    required_resource_keys={'bigquery'},
)
def import_df_to_bq(context, df):
    return _execute_load_in_source(context, df, BigQueryLoadSource.DataFrame)
Example #30
def test_mapping_errors(composition_class):
    @lambda_solid
    def echo(foo):
        return foo

    with pytest.raises(
            DagsterInvalidDefinitionError,
            match="references solid 'inner' which it does not contain"):
        composition_class(
            name="bad",
            solid_defs=[echo],
            input_mappings=[
                InputDefinition("mismatch").mapping_to("inner", "foo")
            ],
        )

    with pytest.raises(DagsterInvalidDefinitionError,
                       match="no input named 'bar'"):
        composition_class(
            name="bad",
            solid_defs=[echo],
            input_mappings=[
                InputDefinition("mismatch").mapping_to("echo", "bar")
            ],
        )

    with pytest.raises(
            DagsterInvalidDefinitionError,
            match="InputMapping source and destination must have the same type",
    ):
        composition_class(
            name="bad",
            solid_defs=[echo],
            input_mappings=[
                InputDefinition("mismatch", str).mapping_to("echo", "foo")
            ],
        )

    with pytest.raises(
            DagsterInvalidDefinitionError,
            match=
            "mappings with same definition name but different definitions",
    ):
        composition_class(
            name="bad",
            solid_defs=[echo],
            input_mappings=[
                InputDefinition("mismatch").mapping_to("echo", "foo"),
                InputDefinition("mismatch").mapping_to("echo_2", "foo"),
            ],
        )

    with pytest.raises(
            DagsterInvalidDefinitionError,
            match="references solid 'inner' which it does not contain"):
        composition_class(
            name="bad",
            solid_defs=[echo],
            output_mappings=[
                OutputDefinition().mapping_from("inner", "result")
            ],
        )

    with pytest.raises(DagsterInvalidDefinitionError,
                       match="no output named 'return'"):
        composition_class(
            name="bad",
            solid_defs=[echo],
            output_mappings=[
                OutputDefinition().mapping_from("echo", "return")
            ],
        )

    with pytest.raises(
            DagsterInvalidDefinitionError,
            match=
            "OutputMapping source and destination must have the same type",
    ):
        composition_class(
            name="bad",
            solid_defs=[echo],
            output_mappings=[
                OutputDefinition(str).mapping_from("echo", "result")
            ],
        )