Example 1
import pytest as _pytest

from flytekit.common.tasks.presto_task import SdkPrestoTask
from flytekit.sdk.tasks import inputs
from flytekit.sdk.types import Types

# `schema` and `get_sample_task` are defined elsewhere in the surrounding test
# module and are not shown here.
def test_task_produce_deterministic_version():
    containerless_task = SdkPrestoTask(
        task_inputs=inputs(ds=Types.String, rg=Types.String),
        statement=
        "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10",
        output_schema=schema,
        routing_group="{{ .Inputs.rg }}",
    )
    identical_containerless_task = SdkPrestoTask(
        task_inputs=inputs(ds=Types.String, rg=Types.String),
        statement=
        "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10",
        output_schema=schema,
        routing_group="{{ .Inputs.rg }}",
    )
    different_containerless_task = SdkPrestoTask(
        task_inputs=inputs(ds=Types.String, rg=Types.String),
        statement=
        "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 100000",
        output_schema=schema,
        routing_group="{{ .Inputs.rg }}",
    )
    assert (containerless_task._produce_deterministic_version() ==
            identical_containerless_task._produce_deterministic_version())

    assert (containerless_task._produce_deterministic_version() !=
            different_containerless_task._produce_deterministic_version())

    with _pytest.raises(Exception):
        get_sample_task()._produce_deterministic_version()
Example 2
from flytekit.sdk.spark_types import SparkType
from flytekit.sdk.tasks import generic_spark_task, inputs, python_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import Input, workflow_class

scala_spark = generic_spark_task(
    spark_type=SparkType.SCALA,
    inputs=inputs(partitions=Types.Integer),
    main_class="org.apache.spark.examples.SparkPi",
    main_application_file="local:///opt/spark/examples/jars/spark-examples.jar",
    spark_conf={
        "spark.driver.memory": "1000M",
        "spark.executor.memory": "1000M",
        "spark.executor.cores": "1",
        "spark.executor.instances": "2",
    },
    cache_version="1",
)


@inputs(date_triggered=Types.Datetime)
@python_task(cache_version="1")
def print_every_time(workflow_parameters, date_triggered):
    print("My input : {}".format(date_triggered))


@workflow_class
class SparkTasksWorkflow(object):
    triggered_date = Input(Types.Datetime)
    partitions = Input(Types.Integer)
    spark_task = scala_spark(partitions=partitions)
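
Note that print_every_time is defined above but never wired into the workflow as shown. A minimal sketch of how the workflow could be completed and made launchable, assuming the legacy flytekit create_launch_plan() API (the class, node, and launch-plan names below are illustrative additions, not part of the original example):

@workflow_class
class SparkTasksWorkflowComplete(object):
    triggered_date = Input(Types.Datetime)
    partitions = Input(Types.Integer)
    spark_task = scala_spark(partitions=partitions)
    # Hypothetical node wiring the python task to the workflow's datetime input.
    print_task = print_every_time(date_triggered=triggered_date)


# Default launch plan; inputs are supplied when an execution is launched.
spark_tasks_lp = SparkTasksWorkflowComplete.create_launch_plan()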
Example 3
from __future__ import absolute_import

from flytekit.sdk.tasks import inputs
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output
from flytekit.common.tasks.presto_task import SdkPrestoTask

schema = Types.Schema([("a", Types.Integer), ("b", Types.String)])

presto_task = SdkPrestoTask(
    task_inputs=inputs(length=Types.Integer, rg=Types.String),
    statement=
    "SELECT a, chr(a+64) as b from unnest(sequence(1, {{ .Inputs.length }})) t(a)",
    output_schema=schema,
    routing_group="{{ .Inputs.rg }}",
    catalog="hive",  # can be left out if you specify in query
    schema="tmp",  # can be left out if you specify in query
)


@workflow_class()
class PrestoWorkflow(object):
    length = Input(Types.Integer, required=True, help="Int between 1 and 26")
    routing_group = Input(Types.String,
                          required=True,
                          help="Test string with no default")
    p_task = presto_task(length=length, rg=routing_group)
    output_a = Output(p_task.outputs.results, sdk_type=schema)
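
For illustration only (the concrete value below is not part of the example): with length=3 the templated statement renders to the query shown here, and the task writes three rows into the declared schema.

# Rendered statement for length=3:
#   SELECT a, chr(a+64) as b from unnest(sequence(1, 3)) t(a)
# Result rows: (1, 'A'), (2, 'B'), (3, 'C')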
Example 4
    def get_sdk_node(
            self,
            pipeline_context,
            instance,
            pipeline_run,
            step_key,
            task_type=constants.SdkTaskType.PYTHON_TASK,
            cache_version="",
            retries=0,
            interruptible=False,
            deprecated="",
            storage_request=None,
            cpu_request=None,
            gpu_request=None,
            memory_request=None,
            storage_limit=None,
            cpu_limit=None,
            gpu_limit=None,
            memory_limit=None,
            cache=False,
            timeout=datetime.timedelta(seconds=0),
            environment=None,
    ):
        # Look up the dagster execution step for this key and translate its
        # inputs/outputs into Flyte interface entries.
        execution_step = self.execution_plan.get_step_by_key(step_key)
        flyte_inputs = self.flyte_inputs(execution_step.step_input_dict,
                                         execution_step.solid_name)
        flyte_outputs = self.flyte_outputs(execution_step.step_output_dict,
                                           execution_step.solid_name)

        def wrapper(wf_params, *args, **kwargs):  # pylint: disable=unused-argument
            # TODO: We can't update config values via inputs from Flyte, because they are immutable
            plan = self.execution_plan.build_subset_plan([step_key])
            for param, arg in kwargs.items():
                self.inject_intermediates(pipeline_context, execution_step,
                                          param, arg)

            results = list(
                execute_plan(
                    plan,
                    instance,
                    run_config=self.run_config,
                    pipeline_run=pipeline_run,
                ))

            for result in results:
                step_context = pipeline_context.for_step(execution_step)
                self.output_value(step_context, step_key, result,
                                  execution_step, kwargs)

        # This will take the wrapper definition and re-create it with explicit parameters as keyword arguments
        wrapper = forge.sign(forge.arg("wf_params"),
                             *map(forge.arg, flyte_inputs.keys()),
                             *map(forge.arg, flyte_outputs.keys()))(wrapper)

        # flytekit uses this name for an internal representation, make it unique to the step key
        wrapper.__name__ = execution_step.solid_name

        task = sdk_runnable.SdkRunnableTask(
            task_function=wrapper,
            task_type=task_type,
            discovery_version=cache_version,
            retries=retries,
            interruptible=interruptible,
            deprecated=deprecated,
            storage_request=storage_request,
            cpu_request=cpu_request,
            gpu_request=gpu_request,
            memory_request=memory_request,
            storage_limit=storage_limit,
            cpu_limit=cpu_limit,
            gpu_limit=gpu_limit,
            memory_limit=memory_limit,
            discoverable=cache,
            timeout=timeout,
            environment=environment,
            custom={},
        )

        if flyte_inputs:
            task = inputs(task, **flyte_inputs)
        if flyte_outputs:
            task = outputs(task, **flyte_outputs)

        return task
Example 5
from flytekit.common.tasks.presto_task import SdkPrestoTask
from flytekit.sdk.tasks import inputs
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import Input, Output, workflow_class

schema = Types.Schema([("a", Types.String), ("b", Types.Integer)])

presto_task = SdkPrestoTask(
    task_inputs=inputs(ds=Types.String, rg=Types.String),
    statement=
    "SELECT * FROM hive.city.fact_airport_sessions WHERE ds = '{{ .Inputs.ds}}' LIMIT 10",
    output_schema=schema,
    routing_group="{{ .Inputs.rg }}",
    # catalog="hive",
    # schema="city",
)


@workflow_class()
class PrestoWorkflow(object):
    ds = Input(Types.String, required=True, help="Test string with no default")
    # routing_group = Input(Types.String, required=True, help="Test string with no default")

    p_task = presto_task(ds=ds, rg="etl")

    output_a = Output(p_task.outputs.results, sdk_type=schema)
Example 6
from flytekit.contrib.notebook.tasks import python_notebook, spark_notebook
from flytekit.sdk.tasks import inputs, outputs
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import Input, workflow_class

interactive_python = python_notebook(
    notebook_path="../../../../notebook-task-examples/python-notebook.ipynb",
    inputs=inputs(pi=Types.Float),
    outputs=outputs(out=Types.Float),
    cpu_request="1",
    memory_request="1G",
)

interactive_spark = spark_notebook(
    notebook_path="../../../../notebook-task-examples/spark-notebook-pi.ipynb",
    inputs=inputs(partitions=Types.Integer),
    outputs=outputs(pi=Types.Float),
)


@workflow_class
class FlyteNotebookSparkWorkflow(object):
    partitions = Input(Types.Integer, default=10)
    out1 = interactive_spark(partitions=partitions)
    out2 = interactive_python(pi=out1.outputs.pi)
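
The workflow above does not declare a workflow-level output. A minimal sketch of how the final notebook value could be exposed, following the Output pattern from the Presto examples (the class and output names here are illustrative additions):

from flytekit.sdk.workflow import Output


@workflow_class
class FlyteNotebookSparkWorkflowWithOutput(object):
    partitions = Input(Types.Integer, default=10)
    out1 = interactive_spark(partitions=partitions)
    out2 = interactive_python(pi=out1.outputs.pi)
    # Surface the python notebook's `out` value as a workflow output.
    final_pi = Output(out2.outputs.out, sdk_type=Types.Float)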
Example 7
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from flytekit.sdk.types import Types
from flytekit.sdk.tasks import inputs, outputs

from flytekit.sdk.workflow import workflow_class, Input
from flytekit.contrib.notebook.tasks import python_notebook

# The path to the example notebook on the local filesystem.
interactive_python = python_notebook(
    notebook_path="./notebook-task-examples/python-notebook.ipynb",
    inputs=inputs(pi=Types.Float),
    outputs=outputs(out=Types.Float),
    cpu_request="1",
    memory_request="1G")


@workflow_class
class FlyteNotebookWorkflow(object):
    out2 = interactive_python(pi=3.14)