Example #1
def record_outputs(**kwargs) -> str:
    """
    Use this method to record outputs from a notebook.
    It will convert all outputs to a Flyte understandable format. For Files, Directories, please use FlyteFile or
    FlyteDirectory, or wrap up your paths in these decorators.
    """
    if not kwargs:
        return ""

    m = {}
    ctx = FlyteContext.current_context()
    for k, v in kwargs.items():
        expected = TypeEngine.to_literal_type(type(v))
        lit = TypeEngine.to_literal(ctx,
                                    python_type=type(v),
                                    python_val=v,
                                    expected=expected)
        m[k] = lit
    return LiteralMap(literals=m).to_flyte_idl()
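
# %%
# A minimal usage sketch (not part of the original snippet; values are
# illustrative): call record_outputs in the notebook cell tagged "outputs"
# so the NotebookTask can extract these values after execution.
from flytekitplugins.papermill import record_outputs

square = 4
record_outputs(square=square)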
Example #2
    def execute(self, **kwargs) -> Any:
        """
        TODO: Figure out how to share FlyteContext ExecutionParameters with the notebook kernel (the notebook
              kernel runs in a separate Python process).
        For Spark, notebooks currently need to call new_session or getOrCreate and grab a handle to the
        singleton session.
        """
        logger.info(
            f"Hijacking the call for task-type {self.task_type}, to call notebook."
        )
        # Execute Notebook via Papermill.
        pm.execute_notebook(self._notebook_path,
                            self.output_notebook_path,
                            parameters=kwargs)  # type: ignore

        outputs = self.extract_outputs(self.output_notebook_path)
        self.render_nb_html(self.output_notebook_path,
                            self.rendered_output_path)

        m = {}
        if outputs:
            m = outputs.literals
        output_list = []
        for k, type_v in self.python_interface.outputs.items():
            if k == self._IMPLICIT_OP_NOTEBOOK:
                output_list.append(self.output_notebook_path)
            elif k == self._IMPLICIT_RENDERED_NOTEBOOK:
                output_list.append(self.rendered_output_path)
            elif k in m:
                v = TypeEngine.to_python_value(
                    ctx=FlyteContext.current_context(),
                    lv=m[k],
                    expected_python_type=type_v)
                output_list.append(v)
            else:
                raise RuntimeError(
                    f"Expected output {k} of type {type_v} not found in the notebook outputs"
                )

        return tuple(output_list)
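
# %%
# A sketch of how this execute() path is reached (name and paths are
# illustrative): declaring a NotebookTask makes flytekit run the notebook via
# papermill, pull the recorded outputs, and return them along with the
# executed and rendered notebooks.
from flytekit import kwtypes
from flytekitplugins.papermill import NotebookTask

nb_task = NotebookTask(
    name="simple-nb",
    notebook_path="analysis.ipynb",
    inputs=kwtypes(n=int),
    outputs=kwtypes(square=int),
)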
Example #3
        w.write(python_val)
        return Literal(scalar=Scalar(
            schema=Schema(remote_path, self._get_schema_type())))

    def to_python_value(
            self, ctx: FlyteContext, lv: Literal,
            expected_python_type: Type[pyspark.sql.DataFrame]) -> T:
        if not (lv and lv.scalar and lv.scalar.schema):
            # pyspark.sql.DataFrame cannot be constructed without an active
            # session, so fail loudly instead of returning a broken empty frame.
            raise AssertionError("Can only convert a literal schema to a Spark DataFrame")
        r = SparkDataFrameSchemaReader(from_path=lv.scalar.schema.uri,
                                       cols=None,
                                       fmt=SchemaFormat.PARQUET)
        return r.all()


# %%
# Registers a handler for the Spark DataFrame <-> Flyte Schema type transition.
# This allows open(pyspark.sql.DataFrame) to be an acceptable type.
SchemaEngine.register_handler(
    SchemaHandler(
        "pyspark.sql.DataFrame-Schema",
        pyspark.sql.DataFrame,
        SparkDataFrameSchemaReader,
        SparkDataFrameSchemaWriter,
        handles_remote_io=True,
    ))

# %%
# This makes pyspark.sql.DataFrame a supported input/output type in flytekit.
TypeEngine.register(SparkDataFrameTransformer())
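
# %%
# A usage sketch (assumes flytekitplugins-spark is installed; the task config
# is illustrative): with the handler and transformer registered, a Spark task
# can return a pyspark.sql.DataFrame directly.
import flytekit
from flytekit import task
from flytekitplugins.spark import Spark

@task(task_config=Spark(spark_conf={"spark.driver.memory": "1g"}))
def make_df(n: int) -> pyspark.sql.DataFrame:
    sess = flytekit.current_context().spark_session
    return sess.range(n)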
Example #4
        remote_path = ctx.file_access.get_random_remote_directory()
        ctx.file_access.put_data(local_dir, remote_path, is_multipart=True)
        return Literal(scalar=Scalar(
            schema=Schema(remote_path, self._get_schema_type())))

    def to_python_value(
        self,
        ctx: FlyteContext,
        lv: Literal,
        expected_python_type: Type[modin.pandas.DataFrame],
    ) -> T:
        if not (lv and lv.scalar and lv.scalar.schema):
            return modin.pandas.DataFrame()
        local_dir = ctx.file_access.get_random_local_directory()
        ctx.file_access.download_directory(lv.scalar.schema.uri, local_dir)
        r = ModinPandasSchemaReader(from_path=local_dir,
                                    cols=None,
                                    fmt=SchemaFormat.PARQUET)
        return r.all()


# Registers a handler for the modin DataFrame <-> Flyte Schema type transition.
SchemaEngine.register_handler(
    SchemaHandler(
        "modin.pandas.Dataframe-Schema",
        modin.pandas.DataFrame,
        ModinPandasSchemaReader,
        ModinPandasSchemaWriter,
    ))

# This makes modin.pandas.DataFrame a supported input/output type in flytekit.
TypeEngine.register(ModinPandasDataFrameTransformer())
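
# %%
# A usage sketch (illustrative): once registered, modin DataFrames pass
# between tasks like any other supported type, stored as parquet under the
# hood by the schema handler above.
import modin.pandas
from flytekit import task

@task
def double(df: modin.pandas.DataFrame) -> modin.pandas.DataFrame:
    return df * 2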
Example #5
    def get_literal_type(self,
                         t: Type[_params.ParameterRangeOneOf]) -> LiteralType:
        return primitives.Generic.to_flyte_literal_type()

    def to_literal(
        self,
        ctx: FlyteContext,
        python_val: _params.ParameterRangeOneOf,
        python_type: Type[_params.ParameterRangeOneOf],
        expected: LiteralType,
    ) -> Literal:
        d = MessageToDict(python_val.to_flyte_idl())
        return DictTransformer.dict_to_generic_literal(d)

    def to_python_value(
        self, ctx: FlyteContext, lv: Literal,
        expected_python_type: Type[_params.ParameterRangeOneOf]
    ) -> _params.ParameterRangeOneOf:
        if lv and lv.scalar and lv.scalar.generic is not None:
            d = json.loads(json_format.MessageToJson(lv.scalar.generic))
            o = _pb2_params.ParameterRangeOneOf()
            o = json_format.ParseDict(d, o)
            return _params.ParameterRangeOneOf.from_flyte_idl(o)
        return None


# %%
# Register the types
TypeEngine.register(HPOTuningJobConfigTransformer())
TypeEngine.register(ParameterRangesTransformer())
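
# %%
# A minimal sketch (hypothetical values) of the protobuf Struct round trip the
# transformers above rely on: any JSON-compatible dict survives the trip into
# a "generic" literal and back unchanged.
import json

from google.protobuf import json_format, struct_pb2

s = struct_pb2.Struct()
json_format.ParseDict({"min_value": 0.0, "max_value": 1.0}, s)
assert json.loads(json_format.MessageToJson(s)) == {"min_value": 0.0, "max_value": 1.0}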
Example #6
                "batch_request":
                final_batch_request,
                "expectation_suite_name":
                ge_conf.expectation_suite_name,
            }],
        )
        final_result = convert_to_json_serializable(
            checkpoint_result.list_validation_results())[0]

        result_string = ""
        if final_result["success"] is False:
            for every_result in final_result["results"]:
                if every_result["success"] is False:
                    result_string += (
                        f"{every_result['expectation_config']['kwargs']['column']} -> "
                        f"{every_result['expectation_config']['expectation_type']}\n"
                    )

            # raise Great Expectations' ValidationError
            raise ValidationError(
                "Validation failed!\nCOLUMN\t\tFAILED EXPECTATION\n" +
                result_string)

        logger.info("Validation succeeded!")

        return typing.cast(GreatExpectationsType, return_dataset)


TypeEngine.register(GreatExpectationsTypeTransformer())
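
# %%
# A usage sketch following the flytesnacks examples (datasource, suite, and
# connector names are illustrative): annotating an input with
# GreatExpectationsType runs this validation before the task body executes.
from flytekit import task
from flytekitplugins.great_expectations import (
    GreatExpectationsFlyteConfig,
    GreatExpectationsType,
)

ge_config = GreatExpectationsFlyteConfig(
    datasource_name="data",
    expectation_suite_name="test.demo",
    data_connector_name="data_example_data_connector",
)

@task
def validated(dataset: GreatExpectationsType[str, ge_config]) -> str:
    return dataset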
Example #7
    def to_python_value(
        self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[MyDataset]
    ) -> MyDataset:
        """
        In this method, we want to be able to re-hydrate the custom object from Flyte Literal value.
        """
        # Step 1: let's download remote data locally
        local_dir = ctx.file_access.get_random_local_directory()
        ctx.file_access.download_directory(lv.scalar.blob.uri, local_dir)
        # Step 2: create the ``MyDataset`` object
        return MyDataset(base_dir=local_dir)


# %%
# Before we can use MyDataset in our tasks, we need to let Flytekit know that ``MyDataset`` should be considered a valid type.
# This is done using :py:class:`~flytekit:flytekit.extend.TypeEngine`'s ``register`` method.
TypeEngine.register(MyDatasetTransformer())


# %%
# The new type should be ready to use! Let us write an example generator and consumer for this new datatype.
@task
def generate() -> MyDataset:
    d = MyDataset()
    for i in range(3):
        fp = d.new_file(f"x{i}")
        with open(fp, "w") as f:
            f.write(f"Contents of file{i}")

    return d

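# %%
# And the consumer (a sketch; assumes ``MyDataset`` exposes the ``base_dir``
# it was constructed with in to_python_value above):
import os

@task
def consume(d: MyDataset) -> str:
    s = ""
    for i in range(3):
        fp = os.path.join(d.base_dir, f"x{i}")
        with open(fp, "r") as f:  # read back the files written by generate()
            s += f.read()
    return s
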
Example #8
    def get_literal_type(self, t: Type[DatasetProfileView]) -> LiteralType:
        return LiteralType(blob=self._TYPE_INFO)

    def to_literal(
        self,
        ctx: FlyteContext,
        python_val: DatasetProfileView,
        python_type: Type[DatasetProfileView],
        expected: LiteralType,
    ) -> Literal:
        remote_path = ctx.file_access.get_random_remote_directory()
        local_path = ctx.file_access.get_random_local_path()
        python_val.write(local_path)
        ctx.file_access.upload(local_path, remote_path)
        return Literal(scalar=Scalar(blob=Blob(uri=remote_path, metadata=BlobMetadata(type=self._TYPE_INFO))))

    def to_python_value(self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[DatasetProfileView]) -> T:
        local_path = ctx.file_access.get_random_local_path()
        ctx.file_access.download(lv.scalar.blob.uri, local_path)
        return DatasetProfileView.read(local_path)

    def to_html(
        self, ctx: FlyteContext, python_val: DatasetProfileView, expected_python_type: Type[DatasetProfileView]
    ) -> str:
        pandas_profile = python_val.to_pandas().to_html()
        header = "<h1>Profile View</h1>\n"
        return header + pandas_profile


TypeEngine.register(WhylogsDatasetProfileTransformer())
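
# %%
# A usage sketch (assumes whylogs v1; data is illustrative): a task can now
# return a DatasetProfileView, which this transformer serializes as a blob.
import pandas as pd
import whylogs as why
from flytekit import task
from whylogs.core import DatasetProfileView

@task
def profile(df: pd.DataFrame) -> DatasetProfileView:
    return why.log(df).view()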
Example #9
            remote_path = ctx.file_access.get_random_remote_directory()
            ctx.file_access.put_data(local_dir, remote_path, is_multipart=True)
            return Literal(scalar=Scalar(schema=Schema(
                remote_path, self._get_schema_type(python_type))))
        else:
            raise AssertionError(
                f"Only a pandas DataFrame object can be returned from a task; got object of type {type(python_val)}"
            )

    def to_python_value(
        self, ctx: FlyteContext, lv: Literal,
        expected_python_type: Type[pandera.typing.DataFrame]
    ) -> pandera.typing.DataFrame:
        if not (lv and lv.scalar and lv.scalar.schema):
            raise AssertionError(
                "Can only convert a literal schema to a pandera schema")

        def downloader(remote_path, local_path):
            ctx.file_access.download_directory(remote_path, local_path)

        df = FlyteSchema(
            local_path=ctx.file_access.get_random_local_directory(),
            remote_path=lv.scalar.schema.uri,
            downloader=downloader,
            supported_mode=SchemaOpenMode.READ,
        )
        return self._pandera_schema(expected_python_type)(df.open().all())


TypeEngine.register(PanderaTransformer())
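
# %%
# A usage sketch (schema and data are illustrative): annotating task outputs
# with pandera.typing.DataFrame[Schema] makes flytekit run the pandera
# validation in this transformer on every hand-off.
import pandas as pd
import pandera
from flytekit import task
from pandera.typing import DataFrame, Series

class OutSchema(pandera.SchemaModel):
    a: Series[int]

@task
def make() -> DataFrame[OutSchema]:
    return pd.DataFrame({"a": [1, 2, 3]})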
Example #10
        ctx: FlyteContext,
        lv: Literal,
        expected_python_type: typing.Type[DoltTable],
    ) -> DoltTable:
        if not (lv and lv.scalar and lv.scalar.generic and "config" in lv.scalar.generic):
            raise ValueError("DoltTable requires DoltConfig to load python value")

        conf_dict = MessageToDict(lv.scalar.generic["config"])

        conf = DoltConfig(**conf_dict)
        db = dolt.Dolt(conf.db_path)

        with tempfile.NamedTemporaryFile() as f:
            dolt_int.load(
                db=db,
                tablename=conf.tablename,
                sql=conf.sql,
                filename=f.name,
                branch_conf=conf.branch_conf,
                meta_conf=conf.meta_conf,
                remote_conf=conf.remote_conf,
                load_args=conf.io_args,
            )
            df = pandas.read_csv(f)

        # Rebuild the DoltTable (the declared return type) from its config and
        # the loaded dataframe, rather than mutating and returning the Literal.
        return DoltTable(config=conf, data=df)


TypeEngine.register(DoltTableNameTransformer())
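
# %%
# A usage sketch (paths and names are illustrative; assumes
# flytekitplugins-dolt): DoltConfig pins a database and table, and tasks
# exchange DoltTable values built from it.
from flytekit import task
from flytekitplugins.dolt.schema import DoltConfig, DoltTable

users_conf = DoltConfig(db_path="path/to/dolt/db", tablename="users")

@task
def read_users() -> DoltTable:
    return DoltTable(config=users_conf)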