Exemplo n.º 1
0
def test_file_handling_local_file_gets_force_no_copy():
    """A local ``FlyteFile`` built with ``remote_path=False`` must not be uploaded.

    The workflow returns this test module wrapped in a ``FlyteFile`` and the test
    asserts that nothing appears under the mock remote prefix and that the
    returned path is the original local path.
    """

    @task
    def t1() -> FlyteFile:
        # This test module is a convenient file that is guaranteed to exist.
        return FlyteFile(__file__, remote_path=False)

    @workflow
    def my_wf() -> FlyteFile:
        return t1()

    random_dir = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    mock_remote = os.path.join(random_dir, "mock_remote")
    provider = FileAccessProvider(local_sandbox_dir=random_dir, raw_output_prefix=mock_remote)
    base_ctx = context_manager.FlyteContext.current_context()
    with context_manager.FlyteContextManager.with_context(base_ctx.with_file_access(provider)):
        # Before the run only the sandbox folder should be present.
        assert len(os.listdir(random_dir)) == 1

        result = my_wf()

        # Copying was disabled, so the mock remote location must never have been created.
        assert not os.path.exists(mock_remote)

        # The uri is a plain local path, so what comes back is the original path.
        assert result.path == __file__
Exemplo n.º 2
0
def test_file_handling_local_file_gets_copied():
    """A task that returns a bare local path as a ``FlyteFile`` gets it copied.

    After the workflow runs, exactly one file is expected under the mock remote
    prefix, and the returned path is expected to live under the random directory.
    """

    @task
    def t1() -> FlyteFile:
        # This test module is a convenient file that is guaranteed to exist.
        return __file__

    @workflow
    def my_wf() -> FlyteFile:
        return t1()

    random_dir = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    mock_remote = os.path.join(random_dir, "mock_remote")
    provider = FileAccessProvider(local_sandbox_dir=random_dir, raw_output_prefix=mock_remote)
    base_ctx = context_manager.FlyteContext.current_context()
    with context_manager.FlyteContextManager.with_context(base_ctx.with_file_access(provider)):
        # Before the run only the sandbox folder should be present.
        assert len(os.listdir(random_dir)) == 1

        returned = my_wf()

        # Running the workflow should have placed exactly one file in the mock remote folder.
        uploaded = os.listdir(mock_remote)
        assert len(uploaded) == 1
        # The returned path points inside the random directory rather than at the original file.
        assert returned.path.startswith(random_dir)
Exemplo n.º 3
0
 def convert(self, value: typing.Any,
             param: typing.Optional[click.Parameter],
             ctx: typing.Optional[click.Context]) -> typing.Any:
     """Turn a command-line value into a ``FileParam``.

     Remote locations (as judged by ``FileAccessProvider.is_remote``) are passed
     through untouched; local values must name an existing regular file and
     are stored as a resolved path string.

     :param value: raw value supplied on the command line.
     :param param: click parameter being converted (unused here).
     :param ctx: click context (unused here).
     :raises click.BadParameter: if the value is neither remote nor an existing file.
     """
     if FileAccessProvider.is_remote(value):
         return FileParam(filepath=value)

     candidate = pathlib.Path(value)
     if not (candidate.exists() and candidate.is_file()):
         raise click.BadParameter(
             f"parameter should be a valid file path, {value}")
     return FileParam(filepath=str(candidate.resolve()))
Exemplo n.º 4
0
def test_file_handling_remote_file_handling_flyte_file():
    """Check lazy handling of a ``FlyteFile`` that wraps an https URL.

    The workflow returns a ``FlyteFile`` built from a remote address. The test
    asserts that nothing is written to the mock remote prefix, that the returned
    object points at a local path while remembering its remote source, and that
    the file only exists locally after it has been opened.
    """
    SAMPLE_DATA = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

    @task
    def t1() -> FlyteFile:
        # Return the remote URL wrapped in a FlyteFile object.
        return FlyteFile(SAMPLE_DATA)

    @workflow
    def my_wf() -> FlyteFile:
        return t1()

    # This creates a random directory that we know is empty.
    random_dir = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    # Creating a new FileAccessProvider populates the random dir; a single entry is asserted below.
    fs = FileAccessProvider(local_sandbox_dir=random_dir, raw_output_prefix=os.path.join(random_dir, "mock_remote"))
    ctx = context_manager.FlyteContext.current_context()
    with context_manager.FlyteContextManager.with_context(ctx.with_file_access(fs)):
        working_dir = os.listdir(random_dir)
        assert len(working_dir) == 1  # the local_flytekit dir

        mock_remote_path = os.path.join(random_dir, "mock_remote")
        assert not os.path.exists(mock_remote_path)  # the persistence layer won't create the folder yet

        workflow_output = my_wf()

        # After running, the mock remote dir should still not exist: a remote source is never uploaded.
        assert not os.path.exists(mock_remote_path)

        # While the literal returned by t1 does contain the web address as the uri, because it's a remote address,
        # flytekit will translate it back into a FlyteFile object on the local drive (but not download it)
        assert workflow_output.path.startswith(f"{random_dir}{os.sep}local_flytekit")
        # But the remote source should still be the https address
        assert workflow_output.remote_source == SAMPLE_DATA

        # The act of running the workflow should create the engine dir, and the directory that will contain the
        # file but the file itself isn't downloaded yet.
        working_dir = os.listdir(os.path.join(random_dir, "local_flytekit"))
        assert len(working_dir) == 2  # the engine dir and the (still empty) dir reserved for the download

        assert not os.path.exists(workflow_output.path)
        # The act of opening it should trigger the download, since we do lazy downloading.
        with open(workflow_output, "rb"):
            ...
        # Before open() this layer held two entries: a random dir generated by the new_execution_context call
        # and an empty folder created by the FlyteFile transformer's to_python_value function.
        # NOTE(review): three entries are asserted after open(); confirm what creates the third one.
        working_dir = os.listdir(os.path.join(random_dir, "local_flytekit"))
        assert len(working_dir) == 3  # one more entry than before the file was opened

        assert os.path.exists(workflow_output.path)

        # The file name is maintained on download.
        assert str(workflow_output).endswith(os.path.split(SAMPLE_DATA)[1])
Exemplo n.º 5
0
def test_transformer_to_literal_local():
    """Exercise ``FlyteDirToMultipartBlobTransformer.to_literal`` on local inputs.

    Covers an empty directory, a directory with one file, a non-string
    primitive, and a path that is a regular file rather than a directory.
    """

    def recreate_dir(directory):
        # Wipe any leftovers so each scenario starts from an empty directory.
        if os.path.exists(directory):
            shutil.rmtree(directory)
        pathlib.Path(directory).mkdir(parents=True)

    random_dir = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    provider = FileAccessProvider(local_sandbox_dir=random_dir, raw_output_prefix=os.path.join(random_dir, "raw"))
    base_ctx = context_manager.FlyteContext.current_context()
    with context_manager.FlyteContextManager.with_context(base_ctx.with_file_access(provider)) as ctx:
        # A fixed location outside anything flytekit generates, so we never end up copying a folder
        # into one of its own sub-folders (cp -R /some/folder /some/folder/sub), which raises.
        source_dir = "/tmp/flyte/test_fd_transformer"

        # Scenario 1: an empty directory.
        recreate_dir(source_dir)

        transformer = FlyteDirToMultipartBlobTransformer()
        literal_type = transformer.get_literal_type(FlyteDirectory)
        literal = transformer.to_literal(ctx, FlyteDirectory(source_dir), FlyteDirectory, literal_type)
        assert literal.scalar.blob.uri.startswith(random_dir)

        # Scenario 2: a directory holding a single file.
        recreate_dir(source_dir)
        with open(os.path.join(source_dir, "xyz"), "w") as handle:
            handle.write("Hello world\n")
        literal = transformer.to_literal(ctx, FlyteDirectory(source_dir), FlyteDirectory, literal_type)

        uploaded = os.listdir(literal.scalar.blob.uri)
        assert uploaded == ["xyz"]

        # Scenario 3: the only primitives allowed are strings.
        with pytest.raises(AssertionError):
            transformer.to_literal(ctx, 3, FlyteDirectory, literal_type)

        with pytest.raises(TypeError, match="No automatic conversion from <class 'int'>"):
            TypeEngine.to_literal(ctx, 3, FlyteDirectory, literal_type)

        # Scenario 4: a regular file cannot be used as a directory.
        with pytest.raises(FlyteAssertion):
            file_path = "/tmp/flyte/xyz"
            stale = pathlib.Path(file_path)
            try:
                stale.unlink()
            except OSError:
                ...
            with open(file_path, "w") as handle:
                handle.write("hello world\n")
            transformer.to_literal(ctx, FlyteDirectory(file_path), FlyteDirectory, literal_type)
Exemplo n.º 6
0
 def convert(self, value: typing.Any,
             param: typing.Optional[click.Parameter],
             ctx: typing.Optional[click.Context]) -> typing.Any:
     """Turn a command-line value into a ``Directory``.

     Remote locations (as judged by ``FileAccessProvider.is_remote``) are wrapped
     with ``local=False``. A local value must be an existing directory holding
     exactly one entry, whose resolved path is recorded as ``local_file``.

     :param value: raw value supplied on the command line.
     :param param: click parameter being converted (unused here).
     :param ctx: click context (unused here).
     :raises ValueError: if the local directory does not contain exactly one entry.
     :raises click.BadParameter: if the value is neither remote nor an existing directory.
     """
     if FileAccessProvider.is_remote(value):
         return Directory(dir_path=value, local=False)

     directory = pathlib.Path(value)
     if not (directory.exists() and directory.is_dir()):
         raise click.BadParameter(
             f"parameter should be a valid directory path, {value}")

     entries = list(directory.iterdir())
     if len(entries) != 1:
         raise ValueError(
             f"Currently only directories containing one file are supported, found [{len(entries)}] files found in {directory.resolve()}"
         )
     (only_entry,) = entries
     return Directory(dir_path=value, local_file=only_entry.resolve())
Exemplo n.º 7
0
def test_transformer_to_literal_remote():
    """A ``FlyteDirectory`` pointing at a remote uri keeps that uri in the literal."""
    random_dir = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    provider = FileAccessProvider(local_sandbox_dir=random_dir, raw_output_prefix=os.path.join(random_dir, "raw"))
    base_ctx = context_manager.FlyteContext.current_context()
    with context_manager.FlyteContextManager.with_context(base_ctx.with_file_access(provider)) as ctx:
        # A fixed location outside anything flytekit generates, so we never end up copying a folder
        # into one of its own sub-folders (cp -R /some/folder /some/folder/sub), which raises.
        source_dir = "/tmp/flyte/test_fd_transformer"
        # Start from a freshly created, empty directory.
        if os.path.exists(source_dir):
            shutil.rmtree(source_dir)
        pathlib.Path(source_dir).mkdir(parents=True)

        transformer = FlyteDirToMultipartBlobTransformer()
        literal_type = transformer.get_literal_type(FlyteDirectory)

        # The remote uri is expected to come back unchanged.
        remote_literal = transformer.to_literal(ctx, FlyteDirectory("s3://anything"), FlyteDirectory, literal_type)
        assert remote_literal.scalar.blob.uri == "s3://anything"
Exemplo n.º 8
0
def setup_execution(
    raw_output_data_prefix: str,
    checkpoint_path: Optional[str] = None,
    prev_checkpoint: Optional[str] = None,
    dynamic_addl_distro: Optional[str] = None,
    dynamic_dest_dir: Optional[str] = None,
):
    """
    Build the Flyte context used to execute a task and yield it.

    The function looks up execution and task identifiers through ``get_one_of``, creates a user workspace
    directory, assembles ``ExecutionParameters``, creates a ``FileAccessProvider`` backed by a fresh temporary
    sandbox, and enters a new context in ``TASK_EXECUTION`` mode. When the ``SERIALIZED_CONTEXT_ENV_VAR``
    environment variable is non-empty, serialization settings are restored from it and attached as well.

    NOTE(review): this is a generator with a single ``yield``; presumably it is wrapped by a context-manager
    decorator that is not visible in this excerpt - confirm.

    :param raw_output_data_prefix: Passed as ``raw_output_prefix`` to both the ``ExecutionParameters`` and the
      ``FileAccessProvider``.
    :param checkpoint_path: If not None, a ``SyncCheckpoint`` is created with this value as ``checkpoint_dest``.
    :param prev_checkpoint: Used as ``checkpoint_src`` of the ``SyncCheckpoint``; only read when
      ``checkpoint_path`` is given.
    :param dynamic_addl_distro: Works in concert with the other dynamic arg. If present, indicates that if a dynamic
      task were to run, it should set fast serialize to true and use these values in FastSerializationSettings
    :param dynamic_dest_dir: See above.
    :return: Yields the context entered via ``FlyteContextManager.with_context``.
    :raises TypeError: Re-raised (after logging) when constructing the ``FileAccessProvider`` fails with it.
    """
    # Each identifier is looked up under a long name and a short alias.
    exe_project = get_one_of("FLYTE_INTERNAL_EXECUTION_PROJECT", "_F_PRJ")
    exe_domain = get_one_of("FLYTE_INTERNAL_EXECUTION_DOMAIN", "_F_DM")
    exe_name = get_one_of("FLYTE_INTERNAL_EXECUTION_ID", "_F_NM")
    exe_wf = get_one_of("FLYTE_INTERNAL_EXECUTION_WORKFLOW", "_F_WF")
    exe_lp = get_one_of("FLYTE_INTERNAL_EXECUTION_LAUNCHPLAN", "_F_LP")

    tk_project = get_one_of("FLYTE_INTERNAL_TASK_PROJECT", "_F_TK_PRJ")
    tk_domain = get_one_of("FLYTE_INTERNAL_TASK_DOMAIN", "_F_TK_DM")
    tk_name = get_one_of("FLYTE_INTERNAL_TASK_NAME", "_F_TK_NM")
    tk_version = get_one_of("FLYTE_INTERNAL_TASK_VERSION", "_F_TK_V")

    # An empty string means no serialized settings were supplied; see the check further down.
    compressed_serialization_settings = os.environ.get(SERIALIZED_CONTEXT_ENV_VAR, "")

    ctx = FlyteContextManager.current_context()
    # Create directories
    user_workspace_dir = ctx.file_access.get_random_local_directory()
    logger.info(f"Using user directory {user_workspace_dir}")
    pathlib.Path(user_workspace_dir).mkdir(parents=True, exist_ok=True)
    from flytekit import __version__ as _api_version

    # A checkpointer is only created when a destination path was supplied.
    checkpointer = None
    if checkpoint_path is not None:
        checkpointer = SyncCheckpoint(checkpoint_dest=checkpoint_path, checkpoint_src=prev_checkpoint)
        logger.debug(f"Checkpointer created with source {prev_checkpoint} and dest {checkpoint_path}")

    execution_parameters = ExecutionParameters(
        execution_id=_identifier.WorkflowExecutionIdentifier(
            project=exe_project,
            domain=exe_domain,
            name=exe_name,
        ),
        execution_date=_datetime.datetime.utcnow(),
        stats=_get_stats(
            cfg=StatsConfig.auto(),
            # Stats metric path will be:
            # registration_project.registration_domain.app.module.task_name.user_stats
            # and it will be tagged with execution-level values for project/domain/wf/lp
            prefix=f"{tk_project}.{tk_domain}.{tk_name}.user_stats",
            tags={
                "exec_project": exe_project,
                "exec_domain": exe_domain,
                "exec_workflow": exe_wf,
                "exec_launchplan": exe_lp,
                "api_version": _api_version,
            },
        ),
        logging=user_space_logger,
        tmp_dir=user_workspace_dir,
        raw_output_prefix=raw_output_data_prefix,
        checkpoint=checkpointer,
        task_id=_identifier.Identifier(_identifier.ResourceType.TASK, tk_project, tk_domain, tk_name, tk_version),
    )

    # The sandbox is a newly created temporary directory, separate from the user workspace above.
    try:
        file_access = FileAccessProvider(
            local_sandbox_dir=tempfile.mkdtemp(prefix="flyte"),
            raw_output_prefix=raw_output_data_prefix,
        )
    except TypeError:  # would be thrown from DataPersistencePlugins.find_plugin
        logger.error(f"No data plugin found for raw output prefix {raw_output_data_prefix}")
        raise

    es = ctx.new_execution_state().with_params(
        mode=ExecutionState.Mode.TASK_EXECUTION,
        user_space_params=execution_parameters,
    )
    cb = ctx.new_builder().with_file_access(file_access).with_execution_state(es)

    if compressed_serialization_settings:
        # Restore the transported settings, then override project/domain/version with the values read above.
        ss = SerializationSettings.from_transport(compressed_serialization_settings)
        ssb = ss.new_builder()
        ssb.project = exe_project
        ssb.domain = exe_domain
        ssb.version = tk_version
        if dynamic_addl_distro:
            ssb.fast_serialization_settings = FastSerializationSettings(
                enabled=True,
                destination_dir=dynamic_dest_dir,
                distribution_location=dynamic_addl_distro,
            )
        cb = cb.with_serialization_settings(ssb.build())

    with FlyteContextManager.with_context(cb) as ctx:
        yield ctx
Exemplo n.º 9
0
def test_is_remote():
    """Only the ``s3://`` path counts as remote; relative, absolute and ``file://`` paths do not."""
    provider = FileAccessProvider("/tmp", "s3://my-bucket")
    expectations = (
        ("./checkpoint", False),
        ("/tmp/foo/bar", False),
        ("file://foo/bar", False),
        ("s3://my-bucket/foo/bar", True),
    )
    for location, expected in expectations:
        assert provider.is_remote(location) is expected
Exemplo n.º 10
0
def test_get_random_remote_path():
    """A generated remote path must sit under the configured raw output prefix."""
    provider = FileAccessProvider("/tmp", "s3://my-bucket")
    generated = provider.get_random_remote_path()
    assert generated.startswith("s3://my-bucket")