Пример #1
0
 def _test_fetch(
     self,
     fetch_fn,
     *,
     params=None,
     secrets=None,
     last_fetch_result=None,
     input_table_parquet_path=None,
     output_filename=None,
 ):
     """Call `module.fetch_thrift()` with `fetch_fn` patched in as `module.fetch`.

     `params` and `secrets` default to empty dicts. When `output_filename`
     is None, a temporary file inside `self.basedir` is used; it only lives
     for the duration of this call, so callers cannot read what was written.

     Returns the Thrift result converted by `thrift_fetch_result_to_arrow()`.
     """
     # None sentinels instead of `{}` defaults: a dict default is created
     # once at definition time and shared by every call.
     if params is None:
         params = {}
     if secrets is None:
         secrets = {}
     with ExitStack() as ctx:
         ctx.enter_context(patch.object(module, "fetch", fetch_fn))
         if output_filename is None:
             # Make a temporary output filename -- this will make `fetch()`
             # complete, but callers won't be able to see the data it
             # outputs because we'll delete the file too soon.
             output_filename = ctx.enter_context(
                 tempfile_context(dir=self.basedir)).name
         thrift_result = module.fetch_thrift(
             ttypes.FetchRequest(
                 basedir=str(self.basedir),
                 params=arrow_params_to_thrift(Params(params)),
                 secrets=arrow_raw_params_to_thrift(RawParams(secrets)),
                 last_fetch_result=(
                     arrow_fetch_result_to_thrift(last_fetch_result)
                     if last_fetch_result is not None else None),
                 input_table_parquet_filename=(input_table_parquet_path.name
                                               if input_table_parquet_path
                                               is not None else None),
                 output_filename=output_filename,
             ))
         # Convert while the ExitStack is still open, so temporary files
         # referenced by the result still exist.
         return thrift_fetch_result_to_arrow(thrift_result, self.basedir)
Пример #2
0
 def _test_render(
     self,
     render_fn,
     arrow_table_dict=None,
     arrow_table=None,
     params=None,
     tab=Tab("tab-1", "Tab 1"),
     fetch_result=None,
     output_filename=None,
 ):
     """Call `module.render_thrift()` with `render_fn` patched in as `module.render`.

     When `arrow_table` is None, an input table is built from
     `arrow_table_dict` (default: empty dict) inside `self.basedir`.
     When `output_filename` is None, a temporary file inside `self.basedir`
     is used; otherwise the caller's filename is passed to the module, the
     same way `_test_fetch()` treats its `output_filename`.

     Returns the Thrift result converted by `thrift_render_result_to_arrow()`.
     """
     # None sentinels instead of `{}` defaults: a dict default is created
     # once at definition time and shared by every call.
     if arrow_table_dict is None:
         arrow_table_dict = {}
     if params is None:
         params = {}
     with ExitStack() as ctx:
         if arrow_table is None:
             arrow_table = ctx.enter_context(
                 arrow_table_context(arrow_table_dict, dir=self.basedir))
         ctx.enter_context(patch.object(module, "render", render_fn))
         if output_filename is None:
             # No caller-supplied output file: use a throwaway one that
             # disappears when the ExitStack closes.
             output_filename = ctx.enter_context(
                 tempfile_context(dir=self.basedir)).name
         thrift_result = module.render_thrift(
             ttypes.RenderRequest(
                 str(self.basedir),
                 arrow_arrow_table_to_thrift(arrow_table),
                 arrow_params_to_thrift(Params(params)),
                 arrow_tab_to_thrift(tab),
                 arrow_fetch_result_to_thrift(fetch_result)
                 if fetch_result is not None else None,
                 output_filename,
             ))
         # Convert while the ExitStack is still open, so temporary files
         # referenced by the result still exist.
         return thrift_render_result_to_arrow(thrift_result, self.basedir)
Пример #3
0
 def test_default_render_returns_fetch_result(self):
     """render_thrift() output should carry the fetch result's table and errors."""
     # Functionality used by libraryofcongress
     with ExitStack() as stack:
         table_in = stack.enter_context(
             arrow_table_context({"A": [1]}, dir=self.basedir)
         )
         parquet = stack.enter_context(
             parquet_file({"A": [2]}, dir=self.basedir)
         )
         # Only the basename goes into the request.
         parquet_basename = Path(parquet.name).name
         output_file = stack.enter_context(tempfile_context(dir=self.basedir))
         output_basename = output_file.name

         request = ttypes.RenderRequest(
             str(self.basedir),
             table_in.to_thrift(),
             Params({}).to_thrift(),
             ttypes.Tab("tab-1", "Tab 1"),
             ttypes.FetchResult(
                 parquet_basename,
                 [RenderError(I18nMessage.TODO_i18n("A warning")).to_thrift()],
             ),
             output_basename,
         )
         actual = RenderResult.from_thrift(
             module.render_thrift(request), self.basedir
         )
         # The input table held A=[1]; the fetched parquet file held A=[2].
         expected = RenderResult(
             arrow_table({"A": [2]}),
             [RenderError(I18nMessage.TODO_i18n("A warning"))],
         )
         assert_render_result_equals(actual, expected)
Пример #4
0
 def test_simple(self):
     """fetch_or_wrap_error() should pass its inputs through to kernel.fetch()."""
     self.kernel.fetch.return_value = FetchResult(self.output_path)
     parameter_specs = [{"id_name": "A", "type": "string"}]
     module_zipfile = create_module_zipfile(
         "mod", spec_kwargs={"parameters": parameter_specs})

     with self.assertLogs("fetcher.fetch", level=logging.INFO):
         result = fetch.fetch_or_wrap_error(
             self.ctx,
             self.chroot_context,
             self.basedir,
             "mod",
             module_zipfile,
             {"A": "B"},
             {"C": "D"},
             None,
             None,
             self.output_path,
         )

     self.assertEqual(result, FetchResult(self.output_path, []))

     # call_args[1] holds the keyword arguments of the last kernel.fetch() call.
     kernel_kwargs = self.kernel.fetch.call_args[1]
     self.assertEqual(
         kernel_kwargs["compiled_module"],
         module_zipfile.compile_code_without_executing(),
     )
     self.assertEqual(kernel_kwargs["params"], Params({"A": "B"}))
     self.assertEqual(kernel_kwargs["secrets"], {"C": "D"})
     self.assertIsNone(kernel_kwargs["last_fetch_result"])
     self.assertIsNone(kernel_kwargs["input_parquet_filename"])
Пример #5
0
 def test_simple(self, load_module):
     """fetch_or_wrap_error() should migrate params, then call the module's fetch()."""
     loaded = load_module.return_value
     loaded.migrate_params.return_value = {"A": "B"}
     loaded.fetch.return_value = FetchResult(self.output_path, [])

     wf_module = WfModule(params={"A": "input"}, secrets={"C": "wrong"})
     module_version = MockModuleVersion(
         id_name="A", param_schema=ParamDType.Dict({"A": ParamDType.String()})
     )
     result = fetch.fetch_or_wrap_error(
         self.ctx,
         self.chroot_context,
         self.basedir,
         wf_module,
         module_version,
         {"C": "D"},
         None,
         None,
         self.output_path,
     )

     self.assertEqual(result, FetchResult(self.output_path, []))
     # The stored params are what gets migrated ...
     loaded.migrate_params.assert_called_with({"A": "input"})
     # ... and fetch() gets the migrated params plus the secrets passed in
     # explicitly (not the ones stored on the WfModule).
     expected_fetch_kwargs = dict(
         chroot_context=self.chroot_context,
         basedir=self.basedir,
         params=Params({"A": "B"}),
         secrets={"C": "D"},
         last_fetch_result=None,
         input_parquet_filename=None,
         output_filename=self.output_path.name,
     )
     loaded.fetch.assert_called_with(**expected_fetch_kwargs)
Пример #6
0
    def render(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        input_table: ArrowTable,
        params: Params,
        tab: Tab,
        fetch_result: Optional[FetchResult],
        output_filename: str,
    ) -> RenderResult:
        """
        Run the module's `render_thrift()` function and return its result.

        `basedir` must be inside the chroot (`chroot_context.chroot.root`);
        the module is given that path relative to the chroot root.
        `output_filename` is a filename within `basedir`.

        Raise ModuleError if the module has a bug. In particular, raise
        ModuleExitedError if the module reports an output file other than
        `output_filename`, or if its result fails validation.
        """
        chroot_dir = chroot_context.chroot.root
        # Rewrite basedir as an absolute path relative to the chroot root.
        basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
        request = ttypes.RenderRequest(
            str(basedir_seen_by_module),
            input_table.to_thrift(),
            params.to_thrift(),
            tab.to_thrift(),
            None if fetch_result is None else fetch_result.to_thrift(),
            output_filename,
        )
        try:
            with chroot_context.writable_file(basedir / output_filename):
                result = self._run_in_child(
                    chroot_dir=chroot_dir,
                    network_config=pyspawner.NetworkConfig(),  # TODO disallow networking
                    compiled_module=compiled_module,
                    timeout=self.render_timeout,
                    result=ttypes.RenderResult(),
                    function="render_thrift",
                    args=[request],
                )
        finally:
            # Runs whether or not the child succeeded.
            chroot_context.clear_unowned_edits()

        # An empty filename is allowed; any other name must be the one we
        # asked for.
        if result.table.filename and result.table.filename != output_filename:
            raise ModuleExitedError(0, "Module wrote to wrong output file")

        try:
            # RenderResult.from_thrift() verifies all filenames passed by the
            # module are in the directory the module has access to. It assumes
            # the Arrow file (if there is one) is untrusted, so it can raise
            # ValidateError
            render_result = RenderResult.from_thrift(result, basedir)
        except ValidateError as err:
            # Chain explicitly so the validation failure is recorded as the
            # direct cause of the ModuleExitedError.
            raise ModuleExitedError(
                0, "Module produced invalid data: %s" % str(err)
            ) from err
        return render_result
Пример #7
0
    def fetch(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        params: Params,
        secrets: Dict[str, Any],
        last_fetch_result: Optional[FetchResult],
        input_parquet_filename: Optional[str],
        output_filename: str,
    ) -> FetchResult:
        """
        Run the module's `fetch_thrift()` function and return its result.

        `basedir` must be inside the chroot (`chroot_context.chroot.root`);
        the module is given that path relative to the chroot root.
        `input_parquet_filename` may be None (callers pass None when there is
        no input parquet file). `output_filename` is a filename within
        `basedir`.

        Raise ModuleError if the module has a bug. In particular, raise
        ModuleExitedError if the module reports an output file other than
        `output_filename`.
        """
        chroot_dir = chroot_context.chroot.root
        # Rewrite basedir as an absolute path relative to the chroot root.
        basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
        request = ttypes.FetchRequest(
            str(basedir_seen_by_module),
            params.to_thrift(),
            RawParams(secrets).to_thrift(),
            None if last_fetch_result is None else last_fetch_result.to_thrift(),
            input_parquet_filename,
            output_filename,
        )
        try:
            with chroot_context.writable_file(basedir / output_filename):
                result = self._run_in_child(
                    chroot_dir=chroot_dir,
                    network_config=pyspawner.NetworkConfig(),
                    compiled_module=compiled_module,
                    timeout=self.fetch_timeout,
                    result=ttypes.FetchResult(),
                    function="fetch_thrift",
                    args=[request],
                )
        finally:
            # Runs whether or not the child succeeded.
            chroot_context.clear_unowned_edits()

        # An empty filename is allowed; any other name must be the one we
        # asked for.
        if result.filename and result.filename != output_filename:
            raise ModuleExitedError(0, "Module wrote to wrong output file")

        # TODO validate result isn't too large. If result is dataframe it makes
        # sense to truncate; but fetch results aren't necessarily data frames.
        # It's up to the module to enforce this logic ... but we need to set a
        # maximum file size.
        return FetchResult.from_thrift(result, basedir)
Пример #8
0
def get_param_values(
    schema: ParamDType.Dict, params: Dict[str, Any], context: RenderContext
) -> Params:
    """Clean `params` against `schema` and wrap them for a module's `render()`.

    The cleaning step (`clean_value()`) is documented to:

        * turn `Tab` parameters into Optional[TabOutput] (declared here)
        * replace missing `Tab`s with `None`
        * raise `TabCycleError` if a chosen Tab has not been rendered
        * turn `column` parameters into '' if they aren't input columns
        * drop `multicolumn` values that aren't input columns
        * raise `PromptingError` if a chosen column is of the wrong type
          (so the caller can return a RenderResult with errors and quickfixes)

    This uses database connections, and it's slow! (It needs to load input tab
    data.) Be sure the Workflow is locked while you call it.
    """
    return Params(clean_value(schema, params, context))
Пример #9
0
    def render(
        self,
        compiled_module: CompiledModule,
        basedir: Path,
        input_table: ArrowTable,
        params: Params,
        tab: Tab,
        fetch_result: Optional[FetchResult],
        output_filename: str,
    ) -> RenderResult:
        """
        Call the module's `render_thrift()` via `self._run_in_child()`.

        Raise ModuleExitedError if the module reports an output file other
        than `output_filename`. The resulting table, if there is one, is
        passed through `validate()` before being returned.
        """
        request = ttypes.RenderRequest(
            str(basedir),
            input_table.to_thrift(),
            params.to_thrift(),
            tab.to_thrift(),
            fetch_result.to_thrift() if fetch_result is not None else None,
            output_filename,
        )
        output_path = basedir / output_filename
        with _chroot_dir_context(
            provide_paths=[basedir], extract_paths=[output_path]
        ) as chroot:
            # TODO nix networking
            allowed_paths = (
                [basedir] + DATA_PATHS + PARQUET_PATHS + NETWORKING_PATHS
            )
            result = self._run_in_child(
                chroot=chroot,
                chroot_paths=allowed_paths,
                compiled_module=compiled_module,
                timeout=self.render_timeout,
                result=ttypes.RenderResult(),
                function="render_thrift",
                args=[request],
            )
            reported_filename = result.table.filename
            if reported_filename and reported_filename != output_filename:
                raise ModuleExitedError(0, "Module wrote to wrong output file")

        # RenderResult.from_thrift() verifies all filenames passed by the
        # module are in the directory the module has access to.
        render_result = RenderResult.from_thrift(result, basedir)
        output_table = render_result.table
        if output_table.table is not None:
            validate(output_table.table, output_table.metadata)
        return render_result
Пример #10
0
 def fetch(
     self,
     compiled_module: CompiledModule,
     basedir: Path,
     params: Params,
     secrets: Dict[str, Any],
     last_fetch_result: Optional[FetchResult],
     input_parquet_filename: Optional[str],
     output_filename: str,
 ) -> FetchResult:
     """
     Call the module's `fetch_thrift()` via `self._run_in_child()`.

     `input_parquet_filename` may be None (callers pass None when there is
     no input parquet file). `output_filename` is a filename within
     `basedir`.

     Raise ModuleExitedError if the module reports an output file other
     than `output_filename`.
     """
     request = ttypes.FetchRequest(
         str(basedir),
         params.to_thrift(),
         RawParams(secrets).to_thrift(),
         None
         if last_fetch_result is None else last_fetch_result.to_thrift(),
         input_parquet_filename,
         output_filename,
     )
     with _chroot_dir_context(provide_paths=[basedir],
                              extract_paths=[basedir / output_filename
                                             ]) as chroot:
         result = self._run_in_child(
             chroot=chroot,
             chroot_paths=[basedir] + DATA_PATHS + PARQUET_PATHS +
             NETWORKING_PATHS,
             compiled_module=compiled_module,
             timeout=self.fetch_timeout,
             result=ttypes.FetchResult(),
             function="fetch_thrift",
             args=[request],
         )
         # An empty filename is allowed; any other name must be the one we
         # asked for.
         if result.filename and result.filename != output_filename:
             raise ModuleExitedError(0, "Module wrote to wrong output file")
     # TODO validate result isn't too large. If result is dataframe it makes
     # sense to truncate; but fetch results aren't necessarily data frames.
     # It's up to the module to enforce this logic ... but we need to set a
     # maximum file size.
     return FetchResult.from_thrift(result, basedir)
Пример #11
0
    def test_load_dynamic(self):
        """Upload module code to minio, load it, and render with it."""
        source = b"def render(table, params):\n    return table * 2"
        minio.client.put_object(
            Bucket=minio.ExternalModulesBucket,
            Key="imported/abcdef/imported.py",
            Body=source,
            ContentLength=len(source),
        )

        with self.assertLogs("cjwstate.modules.loaded_module"):
            loaded = LoadedModule.for_module_version(
                MockModuleVersion("imported", "abcdef", ParamDType.Dict({}), "now")
            )

        self.assertEqual(loaded.name, "imported:abcdef")

        # This ends up being kinda an integration test.
        with ExitStack() as stack:
            workdir = Path(
                stack.enter_context(tempdir_context(prefix="test-basedir-"))
            )
            workdir.chmod(0o755)
            table_in = stack.enter_context(
                arrow_table_context({"A": [1]}, dir=workdir)
            )
            table_in.path.chmod(0o644)
            output_file = stack.enter_context(
                tempfile.NamedTemporaryFile(dir=workdir)
            )
            # render() takes the output file's basename, not its full path.
            output_basename = Path(output_file.name).name

            stack.enter_context(self.assertLogs("cjwstate.modules.loaded_module"))

            result = loaded.render(
                basedir=workdir,
                input_table=table_in,
                params=Params({"col": "A"}),
                tab=Tab("tab-1", "Tab 1"),
                fetch_result=None,
                output_filename=output_basename,
            )

        expected = RenderResult(arrow_table({"A": [2]}))
        assert_render_result_equals(result, expected)
Пример #12
0
 def render(*args, params, **kwargs):
     # Stub render(): check the params it was handed, then return a fixed table.
     expected_params = Params({"x": "def"})  # default params
     self.assertEqual(params, expected_params)
     return RenderResult(arrow_table({"A": [1]}))
Пример #13
0
def fetch_or_wrap_error(
    ctx: contextlib.ExitStack,
    chroot_context: ChrootContext,
    basedir: Path,
    module_id_name: str,
    module_zipfile: Optional[ModuleZipfile],
    migrated_params_or_error: Union[Dict[str, Any], ModuleError],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    maybe_input_crr: Optional[CachedRenderResult],
    output_path: Path,
) -> FetchResult:
    """
    Fetch, and do not raise any exceptions worth catching.

    Exceptions are wrapped -- the result is a FetchResult with `.errors`.

    This function is slow indeed. Perhaps call it from
    EventLoop.run_in_executor(). (Why not make it async? Because all the logic
    inside -- compile module, fetch() -- is sandboxed, meaning it gets its own
    processes. We may eventually avoid asyncio entirely in `fetcher`.)

    These problems are all handled:

    * Module was deleted (`module_zipfile is None`)
    * Module times out (`cjwkernel.errors.ModuleTimeoutError`), in `fetch()`.
    * Module crashes (`cjwkernel.errors.ModuleExitedError`), in `fetch()`.
    * migrated_params_or_error is a `ModuleError`
    * migrated_params_or_error is invalid (`ValueError`)
    * input_crr points to a nonexistent file (`FileNotFoundError`)

    `module_id_name` is only used for logging when `module_zipfile` is None.
    The fetch output filename passed to the module is `output_path.name`.
    """
    # module_zipfile=None is allowed
    if module_zipfile is None:
        logger.info("fetch() deleted module '%s'", module_id_name)
        return FetchResult(
            output_path,
            [
                RenderError(
                    I18nMessage.trans(
                        "py.fetcher.fetch.no_loaded_module",
                        default="Cannot fetch: module was deleted",
                    ))
            ],
        )
    module_spec = module_zipfile.get_spec()
    param_schema = module_spec.get_param_schema()

    if isinstance(migrated_params_or_error, ModuleError):
        # raise the exception so we can log it
        try:
            raise migrated_params_or_error
        except ModuleError:
            # We'll always get here
            logger.exception("%s:migrate_params() raised error",
                             module_zipfile.path.name)
        return user_visible_bug_fetch_result(
            output_path, format_for_user_debugging(migrated_params_or_error))
    migrated_params = migrated_params_or_error

    try:
        param_schema.validate(migrated_params)
    except ValueError:
        logger.exception("Invalid return value from %s:migrate_params()",
                         module_zipfile.path.name)
        return user_visible_bug_fetch_result(
            output_path,
            "%s:migrate_params() output invalid params" %
            module_zipfile.path.name,
        )

    # get input_metadata, input_parquet_path. (This can't error.)
    input_parquet_path, input_metadata = _download_cached_render_result(
        ctx, maybe_input_crr, dir=basedir)

    # Clean params, so they're of the correct type. (This can't error.)
    params = Params(
        fetchprep.clean_value(param_schema, migrated_params, input_metadata))

    # actually fetch
    try:
        return invoke_fetch(
            module_zipfile,
            chroot_context=chroot_context,
            basedir=basedir,
            params=params,
            secrets=secrets,
            last_fetch_result=last_fetch_result,
            # Filenames only: None when there is no input parquet file.
            input_parquet_filename=(None if input_parquet_path is None else
                                    input_parquet_path.name),
            output_filename=output_path.name,
        )
    except ModuleError as err:
        logger.exception("Error calling %s:fetch()", module_zipfile.path.name)
        return user_visible_bug_fetch_result(output_path,
                                             format_for_user_debugging(err))
Пример #14
0
def fetch_or_wrap_error(
    ctx: contextlib.ExitStack,
    basedir: Path,
    wf_module: WfModule,
    module_version: Optional[ModuleVersion],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    maybe_input_crr: Optional[CachedRenderResult],
    output_path: Path,
) -> FetchResult:
    """
    Fetch `wf_module`, and do not raise any exceptions worth catching.

    Exceptions are wrapped -- the result is a FetchResult with `.errors`.

    This function is slow indeed. Perhaps call it from
    EventLoop.run_in_executor(). (Why not make it async? Because all the
    logic inside -- compile module, migrate_params() and fetch() -- is
    sandboxed, meaning it gets its own processes. We may eventually avoid
    asyncio entirely in `fetcher`.)

    These problems are all handled:

    * Module was deleted (`module_version is None`)
    * Module file has gone missing
    * Module does not compile (`cjwkernel.errors.ModuleCompileError`)
    * Module times out (`cjwkernel.errors.ModuleTimeoutError`), either in
      migrate_params() or in fetch().
    * Module crashes (`cjwkernel.errors.ModuleExitedError`), either in
      migrate_params() or in fetch().
    * Module migrate_params() returns invalid data (`ValueError`)
    * input_crr points to a nonexistent file (`FileNotFoundError`)

    The fetch output filename passed to the module is `output_path.name`.
    """
    # module_version=None is allowed
    try:
        loaded_module = LoadedModule.for_module_version(module_version)
    except FileNotFoundError:
        logger.exception("Module %s code disappeared", module_version.id_name)
        return user_visible_bug_fetch_result(output_path, "FileNotFoundError")
    except ModuleError as err:
        logger.exception("Error loading module %s", module_version.id_name)
        return user_visible_bug_fetch_result(
            output_path,
            format_for_user_debugging(err) + " (during load)")

    if loaded_module is None:
        logger.info("fetch() deleted module '%s'", wf_module.module_id_name)
        return FetchResult(
            output_path,
            [
                RenderError(
                    I18nMessage.TODO_i18n("Cannot fetch: module was deleted"))
            ],
        )

    # Migrate params, so fetch() gets newest values
    try:
        # TODO use params.get_migrated_params(). (Remember to use a
        # Workflow.cooperative_lock().)
        params = loaded_module.migrate_params(wf_module.params)
    except ModuleError as err:
        logger.exception("Error calling %s.migrate_params()",
                         module_version.id_name)
        return user_visible_bug_fetch_result(output_path,
                                             format_for_user_debugging(err))
    try:
        module_version.param_schema.validate(params)
    except ValueError:
        logger.exception("Invalid return value from %s.migrate_params()",
                         module_version.id_name)
        return user_visible_bug_fetch_result(
            output_path,
            "%s.migrate_params() output invalid params" %
            module_version.id_name,
        )

    # get input_metadata, input_parquet_path. (This can't error.)
    input_parquet_path, input_metadata = _with_downloaded_cached_render_result(
        ctx, maybe_input_crr, dir=basedir)

    # Clean params, so they're of the correct type. (This can't error.)
    params = Params(
        fetchprep.clean_value(module_version.param_schema, params,
                              input_metadata))

    # actually fetch
    try:
        return loaded_module.fetch(
            basedir=basedir,
            params=params,
            secrets=secrets,
            last_fetch_result=last_fetch_result,
            # Filenames only: None when there is no input parquet file.
            input_parquet_filename=(None if input_parquet_path is None else
                                    input_parquet_path.name),
            output_filename=output_path.name,
        )
    except ModuleError as err:
        logger.exception("Error calling %s.fetch()", module_version.id_name)
        return user_visible_bug_fetch_result(output_path,
                                             format_for_user_debugging(err))