Пример #1
0
 def _test_fetch(
     self,
     fetch_fn,
     *,
     params=None,
     secrets=None,
     last_fetch_result=None,
     input_table_parquet_path=None,
     output_filename=None,
 ):
     """Call `module.fetch_thrift()` with `module.fetch` patched to `fetch_fn`.

     Builds a `ttypes.FetchRequest` rooted at `self.basedir` and returns the
     Thrift result converted back with `thrift_fetch_result_to_arrow()`.

     `params` and `secrets` default to empty dicts. When
     `input_table_parquet_path` is given, only its `.name` is sent in the
     request. When `output_filename` is omitted, a temporary file inside
     `self.basedir` is used for the duration of the call.
     """
     # Use None sentinels instead of `{}` defaults so no dict object is
     # shared between calls.
     params = {} if params is None else params
     secrets = {} if secrets is None else secrets

     # TODO simplify this logic and move it to ModuleTestEnv
     with ExitStack() as ctx:
         ctx.enter_context(patch.object(module, "fetch", fetch_fn))
         if output_filename is None:
             # Make a temporary output filename -- this will make `fetch()`
             # complete, but callers won't be able to see the data it
             # outputs because we'll delete the file too soon.
             output_filename = ctx.enter_context(
                 tempfile_context(dir=self.basedir)).name
         thrift_result = module.fetch_thrift(
             ttypes.FetchRequest(
                 basedir=str(self.basedir),
                 params=pydict_to_thrift_json_object(params),
                 secrets=pydict_to_thrift_json_object(secrets),
                 last_fetch_result=(
                     arrow_fetch_result_to_thrift(last_fetch_result)
                     if last_fetch_result is not None else None),
                 input_table_parquet_filename=(input_table_parquet_path.name
                                               if input_table_parquet_path
                                               is not None else None),
                 output_filename=output_filename,
             ))
         return thrift_fetch_result_to_arrow(thrift_result, self.basedir)
Пример #2
0
    def fetch(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        params: Dict[str, Any],
        secrets: Dict[str, Any],
        last_fetch_result: Optional[FetchResult],
        input_parquet_filename: Optional[str],
        output_filename: str,
    ) -> FetchResult:
        """Invoke the module's `fetch_thrift()` in a child and return its result.

        The request's `basedir` is `basedir` expressed relative to the chroot
        root and re-anchored at "/". `basedir / output_filename` is made
        writable for the duration of the child call, and
        `chroot_context.clear_unowned_edits()` always runs afterwards.

        Raise ModuleError if the module has a bug. Raise ModuleExitedError
        if the module reports a non-empty filename other than
        `output_filename`.
        """
        root = chroot_context.chroot.root
        module_basedir = Path("/") / basedir.relative_to(root)

        # Convert each argument to its Thrift form, one step at a time.
        thrift_basedir = str(module_basedir)
        thrift_params = pydict_to_thrift_json_object(params)
        thrift_secrets = pydict_to_thrift_json_object(secrets)
        if last_fetch_result is not None:
            thrift_last_result = arrow_fetch_result_to_thrift(
                last_fetch_result)
        else:
            thrift_last_result = None

        request = ttypes.FetchRequest(
            basedir=thrift_basedir,
            params=thrift_params,
            secrets=thrift_secrets,
            last_fetch_result=thrift_last_result,
            input_table_parquet_filename=input_parquet_filename,
            output_filename=output_filename,
        )

        output_path = basedir / output_filename
        try:
            with chroot_context.writable_file(output_path):
                result = self._run_in_child(
                    chroot_dir=root,
                    network_config=pyspawner.NetworkConfig(),
                    compiled_module=compiled_module,
                    timeout=self.fetch_timeout,
                    result=ttypes.FetchResult(),
                    function="fetch_thrift",
                    args=[request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        reported_filename = result.filename
        if reported_filename and reported_filename != output_filename:
            raise ModuleExitedError(compiled_module.module_slug, 0,
                                    "Module wrote to wrong output file")

        # TODO validate result isn't too large. If result is dataframe it makes
        # sense to truncate; but fetch results aren't necessarily data frames.
        # It's up to the module to enforce this logic ... but we need to set a
        # maximum file size.
        return thrift_fetch_result_to_arrow(result, basedir)
Пример #3
0
    def render(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        input_filename: str,
        params: Dict[str, Any],
        tab_name: str,
        fetch_result: Optional[FetchResult],
        tab_outputs: Dict[str, TabOutput],
        uploaded_files: Dict[str, UploadedFile],
        output_filename: str,
    ) -> RenderResult:
        """Run the module's `render_thrift()` function and return its result.

        `tab_outputs` and `uploaded_files` are mappings: each value is
        converted to its Thrift form and sent under the same key.

        `basedir / output_filename` is made writable for the duration of the
        child call, and `chroot_context.clear_unowned_edits()` always runs
        afterwards, even if the child call raises.

        Raise ModuleError if the module has a bug.
        """
        chroot_dir = chroot_context.chroot.root
        # The request carries `basedir` relative to the chroot root,
        # re-anchored at "/".
        basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
        request = ttypes.RenderRequest(
            basedir=str(basedir_seen_by_module),
            params=pydict_to_thrift_json_object(params),
            tab_name=tab_name,
            tab_outputs={
                k: arrow_tab_output_to_thrift(v)
                for k, v in tab_outputs.items()
            },
            uploaded_files={
                k: arrow_uploaded_file_to_thrift(v)
                for k, v in uploaded_files.items()
            },
            fetch_result=(None if fetch_result is None else
                          arrow_fetch_result_to_thrift(fetch_result)),
            output_filename=output_filename,
            input_filename=input_filename,
        )
        # Only these two module slugs get a NetworkConfig; every other
        # module is run with network_config=None.
        if compiled_module.module_slug in {"pythoncode", "ACS2016"}:
            # TODO disallow networking; make network_config always None
            network_config = pyspawner.NetworkConfig()
        else:
            network_config = None
        try:
            with chroot_context.writable_file(basedir / output_filename):
                result = self._run_in_child(
                    chroot_dir=chroot_dir,
                    network_config=network_config,
                    compiled_module=compiled_module,
                    timeout=self.render_timeout,
                    result=ttypes.RenderResult(),
                    function="render_thrift",
                    args=[request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        return thrift_render_result_to_arrow(result)
Пример #4
0
def call_render(render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Decode a Thrift render request, call `render`, and encode its result.

    Every filename in the request is resolved against `request.basedir`.
    The rendered table is written to `request.output_filename` as an Arrow
    IPC file; errors and JSON are returned in the `ttypes.RenderResult`.

    Raise ValueError if `render` does not return an `ArrowRenderResult`.
    """
    root = Path(request.basedir)
    input_table = load_trusted_arrow_file(root / request.input_filename)
    render_params = thrift_json_object_to_pydict(request.params)

    # Load each tab's table from disk, keeping the request's keys.
    tab_outputs = {}
    for key, thrift_tab in request.tab_outputs.items():
        tab_outputs[key] = TabOutput(
            tab_name=thrift_tab.tab_name,
            table=load_trusted_arrow_file(root / thrift_tab.table_filename),
        )

    # Timestamps arrive in microseconds; convert to seconds for datetime.
    uploaded_files = {}
    for key, thrift_file in request.uploaded_files.items():
        uploaded_files[key] = UploadedFile(
            name=thrift_file.name,
            path=(root / thrift_file.filename),
            uploaded_at=datetime.datetime.utcfromtimestamp(
                thrift_file.uploaded_at_timestampus / 1000000.0),
        )

    fetch_result = (
        thrift_fetch_result_to_arrow(request.fetch_result, root)
        if request.fetch_result is not None else None
    )

    raw_result = render(
        input_table,
        render_params,
        settings=settings,
        tab_name=request.tab_name,
        tab_outputs=tab_outputs,
        uploaded_files=uploaded_files,
        fetch_result=fetch_result,
    )

    if not isinstance(raw_result, ArrowRenderResult):
        # Crash. The module author wrote a buggy module.
        raise ValueError(
            "render_arrow_v1() must return a cjwmodule.arrow.types.ArrowRenderResult"
        )

    output_path = root / request.output_filename
    output_table = raw_result.table
    with pa.ipc.RecordBatchFileWriter(output_path,
                                      schema=output_table.schema) as writer:
        writer.write_table(raw_result.table)

    thrift_errors = [
        arrow_render_error_to_thrift(error) for error in raw_result.errors
    ]
    thrift_json = pydict_to_thrift_json_object(raw_result.json)
    return ttypes.RenderResult(errors=thrift_errors, json=thrift_json)
Пример #5
0
 def migrate_params(self, compiled_module: CompiledModule,
                    params: Dict[str, Any]) -> Dict[str, Any]:
     """Call a module's migrate_params() and return the migrated params.

     `params` is converted to a Thrift JSON object and passed to the
     module's `migrate_params_thrift` function via `self._run_in_child()`,
     using `READONLY_CHROOT_DIR` and `network_config=None`. The `params`
     field of the response is converted back to a Python dict.
     """
     response = self._run_in_child(
         chroot_dir=READONLY_CHROOT_DIR,
         network_config=None,
         compiled_module=compiled_module,
         timeout=self.migrate_params_timeout,
         result=ttypes.MigrateParamsResult(),
         function="migrate_params_thrift",
         args=[pydict_to_thrift_json_object(params)],
     )
     return thrift_json_object_to_pydict(response.params)
Пример #6
0
    def call_render(
        self,
        table: pa.Table,
        params: Dict[str, Any],
        tab_name: str = "Tab 1",
        tab_outputs: Optional[Dict[str, TabOutput]] = None,
        fetch_result: Optional[FetchResult] = None,
        uploaded_files: Optional[Dict[str, UploadedFile]] = None,
    ) -> RenderOutcome:
        """Conveniently call the module's `render_thrift()`.

        The calling convention is designed for ease of testing.

        `table` is written to a file in `self.basedir` for the duration of
        the call. The working directory is switched to `self.basedir` while
        `render_thrift()` runs and is always restored afterwards.
        `tab_outputs` and `uploaded_files` default to empty mappings.

        Returns a `RenderOutcome` holding the converted render result and the
        path of the output file created in `self.basedir`.
        """
        # Use None sentinels instead of `{}` defaults so no dict object is
        # shared between calls.
        tab_outputs = {} if tab_outputs is None else tab_outputs
        uploaded_files = {} if uploaded_files is None else uploaded_files

        # tempfile will be deleted in __exit__().
        fd, output_filename = mkstemp(prefix="out-",
                                      suffix=".arrow",
                                      dir=self.basedir)
        # Only the path is needed; close the descriptor mkstemp() opened.
        os.close(fd)
        output_path = Path(output_filename)

        with arrow_table_context(table, dir=self.basedir) as (input_path, _):
            old_cwd = os.getcwd()
            os.chdir(self.basedir)
            try:
                thrift_result = cjwkernel.pandas.module.render_thrift(
                    ttypes.RenderRequest(
                        basedir=self.basedir,
                        input_filename=input_path.name,
                        params=pydict_to_thrift_json_object(params),
                        tab_name=tab_name,
                        tab_outputs={
                            k: arrow_tab_output_to_thrift(v)
                            for k, v in tab_outputs.items()
                        },
                        fetch_result=(
                            arrow_fetch_result_to_thrift(fetch_result)
                            if fetch_result is not None else None),
                        uploaded_files={
                            k: arrow_uploaded_file_to_thrift(v)
                            for k, v in uploaded_files.items()
                        },
                        output_filename=output_path.name,
                    ))
            finally:
                os.chdir(old_cwd)
            arrow_result = thrift_render_result_to_arrow(thrift_result)
            return RenderOutcome(arrow_result, output_path)
Пример #7
0
 def test_pydict_to_thrift_json_object(self):
     """Each JSON value kind maps to the matching `ttypes.Json` field.

     Covers string, int, float, null, bool, and an array of objects.
     """
     pydict = {
         "str": "s",
         "int": 2,
         "float": 1.2,
         "null": None,
         "bool": False,
         "arrayofobjects": [
             {"A": "a", "B": "b"},
             {"C": "c", "D": "d"},
         ],
     }
     actual = types.pydict_to_thrift_json_object(pydict)

     first_object = ttypes.Json(object_value={
         "A": ttypes.Json(string_value="a"),
         "B": ttypes.Json(string_value="b"),
     })
     second_object = ttypes.Json(object_value={
         "C": ttypes.Json(string_value="c"),
         "D": ttypes.Json(string_value="d"),
     })
     expected = {
         "str": ttypes.Json(string_value="s"),
         "int": ttypes.Json(int64_value=2),
         "float": ttypes.Json(number_value=1.2),
         # None is expected to become a Json with no field set.
         "null": ttypes.Json(),
         "bool": ttypes.Json(boolean_value=False),
         "arrayofobjects": ttypes.Json(
             array_value=[first_object, second_object]),
     }

     self.assertEqual(actual, expected)
Пример #8
0
def migrate_params_thrift(thrift_params: Dict[str, ttypes.Json]):
    """Thrift wrapper around `migrate_params()`.

    Converts the Thrift JSON object to a Python dict, passes it to
    `migrate_params()`, and wraps the converted result in a
    `ttypes.MigrateParamsResult`.
    """
    migrated = migrate_params(thrift_json_object_to_pydict(thrift_params))
    thrift_migrated = pydict_to_thrift_json_object(migrated)
    return ttypes.MigrateParamsResult(thrift_migrated)