示例#1
0
 def test_fetch_result_from_thrift_disallow_non_file(self):
     """Converting a FetchResult whose filename names a directory must fail.

     A temporary directory is created inside ``self.basedir`` and its bare
     name is used as the FetchResult filename; the conversion is expected to
     raise ``ValueError`` matching "be a regular file".
     """
     with tempfile.TemporaryDirectory(dir=str(self.basedir)) as subdir_path:
         # Only the final path component is handed over, alongside basedir.
         subdir_name = Path(subdir_path).name
         with self.assertRaisesRegex(ValueError, "be a regular file"):
             thrift_value = ttypes.FetchResult(subdir_name, [])
             types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)
示例#2
0
def render_thrift(request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """
    Render a Thrift request and answer with a Thrift result.

    The request is translated into `cjwkernel.types` values (Arrow tables are
    opened while doing so), the render is dispatched through
    `__render_by_signature()`, and its result is translated back to Thrift.
    This uses very little RAM.

    Module authors may overwrite this function to skip reading or writing the
    data table altogether -- for instance, a "change number format" module
    may not need to read any data, so it could work purely on the Thrift
    layer. Most modules _do_ look at table data, so they should not overwrite
    this function.
    """
    base_path = Path(request.basedir)

    # Thrift => Arrow-backed values. The input table is opened as trusted.
    input_table = thrift_arrow_table_to_arrow(
        request.input_table, base_path, trusted=True
    )
    param_values = thrift_params_to_arrow(request.params, base_path).params
    maybe_fetch_result = (
        thrift_fetch_result_to_arrow(request.fetch_result, base_path)
        if request.fetch_result is not None
        else None
    )

    # Render, then Arrow-backed result => Thrift.
    return arrow_render_result_to_thrift(
        __render_by_signature(
            table=input_table,
            params=param_values,
            tab_name=request.tab.name,
            fetch_result=maybe_fetch_result,
            output_path=base_path / request.output_filename,
        )
    )
示例#3
0
def call_fetch(fetch: Callable,
               request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Unpack a Thrift fetch request, call `fetch()`, and re-pack its result.

    Module code may contain errors. This function and `fetch()` should strive
    to raise developer-friendly errors in the case of bugs -- including
    unexpected input.

    Every filename in the request is resolved against `request.basedir`.
    Optional request fields that are None are passed to `fetch()` as None.
    """
    root = Path(request.basedir)

    # Thrift => plain Python values
    params: Dict[str, Any] = thrift_json_object_to_pydict(request.params)
    secrets: Dict[str, Any] = thrift_json_object_to_pydict(request.secrets)
    input_table_parquet_path = (
        root / request.input_table_parquet_filename
        if request.input_table_parquet_filename is not None
        else None
    )
    last_fetch_result = (
        thrift_fetch_result_to_arrow(request.last_fetch_result, root)
        if request.last_fetch_result is not None
        else None
    )
    output_path = root / request.output_filename

    fetch_kwargs = dict(
        params=params,
        secrets=secrets,
        last_fetch_result=last_fetch_result,
        input_table_parquet_path=input_table_parquet_path,
        output_path=output_path,
    )

    # Python values => Thrift
    return arrow_fetch_result_to_thrift(fetch(**fetch_kwargs))
示例#4
0
 def _test_fetch(
     self,
     fetch_fn,
     *,
     params=None,
     secrets=None,
     last_fetch_result=None,
     input_table_parquet_path=None,
     output_filename=None,
 ):
     """Run ``module.fetch_thrift()`` with ``module.fetch`` patched to ``fetch_fn``.

     Args:
         fetch_fn: replacement installed as ``module.fetch`` for the call.
         params: params dict wrapped in ``Params``; defaults to an empty dict.
         secrets: secrets dict wrapped in ``RawParams``; defaults to an empty
             dict.
         last_fetch_result: optional previous result, converted to Thrift.
         input_table_parquet_path: optional path; only its ``.name`` is sent
             in the request.
         output_filename: filename to put in the request. When None, a
             temporary file is created in ``self.basedir`` for the duration
             of the call.

     Returns:
         The Thrift result converted back with
         ``thrift_fetch_result_to_arrow()`` against ``self.basedir``.
     """
     # Build fresh dicts per call: a ``{}`` default is a single object shared
     # by every invocation, so any mutation would leak between tests.
     if params is None:
         params = {}
     if secrets is None:
         secrets = {}

     with ExitStack() as ctx:
         ctx.enter_context(patch.object(module, "fetch", fetch_fn))
         if output_filename is None:
             # Make a temporary output filename -- this will make `fetch()`
             # complete, but callers won't be able to see the data it
             # outputs because we'll delete the file too soon.
             output_filename = ctx.enter_context(
                 tempfile_context(dir=self.basedir)).name
         thrift_result = module.fetch_thrift(
             ttypes.FetchRequest(
                 basedir=str(self.basedir),
                 params=arrow_params_to_thrift(Params(params)),
                 secrets=arrow_raw_params_to_thrift(RawParams(secrets)),
                 last_fetch_result=(
                     arrow_fetch_result_to_thrift(last_fetch_result)
                     if last_fetch_result is not None else None),
                 input_table_parquet_filename=(input_table_parquet_path.name
                                               if input_table_parquet_path
                                               is not None else None),
                 output_filename=output_filename,
             ))
         return thrift_fetch_result_to_arrow(thrift_result, self.basedir)
示例#5
0
def call_render(render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Unpack a Thrift render request, call `render()`, and persist its output.

    All filenames in the request are resolved against `request.basedir`.
    The table returned by `render()` is written as an Arrow file to
    `request.output_filename`; errors and JSON are returned over Thrift.

    Raise ValueError if `render()` does not return an ArrowRenderResult.
    """
    root = Path(request.basedir)
    input_table = load_trusted_arrow_file(root / request.input_filename)
    param_dict = thrift_json_object_to_pydict(request.params)

    tab_outputs = {}
    for key, thrift_tab in request.tab_outputs.items():
        tab_outputs[key] = TabOutput(
            tab_name=thrift_tab.tab_name,
            table=load_trusted_arrow_file(root / thrift_tab.table_filename),
        )

    uploaded_files = {}
    for key, thrift_file in request.uploaded_files.items():
        uploaded_files[key] = UploadedFile(
            name=thrift_file.name,
            path=(root / thrift_file.filename),
            # The Thrift field holds microseconds; convert to seconds.
            uploaded_at=datetime.datetime.utcfromtimestamp(
                thrift_file.uploaded_at_timestampus / 1000000.0),
        )

    fetch_result = (
        thrift_fetch_result_to_arrow(request.fetch_result, root)
        if request.fetch_result is not None
        else None
    )

    raw_result = render(
        input_table,
        param_dict,
        settings=settings,
        tab_name=request.tab_name,
        tab_outputs=tab_outputs,
        uploaded_files=uploaded_files,
        fetch_result=fetch_result,
    )

    if not isinstance(raw_result, ArrowRenderResult):
        # Crash. The module author wrote a buggy module.
        raise ValueError(
            "render_arrow_v1() must return a cjwmodule.arrow.types.ArrowRenderResult"
        )

    output_path = root / request.output_filename
    result_table = raw_result.table
    with pa.ipc.RecordBatchFileWriter(
            output_path, schema=result_table.schema) as writer:
        writer.write_table(result_table)

    thrift_errors = [
        arrow_render_error_to_thrift(error) for error in raw_result.errors
    ]
    thrift_json = pydict_to_thrift_json_object(raw_result.json)
    return ttypes.RenderResult(errors=thrift_errors, json=thrift_json)
示例#6
0
def fetch_thrift(request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Convert a Thrift fetch request, call `fetch_arrow()`, convert the result.

    Filenames in the request are resolved against `request.basedir`. The
    optional `last_fetch_result` and `input_table_parquet_filename` fields
    are forwarded as None when they are None in the request.
    """
    root = Path(request.basedir)

    params = thrift_params_to_arrow(request.params, root).params
    secrets = thrift_raw_params_to_arrow(request.secrets).params

    if request.last_fetch_result is None:
        last_fetch_result = None
    else:
        last_fetch_result = thrift_fetch_result_to_arrow(
            request.last_fetch_result, root)

    if request.input_table_parquet_filename is None:
        input_table_parquet_path = None
    else:
        input_table_parquet_path = root / request.input_table_parquet_filename

    output_path = root / request.output_filename

    # `fetch_arrow()` takes these five values positionally, in this order.
    arrow_result = fetch_arrow(
        params,
        secrets,
        last_fetch_result,
        input_table_parquet_path,
        output_path,
    )
    return arrow_fetch_result_to_thrift(arrow_result)
示例#7
0
 def test_fetch_result_from_thrift_happy_path(self):
     """A FetchResult naming a real file in basedir converts to its full path.

     The Thrift value carries only the file's bare name plus one error; the
     converted value is expected to carry the full temp-file path and the
     equivalent ``types.RenderError``.
     """
     with tempfile.NamedTemporaryFile(dir=str(self.basedir)) as tf:
         tf_path = Path(tf.name)
         thrift_value = ttypes.FetchResult(
             tf_path.name,
             [ttypes.RenderError(ttypes.I18nMessage("hi", {}), [])],
         )
         actual = types.thrift_fetch_result_to_arrow(
             thrift_value, self.basedir)
         expected = types.FetchResult(
             tf_path, [types.RenderError(types.I18nMessage("hi"))])
         self.assertEqual(actual, expected)
示例#8
0
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Unpack a Thrift render request, call `render()`, and coerce its result.

    Filenames in the request are resolved against `request.basedir`.
    `render()` is given the output path; afterwards the output file's schema
    is fixed up using the input table's schema as the fallback. The return
    value of `render()` must be None or a list that
    `coerce_RenderError_list()` accepts.

    Raise ValueError if `render()` returns anything else.
    """
    root = Path(request.basedir)
    table, columns = load_trusted_arrow_file_with_columns(
        root / request.input_filename)

    raw_params = thrift_json_object_to_pydict(request.params)
    uploaded_files = {
        key: thrift_uploaded_file_to_arrow(thrift_file)
        for key, thrift_file in request.uploaded_files.items()
    }
    params = _prepare_params(
        module_spec,
        raw_params,
        basedir=root,
        uploaded_files=uploaded_files,
    )

    fetch_result = (
        thrift_fetch_result_to_arrow(request.fetch_result, root)
        if request.fetch_result is not None
        else None
    )
    output_path = root / request.output_filename

    raw_result = render(
        table,
        params,
        output_path,
        columns=columns,
        settings=settings,
        tab_name=request.tab_name,
        fetch_result=fetch_result,
    )

    # Coerce the result.
    #
    # TODO omit all this code and rely on Workbench's validation. To do this:
    #
    # 1. Change all modules to return RenderResult
    # 2. Nix this coercion code
    _DEPRECATED_overwrite_to_fix_arrow_table_schema(
        output_path, fallback_schema=table.schema)
    if raw_result is not None and not isinstance(raw_result, list):
        raise ValueError("Unhandled raw_result")
    errors = [] if raw_result is None else coerce_RenderError_list(raw_result)

    thrift_errors = [arrow_render_error_to_thrift(error) for error in errors]
    return ttypes.RenderResult(
        errors=thrift_errors,
        json={},  # this framework never produces JSON
    )
示例#9
0
    def fetch(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        params: Dict[str, Any],
        secrets: Dict[str, Any],
        last_fetch_result: Optional[FetchResult],
        input_parquet_filename: Optional[str],
        output_filename: str,
    ) -> FetchResult:
        """Call the module's `fetch_thrift()` in a child and return its result.

        The request's `basedir` is `basedir` expressed relative to the chroot
        root and re-anchored at "/". The output file is made writable for the
        duration of the child call, and unowned edits in the chroot are
        cleared afterwards whether or not the call succeeded.

        Raise ModuleError if the module has a bug. In particular, raise
        ModuleExitedError if the result names a file other than
        `output_filename`.
        """
        chroot_dir = chroot_context.chroot.root
        basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)

        thrift_params = pydict_to_thrift_json_object(params)
        thrift_secrets = pydict_to_thrift_json_object(secrets)
        if last_fetch_result is None:
            thrift_last_fetch_result = None
        else:
            thrift_last_fetch_result = arrow_fetch_result_to_thrift(
                last_fetch_result)

        request = ttypes.FetchRequest(
            basedir=str(basedir_seen_by_module),
            params=thrift_params,
            secrets=thrift_secrets,
            last_fetch_result=thrift_last_fetch_result,
            input_table_parquet_filename=input_parquet_filename,
            output_filename=output_filename,
        )

        child_kwargs = dict(
            chroot_dir=chroot_dir,
            network_config=pyspawner.NetworkConfig(),
            compiled_module=compiled_module,
            timeout=self.fetch_timeout,
            result=ttypes.FetchResult(),
            function="fetch_thrift",
            args=[request],
        )
        try:
            with chroot_context.writable_file(basedir / output_filename):
                result = self._run_in_child(**child_kwargs)
        finally:
            chroot_context.clear_unowned_edits()

        # An empty/None filename is let through; any other name must match.
        wrote_elsewhere = (result.filename
                           and result.filename != output_filename)
        if wrote_elsewhere:
            raise ModuleExitedError(compiled_module.module_slug, 0,
                                    "Module wrote to wrong output file")

        # TODO validate result isn't too large. If result is dataframe it makes
        # sense to truncate; but fetch results aren't necessarily data frames.
        # It's up to the module to enforce this logic ... but we need to set a
        # maximum file size.
        return thrift_fetch_result_to_arrow(result, basedir)
示例#10
0
 def test_fetch_result_from_thrift_disallow_non_files(self):
     """Converting a FetchResult named "missing" must raise "must exist".

     The name is presumably absent from ``self.basedir``.
     """
     filename = "missing"
     with self.assertRaisesRegex(ValueError, "must exist"):
         thrift_value = ttypes.FetchResult(filename, [])
         types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)
示例#11
0
 def test_fetch_result_from_thrift_disallow_hidden_files(self):
     """Converting a FetchResult named ".secrets" must raise "must not be hidden".

     The directory containing this test file is used as the base directory.
     """
     filename = ".secrets"
     with self.assertRaisesRegex(ValueError, "must not be hidden"):
         thrift_value = ttypes.FetchResult(filename, [])
         here = Path(__file__).parent
         types.thrift_fetch_result_to_arrow(thrift_value, here)
示例#12
0
 def test_fetch_result_from_thrift_disallow_directories(self):
     """Converting a FetchResult named "/etc/passwd" must be rejected.

     The expected error is a ``ValueError`` matching
     "must not contain directories". The directory containing this test file
     is used as the base directory.
     """
     filename = "/etc/passwd"
     expected_message = "must not contain directories"
     with self.assertRaisesRegex(ValueError, expected_message):
         thrift_value = ttypes.FetchResult(filename, [])
         here = Path(__file__).parent
         types.thrift_fetch_result_to_arrow(thrift_value, here)
示例#13
0
 def test_fetch_result_from_thrift_disallow_directories(self):
     """Converting a FetchResult named "/etc/passwd" must be rejected.

     The expected error is a ``ValueError`` matching
     "must not include directory names"; ``self.basedir`` is the base
     directory.
     """
     filename = "/etc/passwd"
     expected_message = "must not include directory names"
     with self.assertRaisesRegex(ValueError, expected_message):
         thrift_value = ttypes.FetchResult(filename, [])
         types.thrift_fetch_result_to_arrow(thrift_value, self.basedir)