Exemplo n.º 1
0
def load_cached_render_result(
    crr: CachedRenderResult, path: Path
) -> LoadedRenderResult:
    """Rebuild the LoadedRenderResult as it was passed to `cache_render_result()`.

    `path` is always (over)written:

        * If `crr` has no columns, `path` becomes a zero-byte file.
        * Otherwise, the cached Parquet data is rewritten to `path` as an
          Arrow file, and the returned table is read back from that file.

    The returned LoadedRenderResult is backed by `path`, an mmapped file on
    disk. The whole operation doesn't require much physical RAM.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.
    """
    cached_columns = crr.table_metadata.columns

    if not cached_columns:
        # Zero-column tables aren't written to cache: produce an empty file
        # and an empty table instead of downloading anything.
        path.write_bytes(b"")
        return LoadedRenderResult(
            path=path,
            table=pa.table({}),
            columns=[],
            errors=crr.errors,
            json=crr.json,
        )

    # downloaded_parquet_file() raises CorruptCacheError
    with downloaded_parquet_file(crr) as parquet_path:
        try:
            # raises ArrowIOError
            parquet_table = read_parquet_as_arrow(parquet_path, cached_columns)
        except pa.ArrowIOError as err:
            raise CorruptCacheError from err

        # We don't expect errors writing to disk: this shouldn't consume RAM
        with pa.ipc.RecordBatchFileWriter(path, parquet_table.schema) as writer:
            writer.write_table(parquet_table)

        # Re-read the table from the file we just wrote, so that `path` and
        # the returned table are equivalent. Don't validate the file: we know
        # what it contains.
        with pa.ipc.open_file(path) as reader:
            loaded_table = reader.read_all()

        return LoadedRenderResult(
            path=path,
            table=loaded_table,
            columns=cached_columns,
            errors=crr.errors,
            json=crr.json,
        )
Exemplo n.º 2
0
    def test_metadata_does_not_require_file_read(self):
        """Table metadata is readable from the DB after the cached file is gone."""
        expected_columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
            Column("D", ColumnType.Date("month")),
        ]
        arrow_columns = (
            make_column("A", [1], format="{:,.2f}"),
            make_column("B", [datetime.datetime(2021, 4, 13)]),
            make_column("C", ["c"]),
            make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
        )
        with arrow_table_context(*arrow_columns) as (path, table):
            to_cache = LoadedRenderResult(
                path=path,
                table=table,
                columns=expected_columns,
                errors=[],
                json={},
            )
            cache_render_result(self.workflow, self.step, 1, to_cache)

        # Delete from disk entirely, to prove we did not read.
        cached_key = crr_parquet_key(self.step.cached_render_result)
        s3.remove(BUCKET, cached_key)

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        reloaded_step = Step.objects.get(id=self.step.id)
        reloaded_crr = reloaded_step.cached_render_result

        self.assertEqual(
            reloaded_crr.table_metadata, TableMetadata(1, expected_columns)
        )
Exemplo n.º 3
0
 def test_invalid_parquet_is_corrupt_cache_error(self):
     """Opening a cache entry whose stored bytes aren't Parquet raises CorruptCacheError."""
     with arrow_table_context(make_column("A", ["x"])) as (path, table):
         to_cache = LoadedRenderResult(
             path=path,
             table=table,
             columns=[Column("A", ColumnType.Text())],
             errors=[],
             json={},
         )
         cache_render_result(self.workflow, self.step, 1, to_cache)

     # Overwrite the stored file with bytes that are not a Parquet file.
     cached = self.step.cached_render_result
     s3.put_bytes(BUCKET, crr_parquet_key(cached), b"NOT PARQUET")

     with tempfile_context(), self.assertRaises(CorruptCacheError):
         with open_cached_render_result(cached):
             pass
Exemplo n.º 4
0
 def test_read_cached_render_result_slice_as_text_timestamp(self):
     """A cached nanosecond timestamp is rendered as ISO 8601 text in CSV output."""
     timestamp_column = make_column(
         "A", [2134213412341232967, None], pa.timestamp("ns")
     )
     with arrow_table_context(timestamp_column) as (path, table):
         to_cache = LoadedRenderResult(
             path=path,
             table=table,
             columns=[Column("A", ColumnType.Timestamp())],
             errors=[],
             json={},
         )
         cache_render_result(self.workflow, self.step, 1, to_cache)

     cached = self.step.cached_render_result
     csv_text = read_cached_render_result_slice_as_text(
         cached, "csv", range(2), range(3)
     )
     self.assertEqual(csv_text, "A\n2037-08-18T13:03:32.341232967Z\n")
Exemplo n.º 5
0
    def test_clear(self):
        """Clearing a step's cached result drops the DB value and the stored file."""
        with arrow_table_context(make_column("A", [1])) as (path, table):
            to_cache = LoadedRenderResult(
                path=path,
                table=table,
                columns=[Column("A", ColumnType.Number(format="{:,}"))],
                errors=[],
                json={},
            )
            cache_render_result(self.workflow, self.step, 1, to_cache)

        # Remember the storage key before clearing, so we can check the file
        # afterwards.
        stored_key = crr_parquet_key(self.step.cached_render_result)
        clear_cached_render_result_for_step(self.step)

        reloaded_step = Step.objects.get(id=self.step.id)
        self.assertIsNone(reloaded_step.cached_render_result)

        file_still_exists = s3.exists(BUCKET, stored_key)
        self.assertFalse(file_still_exists)
Exemplo n.º 6
0
    def test_cache_render_result(self):
        """A cached result is retrievable from the step and from a fresh DB read."""
        with arrow_table_context(make_column("A", [1])) as (table_path, table):
            error_with_quick_fix = RenderError(
                I18nMessage("e1", {"text": "hi"}, None),
                [
                    QuickFix(
                        I18nMessage("q1", {"var": 2}, None),
                        QuickFixAction.PrependStep("filter", {"a": "x"}),
                    )
                ],
            )
            plain_error = RenderError(I18nMessage("e2", {}, None), [])
            to_cache = LoadedRenderResult(
                path=table_path,
                table=table,
                columns=[Column("A", ColumnType.Number(format="{:,}"))],
                errors=[error_with_quick_fix, plain_error],
                json={"foo": "bar"},
            )
            cache_render_result(self.workflow, self.step, 1, to_cache)

        in_memory = self.step.cached_render_result
        self.assertEqual(in_memory.step_id, self.step.id)
        self.assertEqual(in_memory.delta_id, 1)

        expected_key = f"wf-{self.workflow.id}/wfm-{self.step.id}/delta-1.dat"
        self.assertEqual(crr_parquet_key(in_memory), expected_key)

        # Reading completely freshly from the DB should give the same thing
        reloaded_step = Step.objects.get(id=self.step.id)
        from_db = reloaded_step.cached_render_result
        self.assertEqual(from_db, in_memory)

        with open_cached_render_result(from_db) as reopened:
            expected_table = make_table(make_column("A", [1], format="{:,}"))
            assert_arrow_table_equals(reopened.table, expected_table)

            expected_columns = [Column("A", ColumnType.Number(format="{:,}"))]
            self.assertEqual(reopened.columns, expected_columns)
Exemplo n.º 7
0
def write_to_rendercache(
    workflow: Workflow,
    step: Step,
    delta_id: int,
    table: pa.Table,
    errors: List[RenderError] = [],
    json: Dict[str, Any] = {},
) -> None:
    """Cache `table` (plus `errors` and `json`) as `step`'s render result.

    The result is cached under the caller-provided `delta_id`. While caching,
    `step.last_relevant_delta_id` is temporarily set to `delta_id`; the
    previous value is restored afterwards, even if caching raises.

    Args:
        workflow: Workflow passed through to `cache_render_result()`.
        step: Step whose render result is being cached.
        delta_id: Delta ID to cache the result under.
        table: Arrow table to cache.
        errors: Render errors to store with the result (default: none).
        json: JSON payload to store with the result (default: empty).
    """
    with arrow_table_context(table) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=read_columns(table, full=False),
            # Pass copies: the signature's default `[]` / `{}` objects are
            # shared across every call, so handing them out directly would let
            # one result's mutation leak into all later defaulted calls.
            errors=list(errors),
            json=dict(json),
        )

        # use the caller-provided delta ID: no assertion
        old_last_relevant_delta_id = step.last_relevant_delta_id
        step.last_relevant_delta_id = delta_id
        try:
            cache_render_result(workflow, step, delta_id, result)
        finally:
            step.last_relevant_delta_id = old_last_relevant_delta_id
Exemplo n.º 8
0
async def _render_step(
    chroot_context: ChrootContext,
    workflow: Workflow,
    step: Step,
    module_zipfile: Optional[ModuleZipfile],
    raw_params: Dict[str, Any],
    tab_name: str,
    input_path: Path,
    input_table_columns: List[Column],
    tab_results: Dict[Tab, Optional[StepResult]],
    output_path: Path,
) -> LoadedRenderResult:
    """Prepare and call `step`'s `render()`; return a LoadedRenderResult.

    Returns early, without rendering, when:

        * `step` is not the first step and its input has no columns
          ("unreachable" result);
        * `module_zipfile` is None (error result);
        * preparing the render raises NoLoadedDataError, TabCycleError,
          TabOutputUnreachableError or PromptingError (error result).

    The actual render runs in a background thread so the event loop can process
    other events. A ModuleError raised by the render becomes an error result.
    """

    def single_error_result(message) -> LoadedRenderResult:
        # Every failure path here reports exactly one RenderError.
        return LoadedRenderResult.from_errors(
            output_path, errors=[RenderError(message)]
        )

    if step.order > 0 and not input_table_columns:
        return LoadedRenderResult.unreachable(output_path)

    if module_zipfile is None:
        return single_error_result(
            trans(
                "py.renderer.execute.step.noModule",
                default="Please delete this step: an administrator uninstalled its code.",
            )
        )

    basedir = output_path.parent

    # exit_stack: stuff that gets deleted when the render is done
    with contextlib.ExitStack() as exit_stack:
        try:
            # raise UnneededExecution, TabCycleError, TabOutputUnreachableError,
            # NoLoadedDataError, PromptingError
            prepared = await _execute_step_pre(
                basedir=basedir,
                exit_stack=exit_stack,
                workflow=workflow,
                step=step,
                module_zipfile=module_zipfile,
                raw_params=raw_params,
                input_path=input_path,
                input_table_columns=input_table_columns,
                tab_results=tab_results,
            )
        except NoLoadedDataError:
            return single_error_result(
                trans(
                    "py.renderer.execute.step.NoLoadedDataError",
                    default="Please Add Data before this step.",
                )
            )
        except TabCycleError:
            return single_error_result(
                trans(
                    "py.renderer.execute.step.TabCycleError",
                    default="The chosen tab depends on this one. Please choose another tab.",
                )
            )
        except TabOutputUnreachableError:
            return single_error_result(
                trans(
                    "py.renderer.execute.step.TabOutputUnreachableError",
                    default="The chosen tab has no output. Please select another one.",
                )
            )
        except PromptingError as err:
            return LoadedRenderResult.from_errors(
                output_path, errors=err.as_render_errors()
            )

        fetch_result, params, tab_outputs, uploaded_files = prepared

        # Render may take a while. run_in_executor to push that slowdown to a
        # thread and keep our event loop responsive.
        loop = asyncio.get_event_loop()

        try:
            render_call = partial(
                invoke_render,
                module_zipfile,
                chroot_context=chroot_context,
                basedir=basedir,
                input_filename=input_path.name,
                params=params,
                tab_name=tab_name,
                tab_outputs=tab_outputs,
                uploaded_files=uploaded_files,
                fetch_result=fetch_result,
                output_filename=output_path.name,
            )
            return await loop.run_in_executor(None, render_call)
        except ModuleError as err:
            # Truncate the output file before reporting the failure.
            output_path.write_bytes(b"")  # SECURITY
            return single_error_result(
                trans(
                    "py.renderer.execute.step.user_visible_bug_during_render",
                    default="Something unexpected happened. We have been notified and are "
                    "working to fix it. If this persists, contact us. Error code: {message}",
                    arguments={"message": format_for_user_debugging(err)},
                )
            )
Exemplo n.º 9
0
def invoke_render(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    input_filename: Optional[str],
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[FetchResult],
    tab_outputs: Dict[str, TabOutput],
    uploaded_files: Dict[str, UploadedFile],
    output_filename: str,
) -> LoadedRenderResult:
    """Run the module's `render()` in the kernel and load the file it wrote.

    The kernel writes its output to `basedir / output_filename`. A zero-byte
    output file means "no output": the result holds an empty table and no
    columns. Otherwise the file is loaded and validated as untrusted Arrow
    data.

    Args:
        module_zipfile: Module whose code is compiled and rendered.
        chroot_context: Passed through to the kernel.
        basedir: Directory containing the input and output files.
        input_filename: Name of the input file within `basedir`, or None.
        params: Passed through to the kernel.
        tab_name: Passed through to the kernel.
        fetch_result: Passed through to the kernel.
        tab_outputs: Passed through to the kernel.
        uploaded_files: Passed through to the kernel.
        output_filename: Name of the output file within `basedir`.

    Returns:
        A LoadedRenderResult backed by the output file, carrying the errors
        and JSON reported by the kernel.

    Raise `ModuleError` on error. (This is usually the module author's fault.)
    That includes `ModuleExitedError` when the module wrote invalid data.

    Log any ModuleError. Also log success.

    This synchronous method can be slow for complex modules or large
    datasets. Consider calling it from an executor.
    """
    # Monotonic clock: the elapsed time we log must not be skewed by
    # wall-clock adjustments that happen during a long render.
    time1 = time.monotonic()
    begin_status_format = "%s:render() (%0.1fMB input)"
    begin_status_args = (
        module_zipfile.path.name,
        (
            (basedir / input_filename).stat().st_size / 1024 / 1024
            if input_filename is not None
            else 0
        ),
    )
    logger.info(begin_status_format + " begin", *begin_status_args)
    # Placeholder: stays "???" in the final log line if we exit through an
    # exception that is not a ModuleError.
    status = "???"
    try:
        result = cjwstate.modules.kernel.render(
            module_zipfile.compile_code_without_executing(),
            chroot_context=chroot_context,
            basedir=basedir,
            input_filename=input_filename,
            params=params,
            tab_name=tab_name,
            fetch_result=fetch_result,
            tab_outputs=tab_outputs,
            uploaded_files=uploaded_files,
            output_filename=output_filename,
        )

        output_path = basedir / output_filename
        st_size = output_path.stat().st_size
        if st_size == 0:
            # Empty file: the module produced no table.
            table = pa.table({})
            columns = []
            status = "(no output)"
        else:
            try:
                table, columns = load_untrusted_arrow_file_with_columns(output_path)
                status = "(%drows, %dcols, %0.1fMB)" % (
                    table.num_rows,
                    table.num_columns,
                    st_size / 1024 / 1024,
                )
            except ValidateError as err:
                # Chain explicitly so the validation failure is recorded as
                # the direct cause of the module error.
                raise ModuleExitedError(
                    module_zipfile.path.name,
                    0,
                    "Module wrote invalid data: %s" % str(err),
                ) from err
        return LoadedRenderResult(
            path=output_path,
            table=table,
            columns=columns,
            errors=result.errors,
            json=result.json,
        )
    except ModuleError as err:
        logger.exception("Exception in %s:render", module_zipfile.path.name)
        status = type(err).__name__
        raise
    finally:
        # Always log the outcome and duration, on success and failure alike.
        time2 = time.monotonic()

        logger.info(
            begin_status_format + " => %s in %dms",
            *begin_status_args,
            status,
            int((time2 - time1) * 1000),
        )