def call_render(render: Callable, request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Invoke a module's render_arrow_v1() and write its output.

    Converts the Thrift request into in-memory Arrow values, calls `render`,
    writes the resulting table to `request.output_filename` and returns a
    Thrift RenderResult holding the errors and JSON.

    Raise ValueError if the module returns anything other than an
    ArrowRenderResult (i.e., the module author wrote a buggy module).
    """
    basedir = Path(request.basedir)
    input_table = load_trusted_arrow_file(basedir / request.input_filename)

    # Load each referenced tab's table from disk.
    tab_outputs = {}
    for key, thrift_tab in request.tab_outputs.items():
        tab_outputs[key] = TabOutput(
            tab_name=thrift_tab.tab_name,
            table=load_trusted_arrow_file(basedir / thrift_tab.table_filename),
        )

    # Timestamps arrive as integer microseconds since the epoch.
    uploaded_files = {}
    for key, thrift_file in request.uploaded_files.items():
        uploaded_files[key] = UploadedFile(
            name=thrift_file.name,
            path=(basedir / thrift_file.filename),
            uploaded_at=datetime.datetime.utcfromtimestamp(
                thrift_file.uploaded_at_timestampus / 1000000.0
            ),
        )

    if request.fetch_result is None:
        fetch_result = None
    else:
        fetch_result = thrift_fetch_result_to_arrow(request.fetch_result, basedir)

    raw_result = render(
        input_table,
        thrift_json_object_to_pydict(request.params),
        settings=settings,
        tab_name=request.tab_name,
        tab_outputs=tab_outputs,
        uploaded_files=uploaded_files,
        fetch_result=fetch_result,
    )

    if not isinstance(raw_result, ArrowRenderResult):
        # Crash. The module author wrote a buggy module.
        raise ValueError(
            "render_arrow_v1() must return a cjwmodule.arrow.types.ArrowRenderResult"
        )

    with pa.ipc.RecordBatchFileWriter(
        basedir / request.output_filename, schema=raw_result.table.schema
    ) as writer:
        writer.write_table(raw_result.table)

    return ttypes.RenderResult(
        errors=[arrow_render_error_to_thrift(e) for e in raw_result.errors],
        json=pydict_to_thrift_json_object(raw_result.json),
    )
def _DEPRECATED_overwrite_to_fix_arrow_table_schema(
        path: Path, fallback_schema: pa.Schema) -> None:
    """Rewrite the Arrow file at `path` in place with per-field fixed types.

    Each field in the file's schema is passed through `__DEPRECATED_fix_field`
    together with the same-named field from `fallback_schema` (or None when
    `fallback_schema` has no field of that name). The table is then rewritten
    to `path` under the corrected schema. An empty file is left untouched.
    """
    # Zero-byte file: nothing to fix.
    if not path.stat().st_size:
        return
    table = load_trusted_arrow_file(path)
    untyped_schema = table.schema
    fields = [
        __DEPRECATED_fix_field(
            untyped_schema.field(i),
            # None when the fallback schema lacks this column name
            # (get_field_index returns -1 for "not found").
            (None
             if fallback_schema.get_field_index(name) == -1
             else fallback_schema.field(fallback_schema.get_field_index(name))),
        )
        for i, name in enumerate(untyped_schema.names)
    ]
    schema = pa.schema(fields)
    # Overwrite with new data
    #
    # We don't short-circuit by comparing schemas: two pa.Schema values
    # with different number formats evaluate as equal.
    #
    # We write a separate file to /var/tmp and then copy it: our sandbox
    # won't let us `rename(2)` in `path`'s directory.
    with tempfile_context(dir="/var/tmp") as rewrite_path:
        with pa.ipc.RecordBatchFileWriter(rewrite_path, schema) as writer:
            writer.write_table(pa.table(table.columns, schema=schema))
        shutil.copyfile(rewrite_path, path)
def test_execute_empty_tab(self):
    """A tab with no steps executes to an empty table and no columns."""
    workflow = Workflow.create_and_init()
    first_tab = workflow.tabs.first()
    empty_flow = TabFlow(Tab(first_tab.slug, first_tab.name), [])
    with self._execute(workflow, empty_flow, {}) as (result, path):
        self.assertEqual(result, StepResult(path, []))
        self.assertEqual(load_trusted_arrow_file(path), make_table())
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    """Invoke a pandas-style module `render()` and return a Thrift result.

    Loads the input Arrow table, converts it to a pandas DataFrame, prepares
    params and optional keyword arguments (based on which kwargs `render`
    actually accepts), coerces the module's return value to a ProcessResult,
    and writes the output to `request.output_filename`.
    """
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(module_spec,
                             thrift_json_object_to_pydict(request.params),
                             basedir, tab_outputs)
    # Only pass the optional kwargs the module's render() declares (or all of
    # them, if it takes **kwargs).
    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            # Empty or Parquet-format fetch files become a pandas
            # ProcessResult; anything else is passed through as a raw file.
            if (fetch_result_path.stat().st_size == 0
                    or cjwparquet.file_has_parquet_magic_number(
                        fetch_result_path)):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path,
                                                 errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)
    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)  # raise ValueError if invalid
    # Coerce whatever the module returned; fall back to the input's column
    # metadata for columns the module didn't change.
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns)
    pandas_result.truncate_in_place_if_too_big()
    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)
def _thrift_tab_output_to_pandas(tab_output: ttypes.TabOutput,
                                 basedir: Path) -> ptypes.TabOutput:
    """Load a Thrift TabOutput's table from disk as a pandas TabOutput."""
    arrow_table = load_trusted_arrow_file(basedir / tab_output.table_filename)
    render_columns = arrow_schema_to_render_columns(arrow_table.schema)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(arrow_table)
    return ptypes.TabOutput(tab_output.tab_name, render_columns, dataframe)
def test_execute_partial_cache_hit(self):
    """Only steps whose cached result is stale get re-rendered."""
    module_zipfile = create_module_zipfile(
        "mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh. Should not render.
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    # Cache entry written at the step's current delta => fresh.
    write_to_rendercache(workflow, step1, workflow.last_delta_id,
                         make_table(make_column("A", ["a"])))
    # step2: cached result is stale, so must be re-rendered
    step2 = tab.steps.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    # Cache entry written at an older delta => stale.
    write_to_rendercache(
        workflow,
        step2,
        workflow.last_delta_id - 1,
        make_table(make_column("B", ["b"])),
    )
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    new_table = make_table(make_column("C", ["c"]))
    with patch.object(Kernel, "render", side_effect=mock_render(new_table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(
                result,
                StepResult(path, [Column("C", ColumnType.Text())]))
            assert_arrow_table_equals(load_trusted_arrow_file(path), new_table)
        Kernel.render.assert_called_once()  # step2, not step1
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )
def test_execute_cache_hit(self):
    """When every step's cached result is fresh, nothing is rendered."""
    table1 = make_table(make_column("A", [1]))
    table2 = make_table(make_column("B", [2], format="${:,}"))
    module_zipfile = create_module_zipfile(
        "mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()

    # Both cache entries are written at the steps' current delta => fresh.
    step1 = tab.steps.create(
        order=0, slug="step-1", last_relevant_delta_id=workflow.last_delta_id)
    write_to_rendercache(workflow, step1, workflow.last_delta_id, table1)
    step2 = tab.steps.create(
        order=1, slug="step-2", last_relevant_delta_id=workflow.last_delta_id)
    write_to_rendercache(workflow, step2, workflow.last_delta_id, table2)

    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )

    # If execute ignored the cache, this table would show up in the result.
    unwanted_table = make_table(make_column("No", ["bad"]))
    with patch.object(Kernel, "render",
                      side_effect=mock_render(unwanted_table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(
                result,
                StepResult(
                    path, [Column("B", ColumnType.Number(format="${:,}"))]),
            )
            assert_arrow_table_equals(load_trusted_arrow_file(path), table2)
        Kernel.render.assert_not_called()
def test_execute_cache_miss(self):
    """With no cached results, every step must be rendered."""
    module_zipfile = create_module_zipfile(
        "mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    delta_id = workflow.last_delta_id
    step1 = tab.steps.create(
        order=0, slug="step-1", module_id_name="mod",
        last_relevant_delta_id=delta_id)
    step2 = tab.steps.create(
        order=1, slug="step-2", module_id_name="mod",
        last_relevant_delta_id=delta_id)
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    rendered_table = make_table(make_column("A", ["a"]))
    with patch.object(Kernel, "render",
                      side_effect=mock_render(rendered_table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("A", ColumnType.Text())]))
            assert_arrow_table_equals(
                load_trusted_arrow_file(path), rendered_table)
        self.assertEqual(Kernel.render.call_count, 2)  # both steps rendered
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )
def call_fetch(fetch: Callable, request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Call `fetch()` and validate the result.

    Module code may contain errors. This function and `fetch()` should strive
    to raise developer-friendly errors in the case of bugs -- including
    unexpected input.
    """
    # thrift => pandas
    basedir = Path(request.basedir)
    params: Dict[str, Any] = thrift_json_object_to_pydict(request.params)
    output_path = basedir / request.output_filename
    # Only pass the optional kwargs the module's fetch() declares (or all of
    # them, if it takes **kwargs).
    spec = inspect.getfullargspec(fetch)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "secrets" in kwonlyargs:
        kwargs["secrets"] = thrift_json_object_to_pydict(request.secrets)
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "get_input_dataframe" in kwonlyargs:
        # Lazy accessor: the input is only read from disk if the module asks.
        async def get_input_dataframe():
            if request.input_table_parquet_filename is None:
                return None
            else:
                return _parquet_to_pandas(
                    basedir / request.input_table_parquet_filename)
        kwargs["get_input_dataframe"] = get_input_dataframe
    if varkw or "output_path" in kwonlyargs:
        kwargs["output_path"] = output_path
    result = fetch(params, **kwargs)
    # Support async fetch() implementations.
    if asyncio.iscoroutine(result):
        result = asyncio.run(result)
    # Coerce the module's return value. Path-returning forms mean the module
    # already wrote its own output file; otherwise we coerce to a
    # ProcessResult and write the file ourselves.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(
            result[0], Path):
        # (path, errors) -- presumably path is output_path; TODO confirm
        errors = ptypes.coerce_RenderError_list(result[1])
    elif isinstance(result, Path):
        errors = []
    elif isinstance(result, list):
        errors = ptypes.coerce_RenderError_list(result)
    else:
        pandas_result = ptypes.ProcessResult.coerce(result)
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            if pandas_result.columns:
                hacky_result = pandas_result.to_arrow(arrow_path)
                table = load_trusted_arrow_file(arrow_path)
                cjwparquet.write(output_path, table)
                errors = hacky_result.errors
            else:
                # No columns: write an empty output file.
                output_path.write_bytes(b"")
                errors = pandas_result.errors
    return ttypes.FetchResult(
        filename=request.output_filename,
        errors=[arrow_render_error_to_thrift(e) for e in errors],
    )