def test_delete_step(self):
    """Deleting a step must also delete its cached render result from s3."""
    cached = RenderResult(
        arrow_table({"A": [1]}),
        [RenderError(I18nMessage("X", {}, None), [])],
        {},
    )
    cache_render_result(self.workflow, self.step, 1, cached)
    key = crr_parquet_key(self.step.cached_render_result)
    self.step.delete()
    self.assertFalse(s3.exists(BUCKET, key))
def test_clean_tabs_preserve_ordering(self):
    """clean_value() must return tabs in RenderContext order, not input order."""
    tab2 = Tab("tab-2", "Tab 2")
    tab3 = Tab("tab-3", "Tab 3")
    tab2_output = arrow_table({"B": [1]})
    tab3_output = arrow_table({"C": [1]})
    # RenderContext's dict ordering determines desired tab order.
    # (Python 3.7 spec: dict is ordered in insertion order. CPython 3.6
    # and PyPy 7 do this, too.)  tab3 is inserted first, so it comes first.
    context = self._render_context(
        tab_results={
            tab3: RenderResult(tab3_output),
            tab2: RenderResult(tab2_output),
        }
    )
    # Supply wrongly-ordered tabs; renderprep should reorder them.
    cleaned = clean_value(ParamDType.Multitab(), ["tab-2", "tab-3"], context)
    self.assertEqual(["tab-3", "tab-2"], [t.tab.slug for t in cleaned])
def test_render_with_parquet_fetch_result(self):
    """A Parquet-backed FetchResult is handed to render() as fetch_result."""

    def render(*args, fetch_result):
        # Echo the fetch_result so we can see exactly what arrived.
        return fetch_result

    with parquet_file({"A": ["fetched"]}, dir=self.basedir) as parquet_path:
        actual = self._test_render(render, fetch_result=FetchResult(parquet_path))
        expected = RenderResult(arrow_table({"A": ["fetched"]}))
        assert_render_result_equals(actual, expected)
def test_execute_cache_hit(self, fake_module):
    """If every step already has a fresh cached result, no module runs."""
    workflow = Workflow.objects.create()
    tab = workflow.tabs.create(position=0)
    delta = InitWorkflowCommand.create(workflow)
    # Two steps, both pre-populated with cached render results.
    for order, (slug, columns) in enumerate(
        [("step-1", {"A": [1]}), ("step-2", {"B": [2]})]
    ):
        step = tab.wf_modules.create(
            order=order, slug=slug, last_relevant_delta_id=delta.id
        )
        cache_render_result(
            workflow, step, delta.id, RenderResult(arrow_table(columns))
        )
    self._execute(workflow)
    fake_module.assert_not_called()
def test_empty_json(self):
    """404 with an explanatory message when the cached result has no JSON."""
    cache_render_result(
        self.workflow,
        self.step,
        1,
        RenderResult(arrow_table({"A": ["a", "b"]})),
    )
    response = self._request()
    self.assertEqual(status.NOT_FOUND, response.status_code)
    self.assertEqual(
        {"error": "render result has no JSON"}, json.loads(response.content)
    )
def test_render_with_non_parquet_fetch_result(self):
    """A non-Parquet fetch file is exposed to render() as a raw path."""

    def render(table, params, *, fetch_result):
        # Read the raw bytes back out, proving the path is usable.
        return pd.DataFrame({"A": [fetch_result.path.read_text()]})

    with tempfile_context(dir=self.basedir) as fetched_path:
        fetched_path.write_bytes(b"abcd")
        actual = self._test_render(render, fetch_result=FetchResult(fetched_path))
        assert_render_result_equals(
            actual, RenderResult(arrow_table({"A": ["abcd"]}))
        )
def test_invalid_parquet_is_corrupt_cache_error(self):
    """Garbage bytes where Parquet should be raise CorruptCacheError."""
    cache_render_result(
        self.workflow,
        self.wf_module,
        self.delta.id,
        RenderResult(arrow_table({"A": [1]})),
    )
    crr = self.wf_module.cached_render_result
    # Clobber the cached file with bytes that are not Parquet at all.
    minio.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
    with tempfile_context() as arrow_path:
        with self.assertRaises(CorruptCacheError):
            load_cached_render_result(crr, arrow_path)
def _wrap_render_errors(render_call):
    """Invoke render_call(); convert any ModuleError into a user-facing result.

    Returns render_call()'s return value on success; on ModuleError, returns a
    RenderResult built from a generic apology plus a debugging code.
    """
    try:
        return render_call()
    except ModuleError as err:
        message = (
            "Something unexpected happened. We have been notified and are "
            "working to fix it. If this persists, contact us. Error code: "
            + format_for_user_debugging(err)
        )
        return RenderResult.from_deprecated_error(message)
def test_delete_wfmodule(self):
    """Deleting a wf_module must remove its cached Parquet file from minio."""
    cached = RenderResult(
        arrow_table({"A": [1]}), [RenderError(I18nMessage("X", []), [])], {}
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, cached)
    key = crr_parquet_key(self.wf_module.cached_render_result)
    self.wf_module.delete()
    self.assertFalse(minio.exists(BUCKET, key))
def test_email_delta_ignore_corrupt_cache_error(self, email_delta, read_cache):
    """Do not email a notification when the old cached result can't be read.

    ``read_cache`` is patched to raise CorruptCacheError; ``email_delta`` is
    the patched notifier whose non-invocation we assert.
    """
    read_cache.side_effect = rendercache.CorruptCacheError
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    wf_module = tab.wf_modules.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        wf_module,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    # Make the cache stale so execute_wfmodule re-renders and tries to
    # compare against the (unreadable) old result.
    wf_module.last_relevant_delta_id = workflow.last_delta_id
    wf_module.save(update_fields=["last_relevant_delta_id"])
    with arrow_table_context({"A": [2]}) as table2:

        def render(*args, **kwargs):
            return RenderResult(table2)

        with self._stub_module(render):
            # Reading the corrupt cache logs an error; execution continues.
            with self.assertLogs(level=logging.ERROR):
                self.run_with_async_db(
                    execute_wfmodule(
                        self.chroot_context,
                        workflow,
                        wf_module,
                        {},
                        Tab(tab.slug, tab.name),
                        RenderResult(),
                        {},
                        self.output_path,
                    )
                )
    email_delta.assert_not_called()
def test_email_delta(self, email_delta):
    """Email a delta when a re-render changes a step's output.

    The cache holds {"A": [1]}; the module renders {"A": [2]}, so the
    patched ``email_delta`` must be called with a delta tying the change to
    the owner, workflow and step.
    """
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    rendercache.cache_render_result(
        workflow,
        step,
        workflow.last_delta_id - 1,
        RenderResult(arrow_table({"A": [1]})),
    )
    # Make the cached result stale so execute_step re-renders.
    step.last_relevant_delta_id = workflow.last_delta_id
    step.save(update_fields=["last_relevant_delta_id"])
    module_zipfile = create_module_zipfile(
        "x",
        spec_kwargs={"loads_data": True},
        python_code=(
            'import pandas as pd\ndef render(table, params): return pd.DataFrame({"A": [2]})'
        ),
    )
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            execute_step(
                self.chroot_context,
                workflow,
                step,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
    email_delta.assert_called()
    delta = email_delta.call_args[0][0]
    self.assertEqual(delta.user, workflow.owner)
    self.assertEqual(delta.workflow, workflow)
    self.assertEqual(delta.step, step)
def test_report_module_error(self):
    """A crash inside the module's render() surfaces as a user-visible error.

    The module code calls an undefined name, so the child process exits
    with a NameError; execute_step must wrap that in an i18n RenderError.
    """
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    module_zipfile = create_module_zipfile(
        "x",
        spec_kwargs={"loads_data": True},
        python_code="def render(table, params):\n undefined()",
    )
    with self.assertLogs(level=logging.INFO):
        result = self.run_with_async_db(
            execute_step(
                self.chroot_context,
                workflow,
                step,
                module_zipfile,
                {},
                Tab(tab.slug, tab.name),
                RenderResult(),
                {},
                self.output_path,
            )
        )
    assert_render_result_equals(
        result,
        RenderResult(
            errors=[
                RenderError(
                    I18nMessage(
                        "py.renderer.execute.step.user_visible_bug_during_render",
                        {
                            "message": "exit code 1: NameError: name 'undefined' is not defined"
                        },
                        None,
                    )
                )
            ]
        ),
    )
def test_resume_without_rerunning_unneeded_renders(self, fake_load_module):
    """Steps whose cached result is still fresh must not be re-rendered.

    wf_module1 has a valid cache; wf_module2 has none. After _execute(),
    the mocked render must have run exactly once (for wf_module2) and its
    result must be cached.
    """
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    delta_id = workflow.last_delta_id
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []}
    )
    # wf_module1: has a valid, cached result
    wf_module1 = tab.wf_modules.create(
        order=0,
        slug="step-1",
        last_relevant_delta_id=delta_id,
        module_id_name="mod",
    )
    cache_render_result(
        workflow, wf_module1, delta_id, RenderResult(arrow_table({"A": [1]}))
    )
    # wf_module2: has no cached result (must be rendered)
    wf_module2 = tab.wf_modules.create(
        order=1,
        slug="step-2",
        last_relevant_delta_id=delta_id,
        module_id_name="mod",
    )
    fake_loaded_module = Mock(LoadedModule)
    fake_loaded_module.migrate_params.return_value = {}
    fake_load_module.return_value = fake_loaded_module
    result2 = RenderResult(arrow_table({"A": [2]}))
    fake_loaded_module.render.return_value = result2
    self._execute(workflow)
    fake_loaded_module.render.assert_called_once()  # only with module2
    wf_module2.refresh_from_db()
    with open_cached_render_result(wf_module2.cached_render_result) as actual:
        assert_render_result_equals(actual, result2)
def test_start_row_after_end_row(self):
    """startrow past endrow yields an empty row list, not an error."""
    cache_render_result(
        self.workflow,
        self.step2,
        self.step2.last_relevant_delta_id,
        RenderResult(arrow_table({"A": [0, 1, 2, 3, 4]})),
    )
    response = self._request_step(self.step2, "?startrow=3&endrow=1")
    self.assertEqual(status.OK, response.status_code)
    self.assertEqual([], read_streaming_json(response))
def test_clear(self):
    """Clearing the cache wipes both the DB pointer and the Parquet file."""
    cache_render_result(
        self.workflow,
        self.wf_module,
        self.delta.id,
        RenderResult(arrow_table({"A": [1]})),
    )
    key = crr_parquet_key(self.wf_module.cached_render_result)
    clear_cached_render_result_for_wf_module(self.wf_module)
    # Re-read from the database: the cached pointer must be gone.
    refreshed = WfModule.objects.get(id=self.wf_module.id)
    self.assertIsNone(refreshed.cached_render_result)
    self.assertFalse(minio.exists(BUCKET, key))
def test_render_arrow_table_empty_output_table_is_empty(self):
    """Writing a zero-column Arrow file yields an empty RenderResult."""

    # The param name "arrow_table" is a special case
    def render(arrow_table, params, output_path, **kwargs):
        empty = pa.table({})
        with pa.ipc.RecordBatchFileWriter(output_path, empty.schema) as writer:
            writer.write_table(empty)

    with ModuleTestEnv(render=render) as env:
        outcome = env.call_render(make_table(), {})
        self.assertEqual(RenderResult(), outcome.result)
        self.assertEqual(make_table(), outcome.read_table())
def render_arrow(
    table, params, tab_name, fetch_result: Optional[FetchResult], output_path: Path
) -> RenderResult:
    """Turn fetch_result into a RenderResult, writing any table to output_path."""
    if fetch_result is None:
        # Nothing fetched yet: present an empty table.
        return RenderResult(ArrowTable())

    has_parquet = fetch_result.path is not None and (
        parquet.file_has_parquet_magic_number(fetch_result.path)
    )
    if has_parquet:
        # Deprecated files: we used to parse in fetch() and store the result
        # as Parquet. Now we've lost the original file data, and we need to
        # support our oldest users.
        return _render_deprecated_parquet(
            fetch_result.path, fetch_result.errors, output_path, params
        )

    if fetch_result.errors:
        # We've never stored errors+data. If there are errors, assume
        # there's no data.
        return RenderResult(ArrowTable(), fetch_result.errors)

    assert not fetch_result.errors  # we've never stored errors+data.
    return _render_file(fetch_result.path, output_path, params)
def render(self, params: Dict[str, Any], fetch_result: Optional[FetchResult]):
    """Run the module-level render() and yield the resulting RenderResult.

    The output Arrow file lives only for the duration of the ``yield``.
    """
    with tempfile_context(prefix="output-", suffix=".arrow") as output_path:
        # NOTE: `render` here resolves to the module-level function, not
        # this method.
        error_args = render(
            ArrowTable(), params, output_path, fetch_result=fetch_result
        )
        table = ArrowTable.from_arrow_file_with_inferred_metadata(output_path)
        yield RenderResult(
            table, [RenderError(I18nMessage(*args)) for args in error_args]
        )
def test_clip_out_of_bounds(self):
    """Out-of-range startrow/endrow are clipped to the table's bounds."""
    cache_render_result(
        self.workflow,
        self.step2,
        self.step2.last_relevant_delta_id,
        RenderResult(arrow_table({"A": [0, 1]})),
    )
    # index out of bounds should clip
    response = self._request_step(self.step2, "?startrow=-1&endrow=500")
    self.assertEqual(200, response.status_code)
    self.assertEqual([{"A": 0}, {"A": 1}], read_streaming_json(response))
def test_deprecated_current_table_csv(self):
    """The legacy /public/moduledata URL still streams the cached CSV."""
    cache_render_result(
        self.workflow,
        self.step2,
        2,
        RenderResult(arrow_table({"A": ["a", "b"]})),
    )
    url = f"/public/moduledata/live/{self.step2.id}.csv"
    response = self.client.get(url)
    self.assertEqual(200, response.status_code)
    body = b"".join(response.streaming_content)
    self.assertEqual(b"A\na\nb", body)
def test_render_fetch_error(self):
    """Errors stored on the FetchResult pass through to the RenderResult.

    Bug fix: the fixture previously built ``[RenderResult(I18nMessage(...))]``,
    wrapping the message in a RenderResult instead of a RenderError. A
    FetchResult's errors are RenderError values (as everywhere else in this
    suite), and ``self.assertEqual(result.errors, errors)`` can only pass
    when the fixture uses the same type render_arrow() emits.
    """
    errors = [RenderError(I18nMessage("x", {"y": "z"}))]
    with tempfile_context() as empty_path:
        result = render_arrow(
            ArrowTable(),
            P(),
            "tab-x",
            FetchResult(empty_path, errors),
            self.output_path,
        )
        # An errors-only fetch produces an empty table plus the same errors.
        assert_arrow_table_equals(result.table, ArrowTable())
        self.assertEqual(result.errors, errors)
def render(
    self,
    compiled_module: CompiledModule,
    chroot_context: ChrootContext,
    basedir: Path,
    input_table: ArrowTable,
    params: Params,
    tab: Tab,
    fetch_result: Optional[FetchResult],
    output_filename: str,
) -> RenderResult:
    """
    Run the module's `render_thrift()` function and return its result.

    Raise ModuleError if the module has a bug.
    """
    chroot_dir = chroot_context.chroot.root
    # Translate basedir into the path the chrooted module will see
    # (rooted at "/").
    basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
    request = ttypes.RenderRequest(
        str(basedir_seen_by_module),
        input_table.to_thrift(),
        params.to_thrift(),
        tab.to_thrift(),
        None if fetch_result is None else fetch_result.to_thrift(),
        output_filename,
    )
    try:
        # Only output_filename is writable by the module for the duration
        # of the call.
        with chroot_context.writable_file(basedir / output_filename):
            result = self._run_in_child(
                chroot_dir=chroot_dir,
                network_config=pyspawner.NetworkConfig(),  # TODO disallow networking
                compiled_module=compiled_module,
                timeout=self.render_timeout,
                result=ttypes.RenderResult(),
                function="render_thrift",
                args=[request],
            )
    finally:
        # Always revert any stray filesystem edits the module made.
        chroot_context.clear_unowned_edits()

    if result.table.filename and result.table.filename != output_filename:
        raise ModuleExitedError(0, "Module wrote to wrong output file")

    try:
        # RenderResult.from_thrift() verifies all filenames passed by the
        # module are in the directory the module has access to. It assumes
        # the Arrow file (if there is one) is untrusted, so it can raise
        # ValidateError
        render_result = RenderResult.from_thrift(result, basedir)
    except ValidateError as err:
        raise ModuleExitedError(
            0, "Module produced invalid data: %s" % str(err)
        )
    return render_result
def test_wrong_column(self):
    """Requesting a column missing from the cached table gives a 404."""
    cache_render_result(
        self.workflow,
        self.step1,
        self.step1.last_relevant_delta_id,
        RenderResult(arrow_table({"A": ["a", "b"]})),
    )
    response = self._request("B")
    self.assertEqual(status.NOT_FOUND, response.status_code)
    self.assertEqual(
        {"error": 'column "B" not found'}, json.loads(response.content)
    )
def test_email_no_delta_when_errors_stay_the_same(self, email_delta):
    """Do not email a notification when the re-rendered error is unchanged.

    The cache holds a "noModule" error; rendering with module_zipfile=None
    reproduces the same error, so the patched ``email_delta`` must not fire.
    """
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="x",
        last_relevant_delta_id=workflow.last_delta_id - 1,
        notifications=True,
    )
    # We need to actually populate the cache to set up the test. The code
    # under test will only try to open the render result if the database
    # says there's something there.
    rendercache.cache_render_result(
        workflow,
        step,
        workflow.last_delta_id - 1,
        RenderResult(
            errors=[
                RenderError(
                    I18nMessage("py.renderer.execute.step.noModule", {}, None)
                )
            ]
        ),
    )
    # Make the cache stale so execute_step re-renders.
    step.last_relevant_delta_id = workflow.last_delta_id
    step.save(update_fields=["last_relevant_delta_id"])
    self.run_with_async_db(
        execute_step(
            self.chroot_context,
            workflow,
            step,
            None,  # module_zipfile
            {},
            Tab(tab.slug, tab.name),
            RenderResult(),
            {},
            self.output_path,
        )
    )
    email_delta.assert_not_called()  # error is the same error
def test_execute_cache_hit(self):
    """When every step has a fresh cached result, Kernel.render is skipped.

    The patched render would produce {"No": ["bad"]}; the final result must
    instead be step2's cached {"B": [2]}, proving no render ever ran.
    """
    module_zipfile = create_module_zipfile("mod")
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step1 = tab.wf_modules.create(
        order=0, slug="step-1", last_relevant_delta_id=workflow.last_delta_id
    )
    rendercache.cache_render_result(
        workflow,
        step1,
        workflow.last_delta_id,
        RenderResult(arrow_table({"A": [1]})),
    )
    step2 = tab.wf_modules.create(
        order=1, slug="step-2", last_relevant_delta_id=workflow.last_delta_id
    )
    rendercache.cache_render_result(
        workflow,
        step2,
        workflow.last_delta_id,
        RenderResult(arrow_table({"B": [2]})),
    )
    tab_flow = TabFlow(
        tab.to_arrow(),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    with patch.object(
        Kernel, "render", side_effect=mock_render({"No": ["bad"]})
    ):
        with self._execute(workflow, tab_flow, {}) as result:
            assert_render_result_equals(
                result, RenderResult(arrow_table({"B": [2]}), [])
            )
def test_current_table_csv(self):
    """The current-result-table.csv endpoint streams the cached table."""
    cache_render_result(
        self.workflow,
        self.step2,
        2,
        RenderResult(arrow_table({"A": ["a", "b"]})),
    )
    url = f"/workflows/{self.workflow.id}/steps/step-2/current-result-table.csv"
    response = self.client.get(url)
    self.assertEqual(200, response.status_code)
    self.assertEqual(b"A\na\nb", b"".join(response.streaming_content))
def test_json(self):
    """A tile request returns the cached table's rows as JSON."""
    cache_render_result(
        self.workflow,
        self.step2,
        2,
        RenderResult(arrow_table({"A": ["a", "b"]})),
    )
    url = f"/workflows/{self.workflow.id}/tiles/step-2/delta-2/0,0.json"
    response = self.client.get(url)
    self.assertEqual(200, response.status_code)
    self.assertEqual({"rows": [["a"], ["b"]]}, json.loads(response.content))
def test_tile_row_out_of_bounds(self):
    """A tile row index past the table's end yields 404 + error JSON."""
    cache_render_result(
        self.workflow,
        self.step2,
        2,
        RenderResult(arrow_table({"A": ["a", "b"]})),
    )
    url = f"/workflows/{self.workflow.id}/tiles/step-2/delta-2/1,0.json"
    response = self.client.get(url)
    self.assertEqual(status.NOT_FOUND, response.status_code)
    self.assertEqual(
        {"error": "tile out of bounds"}, json.loads(response.content)
    )
def test_json(self):
    """When the cached render result stores JSON, serve it verbatim."""
    cache_render_result(
        self.workflow,
        self.step,
        1,
        RenderResult(arrow_table({"A": ["a", "b"]}), json={"hello": "world!"}),
    )
    response = self._request()
    self.assertEqual(status.OK, response.status_code)
    self.assertEqual({"hello": "world!"}, json.loads(response.content))
def test_wrong_delta_id(self):
    """Requesting a delta id whose rows aren't cached returns an empty list."""
    cache_render_result(
        self.workflow,
        self.step2,
        self.step2.last_relevant_delta_id,
        RenderResult(arrow_table({"A": [0, 1, 2, 3, 4]})),
    )
    # Bump the step past the cached delta so the cache no longer matches.
    self.step2.last_relevant_delta_id = 99
    self.step2.save(update_fields=["last_relevant_delta_id"])
    response = self._request_slug_delta(self.step2.slug, 99)
    self.assertEqual(status.OK, response.status_code)
    self.assertEqual([], json.loads(response.content))