def test_metadata_does_not_require_file_read(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Timestamp()),
        Column("C", ColumnType.Text()),
        Column("D", ColumnType.Date("month")),
    ]
    with arrow_table_context(
        make_column("A", [1], format="{:,.2f}"),
        make_column("B", [datetime.datetime(2021, 4, 13)]),
        make_column("C", ["c"]),
        make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
    ) as (path, table):
        result = LoadedRenderResult(
            path=path, table=table, columns=columns, errors=[], json={}
        )
        cache_render_result(self.workflow, self.step, 1, result)
    # Delete from disk entirely, to prove we did not read.
    s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))
    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_step = Step.objects.get(id=self.step.id)
    cached_result = fresh_step.cached_render_result
    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))

def test_render_with_input_columns(self):
    def render(*args, input_columns):
        self.assertEqual(
            input_columns,
            {
                "A": ptypes.RenderColumn("A", "text", None),
                "B": ptypes.RenderColumn("B", "number", "{:,.3f}"),
                "C": ptypes.RenderColumn("C", "datetime", None),
            },
        )

    with arrow_table_context(
        {
            "A": ["x"],
            "B": [1],
            "C": pa.array([datetime.now()], pa.timestamp("ns")),
        },
        columns=[
            Column("A", ColumnType.Text()),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Datetime()),
        ],
        dir=self.basedir,
    ) as arrow_table:
        self._test_render(render, arrow_table=arrow_table)

def test_render_using_tab_output(self):
    def render(table, params):
        self.assertEqual(params["tabparam"].slug, "tab-1")
        self.assertEqual(params["tabparam"].name, "Tab 1")
        self.assertEqual(
            params["tabparam"].columns,
            {
                "X": ptypes.RenderColumn("X", "number", "{:,d}"),
                "Y": ptypes.RenderColumn("Y", "text", None),
            },
        )
        assert_frame_equal(
            params["tabparam"].dataframe, pd.DataFrame({"X": [1], "Y": ["y"]})
        )

    with arrow_table_context(
        {"X": [1], "Y": ["y"]},
        columns=[
            Column("X", ColumnType.Number("{:,d}")),
            Column("Y", ColumnType.Text()),
        ],
        dir=self.basedir,
    ) as atable:
        self._test_render(
            render, params={"tabparam": TabOutput(Tab("tab-1", "Tab 1"), atable)}
        )

def test_clean_multicolumn_sort_in_table_order(self):
    input_shape = TableMetadata(
        3, [Column("B", ColumnType.Number()), Column("A", ColumnType.Number())]
    )
    result = clean_value(ParamDType.Multicolumn(), ["A", "B"], input_shape)
    self.assertEqual(result, ["B", "A"])

def test_metadata_comes_from_db_columns(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Datetime()),
        Column("C", ColumnType.Text()),
    ]
    result = RenderResult(
        arrow_table(
            {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]}, columns=columns
        )
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    # Delete from disk entirely, to prove we did not read.
    minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))
    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
    cached_result = fresh_wf_module.cached_render_result
    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))

def test_clean_multicolumn_valid(self):
    input_shape = TableMetadata(
        3, [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())]
    )
    result = clean_value(ParamSchema.Multicolumn(), ["A", "B"], input_shape)
    self.assertEqual(result, ["A", "B"])

def test_clean_multicolumn_missing_is_removed(self):
    input_shape = TableMetadata(
        3, [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())]
    )
    result = clean_value(ParamDType.Multicolumn(), ["A", "X", "B"], input_shape)
    self.assertEqual(result, ["A", "B"])

def test_render_use_input_columns_as_try_fallback_columns(self):
    def render(*args, input_columns):
        return pd.DataFrame({"A": [1]})

    with arrow_table_context(
        {"A": [1]},
        [Column("A", ColumnType.Number("{:,.3f}"))],
        dir=self.basedir,
    ) as arrow_table:
        result = self._test_render(render, arrow_table=arrow_table)
        self.assertEqual(
            result.table.metadata.columns,
            [Column("A", ColumnType.Number("{:,.3f}"))],
        )

def test_fetch_truncate(self):
    def fetch(params):
        return pd.DataFrame({"A": [1, 2, 3]})

    with tempfile_context(dir=self.basedir) as outfile:
        result = self._test_fetch(fetch, output_filename=outfile.name)
        self.assertEqual(
            result,
            FetchResult(
                outfile,
                errors=[
                    FetchError(
                        I18nMessage(
                            "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
                            {"old_number": 3, "new_number": 2},
                            None,
                        )
                    )
                ],
            ),
        )
        assert_arrow_table_equals(
            read_parquet_as_arrow(
                outfile, [Column("A", ColumnType.Number("{:,}"))]
            ),
            make_table(make_column("A", [1, 2])),
        )

def test_clean_column_happy_path(self):
    input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
    self.assertEqual(
        clean_value(
            ParamDType.Column(column_types=frozenset({"number"})), "A", input_shape
        ),
        "A",
    )

def test_fetch_integration(self, send_update, queue_render):
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)
    workflow = Workflow.create_and_init()
    create_module_zipfile(
        "mod",
        python_code=(
            "import pandas as pd\n"
            "def fetch(params): return pd.DataFrame({'A': [1]})\n"
            "def render(table, params): return table"
        ),
    )
    step = workflow.tabs.first().steps.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    cjwstate.modules.init_module_system()
    now = datetime.datetime.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(workflow_id=workflow.id, step_id=step.id, now=now)
        )
    step.refresh_from_db()
    so = step.stored_objects.get(stored_at=step.stored_data_version)
    with s3.temporarily_download(s3.StoredObjectsBucket, so.key) as parquet_path:
        # fetch results are stored without a schema. Let's hard-code a
        # schema simply so we can test that the table data is the same.
        table = read_parquet_as_arrow(
            parquet_path, [Column("A", ColumnType.Number())]
        )
        assert_arrow_table_equals(table, make_table(make_column("A", [1])))
    workflow.refresh_from_db()
    queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
    send_update.assert_called()

def test_input_crr_corrupt_cache_error_is_none(
    self, downloaded_parquet_file, load_module
):
    load_module.return_value.migrate_params.return_value = {}
    load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
    downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
        "file not found"
    )
    input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
    fetch.fetch_or_wrap_error(
        self.ctx,
        self.chroot_context,
        self.basedir,
        WfModule(),
        MockModuleVersion(),
        {},
        None,
        input_crr,
        self.output_path,
    )
    # fetch is still called, with `None` as argument.
    self.assertIsNone(
        load_module.return_value.fetch.call_args[1]["input_parquet_filename"]
    )

def test_input_crr(self, downloaded_parquet_file, clean_value, load_module):
    load_module.return_value.migrate_params.return_value = {}
    load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
    clean_value.return_value = {}
    downloaded_parquet_file.return_value = Path("/path/to/x.parquet")
    input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
    fetch.fetch_or_wrap_error(
        self.ctx,
        self.chroot_context,
        self.basedir,
        WfModule(),
        MockModuleVersion(),
        {},
        None,
        input_crr,
        self.output_path,
    )
    # Passed file is downloaded from rendercache
    downloaded_parquet_file.assert_called_with(input_crr, dir=self.basedir)
    self.assertEqual(
        load_module.return_value.fetch.call_args[1]["input_parquet_filename"],
        "x.parquet",
    )
    # clean_value() is called with input metadata from CachedRenderResult
    clean_value.assert_called()
    self.assertEqual(clean_value.call_args[0][2], input_metadata)

def test_date_unit_day_ok(self):
    table = pa.table(
        [pa.array([date(2021, 4, 4)])],
        pa.schema([pa.field("A", pa.date32(), metadata={b"unit": b"day"})]),
    )
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Date(unit="day"))]
    )

def infer_table_metadata(table: pyarrow.Table) -> TableMetadata:
    return TableMetadata(
        n_rows=table.num_rows,
        columns=[
            Column(name, _infer_output_column_type(column))
            for name, column in zip(table.column_names, table.columns)
        ],
    )

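# A minimal usage sketch, added for illustration (hypothetical, not part of the
# original module). It assumes _infer_output_column_type maps pyarrow's int64
# to ColumnType.Number and utf8 to ColumnType.Text, mirroring the mapping in
# _arrow_column_to_column() below.
def _example_infer_table_metadata() -> None:
    table = pyarrow.table({"A": [1, 2], "B": ["x", "y"]})
    metadata = infer_table_metadata(table)
    assert metadata.n_rows == 2  # n_rows comes straight from table.num_rows
    assert [c.name for c in metadata.columns] == ["A", "B"]
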
def test_date_unit_month_ok(self):
    table = pa.table(
        [pa.array([date(1200, 12, 1), date(3199, 2, 1), None])],
        pa.schema([pa.field("A", pa.date32(), metadata={b"unit": b"month"})]),
    )
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Date(unit="month"))]
    )

def test_cache_render_result(self):
    with arrow_table_context(make_column("A", [1])) as (table_path, table):
        result = LoadedRenderResult(
            path=table_path,
            table=table,
            columns=[Column("A", ColumnType.Number(format="{:,}"))],
            errors=[
                RenderError(
                    I18nMessage("e1", {"text": "hi"}, None),
                    [
                        QuickFix(
                            I18nMessage("q1", {"var": 2}, None),
                            QuickFixAction.PrependStep("filter", {"a": "x"}),
                        )
                    ],
                ),
                RenderError(I18nMessage("e2", {}, None), []),
            ],
            json={"foo": "bar"},
        )
        cache_render_result(self.workflow, self.step, 1, result)

    cached = self.step.cached_render_result
    self.assertEqual(cached.step_id, self.step.id)
    self.assertEqual(cached.delta_id, 1)
    self.assertEqual(
        crr_parquet_key(cached),
        f"wf-{self.workflow.id}/wfm-{self.step.id}/delta-1.dat",
    )
    # Reading completely freshly from the DB should give the same thing
    db_step = Step.objects.get(id=self.step.id)
    from_db = db_step.cached_render_result
    self.assertEqual(from_db, cached)
    with open_cached_render_result(from_db) as result2:
        assert_arrow_table_equals(
            result2.table, make_table(make_column("A", [1], format="{:,}"))
        )
        self.assertEqual(
            result2.columns, [Column("A", ColumnType.Number(format="{:,}"))]
        )

def test_clean_normal_dict(self):
    input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
    schema = ParamDType.Dict(
        {"str": ParamDType.String(), "int": ParamDType.Integer()}
    )
    value = {"str": "foo", "int": 3}
    expected = dict(value)  # no-op
    result = clean_value(schema, value, input_shape)
    self.assertEqual(result, expected)

def test_execute_partial_cache_hit(self):
    module_zipfile = create_module_zipfile("mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh. Should not render.
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow, step1, workflow.last_delta_id, make_table(make_column("A", ["a"]))
    )
    # step2: cached result is stale, so must be re-rendered
    step2 = tab.steps.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow,
        step2,
        workflow.last_delta_id - 1,
        make_table(make_column("B", ["b"])),
    )
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    new_table = make_table(make_column("C", ["c"]))
    with patch.object(Kernel, "render", side_effect=mock_render(new_table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("C", ColumnType.Text())])
            )
            assert_arrow_table_equals(load_trusted_arrow_file(path), new_table)
        Kernel.render.assert_called_once()  # step2, not step1
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )

def test_resume_backtrack_on_corrupt_cache_error(self):
    module_zipfile = create_module_zipfile("mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1: cached result is fresh -- but CORRUPT
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow, step1, workflow.last_delta_id, make_table(make_column("A", [1]))
    )
    step1.refresh_from_db()
    s3.put_bytes(
        # Write corrupted data -- will lead to CorruptCacheError
        rendercache.io.BUCKET,
        rendercache.io.crr_parquet_key(step1.cached_render_result),
        b"CORRUPT",
    )
    # step2: no cached result -- must re-render
    step2 = tab.steps.create(order=1, slug="step-2", module_id_name="mod")
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    new_table = make_table(make_column("B", ["b"]))
    with patch.object(Kernel, "render", side_effect=mock_render(new_table)):
        with self._execute(
            workflow, tab_flow, {}, expect_log_level=logging.ERROR
        ) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("B", ColumnType.Text())])
            )
        self.assertEqual(
            # called with step1, then step2
            Kernel.render.call_count,
            2,
        )
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )

def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    if pyarrow.types.is_floating(column.type) or pyarrow.types.is_integer(
        column.type
    ):
        column_type = ColumnType.Number("{:,}")
    elif pyarrow.types.is_timestamp(column.type):
        column_type = ColumnType.Timestamp()
    elif pyarrow.types.is_string(column.type) or pyarrow.types.is_dictionary(
        column.type
    ):
        column_type = ColumnType.Text()
    else:
        raise RuntimeError("Unknown column type %r" % column.type)
    return Column(name, column_type)

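# Illustrative sketch (hypothetical, not from the original file): the mapping
# _arrow_column_to_column() applies to each Arrow type it supports, plus the
# RuntimeError raised for anything else (e.g. booleans).
def _example_arrow_column_to_column() -> None:
    assert _arrow_column_to_column("A", pyarrow.chunked_array([[1, 2]])) == Column(
        "A", ColumnType.Number("{:,}")  # ints and floats get the default format
    )
    assert _arrow_column_to_column("B", pyarrow.chunked_array([["x"]])) == Column(
        "B", ColumnType.Text()
    )
    try:
        _arrow_column_to_column("C", pyarrow.chunked_array([[True]]))
    except RuntimeError:
        pass  # unsupported types are rejected, not coerced
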
def test_fetch_return_dataframe(self):
    async def fetch(params):
        return pd.DataFrame({"A": ["x", "y"]})

    with tempfile_context(dir=self.basedir) as outfile:
        result = self._test_fetch(fetch, output_filename=outfile.name)
        self.assertEqual(result.errors, [])
        arrow_table = read_parquet_as_arrow(
            outfile, [Column("A", ColumnType.Text())]
        )
        assert_arrow_table_equals(
            arrow_table, make_table(make_column("A", ["x", "y"]))
        )

def test_clean_column_prompting_error_convert_to_number(self):
    input_shape = TableMetadata(3, [Column("A", ColumnType.Text())])
    with self.assertRaises(PromptingError) as cm:
        clean_value(
            ParamDType.Column(column_types=frozenset({"number"})), "A", input_shape
        )
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A"], "text", frozenset({"number"}))],
    )

def test_clean_multicolumn_prompting_error_convert_to_text(self):
    # TODO make this _automatic_ instead of quick-fix?
    # ... but for now: prompt for a Quick Fix.
    input_shape = TableMetadata(
        3,
        [
            Column("A", ColumnType.Number()),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
        ],
    )
    with self.assertRaises(PromptingError) as cm:
        schema = ParamDType.Multicolumn(column_types=frozenset({"text"}))
        clean_value(schema, "A,B", input_shape)
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A", "B"], None, frozenset({"text"}))],
    )

def test_date_unit_year_ok(self):
    table = pa.table(
        [pa.array([date(1900, 1, 1), date(1, 1, 1), date(9999, 1, 1), None])],
        pa.schema([pa.field("A", pa.date32(), metadata={b"unit": b"year"})]),
    )
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Date(unit="year"))]
    )

def test_number_metadata_utf8_format(self):
    table = pa.table(
        [pa.array([123])],
        pa.schema(
            [
                pa.field(
                    "A",
                    pa.int64(),
                    metadata={b"format": "€{:,.2f}".encode("utf-8")},
                )
            ]
        ),
    )
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Number(format="€{:,.2f}"))]
    )

def test_dict_prompting_error(self):
    input_shape = TableMetadata(
        3, [Column("A", ColumnType.Text()), Column("B", ColumnType.Text())]
    )
    schema = ParamDType.Dict(
        {
            "col1": ParamDType.Column(column_types=frozenset({"number"})),
            "col2": ParamDType.Column(column_types=frozenset({"timestamp"})),
        }
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, {"col1": "A", "col2": "B"}, input_shape)
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(["B"], "text", frozenset({"timestamp"})),
        ],
    )

def _dict_to_column(value: Dict[str, Any]) -> Column:
    # Note: the return annotation was ColumnType, but the function builds a
    # full Column (name + type), so it is annotated accordingly.
    kwargs = dict(value)
    name = kwargs.pop("name")
    type_name = kwargs.pop("type")
    try:
        type_cls = {
            "text": ColumnType.Text,
            "number": ColumnType.Number,
            "datetime": ColumnType.Datetime,
        }[type_name]
    except KeyError:
        raise ValueError("Invalid type: %r" % type_name)
    return Column(name, type_cls(**kwargs))

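# Hypothetical round-trip sketch (not in the original file): any keys beyond
# "name" and "type" are forwarded to the ColumnType constructor as keyword
# arguments, and unknown "type" names raise ValueError.
def _example_dict_to_column() -> None:
    assert _dict_to_column(
        {"name": "A", "type": "number", "format": "{:,.2f}"}
    ) == Column("A", ColumnType.Number(format="{:,.2f}"))
    try:
        _dict_to_column({"name": "B", "type": "blob"})
    except ValueError:
        pass  # only "text", "number" and "datetime" are recognized
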
def test_clean_column_prompting_error_convert_to_text(self):
    # TODO make this _automatic_ instead of quick-fix?
    # Consider Regex. We probably want to pass the module a text Series
    # _separately_ from the input DataFrame. That way Regex can output
    # a new Text column but preserve its input column's data type.
    #
    # ... but for now: prompt for a Quick Fix.
    input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
    with self.assertRaises(PromptingError) as cm:
        clean_value(
            ParamDType.Column(column_types=frozenset({"text"})), "A", input_shape
        )
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A"], None, frozenset({"text"}))],
    )

def test_invalid_parquet_is_corrupt_cache_error(self):
    with arrow_table_context(make_column("A", ["x"])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Text())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)
    crr = self.step.cached_render_result
    s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
    with tempfile_context() as arrow_path:
        with self.assertRaises(CorruptCacheError):
            with open_cached_render_result(crr) as loaded:
                pass