def test_clean_column_prompting_error_convert_to_number(self):
    """A text column selected where a number is wanted raises PromptingError."""
    shape = TableMetadata(3, [Column("A", ColumnType.Text())])
    schema = ParamSchema.Column(column_types=frozenset({"number"}))
    with self.assertRaises(PromptingError) as ctx:
        clean_value(schema, "A", shape)
    expected = [
        PromptingError.WrongColumnType(["A"], "text", frozenset({"number"}))
    ]
    self.assertEqual(ctx.exception.errors, expected)
def test_date_unit_year_ok(self):
    """read_columns() maps a date32 field tagged unit=year to Date(unit="year")."""
    # Include the extremes of the proleptic calendar plus a null.
    values = pa.array([date(1900, 1, 1), date(1, 1, 1), date(9999, 1, 1), None])
    field = pa.field("A", pa.date32(), metadata={b"unit": b"year"})
    table = pa.table([values], pa.schema([field]))
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Date(unit="year"))]
    )
def test_clean_multicolumn_prompting_error_convert_to_text(self):
    # TODO make this _automatic_ instead of quick-fix?
    # ... but for now: prompt for a Quick Fix.
    shape = TableMetadata(
        3,
        [
            Column("A", ColumnType.Number()),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
        ],
    )
    with self.assertRaises(PromptingError) as ctx:
        dtype = ParamDType.Multicolumn(column_types=frozenset({"text"}))
        clean_value(dtype, "A,B", shape)
    # Both wrong-typed columns are reported together in a single error.
    self.assertEqual(
        ctx.exception.errors,
        [PromptingError.WrongColumnType(["A", "B"], None, frozenset({"text"}))],
    )
def test_number_metadata_utf8_format(self):
    """A non-ASCII number-format string survives the field-metadata round trip."""
    fmt = "€{:,.2f}"
    field = pa.field("A", pa.int64(), metadata={b"format": fmt.encode("utf-8")})
    table = pa.table([pa.array([123])], pa.schema([field]))
    self.assertEqual(
        read_columns(table), [Column("A", ColumnType.Number(format=fmt))]
    )
def test_dict_prompting_error(self):
    """Errors from every entry of a Dict dtype are gathered into one exception."""
    shape = TableMetadata(
        3, [Column("A", ColumnType.Text()), Column("B", ColumnType.Text())]
    )
    dtype = ParamDType.Dict(
        {
            "col1": ParamDType.Column(column_types=frozenset({"number"})),
            "col2": ParamDType.Column(column_types=frozenset({"timestamp"})),
        }
    )
    with self.assertRaises(PromptingError) as ctx:
        clean_value(dtype, {"col1": "A", "col2": "B"}, shape)
    self.assertEqual(
        ctx.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(["B"], "text", frozenset({"timestamp"})),
        ],
    )
def test_disallow_non_text(self):
    """Requesting values of a non-text column returns 200 with no values."""
    render_result = RenderResult(
        arrow_table(
            {"A": [1, 2, 3, 2, 1]},
            columns=[Column("A", ColumnType.Number(format="{:.2f}"))],
        )
    )
    cache_render_result(
        self.workflow,
        self.step1,
        self.step1.last_relevant_delta_id,
        render_result,
    )
    response = self._request("A")
    self.assertEqual(response.status_code, 200)
    self.assertEqual(json.loads(response.content), {"values": {}})
def test_invalid_parquet_is_corrupt_cache_error(self):
    """Garbage bytes at the cached-Parquet key must raise CorruptCacheError.

    Cache a valid render result, then overwrite its Parquet object in s3
    with bytes that are not Parquet; opening the cached result must fail
    loudly instead of returning garbage.
    """
    with arrow_table_context(make_column("A", ["x"])) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=[Column("A", ColumnType.Text())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, result)
        crr = self.step.cached_render_result
        # Corrupt the cache: replace the stored Parquet with junk bytes.
        s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
        # Fix: the original bound `as arrow_path` and `as loaded` but never
        # used either name — open_cached_render_result() raises before
        # yielding — so the dead bindings are dropped.
        with tempfile_context():
            with self.assertRaises(CorruptCacheError):
                with open_cached_render_result(crr):
                    pass
def test_clean_column_prompting_error_convert_to_text(self):
    # TODO make this _automatic_ instead of quick-fix?
    # Consider Regex. We probably want to pass the module a text Series
    # _separately_ from the input DataFrame. That way Regex can output
    # a new Text column but preserve its input column's data type.
    #
    # ... but for now: prompt for a Quick Fix.
    shape = TableMetadata(3, [Column("A", ColumnType.Number())])
    dtype = ParamDType.Column(column_types=frozenset({"text"}))
    with self.assertRaises(PromptingError) as ctx:
        clean_value(dtype, "A", shape)
    expected = [PromptingError.WrongColumnType(["A"], None, frozenset({"text"}))]
    self.assertEqual(ctx.exception.errors, expected)
def test_read_cached_render_result_slice_as_text_datetime(self):
    """A cached ns-precision datetime renders as ISO-8601 text in a CSV slice."""
    result = RenderResult(
        arrow_table(
            {"A": pa.array([2134213412341232967, None], pa.timestamp("ns"))},
            columns=[Column("A", ColumnType.Datetime())],
        )
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    crr = self.wf_module.cached_render_result
    text = read_cached_render_result_slice_as_text(crr, "csv", range(2), range(3))
    self.assertEqual(text, "A\n2037-08-18T13:03:32.341232967Z\n")
def test_value_counts_disallow_non_text(self):
    """value-counts on a non-text column returns 200 with an empty values map."""
    render_result = RenderResult(
        arrow_table(
            {"A": [1, 2, 3, 2, 1]},
            columns=[Column("A", ColumnType.Number(format="{:.2f}"))],
        )
    )
    cache_render_result(
        self.workflow,
        self.wf_module2,
        self.wf_module2.last_relevant_delta_id,
        render_result,
    )
    response = self.client.get(
        f"/api/wfmodules/{self.wf_module2.id}/value-counts?column=A"
    )
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(json.loads(response.content), {"values": {}})
def test_read_cached_render_result_slice_as_text_timestamp(self):
    """A cached ns-precision timestamp renders as ISO-8601 text in a CSV slice."""
    with arrow_table_context(
        make_column("A", [2134213412341232967, None], pa.timestamp("ns"))
    ) as (tmp_path, tmp_table):
        cached = LoadedRenderResult(
            path=tmp_path,
            table=tmp_table,
            columns=[Column("A", ColumnType.Timestamp())],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, cached)
        crr = self.step.cached_render_result
        text = read_cached_render_result_slice_as_text(
            crr, "csv", range(2), range(3)
        )
        self.assertEqual(text, "A\n2037-08-18T13:03:32.341232967Z\n")
def test_input_crr(self, downloaded_parquet_file, clean_value):
    # Purpose: when the step has an input CachedRenderResult, fetch must
    # download the cached Parquet file and pass its name to the module's
    # fetch() — and clean_value() must receive the input table metadata.
    def do_fetch(
        compiled_module,
        chroot_context,
        basedir,
        params,
        secrets,
        last_fetch_result,
        input_parquet_filename,
        output_filename,
    ):
        # Echo the input file back as the fetch output so the test can
        # verify exactly which bytes reached the module.
        shutil.copy(basedir / input_parquet_filename, basedir / output_filename)
        return FetchResult(basedir / output_filename)

    self.kernel.fetch.side_effect = do_fetch
    clean_value.return_value = {}
    with tempfile_context(dir=self.basedir, suffix=".parquet") as parquet_path:
        # Sentinel bytes standing in for a real Parquet file; the mocked
        # downloaded_parquet_file() returns this path.
        parquet_path.write_bytes(b"abc123")
        downloaded_parquet_file.return_value = parquet_path
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        with self.assertLogs("fetcher.fetch", level=logging.INFO):
            result = fetch.fetch_or_wrap_error(
                self.ctx,
                self.chroot_context,
                self.basedir,
                "mod",
                create_module_zipfile("mod"),
                {},
                {},
                None,
                input_crr,
                self.output_path,
            )
        # Passed file is downloaded from rendercache
        self.assertEqual(result.path.read_bytes(), b"abc123")
        # clean_value() is called with input metadata from CachedRenderResult
        clean_value.assert_called()
        self.assertEqual(clean_value.call_args[0][2], input_metadata)
def test_clear(self):
    """Clearing a step's render cache removes both the DB record and the s3 blob."""
    with arrow_table_context(make_column("A", [1])) as (tmp_path, tmp_table):
        cached = LoadedRenderResult(
            path=tmp_path,
            table=tmp_table,
            columns=[Column("A", ColumnType.Number(format="{:,}"))],
            errors=[],
            json={},
        )
        cache_render_result(self.workflow, self.step, 1, cached)
        parquet_key = crr_parquet_key(self.step.cached_render_result)
        clear_cached_render_result_for_step(self.step)
        # Re-fetch from the database to see the persisted state.
        fresh_step = Step.objects.get(id=self.step.id)
        self.assertIsNone(fresh_step.cached_render_result)
        self.assertFalse(s3.exists(BUCKET, parquet_key))
def test_execute_cache_hit(self):
    # Purpose: steps with fresh rendercache entries are served from the
    # cache — the kernel's render() must never run.
    cached_table1 = make_table(make_column("A", [1]))
    cached_table2 = make_table(make_column("B", [2], format="${:,}"))
    module_zipfile = create_module_zipfile(
        "mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step1 = tab.steps.create(order=0, slug="step-1",
                             last_relevant_delta_id=workflow.last_delta_id)
    write_to_rendercache(workflow, step1, workflow.last_delta_id, cached_table1)
    step2 = tab.steps.create(order=1, slug="step-2",
                             last_relevant_delta_id=workflow.last_delta_id)
    write_to_rendercache(workflow, step2, workflow.last_delta_id, cached_table2)
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    # If the kernel were (wrongly) invoked it would emit this table, which
    # must NOT appear anywhere in the output.
    unwanted_table = make_table(make_column("No", ["bad"]))
    with patch.object(Kernel, "render", side_effect=mock_render(unwanted_table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            # Final output is step2's cached table, format preserved.
            self.assertEqual(
                result,
                StepResult(
                    path, [Column("B", ColumnType.Number(format="${:,}"))]),
            )
            assert_arrow_table_equals(load_trusted_arrow_file(path), cached_table2)
        Kernel.render.assert_not_called()
def test_execute_cache_miss(self):
    # Purpose: with nothing in the rendercache, every step is rendered via
    # the kernel, and the last step's output goes to the tab-output file.
    module_zipfile = create_module_zipfile(
        "mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    step2 = tab.steps.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    table = make_table(make_column("A", ["a"]))
    with patch.object(Kernel, "render", side_effect=mock_render(table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("A", ColumnType.Text())]))
            assert_arrow_table_equals(load_trusted_arrow_file(path), table)
        # Both steps rendered; call_args below inspects the last call (step2).
        self.assertEqual(Kernel.render.call_count, 2)  # step2, not step1
        self.assertRegex(
            # Output is to the correct file
            Kernel.render.call_args[1]["output_filename"],
            r"execute-tab-output.*\.arrow",
        )
def test_input_crr_corrupt_cache_error_is_none(self, downloaded_parquet_file):
    # Purpose: a CorruptCacheError while downloading the input render cache
    # must not abort the fetch — the module is invoked with no input file.
    self.kernel.fetch.return_value = FetchResult(self.output_path, [])
    downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
        "file not found"
    )
    input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
    with self.assertLogs("fetcher.fetch", level=logging.INFO):
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            "mod",
            create_module_zipfile("mod"),
            {},
            {},
            None,
            input_crr,
            self.output_path,
        )
    # fetch is still called, with `None` as argument.
    self.assertIsNone(self.kernel.fetch.call_args[1]["input_parquet_filename"])
def test_input_crr_corrupt_cache_error_is_none(
    self, downloaded_parquet_file, load_module
):
    # Purpose: a CorruptCacheError while downloading the input render cache
    # must not abort the fetch — the module's fetch() is still invoked, with
    # input_parquet_filename=None.
    # NOTE(review): looks like a load_module-based variant of the kernel
    # test of the same name — confirm which one this file actually uses.
    load_module.return_value.migrate_params.return_value = {}
    load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
    downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
        "file not found"
    )
    input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
    fetch.fetch_or_wrap_error(
        self.ctx,
        self.basedir,
        WfModule(),
        MockModuleVersion(),
        {},
        None,
        input_crr,
        self.output_path,
    )
    # fetch is still called, with `None` as argument.
    self.assertIsNone(
        load_module.return_value.fetch.call_args[1]["input_parquet_filename"]
    )
def test_clean_column_missing(self):
    """A column name absent from the input cleans to the empty string."""
    shape = TableMetadata(3, [Column("A", ColumnType.Number())])
    cleaned = clean_value(ParamDType.Column(), "B", shape)
    self.assertEqual(cleaned, "")
def test_clean_column_tab_parameter_is_error(self):
    """clean_value() rejects a Column param that carries a tab_parameter."""
    shape = TableMetadata(3, [Column("A", ColumnType.Number())])
    dtype = ParamDType.Column(tab_parameter="tab-2")
    with self.assertRaisesRegex(
        RuntimeError, "Unsupported: fetch column with tab_parameter"
    ):
        clean_value(dtype, "A", shape)
def Date(name: str, unit: str) -> Column:
    """Shorthand: build a date Column named *name* with the given unit."""
    column_type = ColumnType.Date(unit)
    return Column(name, column_type)
def migrate_params(params):
    # NOTE(review): returns a list rather than a params dict — presumably a
    # stub exercising invalid-migrate_params handling; confirm against the
    # test that installs it.
    return [ColumnType.Text()]
def render(arrow_table, params, output_path, *, columns, **kwargs):
    # Module stub: first asserts the exact "columns" kwarg the framework
    # passes to render(), then writes an output table whose actual Arrow
    # types deliberately disagree with several declared column types.
    # Test the "columns" kwarg
    #
    # TODO nix this! The only module that uses it is `converttotext`.
    self.assertEqual(
        columns,
        [
            Column("A", ColumnType.Number("{:,.3f}")),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Number("{:,.3f}")),
            Column("D", ColumnType.Timestamp()),
            Column("E", ColumnType.Timestamp()),
            Column("F", ColumnType.Timestamp()),
            Column("G", ColumnType.Text()),
            Column("H", ColumnType.Text()),
            Column("I", ColumnType.Text()),
            Column("J", ColumnType.Date(unit="day")),
            Column("K", ColumnType.Date(unit="week")),
            Column("L", ColumnType.Text()),
        ],
    )
    # Each trio (A-C, D-F, G-I) mixes int, timestamp and text values so the
    # caller must reconcile output types against the declared columns.
    table = pa.table(
        {
            "A": [1],
            "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "C": ["a"],
            "D": [1],
            "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "F": ["a"],
            "G": [1],
            "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "I": ["a"],
            "J": pa.array([date(2021, 4, 1)]),
            "K": pa.array([date(2021, 4, 12)]),
            "L": pa.array([date(2021, 4, 1)]),
        }
    )
    # Override column J's field metadata: declare unit "month" even though
    # the input declared "day".
    schema = table.schema.set(
        table.schema.get_field_index("J"),
        pa.field("J", pa.date32(), metadata={"unit": "month"}),
    )
    with pa.ipc.RecordBatchFileWriter(output_path, schema) as writer:
        writer.write_table(pa.table(table.columns, schema=schema))
    return []
def Timestamp(name: str) -> Column:
    """Shorthand: build a timestamp Column named *name*."""
    column_type = ColumnType.Timestamp()
    return Column(name, column_type)
def test_text_dictionary_ok(self):
    """Dictionary-encoded string arrays still read as plain Text columns."""
    table = pa.table({"A": pa.array(["x"]).dictionary_encode()})
    self.assertEqual(read_columns(table), [Column("A", ColumnType.Text())])
def test_render_arrow_table_infer_output_column_formats_from_input(self): input_columns = [ Column("A", ColumnType.Number("{:,.3f}")), Column("B", ColumnType.Number("{:,.3f}")), Column("C", ColumnType.Number("{:,.3f}")), Column("D", ColumnType.Datetime()), Column("E", ColumnType.Datetime()), Column("F", ColumnType.Datetime()), Column("G", ColumnType.Text()), Column("H", ColumnType.Text()), Column("I", ColumnType.Text()), ] # The param name "arrow_table" is a special case def render(arrow_table, params, output_path, *, columns, **kwargs): # Test the "columns" kwarg self.assertEqual(columns, input_columns) table = pa.table( { "A": [1], "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")), "C": ["a"], "D": [1], "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")), "F": ["a"], "G": [1], "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")), "I": ["a"], } ) with pa.ipc.RecordBatchFileWriter(output_path, table.schema) as writer: writer.write_table(table) return [] with arrow_table_context( { "A": [1], "B": [1], "C": [1], "D": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")), "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")), "F": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")), "G": ["a"], "H": ["a"], "I": ["a"], }, columns=input_columns, dir=self.basedir, ) as arrow_table: result = self._test_render(render, arrow_table=arrow_table) self.assertEqual( result.table.metadata.columns, [ Column("A", ColumnType.Number("{:,.3f}")), # recalled Column("B", ColumnType.Datetime()), # inferred Column("C", ColumnType.Text()), # inferred Column("D", ColumnType.Number("{:,}")), # inferred Column("E", ColumnType.Datetime()), # recalled Column("F", ColumnType.Text()), # inferred Column("G", ColumnType.Number("{:,}")), # inferred Column("H", ColumnType.Datetime()), # inferred Column("I", ColumnType.Text()), # recalled ], )
def test_timestamp_ok(self):
    """A ns-precision timestamp array maps to ColumnType.Timestamp."""
    values = pa.array([12312312314512], pa.timestamp("ns"))
    table = pa.table({"A": values})
    self.assertEqual(read_columns(table), [Column("A", ColumnType.Timestamp())])
def Datetime(name: str) -> Column:
    """Shorthand: build a datetime Column named *name*."""
    column_type = ColumnType.Datetime()
    return Column(name, column_type)
def Text(name: str) -> Column:
    """Shorthand: build a text Column named *name*."""
    column_type = ColumnType.Text()
    return Column(name, column_type)
def Number(name: str, format: str = "{:,.2f}") -> Column:
    """Shorthand: build a number Column named *name* with the given format."""
    column_type = ColumnType.Number(format=format)
    return Column(name, column_type)
def test_text_ok(self):
    """A plain utf8 column maps to ColumnType.Text."""
    table = pa.table({"A": ["x"]})
    self.assertEqual(read_columns(table), [Column("A", ColumnType.Text())])