def test_input_crr_corrupt_cache_error_is_none(
    self, downloaded_parquet_file, load_module
):
    """A corrupt render cache must not abort the fetch: it passes None instead."""
    load_module.return_value.migrate_params.return_value = {}
    load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
    downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
        "file not found"
    )
    metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    crr = CachedRenderResult(1, 2, 3, "ok", [], {}, metadata)
    fetch.fetch_or_wrap_error(
        self.ctx,
        self.chroot_context,
        self.basedir,
        WfModule(),
        MockModuleVersion(),
        {},
        None,
        crr,
        self.output_path,
    )
    # fetch() still ran -- with None in place of the unreadable input.
    kwargs = load_module.return_value.fetch.call_args[1]
    self.assertIsNone(kwargs["input_parquet_filename"])
def test_input_crr(self, downloaded_parquet_file, clean_value, load_module):
    """The cached input table is downloaded and its filename handed to fetch()."""
    load_module.return_value.migrate_params.return_value = {}
    load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
    clean_value.return_value = {}
    downloaded_parquet_file.return_value = Path("/path/to/x.parquet")
    metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    crr = CachedRenderResult(1, 2, 3, "ok", [], {}, metadata)
    fetch.fetch_or_wrap_error(
        self.ctx,
        self.chroot_context,
        self.basedir,
        WfModule(),
        MockModuleVersion(),
        {},
        None,
        crr,
        self.output_path,
    )
    # The input table came out of the render cache...
    downloaded_parquet_file.assert_called_with(crr, dir=self.basedir)
    # ...and fetch() was told its filename.
    kwargs = load_module.return_value.fetch.call_args[1]
    self.assertEqual(kwargs["input_parquet_filename"], "x.parquet")
    # clean_value() saw the cached result's metadata.
    clean_value.assert_called()
    self.assertEqual(clean_value.call_args[0][2], metadata)
def test_metadata_comes_from_db_columns(self):
    """Cached-result metadata is rebuilt from DB columns, never read from disk."""
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Datetime()),
        Column("C", ColumnType.Text()),
    ]
    result = RenderResult(
        arrow_table(
            {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]},
            columns=columns,
        )
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    # Wipe the file from storage entirely, to prove we do not read it.
    minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))
    # Reload the WfModule so its CachedRenderResult comes from DB columns alone.
    refreshed = WfModule.objects.get(id=self.wf_module.id)
    self.assertEqual(
        refreshed.cached_render_result.table_metadata, TableMetadata(1, columns)
    )
def _infer_output_column_type(column: pyarrow.ChunkedArray) -> ColumnType:
    """Infer Text for utf8 (or dictionary-of-utf8) columns; Number otherwise."""
    arrow_type = column.type
    is_text = arrow_type == pyarrow.utf8() or (
        hasattr(arrow_type, "value_type")
        and arrow_type.value_type == pyarrow.utf8()
    )
    return ColumnType.Text() if is_text else ColumnType.Number()
def test_render_using_tab_output(self):
    """render() receives a TabOutput param exposing slug, name, columns, data."""

    def render(table, params):
        tab_output = params["tabparam"]
        self.assertEqual(tab_output.slug, "tab-1")
        self.assertEqual(tab_output.name, "Tab 1")
        self.assertEqual(
            tab_output.columns,
            {
                "X": ptypes.RenderColumn("X", "number", "{:,d}"),
                "Y": ptypes.RenderColumn("Y", "text", None),
            },
        )
        assert_frame_equal(
            tab_output.dataframe, pd.DataFrame({"X": [1], "Y": ["y"]})
        )

    with arrow_table_context(
        {"X": [1], "Y": ["y"]},
        columns=[
            Column("X", ColumnType.Number("{:,d}")),
            Column("Y", ColumnType.Text()),
        ],
        dir=self.basedir,
    ) as atable:
        self._test_render(
            render, params={"tabparam": TabOutput(Tab("tab-1", "Tab 1"), atable)}
        )
def test_render_with_input_columns(self):
    """render() receives input_columns keyed by name, with type and format."""

    def render(*args, input_columns):
        self.assertEqual(
            input_columns,
            {
                "A": ptypes.RenderColumn("A", "text", None),
                "B": ptypes.RenderColumn("B", "number", "{:,.3f}"),
                "C": ptypes.RenderColumn("C", "datetime", None),
            },
        )

    with arrow_table_context(
        {
            "A": ["x"],
            "B": [1],
            "C": pa.array([datetime.now()], pa.timestamp("ns")),
        },
        columns=[
            Column("A", ColumnType.Text()),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Datetime()),
        ],
        dir=self.basedir,
    ) as input_table:
        self._test_render(render, arrow_table=input_table)
def test_metadata_does_not_require_file_read(self):
    """cached_render_result metadata must come from the DB, not the file."""
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Timestamp()),
        Column("C", ColumnType.Text()),
        Column("D", ColumnType.Date("month")),
    ]
    with arrow_table_context(
        make_column("A", [1], format="{:,.2f}"),
        make_column("B", [datetime.datetime(2021, 4, 13)]),
        make_column("C", ["c"]),
        make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
    ) as (path, table):
        cache_render_result(
            self.workflow,
            self.step,
            1,
            LoadedRenderResult(
                path=path, table=table, columns=columns, errors=[], json={}
            ),
        )
    # Wipe the Parquet file from storage, to prove we do not read it.
    s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))
    # Reload the Step so its CachedRenderResult comes from DB columns alone.
    refreshed = Step.objects.get(id=self.step.id)
    self.assertEqual(
        refreshed.cached_render_result.table_metadata, TableMetadata(1, columns)
    )
def render(arrow_table, params, output_path, *, columns, **kwargs):
    # Verify the "columns" kwarg matches the declared input metadata.
    #
    # TODO nix this! The only module that uses it is `converttotext`.
    self.assertEqual(
        columns,
        [
            Column("A", ColumnType.Number("{:,.3f}")),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Number("{:,.3f}")),
            Column("D", ColumnType.Timestamp()),
            Column("E", ColumnType.Timestamp()),
            Column("F", ColumnType.Timestamp()),
            Column("G", ColumnType.Text()),
            Column("H", ColumnType.Text()),
            Column("I", ColumnType.Text()),
            Column("J", ColumnType.Date(unit="day")),
            Column("K", ColumnType.Date(unit="week")),
            Column("L", ColumnType.Text()),
        ],
    )
    out = pa.table(
        {
            "A": [1],
            "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "C": ["a"],
            "D": [1],
            "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "F": ["a"],
            "G": [1],
            "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "I": ["a"],
            "J": pa.array([date(2021, 4, 1)]),
            "K": pa.array([date(2021, 4, 12)]),
            "L": pa.array([date(2021, 4, 1)]),
        }
    )
    # Re-tag column "J" with unit=month via Arrow field metadata.
    j_index = out.schema.get_field_index("J")
    patched_schema = out.schema.set(
        j_index, pa.field("J", pa.date32(), metadata={"unit": "month"})
    )
    with pa.ipc.RecordBatchFileWriter(output_path, patched_schema) as writer:
        writer.write_table(pa.table(out.columns, schema=patched_schema))
    return []
def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    """Build a Column for an Arrow column; raise RuntimeError on unknown types."""
    arrow_type = column.type
    if pyarrow.types.is_floating(arrow_type) or pyarrow.types.is_integer(arrow_type):
        ctype = ColumnType.Number("{:,}")
    elif pyarrow.types.is_timestamp(arrow_type):
        ctype = ColumnType.Timestamp()
    elif pyarrow.types.is_string(arrow_type) or pyarrow.types.is_dictionary(
        arrow_type
    ):
        ctype = ColumnType.Text()
    else:
        raise RuntimeError("Unknown column type %r" % arrow_type)
    return Column(name, ctype)
def test_fetch_return_dataframe(self):
    """A DataFrame returned by fetch() is stored and reads back as Arrow."""

    async def fetch(params):
        return pd.DataFrame({"A": ["x", "y"]})

    with tempfile_context(dir=self.basedir) as outfile:
        result = self._test_fetch(fetch, output_filename=outfile.name)
        self.assertEqual(result.errors, [])
        stored = read_parquet_as_arrow(outfile, [Column("A", ColumnType.Text())])
        assert_arrow_table_equals(
            stored, make_table(make_column("A", ["x", "y"]))
        )
def test_resume_backtrack_on_corrupt_cache_error(self):
    """A corrupt cache entry forces that step to re-render instead of crashing."""
    module_zipfile = create_module_zipfile("mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1's cached result is fresh -- but we corrupt its stored bytes below.
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow, step1, workflow.last_delta_id, make_table(make_column("A", [1]))
    )
    step1.refresh_from_db()
    # Overwrite the stored Parquet with garbage => CorruptCacheError on read.
    s3.put_bytes(
        rendercache.io.BUCKET,
        rendercache.io.crr_parquet_key(step1.cached_render_result),
        b"CORRUPT",
    )
    # step2 has no cached result at all: it must render regardless.
    step2 = tab.steps.create(order=1, slug="step-2", module_id_name="mod")
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    new_table = make_table(make_column("B", ["b"]))
    with patch.object(Kernel, "render", side_effect=mock_render(new_table)):
        with self._execute(
            workflow, tab_flow, {}, expect_log_level=logging.ERROR
        ) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("B", ColumnType.Text())])
            )
            # Both steps rendered: step1 (backtracked), then step2.
            self.assertEqual(Kernel.render.call_count, 2)
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
def test_execute_partial_cache_hit(self):
    """Fresh cached steps are skipped; stale ones are re-rendered."""
    module_zipfile = create_module_zipfile("mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    # step1's cache matches the current delta: no render needed.
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow, step1, workflow.last_delta_id, make_table(make_column("A", ["a"]))
    )
    # step2's cache is one delta behind: it must re-render.
    step2 = tab.steps.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    write_to_rendercache(
        workflow,
        step2,
        workflow.last_delta_id - 1,
        make_table(make_column("B", ["b"])),
    )
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    new_table = make_table(make_column("C", ["c"]))
    with patch.object(Kernel, "render", side_effect=mock_render(new_table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("C", ColumnType.Text())])
            )
            assert_arrow_table_equals(load_trusted_arrow_file(path), new_table)
            Kernel.render.assert_called_once()  # step2, not step1
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
def test_clean_column_prompting_error_convert_to_number(self):
    """A text column where a number is required prompts for a conversion."""
    input_shape = TableMetadata(3, [Column("A", ColumnType.Text())])
    with self.assertRaises(PromptingError) as cm:
        dtype = ParamDType.Column(column_types=frozenset({"number"}))
        clean_value(dtype, "A", input_shape)
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A"], "text", frozenset({"number"}))],
    )
def test_dict_prompting_error(self):
    """Errors from every entry of a Dict param are collected together."""
    input_shape = TableMetadata(
        3, [Column("A", ColumnType.Text()), Column("B", ColumnType.Text())]
    )
    schema = ParamDType.Dict(
        {
            "col1": ParamDType.Column(column_types=frozenset({"number"})),
            "col2": ParamDType.Column(column_types=frozenset({"timestamp"})),
        }
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, {"col1": "A", "col2": "B"}, input_shape)
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(
                ["B"], "text", frozenset({"timestamp"})
            ),
        ],
    )
def test_invalid_parquet_is_corrupt_cache_error(self):
    """Unparseable cached Parquet surfaces as CorruptCacheError on open."""
    with arrow_table_context(make_column("A", ["x"])) as (path, table):
        cache_render_result(
            self.workflow,
            self.step,
            1,
            LoadedRenderResult(
                path=path,
                table=table,
                columns=[Column("A", ColumnType.Text())],
                errors=[],
                json={},
            ),
        )
    crr = self.step.cached_render_result
    # Replace the stored file with bytes that are not Parquet at all.
    s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
    with tempfile_context() as arrow_path:
        with self.assertRaises(CorruptCacheError):
            with open_cached_render_result(crr) as loaded:
                pass
def test_input_crr(self, downloaded_parquet_file, clean_value):
    """The downloaded cached input is handed to fetch() and flows through."""

    def do_fetch(
        compiled_module,
        chroot_context,
        basedir,
        params,
        secrets,
        last_fetch_result,
        input_parquet_filename,
        output_filename,
    ):
        # Echo the input file so the test can see which file was passed.
        shutil.copy(basedir / input_parquet_filename, basedir / output_filename)
        return FetchResult(basedir / output_filename)

    self.kernel.fetch.side_effect = do_fetch
    clean_value.return_value = {}
    with tempfile_context(dir=self.basedir, suffix=".parquet") as parquet_path:
        parquet_path.write_bytes(b"abc123")
        downloaded_parquet_file.return_value = parquet_path
        metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        crr = CachedRenderResult(1, 2, 3, "ok", [], {}, metadata)
        with self.assertLogs("fetcher.fetch", level=logging.INFO):
            result = fetch.fetch_or_wrap_error(
                self.ctx,
                self.chroot_context,
                self.basedir,
                "mod",
                create_module_zipfile("mod"),
                {},
                {},
                None,
                crr,
                self.output_path,
            )
        # The cached table's bytes flowed through fetch().
        self.assertEqual(result.path.read_bytes(), b"abc123")
        # clean_value() saw the cached result's metadata.
        clean_value.assert_called()
        self.assertEqual(clean_value.call_args[0][2], metadata)
def test_execute_cache_miss(self):
    """With no cached results at all, every step renders."""
    module_zipfile = create_module_zipfile("mod", spec_kwargs={"loads_data": True})
    workflow = Workflow.create_and_init()
    tab = workflow.tabs.first()
    step1 = tab.steps.create(
        order=0,
        slug="step-1",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    step2 = tab.steps.create(
        order=1,
        slug="step-2",
        module_id_name="mod",
        last_relevant_delta_id=workflow.last_delta_id,
    )
    tab_flow = TabFlow(
        Tab(tab.slug, tab.name),
        [
            ExecuteStep(step1, module_zipfile, {}),
            ExecuteStep(step2, module_zipfile, {}),
        ],
    )
    table = make_table(make_column("A", ["a"]))
    with patch.object(Kernel, "render", side_effect=mock_render(table)):
        with self._execute(workflow, tab_flow, {}) as (result, path):
            self.assertEqual(
                result, StepResult(path, [Column("A", ColumnType.Text())])
            )
            assert_arrow_table_equals(load_trusted_arrow_file(path), table)
            self.assertEqual(Kernel.render.call_count, 2)  # both steps rendered
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
def test_input_crr_corrupt_cache_error_is_none(self, downloaded_parquet_file):
    """A corrupt render cache yields input_parquet_filename=None, not a crash."""
    self.kernel.fetch.return_value = FetchResult(self.output_path, [])
    downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
        "file not found"
    )
    metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
    crr = CachedRenderResult(1, 2, 3, "ok", [], {}, metadata)
    with self.assertLogs("fetcher.fetch", level=logging.INFO):
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            "mod",
            create_module_zipfile("mod"),
            {},
            {},
            None,
            crr,
            self.output_path,
        )
    # fetch still ran -- with None in place of the unreadable input.
    self.assertIsNone(self.kernel.fetch.call_args[1]["input_parquet_filename"])
def test_clean_multicolumn_prompting_error_convert_to_text(self):
    # TODO make this _automatic_ instead of quick-fix?
    # ... but for now: prompt for a Quick Fix.
    input_shape = TableMetadata(
        3,
        [
            Column("A", ColumnType.Number()),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
        ],
    )
    with self.assertRaises(PromptingError) as cm:
        dtype = ParamDType.Multicolumn(column_types=frozenset({"text"}))
        clean_value(dtype, "A,B", input_shape)
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A", "B"], None, frozenset({"text"}))],
    )
def Text(name: str) -> Column:
    """Shorthand: build a text Column named *name*."""
    text_type = ColumnType.Text()
    return Column(name, text_type)
def test_text_ok(self):
    """A utf8 column reads as ColumnType.Text."""
    table = pa.table({"A": ["x"]})
    self.assertEqual(read_columns(table), [Column("A", ColumnType.Text())])
def test_text_dictionary_ok(self):
    """A dictionary-encoded string column also reads as ColumnType.Text."""
    table = pa.table({"A": pa.array(["x"]).dictionary_encode()})
    self.assertEqual(read_columns(table), [Column("A", ColumnType.Text())])
def TEXT(name: str) -> Column:
    """Shorthand: build a text Column named *name*."""
    return Column(name, ColumnType.Text())
def migrate_params(params):
    # NOTE(review): returns a list containing a ColumnType, not a params dict --
    # presumably a fixture exercising bad migrate_params output; confirm at caller.
    migrated = [ColumnType.Text()]
    return migrated
def test_render_arrow_table_infer_output_column_formats_from_input(self):
    """Output types are recalled from input columns when name+type match,
    and inferred from the Arrow schema otherwise."""
    input_columns = [
        Column("A", ColumnType.Number("{:,.3f}")),
        Column("B", ColumnType.Number("{:,.3f}")),
        Column("C", ColumnType.Number("{:,.3f}")),
        Column("D", ColumnType.Datetime()),
        Column("E", ColumnType.Datetime()),
        Column("F", ColumnType.Datetime()),
        Column("G", ColumnType.Text()),
        Column("H", ColumnType.Text()),
        Column("I", ColumnType.Text()),
    ]

    # The param name "arrow_table" is a special case
    def render(arrow_table, params, output_path, *, columns, **kwargs):
        # Test the "columns" kwarg
        self.assertEqual(columns, input_columns)
        out = pa.table(
            {
                "A": [1],
                "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "C": ["a"],
                "D": [1],
                "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "F": ["a"],
                "G": [1],
                "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "I": ["a"],
            }
        )
        with pa.ipc.RecordBatchFileWriter(output_path, out.schema) as writer:
            writer.write_table(out)
        return []

    with arrow_table_context(
        {
            "A": [1],
            "B": [1],
            "C": [1],
            "D": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "F": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
            "G": ["a"],
            "H": ["a"],
            "I": ["a"],
        },
        columns=input_columns,
        dir=self.basedir,
    ) as input_table:
        result = self._test_render(render, arrow_table=input_table)
        self.assertEqual(
            result.table.metadata.columns,
            [
                Column("A", ColumnType.Number("{:,.3f}")),  # recalled
                Column("B", ColumnType.Datetime()),  # inferred
                Column("C", ColumnType.Text()),  # inferred
                Column("D", ColumnType.Number("{:,}")),  # inferred
                Column("E", ColumnType.Datetime()),  # recalled
                Column("F", ColumnType.Text()),  # inferred
                Column("G", ColumnType.Number("{:,}")),  # inferred
                Column("H", ColumnType.Datetime()),  # inferred
                Column("I", ColumnType.Text()),  # recalled
            ],
        )