Пример #1
0
    def test_clean_column_prompting_error_convert_to_number(self):
        """Cleaning a text column as a number param raises PromptingError."""
        input_shape = TableMetadata(3, [Column("A", ColumnType.Text())])
        schema = ParamSchema.Column(column_types=frozenset({"number"}))
        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, "A", input_shape)

        expected = [
            PromptingError.WrongColumnType(
                ["A"], "text", frozenset({"number"}))
        ]
        self.assertEqual(cm.exception.errors, expected)
Пример #2
0
 def test_date_unit_year_ok(self):
     """A date32 field with unit=year metadata reads as Date(unit="year")."""
     values = pa.array(
         [date(1900, 1, 1), date(1, 1, 1), date(9999, 1, 1), None])
     field = pa.field("A", pa.date32(), metadata={b"unit": b"year"})
     table = pa.table([values], pa.schema([field]))
     self.assertEqual(
         read_columns(table), [Column("A", ColumnType.Date(unit="year"))])
Пример #3
0
    def test_clean_multicolumn_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # ... but for now: prompt for a Quick Fix.
        columns = [
            Column("A", ColumnType.Number()),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
        ]
        input_shape = TableMetadata(3, columns)
        schema = ParamDType.Multicolumn(column_types=frozenset({"text"}))
        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, "A,B", input_shape)

        expected = [
            PromptingError.WrongColumnType(
                ["A", "B"], None, frozenset({"text"}))
        ]
        self.assertEqual(cm.exception.errors, expected)
Пример #4
0
 def test_number_metadata_utf8_format(self):
     """A non-ASCII format string survives the UTF-8 metadata round-trip."""
     field = pa.field(
         "A", pa.int64(), metadata={b"format": "€{:,.2f}".encode("utf-8")})
     table = pa.table([pa.array([123])], pa.schema([field]))
     self.assertEqual(
         read_columns(table),
         [Column("A", ColumnType.Number(format="€{:,.2f}"))])
Пример #5
0
    def test_dict_prompting_error(self):
        """Errors from every entry of a Dict param are collected together."""
        input_shape = TableMetadata(
            3,
            [Column("A", ColumnType.Text()),
             Column("B", ColumnType.Text())])
        schema = ParamDType.Dict({
            "col1": ParamDType.Column(column_types=frozenset({"number"})),
            "col2": ParamDType.Column(column_types=frozenset({"timestamp"})),
        })
        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, {"col1": "A", "col2": "B"}, input_shape)

        expected = [
            PromptingError.WrongColumnType(
                ["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(
                ["B"], "text", frozenset({"timestamp"})),
        ]
        self.assertEqual(cm.exception.errors, expected)
Пример #6
0
    def test_disallow_non_text(self):
        """Requesting values of a number column responds 200 with no values."""
        render_result = RenderResult(
            arrow_table(
                {"A": [1, 2, 3, 2, 1]},
                columns=[Column("A", ColumnType.Number(format="{:.2f}"))],
            ))
        cache_render_result(
            self.workflow,
            self.step1,
            self.step1.last_relevant_delta_id,
            render_result,
        )

        response = self._request("A")

        self.assertEqual(response.status_code, 200)
        self.assertEqual(json.loads(response.content), {"values": {}})
Пример #7
0
 def test_invalid_parquet_is_corrupt_cache_error(self):
     """Garbage bytes in the cached-Parquet slot raise CorruptCacheError."""
     with arrow_table_context(make_column("A", ["x"])) as (path, table):
         cache_render_result(
             self.workflow,
             self.step,
             1,
             LoadedRenderResult(
                 path=path,
                 table=table,
                 columns=[Column("A", ColumnType.Text())],
                 errors=[],
                 json={},
             ),
         )
     crr = self.step.cached_render_result
     # Overwrite the cached file with bytes that are not a Parquet file.
     s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
     with tempfile_context() as arrow_path:
         with self.assertRaises(CorruptCacheError):
             with open_cached_render_result(crr):
                 pass
Пример #8
0
    def test_clean_column_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # Consider Regex. We probably want to pass the module a text Series
        # _separately_ from the input DataFrame. That way Regex can output
        # a new Text column but preserve its input column's data type.
        #
        # ... but for now: prompt for a Quick Fix.
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        schema = ParamDType.Column(column_types=frozenset({"text"}))
        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, "A", input_shape)

        expected = PromptingError.WrongColumnType(
            ["A"], None, frozenset({"text"}))
        self.assertEqual(cm.exception.errors, [expected])
Пример #9
0
 def test_read_cached_render_result_slice_as_text_datetime(self):
     """Nanosecond timestamps render as ISO-8601 text; nulls become blank."""
     values = pa.array([2134213412341232967, None], pa.timestamp("ns"))
     result = RenderResult(
         arrow_table(
             {"A": values},
             columns=[Column("A", ColumnType.Datetime())],
         ))
     cache_render_result(
         self.workflow, self.wf_module, self.delta.id, result)
     crr = self.wf_module.cached_render_result
     text = read_cached_render_result_slice_as_text(
         crr, "csv", range(2), range(3))
     self.assertEqual(text, "A\n2037-08-18T13:03:32.341232967Z\n")
Пример #10
0
    def test_value_counts_disallow_non_text(self):
        """value-counts on a number column responds 200 with no values."""
        render_result = RenderResult(
            arrow_table(
                {"A": [1, 2, 3, 2, 1]},
                columns=[Column("A", ColumnType.Number(format="{:.2f}"))],
            ))
        cache_render_result(
            self.workflow,
            self.wf_module2,
            self.wf_module2.last_relevant_delta_id,
            render_result,
        )

        url = f"/api/wfmodules/{self.wf_module2.id}/value-counts?column=A"
        response = self.client.get(url)

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(json.loads(response.content), {"values": {}})
Пример #11
0
 def test_read_cached_render_result_slice_as_text_timestamp(self):
     """Nanosecond timestamps render as ISO-8601 text; nulls become blank."""
     column = make_column(
         "A", [2134213412341232967, None], pa.timestamp("ns"))
     with arrow_table_context(column) as (path, table):
         cache_render_result(
             self.workflow,
             self.step,
             1,
             LoadedRenderResult(
                 path=path,
                 table=table,
                 columns=[Column("A", ColumnType.Timestamp())],
                 errors=[],
                 json={},
             ),
         )
     crr = self.step.cached_render_result
     text = read_cached_render_result_slice_as_text(
         crr, "csv", range(2), range(3))
     self.assertEqual(text, "A\n2037-08-18T13:03:32.341232967Z\n")
Пример #12
0
    def test_input_crr(self, downloaded_parquet_file, clean_value):
        """fetch_or_wrap_error() downloads the input CachedRenderResult and
        passes its TableMetadata to clean_value().

        `downloaded_parquet_file` and `clean_value` are mocks injected by
        patch decorators outside this block.
        """
        # Stand-in kernel fetch: copy the downloaded input parquet straight
        # to the output path so the pass-through can be asserted below.
        def do_fetch(
            compiled_module,
            chroot_context,
            basedir,
            params,
            secrets,
            last_fetch_result,
            input_parquet_filename,
            output_filename,
        ):
            shutil.copy(basedir / input_parquet_filename,
                        basedir / output_filename)
            return FetchResult(basedir / output_filename)

        self.kernel.fetch.side_effect = do_fetch
        clean_value.return_value = {}

        with tempfile_context(dir=self.basedir,
                              suffix=".parquet") as parquet_path:
            # Pretend the rendercache download produced this file.
            parquet_path.write_bytes(b"abc123")
            downloaded_parquet_file.return_value = parquet_path

            input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
            input_crr = CachedRenderResult(1, 2, 3, "ok", [], {},
                                           input_metadata)
            with self.assertLogs("fetcher.fetch", level=logging.INFO):
                result = fetch.fetch_or_wrap_error(
                    self.ctx,
                    self.chroot_context,
                    self.basedir,
                    "mod",
                    create_module_zipfile("mod"),
                    {},
                    {},
                    None,
                    input_crr,
                    self.output_path,
                )

            # Passed file is downloaded from rendercache
            self.assertEqual(result.path.read_bytes(), b"abc123")
            # clean_value() is called with input metadata from CachedRenderResult
            clean_value.assert_called()
            self.assertEqual(clean_value.call_args[0][2], input_metadata)
Пример #13
0
    def test_clear(self):
        """Clearing a step's render cache removes both DB and s3 state."""
        with arrow_table_context(make_column("A", [1])) as (path, table):
            cache_render_result(
                self.workflow,
                self.step,
                1,
                LoadedRenderResult(
                    path=path,
                    table=table,
                    columns=[Column("A", ColumnType.Number(format="{:,}"))],
                    errors=[],
                    json={},
                ),
            )

        parquet_key = crr_parquet_key(self.step.cached_render_result)
        clear_cached_render_result_for_step(self.step)

        # The database no longer records a cached result...
        refreshed = Step.objects.get(id=self.step.id)
        self.assertIsNone(refreshed.cached_render_result)
        # ...and the Parquet file is gone from s3.
        self.assertFalse(s3.exists(BUCKET, parquet_key))
Пример #14
0
    def test_execute_cache_hit(self):
        """When every step's rendercache entry is fresh, rendering is skipped.

        Both steps' caches are written at last_relevant_delta_id, so
        execution must return step2's cached table and never call
        Kernel.render (asserted at the bottom).
        """
        cached_table1 = make_table(make_column("A", [1]))
        cached_table2 = make_table(make_column("B", [2], format="${:,}"))
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step1 = tab.steps.create(order=0,
                                 slug="step-1",
                                 last_relevant_delta_id=workflow.last_delta_id)
        write_to_rendercache(workflow, step1, workflow.last_delta_id,
                             cached_table1)
        step2 = tab.steps.create(order=1,
                                 slug="step-2",
                                 last_relevant_delta_id=workflow.last_delta_id)
        write_to_rendercache(workflow, step2, workflow.last_delta_id,
                             cached_table2)

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        # If the renderer were (wrongly) invoked it would produce this
        # table, which the assertions below must never see.
        unwanted_table = make_table(make_column("No", ["bad"]))
        with patch.object(Kernel,
                          "render",
                          side_effect=mock_render(unwanted_table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result,
                    StepResult(
                        path,
                        [Column("B", ColumnType.Number(format="${:,}"))]),
                )
                assert_arrow_table_equals(load_trusted_arrow_file(path),
                                          cached_table2)

            Kernel.render.assert_not_called()
Пример #15
0
    def test_execute_cache_miss(self):
        """With an empty rendercache, both steps must be rendered."""
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        step2 = tab.steps.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        table = make_table(make_column("A", ["a"]))

        # The mocked Kernel.render returns `table` for every step.
        with patch.object(Kernel, "render", side_effect=mock_render(table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result, StepResult(path, [Column("A", ColumnType.Text())]))
                assert_arrow_table_equals(load_trusted_arrow_file(path), table)

            self.assertEqual(Kernel.render.call_count, 2)  # both steps rendered
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Пример #16
0
 def test_input_crr_corrupt_cache_error_is_none(self, downloaded_parquet_file):
     """A corrupt input cache is treated as "no input": fetch gets None."""
     self.kernel.fetch.return_value = FetchResult(self.output_path, [])
     # Simulate the rendercache download blowing up.
     downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
         "file not found"
     )
     input_crr = CachedRenderResult(
         1, 2, 3, "ok", [], {},
         TableMetadata(3, [Column("A", ColumnType.Text())]))
     with self.assertLogs("fetcher.fetch", level=logging.INFO):
         fetch.fetch_or_wrap_error(
             self.ctx,
             self.chroot_context,
             self.basedir,
             "mod",
             create_module_zipfile("mod"),
             {},
             {},
             None,
             input_crr,
             self.output_path,
         )
     # fetch is still called, with `None` as argument.
     kwargs = self.kernel.fetch.call_args[1]
     self.assertIsNone(kwargs["input_parquet_filename"])
Пример #17
0
 def test_input_crr_corrupt_cache_error_is_none(
     self, downloaded_parquet_file, load_module
 ):
     """A corrupt input render cache must not abort fetching: fetch() is
     still invoked, with input_parquet_filename=None.

     `downloaded_parquet_file` and `load_module` are mocks injected by
     patch decorators outside this block.
     """
     load_module.return_value.migrate_params.return_value = {}
     load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
     # Simulate the rendercache download blowing up.
     downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
         "file not found"
     )
     input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
     input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
     fetch.fetch_or_wrap_error(
         self.ctx,
         self.basedir,
         WfModule(),
         MockModuleVersion(),
         {},
         None,
         input_crr,
         self.output_path,
     )
     # fetch is still called, with `None` as argument.
     self.assertIsNone(
         load_module.return_value.fetch.call_args[1]["input_parquet_filename"]
     )
Пример #18
0
 def test_clean_column_missing(self):
     """A column name absent from the input table cleans to ""."""
     table_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
     result = clean_value(ParamDType.Column(), "B", table_shape)
     self.assertEqual(result, "")
Пример #19
0
 def test_clean_column_tab_parameter_is_error(self):
     """Column params with tab_parameter are unsupported during fetch."""
     table_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
     schema = ParamDType.Column(tab_parameter="tab-2")
     with self.assertRaisesRegex(
             RuntimeError, "Unsupported: fetch column with tab_parameter"):
         clean_value(schema, "A", table_shape)
Пример #20
0
def Date(name: str, unit: str) -> Column:
    """Build a Date-typed Column named *name* with the given *unit*."""
    column_type = ColumnType.Date(unit)
    return Column(name, column_type)
Пример #21
0
 def migrate_params(params):
     """Stub migrate_params: ignore *params* and return a canned value.

     NOTE(review): a list of ColumnType is not a real params dict —
     presumably this stub exercises handling of an invalid
     migrate_params return value; confirm against the caller.
     """
     return [ColumnType.Text()]
Пример #22
0
 def render(arrow_table, params, output_path, *, columns, **kwargs):
     """Stub module render(): assert the "columns" kwarg, then write a
     fixed table whose value types disagree with the declared inputs.

     NOTE(review): defined inside a test method; `self` is captured from
     the enclosing scope.
     """
     # Test the "columns" kwarg
     #
     # TODO nix this! The only module that uses it is `converttotext`.
     self.assertEqual(
         columns,
         [
             Column("A", ColumnType.Number("{:,.3f}")),
             Column("B", ColumnType.Number("{:,.3f}")),
             Column("C", ColumnType.Number("{:,.3f}")),
             Column("D", ColumnType.Timestamp()),
             Column("E", ColumnType.Timestamp()),
             Column("F", ColumnType.Timestamp()),
             Column("G", ColumnType.Text()),
             Column("H", ColumnType.Text()),
             Column("I", ColumnType.Text()),
             Column("J", ColumnType.Date(unit="day")),
             Column("K", ColumnType.Date(unit="week")),
             Column("L", ColumnType.Text()),
         ],
     )
     # Output types deliberately mismatch the declared input columns
     # (e.g. "A" number, "B" timestamp, "C" text, ...).
     table = pa.table(
         {
             "A": [1],
             "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "C": ["a"],
             "D": [1],
             "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "F": ["a"],
             "G": [1],
             "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
             "I": ["a"],
             "J": pa.array([date(2021, 4, 1)]),
             "K": pa.array([date(2021, 4, 12)]),
             "L": pa.array([date(2021, 4, 1)]),
         }
     )
     # Tag "J" as a month-unit date via field metadata; "K" and "L" keep
     # plain date32 fields.
     schema = table.schema.set(
         table.schema.get_field_index("J"),
         pa.field("J", pa.date32(), metadata={"unit": "month"}),
     )
     with pa.ipc.RecordBatchFileWriter(output_path, schema) as writer:
         writer.write_table(pa.table(table.columns, schema=schema))
     return []
Пример #23
0
def Timestamp(name: str) -> Column:
    """Build a Timestamp-typed Column named *name*."""
    column_type = ColumnType.Timestamp()
    return Column(name, column_type)
Пример #24
0
 def test_text_dictionary_ok(self):
     """Dictionary-encoded string arrays read as plain Text columns."""
     array = pa.array(["x"]).dictionary_encode()
     columns = read_columns(pa.table({"A": array}))
     self.assertEqual(columns, [Column("A", ColumnType.Text())])
Пример #25
0
    def test_render_arrow_table_infer_output_column_formats_from_input(self):
        """Output columns recall input formats only where the column's type
        is unchanged; columns whose type changed get inferred defaults.
        """
        input_columns = [
            Column("A", ColumnType.Number("{:,.3f}")),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Number("{:,.3f}")),
            Column("D", ColumnType.Datetime()),
            Column("E", ColumnType.Datetime()),
            Column("F", ColumnType.Datetime()),
            Column("G", ColumnType.Text()),
            Column("H", ColumnType.Text()),
            Column("I", ColumnType.Text()),
        ]
        # The param name "arrow_table" is a special case
        def render(arrow_table, params, output_path, *, columns, **kwargs):
            # Test the "columns" kwarg
            self.assertEqual(columns, input_columns)
            # Output types deliberately mismatch input_columns for most
            # columns; only A, E and I keep their input type.
            table = pa.table(
                {
                    "A": [1],
                    "B": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                    "C": ["a"],
                    "D": [1],
                    "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                    "F": ["a"],
                    "G": [1],
                    "H": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                    "I": ["a"],
                }
            )
            with pa.ipc.RecordBatchFileWriter(output_path, table.schema) as writer:
                writer.write_table(table)
            return []

        # Input table matching input_columns: A-C numbers, D-F timestamps,
        # G-I text.
        with arrow_table_context(
            {
                "A": [1],
                "B": [1],
                "C": [1],
                "D": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "E": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "F": pa.array([datetime(2020, 3, 8)], pa.timestamp("ns")),
                "G": ["a"],
                "H": ["a"],
                "I": ["a"],
            },
            columns=input_columns,
            dir=self.basedir,
        ) as arrow_table:
            result = self._test_render(render, arrow_table=arrow_table)
            self.assertEqual(
                result.table.metadata.columns,
                [
                    Column("A", ColumnType.Number("{:,.3f}")),  # recalled
                    Column("B", ColumnType.Datetime()),  # inferred
                    Column("C", ColumnType.Text()),  # inferred
                    Column("D", ColumnType.Number("{:,}")),  # inferred
                    Column("E", ColumnType.Datetime()),  # recalled
                    Column("F", ColumnType.Text()),  # inferred
                    Column("G", ColumnType.Number("{:,}")),  # inferred
                    Column("H", ColumnType.Datetime()),  # inferred
                    Column("I", ColumnType.Text()),  # recalled
                ],
            )
Пример #26
0
 def test_timestamp_ok(self):
     """timestamp(ns) arrays read as Timestamp columns."""
     array = pa.array([12312312314512], pa.timestamp("ns"))
     columns = read_columns(pa.table({"A": array}))
     self.assertEqual(columns, [Column("A", ColumnType.Timestamp())])
Пример #27
0
def Datetime(name: str) -> Column:
    """Build a Datetime-typed Column named *name*."""
    column_type = ColumnType.Datetime()
    return Column(name, column_type)
Пример #28
0
def Text(name: str) -> Column:
    """Build a Text-typed Column named *name*."""
    column_type = ColumnType.Text()
    return Column(name, column_type)
Пример #29
0
def Number(name: str, format: str = "{:,.2f}") -> Column:
    """Build a Number-typed Column; *format* defaults to "{:,.2f}"."""
    column_type = ColumnType.Number(format=format)
    return Column(name, column_type)
Пример #30
0
 def test_text_ok(self):
     """Plain utf8 arrays read as Text columns."""
     columns = read_columns(pa.table({"A": ["x"]}))
     self.assertEqual(columns, [Column("A", ColumnType.Text())])