Exemplo n.º 1
0
 def test_clean_multicolumn_sort_in_table_order(self):
     # The table lists "B" before "A"; the cleaned value is expected to follow
     # that table order rather than the order the columns were requested in.
     table_columns = [
         Column("B", ColumnType.Number()),
         Column("A", ColumnType.Number()),
     ]
     shape = TableMetadata(3, table_columns)
     cleaned = clean_value(ParamDType.Multicolumn(), ["A", "B"], shape)
     self.assertEqual(cleaned, ["B", "A"])
Exemplo n.º 2
0
 def test_clean_multicolumn_valid(self):
     # Every requested column exists in the table, so the value is expected
     # to come back unchanged.
     shape = TableMetadata(
         3, [Column(name, ColumnType.Number()) for name in ("A", "B")]
     )
     cleaned = clean_value(ParamSchema.Multicolumn(), ["A", "B"], shape)
     self.assertEqual(cleaned, ["A", "B"])
Exemplo n.º 3
0
 def test_clean_multicolumn_missing_is_removed(self):
     # "X" is not a column of the table, so it is expected to be dropped
     # from the cleaned value while "A" and "B" are kept.
     shape = TableMetadata(
         3, [Column(name, ColumnType.Number()) for name in ("A", "B")]
     )
     requested = ["A", "X", "B"]
     cleaned = clean_value(ParamDType.Multicolumn(), requested, shape)
     self.assertEqual(cleaned, ["A", "B"])
Exemplo n.º 4
0
    def test_render_use_input_columns_as_try_fallback_columns(self):
        # render() returns a bare DataFrame carrying no number format, yet the
        # output column "A" is expected to have the input column's "{:,.3f}".

        def render(*args, input_columns):
            return pd.DataFrame({"A": [1]})

        table_columns = [Column("A", ColumnType.Number("{:,.3f}"))]
        with arrow_table_context(
            {"A": [1]}, table_columns, dir=self.basedir
        ) as input_table:
            rendered = self._test_render(render, arrow_table=input_table)
            expected_columns = [Column("A", ColumnType.Number("{:,.3f}"))]
            self.assertEqual(rendered.table.metadata.columns, expected_columns)
Exemplo n.º 5
0
    def test_fetch_truncate(self):
        # fetch() produces three rows; the expected result keeps the first two
        # and reports a truncation warning (3 -> 2). The row limit that causes
        # the truncation is not set inside this test.

        def fetch(params):
            return pd.DataFrame({"A": [1, 2, 3]})

        truncation_warning = FetchError(
            I18nMessage(
                "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
                {"old_number": 3, "new_number": 2},
                None,
            )
        )

        with tempfile_context(dir=self.basedir) as outfile:
            fetched = self._test_fetch(fetch, output_filename=outfile.name)
            self.assertEqual(
                fetched, FetchResult(outfile, errors=[truncation_warning])
            )

            # Read the written file back and compare it to the two kept rows.
            stored_table = read_parquet_as_arrow(
                outfile, [Column("A", ColumnType.Number("{:,}"))]
            )
            assert_arrow_table_equals(
                stored_table, make_table(make_column("A", [1, 2]))
            )
Exemplo n.º 6
0
    def test_fetch_integration(self, send_update, queue_render):
        # Runs fetch.fetch() for a step of module "mod", then checks the stored
        # table and that a render was queued and an update was sent.
        # send_update / queue_render are presumably mocks injected by patch
        # decorators outside this view -- TODO confirm.
        queue_render.side_effect = async_value(None)
        send_update.side_effect = async_value(None)
        workflow = Workflow.create_and_init()
        module_code = (
            "import pandas as pd\n"
            "def fetch(params): return pd.DataFrame({'A': [1]})\n"
            "def render(table, params): return table"
        )
        create_module_zipfile("mod", python_code=module_code)
        first_tab = workflow.tabs.first()
        step = first_tab.steps.create(
            order=0, slug="step-1", module_id_name="mod"
        )
        cjwstate.modules.init_module_system()
        fetch_time = datetime.datetime.now()
        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                fetch.fetch(
                    workflow_id=workflow.id, step_id=step.id, now=fetch_time
                )
            )
        step.refresh_from_db()
        stored_object = step.stored_objects.get(
            stored_at=step.stored_data_version
        )
        with s3.temporarily_download(
            s3.StoredObjectsBucket, stored_object.key
        ) as parquet_path:
            # Fetch results are stored without a schema, so a schema is
            # hard-coded here purely to compare the table data.
            fetched_table = read_parquet_as_arrow(
                parquet_path, [Column("A", ColumnType.Number())]
            )
            expected_table = make_table(make_column("A", [1]))
            assert_arrow_table_equals(fetched_table, expected_table)

        workflow.refresh_from_db()
        queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
        send_update.assert_called()
Exemplo n.º 7
0
    def test_metadata_comes_from_db_columns(self):
        # After the cached file is removed from storage, table_metadata of a
        # freshly loaded WfModule must still match the cached columns.
        expected_columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Datetime()),
            Column("C", ColumnType.Text()),
        ]
        table = arrow_table(
            {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]},
            columns=expected_columns,
        )
        cache_render_result(
            self.workflow, self.wf_module, self.delta.id, RenderResult(table)
        )

        # Remove the stored file so that reading it cannot be the source of
        # the metadata checked below.
        stored_key = crr_parquet_key(self.wf_module.cached_render_result)
        minio.remove(BUCKET, stored_key)

        # Query a new WfModule instance rather than reusing the in-memory one.
        reloaded = WfModule.objects.get(id=self.wf_module.id)
        reloaded_result = reloaded.cached_render_result

        self.assertEqual(
            reloaded_result.table_metadata, TableMetadata(1, expected_columns)
        )
Exemplo n.º 8
0
def _infer_output_column_type(column: pyarrow.ChunkedArray) -> ColumnType:
    """Pick a ``ColumnType`` for *column* from its Arrow type.

    Returns ``ColumnType.Text()`` when the type is utf8, or when the type has
    a ``value_type`` attribute that is utf8. Every other type yields
    ``ColumnType.Number()``.
    """
    arrow_type = column.type
    if arrow_type == pyarrow.utf8():
        return ColumnType.Text()
    # Types exposing a ``value_type`` count as text when that type is utf8.
    if hasattr(arrow_type, "value_type") and arrow_type.value_type == pyarrow.utf8():
        return ColumnType.Text()
    return ColumnType.Number()
Exemplo n.º 9
0
    def test_render_with_input_columns(self):
        # render() is handed an "input_columns" kwarg; it must map each input
        # column name to a RenderColumn of (name, type name, format).

        def render(*args, input_columns):
            expected = {
                "A": ptypes.RenderColumn("A", "text", None),
                "B": ptypes.RenderColumn("B", "number", "{:,.3f}"),
                "C": ptypes.RenderColumn("C", "datetime", None),
            }
            self.assertEqual(input_columns, expected)

        data = {
            "A": ["x"],
            "B": [1],
            "C": pa.array([datetime.now()], pa.timestamp("ns")),
        }
        table_columns = [
            Column("A", ColumnType.Text()),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Datetime()),
        ]
        with arrow_table_context(
            data, columns=table_columns, dir=self.basedir
        ) as input_table:
            self._test_render(render, arrow_table=input_table)
Exemplo n.º 10
0
 def test_clean_column_happy_path(self):
     # "A" is a number column and the param accepts numbers, so the column
     # name is expected to come back unchanged.
     shape = TableMetadata(3, [Column("A", ColumnType.Number())])
     number_column_param = ParamDType.Column(column_types=frozenset({"number"}))
     cleaned = clean_value(number_column_param, "A", shape)
     self.assertEqual(cleaned, "A")
Exemplo n.º 11
0
    def test_metadata_does_not_require_file_read(self):
        # After the cached file is removed from storage, table_metadata of a
        # freshly loaded Step must still match the cached columns.
        expected_columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
            Column("D", ColumnType.Date("month")),
        ]
        table_columns = (
            make_column("A", [1], format="{:,.2f}"),
            make_column("B", [datetime.datetime(2021, 4, 13)]),
            make_column("C", ["c"]),
            make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
        )
        with arrow_table_context(*table_columns) as (path, table):
            loaded = LoadedRenderResult(
                path=path,
                table=table,
                columns=expected_columns,
                errors=[],
                json={},
            )
            cache_render_result(self.workflow, self.step, 1, loaded)

        # Remove the stored file so that reading it cannot be the source of
        # the metadata checked below.
        stored_key = crr_parquet_key(self.step.cached_render_result)
        s3.remove(BUCKET, stored_key)

        # Query a new Step instance rather than reusing the in-memory one.
        reloaded_result = Step.objects.get(id=self.step.id).cached_render_result

        self.assertEqual(
            reloaded_result.table_metadata, TableMetadata(1, expected_columns)
        )
Exemplo n.º 12
0
    def test_render_using_tab_output(self):
        # A TabOutput passed as a param must reach render() exposing the tab's
        # slug, name, columns and dataframe.

        def render(table, params):
            tab_output = params["tabparam"]
            self.assertEqual(tab_output.slug, "tab-1")
            self.assertEqual(tab_output.name, "Tab 1")
            expected_columns = {
                "X": ptypes.RenderColumn("X", "number", "{:,d}"),
                "Y": ptypes.RenderColumn("Y", "text", None),
            }
            self.assertEqual(tab_output.columns, expected_columns)
            expected_frame = pd.DataFrame({"X": [1], "Y": ["y"]})
            assert_frame_equal(tab_output.dataframe, expected_frame)

        tab_columns = [
            Column("X", ColumnType.Number("{:,d}")),
            Column("Y", ColumnType.Text()),
        ]
        with arrow_table_context(
            {"X": [1], "Y": ["y"]}, columns=tab_columns, dir=self.basedir
        ) as atable:
            tab_param = TabOutput(Tab("tab-1", "Tab 1"), atable)
            self._test_render(render, params={"tabparam": tab_param})
Exemplo n.º 13
0
 def render(arrow_table, params, output_path, *, columns, **kwargs):
     """Check the ``columns`` kwarg, then write a 12-column table to *output_path*.

     The written schema replaces field "J" with a ``date32`` field whose
     metadata sets ``unit`` to "month". Returns an empty list.
     """
     # Test the "columns" kwarg
     #
     # TODO nix this! The only module that uses it is `converttotext`.
     expected_columns = (
         [Column(name, ColumnType.Number("{:,.3f}")) for name in ("A", "B", "C")]
         + [Column(name, ColumnType.Timestamp()) for name in ("D", "E", "F")]
         + [Column(name, ColumnType.Text()) for name in ("G", "H", "I")]
         + [
             Column("J", ColumnType.Date(unit="day")),
             Column("K", ColumnType.Date(unit="week")),
             Column("L", ColumnType.Text()),
         ]
     )
     self.assertEqual(columns, expected_columns)

     def timestamps():
         # A new one-element nanosecond timestamp array per call.
         return pa.array([datetime(2020, 3, 8)], pa.timestamp("ns"))

     table = pa.table(
         {
             "A": [1],
             "B": timestamps(),
             "C": ["a"],
             "D": [1],
             "E": timestamps(),
             "F": ["a"],
             "G": [1],
             "H": timestamps(),
             "I": ["a"],
             "J": pa.array([date(2021, 4, 1)]),
             "K": pa.array([date(2021, 4, 12)]),
             "L": pa.array([date(2021, 4, 1)]),
         }
     )
     # Swap in a "J" field that carries unit metadata; other fields are kept.
     j_index = table.schema.get_field_index("J")
     j_field = pa.field("J", pa.date32(), metadata={"unit": "month"})
     schema = table.schema.set(j_index, j_field)
     with pa.ipc.RecordBatchFileWriter(output_path, schema) as writer:
         writer.write_table(pa.table(table.columns, schema=schema))
     return []
Exemplo n.º 14
0
 def test_clean_normal_dict(self):
     # A dict value that already matches its schema is expected to come
     # back equal to the input.
     shape = TableMetadata(3, [Column("A", ColumnType.Number())])
     dict_schema = ParamDType.Dict(
         {"str": ParamDType.String(), "int": ParamDType.Integer()}
     )
     value = {"str": "foo", "int": 3}
     expected = value.copy()  # snapshot taken before cleaning
     self.assertEqual(clean_value(dict_schema, value, shape), expected)
Exemplo n.º 15
0
    def test_cache_render_result(self):
        # Cache a render result with two errors, then check what the step
        # exposes, the storage key, and what a fresh DB read gives back.
        quick_fix = QuickFix(
            I18nMessage("q1", {"var": 2}, None),
            QuickFixAction.PrependStep("filter", {"a": "x"}),
        )
        errors = [
            RenderError(I18nMessage("e1", {"text": "hi"}, None), [quick_fix]),
            RenderError(I18nMessage("e2", {}, None), []),
        ]
        with arrow_table_context(make_column("A", [1])) as (table_path, table):
            loaded = LoadedRenderResult(
                path=table_path,
                table=table,
                columns=[Column("A", ColumnType.Number(format="{:,}"))],
                errors=errors,
                json={"foo": "bar"},
            )
            cache_render_result(self.workflow, self.step, 1, loaded)

        cached = self.step.cached_render_result
        self.assertEqual(cached.step_id, self.step.id)
        self.assertEqual(cached.delta_id, 1)

        expected_key = f"wf-{self.workflow.id}/wfm-{self.step.id}/delta-1.dat"
        self.assertEqual(crr_parquet_key(cached), expected_key)

        # A Step queried anew from the DB must expose an equal cached result.
        from_db = Step.objects.get(id=self.step.id).cached_render_result
        self.assertEqual(from_db, cached)

        with open_cached_render_result(from_db) as reopened:
            expected_table = make_table(make_column("A", [1], format="{:,}"))
            assert_arrow_table_equals(reopened.table, expected_table)
            expected_columns = [Column("A", ColumnType.Number(format="{:,}"))]
            self.assertEqual(reopened.columns, expected_columns)
Exemplo n.º 16
0
    def test_render_return_column_formats(self):
        # A "column_formats" entry in render()'s return dict is expected to
        # become the number format of the output column.

        def render(table, params):
            frame = pd.DataFrame({"A": [1]})
            return {"dataframe": frame, "column_formats": {"A": "{:,d}"}}

        rendered = self._test_render(render)
        first_column = rendered.table.metadata.columns[0]
        self.assertEqual(first_column.type, ColumnType.Number("{:,d}"))
Exemplo n.º 17
0
def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    """Build a ``Column`` called *name* whose type follows the Arrow type.

    Floating-point and integer arrays become ``Number("{:,}")``, timestamps
    become ``Timestamp()``, and string or dictionary arrays become ``Text()``.

    Raises:
        RuntimeError: if the Arrow type is none of the above.
    """
    arrow_type = column.type
    checks = pyarrow.types
    if checks.is_floating(arrow_type) or checks.is_integer(arrow_type):
        return Column(name, ColumnType.Number("{:,}"))
    if checks.is_timestamp(arrow_type):
        return Column(name, ColumnType.Timestamp())
    if checks.is_string(arrow_type) or checks.is_dictionary(arrow_type):
        return Column(name, ColumnType.Text())
    raise RuntimeError("Unknown column type %r" % arrow_type)
Exemplo n.º 18
0
 def test_number_metadata_utf8_format(self):
     # The field's "format" metadata holds UTF-8 bytes with a non-ASCII
     # character; read_columns() must report it as the decoded string.
     number_format = "€{:,.2f}"
     field = pa.field(
         "A",
         pa.int64(),
         metadata={b"format": number_format.encode("utf-8")},
     )
     table = pa.table([pa.array([123])], pa.schema([field]))
     expected = [Column("A", ColumnType.Number(format=number_format))]
     self.assertEqual(read_columns(table), expected)
Exemplo n.º 19
0
    def test_clean_column_prompting_error_convert_to_text(self):
        # A number column given to a text-only Column param must raise a
        # PromptingError holding one WrongColumnType error.
        #
        # TODO make this _automatic_ instead of quick-fix?
        # Consider Regex. We probably want to pass the module a text Series
        # _separately_ from the input DataFrame. That way Regex can output
        # a new Text column but preserve its input column's data type.
        #
        # ... but for now: prompt for a Quick Fix.
        shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        with self.assertRaises(PromptingError) as cm:
            text_column_param = ParamDType.Column(
                column_types=frozenset({"text"})
            )
            clean_value(text_column_param, "A", shape)

        expected_error = PromptingError.WrongColumnType(
            ["A"], None, frozenset({"text"})
        )
        self.assertEqual(cm.exception.errors, [expected_error])
Exemplo n.º 20
0
    def test_disallow_non_text(self):
        # Requesting a number column is expected to give status 200 with an
        # empty "values" mapping.
        number_column = Column("A", ColumnType.Number(format="{:.2f}"))
        table = arrow_table({"A": [1, 2, 3, 2, 1]}, columns=[number_column])
        cache_render_result(
            self.workflow,
            self.step1,
            self.step1.last_relevant_delta_id,
            RenderResult(table),
        )

        response = self._request("A")

        self.assertEqual(response.status_code, 200)
        body = json.loads(response.content)
        self.assertEqual(body, {"values": {}})
Exemplo n.º 21
0
    def test_value_counts_disallow_non_text(self):
        # Asking for value counts of a number column is expected to give
        # HTTP 200 with an empty "values" mapping.
        number_column = Column("A", ColumnType.Number(format="{:.2f}"))
        table = arrow_table({"A": [1, 2, 3, 2, 1]}, columns=[number_column])
        cache_render_result(
            self.workflow,
            self.wf_module2,
            self.wf_module2.last_relevant_delta_id,
            RenderResult(table),
        )

        url = f"/api/wfmodules/{self.wf_module2.id}/value-counts?column=A"
        response = self.client.get(url)

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        body = json.loads(response.content)
        self.assertEqual(body, {"values": {}})
Exemplo n.º 22
0
    def test_clear(self):
        # Clearing a step's cached result must unset it in the DB and remove
        # the stored file.
        with arrow_table_context(make_column("A", [1])) as (path, table):
            loaded = LoadedRenderResult(
                path=path,
                table=table,
                columns=[Column("A", ColumnType.Number(format="{:,}"))],
                errors=[],
                json={},
            )
            cache_render_result(self.workflow, self.step, 1, loaded)

        # Take the key before clearing; it is computed from the cached result.
        stored_key = crr_parquet_key(self.step.cached_render_result)
        clear_cached_render_result_for_step(self.step)

        reloaded_step = Step.objects.get(id=self.step.id)
        self.assertIsNone(reloaded_step.cached_render_result)

        self.assertFalse(s3.exists(BUCKET, stored_key))
Exemplo n.º 23
0
    def test_execute_cache_hit(self):
        # Both steps have results in the render cache, so executing the tab
        # must return step 2's cached table and never call Kernel.render.
        cached_table1 = make_table(make_column("A", [1]))
        cached_table2 = make_table(make_column("B", [2], format="${:,}"))
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True}
        )
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()

        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(
            workflow, step1, workflow.last_delta_id, cached_table1
        )
        step2 = tab.steps.create(
            order=1,
            slug="step-2",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(
            workflow, step2, workflow.last_delta_id, cached_table2
        )

        execute_steps = [
            ExecuteStep(step, module_zipfile, {}) for step in (step1, step2)
        ]
        tab_flow = TabFlow(Tab(tab.slug, tab.name), execute_steps)
        expected_columns = [Column("B", ColumnType.Number(format="${:,}"))]

        # Kernel.render is patched to return a table that must never appear.
        decoy_table = make_table(make_column("No", ["bad"]))
        render_patch = patch.object(
            Kernel, "render", side_effect=mock_render(decoy_table)
        )
        with render_patch:
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(result, StepResult(path, expected_columns))
                assert_arrow_table_equals(
                    load_trusted_arrow_file(path), cached_table2
                )

            Kernel.render.assert_not_called()
Exemplo n.º 24
0
    def test_clean_multicolumn_prompting_error_convert_to_text(self):
        # Selecting the number column "A" and the timestamp column "B" for a
        # text-only Multicolumn param must raise a PromptingError holding one
        # WrongColumnType error that names both columns.
        #
        # TODO make this _automatic_ instead of quick-fix?
        # ... but for now: prompt for a Quick Fix.
        table_columns = [
            Column("A", ColumnType.Number()),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
        ]
        shape = TableMetadata(3, table_columns)
        with self.assertRaises(PromptingError) as cm:
            text_only_param = ParamDType.Multicolumn(
                column_types=frozenset({"text"})
            )
            clean_value(text_only_param, "A,B", shape)

        expected_error = PromptingError.WrongColumnType(
            ["A", "B"], None, frozenset({"text"})
        )
        self.assertEqual(cm.exception.errors, [expected_error])
Exemplo n.º 25
0
 def test_clean_column_missing(self):
     # "B" is not a column of the table, so the cleaned value is expected
     # to be the empty string.
     shape = TableMetadata(3, [Column("A", ColumnType.Number())])
     cleaned = clean_value(ParamDType.Column(), "B", shape)
     self.assertEqual(cleaned, "")
Exemplo n.º 26
0
 def test_clean_column_tab_parameter_is_error(self):
     # Cleaning a Column param that carries a tab_parameter is expected to
     # raise a RuntimeError with the message below.
     shape = TableMetadata(3, [Column("A", ColumnType.Number())])
     expected_message = "Unsupported: fetch column with tab_parameter"
     with self.assertRaisesRegex(RuntimeError, expected_message):
         tab_column_param = ParamDType.Column(tab_parameter="tab-2")
         clean_value(tab_column_param, "A", shape)
Exemplo n.º 27
0
def NUMBER(name: str, format: str = "{:,}"):
    """Return a number ``Column`` called *name*; *format* defaults to "{:,}"."""
    number_type = ColumnType.Number(format=format)
    return Column(name, number_type)
Exemplo n.º 28
0
def Number(name: str, format: str = "{:,.2f}") -> Column:
    """Return a number ``Column`` called *name*; *format* defaults to "{:,.2f}"."""
    number_type = ColumnType.Number(format=format)
    return Column(name, number_type)
Exemplo n.º 29
0
    def test_render_arrow_table_infer_output_column_formats_from_input(self):
        # render() writes a table whose column types only partly match the
        # input. Per the expected list below, a column whose type matches the
        # input keeps the input column type ("recalled"); the others get a
        # type derived from the written data ("inferred").
        input_columns = (
            [Column(name, ColumnType.Number("{:,.3f}")) for name in ("A", "B", "C")]
            + [Column(name, ColumnType.Datetime()) for name in ("D", "E", "F")]
            + [Column(name, ColumnType.Text()) for name in ("G", "H", "I")]
        )

        def timestamps():
            # A new one-element nanosecond timestamp array per call.
            return pa.array([datetime(2020, 3, 8)], pa.timestamp("ns"))

        # The param name "arrow_table" is a special case
        def render(arrow_table, params, output_path, *, columns, **kwargs):
            # Test the "columns" kwarg
            self.assertEqual(columns, input_columns)
            output_table = pa.table(
                {
                    "A": [1],
                    "B": timestamps(),
                    "C": ["a"],
                    "D": [1],
                    "E": timestamps(),
                    "F": ["a"],
                    "G": [1],
                    "H": timestamps(),
                    "I": ["a"],
                }
            )
            with pa.ipc.RecordBatchFileWriter(
                output_path, output_table.schema
            ) as writer:
                writer.write_table(output_table)
            return []

        input_data = {
            "A": [1],
            "B": [1],
            "C": [1],
            "D": timestamps(),
            "E": timestamps(),
            "F": timestamps(),
            "G": ["a"],
            "H": ["a"],
            "I": ["a"],
        }
        with arrow_table_context(
            input_data, columns=input_columns, dir=self.basedir
        ) as input_table:
            rendered = self._test_render(render, arrow_table=input_table)
            expected_columns = [
                Column("A", ColumnType.Number("{:,.3f}")),  # recalled
                Column("B", ColumnType.Datetime()),  # inferred
                Column("C", ColumnType.Text()),  # inferred
                Column("D", ColumnType.Number("{:,}")),  # inferred
                Column("E", ColumnType.Datetime()),  # recalled
                Column("F", ColumnType.Text()),  # inferred
                Column("G", ColumnType.Number("{:,}")),  # inferred
                Column("H", ColumnType.Datetime()),  # inferred
                Column("I", ColumnType.Text()),  # recalled
            ]
            self.assertEqual(rendered.table.metadata.columns, expected_columns)