Example #1
    def test_metadata_does_not_require_file_read(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Timestamp()),
            Column("C", ColumnType.Text()),
            Column("D", ColumnType.Date("month")),
        ]
        with arrow_table_context(
            make_column("A", [1], format="{:,.2f}"),
            make_column("B", [datetime.datetime(2021, 4, 13)]),
            make_column("C", ["c"]),
            make_column("D", [datetime.date(2021, 4, 1)], unit="month"),
        ) as (path, table):
            result = LoadedRenderResult(
                path=path, table=table, columns=columns, errors=[], json={}
            )
            cache_render_result(self.workflow, self.step, 1, result)
        # Delete from disk entirely, to prove we did not read.
        s3.remove(BUCKET, crr_parquet_key(self.step.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_step = Step.objects.get(id=self.step.id)
        cached_result = fresh_step.cached_render_result

        self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
Example #2
    def test_render_with_input_columns(self):
        def render(*args, input_columns):
            self.assertEqual(
                input_columns,
                {
                    "A": ptypes.RenderColumn("A", "text", None),
                    "B": ptypes.RenderColumn("B", "number", "{:,.3f}"),
                    "C": ptypes.RenderColumn("C", "datetime", None),
                },
            )

        with arrow_table_context(
            {
                "A": ["x"],
                "B": [1],
                "C": pa.array([datetime.now()], pa.timestamp("ns")),
            },
            columns=[
                Column("A", ColumnType.Text()),
                Column("B", ColumnType.Number("{:,.3f}")),
                Column("C", ColumnType.Datetime()),
            ],
            dir=self.basedir,
        ) as arrow_table:
            self._test_render(render, arrow_table=arrow_table)
Example #3
    def test_render_using_tab_output(self):
        def render(table, params):
            self.assertEqual(params["tabparam"].slug, "tab-1")
            self.assertEqual(params["tabparam"].name, "Tab 1")
            self.assertEqual(
                params["tabparam"].columns,
                {
                    "X": ptypes.RenderColumn("X", "number", "{:,d}"),
                    "Y": ptypes.RenderColumn("Y", "text", None),
                },
            )
            assert_frame_equal(
                params["tabparam"].dataframe, pd.DataFrame({"X": [1], "Y": ["y"]})
            )

        with arrow_table_context(
            {"X": [1], "Y": ["y"]},
            columns=[
                Column("X", ColumnType.Number("{:,d}")),
                Column("Y", ColumnType.Text()),
            ],
            dir=self.basedir,
        ) as atable:
            self._test_render(
                render,
                params={"tabparam": TabOutput(Tab("tab-1", "Tab 1"), atable)})
Example #4
    def test_clean_multicolumn_sort_in_table_order(self):
        input_shape = TableMetadata(
            3, [Column("B", ColumnType.Number()), Column("A", ColumnType.Number())]
        )
        result = clean_value(ParamDType.Multicolumn(), ["A", "B"], input_shape)
        self.assertEqual(result, ["B", "A"])
Example #5
    def test_metadata_comes_from_db_columns(self):
        columns = [
            Column("A", ColumnType.Number(format="{:,.2f}")),
            Column("B", ColumnType.Datetime()),
            Column("C", ColumnType.Text()),
        ]
        result = RenderResult(
            arrow_table(
                {"A": [1], "B": [datetime.datetime.now()], "C": ["x"]},
                columns=columns,
            )
        )
        cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
        # Delete from disk entirely, to prove we did not read.
        minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))

        # Load _new_ CachedRenderResult -- from DB columns, not memory
        fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
        cached_result = fresh_wf_module.cached_render_result

        self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
Example #6
    def test_clean_multicolumn_valid(self):
        input_shape = TableMetadata(
            3, [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())]
        )
        result = clean_value(ParamSchema.Multicolumn(), ["A", "B"], input_shape)
        self.assertEqual(result, ["A", "B"])
Example #7
    def test_clean_multicolumn_missing_is_removed(self):
        input_shape = TableMetadata(
            3, [Column("A", ColumnType.Number()), Column("B", ColumnType.Number())]
        )
        result = clean_value(ParamDType.Multicolumn(), ["A", "X", "B"], input_shape)
        self.assertEqual(result, ["A", "B"])
Example #8
    def test_render_use_input_columns_as_try_fallback_columns(self):
        def render(*args, input_columns):
            return pd.DataFrame({"A": [1]})

        with arrow_table_context({"A": [1]},
                                 [Column("A", ColumnType.Number("{:,.3f}"))],
                                 dir=self.basedir) as arrow_table:
            result = self._test_render(render, arrow_table=arrow_table)
            self.assertEqual(
                result.table.metadata.columns,
                [Column("A", ColumnType.Number("{:,.3f}"))],
            )
Example #9
    def test_fetch_truncate(self):
        def fetch(params):
            return pd.DataFrame({"A": [1, 2, 3]})

        with tempfile_context(dir=self.basedir) as outfile:
            result = self._test_fetch(fetch, output_filename=outfile.name)
            self.assertEqual(
                result,
                FetchResult(
                    outfile,
                    errors=[
                        FetchError(
                            I18nMessage(
                                "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
                                {"old_number": 3, "new_number": 2},
                                None,
                            ))
                    ],
                ),
            )
            assert_arrow_table_equals(
                read_parquet_as_arrow(
                    outfile, [Column("A", ColumnType.Number("{:,}"))]),
                make_table(make_column("A", [1, 2])),
            )
Example #10
    def test_clean_column_happy_path(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        self.assertEqual(
            clean_value(
                ParamDType.Column(column_types=frozenset({"number"})), "A", input_shape
            ),
            "A",
        )
Example #11
    def test_fetch_integration(self, send_update, queue_render):
        queue_render.side_effect = async_value(None)
        send_update.side_effect = async_value(None)
        workflow = Workflow.create_and_init()
        create_module_zipfile(
            "mod",
            python_code=(
                "import pandas as pd\n"
                "def fetch(params): return pd.DataFrame({'A': [1]})\n"
                "def render(table, params): return table"
            ),
        )
        step = workflow.tabs.first().steps.create(
            order=0, slug="step-1", module_id_name="mod"
        )
        cjwstate.modules.init_module_system()
        now = datetime.datetime.now()
        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                fetch.fetch(workflow_id=workflow.id, step_id=step.id, now=now))
        step.refresh_from_db()
        so = step.stored_objects.get(stored_at=step.stored_data_version)
        with s3.temporarily_download(s3.StoredObjectsBucket,
                                     so.key) as parquet_path:
            # fetch results are stored without a schema. Let's hard-code a
            # schema simply so we can test that the table data is the same.
            table = read_parquet_as_arrow(parquet_path,
                                          [Column("A", ColumnType.Number())])
            assert_arrow_table_equals(table, make_table(make_column("A", [1])))

        workflow.refresh_from_db()
        queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
        send_update.assert_called()
Example #12
    def test_input_crr_corrupt_cache_error_is_none(
        self, downloaded_parquet_file, load_module
    ):
        load_module.return_value.migrate_params.return_value = {}
        load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
        downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
            "file not found"
        )
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            WfModule(),
            MockModuleVersion(),
            {},
            None,
            input_crr,
            self.output_path,
        )
        # fetch is still called, with `None` as argument.
        self.assertIsNone(
            load_module.return_value.fetch.call_args[1]["input_parquet_filename"]
        )
Example #13
    def test_input_crr(self, downloaded_parquet_file, clean_value, load_module):
        load_module.return_value.migrate_params.return_value = {}
        load_module.return_value.fetch.return_value = FetchResult(self.output_path, [])
        clean_value.return_value = {}
        downloaded_parquet_file.return_value = Path("/path/to/x.parquet")
        input_metadata = TableMetadata(3, [Column("A", ColumnType.Text())])
        input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)
        fetch.fetch_or_wrap_error(
            self.ctx,
            self.chroot_context,
            self.basedir,
            WfModule(),
            MockModuleVersion(),
            {},
            None,
            input_crr,
            self.output_path,
        )
        # Passed file is downloaded from rendercache
        downloaded_parquet_file.assert_called_with(input_crr, dir=self.basedir)
        self.assertEqual(
            load_module.return_value.fetch.call_args[1]["input_parquet_filename"],
            "x.parquet",
        )
        # clean_value() is called with input metadata from CachedRenderResult
        clean_value.assert_called()
        self.assertEqual(clean_value.call_args[0][2], input_metadata)
Example #14
    def test_date_unit_day_ok(self):
        table = pa.table(
            [pa.array([date(2021, 4, 4)])],
            pa.schema([pa.field("A", pa.date32(), metadata={b"unit": b"day"})]),
        )
        self.assertEqual(
            read_columns(table), [Column("A", ColumnType.Date(unit="day"))]
        )
Example #15
def infer_table_metadata(table: pyarrow.Table) -> TableMetadata:
    return TableMetadata(
        n_rows=table.num_rows,
        columns=[
            Column(name, _infer_output_column_type(column))
            for name, column in zip(table.column_names, table.columns)
        ],
    )
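
A minimal usage sketch of the helper above. The table data here is hypothetical, and `_infer_output_column_type` is assumed to map each pyarrow type to its matching ColumnType, as the loop implies:

import pyarrow

table = pyarrow.table({"A": [1, 2], "B": ["x", "y"]})  # hypothetical input table
metadata = infer_table_metadata(table)
# One Column per input column, row count taken from the table
assert metadata.n_rows == 2
assert [c.name for c in metadata.columns] == ["A", "B"]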
Example #16
    def test_date_unit_month_ok(self):
        table = pa.table(
            [pa.array([date(1200, 12, 1), date(3199, 2, 1), None])],
            pa.schema([pa.field("A", pa.date32(), metadata={b"unit": b"month"})]),
        )
        self.assertEqual(
            read_columns(table), [Column("A", ColumnType.Date(unit="month"))]
        )
Example #17
    def test_cache_render_result(self):
        with arrow_table_context(make_column("A", [1])) as (table_path, table):
            result = LoadedRenderResult(
                path=table_path,
                table=table,
                columns=[Column("A", ColumnType.Number(format="{:,}"))],
                errors=[
                    RenderError(
                        I18nMessage("e1", {"text": "hi"}, None),
                        [
                            QuickFix(
                                I18nMessage("q1", {"var": 2}, None),
                                QuickFixAction.PrependStep("filter", {"a": "x"}),
                            )
                        ],
                    ),
                    RenderError(I18nMessage("e2", {}, None), []),
                ],
                json={"foo": "bar"},
            )
            cache_render_result(self.workflow, self.step, 1, result)

        cached = self.step.cached_render_result
        self.assertEqual(cached.step_id, self.step.id)
        self.assertEqual(cached.delta_id, 1)

        self.assertEqual(
            crr_parquet_key(cached),
            f"wf-{self.workflow.id}/wfm-{self.step.id}/delta-1.dat",
        )

        # Reading completely freshly from the DB should give the same thing
        db_step = Step.objects.get(id=self.step.id)
        from_db = db_step.cached_render_result
        self.assertEqual(from_db, cached)

        with open_cached_render_result(from_db) as result2:
            assert_arrow_table_equals(
                result2.table, make_table(make_column("A", [1], format="{:,}"))
            )
            self.assertEqual(
                result2.columns, [Column("A", ColumnType.Number(format="{:,}"))]
            )
Example #18
    def test_clean_normal_dict(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        schema = ParamDType.Dict(
            {"str": ParamDType.String(), "int": ParamDType.Integer()}
        )
        value = {"str": "foo", "int": 3}
        expected = dict(value)  # no-op
        result = clean_value(schema, value, input_shape)
        self.assertEqual(result, expected)
Example #19
    def test_execute_partial_cache_hit(self):
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        # step1: cached result is fresh. Should not render.
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(workflow, step1, workflow.last_delta_id,
                             make_table(make_column("A", ["a"])))
        # step2: cached result is stale, so must be re-rendered
        step2 = tab.steps.create(
            order=1,
            slug="step-2",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(
            workflow,
            step2,
            workflow.last_delta_id - 1,
            make_table(make_column("B", ["b"])),
        )

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        new_table = make_table(make_column("C", ["c"]))

        with patch.object(Kernel, "render",
                          side_effect=mock_render(new_table)):
            with self._execute(workflow, tab_flow, {}) as (result, path):
                self.assertEqual(
                    result, StepResult(path, [Column("C", ColumnType.Text())]))
                assert_arrow_table_equals(load_trusted_arrow_file(path),
                                          new_table)

            Kernel.render.assert_called_once()  # step2, not step1

            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Example #20
    def test_resume_backtrack_on_corrupt_cache_error(self):
        module_zipfile = create_module_zipfile(
            "mod", spec_kwargs={"loads_data": True})
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        # step1: cached result is fresh -- but CORRUPT
        step1 = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="mod",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        write_to_rendercache(workflow, step1, workflow.last_delta_id,
                             make_table(make_column("A", [1])))
        step1.refresh_from_db()
        s3.put_bytes(
            # Write corrupted data -- will lead to CorruptCacheError
            rendercache.io.BUCKET,
            rendercache.io.crr_parquet_key(step1.cached_render_result),
            b"CORRUPT",
        )
        # step2: no cached result -- must re-render
        step2 = tab.steps.create(order=1, slug="step-2", module_id_name="mod")

        tab_flow = TabFlow(
            Tab(tab.slug, tab.name),
            [
                ExecuteStep(step1, module_zipfile, {}),
                ExecuteStep(step2, module_zipfile, {}),
            ],
        )

        new_table = make_table(make_column("B", ["b"]))

        with patch.object(Kernel, "render",
                          side_effect=mock_render(new_table)):
            with self._execute(
                workflow, tab_flow, {}, expect_log_level=logging.ERROR
            ) as (result, path):
                self.assertEqual(
                    result, StepResult(path, [Column("B", ColumnType.Text())])
                )

            self.assertEqual(
                # called with step1, then step2
                Kernel.render.call_count,
                2,
            )
            self.assertRegex(
                # Output is to the correct file
                Kernel.render.call_args[1]["output_filename"],
                r"execute-tab-output.*\.arrow",
            )
Example #21
def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    if pyarrow.types.is_floating(column.type) or pyarrow.types.is_integer(
            column.type):
        column_type = ColumnType.Number("{:,}")
    elif pyarrow.types.is_timestamp(column.type):
        column_type = ColumnType.Timestamp()
    elif pyarrow.types.is_string(column.type) or pyarrow.types.is_dictionary(
            column.type):
        column_type = ColumnType.Text()
    else:
        raise RuntimeError("Unknown column type %r" % column.type)
    return Column(name, column_type)
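
A hedged usage sketch of this classifier (hypothetical data; only the number, timestamp, and text/dictionary arrow types shown above are handled, and anything else raises RuntimeError). Column equality is by value, as the tests elsewhere on this page rely on:

import pyarrow

table = pyarrow.table({"A": [1.5, 2.5], "B": ["x", "y"]})  # hypothetical input
assert _arrow_column_to_column("A", table.column("A")) == Column(
    "A", ColumnType.Number("{:,}")
)
assert _arrow_column_to_column("B", table.column("B")) == Column(
    "B", ColumnType.Text()
)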
Example #22
    def test_fetch_return_dataframe(self):
        async def fetch(params):
            return pd.DataFrame({"A": ["x", "y"]})

        with tempfile_context(dir=self.basedir) as outfile:
            result = self._test_fetch(fetch, output_filename=outfile.name)

            self.assertEqual(result.errors, [])
            arrow_table = read_parquet_as_arrow(
                outfile, [Column("A", ColumnType.Text())])
            assert_arrow_table_equals(arrow_table,
                                      make_table(make_column("A", ["x", "y"])))
Example #23
    def test_clean_column_prompting_error_convert_to_number(self):
        input_shape = TableMetadata(3, [Column("A", ColumnType.Text())])
        with self.assertRaises(PromptingError) as cm:
            clean_value(ParamDType.Column(column_types=frozenset({"number"})),
                        "A", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A"], "text",
                                               frozenset({"number"}))
            ],
        )
Example #24
    def test_clean_multicolumn_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # ... but for now: prompt for a Quick Fix.
        input_shape = TableMetadata(
            3,
            [
                Column("A", ColumnType.Number()),
                Column("B", ColumnType.Timestamp()),
                Column("C", ColumnType.Text()),
            ],
        )
        with self.assertRaises(PromptingError) as cm:
            schema = ParamDType.Multicolumn(column_types=frozenset({"text"}))
            clean_value(schema, "A,B", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A", "B"], None,
                                               frozenset({"text"}))
            ],
        )
Example #25
    def test_date_unit_year_ok(self):
        table = pa.table(
            [pa.array([date(1900, 1, 1), date(1, 1, 1), date(9999, 1, 1), None])],
            pa.schema([pa.field("A", pa.date32(), metadata={b"unit": b"year"})]),
        )
        self.assertEqual(
            read_columns(table), [Column("A", ColumnType.Date(unit="year"))]
        )
Example #26
    def test_number_metadata_utf8_format(self):
        table = pa.table(
            [pa.array([123])],
            pa.schema(
                [
                    pa.field(
                        "A",
                        pa.int64(),
                        metadata={b"format": "€{:,.2f}".encode("utf-8")},
                    )
                ]
            ),
        )
        self.assertEqual(
            read_columns(table), [Column("A", ColumnType.Number(format="€{:,.2f}"))]
        )
Example #27
    def test_dict_prompting_error(self):
        input_shape = TableMetadata(
            3, [Column("A", ColumnType.Text()), Column("B", ColumnType.Text())]
        )
        schema = ParamDType.Dict(
            {
                "col1": ParamDType.Column(column_types=frozenset({"number"})),
                "col2": ParamDType.Column(column_types=frozenset({"timestamp"})),
            }
        )
        with self.assertRaises(PromptingError) as cm:
            clean_value(schema, {"col1": "A", "col2": "B"}, input_shape)

        self.assertEqual(
            cm.exception.errors,
            [
                PromptingError.WrongColumnType(["A"], "text",
                                               frozenset({"number"})),
                PromptingError.WrongColumnType(["B"], "text",
                                               frozenset({"timestamp"})),
            ],
        )
Example #28
def _dict_to_column(value: Dict[str, Any]) -> Column:
    kwargs = dict(value)
    name = kwargs.pop("name")
    type_name = kwargs.pop("type")
    try:
        type_cls = {
            "text": ColumnType.Text,
            "number": ColumnType.Number,
            "datetime": ColumnType.Datetime,
        }[type_name]
    except KeyError:
        raise ValueError("Invalid type: %r" % type_name)

    return Column(name, type_cls(**kwargs))
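
A brief usage sketch (hypothetical dict values; note this mapping expects the older "datetime" type name, and any extra keys are forwarded to the ColumnType constructor):

# Builds Column("A", ColumnType.Number(format="{:,.2f}")) from a plain dict
column = _dict_to_column({"name": "A", "type": "number", "format": "{:,.2f}"})

# An unknown type name raises ValueError
try:
    _dict_to_column({"name": "B", "type": "boolean"})
except ValueError as err:
    print(err)  # Invalid type: 'boolean'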
Example #29
    def test_clean_column_prompting_error_convert_to_text(self):
        # TODO make this _automatic_ instead of quick-fix?
        # Consider Regex. We probably want to pass the module a text Series
        # _separately_ from the input DataFrame. That way Regex can output
        # a new Text column but preserve its input column's data type.
        #
        # ... but for now: prompt for a Quick Fix.
        input_shape = TableMetadata(3, [Column("A", ColumnType.Number())])
        with self.assertRaises(PromptingError) as cm:
            clean_value(ParamDType.Column(column_types=frozenset({"text"})),
                        "A", input_shape)

        self.assertEqual(
            cm.exception.errors,
            [PromptingError.WrongColumnType(["A"], None, frozenset({"text"}))],
        )
Example #30
    def test_invalid_parquet_is_corrupt_cache_error(self):
        with arrow_table_context(make_column("A", ["x"])) as (path, table):
            result = LoadedRenderResult(
                path=path,
                table=table,
                columns=[Column("A", ColumnType.Text())],
                errors=[],
                json={},
            )
            cache_render_result(self.workflow, self.step, 1, result)
        crr = self.step.cached_render_result
        s3.put_bytes(BUCKET, crr_parquet_key(crr), b"NOT PARQUET")
        with tempfile_context() as arrow_path:
            with self.assertRaises(CorruptCacheError):
                with open_cached_render_result(crr) as loaded:
                    pass