Example #1
 def test_date_unit_year_bad(self):
     table = pa.table(
         [pa.array([date(1900, 4, 1)])],
         pa.schema(
             [pa.field("A", pa.date32(), metadata={b"unit": b"year"})]),
     )
     with self.assertRaises(DateValueHasWrongUnit):
         read_columns(table)
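These test snippets assume imports along the following lines. The module path for read_columns, Column, ColumnType and the exception classes is a hypothetical placeholder, since only the call sites appear here:

from datetime import date

import pyarrow as pa

# Hypothetical module path -- adjust to wherever read_columns and its
# companion types live in your project.
from renderer.validate import (
    Column,
    ColumnType,
    DateValueHasWrongUnit,
    DuplicateColumnName,
    FieldMetadataNotAllowed,
    InvalidNumberFormat,
    TableHasTooManyRecordBatches,
    TableSchemaHasMetadata,
    TimestampTimezoneNotAllowed,
    TimestampUnitNotAllowed,
    WrongColumnType,
    read_columns,
)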
Example #2
 def test_date_metadata_invalid_unit(self):
     table = pa.table(
         [pa.array([date(2021, 4, 4)])],
         pa.schema(
             [pa.field("A", pa.date32(), metadata={b"unit": b"days"})]),
     )
     with self.assertRaises(FieldMetadataNotAllowed):
         read_columns(table)
Example #3
 def test_text_metadata_not_none(self):
     table = pa.table(
         [pa.array(["x"])],
         pa.schema(
             [pa.field("A", pa.string(), metadata={b"unit": b"year"})]),
     )
     with self.assertRaises(FieldMetadataNotAllowed):
         read_columns(table)
Example #4
 def test_timestamp_metadata_non_null(self):
     table = pa.table(
         [pa.array([123123123], pa.timestamp("ns"))],
         pa.schema(
             [pa.field("A", pa.timestamp("ns"), metadata={b"foo":
                                                          b"bar"})]),
     )
     with self.assertRaises(FieldMetadataNotAllowed):
         read_columns(table)
Example #5
 def test_number_metadata_format_invalid_utf8(self):
     table = pa.table(
         [pa.array([123])],
         pa.schema([
             pa.field("A", pa.int64(), metadata={b"format": b"\xe2{:,.2f}"})
         ]),
     )
     with self.assertRaises(InvalidNumberFormat):
         read_columns(table)
Example #6
 def test_duplicate_column_names(self):
     table = pa.table(
         [pa.array(["x"]), pa.array(["x"])],
         pa.schema([pa.field("A", pa.string()),
                    pa.field("A", pa.string())]),
     )
     with self.assertRaisesRegex(
             DuplicateColumnName,
             "Table has two columns named 'A': column 0 and column 1",
     ):
         read_columns(table)
Example #7
 def test_date_metadata_too_many_keys(self):
     table = pa.table(
         [pa.array([date(2021, 4, 4)])],
         pa.schema([
             pa.field("A",
                      pa.date32(),
                      metadata={
                          b"unit": b"day",
                          b"foo": b"bar"
                      })
         ]),
     )
     with self.assertRaises(FieldMetadataNotAllowed):
         read_columns(table)
Example #8
 def test_number_metadata_too_many_keys(self):
     table = pa.table(
         [pa.array([123])],
         pa.schema([
             pa.field("A",
                      pa.int64(),
                      metadata={
                          b"format": b"{:,}",
                          b"foo": b"bar"
                      })
         ]),
     )
     with self.assertRaises(FieldMetadataNotAllowed):
         read_columns(table)
Example #9
 def test_date_unit_day_ok(self):
     table = pa.table(
         [pa.array([date(2021, 4, 4)])],
         pa.schema([pa.field("A", pa.date32(), metadata={b"unit":
                                                         b"day"})]),
     )
     self.assertEqual(read_columns(table),
                      [Column("A", ColumnType.Date(unit="day"))])
Example #10
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(module_spec,
                             thrift_json_object_to_pydict(request.params),
                             basedir, tab_outputs)
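    # Inspect render()'s signature so we only pass the optional keyword
    # arguments (fetch_result, settings, tab_name, input_columns) that the
    # module actually declares, or accepts via **kwargs.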
    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            if (fetch_result_path.stat().st_size == 0
                    or cjwparquet.file_has_parquet_magic_number(
                        fetch_result_path)):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path,
                                                 errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)

    # raise ValueError if invalid
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns)
    pandas_result.truncate_in_place_if_too_big()

    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)
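Because call_render() inspects the render callable with inspect.getfullargspec(), a module's render() only receives fetch_result, settings, tab_name and input_columns if it declares them as keyword-only arguments (or accepts **kwargs). A minimal sketch of a module that opts in to two of them (the body is illustrative, not from the source):

import pandas as pd


def render(table: pd.DataFrame, params: dict, *,
           fetch_result=None, settings=None):
    # "fetch_result" and "settings" are keyword-only, so call_render() finds
    # them in spec.kwonlyargs and passes them in; "tab_name" and
    # "input_columns" are not declared here, so they are omitted.
    return table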
Example #11
 def test_date_unit_month_ok(self):
     table = pa.table(
         [pa.array([date(1200, 12, 1),
                    date(3199, 2, 1), None])],
         pa.schema(
             [pa.field("A", pa.date32(), metadata={b"unit": b"month"})]),
     )
     self.assertEqual(read_columns(table),
                      [Column("A", ColumnType.Date(unit="month"))])
Example #12
 def test_number_metadata_utf8_format(self):
     table = pa.table(
         [pa.array([123])],
         pa.schema([
             pa.field(
                 "A",
                 pa.int64(),
                 metadata={b"format": "€{:,.2f}".encode("utf-8")},
             )
         ]),
     )
     self.assertEqual(read_columns(table),
                      [Column("A", ColumnType.Number(format="€{:,.2f}"))])
Example #13
 def test_date_unit_year_ok(self):
     table = pa.table(
         [
             pa.array(
                 [date(1900, 1, 1),
                  date(1, 1, 1),
                  date(9999, 1, 1), None])
         ],
         pa.schema(
             [pa.field("A", pa.date32(), metadata={b"unit": b"year"})]),
     )
     self.assertEqual(read_columns(table),
                      [Column("A", ColumnType.Date(unit="year"))])
Example #14
def write_to_rendercache(
    workflow: Workflow,
    step: Step,
    delta_id: int,
    table: pa.Table,
    errors: List[RenderError] = [],
    json: Dict[str, Any] = {},
) -> None:
    with arrow_table_context(table) as (path, table):
        result = LoadedRenderResult(
            path=path,
            table=table,
            columns=read_columns(table, full=False),
            errors=errors,
            json=json,
        )

        # use the caller-provided delta ID: no assertion
        old_last_relevant_delta_id = step.last_relevant_delta_id
        step.last_relevant_delta_id = delta_id
        try:
            cache_render_result(workflow, step, delta_id, result)
        finally:
            step.last_relevant_delta_id = old_last_relevant_delta_id
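A hypothetical call from a test, assuming workflow and step fixtures already exist (the fixture names and delta ID are illustrative):

write_to_rendercache(
    workflow,
    step,
    delta_id=2,
    table=pa.table({"A": ["x", "y"]}),
)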
Example #15
 def test_timestamp_tz_non_null(self):
     table = pa.table(
         {"A": pa.array([12312312314512], pa.timestamp("ns", tz="utc"))})
     with self.assertRaisesRegex(TimestampTimezoneNotAllowed,
                                 "Workbench does not support time zones"):
         read_columns(table)
Example #16
 def test_table_too_many_record_batches(self):
     table = pa.table(
         {"A": pa.chunked_array([pa.array(["x"]),
                                 pa.array(["y"])])})
     with self.assertRaises(TableHasTooManyRecordBatches):
         read_columns(table)
Example #17
 def test_table_has_metadata(self):
     table = pa.table({"A": ["x"]}).replace_schema_metadata({})  # non-null
     with self.assertRaises(TableSchemaHasMetadata):
         read_columns(table)
Example #18
 def test_unknown_column_type(self):
     table = pa.table({"A": pa.array([1231231], pa.time64("ns"))})
     with self.assertRaises(WrongColumnType):
         read_columns(table)
Example #19
 def test_timestamp_unit_not_ns(self):
     table = pa.table({"A": pa.array([12312312314512], pa.timestamp("us"))})
     with self.assertRaisesRegex(TimestampUnitNotAllowed,
                                 "Workbench only supports 'ns'"):
         read_columns(table)
Example #20
 def test_timestamp_ok(self):
     table = pa.table({"A": pa.array([12312312314512], pa.timestamp("ns"))})
     self.assertEqual(read_columns(table),
                      [Column("A", ColumnType.Timestamp())])
Example #21
 def test_number_metadata_none(self):
     table = pa.table({"A": pa.array([123123123])})
     with self.assertRaises(FieldMetadataNotAllowed):
         read_columns(table)
Example #22
 def test_text_dictionary_ok(self):
     self.assertEqual(
         read_columns(pa.table({"A":
                                pa.array(["x"]).dictionary_encode()}), ),
         [Column("A", ColumnType.Text())],
     )
Example #23
 def test_text_ok(self):
     self.assertEqual(read_columns(pa.table({"A": ["x"]})),
                      [Column("A", ColumnType.Text())])