Пример #1
0
def render_arrow(
    table, params, tab_name, fetch_result: Optional[FetchResult], output_path: Path
) -> RenderResult:
    """
    Turn a stored fetch result into a RenderResult.

    Cases, checked in this order:

    1. No fetch result at all: return an empty table.
    2. The fetched file carries the Parquet magic number: it is in the
       deprecated format and goes through `_render_deprecated_parquet()`.
    3. The fetch result carries errors: return an empty table plus those
       errors.
    4. Anything else: hand the fetched file to `_render_file()`.

    `table` and `tab_name` are accepted but not read by this function.
    """
    # NOTE(review): header handling is expected to happen during render
    # (presumably inside the helpers called below), because the header
    # checkbox state may change between renders -- confirm.
    if fetch_result is None:
        # Nothing was ever fetched: empty table.
        return RenderResult(ArrowTable())

    fetched_path = fetch_result.path

    if fetched_path is not None and parquet.file_has_parquet_magic_number(
        fetched_path
    ):
        # Deprecated files. fetch() used to parse the data and store the
        # result as Parquet, so the original file data is gone. The oldest
        # users still have these files and they must keep working.
        #
        # Parse errors, in this deprecated format, were stored as
        # fetch_result.errors -- so pass them along.
        return _render_deprecated_parquet(
            fetched_path, fetch_result.errors, output_path, params
        )

    if fetch_result.errors:
        # Errors and data have never been stored together: errors mean
        # there is no data to render.
        return RenderResult(ArrowTable(), fetch_result.errors)

    # Invariant: errors and data have never been stored together.
    assert not fetch_result.errors
    return _render_file(fetched_path, params, output_path)
Пример #2
0
def render_arrow(
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[types.FetchResult],
    output_path: Path,
) -> types.RenderResult:
    """
    Render, speaking `cjwkernel.types` on the outside.

    This default implementation converts the Arrow inputs to their Pandas
    equivalents, delegates to `render_pandas()`, and converts the result
    back to Arrow. Any Arrow output is written to `output_path`.

    Module authors are encouraged to override this function: Arrow tables
    are simpler and more memory-efficient than Pandas tables. A "rename
    columns" module, for example, is best written against this signature,
    since Arrow can pass data through without consuming excessive RAM.

    The value returned by render_pandas() is not validated here.
    """
    pandas_table = __arrow_to_pandas(table)

    # One Pandas tab output per tab referenced in params, keyed by tab slug.
    pandas_input_tabs = {}
    for tab_output in _find_tab_outputs(params):
        slug = tab_output.tab.slug
        pandas_input_tabs[slug] = __arrow_tab_output_to_pandas(tab_output)

    if fetch_result is None:
        pandas_fetch_result = None
    else:
        fetched_path = fetch_result.path
        # A zero-byte file, or a file with the Parquet magic number, is read
        # as a stored table and wrapped in a Pandas-style ProcessResult.
        # NOTE(review): presumably a zero-byte file stands for an empty
        # table -- confirm against __parquet_to_pandas().
        stored_as_table = (
            fetched_path.stat().st_size == 0
            or parquet.file_has_parquet_magic_number(fetched_path)
        )
        if stored_as_table:
            fetched_table = __parquet_to_pandas(fetched_path)
            pandas_errors = [
                ptypes.ProcessResultError.from_arrow(arrow_error)
                for arrow_error in fetch_result.errors
            ]
            pandas_fetch_result = ptypes.ProcessResult(fetched_table, pandas_errors)
        else:
            # Any other file: pass the fetch result through unconverted.
            pandas_fetch_result = fetch_result

    pandas_result: ptypes.ProcessResult = render_pandas(
        input_table=pandas_table,
        input_table_shape=ptypes.TableShape.from_arrow(table.metadata),
        params=_arrow_param_to_pandas_param(params),
        tab_name=tab_name,
        input_tabs=pandas_input_tabs,
        fetch_result=pandas_fetch_result,
    )

    return pandas_result.to_arrow(output_path)
Пример #3
0
 def test_empty_parquet_file(self):
     """The file produced by `parquet_file({})` is detected as Parquet."""
     # NOTE(review): presumably `{}` means "a table with no columns" --
     # confirm against the parquet_file() helper.
     with parquet_file({}) as parquet_path:
         detected = parquet.file_has_parquet_magic_number(parquet_path)
         self.assertTrue(detected)
Пример #4
0
 def test_good_magic_numbers_but_too_short_to_be_parquet(self):
     """A file containing only b"PAR1" must not be detected as Parquet."""
     # A Parquet file begins with PAR1 and also ends with PAR1. A file
     # whose whole content is a single "PAR1" is therefore not Parquet.
     with tempfile_context() as short_path:
         short_path.write_bytes(b"PAR1")
         detected = parquet.file_has_parquet_magic_number(short_path)
         self.assertFalse(detected)
Пример #5
0
 def test_very_short_file(self):
     """A three-byte file (b"PAR") must not be detected as Parquet."""
     with tempfile_context() as short_path:
         short_path.write_bytes(b"PAR")
         detected = parquet.file_has_parquet_magic_number(short_path)
         self.assertFalse(detected)
Пример #6
0
 def test_empty_file(self):
     """A temp file that nothing was written to is not detected as Parquet."""
     with tempfile_context() as untouched_path:
         detected = parquet.file_has_parquet_magic_number(untouched_path)
         self.assertFalse(detected)
Пример #7
0
 def test_parquet_file_has_magic_numbers(self):
     """The file produced by `parquet_file({"A": [1]})` is detected as Parquet."""
     with parquet_file({"A": [1]}) as parquet_path:
         detected = parquet.file_has_parquet_magic_number(parquet_path)
         self.assertTrue(detected)