def read_cached_render_result_pydict(
    crr: CachedRenderResult, only_columns: range, only_rows: range
) -> Dict[str, List[Any]]:
    """
    Return a dict mapping column name to data (Python objects).

    Python data consumes RAM, so you must specify columns and rows.

    `retval.keys()` is in table-column order (not `only_columns` order).

    Missing rows and columns are ignored.

    `NaN` is returned as float("nan").

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

    * The cached Parquet file is corrupt
    * The cached Parquet file is missing
    * `crr` is stale -- the cached result is for a different delta. This could
      be detected by a `Workflow.cooperative_lock()`, too, should the caller
      want to distinguish this error from the others.
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        return {}
    try:
        with downloaded_parquet_file(crr) as parquet_path:
            return parquet.read_pydict(parquet_path, only_columns, only_rows)
    except (pyarrow.ArrowIOError, FileNotFoundError):  # FIXME unit-test
        raise CorruptCacheError
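# --- Hedged usage sketch (not part of the original module) ------------------
# Shows one way a caller might consume read_cached_render_result_pydict. The
# helper name `_example_first_100_rows` and the 100-row limit are invented for
# illustration; only CachedRenderResult, CorruptCacheError and the reader
# itself come from the code above.
def _example_first_100_rows(crr: CachedRenderResult) -> Dict[str, List[Any]]:
    """Return up to 100 rows of every column, or {} if the cache is unusable."""
    n_columns = len(crr.table_metadata.columns)
    try:
        return read_cached_render_result_pydict(crr, range(n_columns), range(100))
    except CorruptCacheError:
        # Caller's choice: here a corrupt or stale cache is treated as
        # "no data" instead of being re-raised.
        return {}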
def test_pydict_zero_rows(self):
    with tempfile_context() as path:
        # ensure at least 1 row group
        parquet.write(
            path,
            pyarrow.table(
                {
                    "A": pyarrow.array([], type=pyarrow.string()),
                    "B": pyarrow.DictionaryArray.from_arrays(
                        pyarrow.array([], type=pyarrow.int32()),
                        pyarrow.array([], type=pyarrow.string()),
                    ),
                    "C": pyarrow.array([], type=pyarrow.timestamp("ns")),
                    "D": pyarrow.array([], type=pyarrow.float64()),
                }
            ),
        )
        self.assertEqual(
            parquet.read_pydict(path, range(4), range(0)),
            {"A": [], "B": [], "C": [], "D": []},
        )
def test_pydict_zero_row_groups(self):
    table = pyarrow.Table.from_batches(
        [], schema=pyarrow.schema([("A", pyarrow.string())])
    )
    with parquet_file(table) as path:
        self.assertEqual(parquet.read_pydict(path, range(1), range(0)), {"A": []})
def test_pydict_nan(self):
    with parquet_file(
        {"A": pyarrow.array([1.1, float("nan"), None], type=pyarrow.float64())}
    ) as path:
        result = parquet.read_pydict(path, range(1), range(3))
        self.assertEqual(result["A"][0], 1.1)
        self.assertTrue(math.isnan(result["A"][1]))
        self.assertTrue(math.isnan(result["A"][2]))
def test_pydict_lots_of_types(self):
    dt1 = datetime.now()
    dt2 = datetime.now()
    with parquet_file(
        {
            "str": ["x", "y", None, "z"],
            "cat": pyarrow.array(["x", "y", None, "x"]).dictionary_encode(),
            "dt": [dt1, None, dt2, None],
            "int32": [1, 2, 3, 2**31],
            "float": [1.1, 2.2, 3.3, 4.4],
        }
    ) as path:
        self.assertEqual(
            parquet.read_pydict(path, range(5), range(4)),
            {
                "str": ["x", "y", None, "z"],
                "cat": ["x", "y", None, "x"],
                "dt": [dt1, None, dt2, None],
                "int32": [1, 2, 3, 2**31],
                "float": [1.1, 2.2, 3.3, 4.4],
            },
        )
def test_pydict_ignore_missing_rows(self):
    with parquet_file({"A": [0, 1, 2, 3]}) as path:
        self.assertEqual(
            parquet.read_pydict(path, range(1), range(2, 5)), {"A": [2, 3]}
        )
def test_pydict_only_rows(self):
    with parquet_file({"A": [0, 1, 2, 3, 4, 5, 6, 7]}) as path:
        self.assertEqual(
            parquet.read_pydict(path, range(1), range(2, 5)), {"A": [2, 3, 4]}
        )
def test_pydict_ignore_missing_columns(self):
    with parquet_file({"A": [1]}) as path:
        self.assertEqual(parquet.read_pydict(path, range(3), range(1)), {"A": [1]})
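# --- Hedged sketch of a corrupt-file test (assumption, not in the original
# suite). The `# FIXME unit-test` in read_cached_render_result_pydict notes
# that the corrupt-cache path is untested; a parquet-level version could look
# like this, assuming tempfile_context() yields a pathlib.Path (as the
# zero-rows test suggests). The exception pyarrow raises for a garbage file
# varies by version -- older releases raised ArrowIOError (the class the
# caller catches), newer ones raise ArrowInvalid -- so the assertion accepts
# either rather than claiming a single definitive type.
def test_pydict_corrupt_file(self):
    with tempfile_context() as path:
        path.write_bytes(b"this is not a Parquet file")
        with self.assertRaises((pyarrow.ArrowIOError, pyarrow.ArrowInvalid)):
            parquet.read_pydict(path, range(1), range(1))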