示例#1
0
文件: io.py 项目: afcarl/cjworkbench
def read_cached_render_result_pydict(crr: CachedRenderResult,
                                     only_columns: range,
                                     only_rows: range) -> Dict[str, List[Any]]:
    """
    Return a dict mapping column name to data (Python objects).

    Python data consumes RAM, so you must specify columns and rows.

    `retval.keys()` is in table-column order (not `only_columns` order).

    Missing rows and columns are ignored.

    `NaN` is returned as float("nan").

    A `crr` whose `table_metadata.columns` is empty yields `{}` without
    touching the cache at all.

    Raise CorruptCacheError if the cached data does not match `crr`. The
    underlying `pyarrow.ArrowIOError` or `FileNotFoundError` is attached as
    the exception's `__cause__`. That can mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        return {}

    try:
        with downloaded_parquet_file(crr) as parquet_path:
            return parquet.read_pydict(parquet_path, only_columns, only_rows)
    except (pyarrow.ArrowIOError, FileNotFoundError) as err:  # FIXME unit-test
        # Chain explicitly so the traceback names the real I/O failure as the
        # direct cause instead of an incidental "during handling" context.
        raise CorruptCacheError from err
示例#2
0
 def test_pydict_zero_rows(self):
     """Reading a four-column, zero-row file gives one empty list per column."""
     with tempfile_context() as path:
         col_a = pyarrow.array([], type=pyarrow.string())
         col_b = pyarrow.DictionaryArray.from_arrays(
             pyarrow.array([], type=pyarrow.int32()),
             pyarrow.array([], type=pyarrow.string()),
         )
         col_c = pyarrow.array([], type=pyarrow.timestamp("ns"))
         col_d = pyarrow.array([], type=pyarrow.float64())
         empty_table = pyarrow.table(
             {"A": col_a, "B": col_b, "C": col_c, "D": col_d}
         )
         # Written through parquet.write: per the original author's note,
         # this ensures the file has at least 1 row group.
         parquet.write(path, empty_table)

         expected = {name: [] for name in ("A", "B", "C", "D")}
         actual = parquet.read_pydict(path, range(4), range(0))
         self.assertEqual(actual, expected)
示例#3
0
 def test_pydict_zero_row_groups(self):
     """A table built from an empty batch list reads back as an empty column."""
     schema = pyarrow.schema([("A", pyarrow.string())])
     batchless_table = pyarrow.Table.from_batches([], schema=schema)
     with parquet_file(batchless_table) as path:
         actual = parquet.read_pydict(path, range(1), range(0))
         self.assertEqual(actual, {"A": []})
示例#4
0
 def test_pydict_nan(self):
     """Check that both NaN and null in a float64 column read back as NaN."""
     with parquet_file({
             "A":
             pyarrow.array([1.1, float("nan"), None],
                           type=pyarrow.float64())
     }) as path:
         result = parquet.read_pydict(path, range(1), range(3))
         self.assertEqual(result["A"][0], 1.1)
         # NaN != NaN, so compare with math.isnan() rather than assertEqual.
         # assertTrue replaces the deprecated alias `assert_`, which was
         # removed from unittest in Python 3.12.
         self.assertTrue(math.isnan(result["A"][1]))
         self.assertTrue(math.isnan(result["A"][2]))
示例#5
0
 def test_pydict_lots_of_types(self):
     """Round-trip str, dictionary-encoded, datetime, int and float columns."""
     first_dt = datetime.now()
     second_dt = datetime.now()
     # The "cat" column is written dictionary-encoded; the expected value
     # below lists it as plain strings.
     categories = pyarrow.array(["x", "y", None, "x"]).dictionary_encode()
     written = {
         "str": ["x", "y", None, "z"],
         "cat": categories,
         "dt": [first_dt, None, second_dt, None],
         "int32": [1, 2, 3, 2**31],
         "float": [1.1, 2.2, 3.3, 4.4],
     }
     expected = {
         "str": ["x", "y", None, "z"],
         "cat": ["x", "y", None, "x"],
         "dt": [first_dt, None, second_dt, None],
         "int32": [1, 2, 3, 2**31],
         "float": [1.1, 2.2, 3.3, 4.4],
     }
     with parquet_file(written) as path:
         actual = parquet.read_pydict(path, range(5), range(4))
         self.assertEqual(actual, expected)
示例#6
0
 def test_pydict_ignore_missing_rows(self):
     """Requested rows past the end of the table are left out of the result."""
     table = {"A": [0, 1, 2, 3]}
     wanted_rows = range(2, 5)  # row 4 does not exist in a 4-row table
     with parquet_file(table) as path:
         actual = parquet.read_pydict(path, range(1), wanted_rows)
         self.assertEqual(actual, {"A": [2, 3]})
示例#7
0
 def test_pydict_only_rows(self):
     """Only the rows inside `only_rows` are returned."""
     table = {"A": [0, 1, 2, 3, 4, 5, 6, 7]}
     wanted_rows = range(2, 5)
     with parquet_file(table) as path:
         actual = parquet.read_pydict(path, range(1), wanted_rows)
         self.assertEqual(actual, {"A": [2, 3, 4]})
示例#8
0
 def test_pydict_ignore_missing_columns(self):
     """Requested column indexes the table lacks are left out of the result."""
     table = {"A": [1]}
     wanted_columns = range(3)  # the table only has column 0
     with parquet_file(table) as path:
         actual = parquet.read_pydict(path, wanted_columns, range(1))
         self.assertEqual(actual, {"A": [1]})