def sample_block(block: Block[T]) -> np.ndarray:
    """Draw rows from ``block``.

    Delegates to the block's accessor; ``n_samples`` and ``key`` come
    from the enclosing scope.
    """
    accessor = BlockAccessor.for_block(block)
    return accessor.sample(n_samples, key)
def sort_block(block, boundaries):
    """Sort ``block`` and partition it at ``boundaries``.

    The sort ``key`` and ``descending`` flag are captured from the
    enclosing scope.
    """
    accessor = BlockAccessor.for_block(block)
    return accessor.sort_and_partition(boundaries, key, descending)
def get_metadata(table: "pyarrow.Table") -> BlockMetadata:
    """Compute block metadata for an in-memory Arrow table.

    ``input_files`` is ``None`` because the table was not read from disk.
    """
    accessor = BlockAccessor.for_block(table)
    return accessor.get_metadata(input_files=None)
def test_json_read(ray_start_regular_shared, tmp_path):
    """End-to-end coverage of ``read_json``.

    Exercises reading a single file, multiple files with explicit
    parallelism, whole directories, multiple directories, and a mix of a
    directory plus a file, each time checking the result against the
    pandas frames that were written.
    """
    # Single file.
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    path1 = os.path.join(tmp_path, "test1.json")
    df1.to_json(path1, orient="records", lines=True)
    ds = ray.experimental.data.read_json(path1)
    assert df1.equals(ray.get(ds.to_pandas())[0])
    # Test metadata ops.
    assert ds.count() == 3
    assert ds.input_files() == [path1]
    assert "{one: int64, two: string}" in str(ds), ds

    # Two files, parallelism=2.
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    path2 = os.path.join(tmp_path, "test2.json")
    df2.to_json(path2, orient="records", lines=True)
    ds = ray.experimental.data.read_json([path1, path2], parallelism=2)
    dsdf = pd.concat(ray.get(ds.to_pandas()))
    assert pd.concat([df1, df2]).equals(dsdf)
    # Test metadata ops.
    for block, meta in zip(ds._blocks, ds._blocks.get_metadata()):
        # Fix: this comparison was previously a bare expression whose
        # result was discarded, so a size mismatch could never fail the
        # test. Assert it so the metadata check actually runs.
        assert BlockAccessor.for_block(
            ray.get(block)).size_bytes() == meta.size_bytes

    # Three files, parallelism=2.
    df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]})
    path3 = os.path.join(tmp_path, "test3.json")
    df3.to_json(path3, orient="records", lines=True)
    df = pd.concat([df1, df2, df3], ignore_index=True)
    ds = ray.experimental.data.read_json([path1, path2, path3], parallelism=2)
    dsdf = pd.concat(ray.get(ds.to_pandas()), ignore_index=True)
    assert df.equals(dsdf)

    # Directory, two files.
    path = os.path.join(tmp_path, "test_json_dir")
    os.mkdir(path)
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    path1 = os.path.join(path, "data0.json")
    df1.to_json(path1, orient="records", lines=True)
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    path2 = os.path.join(path, "data1.json")
    df2.to_json(path2, orient="records", lines=True)
    ds = ray.experimental.data.read_json(path)
    df = pd.concat([df1, df2])
    dsdf = pd.concat(ray.get(ds.to_pandas()))
    assert df.equals(dsdf)
    shutil.rmtree(path)

    # Two directories, three files.
    path1 = os.path.join(tmp_path, "test_json_dir1")
    path2 = os.path.join(tmp_path, "test_json_dir2")
    os.mkdir(path1)
    os.mkdir(path2)
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    file_path1 = os.path.join(path1, "data0.json")
    df1.to_json(file_path1, orient="records", lines=True)
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    file_path2 = os.path.join(path2, "data1.json")
    df2.to_json(file_path2, orient="records", lines=True)
    df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]})
    file_path3 = os.path.join(path2, "data2.json")
    df3.to_json(file_path3, orient="records", lines=True)
    ds = ray.experimental.data.read_json([path1, path2])
    df = pd.concat([df1, df2, df3])
    dsdf = pd.concat(ray.get(ds.to_pandas()))
    assert df.equals(dsdf)
    shutil.rmtree(path1)
    shutil.rmtree(path2)

    # Directory and file, two files.
    dir_path = os.path.join(tmp_path, "test_json_dir")
    os.mkdir(dir_path)
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    path1 = os.path.join(dir_path, "data0.json")
    df1.to_json(path1, orient="records", lines=True)
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    path2 = os.path.join(tmp_path, "data1.json")
    df2.to_json(path2, orient="records", lines=True)
    ds = ray.experimental.data.read_json([dir_path, path2])
    df = pd.concat([df1, df2])
    dsdf = pd.concat(ray.get(ds.to_pandas()))
    assert df.equals(dsdf)
    shutil.rmtree(dir_path)
def write(self, block: Block) -> str:
    """Count ``block``'s rows into ``self.rows_written``.

    Returns:
        The literal string "ok" on success.

    Raises:
        ValueError: if the writer is not enabled.
    """
    accessor = BlockAccessor.for_block(block)
    if not self.enabled:
        raise ValueError("disabled")
    self.rows_written += accessor.num_rows()
    return "ok"
def df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
    """Convert a pandas DataFrame into an Arrow table block.

    Returns a ``(block, metadata)`` pair; ``input_files`` is ``None``
    since the block originates in memory.
    """
    table = pa.table(df)
    metadata = BlockAccessor.for_block(table).get_metadata(input_files=None)
    return (table, metadata)
def agg(block: Block) -> int:
    """Sum the rows of ``block`` (rows are presumed numeric)."""
    accessor = BlockAccessor.for_block(block)
    total = 0
    for row in accessor.iter_rows():
        total += row
    return total
def json_write(write_path: str, block: Block):
    """Serialize ``block`` to ``write_path`` as a JSON records file."""
    accessor = BlockAccessor.for_block(block)
    logger.debug(
        f"Writing {accessor.num_rows()} records to {write_path}.")
    frame = accessor.to_pandas()
    frame.to_json(write_path, orient="records")
def count(block: Block) -> int:
    """Return the number of rows in ``block``."""
    return BlockAccessor.for_block(block).num_rows()
def block_to_df(block: Block):
    """Materialize ``block`` as a pyarrow Table."""
    return BlockAccessor.for_block(block).to_arrow_table()
def block_to_df(block: Block):
    """Materialize ``block`` as a pandas DataFrame."""
    accessor = BlockAccessor.for_block(block)
    return accessor.to_pandas()
def transform(block: Block) -> Block:
    """Build a new block by applying ``fn`` (from the enclosing scope)
    to every row of ``block``, preserving row order."""
    accessor = BlockAccessor.for_block(block)
    out = DelegatingArrowBlockBuilder()
    for record in accessor.iter_rows():
        out.add(fn(record))
    return out.build()
def get_schema(block: Block) -> Any:
    """Return the schema of ``block`` via its accessor."""
    accessor = BlockAccessor.for_block(block)
    return accessor.schema()