def test_memory_mapped_table_filter(arrow_file, in_memory_pa_table):
    """Filter a memory-mapped table and compare it with the pyarrow result.

    Also checks that the operation is recorded in ``replays`` and runs the
    deepcopy / pickle memory helpers on the filtered table.
    """
    num_rows = len(in_memory_pa_table)
    # Boolean mask that keeps the even-indexed rows.
    mask = pa.array([row % 2 == 0 for row in range(num_rows)])
    table = MemoryMappedTable.from_file(arrow_file).filter(mask)
    expected = in_memory_pa_table.filter(mask)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("filter", (mask,), {})]
    assert_deepcopy_without_bringing_data_in_memory(table)
    # filter DOES increase memory, hence the "does bring data in memory" check
    # rather than assert_pickle_without_bringing_data_in_memory.
    assert_pickle_does_bring_data_in_memory(table)
def test_memory_mapped_table_set_column(arrow_file, in_memory_pa_table):
    """Set a new column at the end position and compare with the pyarrow result.

    Also checks the recorded replay and runs the deepcopy / pickle memory
    helpers on the resulting table.
    """
    # Index equal to the current number of columns.
    position = len(in_memory_pa_table.column_names)
    column_name = "new_field"
    values = pa.array(list(range(len(in_memory_pa_table))))
    table = MemoryMappedTable.from_file(arrow_file).set_column(position, column_name, values)
    expected = in_memory_pa_table.set_column(position, column_name, values)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("set_column", (position, column_name, values), {})]
    assert_deepcopy_without_bringing_data_in_memory(table)
    assert_pickle_without_bringing_data_in_memory(table)
def test_memory_mapped_table_from_file_with_replay(arrow_file, in_memory_pa_table):
    """Load a table with ``replays`` and compare against applying them by hand.

    The same (method, args, kwargs) triples are applied one by one to the
    in-memory pyarrow table to build the expected result.
    """
    replays = [("slice", (0, 1), {}), ("flatten", (), {})]
    with assert_arrow_memory_doesnt_increase():
        table = MemoryMappedTable.from_file(arrow_file, replays=replays)
    assert len(table) == 1

    expected = in_memory_pa_table
    for method_name, call_args, call_kwargs in replays:
        replay_method = getattr(expected, method_name)
        expected = replay_method(*call_args, **call_kwargs)

    assert table.table == expected
    assert_deepcopy_without_bringing_data_in_memory(table)
    assert_pickle_without_bringing_data_in_memory(table)
def test_memory_mapped_table_rename_columns(arrow_file, in_memory_pa_table):
    """Rename the "tokens" column and compare with the pyarrow result.

    Also checks the recorded replay and runs the pickle memory helper.
    """
    original_names = in_memory_pa_table.column_names
    assert "tokens" in original_names
    # Same column list, with only "tokens" replaced.
    names = ["new_tokens" if name == "tokens" else name for name in in_memory_pa_table.column_names]
    table = MemoryMappedTable.from_file(arrow_file).rename_columns(names)
    expected = in_memory_pa_table.rename_columns(names)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("rename_columns", (names,), {})]
    assert_pickle_without_bringing_data_in_memory(table)
def test_memory_mapped_table_deepcopy(arrow_file):
    """Deep-copy a memory-mapped table and check the copy shares its arrow objects."""
    original = MemoryMappedTable.from_file(arrow_file)
    duplicate = copy.deepcopy(original)
    assert original.table == duplicate.table
    assert original.path == duplicate.path
    assert_index_attributes_equal(original, duplicate)
    # deepcopy must return the exact same arrow objects since they are immutable
    assert original.table is duplicate.table
    batch_pairs = zip(original._batches, duplicate._batches)
    assert all(source_batch is copied_batch for source_batch, copied_batch in batch_pairs)
def test_memory_mapped_table_replace_schema_metadata(arrow_file, in_memory_pa_table):
    """Replace the schema metadata and compare it with the pyarrow result.

    Only the schema metadata is compared; the recorded replay is checked and
    the deepcopy / pickle memory helpers are run as well.
    """
    metadata = {"huggingface": "{}"}
    table = MemoryMappedTable.from_file(arrow_file).replace_schema_metadata(metadata)
    expected_metadata = in_memory_pa_table.replace_schema_metadata(metadata).schema.metadata
    assert table.table.schema.metadata == expected_metadata
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("replace_schema_metadata", (metadata,), {})]
    assert_deepcopy_without_bringing_data_in_memory(table)
    assert_pickle_without_bringing_data_in_memory(table)
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Concatenate four table flavours and check the type of each block.

    NOTE(review): this file contains other definitions named
    ``test_concat_tables``; if they live in the same module scope, a later
    definition replaces this one — confirm which variant is meant to run.
    """
    t0 = in_memory_pa_table
    t1 = InMemoryTable(t0)
    t2 = MemoryMappedTable.from_file(arrow_file)
    t3 = ConcatenationTable.from_blocks(t1)
    result = concat_tables([t0, t1, t2, t3])
    assert result.table == pa.concat_tables([t0] * 4)
    assert isinstance(result, ConcatenationTable)
    # Expected type of the first entry of each block, in order.
    expected_types = (InMemoryTable, InMemoryTable, MemoryMappedTable, InMemoryTable)
    assert len(result.blocks) == 4
    for index, expected_type in enumerate(expected_types):
        assert isinstance(result.blocks[index][0], expected_type)
def test_memory_mapped_table_cast(arrow_file, in_memory_pa_table):
    """Cast list<int64> columns to list<int32> and compare with the pyarrow result.

    Also checks the recorded replay and runs the pickle memory helper.
    """
    int64_list = pa.list_(pa.int64())
    assert int64_list in in_memory_pa_table.schema.types
    # Target schema: same fields, with every list<int64> narrowed to list<int32>.
    target_fields = {}
    for field_name, field_type in zip(in_memory_pa_table.schema.names, in_memory_pa_table.schema.types):
        if field_type != pa.list_(pa.int64()):
            target_fields[field_name] = field_type
        else:
            target_fields[field_name] = pa.list_(pa.int32())
    schema = pa.schema(target_fields)
    table = MemoryMappedTable.from_file(arrow_file).cast(schema)
    expected = in_memory_pa_table.cast(schema)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("cast", (schema,), {})]
    # cast DOES increase memory when converting integers precision for example,
    # hence the "does bring data in memory" check below.
    assert_pickle_does_bring_data_in_memory(table)
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Concatenate four table flavours along both axes and check shapes and blocks.

    NOTE(review): this file contains other definitions named
    ``test_concat_tables``; if they live in the same module scope, only the
    last one is kept — confirm which variant is meant to run.
    """
    t0 = in_memory_pa_table
    t1 = InMemoryTable(t0)
    t2 = MemoryMappedTable.from_file(arrow_file)
    t3 = ConcatenationTable.from_blocks(t1)
    tables = [t0, t1, t2, t3]
    # t0 and t1 are consolidated as a single InMemoryTable, leaving three blocks.
    expected_types = (InMemoryTable, MemoryMappedTable, InMemoryTable)

    # Row-wise concatenation.
    by_rows = concat_tables(tables, axis=0)
    assert by_rows.table == pa.concat_tables([t0] * 4)
    assert by_rows.table.shape == (40, 4)
    assert isinstance(by_rows, ConcatenationTable)
    assert len(by_rows.blocks) == 3
    for index, expected_type in enumerate(expected_types):
        assert isinstance(by_rows.blocks[index][0], expected_type)

    # Column-wise concatenation.
    by_columns = concat_tables(tables, axis=1)
    assert by_columns.table.shape == (10, 16)
    assert len(by_columns.blocks[0]) == 3
    for index, expected_type in enumerate(expected_types):
        assert isinstance(by_columns.blocks[0][index], expected_type)
def test_concatenation_table_from_tables(axis, in_memory_pa_table, arrow_file):
    """Build a ConcatenationTable from mixed table types and check its blocks.

    Args:
        axis: 0 concatenates rows; any other value takes the column-wise
            branch. Presumably supplied by a parametrization that is not
            visible in this block — TODO confirm.
        in_memory_pa_table: pyarrow table used as the in-memory reference.
        arrow_file: path of the arrow file that gets memory-mapped.
    """
    in_memory_table = InMemoryTable(in_memory_pa_table)
    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)
    memory_mapped_table = MemoryMappedTable.from_file(arrow_file)
    tables = [in_memory_pa_table, in_memory_table, concatenation_table, memory_mapped_table]
    if axis == 0:
        expected_table = pa.concat_tables([in_memory_pa_table] * len(tables))
    else:
        # Column-wise: append every column of the reference table once per extra table.
        expected_table = in_memory_pa_table
        for _ in range(1, len(tables)):
            for name, col in zip(in_memory_pa_table.column_names, in_memory_pa_table.columns):
                expected_table = expected_table.append_column(name, col)
    with assert_arrow_memory_doesnt_increase():
        table = ConcatenationTable.from_tables(tables, axis=axis)
    assert isinstance(table, ConcatenationTable)
    assert table.table == expected_table
    # because of consolidation, we end up with 1 InMemoryTable and 1 MemoryMappedTable
    # The conditional expressions are parenthesized on purpose: without the
    # parentheses, `assert a == 1 if cond else 2` asserts the constant 2 (always
    # truthy) whenever cond is false, so the length was never checked.
    assert len(table.blocks) == (1 if axis == 1 else 2)
    assert len(table.blocks[0]) == (1 if axis == 0 else 2)
    assert axis == 1 or len(table.blocks[1]) == 1
    assert isinstance(table.blocks[0][0], InMemoryTable)
    assert isinstance(table.blocks[1][0] if axis == 0 else table.blocks[0][1], MemoryMappedTable)
def test_concat_tables(arrow_file, in_memory_pa_table):
    """Concatenate four table flavours along both axes and check shapes and blocks.

    For the column-wise case each input's columns get a suffix first, to avoid
    an error due to duplicate column names.

    NOTE(review): this file contains other definitions named
    ``test_concat_tables``; if they live in the same module scope, only the
    last one is kept — confirm the earlier variants are meant to be replaced.
    """
    t0 = in_memory_pa_table
    t1 = InMemoryTable(t0)
    t2 = MemoryMappedTable.from_file(arrow_file)
    t3 = ConcatenationTable.from_blocks(t1)
    tables = [t0, t1, t2, t3]
    # t0 and t1 are consolidated as a single InMemoryTable, leaving three blocks.
    expected_types = (InMemoryTable, MemoryMappedTable, InMemoryTable)

    # Row-wise concatenation.
    by_rows = concat_tables(tables, axis=0)
    assert by_rows.table == pa.concat_tables([t0] * 4)
    assert by_rows.table.shape == (40, 4)
    assert isinstance(by_rows, ConcatenationTable)
    assert len(by_rows.blocks) == 3
    for index, expected_type in enumerate(expected_types):
        assert isinstance(by_rows.blocks[index][0], expected_type)

    # Column-wise concatenation.
    # add suffix to avoid error due to duplicate column names
    suffixed_tables = []
    for position, source_table in enumerate(tables):
        suffixed_tables.append(add_suffix_to_column_names(source_table, position))
    by_columns = concat_tables(suffixed_tables, axis=1)
    assert by_columns.table.shape == (10, 16)
    assert len(by_columns.blocks[0]) == 3
    for index, expected_type in enumerate(expected_types):
        assert isinstance(by_columns.blocks[0][index], expected_type)
def test_memory_mapped_table_slice(arrow_file, in_memory_pa_table):
    """Slice a memory-mapped table and compare it with the pyarrow result.

    Also checks the recorded replay and runs the pickle memory helper.
    """
    slice_args = (1, 2)
    table = MemoryMappedTable.from_file(arrow_file).slice(*slice_args)
    expected = in_memory_pa_table.slice(*slice_args)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("slice", slice_args, {})]
    assert_pickle_without_bringing_data_in_memory(table)
def memory_mapped_blocks(arrow_file):
    """Memory-map ``arrow_file`` and pass the table through ``_to_testing_blocks``.

    NOTE(review): presumably registered as a pytest fixture by a decorator
    that is not visible in this block — confirm.
    """
    return _to_testing_blocks(MemoryMappedTable.from_file(arrow_file))
def test_memory_mapped_table_pickle_doesnt_fill_memory(arrow_file):
    """Load a memory-mapped table and run the deepcopy / pickle memory helpers."""
    # Only the loading step is guarded by the "memory doesn't increase" context.
    with assert_arrow_memory_doesnt_increase():
        mmap_table = MemoryMappedTable.from_file(arrow_file)
    assert_deepcopy_without_bringing_data_in_memory(mmap_table)
    assert_pickle_without_bringing_data_in_memory(mmap_table)
def test_memory_mapped_table_remove_column(arrow_file, in_memory_pa_table):
    """Remove the first column and compare with the pyarrow result.

    Also checks the recorded replay and runs the pickle memory helper.
    """
    column_index = 0
    table = MemoryMappedTable.from_file(arrow_file).remove_column(column_index)
    expected = in_memory_pa_table.remove_column(column_index)
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    assert table.replays == [("remove_column", (column_index,), {})]
    assert_pickle_without_bringing_data_in_memory(table)
def test_memory_mapped_table_combine_chunks(arrow_file, in_memory_pa_table):
    """Combine chunks of a memory-mapped table and compare with the pyarrow result.

    Also checks the recorded replay and runs the pickle memory helper.
    """
    table = MemoryMappedTable.from_file(arrow_file).combine_chunks()
    expected = in_memory_pa_table.combine_chunks()
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    # The call takes no arguments, so the recorded args tuple is empty.
    assert table.replays == [("combine_chunks", (), {})]
    assert_pickle_without_bringing_data_in_memory(table)
def test_memory_mapped_table_flatten(arrow_file, in_memory_pa_table):
    """Flatten a memory-mapped table and compare it with the pyarrow result.

    Also checks the recorded replay and runs the pickle memory helper.
    """
    table = MemoryMappedTable.from_file(arrow_file).flatten()
    expected = in_memory_pa_table.flatten()
    assert table.table == expected
    assert isinstance(table, MemoryMappedTable)
    # The call takes no arguments, so the recorded args tuple is empty.
    assert table.replays == [("flatten", (), {})]
    assert_pickle_without_bringing_data_in_memory(table)
def test_memory_mapped_table_from_file(arrow_file, in_memory_pa_table):
    """Load a table with ``from_file`` and compare it with the in-memory reference."""
    # Only the loading step is guarded by the "memory doesn't increase" context.
    with assert_arrow_memory_doesnt_increase():
        mmap_table = MemoryMappedTable.from_file(arrow_file)
    assert mmap_table.table == in_memory_pa_table
    assert isinstance(mmap_table, MemoryMappedTable)
    assert_pickle_without_bringing_data_in_memory(mmap_table)
def test_memory_mapped_table_init(arrow_file, in_memory_pa_table):
    """Construct a MemoryMappedTable directly and compare with the in-memory reference."""
    # The constructor receives the loaded arrow table together with its file path.
    arrow_table = _memory_mapped_arrow_table_from_file(arrow_file)
    mmap_table = MemoryMappedTable(arrow_table, arrow_file)
    assert mmap_table.table == in_memory_pa_table
    assert isinstance(mmap_table, MemoryMappedTable)
    assert_pickle_without_bringing_data_in_memory(mmap_table)