Пример #1
0
def test_concatenation_table_flatten(
    blocks_type,
    in_memory_pa_table,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """Flattening a ConcatenationTable must match flattening the pyarrow table.

    The result of ``flatten()`` is also expected to remain a
    ``ConcatenationTable``.
    """
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    selected_blocks = blocks_by_type[blocks_type]
    concatenated = ConcatenationTable.from_blocks(selected_blocks)
    flattened = concatenated.flatten()
    assert flattened.table == in_memory_pa_table.flatten()
    assert isinstance(flattened, ConcatenationTable)
Пример #2
0
def test_concatenation_table_from_blocks(in_memory_pa_table, in_memory_blocks):
    """Exercise ``ConcatenationTable.from_blocks`` with each accepted input shape.

    Covers a single table, a flat list of tables, a nested list of tables and
    the ``in_memory_blocks`` fixture, checking the resulting ``table`` and
    ``blocks`` attributes each time.
    """
    # The fixture needs more than two rows so both slices below are non-empty.
    assert len(in_memory_pa_table) > 2
    whole = InMemoryTable(in_memory_pa_table)
    head = whole.slice(0, 2)
    tail = whole.slice(2)

    # A single table is wrapped as a 1x1 grid of blocks.
    from_single = ConcatenationTable.from_blocks(whole)
    assert isinstance(from_single, ConcatenationTable)
    assert from_single.table == in_memory_pa_table
    assert from_single.blocks == [[whole]]

    # A flat list of tables yields one row of blocks per table.
    from_flat_list = ConcatenationTable.from_blocks([head, tail])
    assert isinstance(from_flat_list, ConcatenationTable)
    assert from_flat_list.blocks == [[head], [tail]]
    assert from_flat_list.table == in_memory_pa_table

    # An already nested list is kept as given.
    from_nested_list = ConcatenationTable.from_blocks([[head], [tail]])
    assert isinstance(from_nested_list, ConcatenationTable)
    assert from_nested_list.table == in_memory_pa_table
    assert from_nested_list.blocks == [[head], [tail]]

    # The blocks fixture round-trips unchanged.
    from_fixture = ConcatenationTable.from_blocks(in_memory_blocks)
    assert isinstance(from_fixture, ConcatenationTable)
    assert from_fixture.table == in_memory_pa_table
    assert from_fixture.blocks == in_memory_blocks
Пример #3
0
def test_concatenation_table_drop(
    blocks_type,
    in_memory_pa_table,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """Dropping the first column must match ``pyarrow.Table.drop``.

    The result of ``drop()`` is also expected to remain a
    ``ConcatenationTable``.
    """
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    selected_blocks = blocks_by_type[blocks_type]
    first_column = in_memory_pa_table.column_names[0]
    columns_to_drop = [first_column]
    concatenated = ConcatenationTable.from_blocks(selected_blocks)
    reduced = concatenated.drop(columns_to_drop)
    assert reduced.table == in_memory_pa_table.drop(columns_to_drop)
    assert isinstance(reduced, ConcatenationTable)
Пример #4
0
def test_concatenation_table_filter(
    blocks_type,
    in_memory_pa_table,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """Filtering with a boolean mask must match ``pyarrow.Table.filter``.

    The mask keeps every row whose index is even.
    """
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    selected_blocks = blocks_by_type[blocks_type]
    row_count = len(in_memory_pa_table)
    even_rows_mask = pa.array([index % 2 == 0 for index in range(row_count)])
    concatenated = ConcatenationTable.from_blocks(selected_blocks)
    filtered = concatenated.filter(even_rows_mask)
    assert filtered.table == in_memory_pa_table.filter(even_rows_mask)
    assert isinstance(filtered, ConcatenationTable)
def test_concatenation_table_replace_schema_metadata(
    blocks_type,
    in_memory_pa_table,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """Replacing schema metadata must yield the same metadata as pyarrow does."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    selected_blocks = blocks_by_type[blocks_type]
    new_metadata = {"huggingface": "{}"}
    concatenated = ConcatenationTable.from_blocks(selected_blocks)
    updated = concatenated.replace_schema_metadata(new_metadata)
    # Only the schema metadata is compared here, not the whole table.
    assert (
        updated.table.schema.metadata
        == in_memory_pa_table.replace_schema_metadata(new_metadata).schema.metadata
    )
    assert isinstance(updated, ConcatenationTable)
def test_concatenation_table_rename_columns(
    blocks_type,
    in_memory_pa_table,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """Renaming ``tokens`` to ``new_tokens`` must match pyarrow's result."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    selected_blocks = blocks_by_type[blocks_type]
    # The rename below is only meaningful if the fixture has a "tokens" column.
    assert "tokens" in in_memory_pa_table.column_names
    renamed_columns = [
        "new_tokens" if column == "tokens" else column
        for column in in_memory_pa_table.column_names
    ]
    concatenated = ConcatenationTable.from_blocks(selected_blocks)
    renamed = concatenated.rename_columns(renamed_columns)
    assert isinstance(renamed, ConcatenationTable)
    assert renamed.table == in_memory_pa_table.rename_columns(renamed_columns)
def test_concatenation_table_init(
    blocks_type,
    in_memory_pa_table,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """The constructor must expose the given table and blocks unchanged."""
    # Any value other than "in_memory" / "memory_mapped" selects the mixed blocks.
    if blocks_type == "in_memory":
        selected_blocks = in_memory_blocks
    elif blocks_type == "memory_mapped":
        selected_blocks = memory_mapped_blocks
    else:
        selected_blocks = mixed_in_memory_and_memory_mapped_blocks
    concatenated = ConcatenationTable(in_memory_pa_table, selected_blocks)
    assert concatenated.table == in_memory_pa_table
    assert concatenated.blocks == selected_blocks
Пример #8
0
def test_concat_tables(arrow_file, in_memory_pa_table):
    """``concat_tables`` must accept a mix of pyarrow and wrapped tables.

    This variant expects one block row per input table (four in total).
    """
    raw_table = in_memory_pa_table
    in_memory = InMemoryTable(raw_table)
    memory_mapped = MemoryMappedTable.from_file(arrow_file)
    nested = ConcatenationTable.from_blocks(in_memory)
    result = concat_tables([raw_table, in_memory, memory_mapped, nested])
    assert result.table == pa.concat_tables([raw_table] * 4)
    assert isinstance(result, ConcatenationTable)
    assert len(result.blocks) == 4
    # Expected type of the first block in each row, in input order.
    expected_block_types = (
        InMemoryTable,
        InMemoryTable,
        MemoryMappedTable,
        InMemoryTable,
    )
    for row_index, expected_type in enumerate(expected_block_types):
        assert isinstance(result.blocks[row_index][0], expected_type)
Пример #9
0
def test_concatenation_table_from_blocks_doesnt_increase_memory(
    blocks_type,
    in_memory_pa_table,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """Build a ConcatenationTable from blocks under the arrow-memory guard.

    Construction and the checks on the result all run inside
    ``assert_arrow_memory_doesnt_increase()``.
    """
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    selected_blocks = blocks_by_type[blocks_type]
    with assert_arrow_memory_doesnt_increase():
        concatenated = ConcatenationTable.from_blocks(selected_blocks)
        assert isinstance(concatenated, ConcatenationTable)
        assert concatenated.table == in_memory_pa_table
        assert concatenated.blocks == selected_blocks
Пример #10
0
def test_concatenation_table_pickle(
    blocks_type,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """A pickle round-trip must preserve the table, blocks and index attributes."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    selected_blocks = blocks_by_type[blocks_type]
    original = ConcatenationTable.from_blocks(selected_blocks)
    payload = pickle.dumps(original)
    restored = pickle.loads(payload)
    assert restored.table == original.table
    assert restored.blocks == original.blocks
    assert_index_attributes_equal(original, restored)
Пример #11
0
def test_concatenation_table_from_tables(axis, in_memory_pa_table, arrow_file):
    """Check ``ConcatenationTable.from_tables`` on a mix of table kinds.

    The inputs are a raw pyarrow table, an ``InMemoryTable``, a
    ``ConcatenationTable`` and a ``MemoryMappedTable``, all holding the same
    data. With ``axis == 0`` the expected result stacks the rows four times;
    otherwise the expected result appends the columns of the base table
    once per extra input.

    Args:
        axis: Concatenation axis passed to ``from_tables`` (0 or 1).
        in_memory_pa_table: Fixture providing the reference pyarrow table.
        arrow_file: Fixture providing the path used for the memory-mapped table.
    """
    in_memory_table = InMemoryTable(in_memory_pa_table)
    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)
    memory_mapped_table = MemoryMappedTable.from_file(arrow_file)
    tables = [in_memory_pa_table, in_memory_table, concatenation_table, memory_mapped_table]
    if axis == 0:
        expected_table = pa.concat_tables([in_memory_pa_table] * len(tables))
    else:
        expected_table = in_memory_pa_table
        # Append the base columns once for every table after the first one.
        for _ in range(1, len(tables)):
            for name, col in zip(in_memory_pa_table.column_names, in_memory_pa_table.columns):
                expected_table = expected_table.append_column(name, col)

    with assert_arrow_memory_doesnt_increase():
        table = ConcatenationTable.from_tables(tables, axis=axis)
    assert isinstance(table, ConcatenationTable)
    assert table.table == expected_table
    # because of consolidation, we end up with 1 InMemoryTable and 1 MemoryMappedTable
    # The conditional must be parenthesised: ``assert a == 1 if c else 2`` parses
    # as ``assert ((a == 1) if c else 2)``, which is always true on the else
    # branch and would silently skip the length check.
    assert len(table.blocks) == (1 if axis == 1 else 2)
    assert len(table.blocks[0]) == (1 if axis == 0 else 2)
    assert axis == 1 or len(table.blocks[1]) == 1
    assert isinstance(table.blocks[0][0], InMemoryTable)
    assert isinstance(table.blocks[1][0] if axis == 0 else table.blocks[0][1], MemoryMappedTable)
def test_concatenation_table_deepcopy(
    blocks_type,
    in_memory_blocks,
    memory_mapped_blocks,
    mixed_in_memory_and_memory_mapped_blocks,
):
    """A deep copy must equal the original and share its arrow objects."""
    blocks_by_type = {
        "in_memory": in_memory_blocks,
        "memory_mapped": memory_mapped_blocks,
        "mixed": mixed_in_memory_and_memory_mapped_blocks,
    }
    selected_blocks = blocks_by_type[blocks_type]
    original = ConcatenationTable.from_blocks(selected_blocks)
    duplicate = copy.deepcopy(original)
    assert original.table == duplicate.table
    assert original.blocks == duplicate.blocks
    assert_index_attributes_equal(original, duplicate)
    # deepcopy must return the exact same arrow objects since they are immutable
    assert original.table is duplicate.table
    batch_pairs = zip(original._batches, duplicate._batches)
    assert all(source_batch is copied_batch for source_batch, copied_batch in batch_pairs)
Пример #13
0
def test_concat_tables(arrow_file, in_memory_pa_table):
    """``concat_tables`` must consolidate adjacent in-memory inputs on both axes.

    The same four inputs are concatenated along axis 0 and then axis 1; in
    both cases three blocks are expected.
    """
    raw_table = in_memory_pa_table
    in_memory = InMemoryTable(raw_table)
    memory_mapped = MemoryMappedTable.from_file(arrow_file)
    nested = ConcatenationTable.from_blocks(in_memory)
    inputs = [raw_table, in_memory, memory_mapped, nested]
    # Expected block types after consolidation, in order.
    expected_block_types = (InMemoryTable, MemoryMappedTable, InMemoryTable)

    stacked = concat_tables(inputs, axis=0)
    assert stacked.table == pa.concat_tables([raw_table] * 4)
    assert stacked.table.shape == (40, 4)
    assert isinstance(stacked, ConcatenationTable)
    assert len(stacked.blocks) == 3  # t0 and t1 are consolidated as a single InMemoryTable
    for row_index, expected_type in enumerate(expected_block_types):
        assert isinstance(stacked.blocks[row_index][0], expected_type)

    side_by_side = concat_tables(inputs, axis=1)
    assert side_by_side.table.shape == (10, 16)
    assert len(side_by_side.blocks[0]) == 3  # t0 and t1 are consolidated as a single InMemoryTable
    for column_index, expected_type in enumerate(expected_block_types):
        assert isinstance(side_by_side.blocks[0][column_index], expected_type)
def test_concat_tables(arrow_file, in_memory_pa_table):
    """``concat_tables`` must consolidate adjacent in-memory inputs on both axes.

    For axis 1 every input first gets a per-table column-name suffix via
    ``add_suffix_to_column_names``.
    """
    raw_table = in_memory_pa_table
    in_memory = InMemoryTable(raw_table)
    memory_mapped = MemoryMappedTable.from_file(arrow_file)
    nested = ConcatenationTable.from_blocks(in_memory)
    inputs = [raw_table, in_memory, memory_mapped, nested]
    # Expected block types after consolidation, in order.
    expected_block_types = (InMemoryTable, MemoryMappedTable, InMemoryTable)

    stacked = concat_tables(inputs, axis=0)
    assert stacked.table == pa.concat_tables([raw_table] * 4)
    assert stacked.table.shape == (40, 4)
    assert isinstance(stacked, ConcatenationTable)
    assert len(stacked.blocks) == 3  # t0 and t1 are consolidated as a single InMemoryTable
    for row_index, expected_type in enumerate(expected_block_types):
        assert isinstance(stacked.blocks[row_index][0], expected_type)

    # add suffix to avoid error due to duplicate column names
    suffixed_inputs = []
    for position, current in enumerate(inputs):
        suffixed_inputs.append(add_suffix_to_column_names(current, position))
    side_by_side = concat_tables(suffixed_inputs, axis=1)
    assert side_by_side.table.shape == (10, 16)
    assert len(side_by_side.blocks[0]) == 3  # t0 and t1 are consolidated as a single InMemoryTable
    for column_index, expected_type in enumerate(expected_block_types):
        assert isinstance(side_by_side.blocks[0][column_index], expected_type)