Example #1
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.tests import util  # pyarrow test helpers; provides rands()


def test_direct_read_dictionary_subfield(use_legacy_dataset):
    # `use_legacy_dataset` is parametrized by the pyarrow test suite to
    # select between the legacy and datasets-based Parquet readers.
    repeats = 10
    nunique = 5

    data = [
        [[util.rands(10)] for i in range(nunique)] * repeats,
    ]
    table = pa.table(data, names=['f0'])

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()
    result = pq.read_table(pa.BufferReader(contents),
                           read_dictionary=['f0.list.item'],
                           use_legacy_dataset=use_legacy_dataset)

    arr = pa.array(data[0])
    values_as_dict = arr.values.dictionary_encode()

    inner_indices = values_as_dict.indices.cast('int32')
    new_values = pa.DictionaryArray.from_arrays(inner_indices,
                                                values_as_dict.dictionary)

    offsets = pa.array(range(51), type='int32')
    expected_arr = pa.ListArray.from_arrays(offsets, new_values)
    expected = pa.table([expected_arr], names=['f0'])

    assert result.equals(expected)
    assert result[0].num_chunks == 1
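For reference, the round trip above collapses to a few lines; this is a minimal sketch, assuming a pyarrow version whose pq.read_table still accepts the read_dictionary option:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'f0': [['a'], ['b'], ['a']]})
bio = pa.BufferOutputStream()
pq.write_table(table, bio)
result = pq.read_table(pa.BufferReader(bio.getvalue()),
                       read_dictionary=['f0.list.item'])
# The list items come back dictionary-encoded instead of as plain strings
print(result.schema.field('f0').type)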
Example #2

import numpy as np
import pandas as pd
import pyarrow as pa
from pyarrow.tests import util as test_util  # rands() and memory_leak_check()


def test_deserialize_pandas_arrow_7956():
    df = pd.DataFrame({
        'a': np.arange(10000),
        'b': [test_util.rands(5) for _ in range(10000)]
    })

    def action():
        df_bytes = pa.ipc.serialize_pandas(df).to_pybytes()
        buf = pa.py_buffer(df_bytes)
        pa.ipc.deserialize_pandas(buf)

    # Abort at 128MB threshold
    test_util.memory_leak_check(action, threshold=1 << 27, iterations=100)
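The action under test is just a serialize/deserialize round trip; stripped of the leak checking, a minimal sketch looks like this:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'a': [1, 2, 3]})
buf = pa.ipc.serialize_pandas(df)        # returns a pa.Buffer
roundtripped = pa.ipc.deserialize_pandas(buf)
assert df.equals(roundtripped)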
Example #3
import pandas as pd
import pyarrow as pa
from pyarrow.tests.util import rands  # random-string helper

# assert_does_not_leak is defined alongside this test in Arrow's
# leak-check script; it is not a public pyarrow API.


def test_leak3():
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a{0}'.format(i): [1, 2, 3, 4] for i in range(50)})
    table = pa.Table.from_pandas(df, preserve_index=False)

    writer = pq.ParquetWriter('leak_test_' + rands(5) + '.parquet',
                              table.schema)

    def func():
        writer.write_table(table, row_group_size=len(table))

    # This does not "leak" per se but we do want to have this use as little
    # memory as possible
    assert_does_not_leak(func, iterations=500, check_interval=50, tolerance=20)
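For context, a sketch of the normal ParquetWriter lifecycle this exercises, using the context-manager form so the file footer gets written on exit:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'a': [1, 2, 3, 4]})
with pq.ParquetWriter('example.parquet', table.schema) as writer:
    for _ in range(3):  # each write_table call emits one row group here
        writer.write_table(table, row_group_size=len(table))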
Example #4
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.tests import util  # provides rands()


def test_direct_read_dictionary(use_legacy_dataset):
    # ARROW-3325
    repeats = 10
    nunique = 5

    data = [
        [util.rands(10) for i in range(nunique)] * repeats,
    ]
    table = pa.table(data, names=['f0'])

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    result = pq.read_table(pa.BufferReader(contents),
                           read_dictionary=['f0'],
                           use_legacy_dataset=use_legacy_dataset)

    # The whole f0 column should come back dictionary-encoded
    expected = pa.table([table[0].dictionary_encode()], names=['f0'])
    assert result.equals(expected)
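The expected side of the assertion leans on ChunkedArray.dictionary_encode(); a quick sketch of what that produces:

import pyarrow as pa

col = pa.chunked_array([['x', 'y', 'x', 'x']])
encoded = col.dictionary_encode()
print(encoded.type)  # dictionary<values=string, indices=int32, ...>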
Example #5
import numpy as np
from pyarrow.tests import util  # provides rands()


def _test_dataframe(size=10000, seed=0):
    import pandas as pd

    np.random.seed(seed)
    df = pd.DataFrame({
        'uint8': _random_integers(size, np.uint8),
        'uint16': _random_integers(size, np.uint16),
        'uint32': _random_integers(size, np.uint32),
        'uint64': _random_integers(size, np.uint64),
        'int8': _random_integers(size, np.int8),
        'int16': _random_integers(size, np.int16),
        'int32': _random_integers(size, np.int32),
        'int64': _random_integers(size, np.int64),
        'float32': np.random.randn(size).astype(np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': [util.rands(10) for i in range(size)],
        'all_none': [None] * size,
        'all_none_category': [None] * size
    })

    # TODO(PARQUET-1015)
    # df['all_none_category'] = df['all_none_category'].astype('category')
    return df
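_random_integers is not shown in this snippet; a plausible definition, modeled on the helper in pyarrow's own test suite, is:

import numpy as np

def _random_integers(size, dtype):
    # Sample across the dtype's range, clamped to the bounds that
    # np.random.randint can represent on this platform.
    platform_int_info = np.iinfo('int_')
    iinfo = np.iinfo(dtype)
    return np.random.randint(max(iinfo.min, platform_int_info.min),
                             min(iinfo.max, platform_int_info.max),
                             size=size).astype(dtype)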