Example #1
def test_in_expr():
    import pyarrow.gandiva as gandiva

    arr = pa.array(["ga", "an", "nd", "di", "iv", "va"])
    table = pa.Table.from_arrays([arr], ["a"])

    # string
    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, ["an", "nd"], pa.string())
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
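    # Evaluating the filter yields a selection vector holding the indices of matching rows.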
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 2]

    # int32
    arr = pa.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 4])
    table = pa.Table.from_arrays([arr.cast(pa.int32())], ["a"])
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [1, 5], pa.int32())
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 3, 4, 8]

    # int64
    arr = pa.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 4])
    table = pa.Table.from_arrays([arr], ["a"])
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [1, 5], pa.int64())
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 3, 4, 8]
Example #2
def test_call_function_with_memory_pool():
    arr = pa.array(["foo", "bar", "baz"])
    indices = np.array([2, 2, 1])
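    # The same "take" kernel is invoked three ways below: Array.take,
    # pc.call_function by name, and the pc.take convenience wrapper.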
    result1 = arr.take(indices)
    result2 = pc.call_function('take', [arr, indices],
                               memory_pool=pa.default_memory_pool())
    expected = pa.array(["baz", "baz", "bar"])
    assert result1.equals(expected)
    assert result2.equals(expected)

    result3 = pc.take(arr, indices, memory_pool=pa.default_memory_pool())
    assert result3.equals(expected)
Example #3
def test_logging_memory_pool(capfd):
    pool = pa.logging_memory_pool(pa.default_memory_pool())
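    # The logging pool wraps the default pool and prints an "Allocate:" or
    # "Free:" line to stdout for every allocation it handles.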
    check_allocated_bytes(pool)
    out, err = capfd.readouterr()
    assert err == ""
    assert out.count("Allocate:") > 0
    assert out.count("Allocate:") == out.count("Free:")
Example #4
def test_scanner(dataset):
    scanner = ds.Scanner(dataset, memory_pool=pa.default_memory_pool())
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2

    with pytest.raises(pa.ArrowInvalid):
        dataset.scan(columns=['unknown'])

    scanner = ds.Scanner(dataset, columns=['i64'],
                         memory_pool=pa.default_memory_pool())

    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2
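    # Each scan task yields record batches restricted to the projected column.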
    for task in scanner.scan():
        for batch in task.execute():
            assert batch.num_columns == 1
Example #5
def table_to_blockmanager(options,
                          table,
                          categories=None,
                          ignore_metadata=False):
    from pandas.core.internals import BlockManager

    all_columns = []
    column_indexes = []
    pandas_metadata = table.schema.pandas_metadata

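    # Prefer the pandas metadata embedded in the schema to reconstruct the
    # original columns and index.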
    if not ignore_metadata and pandas_metadata is not None:
        all_columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        index_descriptors = pandas_metadata['index_columns']
        table = _add_any_metadata(table, pandas_metadata)
        table, index = _reconstruct_index(table, index_descriptors,
                                          all_columns)
    else:
        index = _pandas_api.pd.RangeIndex(table.num_rows)

    _check_data_column_metadata_consistency(all_columns)
    blocks = _table_to_blocks(options, table, pa.default_memory_pool(),
                              categories)
    columns = _deserialize_column_index(table, all_columns, column_indexes)

    axes = [columns, index]
    return BlockManager(blocks, axes)
Example #6
def test_tree_exp_builder():
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()

    field_a = pa.field('a', pa.int32())
    field_b = pa.field('b', pa.int32())

    schema = pa.schema([field_a, field_b])

    field_result = pa.field('res', pa.int32())

    node_a = builder.make_field(field_a)
    node_b = builder.make_field(field_b)

    condition = builder.make_function("greater_than", [node_a, node_b],
                                      pa.bool_())
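    # The if-node selects a where a > b, else b: an element-wise maximum.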
    if_node = builder.make_if(condition, node_a, node_b, pa.int32())

    expr = builder.make_expression(if_node, field_result)

    projector = gandiva.make_projector(schema, [expr],
                                       pa.default_memory_pool())

    a = pa.array([10, 12, -20, 5], type=pa.int32())
    b = pa.array([5, 15, 15, 17], type=pa.int32())
    e = pa.array([10, 15, 15, 17], type=pa.int32())
    input_batch = pa.RecordBatch.from_arrays([a, b], names=['a', 'b'])

    r, = projector.evaluate(input_batch)
    assert r.equals(e)
Example #7
        def gandiva_query(table, query):
            """
            Evaluate a string query on the passed table.

            Parameters
            ----------
            table : pyarrow.Table
                Table to evaluate query on.
            query : str
                Query string to evaluate on the `table` columns.

            Returns
            -------
            pyarrow.Table
            """
            expr = gen_table_expr(table, query)
            if not can_be_condition(expr):
                raise ValueError("Root operation should be a filter.")
            builder = gandiva.TreeExprBuilder()
            root = build_node(table, expr.terms, builder)
            cond = builder.make_condition(root)
            filt = gandiva.make_filter(table.schema, cond)
            sel_vec = filt.evaluate(table.to_batches()[0],
                                    pa.default_memory_pool())
            result = filter_with_selection_vector(table, sel_vec)
            return result
Example #8
def table_to_blockmanager(options, table, categories=None,
                          ignore_metadata=False):
    from pandas.core.internals import BlockManager

    all_columns = []
    column_indexes = []
    pandas_metadata = table.schema.pandas_metadata

    if not ignore_metadata and pandas_metadata is not None:
        all_columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        index_descriptors = pandas_metadata['index_columns']
        table = _add_any_metadata(table, pandas_metadata)
        table, index = _reconstruct_index(table, index_descriptors,
                                          all_columns)
    else:
        index = _pandas_api.pd.RangeIndex(table.num_rows)

    _check_data_column_metadata_consistency(all_columns)
    blocks = _table_to_blocks(options, table, pa.default_memory_pool(),
                              categories)
    columns = _deserialize_column_index(table, all_columns, column_indexes)

    axes = [columns, index]
    return BlockManager(blocks, axes)
Example #9
def test_boolean():
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({
        "a": [1., 31., 46., 3., 57., 44., 22.],
        "b": [5., 45., 36., 73., 83., 23., 76.]
    })
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    node_b = builder.make_field(table.schema.field_by_name("b"))
    fifty = builder.make_literal(50.0, pa.float64())
    eleven = builder.make_literal(11.0, pa.float64())

    cond_1 = builder.make_function("less_than", [node_a, fifty], pa.bool_())
    cond_2 = builder.make_function("greater_than", [node_a, node_b],
                                   pa.bool_())
    cond_3 = builder.make_function("less_than", [node_b, eleven], pa.bool_())
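    # Combined predicate: (a < 50 and a > b) or (b < 11).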
    cond = builder.make_or([builder.make_and([cond_1, cond_2]), cond_3])
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [0, 2, 5]
Example #10
def test_proxy_memory_pool():
    pool = pa.proxy_memory_pool(pa.default_memory_pool())
    check_allocated_bytes(pool)
    wr = weakref.ref(pool)
    assert wr() is not None
    del pool
    assert wr() is None
Example #11
        def gandiva_query(table, query):
            """
            Evaluate a string query on the passed table.

            Parameters
            ----------
            table : pyarrow.Table
                Table to evaluate query on.
            query : str
                Query string to evaluate on the `table` columns.

            Returns
            -------
            pyarrow.Table
            """
            expr = gen_table_expr(table, query)
            if not can_be_condition(expr):
                raise ValueError("Root operation should be a filter.")

            # This import lives here because of
            # https://github.com/modin-project/modin/issues/3849; once that
            # issue is fixed, it should move to the top of this file.
            import pyarrow.gandiva as gandiva

            builder = gandiva.TreeExprBuilder()
            root = build_node(table, expr.terms, builder)
            cond = builder.make_condition(root)
            filt = gandiva.make_filter(table.schema, cond)
            sel_vec = filt.evaluate(table.to_batches()[0],
                                    pa.default_memory_pool())
            result = filter_with_selection_vector(table, sel_vec)
            return result
Example #13
def test_scanner_builder(dataset):
    builder = ds.ScannerBuilder(dataset, memory_pool=pa.default_memory_pool())
    scanner = builder.finish()
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2

    with pytest.raises(pa.ArrowInvalid):
        dataset.new_scan().project(['unknown'])

    builder = dataset.new_scan(memory_pool=pa.default_memory_pool())
    scanner = builder.project(['i64']).finish()

    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2
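    # As above, every scan task produces batches containing only the projected column.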
    for task in scanner.scan():
        for batch in task.execute():
            assert batch.num_columns == 1
Example #14
def gandiva_query(table, query):
    expr = gen_table_expr(table, query)
    if not can_be_condition(expr):
        raise ValueError("Root operation should be a filter.")
    builder = gandiva.TreeExprBuilder()
    root = build_node(table, expr.terms, builder)
    cond = builder.make_condition(root)
    filt = gandiva.make_filter(table.schema, cond)
    sel_vec = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    result = filter_with_selection_vector(table, sel_vec)
    return result
Example #15
def test_set_memory_pool():
    old_pool = pa.default_memory_pool()
    pool = pa.proxy_memory_pool(old_pool)
    pa.set_memory_pool(pool)
    try:
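        # While the proxy is installed as the default pool, it tracks every
        # allocation made through the default pool.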
        allocated_before = pool.bytes_allocated()
        with allocate_bytes(None, 512):
            assert pool.bytes_allocated() == allocated_before + 512
        assert pool.bytes_allocated() == allocated_before
    finally:
        pa.set_memory_pool(old_pool)
Example #17
def test_filter():
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0 * i for i in range(10000)]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    thousand = builder.make_literal(1000.0, pa.float64())
    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
    condition = builder.make_condition(cond)

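    # Rows with a < 1000 are selected; the result is a selection vector of
    # uint32 row indices.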
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
Example #18
def test_default_memory_pool():
    gc.collect()
    bytes_before_default = pa.total_allocated_bytes()
    bytes_before_jemalloc = pa.jemalloc_memory_pool().bytes_allocated()

    old_memory_pool = pa.default_memory_pool()
    pa.set_memory_pool(pa.jemalloc_memory_pool())

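    # This allocation is served by the jemalloc pool while it is the default.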
    array = pa.array([1, None, 3, None])  # noqa

    pa.set_memory_pool(old_memory_pool)
    gc.collect()

    assert pa.total_allocated_bytes() == bytes_before_default

    assert (pa.jemalloc_memory_pool().bytes_allocated() >
            bytes_before_jemalloc)
Example #19
def test_default_memory_pool():
    gc.collect()
    bytes_before_default = pa.total_allocated_bytes()
    bytes_before_jemalloc = pa.jemalloc_memory_pool().bytes_allocated()

    old_memory_pool = pa.default_memory_pool()
    pa.set_memory_pool(pa.jemalloc_memory_pool())

    array = pa.array([1, None, 3, None])  # noqa

    pa.set_memory_pool(old_memory_pool)
    gc.collect()

    assert pa.total_allocated_bytes() == bytes_before_default

    assert (pa.jemalloc_memory_pool().bytes_allocated() >
            bytes_before_jemalloc)
Example #20
def test_filter():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays([pa.array([1.0 * i for i in range(10000)])],
                                 ['a'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    thousand = builder.make_literal(1000.0, pa.float64())
    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    # Gandiva generates a compute kernel function named `@expr_X`
    assert filter.llvm_ir.find("@expr_") != -1

    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
Example #21
def test_regex():
    import pyarrow.gandiva as gandiva

    elements = ["park", "sparkle", "bright spark and fire", "spark"]
    data = pa.array(elements, type=pa.string())
    table = pa.Table.from_arrays([data], names=['a'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    regex = builder.make_literal("%spark%", pa.string())
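    # "like" uses SQL LIKE semantics: '%spark%' matches strings containing "spark".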
    like = builder.make_function("like", [node_a, regex], pa.bool_())

    field_result = pa.field("b", pa.bool_())
    expr = builder.make_expression(like, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    r, = projector.evaluate(table.to_batches()[0])
    b = pa.array([False, True, True, True], type=pa.bool_())
    assert r.equals(b)
Example #22
def test_filter_project():
    import pyarrow.gandiva as gandiva
    mpool = pa.default_memory_pool()
    # Create a table with some sample data
    array0 = pa.array([10, 12, -20, 5, 21, 29], pa.int32())
    array1 = pa.array([5, 15, 15, 17, 12, 3], pa.int32())
    array2 = pa.array([1, 25, 11, 30, -21, None], pa.int32())

    table = pa.Table.from_arrays([array0, array1, array2], ['a', 'b', 'c'])

    field_result = pa.field("res", pa.int32())

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    node_c = builder.make_field(table.schema.field("c"))

    greater_than_function = builder.make_function("greater_than",
                                                  [node_a, node_b], pa.bool_())
    filter_condition = builder.make_condition(greater_than_function)

    project_condition = builder.make_function("less_than", [node_b, node_c],
                                              pa.bool_())
    if_node = builder.make_if(project_condition, node_b, node_c, pa.int32())
    expr = builder.make_expression(if_node, field_result)

    # Build a filter for the expressions.
    filter = gandiva.make_filter(table.schema, filter_condition)

    # Build a projector for the expressions.
    projector = gandiva.make_projector(table.schema, [expr], mpool, "UINT32")

    # Evaluate filter
    selection_vector = filter.evaluate(table.to_batches()[0], mpool)

    # Evaluate project
    r, = projector.evaluate(table.to_batches()[0], selection_vector)

    exp = pa.array([1, -21, None], pa.int32())
    assert r.equals(exp)
Example #23
def test_tree_exp_builder():
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()

    field_a = pa.field('a', pa.int32())
    field_b = pa.field('b', pa.int32())

    schema = pa.schema([field_a, field_b])

    field_result = pa.field('res', pa.int32())

    node_a = builder.make_field(field_a)
    node_b = builder.make_field(field_b)

    assert node_a.return_type() == field_a.type

    condition = builder.make_function("greater_than", [node_a, node_b],
                                      pa.bool_())
    if_node = builder.make_if(condition, node_a, node_b, pa.int32())

    expr = builder.make_expression(if_node, field_result)

    assert expr.result().type == pa.int32()

    projector = gandiva.make_projector(schema, [expr],
                                       pa.default_memory_pool())

    # Gandiva generates a compute kernel function named `@expr_X`
    assert projector.llvm_ir.find("@expr_") != -1

    a = pa.array([10, 12, -20, 5], type=pa.int32())
    b = pa.array([5, 15, 15, 17], type=pa.int32())
    e = pa.array([10, 15, 15, 17], type=pa.int32())
    input_batch = pa.RecordBatch.from_arrays([a, b], names=['a', 'b'])

    r, = projector.evaluate(input_batch)
    assert r.equals(e)
Example #24
def test_boolean():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays([
        pa.array([1., 31., 46., 3., 57., 44., 22.]),
        pa.array([5., 45., 36., 73., 83., 23., 76.])
    ], ['a', 'b'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    fifty = builder.make_literal(50.0, pa.float64())
    eleven = builder.make_literal(11.0, pa.float64())

    cond_1 = builder.make_function("less_than", [node_a, fifty], pa.bool_())
    cond_2 = builder.make_function("greater_than", [node_a, node_b],
                                   pa.bool_())
    cond_3 = builder.make_function("less_than", [node_b, eleven], pa.bool_())
    cond = builder.make_or([builder.make_and([cond_1, cond_2]), cond_3])
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array([0, 2, 5], type=pa.uint32()))
Example #25
def test_table():
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    node_b = builder.make_field(table.schema.field_by_name("b"))

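    # Project the element-wise sum a + b into a new column "c".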
    sum = builder.make_function("add", [node_a, node_b], pa.float64())

    field_result = pa.field("c", pa.float64())
    expr = builder.make_expression(sum, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    r, = projector.evaluate(table.to_batches()[0])

    e = pa.Array.from_pandas(df["a"] + df["b"])
    assert r.equals(e)
Example #26
def test_table():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays(
        [pa.array([1.0, 2.0]), pa.array([3.0, 4.0])], ['a', 'b'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))

    sum = builder.make_function("add", [node_a, node_b], pa.float64())

    field_result = pa.field("c", pa.float64())
    expr = builder.make_expression(sum, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    r, = projector.evaluate(table.to_batches()[0])

    e = pa.array([4.0, 6.0])
    assert r.equals(e)
Example #27
def table_to_bytes(table):
    global _temp_dir
    if _temp_dir is None or not os.path.exists(_temp_dir):
        _temp_dir = tempfile.mkdtemp(prefix='knime-python-')
        # Delete temporary directory upon Python shutdown.
        atexit.register(close)
    fd, path = tempfile.mkstemp(suffix='.dat',
                                prefix='python-to-java-',
                                dir=_temp_dir,
                                text=False)
    try:
        os.close(fd)

        mp = pyarrow.default_memory_pool()
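        # Memory pool used for the pyarrow allocations made below.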
        col_arrays = []
        col_names = []
        all_names = []
        missing_names = []

        # add the index column to the list of columns
        all_names.append("__index_level_0__")
        if len(table._data_frame.index) > 0:
            col_names.append("__index_level_0__")
            col_arrays.append(
                pyarrow.Array.from_pandas(table._data_frame.index,
                                          type=to_pyarrow_type(_types_.STRING),
                                          memory_pool=mp))
        else:
            missing_names.append("__index_level_0__")

        # Serialize the dataframe into a list of pyarrow.Array column by column
        for i in range(len(table._data_frame.columns)):
            # Do not allocate a buffer for columns that only contain missing
            # values. We track and transfer their names to give them special
            # treatment on the Java side. This also covers tables with a row
            # count of zero.
            if table._data_frame.iloc[:, i].isnull().all():
                missing_names.append(table.get_name(i))
                all_names.append(table.get_name(i))
                continue
            # Convert collection types to binary
            if table.get_type(i) == _types_.INTEGER_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_boolean_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_string_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_bytes_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.INTEGER_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i],
                                                  '<i4')))
            elif table.get_type(i) == _types_.LONG_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i],
                                                  '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i],
                                                  '<f8')))
            elif table.get_type(i) == _types_.FLOAT_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(table._data_frame.iloc[:, i],
                                                  '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_boolean_set_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_string_set_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_bytes_set_generator(
                            table._data_frame.iloc[:, i])))
            # Workaround until numpy typecasts are implemented in pyarrow
            elif (table.get_type(i) == _types_.INTEGER
                  and table._data_frame.iloc[:, i].dtype == np.int64):
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        np.array(table._data_frame.iloc[:, i], dtype=np.int32),
                        memory_pool=mp))
            # Workaround until fixed in pyarrow: it is assumed that the first
            # non-None object, if any, is a bytearray.
            elif (table.get_type(i) == _types_.BYTES
                  and type(get_first_not_None(
                      table._data_frame.iloc[:, i])) == bytearray):
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        [x if x is None else bytes(x)
                         for x in table._data_frame.iloc[:, i]],
                        memory_pool=mp))
            # create pyarrow.Array
            else:
                pa_type = to_pyarrow_type(table.get_type(i))
                # The pyarrow.binary() type is currently not accepted as the `type` argument.
                if pa_type == pyarrow.binary():
                    col_arrays.append(
                        pyarrow.BinaryArray.from_pandas(
                            table._data_frame.iloc[:, i], memory_pool=mp))
                else:
                    col_arrays.append(
                        pyarrow.Array.from_pandas(table._data_frame.iloc[:, i],
                                                  type=pa_type,
                                                  memory_pool=mp))
            col_names.append(table.get_name(i))
            all_names.append(table.get_name(i))

        # Construct metadata
        custom_metadata = {
            "index_columns": [all_names[0]],
            "columns": [{
                "name": all_names[0],
                "metadata": {
                    "serializer_id": "",
                    "type_id": _types_.STRING
                }
            }],
            "missing_columns": missing_names,
            "num_rows": len(table._data_frame)
        }

        real_col_names = list(table._data_frame.columns)
        for name in all_names[1:]:
            col_idx = real_col_names.index(name)
            if table.get_type(col_idx) in [
                    _types_.BYTES, _types_.BYTES_LIST, _types_.BYTES_SET
            ]:
                custom_metadata['columns'].append({
                    "name": name,
                    "metadata": {
                        "serializer_id":
                            table.get_column_serializers().get(name, ""),
                        "type_id": table.get_type(col_idx)
                    }
                })
            else:
                custom_metadata['columns'].append({
                    "name": name,
                    "metadata": {
                        "serializer_id": "",
                        "type_id": table.get_type(col_idx)
                    }
                })

        metadata = {
            b'ArrowSerializationLibrary':
            json.dumps(custom_metadata).encode('utf-8')
        }

        batch = pyarrow.RecordBatch.from_arrays(col_arrays, col_names)

        schema = batch.schema.remove_metadata()
        schema = schema.add_metadata(metadata)

        # Write data to file and return filepath
        with pyarrow.OSFile(path, 'wb') as f:
            stream_writer = pyarrow.RecordBatchStreamWriter(f, schema)
            stream_writer.write_batch(batch)
            stream_writer.close()
        return bytearray(path, 'utf-8')
    except BaseException:
        PythonUtils.invoke_safely(None, os.remove, [path])
        raise
Example #28
def memory_and_io_interfaces_example():
	# pyarrow.Buffer.

	data = b"abcdefghijklmnopqrstuvwxyz"

	# Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the data bytes object.
	buf = pa.py_buffer(data)
	# External memory, given as a raw pointer and a size, can also be referenced using the foreign_buffer() function; note that it takes an integer address, not a bytes object.
	# buf = pa.foreign_buffer(address, size)

	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	print("memoryview(buf) = {}.".format(memoryview(buf)))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# Memory pools.

	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

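	# allocate_buffer draws from the default memory pool, so the counter increases.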
	buf = pa.allocate_buffer(1024, resizable=True)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf.resize(2048)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = None
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

	#--------------------
	# Input and output streams.

	buf = memoryview(b"some data")
	stream = pa.input_stream(buf)

	print("stream.read(4) = {}.".format(stream.read(4)))

	import gzip
	with gzip.open("./example.gz", "wb") as f:
		f.write(b"some data\n" * 3)

	stream = pa.input_stream("./example.gz")
	print("stream.read() = {}.".format(stream.read()))

	with pa.output_stream("./example1.dat") as stream:
		stream.write(b"some data")

	with open("./example1.dat", "rb") as f:
		print("f.read() = {}.".format(f.read()))

	#--------------------
	# On-disk and memory mapped files.

	# Using regular Python.
	with open("./example2.dat", "wb") as f:
		f.write(b"some example data")

	file_obj = pa.OSFile("./example2.dat")
	print("file_obj.read(4) = {}.".format(file_obj.read(4)))

	# Using pyarrow's OSFile class.
	with pa.OSFile("./example3.dat", "wb") as f:
		f.write(b"some example data")

	mmap = pa.memory_map("./example3.dat")
	print("mmap.read(4) = {}.".format(mmap.read(4)))

	mmap.seek(0)
	buf = mmap.read_buffer(4)
	print("buf = {}.".format(buf))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# In-memory reading and writing.

	writer = pa.BufferOutputStream()
	writer.write(b"hello, friends")
	buf = writer.getvalue()
	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	reader = pa.BufferReader(buf)
	reader.seek(7)
	print("reader.read(7) = {}.".format(reader.read(7)))
Example #29
def test_default_allocated_bytes():
    pool = pa.default_memory_pool()
    with allocate_bytes(pool, 1024):
        check_allocated_bytes(pool)
        assert pool.bytes_allocated() == pa.total_allocated_bytes()
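
For reference, the allocate_bytes and check_allocated_bytes helpers used throughout these tests are defined elsewhere in pyarrow's test module; a minimal sketch of what they might look like (bodies assumed, not the actual implementation):

import contextlib

import pyarrow as pa


@contextlib.contextmanager
def allocate_bytes(pool, nbytes):
    # Allocate a buffer from `pool` (or the default pool if None) and
    # release it when the context exits.
    buf = pa.allocate_buffer(nbytes, memory_pool=pool)
    try:
        yield
    finally:
        del buf  # drop the reference so the pool reclaims the memory


def check_allocated_bytes(pool):
    # Allocating and freeing through the pool should leave its counter unchanged.
    before = pool.bytes_allocated()
    with allocate_bytes(pool, 512):
        assert pool.bytes_allocated() >= before + 512
    assert pool.bytes_allocated() == before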
Example #31
def __init__(self):
    self.start_use = pa.total_allocated_bytes()
    self.start_rss = get_rss()
    self.pool = pa.default_memory_pool()
    self.start_peak_use = self.pool.max_memory()
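
get_rss is not shown in this snippet; a plausible helper, assuming psutil is available:

import psutil


def get_rss():
    # Resident set size of the current process, in bytes.
    return psutil.Process().memory_info().rss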
Example #32
def test_default_backend_name():
    pool = pa.default_memory_pool()
    assert pool.backend_name in possible_backends
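
possible_backends is defined elsewhere in the test module; a minimal sketch (the actual contents depend on how pyarrow was built):

possible_backends = ["system", "jemalloc", "mimalloc"]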
Example #33
def table_to_blockmanager(options,
                          table,
                          categories=None,
                          ignore_metadata=False):

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    has_pandas_metadata = (not ignore_metadata and metadata is not None
                           and b'pandas' in metadata)

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table

    index_columns_set = frozenset(index_columns)

    # 0. 'field_name' is the name of the column in the arrow Table
    # 1. 'name' is the user-facing name of the column, that is, it came from
    #    pandas
    # 2. 'field_name' and 'name' differ for index columns
    # 3. We fall back on c['name'] for backwards compatibility
    logical_index_names = [
        c['name'] for c in columns
        if c.get('field_name', c['name']) in index_columns_set
    ]

    # There must be the same number of field names and physical names
    # (fields in the arrow Table)
    assert len(logical_index_names) == len(index_columns_set)

    # It can never be the case in a released version of pyarrow that
    # c['name'] is None *and* 'field_name' is not a key in the column metadata,
    # because the change to allow c['name'] to be None and the change to add
    # 'field_name' are in the same release (0.8.0)
    assert all(
        (c['name'] is None and 'field_name' in c) or c['name'] is not None
        for c in columns)

    # Build up a list of index columns and names while removing those columns
    # from the original table
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            if isinstance(col_pandas.dtype, DatetimeTZDtype):
                index_array = (
                    pd.Series(values).dt.tz_localize('utc').dt.tz_convert(
                        col_pandas.dtype.tz))
            else:
                index_array = pd.Series(values, dtype=col_pandas.dtype)
            index_arrays.append(index_array)
            index_names.append(
                _backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    blocks = _table_to_blocks(options, block_table, pa.default_memory_pool(),
                              categories)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns:
        columns_name_dict = {
            c.get('field_name', _column_name_to_strings(c['name'])): c['name']
            for c in columns
        }
        columns_values = [
            columns_name_dict.get(name, name) for name in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x, )

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:
        columns = _reconstruct_columns_from_metadata(columns, column_indexes)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
Example #34
def test_proxy_memory_pool():
    pool = pa.proxy_memory_pool(pa.default_memory_pool())
    check_allocated_bytes(pool)
Example #35
def test_release_unused():
    pool = pa.default_memory_pool()
    pool.release_unused()
Example #36
def test_in_expr_todo():
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    # binary
    arr = pa.array([b"ga", b"an", b"nd", b"di", b"iv", b"va"])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [b'an', b'nd'], pa.binary())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 2]

    # timestamp
    datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
    datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
    datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)

    arr = pa.array([datetime_1, datetime_2, datetime_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [datetime_2], pa.timestamp('ms'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # time
    time_1 = datetime_1.time()
    time_2 = datetime_2.time()
    time_3 = datetime_3.time()

    arr = pa.array([time_1, time_2, time_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [time_2], pa.time64('ms'))
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # date
    date_1 = datetime_1.date()
    date_2 = datetime_2.date()
    date_3 = datetime_3.date()

    arr = pa.array([date_1, date_2, date_3])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [date_2], pa.date32())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]