def test_in_expr():
    import pyarrow.gandiva as gandiva

    arr = pa.array(["ga", "an", "nd", "di", "iv", "va"])
    table = pa.Table.from_arrays([arr], ["a"])

    # string
    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, ["an", "nd"], pa.string())
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 2]

    # int32
    arr = pa.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 4])
    table = pa.Table.from_arrays([arr.cast(pa.int32())], ["a"])
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [1, 5], pa.int32())
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 3, 4, 8]

    # int64
    arr = pa.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 4])
    table = pa.Table.from_arrays([arr], ["a"])
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [1, 5], pa.int64())
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 3, 4, 8]
def test_call_function_with_memory_pool():
    arr = pa.array(["foo", "bar", "baz"])
    indices = np.array([2, 2, 1])
    result1 = arr.take(indices)
    result2 = pc.call_function('take', [arr, indices],
                               memory_pool=pa.default_memory_pool())
    expected = pa.array(["baz", "baz", "bar"])
    assert result1.equals(expected)
    assert result2.equals(expected)

    result3 = pc.take(arr, indices, memory_pool=pa.default_memory_pool())
    assert result3.equals(expected)
def test_logging_memory_pool(capfd):
    pool = pa.logging_memory_pool(pa.default_memory_pool())
    check_allocated_bytes(pool)
    out, err = capfd.readouterr()
    assert err == ""
    assert out.count("Allocate:") > 0
    assert out.count("Allocate:") == out.count("Free:")
def test_scanner(dataset):
    scanner = ds.Scanner(dataset, memory_pool=pa.default_memory_pool())
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2

    with pytest.raises(pa.ArrowInvalid):
        dataset.scan(columns=['unknown'])

    scanner = ds.Scanner(dataset, columns=['i64'],
                         memory_pool=pa.default_memory_pool())
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2
    for task in scanner.scan():
        for batch in task.execute():
            assert batch.num_columns == 1
def table_to_blockmanager(options, table, categories=None,
                          ignore_metadata=False):
    from pandas.core.internals import BlockManager

    all_columns = []
    column_indexes = []
    pandas_metadata = table.schema.pandas_metadata

    if not ignore_metadata and pandas_metadata is not None:
        all_columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        index_descriptors = pandas_metadata['index_columns']
        table = _add_any_metadata(table, pandas_metadata)
        table, index = _reconstruct_index(table, index_descriptors,
                                          all_columns)
    else:
        index = _pandas_api.pd.RangeIndex(table.num_rows)

    _check_data_column_metadata_consistency(all_columns)
    blocks = _table_to_blocks(options, table, pa.default_memory_pool(),
                              categories)
    columns = _deserialize_column_index(table, all_columns, column_indexes)

    axes = [columns, index]
    return BlockManager(blocks, axes)
def test_tree_exp_builder():
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()

    field_a = pa.field('a', pa.int32())
    field_b = pa.field('b', pa.int32())

    schema = pa.schema([field_a, field_b])

    field_result = pa.field('res', pa.int32())

    node_a = builder.make_field(field_a)
    node_b = builder.make_field(field_b)

    condition = builder.make_function("greater_than", [node_a, node_b],
                                      pa.bool_())
    if_node = builder.make_if(condition, node_a, node_b, pa.int32())

    expr = builder.make_expression(if_node, field_result)

    projector = gandiva.make_projector(schema, [expr],
                                       pa.default_memory_pool())

    a = pa.array([10, 12, -20, 5], type=pa.int32())
    b = pa.array([5, 15, 15, 17], type=pa.int32())
    e = pa.array([10, 15, 15, 17], type=pa.int32())
    input_batch = pa.RecordBatch.from_arrays([a, b], names=['a', 'b'])

    r, = projector.evaluate(input_batch)
    assert r.equals(e)
def gandiva_query(table, query):
    """
    Evaluate string query on the passed table.

    Parameters
    ----------
    table : pyarrow.Table
        Table to evaluate query on.
    query : str
        Query string to evaluate on the `table` columns.

    Returns
    -------
    pyarrow.Table
    """
    expr = gen_table_expr(table, query)
    if not can_be_condition(expr):
        raise ValueError("Root operation should be a filter.")

    builder = gandiva.TreeExprBuilder()
    root = build_node(table, expr.terms, builder)
    cond = builder.make_condition(root)
    filt = gandiva.make_filter(table.schema, cond)
    sel_vec = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    result = filter_with_selection_vector(table, sel_vec)
    return result
def test_boolean():
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({
        "a": [1., 31., 46., 3., 57., 44., 22.],
        "b": [5., 45., 36., 73., 83., 23., 76.]
    })
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    node_b = builder.make_field(table.schema.field_by_name("b"))
    fifty = builder.make_literal(50.0, pa.float64())
    eleven = builder.make_literal(11.0, pa.float64())

    cond_1 = builder.make_function("less_than", [node_a, fifty], pa.bool_())
    cond_2 = builder.make_function("greater_than", [node_a, node_b],
                                   pa.bool_())
    cond_3 = builder.make_function("less_than", [node_b, eleven], pa.bool_())
    cond = builder.make_or([builder.make_and([cond_1, cond_2]), cond_3])
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [0, 2, 5]
def test_proxy_memory_pool():
    pool = pa.proxy_memory_pool(pa.default_memory_pool())
    check_allocated_bytes(pool)

    wr = weakref.ref(pool)
    assert wr() is not None
    del pool
    assert wr() is None
def gandiva_query(table, query):
    """
    Evaluate string query on the passed table.

    Parameters
    ----------
    table : pyarrow.Table
        Table to evaluate query on.
    query : str
        Query string to evaluate on the `table` columns.

    Returns
    -------
    pyarrow.Table
    """
    expr = gen_table_expr(table, query)
    if not can_be_condition(expr):
        raise ValueError("Root operation should be a filter.")

    # We use this import here because of
    # https://github.com/modin-project/modin/issues/3849;
    # after the issue is fixed we should put the import at the top of this file.
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()
    root = build_node(table, expr.terms, builder)
    cond = builder.make_condition(root)
    filt = gandiva.make_filter(table.schema, cond)
    sel_vec = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    result = filter_with_selection_vector(table, sel_vec)
    return result
def test_scanner_builder(dataset):
    builder = ds.ScannerBuilder(dataset,
                                memory_pool=pa.default_memory_pool())
    scanner = builder.finish()
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2

    with pytest.raises(pa.ArrowInvalid):
        dataset.new_scan().project(['unknown'])

    builder = dataset.new_scan(memory_pool=pa.default_memory_pool())
    scanner = builder.project(['i64']).finish()
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2
    for task in scanner.scan():
        for batch in task.execute():
            assert batch.num_columns == 1
def gandiva_query(table, query):
    expr = gen_table_expr(table, query)
    if not can_be_condition(expr):
        raise ValueError("Root operation should be a filter.")

    builder = gandiva.TreeExprBuilder()
    root = build_node(table, expr.terms, builder)
    cond = builder.make_condition(root)
    filt = gandiva.make_filter(table.schema, cond)
    sel_vec = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    result = filter_with_selection_vector(table, sel_vec)
    return result
def test_set_memory_pool():
    old_pool = pa.default_memory_pool()
    pool = pa.proxy_memory_pool(old_pool)
    pa.set_memory_pool(pool)
    try:
        allocated_before = pool.bytes_allocated()
        with allocate_bytes(None, 512):
            assert pool.bytes_allocated() == allocated_before + 512
        assert pool.bytes_allocated() == allocated_before
    finally:
        pa.set_memory_pool(old_pool)
def test_filter():
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0 * i for i in range(10000)]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    thousand = builder.make_literal(1000.0, pa.float64())
    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
def test_default_memory_pool():
    gc.collect()
    bytes_before_default = pa.total_allocated_bytes()
    bytes_before_jemalloc = pa.jemalloc_memory_pool().bytes_allocated()

    old_memory_pool = pa.default_memory_pool()
    pa.set_memory_pool(pa.jemalloc_memory_pool())

    array = pa.array([1, None, 3, None])  # noqa

    pa.set_memory_pool(old_memory_pool)
    gc.collect()

    assert pa.total_allocated_bytes() == bytes_before_default
    assert (pa.jemalloc_memory_pool().bytes_allocated() >
            bytes_before_jemalloc)
def test_filter():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays([pa.array([1.0 * i for i in range(10000)])],
                                 ['a'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    thousand = builder.make_literal(1000.0, pa.float64())
    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    # Gandiva generates compute kernel function named `@expr_X`
    assert filter.llvm_ir.find("@expr_") != -1

    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
def test_regex():
    import pyarrow.gandiva as gandiva

    elements = ["park", "sparkle", "bright spark and fire", "spark"]
    data = pa.array(elements, type=pa.string())
    table = pa.Table.from_arrays([data], names=['a'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    regex = builder.make_literal("%spark%", pa.string())
    like = builder.make_function("like", [node_a, regex], pa.bool_())

    field_result = pa.field("b", pa.bool_())
    expr = builder.make_expression(like, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    r, = projector.evaluate(table.to_batches()[0])
    b = pa.array([False, True, True, True], type=pa.bool_())
    assert r.equals(b)
def test_filter_project():
    import pyarrow.gandiva as gandiva

    mpool = pa.default_memory_pool()
    # Create a table with some sample data
    array0 = pa.array([10, 12, -20, 5, 21, 29], pa.int32())
    array1 = pa.array([5, 15, 15, 17, 12, 3], pa.int32())
    array2 = pa.array([1, 25, 11, 30, -21, None], pa.int32())

    table = pa.Table.from_arrays([array0, array1, array2], ['a', 'b', 'c'])

    field_result = pa.field("res", pa.int32())

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    node_c = builder.make_field(table.schema.field("c"))

    greater_than_function = builder.make_function("greater_than",
                                                  [node_a, node_b],
                                                  pa.bool_())
    filter_condition = builder.make_condition(greater_than_function)

    project_condition = builder.make_function("less_than",
                                              [node_b, node_c], pa.bool_())
    if_node = builder.make_if(project_condition, node_b, node_c, pa.int32())
    expr = builder.make_expression(if_node, field_result)

    # Build a filter for the expressions.
    filter = gandiva.make_filter(table.schema, filter_condition)

    # Build a projector for the expressions.
    projector = gandiva.make_projector(table.schema, [expr], mpool, "UINT32")

    # Evaluate filter
    selection_vector = filter.evaluate(table.to_batches()[0], mpool)

    # Evaluate project
    r, = projector.evaluate(table.to_batches()[0], selection_vector)

    exp = pa.array([1, -21, None], pa.int32())
    assert r.equals(exp)
def test_tree_exp_builder():
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()

    field_a = pa.field('a', pa.int32())
    field_b = pa.field('b', pa.int32())

    schema = pa.schema([field_a, field_b])

    field_result = pa.field('res', pa.int32())

    node_a = builder.make_field(field_a)
    node_b = builder.make_field(field_b)

    assert node_a.return_type() == field_a.type

    condition = builder.make_function("greater_than", [node_a, node_b],
                                      pa.bool_())
    if_node = builder.make_if(condition, node_a, node_b, pa.int32())

    expr = builder.make_expression(if_node, field_result)
    assert expr.result().type == pa.int32()

    projector = gandiva.make_projector(schema, [expr],
                                       pa.default_memory_pool())
    # Gandiva generates compute kernel function named `@expr_X`
    assert projector.llvm_ir.find("@expr_") != -1

    a = pa.array([10, 12, -20, 5], type=pa.int32())
    b = pa.array([5, 15, 15, 17], type=pa.int32())
    e = pa.array([10, 15, 15, 17], type=pa.int32())
    input_batch = pa.RecordBatch.from_arrays([a, b], names=['a', 'b'])

    r, = projector.evaluate(input_batch)
    assert r.equals(e)
def test_boolean():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays([
        pa.array([1., 31., 46., 3., 57., 44., 22.]),
        pa.array([5., 45., 36., 73., 83., 23., 76.])
    ], ['a', 'b'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    fifty = builder.make_literal(50.0, pa.float64())
    eleven = builder.make_literal(11.0, pa.float64())

    cond_1 = builder.make_function("less_than", [node_a, fifty], pa.bool_())
    cond_2 = builder.make_function("greater_than", [node_a, node_b],
                                   pa.bool_())
    cond_3 = builder.make_function("less_than", [node_b, eleven], pa.bool_())
    cond = builder.make_or([builder.make_and([cond_1, cond_2]), cond_3])
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array([0, 2, 5], type=pa.uint32()))
def test_table():
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    node_b = builder.make_field(table.schema.field_by_name("b"))

    sum = builder.make_function("add", [node_a, node_b], pa.float64())

    field_result = pa.field("c", pa.float64())
    expr = builder.make_expression(sum, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    r, = projector.evaluate(table.to_batches()[0])

    e = pa.Array.from_pandas(df["a"] + df["b"])
    assert r.equals(e)
def test_table():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays(
        [pa.array([1.0, 2.0]), pa.array([3.0, 4.0])], ['a', 'b'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))

    sum = builder.make_function("add", [node_a, node_b], pa.float64())

    field_result = pa.field("c", pa.float64())
    expr = builder.make_expression(sum, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    r, = projector.evaluate(table.to_batches()[0])

    e = pa.array([4.0, 6.0])
    assert r.equals(e)
def table_to_bytes(table):
    global _temp_dir
    if _temp_dir is None or not os.path.exists(_temp_dir):
        _temp_dir = tempfile.mkdtemp(prefix='knime-python-')
        # Delete temporary directory upon Python shutdown.
        atexit.register(close)
    fd, path = tempfile.mkstemp(suffix='.dat', prefix='python-to-java-',
                                dir=_temp_dir, text=False)
    try:
        os.close(fd)

        mp = pyarrow.default_memory_pool()
        col_arrays = []
        col_names = []
        all_names = []
        missing_names = []

        # add the index column to the list of columns
        all_names.append("__index_level_0__")
        if len(table._data_frame.index) > 0:
            col_names.append("__index_level_0__")
            col_arrays.append(
                pyarrow.Array.from_pandas(table._data_frame.index,
                                          type=to_pyarrow_type(_types_.STRING),
                                          memory_pool=mp))
        else:
            missing_names.append("__index_level_0__")

        # Serialize the dataframe into a list of pyarrow.Array column by column
        for i in range(len(table._data_frame.columns)):
            # Do not allocate a buffer for columns that only contain missing
            # values. We track and transfer their names to give them special
            # treatment on Java side.
            # This also covers tables of row count zero.
            if table._data_frame.iloc[:, i].isnull().all():
                missing_names.append(table.get_name(i))
                all_names.append(table.get_name(i))
                continue
            # Convert collection types to binary
            if table.get_type(i) == _types_.INTEGER_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_list_generator(
                            table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_boolean_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_string_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_bytes_list_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.INTEGER_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(
                            table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(
                            table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(
                            table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_set_generator(
                            table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_boolean_set_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_string_set_generator(
                            table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        binary_from_bytes_set_generator(
                            table._data_frame.iloc[:, i])))
            # Workaround until numpy typecasts are implemented in pyarrow
            elif (table.get_type(i) == _types_.INTEGER
                  and table._data_frame.iloc[:, i].dtype == np.int64):
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        np.array(table._data_frame.iloc[:, i],
                                 dtype=np.int32),
                        memory_pool=mp))
            # Workaround until fixed in pyarrow ... it is assumed that the
            # first non-None object is bytearray if any
            elif (table.get_type(i) == _types_.BYTES
                  and type(get_first_not_None(
                      table._data_frame.iloc[:, i])) == bytearray):
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        map(lambda x: x if x is None else bytes(x),
                            table._data_frame.iloc[:, i]),
                        memory_pool=mp))
            # create pyarrow.Array
            else:
                pa_type = to_pyarrow_type(table.get_type(i))
                # pyarrow.binary() type is not allowed as argument for type atm
                if pa_type == pyarrow.binary():
                    col_arrays.append(
                        pyarrow.BinaryArray.from_pandas(
                            table._data_frame.iloc[:, i], memory_pool=mp))
                else:
                    col_arrays.append(
                        pyarrow.Array.from_pandas(
                            table._data_frame.iloc[:, i], type=pa_type,
                            memory_pool=mp))
            col_names.append(table.get_name(i))
            all_names.append(table.get_name(i))

        # Construct metadata
        custom_metadata = {
            "index_columns": [all_names[0]],
            "columns": [{
                "name": all_names[0],
                "metadata": {
                    "serializer_id": "",
                    "type_id": _types_.STRING
                }
            }],
            "missing_columns": missing_names,
            "num_rows": len(table._data_frame)
        }

        real_col_names = list(table._data_frame.columns)
        for name in all_names[1:]:
            col_idx = real_col_names.index(name)
            if table.get_type(col_idx) in [_types_.BYTES, _types_.BYTES_LIST,
                                           _types_.BYTES_SET]:
                custom_metadata['columns'].append({
                    "name": name,
                    "metadata": {
                        "serializer_id":
                            table.get_column_serializers().get(name, ""),
                        "type_id": table.get_type(col_idx)
                    }
                })
            else:
                custom_metadata['columns'].append({
                    "name": name,
                    "metadata": {
                        "serializer_id": "",
                        "type_id": table.get_type(col_idx)
                    }
                })

        metadata = {
            b'ArrowSerializationLibrary':
                json.dumps(custom_metadata).encode('utf-8')
        }

        batch = pyarrow.RecordBatch.from_arrays(col_arrays, col_names)

        schema = batch.schema.remove_metadata()
        schema = schema.add_metadata(metadata)

        # Write data to file and return filepath
        with pyarrow.OSFile(path, 'wb') as f:
            stream_writer = pyarrow.RecordBatchStreamWriter(f, schema)
            stream_writer.write_batch(batch)
            stream_writer.close()
        return bytearray(path, 'utf-8')
    except BaseException:
        PythonUtils.invoke_safely(None, os.remove, [path])
        raise
def memory_and_io_interfaces_example():
    # pyarrow.Buffer.
    data = b"abcdefghijklmnopqrstuvwxyz"

    # Creating a Buffer in this way does not allocate any memory; it is a
    # zero-copy view on the memory exported from the data bytes object.
    buf = pa.py_buffer(data)
    # External memory, in the form of a raw pointer and size, can also be
    # referenced using the foreign_buffer() function.
    #buf = pa.foreign_buffer(data)

    print("buf = {}.".format(buf))
    print("buf.size = {}.".format(buf.size))
    print("memoryview(buf) = {}.".format(memoryview(buf)))
    print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

    #--------------------
    # Memory pools.

    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    buf = pa.allocate_buffer(1024, resizable=True)
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    buf.resize(2048)
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    buf = None
    print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

    print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

    #--------------------
    # Input and output streams.

    buf = memoryview(b"some data")
    stream = pa.input_stream(buf)
    print("stream.read(4) = {}.".format(stream.read(4)))

    import gzip
    with gzip.open("./example.gz", "wb") as f:
        f.write(b"some data\n" * 3)
    stream = pa.input_stream("./example.gz")
    print("stream.read() = {}.".format(stream.read()))

    with pa.output_stream("./example1.dat") as stream:
        stream.write(b"some data")
    f = open("./example1.dat", "rb")
    print("f.read() = {}.".format(f.read()))

    #--------------------
    # On-disk and memory mapped files.

    # Using regular Python.
    with open("./example2.dat", "wb") as f:
        f.write(b"some example data")
    file_obj = pa.OSFile("./example2.dat")
    print("file_obj.read(4) = {}.".format(file_obj.read(4)))

    # Using pyarrow's OSFile class.
    with pa.OSFile("./example3.dat", "wb") as f:
        f.write(b"some example data")
    mmap = pa.memory_map("./example3.dat")
    print("mmap.read(4) = {}.".format(mmap.read(4)))

    mmap.seek(0)
    buf = mmap.read_buffer(4)
    print("buf = {}.".format(buf))
    print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

    #--------------------
    # In-memory reading and writing.

    writer = pa.BufferOutputStream()
    writer.write(b"hello, friends")
    buf = writer.getvalue()
    print("buf = {}.".format(buf))
    print("buf.size = {}.".format(buf.size))

    reader = pa.BufferReader(buf)
    reader.seek(7)
    print("reader.read(7) = {}.".format(reader.read(7)))
def test_default_allocated_bytes():
    pool = pa.default_memory_pool()
    with allocate_bytes(pool, 1024):
        check_allocated_bytes(pool)
    assert pool.bytes_allocated() == pa.total_allocated_bytes()
def __init__(self):
    self.start_use = pa.total_allocated_bytes()
    self.start_rss = get_rss()
    self.pool = pa.default_memory_pool()
    self.start_peak_use = self.pool.max_memory()
def test_default_backend_name():
    pool = pa.default_memory_pool()
    assert pool.backend_name in possible_backends
def table_to_blockmanager(options, table, categories=None,
                          ignore_metadata=False):
    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    has_pandas_metadata = (not ignore_metadata and metadata is not None
                           and b'pandas' in metadata)

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table

    index_columns_set = frozenset(index_columns)

    # 0. 'field_name' is the name of the column in the arrow Table
    # 1. 'name' is the user-facing name of the column, that is, it came from
    #    pandas
    # 2. 'field_name' and 'name' differ for index columns
    # 3. We fall back on c['name'] for backwards compatibility
    logical_index_names = [
        c['name'] for c in columns
        if c.get('field_name', c['name']) in index_columns_set
    ]

    # There must be the same number of field names and physical names
    # (fields in the arrow Table)
    assert len(logical_index_names) == len(index_columns_set)

    # It can never be the case in a released version of pyarrow that
    # c['name'] is None *and* 'field_name' is not a key in the column
    # metadata, because the change to allow c['name'] to be None and the
    # change to add 'field_name' are in the same release (0.8.0)
    assert all(
        (c['name'] is None and 'field_name' in c) or c['name'] is not None
        for c in columns)

    # Build up a list of index columns and names while removing those columns
    # from the original table
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            if isinstance(col_pandas.dtype, DatetimeTZDtype):
                index_array = (pd.Series(values).dt.tz_localize('utc')
                               .dt.tz_convert(col_pandas.dtype.tz))
            else:
                index_array = pd.Series(values, dtype=col_pandas.dtype)
            index_arrays.append(index_array)
            index_names.append(
                _backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    blocks = _table_to_blocks(options, block_table, pa.default_memory_pool(),
                              categories)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns:
        columns_name_dict = {
            c.get('field_name', _column_name_to_strings(c['name'])): c['name']
            for c in columns
        }
        columns_values = [
            columns_name_dict.get(name, name) for name in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:
        columns = _reconstruct_columns_from_metadata(columns, column_indexes)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
def test_proxy_memory_pool():
    pool = pa.proxy_memory_pool(pa.default_memory_pool())
    check_allocated_bytes(pool)
def test_release_unused():
    pool = pa.default_memory_pool()
    pool.release_unused()
def test_in_expr_todo():
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    # binary
    arr = pa.array([b"ga", b"an", b"nd", b"di", b"iv", b"va"])
    table = pa.Table.from_arrays([arr], ["a"])
    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [b'an', b'nd'], pa.binary())
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1, 2]

    # timestamp
    datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
    datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
    datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)
    arr = pa.array([datetime_1, datetime_2, datetime_3])
    table = pa.Table.from_arrays([arr], ["a"])
    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [datetime_2],
                                      pa.timestamp('ms'))
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # time
    time_1 = datetime_1.time()
    time_2 = datetime_2.time()
    time_3 = datetime_3.time()
    arr = pa.array([time_1, time_2, time_3])
    table = pa.Table.from_arrays([arr], ["a"])
    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [time_2], pa.time64('ms'))
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]

    # date
    date_1 = datetime_1.date()
    date_2 = datetime_2.date()
    date_3 = datetime_3.date()
    arr = pa.array([date_1, date_2, date_3])
    table = pa.Table.from_arrays([arr], ["a"])
    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [date_2], pa.date32())
    condition = builder.make_condition(cond)
    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert list(result.to_array()) == [1]