def test_memory_pool_cannot_use_ctor():
    with pytest.raises(TypeError):
        pa.MemoryPool()
    with pytest.raises(TypeError):
        pa.ProxyMemoryPool()
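
# A minimal sketch (not part of the test above; the function name is purely
# illustrative) of how a pool is obtained in practice: pools are created
# internally by Arrow and exposed through factory functions such as
# pa.default_memory_pool(), never via the constructors rejected above.
def example_default_pool_usage():
    pool = pa.default_memory_pool()
    # Allocations can be routed through an explicit pool via memory_pool=...
    arr = pa.array([1, 2, 3], memory_pool=pool)
    return pool.bytes_allocated(), arr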
def table_to_bytes(table):
    fd, path = tempfile.mkstemp(suffix='.dat', prefix='arrow-memory-mapped', text=False)
    os.close(fd)
    try:
        # debug_util.breakpoint()
        mp = pyarrow.MemoryPool(2**64)
        col_arrays = []
        col_names = []
        all_names = []
        missing_names = []

        # Add the index column to the list of columns.
        all_names.append("__index_level_0__")
        if len(table._data_frame.index) > 0:
            col_names.append("__index_level_0__")
            col_arrays.append(pyarrow.Array.from_pandas(table._data_frame.index,
                                                        type=to_pyarrow_type(_types_.STRING),
                                                        memory_pool=mp))
        else:
            missing_names.append("__index_level_0__")

        # Serialize the dataframe into a list of pyarrow.Array, column by column.
        for i in range(len(table._data_frame.columns)):
            # Missing column? -> save the name and don't send any buffer for the column.
            if table._data_frame.iloc[:, i].isnull().all():
                missing_names.append(table.get_name(i))
                all_names.append(table.get_name(i))
                continue
            # Convert collection types to binary.
            if table.get_type(i) == _types_.INTEGER_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_list_generator(table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_list_generator(table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_list_generator(table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.BOOLEAN_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_boolean_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_string_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_LIST:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_bytes_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.INTEGER_SET:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_set_generator(table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_SET:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_set_generator(table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_SET:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_set_generator(table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.BOOLEAN_SET:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_boolean_set_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_SET:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_string_set_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_SET:
                col_arrays.append(pyarrow.Array.from_pandas(
                    binary_from_bytes_set_generator(table._data_frame.iloc[:, i])))
            # Workaround until numpy typecasts are implemented in pyarrow.
            elif table.get_type(i) == _types_.INTEGER and table._data_frame.iloc[:, i].dtype == np.int64:
                col_arrays.append(pyarrow.Array.from_pandas(
                    np.array(table._data_frame.iloc[:, i], dtype=np.int32), memory_pool=mp))
            # Workaround until fixed in pyarrow ... it is assumed that the first
            # non-None object is a bytearray, if any.
            elif table.get_type(i) == _types_.BYTES and type(get_first_not_None(table._data_frame.iloc[:, i])) == bytearray:
                col_arrays.append(pyarrow.Array.from_pandas(
                    map(lambda x: x if x is None else bytes(x), table._data_frame.iloc[:, i]),
                    memory_pool=mp))
            # Create the pyarrow.Array.
            else:
                pa_type = to_pyarrow_type(table.get_type(i))
                # The pyarrow.binary() type is not allowed as an argument for type atm.
                if pa_type == pyarrow.binary():
                    col_arrays.append(pyarrow.BinaryArray.from_pandas(table._data_frame.iloc[:, i],
                                                                      memory_pool=mp))
                else:
                    col_arrays.append(pyarrow.Array.from_pandas(table._data_frame.iloc[:, i],
                                                                type=pa_type, memory_pool=mp))
            col_names.append(table.get_name(i))
            all_names.append(table.get_name(i))

        # Construct the metadata.
        custom_metadata = {"index_columns": [all_names[0]],
                           "columns": [{"name": all_names[0],
                                        "metadata": {"serializer_id": "",
                                                     "type_id": _types_.STRING}}],
                           "missing_columns": missing_names,
                           "num_rows": len(table._data_frame)}

        real_col_names = list(table._data_frame.columns)
        for name in all_names[1:]:
            col_idx = real_col_names.index(name)
            if table.get_type(col_idx) in [_types_.BYTES, _types_.BYTES_LIST, _types_.BYTES_SET]:
                custom_metadata['columns'].append(
                    {"name": name,
                     "metadata": {"serializer_id": table.get_column_serializers().get(name, ""),
                                  "type_id": table.get_type(col_idx)}})
            else:
                custom_metadata['columns'].append(
                    {"name": name,
                     "metadata": {"serializer_id": "",
                                  "type_id": table.get_type(col_idx)}})

        metadata = {b'ArrowSerializationLibrary': json.dumps(custom_metadata).encode('utf-8')}

        # Empty record batches are not supported, therefore add a dummy array if the dataframe is empty.
        if not col_arrays:
            col_arrays.append(pyarrow.array([0]))
            col_names.append('dummy')

        batch = pyarrow.RecordBatch.from_arrays(col_arrays, col_names)

        schema = batch.schema.remove_metadata()
        schema = schema.add_metadata(metadata)

        # Write the data to the file and return the file path.
        with pyarrow.OSFile(path, 'wb') as f:
            stream_writer = pyarrow.RecordBatchStreamWriter(f, schema)
            stream_writer.write_batch(batch)
            stream_writer.close()
        return bytearray(path, 'utf-8')
    except Exception as error:
        os.remove(path)
        raise error
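
# A minimal sketch (illustrative only, not part of the serialization library)
# of how the stream file written by table_to_bytes could be read back and the
# custom metadata recovered. It assumes a plain string path to the file and a
# pyarrow version that still exposes RecordBatchStreamReader; the function and
# variable names are hypothetical.
def example_read_written_stream(path):
    with pyarrow.OSFile(path, 'rb') as f:
        reader = pyarrow.RecordBatchStreamReader(f)
        read_table = reader.read_all()
    # The custom metadata travels on the schema under the key written above.
    raw = read_table.schema.metadata[b'ArrowSerializationLibrary']
    custom_metadata = json.loads(raw.decode('utf-8'))
    return read_table, custom_metadata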