import json

import jpype
import pyarrow as pa
import pyarrow.jvm as pa_jvm


def test_jvm_record_batch(root_allocator, pa_type, py_data, jvm_type, jvm_spec):
    # root_allocator is a fixture wrapping org.apache.arrow.memory.RootAllocator;
    # pa_type, py_data, jvm_type and jvm_spec come from test parametrization, and
    # _jvm_field is a helper defined in the surrounding test module.

    # Create vector
    cls = "org.apache.arrow.vector.{}".format(jvm_type)
    jvm_vector = jpype.JClass(cls)("vector", root_allocator)
    jvm_vector.allocateNew(len(py_data))
    for i, val in enumerate(py_data):
        jvm_vector.setSafe(i, val)
    jvm_vector.setValueCount(len(py_data))

    # Create field
    spec = {
        'name': 'field_name',
        'nullable': False,
        'type': json.loads(jvm_spec),
        # TODO: This needs to be set for complex types
        'children': []
    }
    jvm_field = _jvm_field(json.dumps(spec))

    # Create VectorSchemaRoot
    jvm_fields = jpype.JClass('java.util.ArrayList')()
    jvm_fields.add(jvm_field)
    jvm_vectors = jpype.JClass('java.util.ArrayList')()
    jvm_vectors.add(jvm_vector)
    jvm_vsr = jpype.JClass('org.apache.arrow.vector.VectorSchemaRoot')
    jvm_vsr = jvm_vsr(jvm_fields, jvm_vectors, len(py_data))

    py_record_batch = pa.RecordBatch.from_arrays(
        [pa.array(py_data, type=pa_type)],
        ['col']
    )
    jvm_record_batch = pa_jvm.record_batch(jvm_vsr)

    assert py_record_batch.equals(jvm_record_batch)
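For context, the pa_type, py_data, jvm_type and jvm_spec arguments would normally be supplied through pytest parametrization. The combinations below are only an illustrative sketch, not the real test matrix: the jvm_spec strings follow Arrow's JSON type serialization, and the TinyIntVector/Float8Vector pairings are assumptions about which Java vector classes back those Arrow types.

import json

import pyarrow as pa

# Hypothetical (pa_type, py_data, jvm_type, jvm_spec) combinations; in a real
# suite these would feed pytest.mark.parametrize on test_jvm_record_batch.
example_params = [
    (pa.int8(), [1, 23, 43, 5, 86], 'TinyIntVector',
     json.dumps({'name': 'int', 'bitWidth': 8, 'isSigned': True})),
    (pa.float64(), [1.0, 2.5, 3.25], 'Float8Vector',
     json.dumps({'name': 'floatingpoint', 'precision': 'DOUBLE'})),
]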
import sys

import jaydebeapi
import pyarrow.jvm as pa_jvm


def read_sql_pyarrow(query, conn_kwargs, batch_size=100000):
    """Stream a JDBC query result as pandas DataFrames, one per Arrow record batch."""
    # Enable importing Java packages via jpype; the JVM must already be running
    # with the Arrow JDBC adapter and the database driver on the classpath.
    from jpype import imports  # noqa: F401

    with jaydebeapi._jdbc_connect_jpype(**conn_kwargs) as conn:
        from org.apache.arrow.adapter.jdbc import JdbcToArrow
        from org.apache.arrow.adapter.jdbc import JdbcToArrowConfigBuilder
        from org.apache.arrow.adapter.jdbc import JdbcToArrowUtils
        from org.apache.arrow.memory import RootAllocator

        # Configure the JDBC-to-Arrow conversion: an unbounded allocator, a UTC
        # calendar for timestamps, and the requested rows-per-batch target.
        allocator = RootAllocator(sys.maxsize)
        calendar = JdbcToArrowUtils.getUtcCalendar()
        config_builder = JdbcToArrowConfigBuilder(allocator, calendar, True)
        config_builder.setTargetBatchSize(batch_size)
        config = config_builder.build()

        statement = conn.createStatement()
        result_set = statement.executeQuery(query)
        batches = JdbcToArrow.sqlToArrowVectorIterator(result_set, config)
        while batches.hasNext():
            # Each element is a VectorSchemaRoot; convert via pyarrow.jvm.
            yield pa_jvm.record_batch(batches.next()).to_pandas()
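A minimal sketch of how the generator might be driven. It assumes the JVM is started with the Arrow JDBC adapter and a database driver on the classpath, that jaydebeapi's private _jdbc_connect_jpype helper accepts the keyword names shown, and that every path, driver class, URL and credential below is a placeholder.

import jpype

# Start the JVM before any org.apache.arrow.* import resolves; jar paths are
# placeholders for the Arrow JDBC adapter, its dependencies, and the driver.
jpype.startJVM(classpath=[
    '/path/to/arrow-jdbc.jar',
    '/path/to/arrow-memory.jar',
    '/path/to/arrow-vector.jar',
    '/path/to/postgresql.jar',
])

# Keyword arguments forwarded to jaydebeapi._jdbc_connect_jpype (a private
# helper; its parameter names may differ between jaydebeapi versions).
conn_kwargs = {
    'jclassname': 'org.postgresql.Driver',
    'url': 'jdbc:postgresql://localhost:5432/mydb',
    'driver_args': ['user', 'password'],
    'jars': None,
    'libs': None,
}

for df in read_sql_pyarrow('SELECT * FROM my_table', conn_kwargs, batch_size=50000):
    print(df.shape)  # one pandas DataFrame per Arrow record batch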