def readCarbon(key, secret, end_point, path, label):
    """Read the ``name`` column of every row whose ``name`` equals ``label``.

    A CarbonReader is built over ``path`` with a batch size of 780, a
    projection on the single column ``name`` and an equality filter on it.

    Args:
        key: value set for the Hadoop conf key ``fs.s3a.access.key``.
        secret: value set for the Hadoop conf key ``fs.s3a.secret.key``.
        end_point: value set for the Hadoop conf key ``fs.s3a.endpoint``.
        path: folder handed to ``withFolder``.
        label: value the ``name`` column must equal.

    Returns:
        list: all rows produced by the reader, in the order the batches
        were returned.
    """
    from jnius import autoclass

    java_list_class = autoclass('java.util.ArrayList')
    projection_list = java_list_class()
    projection_list.add("name")

    reader = (
        CarbonReader()
        .builder()
        .withBatch(780)
        .withFolder(path)
        .withHadoopConf("fs.s3a.access.key", key)
        .withHadoopConf("fs.s3a.secret.key", secret)
        .withHadoopConf("fs.s3a.endpoint", end_point)
        .projection(projection_list)
        .filterEqual("name", label)
        .build()
    )

    data_list = []
    try:
        while reader.hasNext():
            data_list.extend(reader.readNextBatchRow())
    finally:
        # Release the reader even when a batch read raises.
        reader.close()
    return data_list
def test_run_read_carbon_by_file_lists():
    """Read two explicitly listed carbondata files and expect 20 rows in total.

    The files are passed through ``withFileLists`` as a ``java.util.ArrayList``
    and five columns are projected.
    """
    from jnius import autoclass

    java_list_class = autoclass('java.util.ArrayList')

    java_list = java_list_class()
    java_list.add(LOCAL_DATA_PATH + "/sub1/part-0-1196034485149392_batchno0-0-null-1196033673787967.carbondata")
    java_list.add(LOCAL_DATA_PATH + "/sub2/part-0-1196034758543568_batchno0-0-null-1196034721553227.carbondata")

    projection_list = java_list_class()
    for column_name in ("name", "age", "image1", "image2", "image3"):
        projection_list.add(column_name)

    reader = (
        CarbonReader()
        .builder()
        .withFileLists(java_list)
        .withBatch(1000)
        .projection(projection_list)
        .build()
    )

    num = 0
    try:
        while reader.hasNext():
            num += len(reader.readNextBatchRow())
    finally:
        # Close before asserting so a failing assertion cannot leak the reader.
        reader.close()

    assert 20 == num
def test_run_write_carbon_binary_base64_encode():
    """Write 10 rows carrying a base64-encoded image, then read them back.

    Each row holds the string "pycarbon", two numeric strings and the
    base64-encoded bytes of ``carbondatalogo.jpg``.  While reading, the first
    row's binary column (when longer than 1000 bytes) is base64-decoded and
    written to ``image.jpg`` inside the output folder.  The test expects
    exactly 10 rows and removes the output folder afterwards.
    """
    from jnius import autoclass
    from jnius.jnius import ByteArray

    json_schema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    jpg_path = IMAGE_DATA_PATH + "/carbondatalogo.jpg"

    # The image is only read, so open it read-only; encode it once because
    # every row carries the same payload.
    with open(jpg_path, mode='rb') as file_object:
        encoded_image = base64.b64encode(file_object.read())

    array_list_class = autoclass("java.util.ArrayList")

    try:
        writer = (
            CarbonWriter()
            .builder()
            .outputPath(path)
            .withCsvInput(json_schema)
            .writtenBy("pycarbon")
            .build()
        )
        try:
            for i in range(10):
                data_list = array_list_class()
                data_list.add("pycarbon")
                data_list.add(str(i))
                data_list.add(str(i * 10))
                data_list.add(encoded_image)
                writer.write(data_list.toArray())
        finally:
            writer.close()

        reader = (
            CarbonReader()
            .builder()
            .withFolder(path)
            .withBatch(1000)
            .build()
        )
        row_count = 0
        try:
            while reader.hasNext():
                for row in reader.readNextBatchRow():
                    row_count += 1
                    for column in row:
                        # Only the first row's image is dumped to disk.
                        if 1 == row_count and isinstance(column, ByteArray) and len(column) > 1000:
                            with open(path + "/image.jpg", 'wb') as file_object:
                                file_object.write(base64.b64decode(column.tostring()))
        finally:
            reader.close()

        assert 10 == row_count
    finally:
        # Remove the temporary output even when the test fails part-way.
        if os.path.exists(path):
            shutil.rmtree(path)
def test_run_read_carbon_from_local():
    """Read every row under ``LOCAL_DATA_PATH`` in batches of 780; expect 30 rows."""
    reader = (
        CarbonReader()
        .builder()
        .withBatch(780)
        .withFolder(LOCAL_DATA_PATH)
        .build()
    )

    num = 0
    try:
        while reader.hasNext():
            num += len(reader.readNextBatchRow())
    finally:
        # Close before asserting so a failing assertion cannot leak the reader.
        reader.close()

    assert 30 == num
def test_run_read_carbon_by_file():
    """Read a single carbondata file via ``withFile`` and expect 10 rows."""
    reader = (
        CarbonReader()
        .builder()
        .withFile(LOCAL_DATA_PATH + "/sub1/part-0-1196034485149392_batchno0-0-null-1196033673787967.carbondata")
        .withBatch(1000)
        .build()
    )

    num = 0
    try:
        while reader.hasNext():
            num += len(reader.readNextBatchRow())
    finally:
        # Close before asserting so a failing assertion cannot leak the reader.
        reader.close()

    assert 10 == num
def test_run_read_carbon_from_local_for_filter():
    """Read ``LOCAL_DATA_PATH`` filtered on ``name == "robot0"``; expect 3 rows."""
    reader = (
        CarbonReader()
        .builder()
        .withBatch(10)
        .withFolder(LOCAL_DATA_PATH)
        .filterEqual("name", "robot0")
        .build()
    )

    num = 0
    try:
        while reader.hasNext():
            num += len(reader.readNextBatchRow())
    finally:
        # Close before asserting so a failing assertion cannot leak the reader.
        reader.close()

    assert 3 == num
def test_run_read_carbon_from_obs():
    """Read every row under ``S3_DATA_PATH`` and expect 30 rows.

    The access key, secret key and endpoint come from the pytest command-line
    options ``--access_key``, ``--secret_key`` and ``--end_point`` and are
    passed to the reader as ``fs.s3a.*`` Hadoop configuration values.
    """
    # NOTE(review): the ``pytest.config`` global was removed in pytest 5.0;
    # on newer pytest these options must be read through the ``request``
    # fixture (``request.config.getoption``) - confirm the pinned version.
    reader = (
        CarbonReader()
        .builder()
        .withBatch(1000)
        .withFolder(S3_DATA_PATH)
        .withHadoopConf("fs.s3a.access.key", pytest.config.getoption("--access_key"))
        .withHadoopConf("fs.s3a.secret.key", pytest.config.getoption("--secret_key"))
        .withHadoopConf("fs.s3a.endpoint", pytest.config.getoption("--end_point"))
        .build()
    )

    num = 0
    try:
        while reader.hasNext():
            num += len(reader.readNextBatchRow())
    finally:
        # Close before asserting so a failing assertion cannot leak the reader.
        reader.close()

    assert 30 == num
def test_run_write_carbon():
    """Write 10 three-column rows to a fresh folder and read them back.

    Each row is ("pycarbon", str(i), str(i * 10)).  The test expects the
    reader to return exactly 10 rows and removes the output folder afterwards.
    """
    from jnius import autoclass

    json_schema = "[{stringField:string},{shortField:short},{intField:int}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    array_list_class = autoclass("java.util.ArrayList")

    try:
        writer = (
            CarbonWriter()
            .builder()
            .outputPath(path)
            .withCsvInput(json_schema)
            .writtenBy("pycarbon")
            .build()
        )
        try:
            for i in range(10):
                data_list = array_list_class()
                data_list.add("pycarbon")
                data_list.add(str(i))
                data_list.add(str(i * 10))
                writer.write(data_list.toArray())
        finally:
            writer.close()

        reader = (
            CarbonReader()
            .builder()
            .withFolder(path)
            .withBatch(1000)
            .build()
        )
        row_count = 0
        try:
            while reader.hasNext():
                row_count += len(reader.readNextBatchRow())
        finally:
            reader.close()

        assert 10 == row_count
    finally:
        # Remove the temporary output even when the test fails part-way.
        if os.path.exists(path):
            shutil.rmtree(path)
def test_run_read_carbon_from_local_for_projection():
    """Read ``LOCAL_DATA_PATH`` projecting five columns; expect 30 rows."""
    from jnius import autoclass

    java_list_class = autoclass('java.util.ArrayList')
    projection_list = java_list_class()
    for column_name in ("name", "age", "image1", "image2", "image3"):
        projection_list.add(column_name)

    reader = (
        CarbonReader()
        .builder()
        .withBatch(100)
        .withFolder(LOCAL_DATA_PATH)
        .projection(projection_list)
        .build()
    )

    num = 0
    try:
        while reader.hasNext():
            num += len(reader.readNextBatchRow())
    finally:
        # Close before asserting so a failing assertion cannot leak the reader.
        reader.close()

    assert 30 == num
def test_run_read_carbon_from_obs_for_filter():
    """Read ``S3_DATA_PATH`` filtered on ``name == "robot0"``; expect 3 rows.

    Only the ``name`` column is projected.  The access key, secret key and
    endpoint come from the pytest command-line options ``--access_key``,
    ``--secret_key`` and ``--end_point``.
    """
    from jnius import autoclass

    java_list_class = autoclass('java.util.ArrayList')
    projection_list = java_list_class()
    projection_list.add("name")

    # NOTE(review): the ``pytest.config`` global was removed in pytest 5.0;
    # on newer pytest these options must be read through the ``request``
    # fixture (``request.config.getoption``) - confirm the pinned version.
    reader = (
        CarbonReader()
        .builder()
        .withBatch(780)
        .withFolder(S3_DATA_PATH)
        .withHadoopConf("fs.s3a.access.key", pytest.config.getoption("--access_key"))
        .withHadoopConf("fs.s3a.secret.key", pytest.config.getoption("--secret_key"))
        .withHadoopConf("fs.s3a.endpoint", pytest.config.getoption("--end_point"))
        .projection(projection_list)
        .filterEqual("name", "robot0")
        .build()
    )

    num = 0
    try:
        while reader.hasNext():
            num += len(reader.readNextBatchRow())
    finally:
        # Close before asserting so a failing assertion cannot leak the reader.
        reader.close()

    assert 3 == num
def test_run_write_carbon_binary_base64_encode_decodeInJava_many_files():
    """Write one row per ``.jpg`` under the flowers folder and read them back.

    For every image returned by ``SDKUtil.listFiles`` the row holds
    "pycarbon", two numeric strings, the base64-encoded image bytes and the
    text of the sibling ``.txt`` file.  The writer is configured with the
    load option ``binary_decoder=base64`` and a page size of 1 MB.  While
    reading, binary columns longer than 1000 bytes are written unchanged to
    ``image<N>.jpg`` in the output folder (for the first 19 rows).  The test
    expects exactly 3 rows and removes the output folder afterwards.
    """
    from jnius import autoclass
    from jnius.jnius import ByteArray

    json_schema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    jpg_dir = IMAGE_DATA_PATH + "/flowers"

    sdk_util_class = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
    array_list_class = autoclass("java.util.ArrayList")
    jpg_files = sdk_util_class.listFiles(jpg_dir, '.jpg')

    try:
        writer = (
            CarbonWriter()
            .builder()
            .outputPath(path)
            .withCsvInput(json_schema)
            .writtenBy("pycarbon")
            .withLoadOption("binary_decoder", "base64")
            .withPageSizeInMb(1)
            .build()
        )
        try:
            for i in range(jpg_files.size()):
                image_file = jpg_files.get(i)

                # Both fixtures are only read, so open them read-only.
                with open(image_file, mode='rb') as file_object:
                    content = file_object.read()
                with open(str(image_file).replace('.jpg', '.txt'), mode='r') as file_object:
                    txt = file_object.read()

                data_list = array_list_class()
                data_list.add("pycarbon")
                data_list.add(str(i))
                data_list.add(str(i * 10))
                data_list.add(base64.b64encode(content))
                data_list.add(txt)
                writer.write(data_list.toArray())
        finally:
            writer.close()

        reader = (
            CarbonReader()
            .builder()
            .withFolder(path)
            .withBatch(1000)
            .build()
        )
        row_count = 0
        try:
            while reader.hasNext():
                for row in reader.readNextBatchRow():
                    row_count += 1
                    for column in row:
                        if isinstance(column, ByteArray) and len(column) > 1000 and row_count < 20:
                            # No base64 decode here: the bytes are written as read.
                            with open(path + "/image" + str(row_count) + ".jpg", 'wb') as file_object:
                                file_object.write(column.tostring())
        finally:
            reader.close()

        assert 3 == row_count
    finally:
        # Remove the temporary output even when the test fails part-way.
        if os.path.exists(path):
            shutil.rmtree(path)
def test_download_carbon_from_obs_and_read():
    """Download the ``binary`` prefix of bucket ``sdk`` and read it locally.

    Objects are listed page by page, downloaded under ``/tmp/carbonbinary/``
    and then read with a CarbonReader; the test expects 30 rows and removes
    the download folder afterwards.
    """
    # NOTE(review): the ``pytest.config`` global was removed in pytest 5.0;
    # on newer pytest these options must be read through the ``request``
    # fixture (``request.config.getoption``) - confirm the pinned version.
    key = pytest.config.getoption("--access_key")
    secret = pytest.config.getoption("--secret_key")
    end_point = pytest.config.getoption("--end_point")

    def list_obs_files(obs_client, bucket_name, prefix):
        """Return the keys of all objects under ``prefix``, following markers."""
        files = []
        page_size = 1000
        next_marker = None
        while True:
            resp = obs_client.listObjects(bucket_name, prefix=prefix, max_keys=page_size, marker=next_marker)
            files.extend(content.key for content in resp.body.contents)
            if not resp.body.is_truncated:
                break
            next_marker = resp.body.next_marker
        return files

    def read_obs_files(key, secret, end_point, bucket_name, prefix, downloadPath):
        """Download every object under ``prefix`` to ``downloadPath + <object key>``."""
        obs_client = ObsClient(
            access_key_id=key,
            secret_access_key=secret,
            server=end_point,
            long_conn_mode=True
        )
        try:
            files = list_obs_files(obs_client, bucket_name, prefix)
            num_of_files = len(files)
            print(num_of_files)

            # Report progress roughly every 10% of the files.  Clamping to 1
            # keeps the modulo valid (and integral) for fewer than 10 files.
            progress_step = max(1, num_of_files // 10)
            for num, obs_key in enumerate(files, 1):
                obs_client.getObject(bucket_name, obs_key, downloadPath=downloadPath + obs_key)
                if 0 == num % progress_step:
                    print(str(num) + ":" + obs_key)
        finally:
            # Release the long-lived connection even if a download fails.
            obs_client.close()

    downloadPath = '/tmp/carbonbinary/'

    if os.path.exists(downloadPath):
        shutil.rmtree(downloadPath)

    try:
        read_obs_files(key, secret, end_point, 'sdk', 'binary', downloadPath)

        reader = (
            CarbonReader()
            .builder()
            .withBatch(1000)
            .withFolder(downloadPath)
            .build()
        )
        num = 0
        try:
            while reader.hasNext():
                num += len(reader.readNextBatchRow())
        finally:
            reader.close()

        assert 30 == num
    finally:
        # Remove downloaded data even when the test fails part-way.
        if os.path.exists(downloadPath):
            shutil.rmtree(downloadPath)