def test_read_carbon_from_local_by_folder_concurrently():
  reader = CarbonReader() \
    .builder() \
    .withFolder(LOCAL_DATA_PATH) \
    .withBatch(1000) \
    .build()

  # Split the reader so each thread reads its own share of the folder.
  readers = reader.splitAsArray(3)

  from multiprocessing.pool import ThreadPool
  pool = ThreadPool(len(readers))

  def readLogic(carbonReader):
    i = 0
    while carbonReader.hasNext():
      rows = carbonReader.readNextBatchRow()
      i += len(rows)
    carbonReader.close()

  pool.map(readLogic, readers)
  pool.close()

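# A small variant sketch of the concurrent-read pattern above: each worker
# returns its own row count so the caller can assert on the total. It only
# reuses the splitAsArray/readNextBatchRow API already shown in this file;
# the function name and the `parallelism` parameter are illustrative.
def count_rows_concurrently(reader, parallelism=3):
  from multiprocessing.pool import ThreadPool

  def count(carbon_reader):
    n = 0
    while carbon_reader.hasNext():
      n += len(carbon_reader.readNextBatchRow())
    carbon_reader.close()
    return n

  readers = reader.splitAsArray(parallelism)
  pool = ThreadPool(len(readers))
  counts = pool.map(count, readers)
  pool.close()
  pool.join()
  return sum(counts)
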
def readCarbon(key, secret, end_point, path, label):
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  projection_list = java_list_class()
  projection_list.add("name")

  reader = CarbonReader() \
    .builder() \
    .withBatch(780) \
    .withFolder(path) \
    .withHadoopConf("fs.s3a.access.key", key) \
    .withHadoopConf("fs.s3a.secret.key", secret) \
    .withHadoopConf("fs.s3a.endpoint", end_point) \
    .projection(projection_list) \
    .filterEqual("name", label) \
    .build()

  data_list = []
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    for row in rows:
      data_list.append(row)
  reader.close()
  return data_list

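# A minimal usage sketch for readCarbon, assuming S3A credentials; the
# credential values, endpoint, bucket path and label below are placeholders,
# not real configuration.
def example_read_by_label():
  rows = readCarbon(key="YOUR_ACCESS_KEY",
                    secret="YOUR_SECRET_KEY",
                    end_point="obs.example.com",
                    path="s3a://your-bucket/carbon/data",
                    label="robot0")
  print(len(rows))
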
def test_run_read_carbon_by_file_lists():
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  java_list = java_list_class()
  java_list.add(LOCAL_DATA_PATH + "/sub1/part-0-1196034485149392_batchno0-0-null-1196033673787967.carbondata")
  java_list.add(LOCAL_DATA_PATH + "/sub2/part-0-1196034758543568_batchno0-0-null-1196034721553227.carbondata")

  projection_list = java_list_class()
  projection_list.add("name")
  projection_list.add("age")
  projection_list.add("image1")
  projection_list.add("image2")
  projection_list.add("image3")

  reader = CarbonReader() \
    .builder() \
    .withFileLists(java_list) \
    .withBatch(1000) \
    .projection(projection_list) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)
  assert 20 == num
  reader.close()

def read_all(self, columns):
  # Rebuild the reader, because we may need to read only specific columns.
  carbon_reader_builder = CarbonReader().builder(self.path)
  carbon_schema_reader = CarbonSchemaReader()
  if columns is not None:
    carbon_reader_builder = carbon_reader_builder.projection(columns)
    updatedSchema = carbon_schema_reader.reorderSchemaBasedOnProjection(
      columns, self.carbon_schema)
  else:
    # TODO: currently, when no projection is set, carbon returns records in
    # dimensions+measures order, but here we need the actual schema order,
    # so as a workaround we add projection columns based on the schema.
    updatedSchema = self.carbon_schema
    projection = carbon_schema_reader.getProjectionBasedOnSchema(updatedSchema)
    carbon_reader_builder = carbon_reader_builder.projection(projection)

  if self.use_s3:
    if self.proxy is None and self.proxy_port is None:
      carbon_reader = carbon_reader_builder \
        .withHadoopConf("fs.s3a.access.key", self.key) \
        .withHadoopConf("fs.s3a.secret.key", self.secret) \
        .withHadoopConf("fs.s3a.endpoint", self.endpoint) \
        .build_with_split(self.input_split)
    else:
      carbon_reader = carbon_reader_builder \
        .withHadoopConf("fs.s3a.access.key", self.key) \
        .withHadoopConf("fs.s3a.secret.key", self.secret) \
        .withHadoopConf("fs.s3a.endpoint", self.endpoint) \
        .withHadoopConf("fs.s3a.proxy.host", self.proxy) \
        .withHadoopConf("fs.s3a.proxy.port", self.proxy_port) \
        .build_with_split(self.input_split)
  else:
    carbon_reader = carbon_reader_builder.build_with_split(self.input_split)

  data = carbon_reader.read(updatedSchema)
  carbon_reader.close()
  return data

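# A minimal calling sketch for read_all, assuming `piece` is an instance of
# the class this method belongs to; the column names are placeholders taken
# from the test data used elsewhere in this file.
def example_read_all(piece):
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  columns = java_list_class()
  columns.add("name")
  columns.add("age")
  # Pass None instead to read every column in schema order.
  return piece.read_all(columns)
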
def test_run_write_carbon_binary_base64_encode():
  jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary}]"
  path = "/tmp/data/writeCarbon" + str(time.time())
  if os.path.exists(path):
    shutil.rmtree(path)
  jpg_path = IMAGE_DATA_PATH + "/carbondatalogo.jpg"

  writer = CarbonWriter() \
    .builder() \
    .outputPath(path) \
    .withCsvInput(jsonSchema) \
    .writtenBy("pycarbon") \
    .build()

  with open(jpg_path, mode='rb') as file_object:
    content = file_object.read()

  from jnius import autoclass
  arrayListClass = autoclass("java.util.ArrayList")
  for i in range(0, 10):
    data_list = arrayListClass()
    data_list.add("pycarbon")
    data_list.add(str(i))
    data_list.add(str(i * 10))
    data_list.add(base64.b64encode(content))
    writer.write(data_list.toArray())
  writer.close()

  reader = CarbonReader() \
    .builder() \
    .withFolder(path) \
    .withBatch(1000) \
    .build()

  i = 0
  from jnius.jnius import ByteArray
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    for row in rows:
      i += 1
      for column in row:
        # Dump the first binary column back to disk to verify the round trip.
        if 1 == i and isinstance(column, ByteArray) and len(column) > 1000:
          with open(path + "/image.jpg", 'wb+') as file_object:
            file_object.write(base64.b64decode(column.tostring()))
  assert 10 == i
  reader.close()
  shutil.rmtree(path)

def test_run_read_carbon_from_local():
  reader = CarbonReader() \
    .builder() \
    .withBatch(780) \
    .withFolder(LOCAL_DATA_PATH) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)
  assert 30 == num
  reader.close()

def test_run_read_carbon_by_file():
  reader = CarbonReader() \
    .builder() \
    .withFile(LOCAL_DATA_PATH + "/sub1/part-0-1196034485149392_batchno0-0-null-1196033673787967.carbondata") \
    .withBatch(1000) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)
  assert 10 == num
  reader.close()

def test_run_read_carbon_from_local_for_filter():
  reader = CarbonReader() \
    .builder() \
    .withBatch(10) \
    .withFolder(LOCAL_DATA_PATH) \
    .filterEqual("name", "robot0") \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)
  assert 3 == num
  reader.close()

def test_run_read_carbon_from_obs():
  reader = CarbonReader() \
    .builder() \
    .withBatch(1000) \
    .withFolder(S3_DATA_PATH) \
    .withHadoopConf("fs.s3a.access.key", pytest.config.getoption("--access_key")) \
    .withHadoopConf("fs.s3a.secret.key", pytest.config.getoption("--secret_key")) \
    .withHadoopConf("fs.s3a.endpoint", pytest.config.getoption("--end_point")) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)
  assert 30 == num
  reader.close()

def test_run_write_carbon():
  jsonSchema = "[{stringField:string},{shortField:short},{intField:int}]"
  path = "/tmp/data/writeCarbon" + str(time.time())
  if os.path.exists(path):
    shutil.rmtree(path)

  writer = CarbonWriter() \
    .builder() \
    .outputPath(path) \
    .withCsvInput(jsonSchema) \
    .writtenBy("pycarbon") \
    .build()

  from jnius import autoclass
  arrayListClass = autoclass("java.util.ArrayList")
  for i in range(0, 10):
    data_list = arrayListClass()
    data_list.add("pycarbon")
    data_list.add(str(i))
    data_list.add(str(i * 10))
    writer.write(data_list.toArray())
  writer.close()

  reader = CarbonReader() \
    .builder() \
    .withFolder(path) \
    .withBatch(1000) \
    .build()

  i = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    i += len(rows)
  assert 10 == i
  reader.close()
  shutil.rmtree(path)

def test_run_read_carbon_from_local_for_projection():
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  projection_list = java_list_class()
  projection_list.add("name")
  projection_list.add("age")
  projection_list.add("image1")
  projection_list.add("image2")
  projection_list.add("image3")

  reader = CarbonReader() \
    .builder() \
    .withBatch(100) \
    .withFolder(LOCAL_DATA_PATH) \
    .projection(projection_list) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)
  assert 30 == num
  reader.close()

def test_run_read_carbon_from_obs_for_filter():
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  projection_list = java_list_class()
  projection_list.add("name")

  reader = CarbonReader() \
    .builder() \
    .withBatch(780) \
    .withFolder(S3_DATA_PATH) \
    .withHadoopConf("fs.s3a.access.key", pytest.config.getoption("--access_key")) \
    .withHadoopConf("fs.s3a.secret.key", pytest.config.getoption("--secret_key")) \
    .withHadoopConf("fs.s3a.endpoint", pytest.config.getoption("--end_point")) \
    .projection(projection_list) \
    .filterEqual("name", "robot0") \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)
  assert 3 == num
  reader.close()

def test_run_write_carbon_binary_base64_encode_decodeInJava_many_files():
  jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
  path = "/tmp/data/writeCarbon" + str(time.time())
  if os.path.exists(path):
    shutil.rmtree(path)
  flowers_path = IMAGE_DATA_PATH + "/flowers"

  from jnius import autoclass
  sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
  jpg_files = sdkUtilClass.listFiles(flowers_path, '.jpg')

  writer = CarbonWriter() \
    .builder() \
    .outputPath(path) \
    .withCsvInput(jsonSchema) \
    .writtenBy("pycarbon") \
    .withLoadOption("binary_decoder", "base64") \
    .withPageSizeInMb(1) \
    .build()

  arrayListClass = autoclass("java.util.ArrayList")
  for i in range(0, jpg_files.size()):
    jpg_file = jpg_files.get(i)
    with open(jpg_file, mode='rb') as file_object:
      content = file_object.read()
    # Each .jpg has a sibling .txt file with its caption.
    with open(str(jpg_file).replace('.jpg', '.txt'), mode='r') as file_object:
      txt = file_object.read()

    data_list = arrayListClass()
    data_list.add("pycarbon")
    data_list.add(str(i))
    data_list.add(str(i * 10))
    data_list.add(base64.b64encode(content))
    data_list.add(txt)
    writer.write(data_list.toArray())
  writer.close()

  reader = CarbonReader() \
    .builder() \
    .withFolder(path) \
    .withBatch(1000) \
    .build()

  i = 0
  from jnius.jnius import ByteArray
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    for row in rows:
      i += 1
      for column in row:
        if isinstance(column, ByteArray) and len(column) > 1000 and i < 20:
          with open(path + "/image" + str(i) + ".jpg", 'wb+') as file_object:
            file_object.write(column.tostring())
  assert 3 == i
  reader.close()
  shutil.rmtree(path)

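# Note the contrast with test_run_write_carbon_binary_base64_encode above:
# there the base64 payload is stored as-is and decoded in Python on read,
# whereas here withLoadOption("binary_decoder", "base64") asks Carbon to
# decode during the load, so the read path writes column.tostring() directly.
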
def test_download_carbon_from_obs_and_read():
  key = pytest.config.getoption("--access_key")
  secret = pytest.config.getoption("--secret_key")
  end_point = pytest.config.getoption("--end_point")

  def list_obs_files(obs_client, bucket_name, prefix):
    # Page through the object listing until the response is no longer truncated.
    files = []
    pageSize = 1000
    nextMarker = None
    while True:
      resp = obs_client.listObjects(bucket_name, prefix=prefix, max_keys=pageSize, marker=nextMarker)
      for content in resp.body.contents:
        files.append(content.key)
      if not resp.body.is_truncated:
        break
      nextMarker = resp.body.next_marker
    return files

  def read_obs_files(key, secret, end_point, bucket_name, prefix, downloadPath):
    obsClient = ObsClient(
      access_key_id=key,
      secret_access_key=secret,
      server=end_point,
      long_conn_mode=True
    )
    files = list_obs_files(obsClient, bucket_name, prefix)
    numOfFiles = len(files)
    print(numOfFiles)
    num = 0
    for file in files:
      num = num + 1
      obsClient.getObject(bucket_name, file, downloadPath=downloadPath + file)
      # Log progress roughly every tenth of the files.
      if 0 == num % max(1, numOfFiles // 10):
        print(str(num) + ":" + file)
    obsClient.close()

  downloadPath = '/tmp/carbonbinary/'
  if os.path.exists(downloadPath):
    shutil.rmtree(downloadPath)
  read_obs_files(key, secret, end_point, 'sdk', 'binary', downloadPath)

  reader = CarbonReader() \
    .builder() \
    .withBatch(1000) \
    .withFolder(downloadPath) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)
  assert 30 == num
  reader.close()
  shutil.rmtree(downloadPath)

def __init__(self, path, key=None, secret=None, endpoint=None,
             proxy=None, proxy_port=None, filesystem=None):
  self.path = path
  self.url_path = urlparse(path)

  if filesystem is None:
    a_path = self.path
    if isinstance(a_path, list):
      a_path = a_path[0]
    self.fs = _get_fs_from_path(a_path)
  else:
    self.fs = _ensure_filesystem(filesystem)

  self.pieces = list()

  if self.url_path.scheme == 's3a':
    if key is None or secret is None or endpoint is None:
      raise ValueError('key, secret and endpoint must not be None')

    if proxy is None and proxy_port is None:
      carbon_splits = CarbonReader().builder(self.path) \
        .withHadoopConf("fs.s3a.access.key", key) \
        .withHadoopConf("fs.s3a.secret.key", secret) \
        .withHadoopConf("fs.s3a.endpoint", endpoint) \
        .getSplits()

      configuration = Configuration()
      configuration.set("fs.s3a.access.key", key)
      configuration.set("fs.s3a.secret.key", secret)
      configuration.set("fs.s3a.endpoint", endpoint)
      self.configuration = configuration
    elif proxy is not None and proxy_port is not None:
      carbon_splits = CarbonReader().builder(self.path) \
        .withHadoopConf("fs.s3a.access.key", key) \
        .withHadoopConf("fs.s3a.secret.key", secret) \
        .withHadoopConf("fs.s3a.endpoint", endpoint) \
        .withHadoopConf("fs.s3a.proxy.host", proxy) \
        .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
        .getSplits()

      configuration = Configuration()
      configuration.set("fs.s3a.access.key", key)
      configuration.set("fs.s3a.secret.key", secret)
      configuration.set("fs.s3a.endpoint", endpoint)
      configuration.set("fs.s3a.proxy.host", proxy)
      configuration.set("fs.s3a.proxy.port", proxy_port)
      self.configuration = configuration
    else:
      raise ValueError('proxy and proxy_port must be set together')

    carbon_schema = CarbonSchemaReader().readSchema(self.path, self.configuration.conf)

    for split in carbon_splits:
      # split = self.url_path.scheme + "://" + self.url_path.netloc + split
      self.pieces.append(
        CarbonDatasetPiece(path, carbon_schema, split,
                           key=key, secret=secret, endpoint=endpoint,
                           proxy=proxy, proxy_port=proxy_port))
  else:
    carbon_splits = CarbonReader().builder(self.path) \
      .getSplits()
    carbon_schema = CarbonSchemaReader().readSchema(self.path)
    for split in carbon_splits:
      # split = self.url_path.scheme + "://" + self.url_path.netloc + split
      self.pieces.append(CarbonDatasetPiece(path, carbon_schema, split))

  self.number_of_splits = len(self.pieces)
  self.schema = self.getArrowSchema()
  # TODO: add a mechanism to get the file path based on a file filter.
  self.common_metadata_path = self.url_path.path + '/_common_metadata'
  self.common_metadata = None
  try:
    if self.fs.exists(self.common_metadata_path):
      with self.fs.open(self.common_metadata_path) as f:
        self.common_metadata = ParquetFile(f).metadata
  except Exception:
    self.common_metadata = None

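# A minimal construction sketch, assuming this __init__ belongs to a
# CarbonDataset-style class; the class name and both paths below are
# placeholders, not values taken from this file.
def example_open_dataset():
  # Local folder: no credentials needed; splits are discovered directly.
  local_ds = CarbonDataset('/tmp/data/carbon')

  # S3A folder: key, secret and endpoint are mandatory for the 's3a' scheme,
  # and proxy/proxy_port must be supplied together or not at all.
  s3_ds = CarbonDataset('s3a://your-bucket/carbon/data',
                        key='YOUR_ACCESS_KEY',
                        secret='YOUR_SECRET_KEY',
                        endpoint='obs.example.com')
  return local_ds, s3_ds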