def getSplits(self, is_blocklet_split):
    """Return the splits for ``self.input_split``.

    If the input split names a ``.manifest`` file, the data files listed in
    the manifest are resolved with ``manifest.getSources``, copied into a
    ``java.util.ArrayList`` and passed to the reader builder through
    ``withFileLists`` before asking for splits.  Otherwise the builder is
    queried directly.

    Side effects (manifest inputs only): sets ``self.manifest_path`` and,
    for inputs starting with ``s3``, ``self.file_path`` (the first source
    listed in the manifest).

    Args:
        is_blocklet_split: forwarded unchanged to the builder's
            ``getSplits``.

    Returns:
        Whatever the underlying builder's ``getSplits`` call returns.
    """
    input_split = str(self.input_split)

    # Plain (non-manifest) input: nothing to resolve, and no need to import
    # jnius or the OBS SDK at all.
    if not input_split.endswith(".manifest"):
        return self.ArrowCarbonReaderBuilder.getSplits(is_blocklet_split)

    if input_split.startswith(LOCAL_FILE_PREFIX):
        # Strip the local-file prefix to get the bare manifest path.
        self.manifest_path = input_split[len(LOCAL_FILE_PREFIX):]
    else:
        self.manifest_path = self.input_split

    if input_split.startswith("s3"):
        # The OBS SDK is only needed for manifests on object storage, so it
        # is imported here rather than for every manifest.
        from obs import ObsClient
        obsClient = ObsClient(access_key_id=self.ak,
                              secret_access_key=self.sk,
                              server=str(self.end_point).replace('http://', ''),
                              long_conn_mode=True)
        sources = manifest.getSources(self.manifest_path, CARBON, obsClient)
        self.file_path = sources[0]
    else:
        sources = manifest.getSources(self.manifest_path, CARBON)

    # Imported lazily: only the manifest path builds a Java list.
    from jnius import autoclass
    java_list = autoclass('java.util.ArrayList')()
    for source in sources:
        java_list.add(source)

    return self.ArrowCarbonReaderBuilder \
        .withFileLists(java_list) \
        .getSplits(is_blocklet_split)
def test_single_default(path, obsClient):
    """Check that the manifest at ``path`` lists exactly three sources.

    ``obsClient`` is passed on to ``manifest.getSources`` only when it is
    not ``None``.  Each source is printed, followed by a success marker.
    """
    # Append the client as a third positional argument only when provided.
    client_args = () if obsClient is None else (obsClient,)
    sources = manifest.getSources(path, CARBONDATA, *client_args)

    assert 3 == len(sources)

    for entry in sources:
        print(entry)
    print("Success: test_single_default")
def __init__(self, path,
             key=None,
             secret=None,
             endpoint=None,
             proxy=None,
             proxy_port=None,
             filesystem=None):
    """Collect the splits, schema and common metadata for ``path``.

    ``path`` is either a dataset location or a ``.manifest`` file.  For a
    manifest, the first source it lists is used to read the schema and its
    parent folder becomes the folder of every piece.

    Args:
        path: dataset location or manifest file; parsed with ``urlparse``.
        key: access key; required when the scheme is ``s3a``.
        secret: secret key; required when the scheme is ``s3a``.
        endpoint: endpoint; required when the scheme is ``s3a``.
        proxy: proxy host; must be given together with ``proxy_port``.
        proxy_port: proxy port; must be given together with ``proxy``.
        filesystem: optional filesystem; when ``None`` one is derived from
            ``path`` with ``_get_fs_from_path``.

    Raises:
        ValueError: for ``s3a`` paths, if ``key``, ``secret`` or
            ``endpoint`` is ``None``, or if only one of ``proxy`` /
            ``proxy_port`` is given.
        Exception: if a manifest lists no sources, or (non-``s3a`` paths)
            if reading the schema fails.
    """
    self.path = path
    self.url_path = urlparse(path)
    is_manifest = str(path).endswith(".manifest")

    if is_manifest:
        self.manifest_path = path
        if str(path).startswith(LOCAL_FILE_PREFIX):
            # Strip the local-file prefix to get the bare manifest path.
            self.manifest_path = str(path)[len(LOCAL_FILE_PREFIX):]

    if filesystem is None:
        a_path = self.path
        if isinstance(a_path, list):
            a_path = a_path[0]
        self.fs = _get_fs_from_path(a_path)
    else:
        self.fs = _ensure_filesystem(filesystem)

    self.pieces = list()

    if self.url_path.scheme == 's3a':
        if key is None or secret is None or endpoint is None:
            raise ValueError('key, secret, endpoint should not be None')
        # proxy and proxy_port must be supplied together or not at all.
        if (proxy is None) != (proxy_port is None):
            raise ValueError('wrong proxy & proxy_port configuration')

        # One list of settings feeds both the reader builder and the
        # Configuration, so the two can never drift apart.
        hadoop_conf = [
            ("fs.s3a.access.key", key),
            ("fs.s3a.secret.key", secret),
            ("fs.s3a.endpoint", endpoint),
        ]
        if proxy is not None:
            hadoop_conf.append(("fs.s3a.proxy.host", proxy))
            hadoop_conf.append(("fs.s3a.proxy.port", proxy_port))

        builder = ArrowCarbonReader().builder(self.path)
        for conf_key, conf_value in hadoop_conf:
            builder = builder.withHadoopConf(conf_key, conf_value)
        carbon_splits = builder.getSplits(True)

        configuration = Configuration()
        for conf_key, conf_value in hadoop_conf:
            configuration.set(conf_key, conf_value)
        self.configuration = configuration

        if is_manifest:
            from obs import ObsClient
            obsClient = ObsClient(access_key_id=key,
                                  secret_access_key=secret,
                                  server=str(endpoint).replace('http://', ''),
                                  long_conn_mode=True)
            sources = manifest.getSources(self.manifest_path, CARBON,
                                          obsClient)
            if not sources:
                raise Exception("Manifest source can't be None!")
            self.file_path = sources[0]
            carbon_schema = CarbonSchemaReader().readSchema(
                self.file_path, self.configuration.conf)
        else:
            carbon_schema = CarbonSchemaReader().readSchema(
                self.path, self.configuration.conf)

        for split in carbon_splits:
            folder_path = path
            if is_manifest:
                # Pieces of a manifest live in the folder of its first source.
                folder_path = str(
                    self.file_path)[0:(str(self.file_path).rindex('/'))]
            self.pieces.append(
                CarbonDatasetPiece(folder_path, carbon_schema, split,
                                   key=key, secret=secret, endpoint=endpoint,
                                   proxy=proxy, proxy_port=proxy_port))
    else:
        if is_manifest:
            sources = manifest.getSources(self.manifest_path, CARBON)
            if not sources:
                raise Exception("Manifest source can't be None!")
            self.file_path = sources[0]
            try:
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.file_path)
            except Exception:
                # str() keeps a non-string path from raising TypeError here
                # and hiding the actual schema failure.
                raise Exception("readSchema has some errors: "
                                + str(self.file_path))
        else:
            try:
                carbon_schema = CarbonSchemaReader().readSchema(self.path)
            except Exception:
                raise Exception("readSchema has some errors")

        carbon_splits = ArrowCarbonReader().builder(self.path) \
            .getSplits(True)

        for split in carbon_splits:
            if is_manifest:
                self.pieces.append(
                    CarbonDatasetPiece(
                        str(self.file_path)[0:(
                            str(self.file_path).rindex('/'))],
                        carbon_schema, split))
            else:
                self.pieces.append(
                    CarbonDatasetPiece(path, carbon_schema, split))

    self.number_of_splits = len(self.pieces)
    self.schema = self.getArrowSchema()

    # TODO add mechanism to get the file path based on file filter
    self.common_metadata_path = self.url_path.path + '/_common_metadata'
    self.common_metadata = None
    try:
        if self.fs.exists(self.common_metadata_path):
            with self.fs.open(self.common_metadata_path) as f:
                self.common_metadata = ParquetFile(f).metadata
    except Exception:
        # Best effort: common metadata is optional, so any failure to read
        # it leaves the attribute as None.  Interpreter-level signals such
        # as KeyboardInterrupt are no longer swallowed.
        self.common_metadata = None