示例#1
0
  def getSplits(self, is_blocklet_split):
    """Return the splits produced by the underlying reader builder.

    When ``self.input_split`` names a ``.manifest`` file, the sources listed
    in that manifest are resolved through ``manifest.getSources``, copied
    into a ``java.util.ArrayList`` and registered on the builder with
    ``withFileLists`` before the splits are requested. For any other input
    the builder is queried directly.

    Side effects on the manifest path: ``self.manifest_path`` is set, and for
    inputs starting with ``"s3"`` ``self.file_path`` is set to the first
    source of the manifest.

    :param is_blocklet_split: forwarded unchanged to the builder's
      ``getSplits``.
    :return: whatever the builder's ``getSplits`` returns.
    """
    input_split = str(self.input_split)

    if not input_split.endswith(".manifest"):
      # Nothing to expand: neither the Java list bridge nor the OBS SDK is
      # needed, so they are deliberately not imported on this path.
      return self.ArrowCarbonReaderBuilder.getSplits(is_blocklet_split)

    from jnius import autoclass
    java_list_class = autoclass('java.util.ArrayList')

    if input_split.startswith(LOCAL_FILE_PREFIX):
      # Strip the local-file prefix so the manifest reader gets a plain path.
      self.manifest_path = input_split[len(LOCAL_FILE_PREFIX):]
    else:
      self.manifest_path = self.input_split

    if input_split.startswith("s3"):
      # Imported only here so local manifests work without the OBS SDK.
      from obs import ObsClient
      obsClient = ObsClient(access_key_id=self.ak, secret_access_key=self.sk,
                            server=str(self.end_point).replace('http://', ''),
                            long_conn_mode=True)
      sources = manifest.getSources(self.manifest_path, CARBON, obsClient)
      self.file_path = sources[0]
    else:
      sources = manifest.getSources(self.manifest_path, CARBON)

    java_list = java_list_class()
    for source in sources:
      java_list.add(source)
    return self.ArrowCarbonReaderBuilder.withFileLists(java_list).getSplits(is_blocklet_split)
示例#2
0
def test_single_default(path, obsClient):
    """Check that the manifest at *path* resolves to exactly three sources.

    The OBS client is forwarded to ``manifest.getSources`` only when one is
    supplied. Each resolved source is printed, followed by a success marker.
    """
    client_args = () if obsClient is None else (obsClient,)
    sources = manifest.getSources(path, CARBONDATA, *client_args)

    assert 3 == len(sources)

    for entry in sources:
        print(entry)
    print("Success: test_single_default")
示例#3
0
    def __init__(self,
                 path,
                 key=None,
                 secret=None,
                 endpoint=None,
                 proxy=None,
                 proxy_port=None,
                 filesystem=None):
        """Collect the splits, schema and metadata of a carbon dataset.

        :param path: dataset location, or the location of a ``.manifest``
            file whose first listed source is used to read the schema.
        :param key: access key; required when ``path`` uses the ``s3a``
            scheme.
        :param secret: secret key; required when ``path`` uses the ``s3a``
            scheme.
        :param endpoint: S3A endpoint; required when ``path`` uses the
            ``s3a`` scheme.
        :param proxy: optional proxy host; must be given together with
            ``proxy_port``.
        :param proxy_port: optional proxy port; must be given together with
            ``proxy``.
        :param filesystem: optional filesystem object, normalised through
            ``_ensure_filesystem``. When omitted the filesystem is derived
            from ``path`` with ``_get_fs_from_path``.
        :raises ValueError: for an ``s3a`` path with a missing credential or
            endpoint, or when only one of ``proxy``/``proxy_port`` is given.
        :raises Exception: when a manifest lists no sources, or when reading
            the schema fails on a non-``s3a`` path.
        """
        self.path = path
        self.url_path = urlparse(path)

        is_manifest = str(path).endswith(".manifest")
        if is_manifest:
            self.manifest_path = path
            if str(path).startswith(LOCAL_FILE_PREFIX):
                self.manifest_path = str(path)[len(LOCAL_FILE_PREFIX):]

        if filesystem is None:
            a_path = self.path
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.pieces = list()

        # Extra keyword arguments forwarded to every CarbonDatasetPiece;
        # only the s3a branch has any.
        piece_kwargs = {}

        if self.url_path.scheme == 's3a':
            if key is None or secret is None or endpoint is None:
                raise ValueError('key, secret, endpoint should not be None')

            # One list of settings feeds both the reader builder and the
            # Configuration object, so the two cannot drift apart.
            s3a_settings = [("fs.s3a.access.key", key),
                            ("fs.s3a.secret.key", secret),
                            ("fs.s3a.endpoint", endpoint)]
            if proxy is not None and proxy_port is not None:
                s3a_settings.append(("fs.s3a.proxy.host", proxy))
                s3a_settings.append(("fs.s3a.proxy.port", proxy_port))
            elif proxy is not None or proxy_port is not None:
                # Exactly one of the two proxy settings was supplied.
                raise ValueError('wrong proxy & proxy_port configuration')

            builder = ArrowCarbonReader().builder(self.path)
            for conf_name, conf_value in s3a_settings:
                builder = builder.withHadoopConf(conf_name, conf_value)
            carbon_splits = builder.getSplits(True)

            configuration = Configuration()
            for conf_name, conf_value in s3a_settings:
                configuration.set(conf_name, conf_value)
            self.configuration = configuration

            if is_manifest:
                from obs import ObsClient
                obsClient = ObsClient(access_key_id=key,
                                      secret_access_key=secret,
                                      server=str(endpoint).replace(
                                          'http://', ''),
                                      long_conn_mode=True)
                sources = manifest.getSources(self.manifest_path, CARBON,
                                              obsClient)
                if sources:
                    self.file_path = sources[0]
                else:
                    raise Exception("Manifest source can't be None!")
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.file_path, self.configuration.conf)
            else:
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.path, self.configuration.conf)

            piece_kwargs = dict(key=key,
                                secret=secret,
                                endpoint=endpoint,
                                proxy=proxy,
                                proxy_port=proxy_port)

        else:
            if is_manifest:
                sources = manifest.getSources(self.manifest_path, CARBON)
                if sources:
                    self.file_path = sources[0]
                else:
                    raise Exception("Manifest source can't be None!")

                try:
                    carbon_schema = CarbonSchemaReader().readSchema(
                        self.file_path)
                except Exception:
                    # Not a bare except: KeyboardInterrupt/SystemExit must
                    # propagate instead of being reported as schema errors.
                    raise Exception("readSchema has some errors: " +
                                    str(self.file_path))
            else:
                try:
                    carbon_schema = CarbonSchemaReader().readSchema(self.path)
                except Exception:
                    raise Exception("readSchema has some errors")

            carbon_splits = ArrowCarbonReader().builder(self.path) \
              .getSplits(True)

        for split in carbon_splits:
            folder_path = path
            if is_manifest:
                # Pieces of a manifest dataset are rooted at the directory
                # that holds the first source file.
                first_source = str(self.file_path)
                folder_path = first_source[0:first_source.rindex('/')]
            self.pieces.append(
                CarbonDatasetPiece(folder_path, carbon_schema, split,
                                   **piece_kwargs))

        self.number_of_splits = len(self.pieces)
        self.schema = self.getArrowSchema()
        # TODO add mechanism to get the file path based on file filter
        self.common_metadata_path = self.url_path.path + '/_common_metadata'
        self.common_metadata = None
        try:
            if self.fs.exists(self.common_metadata_path):
                with self.fs.open(self.common_metadata_path) as f:
                    self.common_metadata = ParquetFile(f).metadata
        except Exception:
            # Best effort: an unreadable metadata file is treated the same
            # as a missing one.
            self.common_metadata = None