Example #1
    def __init__(self,
                 dirpath,
                 filesystem=None,
                 pathsep='/',
                 partition_scheme='hive',
                 metadata_nthreads=1):
        self.filesystem = filesystem or _get_fs_from_path(dirpath)
        self.pathsep = pathsep
        self.dirpath = _stringify_path(dirpath)
        self.partition_scheme = partition_scheme
        self.partitions = ParquetPartitions()
        self.pieces = []
        self._metadata_nthreads = metadata_nthreads
        self._thread_pool = futures.ThreadPoolExecutor(
            max_workers=metadata_nthreads)

        self.common_metadata_path = None
        self.metadata_path = None

        self._visit_level(0, self.dirpath, [])

        # Due to concurrency, pieces can end up out of order if the
        # dataset is partitioned, so we sort them to yield stable results
        self.pieces.sort(key=lambda piece: piece.path)

        if self.common_metadata_path is None:
            # _common_metadata is a subset of _metadata
            self.common_metadata_path = self.metadata_path

        self._thread_pool.shutdown()
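
A minimal usage sketch for the constructor above; the enclosing class name
(ParquetManifest) and its import path are assumptions, since only __init__ is
shown:

# Hypothetical usage; ParquetManifest is an assumed class name.
from pyarrow.parquet import ParquetManifest

# Scan a Hive-partitioned directory, reading metadata with 4 threads.
manifest = ParquetManifest('/data/events', metadata_nthreads=4)
for piece in manifest.pieces:   # pieces are sorted by path (see above)
    print(piece.path)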
Example #2
def read_table(source, columns=None, use_threads=True, metadata=None,
               use_pandas_metadata=False, memory_map=True):
    if _is_path_like(source):
        fs = _get_fs_from_path(source)
        return fs.read_parquet(source, columns=columns,
                               use_threads=use_threads, metadata=metadata,
                               use_pandas_metadata=use_pandas_metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, use_threads=use_threads,
                   use_pandas_metadata=use_pandas_metadata)
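
A hedged usage sketch based only on the signature above; the file name is
illustrative:

import pyarrow.parquet as pq  # assuming read_table lives in this module

# Read two columns in parallel, then convert to pandas.
table = pq.read_table('example.parquet', columns=['a', 'b'],
                      use_threads=True)
df = table.to_pandas()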
Example #3
    def __init__(self,
                 path_or_paths,
                 filesystem=None,
                 schema=None,
                 metadata=None,
                 split_row_groups=False,
                 validate_schema=True,
                 filters=None,
                 metadata_nthreads=1):
        if filesystem is None:
            a_path = path_or_paths
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.paths = path_or_paths

        (self.pieces, self.partitions, self.common_metadata_path,
         self.metadata_path) = _make_manifest(
             path_or_paths, self.fs, metadata_nthreads=metadata_nthreads)

        if self.common_metadata_path is not None:
            with self.fs.open(self.common_metadata_path) as f:
                self.common_metadata = ParquetFile(f).metadata
        else:
            self.common_metadata = None

        if metadata is None and self.metadata_path is not None:
            with self.fs.open(self.metadata_path) as f:
                self.metadata = ParquetFile(f).metadata
        else:
            self.metadata = metadata

        self.schema = schema

        self.split_row_groups = split_row_groups

        if split_row_groups:
            raise NotImplementedError("split_row_groups not yet implemented")

        if validate_schema:
            self.validate_schemas()

        if filters is not None:
            filters = _check_filters(filters)
            self._filter(filters)
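
A usage sketch; ParquetDataset as the enclosing class is an assumption
inferred from the attributes (pieces, partitions, metadata) it sets:

# Hypothetical usage; only the __init__ signature above is given.
from pyarrow.parquet import ParquetDataset

dataset = ParquetDataset('/data/events', metadata_nthreads=2)
# filters could also be passed, e.g. filters=[('year', '=', 2020)]
print(len(dataset.pieces), dataset.common_metadata_path)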
Example #4
    def __init__(self,
                 where,
                 schema,
                 flavor=None,
                 version='1.0',
                 use_dictionary=True,
                 compression='snappy',
                 use_deprecated_int96_timestamps=None,
                 **options):
        if use_deprecated_int96_timestamps is None:
            # Use int96 timestamps for Spark
            if flavor is not None and 'spark' in flavor:
                use_deprecated_int96_timestamps = True
            else:
                use_deprecated_int96_timestamps = False

        self.flavor = flavor
        if flavor is not None:
            schema, self.schema_changed = _sanitize_schema(schema, flavor)
        else:
            self.schema_changed = False

        self.schema = schema
        self.where = where

        # If we open a file using an implied filesystem, keep the handle
        # so the file can be closed reliably
        self.file_handle = None

        if _is_path_like(where):
            fs = _get_fs_from_path(where)
            sink = self.file_handle = fs.open(where, 'wb')
        else:
            sink = where

        self.writer = _parquet.ParquetWriter(
            sink,
            schema,
            version=version,
            compression=compression,
            use_dictionary=use_dictionary,
            use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
            **options)
        self.is_open = True
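
A usage sketch; write_table and close as methods of this writer class are
assumptions, since only __init__ appears above:

import pyarrow as pa
from pyarrow.parquet import ParquetWriter  # assumed import path

schema = pa.schema([('id', pa.int64()), ('name', pa.string())])
# flavor='spark' enables int96 timestamps and schema sanitizing above.
writer = ParquetWriter('out.parquet', schema, flavor='spark',
                       compression='snappy')
writer.write_table(pa.Table.from_pydict({'id': [1, 2],
                                         'name': ['a', 'b']}))
writer.close()  # also closes self.file_handle for path-like `where`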
Example #5
def write_to_dataset(table,
                     root_path,
                     partition_cols=None,
                     filesystem=None,
                     preserve_index=True,
                     **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    a subdirectories are created in the following
    manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string
        The root directory of the dataset
    partition_cols : list
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    preserve_index : bool
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict
        kwargs for the write_table function.
    """
    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError('No data left to save outside partition columns')
        subschema = table.schema
        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for partition_col in partition_cols:
            subschema = subschema.remove(
                subschema.get_field_index(partition_col))
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = '/'.join([
                '{colname}={value}'.format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = pa.Table.from_pandas(subgroup,
                                            preserve_index=preserve_index,
                                            schema=subschema,
                                            safe=False)
            prefix = '/'.join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + '.parquet'
            full_path = '/'.join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = guid() + '.parquet'
        full_path = '/'.join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
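
A usage sketch matching the Hive-style layout described in the docstring:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'group1': ['a', 'a', 'b'],
                   'group2': [1, 2, 1],
                   'value': [0.1, 0.2, 0.3]})
table = pa.Table.from_pandas(df)
# Writes root/group1=a/group2=1/<uuid>.parquet and so on.
write_to_dataset(table, 'root', partition_cols=['group1', 'group2'])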
Example #6
    def __init__(self,
                 path,
                 key=None,
                 secret=None,
                 endpoint=None,
                 proxy=None,
                 proxy_port=None,
                 filesystem=None):
        self.path = path
        self.url_path = urlparse(path)

        if str(path).endswith(".manifest"):
            self.manifest_path = path
            if str(path).startswith(LOCAL_FILE_PREFIX):
                self.manifest_path = str(path)[len(LOCAL_FILE_PREFIX):]

        if filesystem is None:
            a_path = self.path
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.pieces = list()

        if self.url_path.scheme == 's3a':
            if key is None or secret is None or endpoint is None:
                raise ValueError('key, secret, endpoint should not be None')

            if proxy is None and proxy_port is None:
                carbon_splits = ArrowCarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .getSplits(True)

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)

                self.configuration = configuration

            elif proxy is not None and proxy_port is not None:
                carbon_splits = ArrowCarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .withHadoopConf("fs.s3a.proxy.host", proxy) \
                  .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
                  .getSplits(True)

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)
                configuration.set("fs.s3a.proxy.host", proxy)
                configuration.set("fs.s3a.proxy.port", proxy_port)

                self.configuration = configuration
            else:
                raise ValueError('proxy and proxy_port must be set together')

            if str(path).endswith(".manifest"):
                from obs import ObsClient
                obsClient = ObsClient(access_key_id=key,
                                      secret_access_key=secret,
                                      server=str(endpoint).replace(
                                          'http://', ''),
                                      long_conn_mode=True)
                sources = manifest.getSources(self.manifest_path, CARBON,
                                              obsClient)
                if sources:
                    self.file_path = sources[0]
                else:
                    raise Exception("Manifest source can't be None!")
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.file_path, self.configuration.conf)
            else:
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.path, self.configuration.conf)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                folder_path = path
                if str(path).endswith(".manifest"):
                    folder_path = str(
                        self.file_path)[0:(str(self.file_path).rindex('/'))]
                self.pieces.append(
                    CarbonDatasetPiece(folder_path,
                                       carbon_schema,
                                       split,
                                       key=key,
                                       secret=secret,
                                       endpoint=endpoint,
                                       proxy=proxy,
                                       proxy_port=proxy_port))

        else:
            if str(path).endswith(".manifest"):
                sources = manifest.getSources(self.manifest_path, CARBON)
                if sources:
                    self.file_path = sources[0]
                else:
                    raise Exception("Manifest source can't be None!")

                try:
                    carbon_schema = CarbonSchemaReader().readSchema(
                        self.file_path)
                except Exception:
                    raise Exception("readSchema failed for: " +
                                    self.file_path)
            else:
                try:
                    carbon_schema = CarbonSchemaReader().readSchema(self.path)
                except Exception:
                    raise Exception("readSchema failed")

            carbon_splits = ArrowCarbonReader().builder(self.path) \
              .getSplits(True)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                if str(path).endswith(".manifest"):
                    self.pieces.append(
                        CarbonDatasetPiece(
                            str(self.file_path)[0:(
                                str(self.file_path).rindex('/'))],
                            carbon_schema, split))
                else:
                    self.pieces.append(
                        CarbonDatasetPiece(path, carbon_schema, split))

        self.number_of_splits = len(self.pieces)
        self.schema = self.getArrowSchema()
        # TODO add mechanism to get the file path based on file filter
        self.common_metadata_path = self.url_path.path + '/_common_metadata'
        self.common_metadata = None
        try:
            if self.fs.exists(self.common_metadata_path):
                with self.fs.open(self.common_metadata_path) as f:
                    self.common_metadata = ParquetFile(f).metadata
        except Exception:
            self.common_metadata = None
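
A hypothetical usage sketch; the enclosing class name (CarbonDataset) is an
assumption, and the credentials are placeholders:

# s3a paths require key, secret and endpoint, per the check above.
dataset = CarbonDataset('s3a://bucket/events',
                        key='ACCESS_KEY',
                        secret='SECRET_KEY',
                        endpoint='http://obs.example.com')
print(dataset.number_of_splits, dataset.schema)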
Example #7
    def __init__(self,
                 path,
                 key=None,
                 secret=None,
                 endpoint=None,
                 proxy=None,
                 proxy_port=None,
                 filesystem=None):
        self.path = path
        self.url_path = urlparse(path)

        if filesystem is None:
            a_path = self.path
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.pieces = list()

        if self.url_path.scheme == 's3a':
            if key is None or secret is None or endpoint is None:
                raise ValueError('key, secret, endpoint should not be None')

            if proxy is None and proxy_port is None:
                carbon_splits = CarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .getSplits()

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)

                self.configuration = configuration

            elif proxy is not None and proxy_port is not None:
                carbon_splits = CarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .withHadoopConf("fs.s3a.proxy.host", proxy) \
                  .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
                  .getSplits()

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)
                configuration.set("fs.s3a.proxy.host", proxy)
                configuration.set("fs.s3a.proxy.port", proxy_port)

                self.configuration = configuration
            else:
                raise ValueError('proxy and proxy_port must be set together')

            carbon_schema = CarbonSchemaReader().readSchema(
                self.path, self.configuration.conf)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                self.pieces.append(
                    CarbonDatasetPiece(path,
                                       carbon_schema,
                                       split,
                                       key=key,
                                       secret=secret,
                                       endpoint=endpoint,
                                       proxy=proxy,
                                       proxy_port=proxy_port))

        else:
            carbon_splits = CarbonReader().builder(self.path) \
              .getSplits()

            carbon_schema = CarbonSchemaReader().readSchema(self.path)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                self.pieces.append(
                    CarbonDatasetPiece(path, carbon_schema, split))

        self.number_of_splits = len(self.pieces)
        self.schema = self.getArrowSchema()
        # TODO add mechanism to get the file path based on file filter
        self.common_metadata_path = self.url_path.path + '/_common_metadata'
        self.common_metadata = None
        try:
            if self.fs.exists(self.common_metadata_path):
                with self.fs.open(self.common_metadata_path) as f:
                    self.common_metadata = ParquetFile(f).metadata
        except Exception:
            self.common_metadata = None
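
Compared with Example #6, this variant uses the blocking CarbonReader and has
no .manifest handling. A usage sketch for a local (non-s3a) path, again
assuming CarbonDataset as the class name:

dataset = CarbonDataset('/data/carbon_table')  # no credentials needed locally
print(dataset.number_of_splits)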