Example #1
def _get_filesystem_and_path(passed_filesystem, path):
    if passed_filesystem is None:
        return get_filesystem_from_uri(path)
    else:
        passed_filesystem = _ensure_filesystem(passed_filesystem)
        parsed_path = _parse_uri(path)
        return passed_filesystem, parsed_path
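A minimal usage sketch for the helper above, assuming the legacy pyarrow-style behaviour where a URI is resolved into a filesystem object plus the path that filesystem understands (the variable names and commented return values are illustrative assumptions):

# Hypothetical usage; get_filesystem_from_uri, _ensure_filesystem and _parse_uri
# are provided by the surrounding library, not by this snippet.
fs, resolved_path = _get_filesystem_and_path(None, "s3://bucket/table/part-0.parquet")
# filesystem inferred from the URI scheme, plus the path it understands

fs, resolved_path = _get_filesystem_and_path(existing_fs, "/data/table/part-0.parquet")
# an explicitly passed filesystem is validated/wrapped and the path is only parsed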
Example #2
def _get_filesystem_and_path(passed_filesystem, path):
    if passed_filesystem is None:
        return resolve_filesystem_and_path(path, passed_filesystem)
    else:
        passed_filesystem = _ensure_filesystem(passed_filesystem)
        parsed_path = _parse_uri(path)
        return passed_filesystem, parsed_path
Example #3
def write_dataset(
    df, path, partition_cols, preserve_index, session_primitives, file_format, mode
):
    fs = get_fs(session_primitives=session_primitives)
    fs = _ensure_filesystem(fs)
    mkdir_if_not_exists(fs, path)
    partition_paths = []
    dead_keys = []
    for keys, subgroup in df.groupby(partition_cols):
        subgroup = subgroup.drop(partition_cols, axis="columns")
        if not isinstance(keys, tuple):
            keys = (keys,)
        subdir = "/".join(
            [
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ]
        )
        prefix = "/".join([path, subdir])
        if mode == "overwrite_partitions":
            dead_keys += list_objects(prefix, session_primitives=session_primitives)
        full_path = write_file(
            df=subgroup,
            path=prefix,
            preserve_index=preserve_index,
            session_primitives=session_primitives,
            file_format=file_format,
        )
        partition_path = full_path.rpartition("/")[0] + "/"
        keys_str = [str(x) for x in keys]
        partition_paths.append((partition_path, keys_str))
    if mode == "overwrite_partitions" and dead_keys:
        bucket = path.replace("s3://", "").split("/", 1)[0]
        delete_listed_objects(bucket, dead_keys, session_primitives=session_primitives)
    return partition_paths
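A hedged call sketch for write_dataset; session_primitives stands in for whatever AWS session/configuration object the surrounding library threads through (its exact type is an assumption here, as are the bucket and column names):

# Hypothetical call; df is a pandas DataFrame and session is an assumed session object.
partition_paths = write_dataset(
    df=df,
    path="s3://my-bucket/my-table",
    partition_cols=["year", "month"],
    preserve_index=False,
    session_primitives=session,
    file_format="parquet",
    mode="overwrite_partitions",
)
# Each entry pairs a partition prefix with its stringified key values,
# e.g. ("s3://my-bucket/my-table/year=2020/month=1/", ["2020", "1"]).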
Example #4
    def __init__(self,
                 path_or_paths,
                 filesystem=None,
                 schema=None,
                 metadata=None,
                 split_row_groups=False,
                 validate_schema=True,
                 filters=None,
                 metadata_nthreads=1):
        if filesystem is None:
            a_path = path_or_paths
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.paths = path_or_paths

        (self.pieces, self.partitions, self.common_metadata_path,
         self.metadata_path) = _make_manifest(
             path_or_paths, self.fs, metadata_nthreads=metadata_nthreads)

        if self.common_metadata_path is not None:
            with self.fs.open(self.common_metadata_path) as f:
                self.common_metadata = ParquetFile(f).metadata
        else:
            self.common_metadata = None

        if metadata is None and self.metadata_path is not None:
            with self.fs.open(self.metadata_path) as f:
                self.metadata = ParquetFile(f).metadata
        else:
            self.metadata = metadata

        self.schema = schema

        self.split_row_groups = split_row_groups

        if split_row_groups:
            raise NotImplementedError("split_row_groups not yet implemented")

        if validate_schema:
            self.validate_schemas()

        if filters is not None:
            filters = _check_filters(filters)
            self._filter(filters)
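This constructor appears to match the legacy pyarrow.parquet.ParquetDataset API, so a typical call could look like the sketch below (the dataset path and filter values are illustrative):

# Minimal sketch using the legacy ParquetDataset interface.
import pyarrow.parquet as pq

dataset = pq.ParquetDataset("dataset_root", filters=[("year", "=", 2020)])
table = dataset.read()  # materialize the filtered pieces as a pyarrow Table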
Example #5
def write_file(df, path, preserve_index, session_primitives, file_format):
    fs = get_fs(session_primitives=session_primitives)
    fs = _ensure_filesystem(fs)
    mkdir_if_not_exists(fs, path)
    if file_format == "parquet":
        outfile = guid() + ".parquet"
    elif file_format == "csv":
        outfile = guid() + ".csv"
    full_path = "/".join([path, outfile])
    if file_format == "parquet":
        write_parquet_dataframe(
            df=df, path=full_path, preserve_index=preserve_index, fs=fs
        )
    elif file_format == "csv":
        write_csv_dataframe(df=df, path=full_path, preserve_index=preserve_index, fs=fs)
    return full_path
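A brief hedged sketch of calling write_file; as in the earlier example, session is an assumed session/configuration object and the S3 path is illustrative:

# Hypothetical call; df is a pandas DataFrame.
full_path = write_file(
    df=df,
    path="s3://my-bucket/staging",
    preserve_index=False,
    session_primitives=session,
    file_format="csv",
)
# full_path ends in a generated "<guid>.csv" under the given path.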
Example #6
def write_to_dataset(table,
                     root_path,
                     partition_cols=None,
                     filesystem=None,
                     preserve_index=True,
                     **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following
    manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict, kwargs for write_table function.
    """
    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError('No data left to save outside partition columns')
        subschema = table.schema
        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for partition_col in partition_cols:
            subschema = subschema.remove(
                subschema.get_field_index(partition_col))
        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = '/'.join([
                '{colname}={value}'.format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ])
            subtable = pa.Table.from_pandas(subgroup,
                                            preserve_index=preserve_index,
                                            schema=subschema,
                                            safe=False)
            prefix = '/'.join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + '.parquet'
            full_path = '/'.join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = guid() + '.parquet'
        full_path = '/'.join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
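This function appears to be pyarrow's legacy parquet.write_to_dataset, so a typical public-API call looks like the minimal sketch below (the DataFrame and column names are illustrative):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"year": [2019, 2019, 2020], "value": [1.0, 2.0, 3.0]})
table = pa.Table.from_pandas(df)

# One year=<value> subdirectory per distinct key, each holding a <uuid>.parquet file.
pq.write_to_dataset(table, root_path="dataset_root", partition_cols=["year"])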
Example #7
    def __init__(self,
                 path,
                 key=None,
                 secret=None,
                 endpoint=None,
                 proxy=None,
                 proxy_port=None,
                 filesystem=None):
        self.path = path
        self.url_path = urlparse(path)

        if str(path).endswith(".manifest"):
            self.manifest_path = path
            if str(path).startswith(LOCAL_FILE_PREFIX):
                self.manifest_path = str(path)[len(LOCAL_FILE_PREFIX):]

        if filesystem is None:
            a_path = self.path
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.pieces = list()

        if self.url_path.scheme == 's3a':
            if key is None or secret is None or endpoint is None:
                raise ValueError('key, secret, endpoint should not be None')

            if proxy is None and proxy_port is None:
                carbon_splits = ArrowCarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .getSplits(True)

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)

                self.configuration = configuration

            elif proxy is not None and proxy_port is not None:
                carbon_splits = ArrowCarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .withHadoopConf("fs.s3a.proxy.host", proxy) \
                  .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
                  .getSplits(True)

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)
                configuration.set("fs.s3a.proxy.host", proxy)
                configuration.set("fs.s3a.proxy.port", proxy_port)

                self.configuration = configuration
            else:
                raise ValueError('wrong proxy & proxy_port configuration')

            if str(path).endswith(".manifest"):
                from obs import ObsClient
                obsClient = ObsClient(access_key_id=key,
                                      secret_access_key=secret,
                                      server=str(endpoint).replace(
                                          'http://', ''),
                                      long_conn_mode=True)
                sources = manifest.getSources(self.manifest_path, CARBON,
                                              obsClient)
                if sources:
                    self.file_path = sources[0]
                else:
                    raise Exception("Manifest source can't be None!")
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.file_path, self.configuration.conf)
            else:
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.path, self.configuration.conf)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                folder_path = path
                if str(path).endswith(".manifest"):
                    folder_path = str(
                        self.file_path)[0:(str(self.file_path).rindex('/'))]
                self.pieces.append(
                    CarbonDatasetPiece(folder_path,
                                       carbon_schema,
                                       split,
                                       key=key,
                                       secret=secret,
                                       endpoint=endpoint,
                                       proxy=proxy,
                                       proxy_port=proxy_port))

        else:
            if str(path).endswith(".manifest"):
                sources = manifest.getSources(self.manifest_path, CARBON)
                if sources:
                    self.file_path = sources[0]
                else:
                    raise Exception("Manifest source can't be None!")

                try:
                    carbon_schema = CarbonSchemaReader().readSchema(
                        self.file_path)
                except Exception:
                    raise Exception("readSchema has some errors: " +
                                    self.file_path)
            else:
                try:
                    carbon_schema = CarbonSchemaReader().readSchema(self.path)
                except Exception:
                    raise Exception("readSchema has some errors")

            carbon_splits = ArrowCarbonReader().builder(self.path) \
              .getSplits(True)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                if str(path).endswith(".manifest"):
                    self.pieces.append(
                        CarbonDatasetPiece(
                            str(self.file_path)[0:(
                                str(self.file_path).rindex('/'))],
                            carbon_schema, split))
                else:
                    self.pieces.append(
                        CarbonDatasetPiece(path, carbon_schema, split))

        self.number_of_splits = len(self.pieces)
        self.schema = self.getArrowSchema()
        # TODO add mechanism to get the file path based on file filter
        self.common_metadata_path = self.url_path.path + '/_common_metadata'
        self.common_metadata = None
        try:
            if self.fs.exists(self.common_metadata_path):
                with self.fs.open(self.common_metadata_path) as f:
                    self.common_metadata = ParquetFile(f).metadata
        except Exception:
            self.common_metadata = None
Example #8
    def __init__(self,
                 path,
                 key=None,
                 secret=None,
                 endpoint=None,
                 proxy=None,
                 proxy_port=None,
                 filesystem=None):
        self.path = path
        self.url_path = urlparse(path)

        if filesystem is None:
            a_path = self.path
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.pieces = list()

        if self.url_path.scheme == 's3a':
            if key is None or secret is None or endpoint is None:
                raise ValueError('key, secret, endpoint should not be None')

            if proxy is None and proxy_port is None:
                carbon_splits = CarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .getSplits()

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)

                self.configuration = configuration

            elif proxy is not None and proxy_port is not None:
                carbon_splits = CarbonReader().builder(self.path) \
                  .withHadoopConf("fs.s3a.access.key", key) \
                  .withHadoopConf("fs.s3a.secret.key", secret) \
                  .withHadoopConf("fs.s3a.endpoint", endpoint) \
                  .withHadoopConf("fs.s3a.proxy.host", proxy) \
                  .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
                  .getSplits()

                configuration = Configuration()
                configuration.set("fs.s3a.access.key", key)
                configuration.set("fs.s3a.secret.key", secret)
                configuration.set("fs.s3a.endpoint", endpoint)
                configuration.set("fs.s3a.proxy.host", proxy)
                configuration.set("fs.s3a.proxy.port", proxy_port)

                self.configuration = configuration
            else:
                raise ValueError('wrong proxy & proxy_port configuration')

            carbon_schema = CarbonSchemaReader().readSchema(
                self.path, self.configuration.conf)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                self.pieces.append(
                    CarbonDatasetPiece(path,
                                       carbon_schema,
                                       split,
                                       key=key,
                                       secret=secret,
                                       endpoint=endpoint,
                                       proxy=proxy,
                                       proxy_port=proxy_port))

        else:
            carbon_splits = CarbonReader().builder(self.path) \
              .getSplits()

            carbon_schema = CarbonSchemaReader().readSchema(self.path)

            for split in carbon_splits:
                # split = self.url_path.scheme + "://" + self.url_path.netloc + split
                self.pieces.append(
                    CarbonDatasetPiece(path, carbon_schema, split))

        self.number_of_splits = len(self.pieces)
        self.schema = self.getArrowSchema()
        # TODO add mechanism to get the file path based on file filter
        self.common_metadata_path = self.url_path.path + '/_common_metadata'
        self.common_metadata = None
        try:
            if self.fs.exists(self.common_metadata_path):
                with self.fs.open(self.common_metadata_path) as f:
                    self.common_metadata = ParquetFile(f).metadata
        except Exception:
            self.common_metadata = None
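Both constructors above appear to belong to a CarbonData dataset reader; a heavily hedged instantiation sketch follows (the class name CarbonDataset, the endpoint, and the credentials are assumptions, not taken from the snippets):

# Hypothetical usage; CarbonDataset is the assumed name of the class whose
# __init__ is shown above, and the credentials are placeholders.
dataset = CarbonDataset(
    "s3a://my-bucket/carbon_table",
    key="ACCESS_KEY_ID",
    secret="SECRET_ACCESS_KEY",
    endpoint="http://obs.example.com",
)
print(dataset.number_of_splits)  # one CarbonDatasetPiece per split
print(dataset.schema)            # Arrow schema derived via getArrowSchema()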