Example #1
import os, glob, warnings
import numpy as np
import numba as nb
import pandas as pd
import pyarrow.parquet as pq

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

from load_config import c, config

if __name__ == '__main__':
    filelist = sorted(glob.glob(f"../pq/fdim_hres_dump_*.pq"))
    df = pq.ParquetDataset(filelist).read(nthreads=16).to_pandas()

    #---- Plotting
    fig = plt.figure(1, figsize=(4.5, 3))
    fig.clf()
    sns.set_context('paper')
    sns.set_style(
        'ticks', {
            'axes.grid': False,
            'axes.linewidth': '0.75',
            'grid.color': '0.75',
            'grid.linestyle': u':',
            'legend.frameon': True,
        })
    plt.rc('text', usetex=True)
    plt.rc('font', family='Serif')
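
Note that read(nthreads=16) uses an older pyarrow keyword; newer pyarrow releases replace it with a boolean use_threads flag. A minimal equivalent sketch of the load step, assuming the same file layout:

import glob

import pyarrow.parquet as pq

# use_threads replaces the deprecated nthreads argument on newer pyarrow
filelist = sorted(glob.glob("../pq/fdim_hres_dump_*.pq"))
df = pq.ParquetDataset(filelist).read(use_threads=True).to_pandas()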
Example #2
def _read_pyarrow(
    fs,
    fs_token,
    paths,
    columns=None,
    filters=None,
    categories=None,
    index=None,
    infer_divisions=None,
):
    from ...bytes.core import get_pyarrow_filesystem
    import pyarrow.parquet as pq

    # In pyarrow, the physical storage field names may differ from
    # the actual dataframe names. This is true for Index names when
    # PyArrow >= 0.8.
    # We would like to resolve these to the correct dataframe names
    # as soon as possible.

    if isinstance(categories, string_types):
        categories = [categories]
    elif categories is None:
        categories = []
    else:
        categories = list(categories)

    if isinstance(columns, tuple):
        columns = list(columns)

    dataset = pq.ParquetDataset(
        paths, filesystem=get_pyarrow_filesystem(fs), filters=filters
    )
    if dataset.partitions is not None:
        partitions = [n for n in dataset.partitions.partition_names if n is not None]
    else:
        partitions = []
    schema = dataset.schema.to_arrow_schema()
    has_pandas_metadata = schema.metadata is not None and b"pandas" in schema.metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(schema.metadata[b"pandas"].decode("utf8"))
        index_names, column_names, storage_name_mapping, column_index_names = _parse_pandas_metadata(
            pandas_metadata
        )
    else:
        index_names = []
        column_names = schema.names
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]

    column_names += [p for p in partitions if p not in column_names]
    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names
    )

    all_columns = index_names + column_names

    # Find non-empty pieces
    non_empty_pieces = []
    # Determine valid pieces
    _open = lambda fn: pq.ParquetFile(fs.open(fn, mode="rb"))
    for piece in dataset.pieces:
        pf = piece.get_metadata(_open)
        # non_empty_pieces.append(piece)
        if pf.num_row_groups > 0:
            non_empty_pieces.append(piece)

    # Sort pieces naturally
    # If a single input path resulted in multiple dataset pieces, then sort
    # the pieces naturally. If multiple paths were supplied then we leave
    # the order of the resulting pieces unmodified
    if len(paths) == 1 and len(dataset.pieces) > 1:
        non_empty_pieces = sorted(
            non_empty_pieces, key=lambda piece: natural_sort_key(piece.path)
        )

    # Determine divisions
    if len(index_names) == 1:

        # Look up storage name of the single index column
        divisions_names = [
            storage_name
            for storage_name, name in storage_name_mapping.items()
            if index_names[0] == name
        ]

        if divisions_names:
            divisions_name = divisions_names[0]
        else:
            divisions_name = None
    else:
        divisions_name = None

    divisions = _get_pyarrow_divisions(
        non_empty_pieces, divisions_name, schema, infer_divisions
    )

    # Build task
    dtypes = _get_pyarrow_dtypes(schema, categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, column_index_names)
    meta = clear_known_categories(meta, cols=categories)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    task_name = "read-parquet-" + tokenize(fs_token, paths, all_columns)

    if non_empty_pieces:
        task_plan = {
            (task_name, i): (
                _read_pyarrow_parquet_piece,
                fs,
                piece,
                column_names,
                index_names,
                out_type == Series,
                dataset.partitions,
                categories,
            )
            for i, piece in enumerate(non_empty_pieces)
        }
    else:
        meta = strip_unknown_categories(meta)
        task_plan = {(task_name, 0): meta}

    return out_type(task_plan, task_name, meta, divisions)
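
For context, this private helper sits behind dask.dataframe's public Parquet reader. A hedged usage sketch of the public entry point that dispatches to it (path and column names are placeholders):

import dask.dataframe as dd

# engine="pyarrow" selects the reader shown above; path and columns are placeholders
ddf = dd.read_parquet("data/*.parquet", engine="pyarrow", columns=["a", "b"])
print(ddf.head())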
Example #3
def save_data(run_all=True):

    for dataset_item in db.get_all(data_source='Chicago Data Portal'):
        print(dataset_item)
        dataset = dataset_item['dataset']
        if dataset == 'business_grants':
            print(dataset)
            today = datetime.datetime.today().date()
            date_list = set([today.strftime('%Y-%m')])
            date_list.add(
                (today - datetime.timedelta(days=32)).strftime('%Y-%m'))
            date_list = sorted(
                list(
                    set([(today - datetime.timedelta(days=x)).strftime('%Y-%m')
                         for x in range(32)])))
            paths = []

            if run_all:
                paths = ['bnroths/chicago-data/%s' % dataset]
                cnts = {}

            else:
                for month in date_list:
                    year, month = month.split('-')
                    paths.append('bnroths/chicago-data/%s/year=%s/month=%s' %
                                 (dataset, year, month))
                print(paths)
                cnts = datasets[dataset]['cnts']
                # exit(0)

            print(paths)
            for path in paths:
                ds = pq.ParquetDataset(path_or_paths=path,
                                       filesystem=S3FS,
                                       validate_schema=False)
                # print datasets[dataset].keys()
                columns = dataset_item['columns']
                dt = columns[1]
                table = ds.read()
                df = table.to_pandas()
                print(df.columns)
                print(df.head())
                # exit(0)
                df['dt'] = df[dt].astype(str).str[:7]

                groups = dict(list(df.groupby('dt')))
                print(groups.keys())
                # exit(0)
                for group in groups:
                    print(group, type(group))
                    if group != "None":  # there is seriously a blank date
                        year, month = group.split('-')

                        a = groups[group][['longitude', 'latitude']].to_json(
                            orient='values')
                        if dataset == 'building_permits':
                            if group >= '2016':
                                cnts[group] = groups[group].count()[0]
                        elif dataset == 'business_liscenses':
                            if group >= '2002':
                                cnts[group] = groups[group].count()[0]
                        else:
                            cnts[group] = groups[group].count()[0]

                        filename = '../data/%s/%s-%s/all.json' % (dataset,
                                                                  year, month)

                        if not os.path.exists(os.path.dirname(filename)):
                            try:
                                os.makedirs(os.path.dirname(filename))
                            except OSError as exc:  # Guard against race condition
                                if exc.errno != errno.EEXIST:
                                    raise

                        with open(filename, 'w') as f:
                            f.write(a)

                        ## write to s3
                        s3.save_file_public(local='../data/%s/%s-%s/all.json' %
                                            (dataset, year, month),
                                            dataset=dataset,
                                            dt="%s-%s" % (year, month),
                                            filename='all.json')
                        db.update_col(dataset=dataset,
                                      col='cnts',
                                      update=json.dumps(cnts))
    return None
Example #4
    parser.add_argument('--print-values', action='store_true',
                        help='Print index values (dataset piece indexes)')
    parser.add_argument('--skip-index', nargs='+', type=str,
                        help='Do not display indexed values for given fields')
    parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                        help='A string denoting the hdfs driver to use (if using a dataset on hdfs). '
                             'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')

    args = parser.parse_args()

    if args.dataset_url and args.dataset_url[-1] == '/':
        args.dataset_url = args.dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    print_all = not args.schema and not args.index
    if args.schema or print_all:
        print('*** Schema from dataset metadata ***')
        print((dataset_metadata.get_schema(dataset)))

    if args.index or print_all:
        index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
        print('*** Row group indexes from dataset metadata ***')
        for index_name in index_dict:
            print(('Index: {}'.format(index_name)))
            if args.skip_index is None or index_name not in args.skip_index:
                for field_value in index_dict[index_name].indexed_values:
                    print('  -- {}({})'.format(field_value,
                                               len(index_dict[index_name].get_row_group_indexes(field_value))))
Example #5
def print_parquet_pandas_shape(bucket_uri, file_system):
    dataset = pq.ParquetDataset(bucket_uri, filesystem=file_system)
    table = dataset.read()
    df = table.to_pandas()
    print(df.shape)
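
A short usage sketch for the helper above, assuming an s3fs filesystem and a placeholder bucket path (the legacy ParquetDataset accepts s3fs-style filesystem objects):

import s3fs

fs = s3fs.S3FileSystem()  # credentials resolved from the environment
print_parquet_pandas_shape("my-bucket/path/to/dataset", fs)  # placeholder bucket path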
Example #6
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        # Read a directory of files in remote storage.
        >>> ds.read_parquet("s3://bucket/path")

        # Read multiple local files.
        >>> ds.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    pq_ds = pq.ParquetDataset(paths, **arrow_parquet_args)

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]
    partitions = pq_ds.partitions

    @ray.remote
    def gen_read(pieces: List[pq.ParquetDatasetPiece]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [
            piece.read(columns=columns,
                       use_threads=False,
                       partitions=partitions) for piece in pieces
        ]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return ArrowBlock(table)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = [p.get_metadata() for p in pieces]
        metadata.append(
            BlockMetadata(num_rows=sum(m.num_rows for m in piece_metadata),
                          size_bytes=sum(
                              sum(
                                  m.row_group(i).total_byte_size
                                  for i in builtins.range(m.num_row_groups))
                              for m in piece_metadata),
                          schema=piece_metadata[0].schema.to_arrow_schema(),
                          input_files=[p.path for p in pieces]))

    return Dataset(LazyBlockList(calls, metadata))
Example #7
def _determine_dataset_parts(fs, paths, gather_statistics, filters,
                             dataset_kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.
    """
    parts = []
    if len(paths) > 1:
        base, fns = _analyze_paths(paths, fs)
        if "_metadata" in fns:
            # We have a _metadata file
            # PyArrow cannot handle "_metadata"
            # when `paths` is a list.
            paths.remove(base + fs.sep + "_metadata")
            fns.remove("_metadata")
            if gather_statistics is not False:
                # If we are allowed to gather statistics,
                # lets use "_metadata" instead of opening
                # every file. Note that we don't need to check if
                # the dataset is flat here, because PyArrow cannot
                # properly handle partitioning in this case anyway.
                dataset = pq.ParquetDataset(
                    base + fs.sep + "_metadata",
                    filesystem=fs,
                    filters=filters,
                    **dataset_kwargs,
                )
                dataset.metadata = dataset.pieces[0].get_metadata()
                dataset.pieces = [SimplePiece(path) for path in paths]
                dataset.partitions = None
                return parts, dataset
        if gather_statistics is not False:
            # This scans all the files
            dataset = pq.ParquetDataset(paths,
                                        filesystem=fs,
                                        filters=filters,
                                        **dataset_kwargs)
            if dataset.schema is None:
                # The dataset may have inconsistent schemas between files.
                # If so, we should try to use a "_common_metadata" file
                proxy_path = (base + fs.sep + "_common_metadata"
                              if "_common_metadata" in fns else paths[0])
                dataset.schema = pq.ParquetDataset(proxy_path,
                                                   filesystem=fs).schema
        else:
            # Rely on schema for 0th file.
            # Will need to pass a list of paths to read_partition
            dataset = pq.ParquetDataset(paths[0],
                                        filesystem=fs,
                                        **dataset_kwargs)
            parts = [base + fs.sep + fn for fn in fns]
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        allpaths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(allpaths, fs)
        # Check if dataset is "not flat" (partitioned into directories).
        # If so, we will need to let pyarrow generate the `dataset` object.
        not_flat = any(
            [fs.isdir(p) for p in fs.glob(fs.sep.join([base, "*"]))])
        if "_metadata" in fns and "validate_schema" not in dataset_kwargs:
            dataset_kwargs["validate_schema"] = False
        if not_flat or "_metadata" in fns or gather_statistics is not False:
            # Let arrow do its thing (use _metadata or scan files)
            dataset = pq.ParquetDataset(paths,
                                        filesystem=fs,
                                        filters=filters,
                                        **dataset_kwargs)
            if dataset.schema is None:
                # The dataset may have inconsistent schemas between files.
                # If so, we should try to use a "_common_metadata" file
                proxy_path = (base + fs.sep + "_common_metadata"
                              if "_common_metadata" in fns else allpaths[0])
                dataset.schema = pq.ParquetDataset(proxy_path,
                                                   filesystem=fs).schema
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                dataset = pq.ParquetDataset(base + fs.sep + "_common_metadata",
                                            filesystem=fs,
                                            **dataset_kwargs)
            else:
                dataset = pq.ParquetDataset(allpaths[0],
                                            filesystem=fs,
                                            **dataset_kwargs)
            parts = [
                base + fs.sep + fn for fn in fns if fn != "_common_metadata"
            ]
    else:
        # There is only one file to read
        dataset = pq.ParquetDataset(paths, filesystem=fs, **dataset_kwargs)
    return parts, dataset
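
For context, the _metadata fast path above depends on a sidecar file written at dataset-creation time. A hedged sketch of producing one with pyarrow, following the pattern from pyarrow's documentation (directory and column names are illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": [1, 2, 3]})
collected = []  # row-group metadata gathered while writing the data files
pq.write_to_dataset(table, "dataset_dir", metadata_collector=collected)
# Consolidate the collected footers into the _metadata sidecar file
pq.write_metadata(table.schema, "dataset_dir/_metadata", metadata_collector=collected)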
Example #8
 def read_parquet(self, filename, to_df=True, **pq_kwargs):
     prqt_name = 's3://{bucket}/{filename}'.format(bucket=self.bucket_name,
                                                   filename=filename)
     prqt = pq.ParquetDataset(prqt_name, filesystem=self.s3fs)
     return prqt.read_pandas(**pq_kwargs).to_pandas() if to_df else prqt
Example #9
 def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
     dataset = pq.ParquetDataset(paths, **kwargs)
     return dataset.read(columns=columns, nthreads=nthreads)
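
A brief usage sketch, assuming the helper is available as a module-level function; it returns a pyarrow.Table, so the pandas conversion is explicit (file names are placeholders, and nthreads is the older spelling of what newer pyarrow calls use_threads):

table = read_multiple_files(["part-0.parquet", "part-1.parquet"], columns=["id", "value"])
df = table.to_pandas()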
Example #10
def get_dataset(key):
    return pq.ParquetDataset(key).read_pandas().to_pandas()
Example #11
File: parquet.py (project: stefanv/dask)
def _read_pyarrow(fs,
                  fs_token,
                  paths,
                  columns=None,
                  filters=None,
                  categories=None,
                  index=None):
    from ...bytes.core import get_pyarrow_filesystem
    import pyarrow.parquet as pq
    import pyarrow as pa

    # In pyarrow, the physical storage field names may differ from
    # the actual dataframe names. This is true for Index names when
    # PyArrow >= 0.8.
    # We would like to resolve these to the correct dataframe names
    # as soon as possible.

    if filters is not None:
        raise NotImplementedError("Predicate pushdown not implemented")

    if isinstance(categories, string_types):
        categories = [categories]
    elif categories is None:
        categories = []
    else:
        categories = list(categories)

    if isinstance(columns, tuple):
        columns = list(columns)

    dataset = pq.ParquetDataset(paths, filesystem=get_pyarrow_filesystem(fs))
    schema = dataset.schema.to_arrow_schema()
    has_pandas_metadata = schema.metadata is not None and b'pandas' in schema.metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(schema.metadata[b'pandas'].decode('utf8'))
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(pandas_metadata))
    else:
        index_names = []
        column_names = schema.names
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]

    if pa.__version__ < distutils.version.LooseVersion('0.8.0'):
        # the pyarrow 0.7.0 *reader* expects the storage names for index names
        # that are None.
        if any(x is None for x in index_names):
            name_storage_mapping = {
                v: k
                for k, v in storage_name_mapping.items()
            }
            index_names = [
                name_storage_mapping.get(name, name) for name in index_names
            ]

    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    all_columns = index_names + column_names

    dtypes = _get_pyarrow_dtypes(schema, categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names,
                             column_index_names)
    meta = clear_known_categories(meta, cols=categories)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    task_name = 'read-parquet-' + tokenize(fs_token, paths, all_columns)

    if dataset.pieces:
        divisions = (None, ) * (len(dataset.pieces) + 1)
        task_plan = {(task_name, i):
                     (_read_pyarrow_parquet_piece, fs, piece, column_names,
                      index_names, out_type == Series, dataset.partitions,
                      categories)
                     for i, piece in enumerate(dataset.pieces)}
    else:
        meta = strip_unknown_categories(meta)
        divisions = (None, None)
        task_plan = {(task_name, 0): meta}

    return out_type(task_plan, task_name, meta, divisions)
Example #12
    def initialize_write(
        df,
        fs,
        path,
        append=False,
        partition_on=None,
        ignore_divisions=False,
        division_info=None,
        schema=None,
        index_cols=None,
        **kwargs,
    ):
        # Infer schema if "infer"
        # (also start with inferred schema if user passes a dict)
        if schema == "infer" or isinstance(schema, dict):

            # Start with schema from _meta_nonempty
            _schema = pa.Schema.from_pandas(
                df._meta_nonempty.set_index(index_cols)
                if index_cols
                else df._meta_nonempty
            )

            # Use dict to update our inferred schema
            if isinstance(schema, dict):
                schema = pa.schema(schema)
                for name in schema.names:
                    i = _schema.get_field_index(name)
                    j = schema.get_field_index(name)
                    _schema = _schema.set(i, schema.field(j))

            # If we have object columns, we need to sample partitions
            # until we find non-null data for each column in `sample`
            sample = [col for col in df.columns if df[col].dtype == "object"]
            if schema_field_supported and sample and schema == "infer":
                delayed_schema_from_pandas = delayed(pa.Schema.from_pandas)
                for i in range(df.npartitions):
                    # Keep data on worker
                    _s = delayed_schema_from_pandas(
                        df[sample].to_delayed()[i]
                    ).compute()
                    for name, typ in zip(_s.names, _s.types):
                        if typ != "null":
                            i = _schema.get_field_index(name)
                            j = _s.get_field_index(name)
                            _schema = _schema.set(i, _s.field(j))
                            sample.remove(name)
                    if not sample:
                        break

            # Final (inferred) schema
            schema = _schema

        dataset = fmd = None
        i_offset = 0
        if append and division_info is None:
            ignore_divisions = True
        fs.mkdirs(path, exist_ok=True)

        if append:
            try:
                # Allow append if the dataset exists.
                # Also need dataset.metadata object if
                # ignore_divisions is False (to check divisions)
                dataset = pq.ParquetDataset(path, filesystem=fs)
                if not dataset.metadata and not ignore_divisions:
                    # TODO: Be more flexible about existing metadata.
                    raise NotImplementedError(
                        "_metadata file needed to `append` "
                        "with `engine='pyarrow'` "
                        "unless `ignore_divisions` is `True`"
                    )
                fmd = dataset.metadata
            except (IOError, ValueError, IndexError):
                # Original dataset does not exist - cannot append
                append = False
        if append:
            names = dataset.metadata.schema.names
            has_pandas_metadata = (
                dataset.schema.to_arrow_schema().metadata is not None
                and b"pandas" in dataset.schema.to_arrow_schema().metadata
            )
            if has_pandas_metadata:
                pandas_metadata = json.loads(
                    dataset.schema.to_arrow_schema().metadata[b"pandas"].decode("utf8")
                )
                categories = [
                    c["name"]
                    for c in pandas_metadata["columns"]
                    if c["pandas_type"] == "categorical"
                ]
            else:
                categories = None
            dtypes = _get_pyarrow_dtypes(dataset.schema.to_arrow_schema(), categories)
            if set(names) != set(df.columns) - set(partition_on):
                raise ValueError(
                    "Appended columns not the same.\n"
                    "Previous: {} | New: {}".format(names, list(df.columns))
                )
            elif (pd.Series(dtypes).loc[names] != df[names].dtypes).any():
                # TODO Coerce values for compatible but different dtypes
                raise ValueError(
                    "Appended dtypes differ.\n{}".format(
                        set(dtypes.items()) ^ set(df.dtypes.iteritems())
                    )
                )
            i_offset = len(dataset.pieces)

            if division_info["name"] not in names:
                ignore_divisions = True
            if not ignore_divisions:
                old_end = None
                row_groups = [
                    dataset.metadata.row_group(i)
                    for i in range(dataset.metadata.num_row_groups)
                ]
                for row_group in row_groups:
                    for i, name in enumerate(names):
                        if name != division_info["name"]:
                            continue
                        column = row_group.column(i)
                        if column.statistics:
                            if not old_end:
                                old_end = column.statistics.max
                            else:
                                old_end = max(old_end, column.statistics.max)
                            break

                divisions = division_info["divisions"]
                if divisions[0] < old_end:
                    raise ValueError(
                        "Appended divisions overlapping with the previous ones"
                        " (set ignore_divisions=True to append anyway).\n"
                        "Previous: {} | New: {}".format(old_end, divisions[0])
                    )

        return fmd, schema, i_offset
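
The schema handling above is driven by the caller-facing schema keyword. A hedged sketch of exercising it through dask.dataframe's writer, pinning one object column's type while the rest is inferred (column names and output path are illustrative):

import pandas as pd
import pyarrow as pa
import dask.dataframe as dd

pdf = pd.DataFrame({"name": ["a", None, "c"], "value": [1.0, 2.0, 3.0]})
ddf = dd.from_pandas(pdf, npartitions=2)
# Pin the dtype of an object column explicitly; remaining columns are inferred as above
ddf.to_parquet("out_dir", engine="pyarrow", schema={"name": pa.string()})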
Example #13
import pyarrow.parquet as pq
import s3fs

file = "C:\\Users\\mlodhi\\OneDrive - Nice Systems Ltd\\Desktop\\Python Pract\\python_practice\\PractPackage\\files\\gender.parquet"
pq_data = pq.ParquetDataset(file)
reader = pq_data.read_pandas()
# print(type(reader))
# print(reader)
# print("Row count : ", reader.num_rows)
# print("Column count : ", reader.num_columns)
# print("Column names : ", reader.column_names)
pq_df = reader.to_pandas()
# print("Read data : \n", pq_df)

data = pq_data.read()
# print("Row count : ", data.num_rows)
# print("Column count : ", data.num_columns)
# print("Column names : ", data.column_names)
dataframe = data.to_pandas()
# print("Read data : \n", dataframe)
# print(type(dataframe))
# print(type(pq_df))
# print(dataframe.shape)
# print(pq_df.shape)

# print(pq_df.isnull().any())
# print(pq_df.isna().any())
# print(pq_df.index)
# print(pq_df.columns)
# print(pq_df.GENDER_CD)
# print(pq_df['GENDER_CD'])
Example #14
    def __init__(self, pyarrow_filesystem, dataset_path, schema_fields=None,
                 shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                 predicate=None, rowgroup_selector=None, reader_pool=None, num_epochs=1,
                 cur_shard=None, shard_count=None, cache=None, infer_schema=False):
        """Initializes a reader object.

        :param pyarrow_filesystem: An instance of ``pyarrow.FileSystem`` that will be used. If not specified,
            then a default one will be selected based on the url (only for ``hdfs://`` or ``file://``; for
            ``s3://`` support, use ``make_reader``). The default hdfs driver is ``libhdfs3``. If you want
            to use ``libhdfs``, use
            ``pyarrow_filesystem=pyarrow.hdfs.connect('hdfs:///some/path', driver='libhdfs')``.
        :param dataset_path: filepath to a parquet directory on the specified filesystem.
            e.g. ``'/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
        :param schema_fields: Either a list of unischema fields to subset, ``None`` to read all fields,
            or an NGram object, in which case an NGram of the specified properties is returned.
        :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
        :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
            break up a row group into for increased shuffling in exchange for worse performance (extra reads).
            For example if you specify 2 each row group read will drop half of the rows within every row group and
            read the remaining rows in separate reads. It is recommended to keep this number below the regular row
            group size in order to not waste reads which drop all rows.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
            This pool is a custom implementation used to parallelize reading data from the dataset.
            Any object from workers_pool package can be used
            (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
        :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
            ``None`` will result in an infinite number of epochs.
        :param cur_shard: An int denoting the current shard number used. Each reader instance should
            pass in a unique shard number in the range ``[0, shard_count)``.
            ``shard_count`` must be supplied as well. Defaults to None
        :param shard_count: An int denoting the number of shard partitions there are. Defaults to None
        :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a parquet
            file the Reader will attempt to load these values from cache. Caching is useful when communication
            to the main data store is either slow or expensive and the local machine has large enough storage
            to store entire dataset (or a partition of a dataset if shards are used).
            By default, use the :class:`.NullCache` implementation.
        """

        # 1. Open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Create a rowgroup ventilator object
        # 5. Start workers pool
        if not (isinstance(schema_fields, collections.Iterable) or isinstance(schema_fields, NGram)
                or schema_fields is None):
            raise ValueError("""Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        self.ngram = schema_fields if isinstance(schema_fields, NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
            raise NotImplementedError('Using timestamp_overlap=False is not implemented with'
                                      ' shuffle_options.shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()

        self._workers_pool = reader_pool or ThreadPool(10)
        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem,
                                         validate_schema=False)

        if infer_schema:
            # If inferring schema, just retrieve the schema from a file of the dataset
            meta = self.dataset.pieces[0].get_metadata(self.dataset.fs.open)
            arrow_schema = meta.schema.to_arrow_schema()
            stored_schema = Unischema.from_arrow_schema(arrow_schema)
        else:
            # Otherwise, get the stored schema
            stored_schema = dataset_metadata.get_schema(self.dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields, collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self.dataset, infer_schema)

        # 3. Filter rowgroups
        filtered_row_group_indexes, worker_predicate = self._filter_row_groups(self.dataset, row_groups, predicate,
                                                                               rowgroup_selector, cur_shard,
                                                                               shard_count)
        # 4. Create a rowgroup ventilator object
        normalized_shuffle_row_drop_partitions = \
            self._normalize_shuffle_options(shuffle_row_drop_partitions, self.dataset)
        ventilator = self._create_ventilator(filtered_row_group_indexes, shuffle_row_groups,
                                             normalized_shuffle_row_drop_partitions, num_epochs, worker_predicate,
                                             self._workers_pool.workers_count + _VENTILATE_EXTRA_ROWGROUPS)

        # 5. Start workers pool
        self._workers_pool.start(ReaderWorker,
                                 (pyarrow_filesystem, dataset_path, self.schema, self.ngram, row_groups, cache),
                                 ventilator=ventilator)
        logger.debug('Workers pool started')

        self.last_row_consumed = False

        # _result
        self._result_buffer = []
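
A hedged usage sketch of the higher-level entry point mentioned in the docstring; make_reader resolves the filesystem from the URL and builds a Reader like the one above (the dataset URL is a placeholder):

from petastorm import make_reader

with make_reader('file:///tmp/mydataset') as reader:  # placeholder dataset URL
    for row in reader:
        print(row)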
Example #15
    def process(self, piece_index, worker_predicate,
                shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index:
        :param shuffle_row_drop_partition: A 2-tuple of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            self._dataset = pq.ParquetDataset(self._dataset_path_or_paths,
                                              filesystem=self._filesystem,
                                              validate_schema=False,
                                              filters=self._arrow_filters)

        if self._dataset.partitions is None:
            # When read from parquet file list, the `dataset.partitions` will be None.
            # But other petastorm code require at least an empty `ParquetPartitions` object.
            self._dataset.partitions = pq.ParquetPartitions()

        piece = self._split_pieces[piece_index]

        # Create pyarrow file system
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError(
                    'Local cache is not supported together with predicates, '
                    'unless the dataset is partitioned by the column the predicate operates on.'
                )
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError(
                    'Local cache is not supported together with shuffle_row_drop_partitions > 1'
                )

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(
                parquet_file, piece, worker_predicate,
                shuffle_row_drop_partition)
        else:
            # Using hash of the dataset path with the relative path in order to:
            #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. Dataset path is hashed, to make sure we don't create too long keys, which maybe incompatible with
            #     some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            if isinstance(self._dataset_path_or_paths, list):
                path_str = ','.join(self._dataset_path_or_paths)
            else:
                path_str = self._dataset_path_or_paths
            cache_key = '{}:{}:{}'.format(
                hashlib.md5(path_str.encode('utf-8')).hexdigest(), piece.path,
                piece_index)
            all_cols = self._local_cache.get(
                cache_key, lambda: self._load_rows(parquet_file, piece,
                                                   shuffle_row_drop_partition))

        if all_cols:
            self.publish_func(all_cols)
Example #16
 def load_chunk_as_tensor(self, chunk_idx):
     print('Loading chunk %d from disk.' % chunk_idx)
     chunk = pq.ParquetDataset(self[chunk_idx]).read_pandas()
     chunk = chunk.to_pandas()
     chunk = torch.from_numpy(chunk.values)
     return chunk
Example #17
    def prepare_read(self,
                     parallelism: int,
                     paths: Union[str, List[str]],
                     filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                     columns: Optional[List[str]] = None,
                     schema: Optional[Union[type,
                                            "pyarrow.lib.Schema"]] = None,
                     _block_udf: Optional[Callable[[Block], Block]] = None,
                     **reader_args) -> List[ReadTask]:
        """Creates and returns read tasks for a Parquet file-based datasource.
        """
        # NOTE: We override the base class FileBasedDatasource.prepare_read
        # method in order to leverage pyarrow's ParquetDataset abstraction,
        # which simplifies partitioning logic. We still use
        # FileBasedDatasource's write side (do_write), however.
        _check_pyarrow_version()
        from ray import cloudpickle
        import pyarrow as pa
        import pyarrow.parquet as pq
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        if len(paths) == 1:
            paths = paths[0]

        dataset_kwargs = reader_args.pop("dataset_kwargs", {})
        pq_ds = pq.ParquetDataset(paths,
                                  **dataset_kwargs,
                                  filesystem=filesystem,
                                  use_legacy_dataset=False)
        if schema is None:
            schema = pq_ds.schema
        if columns:
            schema = pa.schema([schema.field(column) for column in columns],
                               schema.metadata)

        def read_pieces(serialized_pieces: List[str]) -> pa.Table:
            # Implicitly trigger S3 subsystem initialization by importing
            # pyarrow.fs.
            import pyarrow.fs  # noqa: F401

            # Deserialize after loading the filesystem class.
            pieces: List["pyarrow._dataset.ParquetFileFragment"] = [
                cloudpickle.loads(p) for p in serialized_pieces
            ]

            # Ensure that we're reading at least one dataset fragment.
            assert len(pieces) > 0

            from pyarrow.dataset import _get_partition_keys

            logger.debug(f"Reading {len(pieces)} parquet pieces")
            use_threads = reader_args.pop("use_threads", False)
            tables = []
            for piece in pieces:
                table = piece.to_table(use_threads=use_threads,
                                       columns=columns,
                                       schema=schema,
                                       **reader_args)
                part = _get_partition_keys(piece.partition_expression)
                if part:
                    for col, value in part.items():
                        table = table.set_column(
                            table.schema.get_field_index(col), col,
                            pa.array([value] * len(table)))
                # If the table is empty, drop it.
                if table.num_rows > 0:
                    tables.append(table)
            if len(tables) > 1:
                table = pa.concat_tables(tables, promote=True)
            elif len(tables) == 1:
                table = tables[0]
            if _block_udf is not None:
                table = _block_udf(table)
            # If len(tables) == 0, all fragments were empty, and we return the
            # empty table from the last fragment.
            return table

        if _block_udf is not None:
            # Try to infer dataset schema by passing dummy table through UDF.
            dummy_table = schema.empty_table()
            try:
                inferred_schema = _block_udf(dummy_table).schema
                inferred_schema = inferred_schema.with_metadata(
                    schema.metadata)
            except Exception:
                logger.debug(
                    "Failed to infer schema of dataset by passing dummy table "
                    "through UDF due to the following exception:",
                    exc_info=True)
                inferred_schema = schema
        else:
            inferred_schema = schema
        read_tasks = []
        serialized_pieces = [cloudpickle.dumps(p) for p in pq_ds.pieces]
        if len(pq_ds.pieces) > PARALLELIZE_META_FETCH_THRESHOLD:
            metadata = _fetch_metadata_remotely(serialized_pieces)
        else:
            metadata = _fetch_metadata(pq_ds.pieces)
        for piece_data in np.array_split(
                list(zip(pq_ds.pieces, serialized_pieces, metadata)),
                parallelism):
            if len(piece_data) == 0:
                continue
            pieces, serialized_pieces, metadata = zip(*piece_data)
            meta = _build_block_metadata(pieces, metadata, inferred_schema)
            read_tasks.append(
                ReadTask(
                    lambda pieces_=serialized_pieces: [read_pieces(pieces_)],
                    meta))

        return read_tasks
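
A hedged usage sketch of the public reader this datasource backs in Ray's Datasets API of that era; use_threads is forwarded to the fragment reads above (the path is a placeholder):

import ray

ray.init()
ds = ray.data.read_parquet("s3://bucket/path",  # placeholder path
                           columns=["a", "b"],
                           use_threads=True)
print(ds.schema())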
Example #18
def load_parquet_s3(file_system, bucket, file):
    s3_path = 's3://{}/{}'.format(bucket, file)
    dataset = pq.ParquetDataset(s3_path, filesystem=file_system)
    df = dataset.read_pandas().to_pandas()
    return df
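
A brief usage sketch, assuming file_system is an s3fs.S3FileSystem instance (bucket and key are placeholders):

import s3fs

fs = s3fs.S3FileSystem()  # credentials resolved from the environment
df = load_parquet_s3(fs, "my-bucket", "path/to/file.parquet")  # placeholder bucket/key
print(df.shape)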
Example #19
def _read_pyarrow(fs,
                  paths,
                  file_opener,
                  columns=None,
                  filters=None,
                  categories=None,
                  index=None):
    from ...bytes.core import get_pyarrow_filesystem
    import pyarrow.parquet as pq

    if filters is not None:
        raise NotImplementedError("Predicate pushdown not implemented")

    if categories is not None:
        raise NotImplementedError("Categorical reads not yet implemented")

    if isinstance(columns, tuple):
        columns = list(columns)

    dataset = pq.ParquetDataset(paths, filesystem=get_pyarrow_filesystem(fs))
    schema = dataset.schema.to_arrow_schema()
    has_pandas_metadata = schema.metadata is not None and b'pandas' in schema.metadata
    task_name = 'read-parquet-' + tokenize(dataset, columns)

    if columns is None:
        all_columns = schema.names
    else:
        all_columns = columns

    if not isinstance(all_columns, list):
        out_type = Series
        all_columns = [all_columns]
    else:
        out_type = DataFrame

    if index is False:
        index_cols = []
    elif index is None:
        if has_pandas_metadata:
            pandas_metadata = json.loads(
                schema.metadata[b'pandas'].decode('utf8'))
            index_cols = pandas_metadata.get('index_columns', [])
        else:
            index_cols = []
    else:
        index_cols = index if isinstance(index, list) else [index]

    if index_cols:
        all_columns = list(unique(all_columns + index_cols))

    dtypes = _get_pyarrow_dtypes(schema)

    meta = _meta_from_dtypes(all_columns, schema.names, dtypes, index_cols)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    if dataset.pieces:
        divisions = (None, ) * (len(dataset.pieces) + 1)
        task_plan = {(task_name, i): (_read_pyarrow_parquet_piece, file_opener,
                                      piece, all_columns, index_cols,
                                      out_type == Series, dataset.partitions)
                     for i, piece in enumerate(dataset.pieces)}
    else:
        divisions = (None, None)
        task_plan = {(task_name, 0): meta}

    return out_type(task_plan, task_name, meta, divisions)
Example #20
def _determine_dataset_parts(fs, paths, gather_statistics, filters,
                             dataset_kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.
    """
    parts = []
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files
            dataset = pq.ParquetDataset(paths,
                                        filesystem=fs,
                                        filters=filters,
                                        **dataset_kwargs)
        else:
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # We have a _metadata file, lets use it
                dataset = pq.ParquetDataset(
                    base + fs.sep + "_metadata",
                    filesystem=fs,
                    filters=filters,
                    **dataset_kwargs,
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                dataset = pq.ParquetDataset(paths[0],
                                            filesystem=fs,
                                            **dataset_kwargs)
                parts = [base + fs.sep + fn for fn in fns]
    else:
        if fs.isdir(paths[0]):
            # This is a directory, check for _metadata, then _common_metadata
            allpaths = fs.glob(paths[0] + fs.sep + "*")
            base, fns = _analyze_paths(allpaths, fs)
            relpaths = [
                path.replace(base, "").lstrip("/") for path in allpaths
            ]
            if "_metadata" in relpaths and "validate_schema" not in dataset_kwargs:
                dataset_kwargs["validate_schema"] = False
            if "_metadata" in relpaths or gather_statistics is not False:
                # Let arrow do its thing (use _metadata or scan files)
                dataset = pq.ParquetDataset(paths,
                                            filesystem=fs,
                                            filters=filters,
                                            **dataset_kwargs)
            else:
                # Use _common_metadata file if it is available.
                # Otherwise, just use 0th file
                if "_common_metadata" in relpaths:
                    dataset = pq.ParquetDataset(
                        base + fs.sep + "_common_metadata",
                        filesystem=fs,
                        **dataset_kwargs,
                    )
                else:
                    dataset = pq.ParquetDataset(allpaths[0],
                                                filesystem=fs,
                                                **dataset_kwargs)
                parts = [base + fs.sep + fn for fn in fns]
        else:
            # There is only one file to read
            dataset = pq.ParquetDataset(paths, filesystem=fs, **dataset_kwargs)
    return parts, dataset
Example #21
    def initialize_write(
        df,
        fs,
        path,
        append=False,
        partition_on=None,
        ignore_divisions=False,
        division_info=None,
        **kwargs,
    ):
        dataset = fmd = None
        i_offset = 0
        if append and division_info is None:
            ignore_divisions = True
        fs.mkdirs(path, exist_ok=True)

        if append:
            try:
                # Allow append if the dataset exists.
                # Also need dataset.metadata object if
                # ignore_divisions is False (to check divisions)
                dataset = pq.ParquetDataset(path, filesystem=fs)
                if not dataset.metadata and not ignore_divisions:
                    # TODO: Be more flexible about existing metadata.
                    raise NotImplementedError(
                        "_metadata file needed to `append` "
                        "with `engine='pyarrow'` "
                        "unless `ignore_divisions` is `True`")
                fmd = dataset.metadata
            except (IOError, ValueError, IndexError):
                # Original dataset does not exist - cannot append
                append = False
        if append:
            names = dataset.metadata.schema.names
            has_pandas_metadata = (
                dataset.schema.to_arrow_schema().metadata is not None
                and b"pandas" in dataset.schema.to_arrow_schema().metadata)
            if has_pandas_metadata:
                pandas_metadata = json.loads(dataset.schema.to_arrow_schema(
                ).metadata[b"pandas"].decode("utf8"))
                categories = [
                    c["name"] for c in pandas_metadata["columns"]
                    if c["pandas_type"] == "categorical"
                ]
            else:
                categories = None
            dtypes = _get_pyarrow_dtypes(dataset.schema.to_arrow_schema(),
                                         categories)
            if set(names) != set(df.columns) - set(partition_on):
                raise ValueError("Appended columns not the same.\n"
                                 "Previous: {} | New: {}".format(
                                     names, list(df.columns)))
            elif (pd.Series(dtypes).loc[names] != df[names].dtypes).any():
                # TODO Coerce values for compatible but different dtypes
                raise ValueError("Appended dtypes differ.\n{}".format(
                    set(dtypes.items()) ^ set(df.dtypes.iteritems())))
            i_offset = len(dataset.pieces)

            if division_info["name"] not in names:
                ignore_divisions = True
            if not ignore_divisions:
                old_end = None
                row_groups = [
                    dataset.metadata.row_group(i)
                    for i in range(dataset.metadata.num_row_groups)
                ]
                for row_group in row_groups:
                    for i, name in enumerate(names):
                        if name != division_info["name"]:
                            continue
                        column = row_group.column(i)
                        if column.statistics:
                            if not old_end:
                                old_end = column.statistics.max
                            else:
                                old_end = max(old_end, column.statistics.max)
                            break

                divisions = division_info["divisions"]
                if divisions[0] < old_end:
                    raise ValueError(
                        "Appended divisions overlapping with the previous ones"
                        " (set ignore_divisions=True to append anyway).\n"
                        "Previous: {} | New: {}".format(old_end, divisions[0]))

        return fmd, i_offset
Example #22
def get_dataset(key):
    s3 = s3fs.S3FileSystem(secret=ceph_secret,
                           key=ceph_key,
                           client_kwargs=client_kwargs)
    return pq.ParquetDataset(key, filesystem=s3).read_pandas().to_pandas()
Example #23
    def __init__(self,
                 pyarrow_filesystem,
                 dataset_path,
                 schema_fields=None,
                 shuffle_row_groups=True,
                 shuffle_row_drop_partitions=1,
                 predicate=None,
                 rowgroup_selector=None,
                 reader_pool=None,
                 num_epochs=1,
                 cur_shard=None,
                 shard_count=None,
                 cache=None,
                 worker_class=None,
                 transform_spec=None,
                 is_batched_reader=False,
                 filters=None,
                 shard_seed=None):
        """Initializes a reader object.

        :param pyarrow_filesystem: An instance of ``pyarrow.FileSystem`` that will be used. If not specified,
            then a default one will be selected based on the url (only for ``hdfs://`` or ``file://``; for
            ``s3://`` and ``gs://`` support, use ``make_reader``). The default hdfs driver is ``libhdfs3``.
            If you want to use ``libhdfs``, use
            ``pyarrow_filesystem=pyarrow.hdfs.connect('hdfs:///some/path', driver='libhdfs')``.
        :param dataset_path: filepath to a parquet directory or parquet file path list on the specified filesystem.
            e.g. ``'/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``,
            or ``[/tmp/mydataset/00000.parquet, /tmp/mydataset/00001.parquet]``
        :param schema_fields: Either a list of unischema fields to subset, ``None`` to read all fields,
            or an NGram object, in which case an NGram of the specified properties is returned.
        :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
        :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
            break up a row group into for increased shuffling in exchange for worse performance (extra reads).
            For example if you specify 2 each row group read will drop half of the rows within every row group and
            read the remaining rows in separate reads. It is recommended to keep this number below the regular row
            group size in order to not waste reads which drop all rows.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
            This pool is a custom implementation used to parallelize reading data from the dataset.
            Any object from workers_pool package can be used
            (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
        :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
            ``None`` will result in an infinite number of epochs.
        :param cur_shard: An int denoting the current shard number used. Each reader instance should
            pass in a unique shard number in the range ``[0, shard_count)``.
            ``shard_count`` must be supplied as well. Defaults to None
        :param shard_count: An int denoting the number of shard partitions there are. Defaults to None
        :param cache: An object conforming to the :class:`.CacheBase` interface. Before loading row groups from a
            parquet file, the Reader will attempt to load these values from the cache. Caching is useful when
            communication with the main data store is either slow or expensive and the local machine has enough
            storage to hold the entire dataset (or a partition of the dataset if shards are used).
            By default, the :class:`.NullCache` implementation is used.
        :param worker_class: This is the class that will be instantiated on a different thread/process. Its
            responsibility is to load and filter the data.
        :param filters: (List[Tuple] or List[List[Tuple]]): Standard PyArrow filters.
            These will be applied when loading the parquet file with PyArrow. More information
            here: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
        :param shard_seed: Random seed to shuffle row groups for data sharding. Defaults to None
        """
        self.num_epochs = num_epochs

        # 1. Open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Create a rowgroup ventilator object
        # 5. Start workers pool
        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                'Fields must be either None, an iterable collection of Unischema fields '
                'or an NGram object.')

        self.is_batched_reader = is_batched_reader
        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        self.dataset = pq.ParquetDataset(dataset_path,
                                         filesystem=pyarrow_filesystem,
                                         validate_schema=False,
                                         metadata_nthreads=10,
                                         filters=filters)

        if self.dataset.partitions is None:
            # When reading from a parquet file list, `dataset.partitions` will be None,
            # but other petastorm code requires at least an empty `ParquetPartitions` object.
            self.dataset.partitions = pq.ParquetPartitions()

        stored_schema = infer_or_load_unischema(self.dataset)

        if isinstance(schema_fields, NGram):
            self.ngram = schema_fields
            self.ngram.resolve_regex_field_names(stored_schema)
        else:
            self.ngram = None

        # By default, use original method of working with list of dictionaries and not arrow tables
        worker_class = worker_class or PyDictReaderWorker
        self._results_queue_reader = worker_class.new_results_queue_reader()

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_options.shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()

        self._workers_pool = reader_pool or ThreadPool(10)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are passed in schema_fields.
        if self.ngram:
            fields = self.ngram.get_field_names_at_all_timesteps()
        else:
            fields = schema_fields if isinstance(
                schema_fields, collections.Iterable) else None

        storage_schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema
        if len(storage_schema.fields) == 0:
            raise RuntimeError(
                f"No fields matching the criteria '{fields}' were found in the dataset {dataset_path}."
            )
        if transform_spec:
            self.schema = transform_schema(storage_schema, transform_spec)
        else:
            self.schema = storage_schema

        # 2. Get a list of all row groups
        row_groups = dataset_metadata.load_row_groups(self.dataset)

        # 3. Filter rowgroups
        filtered_row_group_indexes, worker_predicate = self._filter_row_groups(
            self.dataset, row_groups, predicate, rowgroup_selector, cur_shard,
            shard_count, shard_seed)
        # 4. Create a rowgroup ventilator object
        normalized_shuffle_row_drop_partitions = \
            self._normalize_shuffle_options(shuffle_row_drop_partitions, self.dataset)
        self.ventilator = self._create_ventilator(
            filtered_row_group_indexes, shuffle_row_groups,
            normalized_shuffle_row_drop_partitions, self.num_epochs,
            worker_predicate,
            self._workers_pool.workers_count + _VENTILATE_EXTRA_ROWGROUPS)

        # 5. Start workers pool
        self._workers_pool.start(
            worker_class,
            (pyarrow_filesystem, dataset_path, storage_schema, self.ngram,
             row_groups, cache, transform_spec, self.schema, filters),
            ventilator=self.ventilator)
        logger.debug('Workers pool started')

        self.last_row_consumed = False
        self.stopped = False
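
A hedged usage sketch for a reader like the one above: in petastorm the public ``make_reader`` factory normally constructs the Reader (resolving the filesystem from the URL), so application code iterates over it rather than calling ``__init__`` directly. The dataset URL below is a placeholder.

# Sketch only; 'file:///tmp/mydataset' is a placeholder petastorm dataset URL.
from petastorm import make_reader

with make_reader('file:///tmp/mydataset', num_epochs=1) as reader:
    for row in reader:
        # Each row is a namedtuple built from the dataset's Unischema fields.
        print(row)
        break
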
Example #24
def read_parquet(path,
                 engine: str = "auto",
                 columns=None,
                 groups_as_chunks=False,
                 use_arrow_dtype=None,
                 incremental_index=False,
                 storage_options=None,
                 memory_scale=None,
                 **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables``.
        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. The default behavior is to try 'pyarrow',
        falling back to 'fastparquet' if 'pyarrow' is unavailable.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    groups_as_chunks : bool, default False
        If True, each row group corresponds to a chunk.
        If False, each file corresponds to a chunk.
        Only available for the 'pyarrow' engine.
    incremental_index: bool, default False
        Create a new RangeIndex if the parquet file doesn't contain index columns.
    use_arrow_dtype: bool, default None
        If True, use arrow dtype to store columns.
    storage_options: dict, optional
        Options for storage connection.
    memory_scale: int, optional
        Scale factor relating real memory occupation to the raw file size.
    **kwargs
        Any additional kwargs are passed to the engine.

    Returns
    -------
    Mars DataFrame
    """

    engine_type = check_engine(engine)
    engine = get_engine(engine_type)

    if get_fs(path, storage_options).isdir(path):
        # If path is a directory, we will read as a partitioned datasets.
        if engine_type != 'pyarrow':
            raise TypeError('Only the pyarrow engine is supported when reading'
                            ' from partitioned datasets.')
        dataset = pq.ParquetDataset(path)
        dtypes = dataset.schema.to_arrow_schema().empty_table().to_pandas(
        ).dtypes
        for partition in dataset.partitions:
            dtypes[partition.name] = pd.CategoricalDtype()
    else:
        if not isinstance(path, list):
            file_path = glob(path, storage_options=storage_options)[0]
        else:
            file_path = path[0]

        with open_file(file_path, storage_options=storage_options) as f:
            dtypes = engine.read_dtypes(f)

        if columns:
            dtypes = dtypes[columns]

        if use_arrow_dtype is None:
            use_arrow_dtype = options.dataframe.use_arrow_dtype
        if use_arrow_dtype:
            dtypes = to_arrow_dtypes(dtypes)

    index_value = parse_index(pd.RangeIndex(-1))
    columns_value = parse_index(dtypes.index, store_data=True)
    op = DataFrameReadParquet(path=path,
                              engine=engine_type,
                              columns=columns,
                              groups_as_chunks=groups_as_chunks,
                              use_arrow_dtype=use_arrow_dtype,
                              read_kwargs=kwargs,
                              incremental_index=incremental_index,
                              storage_options=storage_options,
                              memory_scale=memory_scale)
    return op(index_value=index_value,
              columns_value=columns_value,
              dtypes=dtypes)
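
A hedged usage sketch for the Mars ``read_parquet`` above, assuming Mars is installed; the path is a placeholder and ``.execute()`` triggers the deferred computation.

# Sketch only; '/tmp/mydataset' is a placeholder path to a parquet directory.
import mars.dataframe as md

df = md.read_parquet('/tmp/mydataset', engine='pyarrow')
print(df.execute())
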
Example #25
 def get_parquet_dataset(self, path):
     return pq.ParquetDataset(self.get_localized_path(path),
                              filesystem=self.get_filesystem())
Example #26
        date_list.add((today - datetime.timedelta(days=32)).strftime('%Y-%m'))
        date_list = sorted(
            list(
                set([(today - datetime.timedelta(days=x)).strftime('%Y-%m')
                     for x in range(32)])))
        print(date_list)
        paths = []
        for month in date_list:
            year, month = month.split('-')
            paths.append('bnroths/chicago-data/%s/year=%s/month=%s' %
                         (dataset, year, month))
        print(paths)
        # exit(0)
        for path in paths:
            ds = pq.ParquetDataset(
                path_or_paths=path,  # 'bnroths/chicago-data/%s' % dataset,
                filesystem=S3FS,
                validate_schema=False)

            columns = datasets[dataset]['columns']
            dt = columns[1]
            table = ds.read()
            df = table.to_pandas()
            print(df.columns)
            print(df.head())
            df['dt'] = df[dt].astype(str).str[:7]

            cnts = datasets[dataset]['cnts']
            dts = []
            groups = dict(list(df.groupby('dt')))
            print(groups.keys())
            for group in groups:
Example #27
def generate_petastorm_metadata(spark,
                                dataset_url,
                                unischema_class=None,
                                use_summary_metadata=False,
                                hdfs_driver='libhdfs3'):
    """
    Generates the metadata necessary to read a petastorm dataset and adds it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url,
                                  sc._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver,
                                  user=spark.sparkContext.sparkUser())
    fs = resolver.filesystem()
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=fs,
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError(
                'The specified class %s is not an instance of a petastorm.Unischema object.'
                % unischema_class)
    else:

        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark,
                             dataset_url,
                             schema,
                             use_summary_metadata=use_summary_metadata,
                             filesystem_factory=resolver.filesystem_factory()):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config,
                                                       Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # Calling writeMetaDataFile overwrites the _common_metadata file, which could contain schema information
        # or row group indexers. We therefore retain this information and add it back to the new
        # _common_metadata file. If the old legacy metadata method were used, this file would not be deleted.
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY,
                                    metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                    metadata_dict[ROWGROUPS_INDEX_KEY])
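
A hedged sketch of invoking the helper above, assuming a running SparkSession and an existing petastorm dataset (the HDFS URL is a placeholder); petastorm also ships the ``petastorm-generate-metadata.py`` entry point, which wraps this same function.

# Sketch only; the URL below is a placeholder for an existing petastorm dataset.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('regenerate-metadata').getOrCreate()
generate_petastorm_metadata(spark, 'hdfs:///path/to/existing/dataset')
# Note: the function stops the Spark session itself before returning.
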
Example #28
File: arrow.py  Project: xvr-hlt/dask
    def read_metadata(
        fs,
        paths,
        categories=None,
        index=None,
        gather_statistics=None,
        filters=None,
        **kwargs,
    ):
        dataset = pq.ParquetDataset(paths,
                                    filesystem=fs,
                                    **kwargs.get("dataset", {}))

        if dataset.partitions is not None:
            partitions = [
                n for n in dataset.partitions.partition_names if n is not None
            ]
        else:
            partitions = []

        schema = dataset.schema.to_arrow_schema()
        columns = None

        has_pandas_metadata = (schema.metadata is not None
                               and b"pandas" in schema.metadata)

        if has_pandas_metadata:
            pandas_metadata = json.loads(
                schema.metadata[b"pandas"].decode("utf8"))
            (
                index_names,
                column_names,
                storage_name_mapping,
                column_index_names,
            ) = _parse_pandas_metadata(pandas_metadata)
        else:
            index_names = []
            column_names = schema.names
            storage_name_mapping = {k: k for k in column_names}
            column_index_names = [None]

        if index is None and index_names:
            index = index_names

        if set(column_names).intersection(partitions):
            raise ValueError("partition(s) should not exist in columns.\n"
                             "columns: {} | partitions: {}".format(
                                 column_names, partitions))

        column_names, index_names = _normalize_index_columns(
            columns, column_names + partitions, index, index_names)

        all_columns = index_names + column_names

        pieces = sorted(dataset.pieces,
                        key=lambda piece: natural_sort_key(piece.path))

        # Check that categories are included in columns
        if categories and not set(categories).intersection(all_columns):
            raise ValueError("categories not in available columns.\n"
                             "categories: {} | columns: {}".format(
                                 categories, list(all_columns)))

        dtypes = _get_pyarrow_dtypes(schema, categories)
        dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

        index_cols = index or ()
        meta = _meta_from_dtypes(all_columns, dtypes, index_cols,
                                 column_index_names)

        meta = clear_known_categories(meta, cols=categories)
        if (gather_statistics is None and dataset.metadata
                and dataset.metadata.num_row_groups == len(pieces)):
            gather_statistics = True
        if not pieces:
            gather_statistics = False

        if gather_statistics:
            # Read from _metadata file
            if dataset.metadata and dataset.metadata.num_row_groups == len(
                    pieces):
                row_groups = [
                    dataset.metadata.row_group(i)
                    for i in range(dataset.metadata.num_row_groups)
                ]
                names = dataset.metadata.schema.names
            else:
                # Read from each individual piece (quite possibly slow).
                row_groups = _get_md_row_groups(pieces)
                if row_groups:
                    piece = pieces[0]
                    md = piece.get_metadata()
                    names = md.schema.names
                else:
                    gather_statistics = False

        if gather_statistics:
            stats = []
            for row_group in row_groups:
                s = {"num-rows": row_group.num_rows, "columns": []}
                for i, name in enumerate(names):
                    column = row_group.column(i)
                    d = {"name": name}
                    if column.statistics:
                        cs_min = column.statistics.min
                        cs_max = column.statistics.max
                        d.update({
                            "min": cs_min,
                            "max": cs_max,
                            "null_count": column.statistics.null_count,
                        })
                    s["columns"].append(d)
                stats.append(s)
        else:
            stats = None

        if dataset.partitions:
            for partition in dataset.partitions:
                if isinstance(index, list) and partition.name == index[0]:
                    meta.index = pd.CategoricalIndex(categories=partition.keys,
                                                     name=index[0])
                elif partition.name == meta.index.name:
                    meta.index = pd.CategoricalIndex(categories=partition.keys,
                                                     name=meta.index.name)
                elif partition.name in meta.columns:
                    meta[partition.name] = pd.Categorical(
                        categories=partition.keys, values=[])

        # Create `parts` (list of row-group-descriptor dicts)
        parts = [{
            "piece": piece,
            "kwargs": {
                "partitions": dataset.partitions,
                "categories": categories
            },
        } for piece in pieces]

        return (meta, stats, parts)
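
For orientation, a hedged sketch of the user-facing dask call that eventually reaches this pyarrow-engine ``read_metadata`` path; the path is a placeholder and ``gather_statistics`` mirrors the parameter shown above.

# Sketch only; '/tmp/mydataset' is a placeholder parquet directory.
import dask.dataframe as dd

ddf = dd.read_parquet('/tmp/mydataset', engine='pyarrow', gather_statistics=True)
print(ddf.head())
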
Example #29
def materialize_dataset(spark,
                        dataset_url,
                        schema,
                        row_group_size_mb=None,
                        use_summary_metadata=False,
                        filesystem_factory=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    A user may provide their own recipe for creating the pyarrow filesystem object via the ``filesystem_factory``
    argument (otherwise, petastorm will create a default one based on the url).

    The following example shows how a custom pyarrow HDFS filesystem, instantiated using the ``libhdfs`` driver,
    can be used during Petastorm dataset generation:

    >>> resolver=FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
    >>>                             hdfs_driver='libhdfs')
    >>> with materialize_dataset(..., filesystem_factory=resolver.filesystem_factory()):
    >>>     ...


    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
      indexing method. The custom indexing method is more scalable for very large datasets.
    :param filesystem_factory: A filesystem factory function to be used when saving Petastorm specific metadata to the
      Parquet store.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    yield
    # After job completes, add the unischema metadata and check for the metadata summary file
    if filesystem_factory is None:
        resolver = FilesystemResolver(
            dataset_url,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser())
        filesystem_factory = resolver.filesystem_factory()
        dataset_path = resolver.get_dataset_path()
    else:
        dataset_path = get_dataset_path(urlparse(dataset_url))
    filesystem = filesystem_factory()

    dataset = pq.ParquetDataset(dataset_path,
                                filesystem=filesystem,
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not use_summary_metadata:
        _generate_num_row_groups_per_file(dataset, spark.sparkContext,
                                          filesystem_factory)

    # Reload the dataset to take into account the new metadata
    dataset = pq.ParquetDataset(dataset_path,
                                filesystem=filesystem,
                                validate_schema=False)
    try:
        # Try to load the row groups, if it fails that means the metadata was not generated properly
        load_row_groups(dataset)
    except PetastormMetadataError:
        raise PetastormMetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata.py before you can read your dataset '
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
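
A hedged sketch of the write flow this context manager wraps, assuming an active SparkSession ``spark``, a petastorm Unischema ``MySchema`` and a ``row_generator`` function returning plain dicts keyed by its field names (all hypothetical); the output URL is a placeholder.

# Sketch only; spark, MySchema and row_generator are assumed to exist elsewhere.
from petastorm.unischema import dict_to_spark_row

output_url = 'file:///tmp/hello_world_dataset'
with materialize_dataset(spark, output_url, MySchema, row_group_size_mb=64):
    rows_rdd = (spark.sparkContext
                .parallelize(range(10))
                .map(row_generator)
                .map(lambda d: dict_to_spark_row(MySchema, d)))
    spark.createDataFrame(rows_rdd, MySchema.as_spark_schema()) \
        .write.mode('overwrite').parquet(output_url)
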
Example #30
    def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        columns: Optional[List[str]] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
        _block_udf: Optional[Callable[[Block], Block]] = None,
        **reader_args,
    ) -> List[ReadTask]:
        """Creates and returns read tasks for a Parquet file-based datasource."""
        # NOTE: We override the base class FileBasedDatasource.prepare_read
        # method in order to leverage pyarrow's ParquetDataset abstraction,
        # which simplifies partitioning logic. We still use
        # FileBasedDatasource's write side (do_write), however.
        _check_pyarrow_version()
        from ray import cloudpickle
        import pyarrow as pa
        import pyarrow.parquet as pq
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        if len(paths) == 1:
            paths = paths[0]

        dataset_kwargs = reader_args.pop("dataset_kwargs", {})
        pq_ds = pq.ParquetDataset(
            paths, **dataset_kwargs, filesystem=filesystem, use_legacy_dataset=False
        )
        if schema is None:
            schema = pq_ds.schema
        if columns:
            schema = pa.schema(
                [schema.field(column) for column in columns], schema.metadata
            )

        def read_pieces(serialized_pieces: bytes) -> Iterator[pa.Table]:
            # Implicitly trigger S3 subsystem initialization by importing
            # pyarrow.fs.
            import pyarrow.fs  # noqa: F401

            # Deserialize after loading the filesystem class.
            try:
                _register_parquet_file_fragment_serialization()
                pieces: List[
                    "pyarrow._dataset.ParquetFileFragment"
                ] = cloudpickle.loads(serialized_pieces)
            finally:
                _deregister_parquet_file_fragment_serialization()

            # Ensure that we're reading at least one dataset fragment.
            assert len(pieces) > 0

            from pyarrow.dataset import _get_partition_keys

            ctx = DatasetContext.get_current()
            output_buffer = BlockOutputBuffer(
                block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
            )

            logger.debug(f"Reading {len(pieces)} parquet pieces")
            use_threads = reader_args.pop("use_threads", False)
            for piece in pieces:
                part = _get_partition_keys(piece.partition_expression)
                batches = piece.to_batches(
                    use_threads=use_threads,
                    columns=columns,
                    schema=schema,
                    batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                    **reader_args,
                )
                for batch in batches:
                    table = pyarrow.Table.from_batches([batch], schema=schema)
                    if part:
                        for col, value in part.items():
                            table = table.set_column(
                                table.schema.get_field_index(col),
                                col,
                                pa.array([value] * len(table)),
                            )
                    # If the table is empty, drop it.
                    if table.num_rows > 0:
                        output_buffer.add_block(table)
                        if output_buffer.has_next():
                            yield output_buffer.next()
            output_buffer.finalize()
            if output_buffer.has_next():
                yield output_buffer.next()

        if _block_udf is not None:
            # Try to infer dataset schema by passing dummy table through UDF.
            dummy_table = schema.empty_table()
            try:
                inferred_schema = _block_udf(dummy_table).schema
                inferred_schema = inferred_schema.with_metadata(schema.metadata)
            except Exception:
                logger.debug(
                    "Failed to infer schema of dataset by passing dummy table "
                    "through UDF due to the following exception:",
                    exc_info=True,
                )
                inferred_schema = schema
        else:
            inferred_schema = schema
        read_tasks = []
        metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces) or []
        try:
            _register_parquet_file_fragment_serialization()
            for pieces, metadata in zip(
                np.array_split(pq_ds.pieces, parallelism),
                np.array_split(metadata, parallelism),
            ):
                if len(pieces) <= 0:
                    continue
                serialized_pieces = cloudpickle.dumps(pieces)
                input_files = [p.path for p in pieces]
                meta = meta_provider(
                    input_files,
                    inferred_schema,
                    pieces=pieces,
                    prefetched_metadata=metadata,
                )
                read_tasks.append(
                    ReadTask(lambda p=serialized_pieces: read_pieces(p), meta)
                )
        finally:
            _deregister_parquet_file_fragment_serialization()

        return read_tasks
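
Finally, a hedged sketch of the user-facing Ray call that drives this ``prepare_read`` implementation; the path and column names are placeholders.

# Sketch only; '/tmp/mydataset' and the column names are placeholders.
import ray

ds = ray.data.read_parquet('/tmp/mydataset', columns=['a', 'b'])
print(ds.schema())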