Пример #1
0
def _reconstruct_columns_from_metadata(columns, column_indexes):
    """Rebuild the column index as a MultiIndex with metadata-driven dtypes.

    Part of ``table_to_blockmanager``.

    Parameters
    ----------
    columns : pd.Index or pd.MultiIndex
        The column index to rebuild. A flat index is treated as a
        single-level MultiIndex.
    column_indexes : list of dict
        One metadata dict per column level. When a dict has a
        ``'numpy_type'`` entry, the matching level is cast to that dtype;
        otherwise the level keeps its current dtype.

    Returns
    -------
    pd.MultiIndex
        An index carrying the names of `columns` whose levels have the
        dtypes requested by `column_indexes`.
    """
    # Get levels and codes, and provide sane defaults if the index has a
    # single level to avoid if/else spaghetti.
    levels = getattr(columns, 'levels', None) or [columns]

    # pandas 0.24 renamed ``MultiIndex.labels`` to ``MultiIndex.codes`` and
    # pandas 1.0 removed the old name, so prefer ``codes`` and only fall back
    # to ``labels`` for older releases. The isinstance check keeps us from
    # picking up the unrelated ``CategoricalIndex.codes`` attribute.
    codes = None
    if isinstance(columns, pd.MultiIndex):
        codes = getattr(columns, 'codes', None)
    if codes is None:
        codes = getattr(columns, 'labels', None)
    codes = codes or [pd.RangeIndex(len(level)) for level in levels]

    # Convert each level to the dtype provided in the metadata
    levels_dtypes = [
        (level, col_index.get('numpy_type', level.dtype))
        for level, col_index in zip_longest(
            levels, column_indexes, fillvalue={}
        )
    ]
    new_levels = [
        _level if _level.dtype == _dtype else _level.astype(_dtype)
        for _level, _dtype in levels_dtypes
    ]

    # Levels and codes are passed positionally so the call works regardless
    # of whether the second parameter is spelled ``labels`` or ``codes``.
    return pd.MultiIndex(new_levels, codes, names=columns.names)
Пример #2
0
def _reconstruct_columns_from_metadata(columns, column_indexes):
    """Rebuild the column index as a MultiIndex with metadata-driven dtypes.

    Part of ``table_to_blockmanager``.

    Parameters
    ----------
    columns : pd.Index or pd.MultiIndex
        The column index to rebuild. A flat index is treated as a
        single-level MultiIndex.
    column_indexes : list of dict
        One metadata dict per column level. When a dict has a
        ``'numpy_type'`` entry, the matching level is cast to that dtype;
        otherwise the level keeps its current dtype.

    Returns
    -------
    pd.MultiIndex
        An index carrying the names of `columns` whose levels have the
        dtypes requested by `column_indexes`.
    """
    # Get levels and codes, and provide sane defaults if the index has a
    # single level to avoid if/else spaghetti.
    levels = getattr(columns, 'levels', None) or [columns]

    # pandas 0.24 renamed ``MultiIndex.labels`` to ``MultiIndex.codes`` and
    # pandas 1.0 removed the old name, so prefer ``codes`` and only fall back
    # to ``labels`` for older releases. The isinstance check keeps us from
    # picking up the unrelated ``CategoricalIndex.codes`` attribute.
    codes = None
    if isinstance(columns, pd.MultiIndex):
        codes = getattr(columns, 'codes', None)
    if codes is None:
        codes = getattr(columns, 'labels', None)
    codes = codes or [pd.RangeIndex(len(level)) for level in levels]

    # Convert each level to the dtype provided in the metadata
    levels_dtypes = [
        (level, col_index.get('numpy_type', level.dtype))
        for level, col_index in zip_longest(
            levels, column_indexes, fillvalue={}
        )
    ]
    new_levels = [
        _level if _level.dtype == _dtype else _level.astype(_dtype)
        for _level, _dtype in levels_dtypes
    ]

    # Levels and codes are passed positionally so the call works regardless
    # of whether the second parameter is spelled ``labels`` or ``codes``.
    return pd.MultiIndex(new_levels, codes, names=columns.names)
Пример #3
0
def _reconstruct_columns_from_metadata(columns, column_indexes):
    """Build a pandas MultiIndex from `columns`, typing each level according
    to the column index metadata in `column_indexes`.

    Parameters
    ----------
    columns : List[pd.Index]
        The columns coming from a pyarrow.Table
    column_indexes : List[Dict[str, str]]
        The column index metadata deserialized from the JSON schema metadata
        in a :class:`~pyarrow.Table`.

    Returns
    -------
    result : MultiIndex
        The index rebuilt from `column_indexes` metadata, with every level
        converted to the type the metadata asks for.

    Notes
    -----
    * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager`
    """
    pd = _pandas_api.pd

    # A flat index has neither levels nor codes; treat it as a single level
    # with one code per entry so that both cases share the code path below.
    level_list = getattr(columns, 'levels', None) or [columns]
    codes = _get_multiindex_codes(columns) or [
        pd.RangeIndex(len(lvl)) for lvl in level_list
    ]

    # Pair every level with the pandas type named in its metadata, falling
    # back to the level's current dtype when the metadata has no entry.
    typed_levels = [
        (lvl, meta.get('pandas_type', str(lvl.dtype)))
        for lvl, meta in zip_longest(
            level_list, column_indexes, fillvalue={}
        )
    ]

    to_bytes = operator.methodcaller('encode', 'UTF-8')
    converted = []

    for lvl, pandas_type in typed_levels:
        target = _pandas_type_to_numpy_type(pandas_type)

        if target == np.bytes_:
            # The metadata is UTF-8 encoded JSON, so values that were bytes
            # come back from json.loads as unicode strings; encode them
            # again to recover the original bytes.
            lvl = lvl.map(to_bytes)
        elif lvl.dtype != target:
            lvl = lvl.astype(target)

        converted.append(lvl)

    return pd.MultiIndex(converted, codes, names=columns.names)
Пример #4
0
def _reconstruct_columns_from_metadata(columns, column_indexes):
    """Construct a pandas MultiIndex from `columns` and column index metadata
    in `column_indexes`.

    Parameters
    ----------
    columns : List[pd.Index]
        The columns coming from a pyarrow.Table
    column_indexes : List[Dict[str, str]]
        The column index metadata deserialized from the JSON schema metadata
        in a :class:`~pyarrow.Table`.

    Returns
    -------
    result : MultiIndex
        The index reconstructed using `column_indexes` metadata with levels of
        the correct type.

    Notes
    -----
    * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager`
    """

    # Get levels and codes, and provide sane defaults if the index has a
    # single level to avoid if/else spaghetti.
    levels = getattr(columns, 'levels', None) or [columns]

    # pandas 0.24 renamed ``MultiIndex.labels`` to ``MultiIndex.codes`` and
    # pandas 1.0 removed the old name, so prefer ``codes`` and only fall back
    # to ``labels`` for older releases. The isinstance check keeps us from
    # picking up the unrelated ``CategoricalIndex.codes`` attribute.
    codes = None
    if isinstance(columns, pd.MultiIndex):
        codes = getattr(columns, 'codes', None)
    if codes is None:
        codes = getattr(columns, 'labels', None)
    codes = codes or [pd.RangeIndex(len(level)) for level in levels]

    # Convert each level to the dtype provided in the metadata
    levels_dtypes = [
        (level, col_index.get('pandas_type', str(level.dtype)))
        for level, col_index in zip_longest(
            levels, column_indexes, fillvalue={}
        )
    ]

    new_levels = []
    encoder = operator.methodcaller('encode', 'UTF-8')
    for level, pandas_dtype in levels_dtypes:
        dtype = _pandas_type_to_numpy_type(pandas_dtype)

        # Since our metadata is UTF-8 encoded, Python turns things that were
        # bytes into unicode strings when json.loads-ing them. We need to
        # convert them back to bytes to preserve metadata.
        if dtype == np.bytes_:
            level = level.map(encoder)
        elif level.dtype != dtype:
            level = level.astype(dtype)

        new_levels.append(level)

    # Levels and codes are passed positionally so the call works regardless
    # of whether the second parameter is spelled ``labels`` or ``codes``.
    return pd.MultiIndex(new_levels, codes, names=columns.names)
Пример #5
0
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Convert an Arrow table into a pandas ``BlockManager``.

    When the table schema carries ``b'pandas'`` metadata, that metadata is
    used to pull the index columns out of the table, to restore the original
    column names and to restore the dtypes of the column index levels.

    Parameters
    ----------
    options
        Conversion options, forwarded unchanged to ``lib.table_to_blocks``.
    table
        The Arrow table to convert.
    memory_pool
        Memory pool, forwarded unchanged to ``lib.table_to_blocks``.
    nthreads : int, default 1
        Thread count, forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    pandas.core.internals.BlockManager
        Manager whose axes are ``[columns, index]``.
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table. The logical names are read from the trailing
    # ``len(index_columns)`` entries of the column metadata; with no index
    # columns the slice is the whole list, but zip() then yields nothing.
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(
                backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'],
                                 fastpath=True)
            block = _int.make_block(cat,
                                    placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr,
                                    placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype,
                                    fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        # Map the stringified column names back to the names recorded in the
        # metadata; names without a metadata entry are kept as they are.
        columns_name_dict = {
            str(x['name']): x['name'] for x in columns_metadata
        }
        columns_values = [
            columns_name_dict.get(y, y) for y in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x, )

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:

        # Get levels and codes, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]

        # pandas 0.24 renamed ``MultiIndex.labels`` to ``MultiIndex.codes``
        # and pandas 1.0 removed the old name, so prefer ``codes`` and only
        # fall back to ``labels`` for older releases.
        codes = None
        if isinstance(columns, pd.MultiIndex):
            codes = getattr(columns, 'codes', None)
        if codes is None:
            codes = getattr(columns, 'labels', None)
        codes = codes or [pd.RangeIndex(len(level)) for level in levels]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [(level, col_index.get('numpy_type', level.dtype))
                         for level, col_index in zip_longest(
                             levels, column_indexes, fillvalue={})]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]

        # Positional arguments work whether the second parameter is spelled
        # ``labels`` or ``codes``.
        columns = pd.MultiIndex(new_levels, codes, names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
Пример #6
0
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Convert an Arrow table into a pandas ``BlockManager``.

    When the table schema carries ``b'pandas'`` metadata, that metadata is
    used to pull the index columns out of the table and to restore the
    dtypes of the column index levels.

    Parameters
    ----------
    options
        Conversion options, forwarded unchanged to ``lib.table_to_blocks``.
    table
        The Arrow table to convert.
    memory_pool
        Memory pool, forwarded unchanged to ``lib.table_to_blocks``.
    nthreads : int, default 1
        Thread count, forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    pandas.core.internals.BlockManager
        Manager whose axes are ``[columns, index]``.
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    for name in index_columns:
        i = schema.get_field_index(name)
        if i != -1:
            col = table.column(i)
            index_name = None if is_unnamed_index_level(name) else name
            col_pandas = col.to_pandas()
            values = col_pandas.values
            # ``values`` is not guaranteed to be a NumPy array, so only
            # consult ``flags`` when the attribute exists instead of raising
            # AttributeError.
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(index_name)
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(name)
            )

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'], fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Create the column index

    # Construct the base index
    if not column_strings:
        columns = pd.Index(column_strings)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, column_strings)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:

        # Get levels and codes, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]

        # pandas 0.24 renamed ``MultiIndex.labels`` to ``MultiIndex.codes``
        # and pandas 1.0 removed the old name, so prefer ``codes`` and only
        # fall back to ``labels`` for older releases.
        codes = None
        if isinstance(columns, pd.MultiIndex):
            codes = getattr(columns, 'codes', None)
        if codes is None:
            codes = getattr(columns, 'labels', None)
        codes = codes or [pd.RangeIndex(len(level)) for level in levels]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [
            (level, col_index.get('numpy_type', level.dtype))
            for level, col_index in zip_longest(
                levels, column_indexes, fillvalue={}
            )
        ]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]
        # Positional arguments work whether the second parameter is spelled
        # ``labels`` or ``codes``.
        columns = pd.MultiIndex(new_levels, codes, names=columns.names)
    axes = [columns, index]
    return _int.BlockManager(blocks, axes)