Exemplo n.º 1
0
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Build a pandas ``BlockManager`` from an Arrow table.

    When the table schema carries ``b'pandas'`` metadata, that metadata is
    used to pull the listed index columns out of the table and turn them into
    the row index, to map column names back to the names recorded in the
    metadata, and to rebuild the column index (level names and dtypes).
    Without it, the rows get a ``RangeIndex`` and the table's column names
    are used as they are.

    Parameters
    ----------
    options
        Forwarded unchanged to ``lib.table_to_blocks``.
    table
        The Arrow table to convert.
    memory_pool
        Forwarded unchanged to ``lib.table_to_blocks``.
    nthreads : int, default 1
        Forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    pandas.core.internals.BlockManager
        Manager built from the converted blocks with axes
        ``[columns, index]``.
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    column_indexes = []
    columns_metadata = None
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns_metadata = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table. The logical names are taken from the trailing
    # entries of the column metadata, one per index column.
    logical_index_names = []
    if has_pandas_metadata:
        logical_index_names = [
            c['name'] for c in columns_metadata[-len(index_columns):]
        ]

    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i == -1:
            # Index column named in the metadata is not in the table.
            continue

        col_pandas = table.column(i).to_pandas()
        values = col_pandas.values
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()

        index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
        index_names.append(
            backwards_compatible_index_name(raw_name, logical_name))
        block_table = block_table.remove_column(
            block_table.schema.get_field_index(raw_name))

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'],
                                 fastpath=True)
            block = _int.make_block(cat,
                                    placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr,
                                    placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype,
                                    fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    # Map each remaining table column name back to the name stored in the
    # metadata (keyed by its string form); names without a metadata entry are
    # kept unchanged.
    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        columns_name_dict = {
            str(x['name']): x['name'] for x in columns_metadata
        }
        columns_values = [columns_name_dict.get(y, y) for y in column_strings]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x, )

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:

        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata; levels
        # with no matching metadata entry keep their current dtype.
        levels_dtypes = [(level, col_index.get('numpy_type', level.dtype))
                         for level, col_index in zip_longest(
                             levels, column_indexes, fillvalue={})]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]

        columns = pd.MultiIndex(levels=new_levels,
                                labels=labels,
                                names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
Exemplo n.º 2
0
def table_to_blockmanager(table, nthreads=1):
    """Build a pandas ``BlockManager`` from an Arrow table.

    If the table schema carries ``b'pandas'`` metadata, the columns listed
    under its ``'index_columns'`` key are removed from the table and become
    the row index; otherwise the rows get a ``RangeIndex``.

    Parameters
    ----------
    table
        The Arrow table to convert.
    nthreads : int, default 1
        Forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    pandas.core.internals.BlockManager
        Manager built from the converted blocks with axes
        ``[column names, index]``.
    """
    import pandas.core.internals as _int
    from pyarrow.compat import DatetimeTZDtype
    import pyarrow.lib as lib

    block_table = table

    index_columns = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    if metadata is not None and b'pandas' in metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']

    # Pull each index column that is present out of the table and keep its
    # values for the row index.
    for name in index_columns:
        i = schema.get_field_index(name)
        if i == -1:
            # Index column named in the metadata is not in the table.
            continue

        col = table.column(i)
        index_name = (None if is_unnamed_index_level(name)
                      else name)
        values = col.to_pandas().values
        # Not every converted column is a NumPy array (for example a
        # categorical container), so only consult ``flags`` when it exists
        # instead of failing with AttributeError.
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()

        index_arrays.append(values)
        index_names.append(index_name)
        block_table = block_table.remove_column(
            block_table.schema.get_field_index(name)
        )

    result = lib.table_to_blocks(block_table, nthreads)

    # Wrap each converted item in the matching pandas Block: categorical for
    # dictionary items, tz-aware datetime for items with a timezone, and the
    # default block otherwise.
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=False, fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = DatetimeTZDtype('ns', tz=item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    axes = [
        [column.name for column in block_table.itercolumns()],
        index
    ]

    return _int.BlockManager(blocks, axes)
def table_to_blockmanager(table, nthreads=1):
    """Build a pandas ``BlockManager`` from an Arrow table.

    If the table schema carries ``b'pandas'`` metadata, the columns listed
    under its ``'index_columns'`` key are removed from the table and become
    the row index; otherwise the rows get a ``RangeIndex``.

    Parameters
    ----------
    table
        The Arrow table to convert.
    nthreads : int, default 1
        Forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    pandas.core.internals.BlockManager
        Manager built from the converted blocks with axes
        ``[column names, index]``.
    """
    import pandas.core.internals as _int
    from pyarrow.compat import DatetimeTZDtype
    import pyarrow.lib as lib

    block_table = table

    index_columns = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    if metadata is not None and b'pandas' in metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']

    # Pull each index column that is present out of the table and keep its
    # values for the row index.
    for name in index_columns:
        i = schema.get_field_index(name)
        if i == -1:
            # Index column named in the metadata is not in the table.
            continue

        col = table.column(i)
        index_name = (None if is_unnamed_index_level(name) else name)
        values = col.to_pandas().values
        # Not every converted column is a NumPy array (for example a
        # categorical container), so only consult ``flags`` when it exists
        # instead of failing with AttributeError.
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()

        index_arrays.append(values)
        index_names.append(index_name)
        block_table = block_table.remove_column(
            block_table.schema.get_field_index(name))

    result = lib.table_to_blocks(block_table, nthreads)

    # Wrap each converted item in the matching pandas Block: categorical for
    # dictionary items, tz-aware datetime for items with a timezone, and the
    # default block otherwise.
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=False,
                                 fastpath=True)
            block = _int.make_block(cat,
                                    placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = DatetimeTZDtype('ns', tz=item['timezone'])
            block = _int.make_block(block_arr,
                                    placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype,
                                    fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    axes = [[column.name for column in block_table.itercolumns()], index]

    return _int.BlockManager(blocks, axes)
Exemplo n.º 4
0
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Build a pandas ``BlockManager`` from an Arrow table.

    When the table schema carries ``b'pandas'`` metadata, that metadata is
    used to pull the listed index columns out of the table and turn them into
    the row index, and to rebuild the column index (level names and dtypes).
    Without it, the rows get a ``RangeIndex`` and the column index is built
    from the table's column names alone.

    Parameters
    ----------
    options
        Forwarded unchanged to ``lib.table_to_blocks``.
    table
        The Arrow table to convert.
    memory_pool
        Forwarded unchanged to ``lib.table_to_blocks``.
    nthreads : int, default 1
        Forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    pandas.core.internals.BlockManager
        Manager built from the converted blocks with axes
        ``[columns, index]``.
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    for name in index_columns:
        i = schema.get_field_index(name)
        if i == -1:
            # Index column named in the metadata is not in the table.
            continue

        col = table.column(i)
        index_name = None if is_unnamed_index_level(name) else name
        col_pandas = col.to_pandas()
        values = col_pandas.values
        # Not every converted column is a NumPy array (for example a
        # categorical container), so only consult ``flags`` when it exists
        # instead of failing with AttributeError.
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()

        index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
        index_names.append(index_name)
        block_table = block_table.remove_column(
            block_table.schema.get_field_index(name)
        )

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'], fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Create the column index

    # Construct the base index
    if not column_strings:
        columns = pd.Index(column_strings)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, column_strings)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:

        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata; levels
        # with no matching metadata entry keep their current dtype.
        levels_dtypes = [
            (level, col_index.get('numpy_type', level.dtype))
            for level, col_index in zip_longest(
                levels, column_indexes, fillvalue={}
            )
        ]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]
        columns = pd.MultiIndex(
            levels=new_levels,
            labels=labels,
            names=columns.names
        )
    axes = [columns, index]
    return _int.BlockManager(blocks, axes)