Exemplo n.º 1
0
def column_from_arrow_array(arrow_array):
    """Convert a pyarrow array into a column (numpy array or ColumnStringArrow).

    Dispatches on the arrow buffer layout: 2 buffers means a primitive array,
    3 buffers with a string type means a utf8 string array, 4 buffers with a
    ListType is handed to numpy directly.  Raises TypeError otherwise.
    """
    arrow_type = arrow_array.type
    buffers = arrow_array.buffers()
    n_buffers = len(buffers)
    if n_buffers == 2:
        # primitive layout: validity bitmap + data buffer
        return numpy_array_from_arrow_array(arrow_array)
    if n_buffers == 3 and isinstance(arrow_type, type(pyarrow.string())):
        # string layout: validity bitmap + int32 offsets + utf8 data
        bitmap_buffer, offset_buffer, data_buffer = arrow_array.buffers()
        if arrow_array.null_count == 0:
            # we drop any null_bitmap when there are no null counts
            null_bitmap = None
        else:
            null_bitmap = np.frombuffer(bitmap_buffer, 'uint8',
                                        len(bitmap_buffer))
        offsets = np.frombuffer(offset_buffer, np.int32,
                                len(offset_buffer) // 4)
        if data_buffer is None:
            string_bytes = np.array([], dtype='S1')
        else:
            string_bytes = np.frombuffer(data_buffer, 'S1', len(data_buffer))
        return ColumnStringArrow(offsets,
                                 string_bytes,
                                 len(arrow_array),
                                 null_bitmap=null_bitmap)
    if n_buffers == 4 and isinstance(arrow_type, pyarrow.lib.ListType):
        return np.array(arrow_array)
    raise TypeError('type unsupported: %r' % arrow_type)
Exemplo n.º 2
0
 def _map_column(self, column: h5py.Group, as_arrow=False):
     """Map an hdf5 column group to an in-memory column.

     String columns (attrs dtype == "str") are rebuilt from the 'indices',
     'data' and optional 'null_bitmap' datasets; everything else is mapped
     as a (possibly masked) numeric array.
     """
     data = column["data"]
     is_str = "dtype" in data.attrs and data.attrs["dtype"] == "str"
     if not is_str:
         # numeric path; versions > 1 may carry a separate mask dataset
         if self._version > 1 and 'mask' in column:
             return self._map_hdf5_array(data,
                                         column['mask'],
                                         as_arrow=as_arrow)
         return self._map_hdf5_array(data, as_arrow=as_arrow)
     # string path: offsets live in 'indices', utf8 bytes in 'data'
     indices = self._map_hdf5_array(column['indices'])
     byte_data = self._map_hdf5_array(data)
     if "null_bitmap" in column:
         null_bitmap = self._map_hdf5_array(column['null_bitmap'])
     else:
         null_bitmap = None
     if isinstance(indices, np.ndarray):
         # this is a real mmappable file
         return vaex.arrow.convert.arrow_string_array_from_buffers(
             byte_data, indices, null_bitmap)
     # if not a real mmappable array, we fall back to this,
     # maybe we can generalize this
     return ColumnStringArrow(indices, byte_data, null_bitmap=null_bitmap)
Exemplo n.º 3
0
 def mmap(self, mmap, file):
     """Rebind the backing arrays to memory-mapped views and rebuild to_array.

     From now on we only work with the mmapped arrays; USE_MMAP=False is not
     supported for strings yet.
     """
     self.array = h5mmap(mmap, file, self.array)
     self.index_array = h5mmap(mmap, file, self.index_array)
     if self.null_bitmap_array is not None:
         self.null_bitmap_array = h5mmap(mmap, file, self.null_bitmap_array)
     if isinstance(self.index_array, np.ndarray):
         # this is a real mmappable file: build an arrow array from the buffers
         arrow_like = vaex.arrow.convert.arrow_string_array_from_buffers(
             self.array, self.index_array, self.null_bitmap_array)
     else:
         arrow_like = ColumnStringArrow(self.index_array,
                                        self.array,
                                        null_bitmap=self.null_bitmap_array)
     # always normalize to a ColumnStringArrow wrapper
     self.to_array = ColumnStringArrow.from_arrow(arrow_like)
Exemplo n.º 4
0
def _export_column(dataset_input,
                   dataset_output,
                   column_name,
                   shuffle,
                   sort,
                   selection,
                   N,
                   order_array,
                   order_array_inverse,
                   progress_status,
                   parallel=True):
    """Copy a single column from dataset_input into dataset_output in chunks.

    Supports string columns (written through ColumnStringArrow string
    sequences), masked numeric arrays, datetime values (written as int64
    views), and optional shuffling/sorting driven by ``order_array`` /
    ``order_array_inverse``.  Progress is accumulated into
    ``progress_status.value`` under ``progress_lock`` and the loop exits
    early when ``progress_status.cancelled`` is set.

    NOTE(review): depends on module-level names not visible in this block
    (``max_length``, ``progress_lock``, ``logger``, ``_to_string_sequence``)
    — confirm they exist at module scope.
    """

    if 1:
        block_scope = dataset_input._block_scope(
            0, vaex.execution.buffer_size_default)
        to_array = dataset_output.columns[column_name]
        dtype = dataset_input.data_type(column_name, array_type='numpy')
        is_string = vaex.array_types.is_string_type(dtype)
        if is_string:
            assert isinstance(to_array,
                              pa.Array)  # we don't support chunked arrays here
            # TODO legacy: we still use ColumnStringArrow to write, find a way to do this with arrow
            to_array = ColumnStringArrow.from_arrow(to_array)
        if shuffle or sort:  # we need to create a in memory copy, otherwise we will do random writes which is VERY inefficient
            to_array_disk = to_array
            if np.ma.isMaskedArray(to_array):
                to_array = np.empty_like(to_array_disk)
            else:
                if vaex.array_types.is_string_type(dtype):
                    # we create an empty column copy
                    to_array = to_array._zeros_like()
                else:
                    to_array = np.zeros_like(to_array_disk)
        to_offset = 0  # we need this for selections
        to_offset_unselected = 0  # we need this for filtering
        count = len(
            dataset_input
        )  # if not selection else dataset_input.length_unfiltered()
        # TODO: if no filter, selection or mask, we can choose the quick path for str
        string_byte_offset = 0

        # stream the input column in chunks; i1/i2 are the source row bounds
        for i1, i2, values in dataset_input.evaluate(column_name,
                                                     chunk_size=max_length,
                                                     filtered=True,
                                                     parallel=parallel,
                                                     selection=selection,
                                                     array_type='numpy-arrow'):
            logger.debug("from %d to %d (total length: %d, output length: %d)",
                         i1, i2, len(dataset_input), N)
            no_values = len(values)
            if no_values:
                if is_string:
                    # for strings, we don't take sorting/shuffling into account when building the structure
                    to_column = to_array
                    from_sequence = _to_string_sequence(values)
                    to_sequence = to_column.string_sequence.slice(
                        to_offset, to_offset + no_values, string_byte_offset)
                    string_byte_offset += to_sequence.fill_from(from_sequence)
                    to_offset += no_values
                else:
                    fill_value = np.nan if dtype.kind == "f" else None
                    # assert np.ma.isMaskedArray(to_array) == np.ma.isMaskedArray(values), "to (%s) and from (%s) array are not of both masked or unmasked (%s)" %\
                    # (np.ma.isMaskedArray(to_array), np.ma.isMaskedArray(values), column_name)
                    if shuffle or sort:
                        # scatter into the permuted positions
                        target_set_item = order_array[i1:i2]
                    else:
                        # sequential append
                        target_set_item = slice(to_offset,
                                                to_offset + no_values)
                    if dtype.is_datetime:
                        # datetimes are written out via their int64 representation
                        values = values.view(np.int64)
                    if np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(
                            values):
                        to_array.data[target_set_item] = values.filled(
                            fill_value)
                        to_array.mask[target_set_item] = values.mask
                    elif not np.ma.isMaskedArray(
                            to_array) and np.ma.isMaskedArray(values):
                        to_array[target_set_item] = values.filled(fill_value)
                    else:
                        to_array[target_set_item] = values
                    to_offset += no_values

            with progress_lock:
                progress_status.value += i2 - i1
            if progress_status.cancelled:
                break
            #if not progress(progress_value / float(progress_total)):
            #    break
        if is_string:  # write out the last index
            to_column = to_array
            if selection:
                to_column.indices[to_offset] = string_byte_offset
            else:
                to_column.indices[count] = string_byte_offset
        if shuffle or sort:  # write to disk in one go
            if is_string:  # strings are sorted afterwards
                view = to_array.string_sequence.lazy_index(order_array_inverse)
                to_array_disk.string_sequence.fill_from(view)
            else:
                if np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(
                        to_array_disk):
                    to_array_disk.data[:] = to_array.data
                    to_array_disk.mask[:] = to_array.mask
                else:
                    to_array_disk[:] = to_array
Exemplo n.º 5
0
    def _load_columns(self, h5data, first=[]):
        """Populate self._columns and per-column metadata from an hdf5 group.

        Handles: csr_matrix groups (added via add_columns), string columns
        (rebuilt from 'indices' + 'data' + optional 'null_bitmap'), masked
        numeric columns, and metadata attributes (alias, ucd, description,
        unit, Amuse-style units).  Finally reorders self._columns to match
        the stored ``column_order`` attribute, keeping unlisted columns at
        the end.

        NOTE(review): ``first`` is unused in this body and is a mutable
        default argument — kept as-is to preserve the signature.
        """
        # print h5data
        # make sure x y x etc are first

        finished = set()
        if "description" in h5data.attrs:
            self.description = ensure_string(h5data.attrs["description"])
        # hdf5, or h5py doesn't keep the order of columns, so manually track that, also enables reordering later
        h5columns = h5data if self._version == 1 else h5data['columns']
        if "column_order" in h5columns.attrs:
            column_order = ensure_string(
                h5columns.attrs["column_order"]).split(",")
        else:
            column_order = []
        # for name in list(h5columns):
        #     if name not in column_order:
        #         column_order.append(name)
        # for column_name in column_order:
        # if column_name in h5columns and column_name not in finished:
        for group_name in list(h5columns):
            logger.debug('loading column: %s', group_name)
            group = h5columns[group_name]
            if 'type' in group.attrs:
                if group.attrs['type'] in ['csr_matrix']:
                    from scipy.sparse import csc_matrix, csr_matrix

                    # subclass that skips validation: the data comes from our
                    # own files and check_format would touch the mmapped arrays
                    class csr_matrix_nocheck(csr_matrix):
                        def check_format(self, *args, **kwargs):
                            pass

                    data = self._map_hdf5_array(group['data'])
                    indptr = self._map_hdf5_array(group['indptr'])
                    indices = self._map_hdf5_array(group['indices'])
                    #column_names = ensure_string(group.attrs["column_names"]).split(",")
                    # make sure we keep the original order
                    groups = [(name, value) for name, value in group.items()
                              if isinstance(value, h5py.Group)]
                    column_names = [None] * len(groups)
                    for name, column in groups:
                        column_names[column.attrs['column_index']] = name
                    matrix = csr_matrix_nocheck(
                        (data, indices, indptr),
                        shape=(len(indptr) - 1, len(column_names)))
                    # the matrix must keep referencing the mmapped buffers
                    assert matrix.data is data
                    # assert matrix.indptr is indptr
                    assert matrix.indices is indices
                    self.add_columns(column_names, matrix)
            else:
                column_name = group_name
                column = h5columns[column_name]
                if "alias" in column.attrs:
                    column_name = column.attrs["alias"]
                if "ucd" in column.attrs:
                    self.ucds[column_name] = ensure_string(column.attrs["ucd"])
                if "description" in column.attrs:
                    self.descriptions[column_name] = ensure_string(
                        column.attrs["description"])
                if "unit" in column.attrs:
                    try:
                        unitname = ensure_string(column.attrs["unit"])
                        if unitname and unitname != "None":
                            self.units[column_name] = _try_unit(unitname)
                    except:
                        # best-effort: a bad unit should not prevent loading
                        logger.exception("error parsing unit: %s",
                                         column.attrs["unit"])
                if "units" in column.attrs:  # Amuse case
                    unitname = ensure_string(column.attrs["units"])
                    logger.debug("amuse unit: %s", unitname)
                    if unitname == "(0.01 * system.get('S.I.').base('length'))":
                        self.units[column_name] = astropy.units.Unit("cm")
                    if unitname == "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("cm/s")
                    if unitname == "(0.001 * system.get('S.I.').base('mass'))":
                        self.units[column_name] = astropy.units.Unit("gram")

                    if unitname == "system.get('S.I.').base('length')":
                        self.units[column_name] = astropy.units.Unit("m")
                    if unitname == "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("m/s")
                    if unitname == "system.get('S.I.').base('mass')":
                        self.units[column_name] = astropy.units.Unit("kg")
                data = column if self._version == 1 else column['data']
                if hasattr(data, "dtype"):
                    if "dtype" in data.attrs and data.attrs["dtype"] == "str":
                        # string column: offsets in 'indices', utf8 bytes in 'data'
                        indices = self._map_hdf5_array(column['indices'])
                        bytes = self._map_hdf5_array(data)
                        if "null_bitmap" in column:
                            null_bitmap = self._map_hdf5_array(
                                column['null_bitmap'])
                        else:
                            null_bitmap = None
                        if isinstance(
                                indices,
                                np.ndarray):  # this is a real mmappable file
                            self.add_column(
                                column_name,
                                vaex.arrow.convert.
                                arrow_string_array_from_buffers(
                                    bytes, indices, null_bitmap))
                        else:
                            # if not a reall mmappable array, we fall back to this, maybe we can generalize this
                            self.add_column(
                                column_name,
                                ColumnStringArrow(indices,
                                                  bytes,
                                                  null_bitmap=null_bitmap))
                    else:
                        shape = data.shape
                        if True:  # len(shape) == 1:
                            dtype = data.dtype
                            if "dtype" in data.attrs:
                                dtype = data.attrs["dtype"]
                            logger.debug("adding column %r with dtype %r",
                                         column_name, dtype)
                            # self.addColumn(column_name, offset, len(data), dtype=dtype)
                            if self._version > 1 and 'mask' in column:
                                self.add_column(
                                    column_name,
                                    self._map_hdf5_array(data, column['mask']))
                            else:
                                self.add_column(column_name,
                                                self._map_hdf5_array(data))
                        else:
                            # NOTE(review): dead branch ('if True' above always
                            # takes the other path); 'offset' is undefined here
                            transposed = shape[1] < shape[0]
                            self.addRank1(column_name,
                                          offset,
                                          shape[1],
                                          length1=shape[0],
                                          dtype=data.dtype,
                                          stride=1,
                                          stride1=1,
                                          transposed=transposed)
        all_columns = dict(**self._columns)
        # in case the column_order refers to non-existing columns
        column_order = [k for k in column_order if k in all_columns]
        column_names = []
        self._columns = {}
        for name in column_order:
            self._columns[name] = all_columns.pop(name)
        # add the rest
        for name, col in all_columns.items():
            self._columns[name] = col