示例#1
0
def column_from_arrow_array(arrow_array):
    """Convert an arrow array into a column, dispatching on its buffer layout.

    The number of buffers reported by ``arrow_array.buffers()`` selects the
    conversion:

    * 2 buffers: delegated to ``numpy_array_from_arrow_array``.
    * 3 buffers, and the array's type is an instance of the class of
      ``pyarrow.string()``: wrapped in a ``ColumnStringArrow`` built from the
      offsets and bytes buffers.
    * 4 buffers, and the type is a ``pyarrow.lib.ListType``: converted with
      ``np.array``.

    :param arrow_array: object exposing ``type``, ``buffers()``, ``null_count``
        and ``__len__`` (presumably a ``pyarrow.Array`` -- confirm with callers)
    :return: the converted column / array
    :raises TypeError: when none of the layouts above matches
    """
    arrow_type = arrow_array.type
    buffers = arrow_array.buffers()
    buffer_count = len(buffers)

    if buffer_count == 2:
        return numpy_array_from_arrow_array(arrow_array)

    if buffer_count == 3 and isinstance(arrow_type, type(pyarrow.string())):
        validity, raw_offsets, raw_bytes = buffers
        # The bitmap is dropped entirely when the array reports no nulls.
        null_bitmap = (None if arrow_array.null_count == 0 else
                       np.frombuffer(validity, 'uint8', len(validity)))
        # The offsets buffer is viewed as int32, i.e. 4 bytes per entry.
        offsets = np.frombuffer(raw_offsets, np.int32, len(raw_offsets) // 4)
        # A missing bytes buffer is replaced by an empty 'S1' array.
        string_bytes = (np.array([], dtype='S1') if raw_bytes is None else
                        np.frombuffer(raw_bytes, 'S1', len(raw_bytes)))
        return ColumnStringArrow(offsets,
                                 string_bytes,
                                 len(arrow_array),
                                 null_bitmap=null_bitmap)

    if buffer_count == 4 and isinstance(arrow_type, pyarrow.lib.ListType):
        return np.array(arrow_array)

    raise TypeError('type unsupported: %r' % arrow_type)
示例#2
0
文件: dataset.py 项目: t-triobox/vaex
 def _map_column(self, column: h5py.Group, as_arrow=False):
     """Map one stored column group to an in-memory column object.

     ``column["data"]`` is inspected: when its ``dtype`` attribute equals
     ``"str"`` the column is rebuilt as a string column from the ``indices``,
     ``data`` and optional ``null_bitmap`` members; otherwise the data (with
     its ``mask`` member for file versions above 1, when present) is handed
     to ``self._map_hdf5_array``.

     :param column: group holding at least a ``data`` member
     :param as_arrow: forwarded to ``self._map_hdf5_array`` for non-string
         columns only
     :return: the mapped column
     """
     data = column["data"]
     is_string = "dtype" in data.attrs and data.attrs["dtype"] == "str"

     if not is_string:
         if self._version > 1 and 'mask' in column:
             return self._map_hdf5_array(data,
                                         column['mask'],
                                         as_arrow=as_arrow)
         return self._map_hdf5_array(data, as_arrow=as_arrow)

     offsets = self._map_hdf5_array(column['indices'])
     string_bytes = self._map_hdf5_array(data)
     null_bitmap = (self._map_hdf5_array(column['null_bitmap'])
                    if "null_bitmap" in column else None)

     if isinstance(offsets, np.ndarray):  # this is a real mmappable file
         return vaex.arrow.convert.arrow_string_array_from_buffers(
             string_bytes, offsets, null_bitmap)
     # not a real mmappable array: fall back to the column wrapper
     # (maybe this can be generalized)
     return ColumnStringArrow(offsets, string_bytes, null_bitmap=null_bitmap)
示例#3
0
文件: writer.py 项目: t-triobox/vaex
 def mmap(self, mmap, file):
     """Re-map this writer's arrays via ``h5mmap`` and rebuild ``to_array``.

     ``self.array``, ``self.index_array`` and (when set)
     ``self.null_bitmap_array`` are each replaced by the result of
     ``h5mmap(mmap, file, <array>)``; ``self.to_array`` is then rebuilt
     from the re-mapped arrays.

     :param mmap: forwarded unchanged to ``h5mmap``
     :param file: forwarded unchanged to ``h5mmap``
     """
     # from now on, we only work with the mmapped array
     # we cannot support USE_MMAP=False for strings yet
     self.array = h5mmap(mmap, file, self.array)
     self.index_array = h5mmap(mmap, file, self.index_array)
     if self.null_bitmap_array is not None:
         self.null_bitmap_array = h5mmap(mmap, file, self.null_bitmap_array)

     if not isinstance(self.index_array, np.ndarray):
         # not a real mmappable file: wrap the arrays directly
         self.to_array = ColumnStringArrow(
             self.index_array,
             self.array,
             null_bitmap=self.null_bitmap_array)
     else:
         # this is a real mmappable file
         self.to_array = vaex.arrow.convert.arrow_string_array_from_buffers(
             self.array, self.index_array, self.null_bitmap_array)

     # NOTE(review): this conversion runs on both branches, including the
     # one where to_array is already a ColumnStringArrow -- confirm that
     # from_arrow accepts that input.
     self.to_array = ColumnStringArrow.from_arrow(self.to_array)
示例#4
0
    def _load_columns(self, h5data, first=[]):
        """Load all columns and their metadata from ``h5data`` into this object.

        For every member of the columns group this either

        * expands a group whose ``type`` attribute is ``csr_matrix`` into a
          sparse matrix registered through ``self.add_columns``, or
        * registers a regular column through ``self.add_column`` and records
          its optional ``ucd``, ``description`` and ``unit``/``units``
          attributes in ``self.ucds``, ``self.descriptions`` and
          ``self.units``.

        Finally ``self._columns`` is rebuilt so that names listed in the
        ``column_order`` attribute come first, in that order, followed by all
        remaining columns.

        :param h5data: group holding the columns directly (file version 1) or
            under its ``columns`` member (other versions)
        :param first: unused by this method; kept so existing callers that
            pass it keep working (it is never mutated)
        """
        if "description" in h5data.attrs:
            self.description = ensure_string(h5data.attrs["description"])
        # hdf5/h5py does not keep the order of columns, so it is tracked
        # manually through an attribute; this also enables reordering later.
        h5columns = h5data if self._version == 1 else h5data['columns']
        if "column_order" in h5columns.attrs:
            column_order = ensure_string(
                h5columns.attrs["column_order"]).split(",")
        else:
            column_order = []

        # Amuse files store units as expressions; map the known ones to the
        # unit strings handed to astropy.
        amuse_units = {
            "(0.01 * system.get('S.I.').base('length'))": "cm",
            "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))": "cm/s",
            "(0.001 * system.get('S.I.').base('mass'))": "gram",
            "system.get('S.I.').base('length')": "m",
            "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))": "m/s",
            "system.get('S.I.').base('mass')": "kg",
        }

        for group_name in list(h5columns):
            logger.debug('loading column: %s', group_name)
            group = h5columns[group_name]

            if 'type' in group.attrs:
                # Only csr_matrix groups are understood; a group with any
                # other 'type' attribute is skipped without loading anything.
                if group.attrs['type'] in ('csr_matrix',):
                    from scipy.sparse import csr_matrix

                    class csr_matrix_nocheck(csr_matrix):
                        # check_format is overridden with a no-op so building
                        # the matrix does not run scipy's format validation.
                        def check_format(self, *args, **kwargs):
                            pass

                    data = self._map_hdf5_array(group['data'])
                    indptr = self._map_hdf5_array(group['indptr'])
                    indices = self._map_hdf5_array(group['indices'])
                    # Column names come from the child groups; each one
                    # stores its position in 'column_index', which keeps the
                    # original order.
                    groups = [(name, value) for name, value in group.items()
                              if isinstance(value, h5py.Group)]
                    column_names = [None] * len(groups)
                    for name, column in groups:
                        column_names[column.attrs['column_index']] = name
                    matrix = csr_matrix_nocheck(
                        (data, indices, indptr),
                        shape=(len(indptr) - 1, len(column_names)))
                    # Sanity checks: the matrix must reference the mapped
                    # arrays themselves, not copies of them.
                    assert matrix.data is data
                    assert matrix.indices is indices
                    self.add_columns(column_names, matrix)
                continue

            column_name = group_name
            column = group
            if "alias" in column.attrs:
                column_name = column.attrs["alias"]
            if "ucd" in column.attrs:
                self.ucds[column_name] = ensure_string(column.attrs["ucd"])
            if "description" in column.attrs:
                self.descriptions[column_name] = ensure_string(
                    column.attrs["description"])
            if "unit" in column.attrs:
                # Unit parsing is best effort: a bad unit is logged and the
                # column is still loaded. Only Exception is caught so that
                # KeyboardInterrupt / SystemExit still propagate.
                try:
                    unitname = ensure_string(column.attrs["unit"])
                    if unitname and unitname != "None":
                        self.units[column_name] = _try_unit(unitname)
                except Exception:
                    logger.exception("error parsing unit: %s",
                                     column.attrs["unit"])
            if "units" in column.attrs:  # Amuse case
                unitname = ensure_string(column.attrs["units"])
                logger.debug("amuse unit: %s", unitname)
                amuse_unit = amuse_units.get(unitname)
                if amuse_unit is not None:
                    self.units[column_name] = astropy.units.Unit(amuse_unit)

            data = column if self._version == 1 else column['data']
            if not hasattr(data, "dtype"):
                # Members without a dtype are not loaded as columns.
                continue

            if "dtype" in data.attrs and data.attrs["dtype"] == "str":
                offsets = self._map_hdf5_array(column['indices'])
                string_bytes = self._map_hdf5_array(data)
                if "null_bitmap" in column:
                    null_bitmap = self._map_hdf5_array(column['null_bitmap'])
                else:
                    null_bitmap = None
                if isinstance(offsets, np.ndarray):
                    # this is a real mmappable file
                    string_column = (
                        vaex.arrow.convert.arrow_string_array_from_buffers(
                            string_bytes, offsets, null_bitmap))
                else:
                    # not a real mmappable array: fall back to the column
                    # wrapper (maybe this can be generalized)
                    string_column = ColumnStringArrow(
                        offsets, string_bytes, null_bitmap=null_bitmap)
                self.add_column(column_name, string_column)
            else:
                dtype = data.dtype
                if "dtype" in data.attrs:
                    dtype = data.attrs["dtype"]
                logger.debug("adding column %r with dtype %r",
                             column_name, dtype)
                if self._version > 1 and 'mask' in column:
                    self.add_column(
                        column_name,
                        self._map_hdf5_array(data, column['mask']))
                else:
                    self.add_column(column_name,
                                    self._map_hdf5_array(data))

        # Rebuild self._columns: names from column_order first, then the rest.
        remaining = dict(self._columns)
        ordered = {}
        for name in column_order:
            # column_order may refer to non-existing columns or repeat a
            # name; both are skipped.
            if name in remaining:
                ordered[name] = remaining.pop(name)
        ordered.update(remaining)
        self._columns = ordered