def mmap(self, mmap, file):
    """Re-bind this string column's backing buffers to a memory-mapped file.

    Replaces ``self.array`` (string bytes), ``self.index_array`` (offsets) and,
    when present, ``self.null_bitmap_array`` with mmapped views obtained via
    ``h5mmap``, then builds ``self.to_array`` as the user-facing Arrow-backed
    column over those buffers.

    :param mmap: the memory map object passed through to ``h5mmap``
    :param file: the open file backing the memory map
    """
    # from now on, we only work with the mmapped array
    # we cannot support USE_MMAP=False for strings yet
    self.array = h5mmap(mmap, file, self.array)
    self.index_array = h5mmap(mmap, file, self.index_array)
    if self.null_bitmap_array is not None:
        self.null_bitmap_array = h5mmap(mmap, file, self.null_bitmap_array)
    if isinstance(self.index_array, np.ndarray):  # this is a real mmappable file
        # wrap the raw buffers directly into an Arrow string array
        self.to_array = vaex.arrow.convert.arrow_string_array_from_buffers(self.array, self.index_array, self.null_bitmap_array)
    else:
        # non-ndarray buffers (presumably not truly mmappable — TODO confirm):
        # fall back to the legacy ColumnStringArrow wrapper
        self.to_array = ColumnStringArrow(self.index_array, self.array, null_bitmap=self.null_bitmap_array)
    # if not isinstance(to_array, ColumnStringArrow):
    # normalize both branches to a ColumnStringArrow for downstream writers
    self.to_array = ColumnStringArrow.from_arrow(self.to_array)
def _export_column(dataset_input, dataset_output, column_name, shuffle, sort, selection, N, order_array, order_array_inverse, progress_status, parallel=True):
    """Copy one column from ``dataset_input`` into the pre-allocated output column
    of ``dataset_output``, chunk by chunk.

    Strings are written sequentially into a ``ColumnStringArrow`` structure
    (offsets + bytes) regardless of ordering; numeric/masked data is written
    either sequentially or scattered via ``order_array`` when ``shuffle`` or
    ``sort`` is requested. When reordering, work happens in an in-memory copy
    which is flushed to the on-disk array in one pass at the end.

    :param dataset_input: source dataset; evaluated chunk-wise
    :param dataset_output: target dataset; ``columns[column_name]`` must already exist
    :param column_name: name of the column to export
    :param shuffle: if truthy, write rows in the order given by ``order_array``
    :param sort: if truthy, same in-memory staging path as ``shuffle``
    :param selection: selection passed to ``evaluate``; also changes how the
        final string offset index is written
    :param N: output length (used for logging only in this view)
    :param order_array: row positions used as scatter indices when shuffling/sorting
    :param order_array_inverse: inverse permutation, used to lazily reorder strings
        before the final disk write
    :param progress_status: shared object with ``.value`` (rows done) and
        ``.cancelled``; updated under ``progress_lock``
    :param parallel: forwarded to ``dataset_input.evaluate``
    """
    if 1:  # kept for historical indentation; always true
        block_scope = dataset_input._block_scope(0, vaex.execution.buffer_size_default)
        to_array = dataset_output.columns[column_name]
        dtype = dataset_input.data_type(column_name, array_type='numpy')
        is_string = vaex.array_types.is_string_type(dtype)
        if is_string:
            assert isinstance(to_array, pa.Array)  # we don't support chunked arrays here
            # TODO legacy: we still use ColumnStringArrow to write, find a way to do this with arrow
            to_array = ColumnStringArrow.from_arrow(to_array)
        if shuffle or sort:  # we need to create a in memory copy, otherwise we will do random writes which is VERY inefficient
            # keep a handle on the real (disk-backed) array; stage writes in memory
            to_array_disk = to_array
            if np.ma.isMaskedArray(to_array):
                to_array = np.empty_like(to_array_disk)
            else:
                if vaex.array_types.is_string_type(dtype):
                    # we create an empty column copy
                    to_array = to_array._zeros_like()
                else:
                    to_array = np.zeros_like(to_array_disk)
        to_offset = 0  # we need this for selections
        to_offset_unselected = 0  # we need this for filtering  # NOTE(review): not used elsewhere in this view
        count = len(dataset_input)  # if not selection else dataset_input.length_unfiltered()
        # TODO: if no filter, selection or mask, we can choose the quick path for str
        string_byte_offset = 0  # running byte position in the string data buffer

        for i1, i2, values in dataset_input.evaluate(column_name, chunk_size=max_length, filtered=True, parallel=parallel, selection=selection, array_type='numpy-arrow'):
            logger.debug("from %d to %d (total length: %d, output length: %d)", i1, i2, len(dataset_input), N)
            no_values = len(values)
            if no_values:
                if is_string:
                    # for strings, we don't take sorting/shuffling into account when building the structure
                    to_column = to_array
                    from_sequence = _to_string_sequence(values)
                    # view into the output covering this chunk's rows and bytes
                    to_sequence = to_column.string_sequence.slice(to_offset, to_offset + no_values, string_byte_offset)
                    # fill_from returns the number of bytes written; advance the byte cursor
                    string_byte_offset += to_sequence.fill_from(from_sequence)
                    to_offset += no_values
                else:
                    # value used to fill masked entries; NaN only makes sense for floats
                    fill_value = np.nan if dtype.kind == "f" else None
                    # assert np.ma.isMaskedArray(to_array) == np.ma.isMaskedArray(values), "to (%s) and from (%s) array are not of both masked or unmasked (%s)" %\
                    #     (np.ma.isMaskedArray(to_array), np.ma.isMaskedArray(values), column_name)
                    if shuffle or sort:
                        # scatter this chunk's rows to their final positions
                        target_set_item = order_array[i1:i2]
                    else:
                        # plain append: next contiguous slice of the output
                        target_set_item = slice(to_offset, to_offset + no_values)
                    if dtype.is_datetime:
                        # store datetimes as their underlying int64 representation
                        values = values.view(np.int64)
                    if np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(values):
                        # both masked: copy data and mask separately
                        to_array.data[target_set_item] = values.filled(fill_value)
                        to_array.mask[target_set_item] = values.mask
                    elif not np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(values):
                        # masked source into plain target: collapse mask using fill_value
                        to_array[target_set_item] = values.filled(fill_value)
                    else:
                        to_array[target_set_item] = values
                    to_offset += no_values
            # report progress under the shared lock; honor cancellation
            with progress_lock:
                progress_status.value += i2 - i1
            if progress_status.cancelled:
                break
            #if not progress(progress_value / float(progress_total)):
            #    break
        if is_string:  # write out the last index
            to_column = to_array
            if selection:
                # with a selection, only to_offset rows were actually written
                to_column.indices[to_offset] = string_byte_offset
            else:
                to_column.indices[count] = string_byte_offset
        if shuffle or sort:  # write to disk in one go
            if is_string:  # strings are sorted afterwards
                # lazily reorder the staged strings, then stream into the disk column
                view = to_array.string_sequence.lazy_index(order_array_inverse)
                to_array_disk.string_sequence.fill_from(view)
            else:
                if np.ma.isMaskedArray(to_array) and np.ma.isMaskedArray(to_array_disk):
                    to_array_disk.data[:] = to_array.data
                    to_array_disk.mask[:] = to_array.mask
                else:
                    to_array_disk[:] = to_array