def update_histogram(
    self,
    created: Optional[bitmap] = None,
    updated: Optional[bitmap] = None,
    deleted: Optional[bitmap] = None,
) -> None:
    """Bring the per-bin bitmaps in line with a batch of row changes.

    Rows listed in `updated` or `deleted` are first removed from every bin.
    Rows listed in `created` or `updated` are then inserted into the bin
    that `np.digitize` selects for their current value in `self.column`.
    """
    created = bitmap.asbitmap(created)
    updated = bitmap.asbitmap(updated)
    deleted = bitmap.asbitmap(deleted)

    # Step 1: forget rows whose value changed or that no longer exist.
    if deleted or updated:
        stale = updated | deleted
        for slot, members in enumerate(self.bitmaps):
            self.bitmaps[slot] = members - stale

    # Step 2: (re)insert rows that are new or whose value changed.
    if not (created or updated):
        return
    fresh = created | updated
    row_ids = np.array(fresh, np.int64)
    row_values = self.column.loc[fresh]
    bin_of_row = np.digitize(row_values, self.bins)  # type: ignore
    # Only visit bins that actually received at least one row.
    occupied_bins = np.nonzero(np.bincount(bin_of_row))[0]
    for bin_no in occupied_bins:
        in_this_bin = bin_of_row == bin_no  # boolean mask over row_ids
        self.bitmaps[bin_no].update(row_ids[in_this_bin])
def _process_created(ret: Dict[str, Any]) -> None:
    """Integrate newly created rows of both inputs into `join_table`.

    This is a closure: besides `ret` it reads `created`, `how`, `first_key`,
    `second_key`, `first`, `second`, `first_cols`, `second_cols`,
    `join_table` and `dialog` from the enclosing scope. Bookkeeping is kept
    in `dialog.bag` under the keys "existing_ids", "first_orphans" and
    "second_orphans".

    `ret` is only forwarded to `_process_created_outer`, which takes over
    entirely when `how == "outer"`.
    """
    b = dialog.bag
    # Nothing was created on either side: nothing to do.
    if not created:
        return
    # Outer joins are delegated; the delegate's result is returned as is.
    if how == "outer":
        return _process_created_outer(ret)
    # if first_key not in created: return
    first_ids = created.get(first_key, None)
    second_ids = created.get(second_key, None)
    # Split the created ids into: only in first, in both, only in second.
    only_1st, common, only_2nd = inter_slice(first_ids, second_ids)
    assert isinstance(join_table, Table)
    if first_ids is not None:
        new_size = _len(first_ids)
        if (
            isinstance(first_ids, slice)
            and join_table.is_identity
            and (
                join_table.last_id + 1 == first_ids.start
                or join_table.last_id == 0
            )
        ):
            # the nice case (no gaps)
            join_table.resize(new_size)
        else:  # there are gaps ...we have to keep trace of existing ids
            join_table.resize(new_size, index=bitmap.asbitmap(first_ids))
            if b.get("existing_ids", None) is None:
                # First time gaps are seen: start from the table's index.
                b["existing_ids"] = bitmap.asbitmap(join_table.index)
            else:
                b["existing_ids"] = bitmap.union(
                    b["existing_ids"], bitmap.asbitmap(first_ids)
                )
        # Copy the first table's columns for every newly created first id.
        join_table.loc[_fix(first_ids), first_cols] = first.loc[
            _fix(first_ids), first.columns
        ]
    # Ids created on both sides in this step get the second table's columns.
    if not _void(common):
        join_table.loc[_fix(common), second_cols] = second.loc[
            _fix(common), second.columns
        ]
    # first matching: older orphans on the second table with new orphans on the first
    only_1st_bm = bitmap.asbitmap(only_1st)
    paired = b["second_orphans"] & only_1st_bm
    if paired:
        join_table.loc[paired, second_cols] = second.loc[paired, second.columns]
        b["second_orphans"] = b["second_orphans"] - paired
        only_1st_bm -= paired
    # Whatever is left of only_1st is still waiting for its second-table row.
    b["first_orphans"] = bitmap.union(b["first_orphans"], only_1st_bm)
    # 2nd matching: older orphans on the first table with new orphans on the second
    only_2nd_bm = bitmap.asbitmap(only_2nd)
    paired = b["first_orphans"] & only_2nd_bm
    if paired:
        join_table.loc[paired, second_cols] = second.loc[paired, second.columns]
        b["first_orphans"] = b["first_orphans"] - paired
        only_2nd_bm -= paired
    # Whatever is left of only_2nd is still waiting for its first-table row.
    b["second_orphans"] = bitmap.union(b["second_orphans"], only_2nd_bm)
def resize(self, newsize: int, index: Optional[Union[bitmap, List[int]]] = None) -> None:
    """Resize the table so that it holds `newsize` active rows.

    When `index` is given it is converted to a bitmap, and `newsize` is
    raised (with a warning) if it is too small to contain the largest id of
    `index`. Rows that disappear from `self._index` are reported through
    `add_deleted`. The stored row count and the columns are only resized
    when the table does not shrink.
    """
    # NB: newsize is the number of active rows the table must contain
    if index is not None:
        index = bitmap.asbitmap(index)
        newsize_ = (index.max() + 1) if index else 0
        if newsize < newsize_:
            logger.warning(f"Wrong newsize={newsize}, fixed to {newsize_}")
            newsize = newsize_
    assert newsize is not None

    # From here on `newsize` is a physical size derived from the last id.
    growth = newsize - len(self.index)
    newsize = self.last_id + growth + 1

    # Snapshot the index so the rows removed by the resize can be detected.
    rows_before = bitmap(self._index)
    self._resize_rows(newsize, index)
    removed_rows = rows_before - self._index
    if removed_rows:
        self.add_deleted(removed_rows)

    if growth >= 0:
        self._storagegroup.attrs[metadata.ATTR_NROWS] = newsize
        assert newsize is not None
        for column in self._columns:
            cast(Column, column)._resize(newsize)
def restricted_range_query(
    self, lower: float, upper: float, only_locs: Any, approximate: bool = APPROX
) -> bitmap:
    """
    Return the rows of `only_locs` whose value lies in [`lower`, `upper`[.

    The bounds are swapped if given in the wrong order. Bins strictly
    between the two boundary bins are taken whole. The boundary bins are
    filtered value by value, unless `approximate` is true, in which case
    they are skipped altogether.
    """
    if lower > upper:
        lower, upper = upper, lower
    only_locs = bitmap.asbitmap(only_locs)
    pos_lo, pos_up = np.digitize([lower, upper], self.bins)  # type: ignore

    # Bins entirely inside the range need no per-value check.
    inner_bins = [bm & only_locs for bm in self.bitmaps[pos_lo + 1 : pos_up]]
    result = bitmap.union(*inner_bins)
    if approximate:
        return result

    boundary = bitmap()
    lo_ids = np.array(self.bitmaps[pos_lo] & only_locs, np.int64)
    lo_values = self.column.loc[lo_ids]
    if pos_lo == pos_up:
        # Both bounds fall in the same bin: apply them together.
        boundary.update(lo_ids[(lower <= lo_values) & (lo_values < upper)])
    else:
        boundary.update(lo_ids[lower <= lo_values])
        up_ids = np.array(self.bitmaps[pos_up] & only_locs, np.int64)
        up_values = self.column.loc[up_ids]
        boundary.update(up_ids[up_values < upper])
    result.update(boundary)
    return result
def restricted_query(
    self,
    operator_: Callable[[Any, Any], int],
    limit: Any,
    only_locs: Any,
    approximate: bool = APPROX,
) -> bitmap:  # blocking...
    """
    Return the rows of `only_locs` for which `operator_(value, limit)` holds.

    The bin containing `limit` is filtered value by value unless
    `approximate` is true, in which case it is skipped. For `operator.lt`
    and `operator.le` all bins below it are taken whole; for any other
    operator all bins above it are taken whole.
    """
    only_locs = bitmap.asbitmap(only_locs)
    assert self.bins is not None
    pos = np.digitize(limit, self.bins)
    result = bitmap()

    if not approximate:
        # Exact check restricted to the single bin that straddles `limit`.
        candidates = np.array(self.bitmaps[pos] & only_locs, np.int64)
        keep = operator_(self.column.loc[candidates], limit)
        result.update(candidates[keep])

    if operator_ in (operator.lt, operator.le):
        whole_bins = self.bitmaps[:pos]
    else:
        whole_bins = self.bitmaps[pos + 1 :]
    for bm in whole_bins:
        result.update(bm & only_locs)
    return result
def restricted_range_query(self, lower, upper, only_locs, approximate=APPROX):
    """
    Return the rows of `only_locs` whose value lies in [`lower`, `upper`[.

    The bounds are swapped if given in the wrong order. Unless `approximate`
    is true, the bin(s) containing the bounds are filtered value by value.
    Bins strictly between them are always taken whole.
    """
    if lower > upper:
        lower, upper = upper, lower
    only_locs = bitmap.asbitmap(only_locs)
    first_bin, last_bin = np.digitize([lower, upper], self.bins)
    found = bitmap()

    if not approximate:
        ids = np.array(self.bitmaps[first_bin] & only_locs, np.int64)
        values = self.column.loc[ids]
        if first_bin == last_bin:
            # Both bounds fall in the same bin: apply them together.
            found.update(ids[(lower <= values) & (values < upper)])
        else:
            found.update(ids[lower <= values])
            ids = np.array(self.bitmaps[last_bin] & only_locs, np.int64)
            values = self.column.loc[ids]
            found.update(ids[values < upper])

    # Bins entirely inside the range need no per-value check.
    for bm in self.bitmaps[first_bin + 1:last_bin]:
        found.update(bm & only_locs)
    return found
def update_histogram(self, created, updated=(), deleted=()):
    """Update the histogram index after a batch of row changes.

    Rows in `updated` or `deleted` are removed from every bin. Rows in
    `created` or `updated` are then inserted into the bin that
    `np.digitize` selects for their current value in `self.column`.

    Each argument is passed through `bitmap.asbitmap`.
    """
    created = bitmap.asbitmap(created)
    updated = bitmap.asbitmap(updated)
    deleted = bitmap.asbitmap(deleted)
    if deleted or updated:
        to_remove = updated | deleted
        for i, bm in enumerate(self.bitmaps):
            bm -= to_remove
            # Store the result back: if the bitmap type does not implement
            # in-place subtraction, `-=` only rebinds the loop variable and
            # the removal would otherwise be lost. When `-=` is in-place this
            # assignment is a no-op.
            self.bitmaps[i] = bm
    if created or updated:
        to_add = created | updated
        ids = np.array(to_add, np.int64)
        values = self.column.loc[to_add]
        bins = np.digitize(values, self.bins)
        counts = np.bincount(bins)
        # Only visit bins that actually received at least one row.
        for i in np.nonzero(counts)[0]:
            selection = bins == i  # boolean mask of values in bin i
            self.bitmaps[i].update(ids[selection])  # add them to the bitmap
def _allocate(self, count: int, index: Optional[Union[bitmap, List[int]]] = None) -> bitmap:
    """Allocate rows and return their ids as a bitmap.

    When `index` is None, `count` consecutive ids following `self.last_id`
    are allocated; otherwise the ids given in `index` are used. The ids are
    registered through `add_created`, the stored row count and every column
    are resized to fit the largest id, and the row index is resized last.
    """
    start = self.last_id + 1
    index = (
        bitmap(range(start, start + count))
        if index is None
        else bitmap.asbitmap(index)
    )
    # An empty allocation (count == 0 or an empty `index`) has no maximum id;
    # the current size is kept in that case instead of failing on max().
    highest_id = max(index.max(), self.last_id) if index else self.last_id
    newsize = highest_id + 1
    self.add_created(index)
    self._storagegroup.attrs[metadata.ATTR_NROWS] = newsize
    for column in self._columns:
        col = cast(Column, column)
        col._resize(newsize)
    self._resize_rows(newsize, index)
    return index
def __getitem__(self, key):
    """Return the part of the table selected by `key`.

    `key` is split by `self.parse_key` into a row index and a column key:

    - an integer index returns the row, or `row[col_key]` when a column
      key other than `slice(None)` was given;
    - a slice with step None or 1 returns a `TableSlicedView`;
    - any other iterable index returns a `TableSelectedView` over the
      matching ids of the table index.

    Raises:
        ValueError: if the index is of none of the kinds above.
    """
    index, col_key, _ = self.parse_key(key)
    if isinstance(index, integer_types):
        row = self._table.row(index)
        if col_key != slice(None):
            return row[col_key]
        return row
    if isinstance(index, slice) and index.step in (None, 1):
        from .table_sliced import TableSlicedView

        return TableSlicedView(self._table, index, col_key)
    elif isinstance(index, Iterable):
        from .table_selected import TableSelectedView

        selection = bitmap.asbitmap(self._table.index[index])
        return TableSelectedView(self._table, selection, col_key, self._table.name)
    # Format the message here: exceptions, unlike logging calls, do not
    # interpolate extra positional arguments into the format string.
    raise ValueError('getitem not implemented for index "%s"' % (index,))
def restricted_query(self, operator_, limit, only_locs, approximate=APPROX):  # blocking...
    """
    Return the rows of `only_locs` for which `operator_(value, limit)` holds.

    The bin containing `limit` is filtered value by value unless
    `approximate` is true, in which case it is skipped. For `operator.lt`
    and `operator.le` all bins below it are taken whole; for any other
    operator all bins above it are taken whole.
    """
    only_locs = bitmap.asbitmap(only_locs)
    pos = np.digitize(limit, self.bins)
    matches = bitmap()

    if not approximate:
        # Exact check restricted to the single bin that straddles `limit`.
        edge_ids = np.array(self.bitmaps[pos] & only_locs, np.int64)
        edge_values = self.column.loc[edge_ids]
        matches.update(edge_ids[operator_(edge_values, limit)])

    wants_lower_side = operator_ in (operator.lt, operator.le)
    whole_bins = self.bitmaps[:pos] if wants_lower_side else self.bitmaps[pos + 1:]
    for bm in whole_bins:
        matches.update(bm & only_locs)
    return matches