def run_step(self, run_number, step_size, howlong):
    """Run one step of the range selection.

    Pulls up to ``step_size`` deleted, created and updated indices from the
    'table' input slot, reads the current bound from the 'limit' input slot
    and hands both to ``self._impl`` (``start`` on the first call,
    ``resume`` afterwards).  The selection of ``self._table`` is then
    replaced by ``self._impl.result._values``.

    Returns the value of ``self._return_run_step``: blocked with zero steps
    when the table slot had nothing pending or the limit table is empty,
    otherwise ``self.next_state(...)`` with the number of indices consumed
    from the table slot.
    """
    table_slot = self.get_input_slot('table')
    table_slot.update(run_number)

    def drain(buffer):
        # Consume at most step_size pending indices; None when nothing waits.
        if not buffer.any():
            return None
        return buffer.next(step_size)

    # Same draining order as always: deleted, created, updated.
    deleted = drain(table_slot.deleted)
    created = drain(table_slot.created)
    updated = drain(table_slot.updated)
    steps = sum(indices_len(chunk)
                for chunk in (deleted, created, updated)
                if chunk is not None)

    with table_slot.lock:
        input_table = table_slot.data()
    if not self._table:
        self._table = TableSelectedView(input_table, bitmap([]))
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    params = self.params
    limit_slot = self.get_input_slot('limit')
    limit_slot.update(run_number)
    # Deletions on the limit slot are consumed but do not count as a change.
    if limit_slot.deleted.any():
        limit_slot.deleted.next()
    limit_changed = False
    for buffer in (limit_slot.updated, limit_slot.created):
        if buffer.any():
            buffer.next()
            limit_changed = True
    if len(limit_slot.data()) == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    if params.limit_key:
        limit_value = limit_slot.data().last(params.limit_key)
    else:
        limit_value = limit_slot.data().last()[0]

    if self._impl.is_started:
        self._impl.resume(limit_value, limit_changed,
                          created=created, updated=updated, deleted=deleted)
    else:
        self._impl.start(input_table, limit_value, limit_changed,
                         created=created, updated=updated, deleted=deleted)
    self._table.selection = self._impl.result._values
    return self._return_run_step(self.next_state(table_slot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Extend the pairwise-distance matrix with the newly created rows.

    Up to ``step_size`` created indices are taken from the 'df' slot.  The
    vectors come from the optional 'array' slot when it carries data,
    otherwise from the columns ``self.columns`` of the data frame.
    Distances among the new rows, and between previously seen rows
    (``self._last_index``) and the new rows, are written into the matrix
    returned by ``self._buf.resize``.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when no new row is available.
    """
    dfslot = self.get_input_slot("df")
    df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        logger.info("Reseting history because of changes in the input df")
        dfslot.update(run_number, df)
        # TODO: be smarter with changed values

    indices = dfslot.next_created(step_size)
    m = indices_len(indices)
    if m == 0:
        # Nothing new: computing distances on an empty batch would fail, so
        # block like the sibling modules do.
        return self._return_run_step(self.state_blocked, steps_run=0)

    i = None  # vectors already in the matrix
    j = None  # vectors of the new rows
    Si = self._buf.matrix()

    arrayslot = self.get_input_slot("array")
    if arrayslot is not None and arrayslot.data() is not None:
        array = arrayslot.data()
        logger.debug("Using array instead of DataFrame columns")
        if Si is not None:
            i = array[self._last_index]
        j = array[indices]

    if j is None:
        if self.columns is None:
            # Default to every column except the update marker.
            self.columns = df.columns.delete(
                np.where(df.columns == Module.UPDATE_COLUMN))
        elif not isinstance(self.columns, pd.Index):
            self.columns = pd.Index(self.columns)
        rows = df[self.columns]
        if Si is not None:
            i = rows.loc[self._last_index]
            assert len(i) == len(self._last_index)
        j = rows.loc[fix_loc(indices)]
        assert len(j) == indices_len(indices)

    Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
    if Si is None:
        # First batch: the matrix is just the new block.
        mat = self._buf.resize(Sj.shape[0])
        mat[:, :] = Sj
        self._last_index = dfslot.last_index[indices]
    else:
        Sij = pairwise_distances(i, j, metric=self._metric,
                                 n_jobs=self._n_jobs)
        n0 = i.shape[0]
        n1 = n0 + j.shape[0]
        mat = self._buf.resize(n1)
        # Fill the two off-diagonal blocks and the new diagonal block.
        mat[0:n0, n0:n1] = Sij
        mat[n0:n1, 0:n0] = Sij.T
        mat[n0:n1, n0:n1] = Sj
        self._last_index = self._last_index.append(df.index[indices])
    return self._return_run_step(dfslot.next_state(), steps_run=m)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Run one step of the join between the 'first' and 'second' inputs.

    Any deletion on either slot resets both slots, empties the result table
    (when one exists) and resets the join dialog.  Up to ``step_size``
    created and updated indices are then pulled from each slot and passed to
    ``join_start`` (first call) or ``join_cont`` (later calls), keyed by
    "table" for the first input and "other" for the second.

    Returns the value of ``self._return_run_step`` with the total number of
    indices consumed.
    """
    first_slot = self.get_input_slot("first")
    second_slot = self.get_input_slot("second")

    if first_slot.deleted.any() or second_slot.deleted.any():
        first_slot.reset()
        second_slot.reset()
        if self.result is not None:
            self.table.resize(0)
        join_reset(self._dialog)
        first_slot.update(run_number)
        second_slot.update(run_number)

    sides = (("table", first_slot), ("other", second_slot))
    pending = {"created": {}, "updated": {}}
    steps = 0
    # Order matters for the slot buffers: created (first, second), then
    # updated (first, second).
    for kind in ("created", "updated"):
        for key, slot in sides:
            buffer = getattr(slot, kind)
            if buffer.any():
                indices = buffer.next(length=step_size)
                steps += indices_len(indices)
                pending[kind][key] = indices

    first_table = first_slot.data()
    second_table = second_slot.data()
    if self._dialog.is_started:
        join_cont(
            first_table,
            second_table,
            dialog=self._dialog,
            created=pending["created"],
            updated=pending["updated"],
        )
    else:
        join_start(
            first_table,
            second_table,
            dialog=self._dialog,
            created=pending["created"],
            updated=pending["updated"],
            **self.join_kwds
        )
    return self._return_run_step(self.next_state(first_slot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Run one step of the join between the 'first' and 'second' inputs.

    Both slots are refreshed first.  A deletion on either one resets both
    slots, empties ``self._table`` (when it exists) and resets the join
    dialog.  Pending created and updated indices (at most ``step_size`` per
    buffer) are forwarded to ``join_start`` on the first call and to
    ``join_cont`` afterwards, under the keys "table" (first input) and
    "other" (second input).

    Returns the value of ``self._return_run_step`` with the number of
    indices consumed.
    """
    first_slot = self.get_input_slot('first')
    first_slot.update(run_number)
    second_slot = self.get_input_slot('second')
    second_slot.update(run_number)

    if first_slot.deleted.any() or second_slot.deleted.any():
        first_slot.reset()
        second_slot.reset()
        if self._table is not None:
            self._table.resize(0)
        join_reset(self._dialog)
        first_slot.update(run_number)
        second_slot.update(run_number)

    def collect(kind):
        # Pull pending `kind` indices from the first, then the second slot.
        found = {}
        count = 0
        for key, slot in (("table", first_slot), ("other", second_slot)):
            buffer = getattr(slot, kind)
            if buffer.any():
                found[key] = buffer.next(step_size)
                count += indices_len(found[key])
        return found, count

    created, n_created = collect("created")
    updated, n_updated = collect("updated")
    steps = n_created + n_updated

    with first_slot.lock:
        first_table = first_slot.data()
    with second_slot.lock:
        second_table = second_slot.data()

    if self._dialog.is_started:
        join_cont(first_table, second_table, dialog=self._dialog,
                  created=created, updated=updated)
    else:
        join_start(first_table, second_table, dialog=self._dialog,
                   created=created, updated=updated, **self.join_kwds)
    return self._return_run_step(self.next_state(first_slot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Add the number of newly created input rows to the 'counter' cell.

    An update or deletion on the 'table' slot resets the slot and empties
    the output table.  The output table holds a single 'counter' column; it
    is created on first use, refilled after a reset, and incremented in
    place otherwise.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when no created row is pending.
    """
    slot = self.get_input_slot('table')
    slot.update(run_number)
    if slot.updated.any() or slot.deleted.any():
        slot.reset()
        if self._table is not None:
            self._table.resize(0)
        slot.update(run_number)

    created = slot.created.next(step_size)  # returns a slice
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    # Result unused; the call is kept so the sequence of slot accesses is
    # exactly the one the module always performed.
    slot.data()

    increment = pd.DataFrame(dict(counter=steps), index=[0])
    table = self._table
    if table is None:
        self._table = Table(self.generate_table_name('counter'),
                            data=increment,
                            create=True)
    elif len(table) == 0:
        # Emptied by a reset: start counting again from this batch.
        table.append(increment)
    else:
        table['counter'].loc[0] += steps
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Fold the min/max of the newly created rows into the stats table.

    The previous min and max are read from the last row of ``self.table``
    unless the input slot reports updates or deletions, in which case the
    slot is reset and the running values restart from NaN.  The min and max
    of column ``self._column`` over the new rows are combined with the
    previous values (NaN-aware) and stored in the row indexed by
    ``run_number``.

    Returns the value of ``self._return_run_step`` with the number of rows
    consumed.
    """
    prev_min = prev_max = np.nan
    dfslot = self.get_input_slot("table")
    assert dfslot is not None
    df = None
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        dfslot.update(run_number)
    else:
        df = self.table
        prev = df.last_id
        if prev > 0:
            prev_min = df.at[prev, self._min_column]
            prev_max = df.at[prev, self._max_column]
    indices = dfslot.created.next(length=step_size)  # returns a slice
    input_df = dfslot.data()
    steps = indices_len(indices)
    if steps > 0:
        if df is None:
            # Reset path: the table was never bound above, which used to
            # raise UnboundLocalError as soon as new rows arrived.
            df = self.table
        x = input_df.to_array(locs=fix_loc(indices), columns=[self._column])
        new_min = np.nanmin(x)  # type: ignore
        new_max = np.nanmax(x)  # type: ignore
        row = {
            self._min_column: np.nanmin([prev_min, new_min]),  # type: ignore
            self._max_column: np.nanmax([prev_max, new_max]),  # type: ignore
        }
        if run_number in df.index:
            df.loc[run_number] = row
        else:
            df.add(row, index=run_number)
    return self._return_run_step(self.next_state(dfslot), steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Select the newly created input rows and optionally unselect old ones.

    The result is a ``TableSelectedView`` over the input table.  Created
    indices are added to its selection.  When ``self._delete_rows`` is set
    and ``self.test_delete_threshold`` accepts the ids present before this
    step, some of those ids are removed from the selection: a random sample
    of the given size (int), a random half ("half"), all of them ("all"),
    or the explicit collection stored in ``self._delete_rows``.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when nothing was created.
    """
    # The trailing "and False" keeps this override switched off.
    if self.params.fixed_step_size and False:
        step_size = self.params.fixed_step_size
    slot = self.get_input_slot("table")
    assert slot is not None
    if not slot.created.any():
        return self._return_run_step(self.state_blocked, steps_run=0)

    new_ids = slot.created.next(length=step_size, as_slice=False)
    steps = indices_len(new_ids)
    source = slot.data()
    if self.result is None:
        self.result = TableSelectedView(source, bitmap([]))

    # Snapshot of the ids visible before this step's additions.
    existing = bitmap(self.table.index)
    self.selected.selection |= new_ids

    rule = self._delete_rows
    if rule and self.test_delete_threshold(existing):
        if isinstance(rule, int):
            doomed = random.sample(tuple(existing), min(rule, len(existing)))
        elif rule == "half":
            doomed = random.sample(tuple(existing), len(existing) // 2)
        elif rule == "all":
            doomed = existing
        else:
            doomed = rule
        self.selected.selection -= bitmap(doomed)
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Maintain the running column-wise maximum of the 'df' input.

    An update or deletion on the slot discards the history.  The maximum of
    the new rows (restricted by ``self.filter_columns``) is combined with
    the last stored row and written at index ``run_number``; only the last
    ``self.params.history`` rows are kept.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when no created row is pending.
    """
    slot = self.get_input_slot('df')
    slot.update(run_number)
    if slot.has_updated() or slot.has_deleted():
        slot.reset()
        self._df = None
        slot.update(run_number)

    created = slot.next_created(step_size)  # returns a slice
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    chunk = self.filter_columns(slot.data(), fix_loc(created))
    chunk_max = chunk.max()
    if not chunk_max.index.equals(self._columns):
        # Non-numerical columns drop out of max(); remember what is left.
        self._columns = chunk_max.index
    chunk_max[self.UPDATE_COLUMN] = run_number

    if self._df is None:
        self._df = pd.DataFrame([chunk_max], index=[run_number])
    else:
        merged = pd.concat([last_row(self._df), chunk_max], axis=1).max(axis=1)
        # The max was also taken over the update column: restore it.
        merged[self.UPDATE_COLUMN] = run_number
        self._df.loc[run_number] = merged

    history = self.params.history
    if len(self._df) > history:
        self._df = self._df.loc[self._df.index[-history:]]
    return self._return_run_step(slot.next_state(), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Feed the newly created rows to the mini-batch k-means estimator.

    The module resets itself when the 'df' slot reports deletions or
    updates.  It blocks until at least ``self.mbk.n_clusters`` created rows
    are buffered, then fits the estimator batch by batch
    (``self.mbk.batch_size`` or 100 rows at a time).  The cluster centers
    are published in ``self._df`` and, when a label buffer exists, the
    labels in ``self._labels``.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when there is not enough data.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    if dfslot.has_deleted() or dfslot.has_updated():
        logger.debug('has deleted or updated, reseting')
        self.reset()
        dfslot.update(run_number)
    # Diagnostic goes through the logger rather than being printed to
    # stdout on every step.
    logger.debug('dfslot has buffered %d elements', dfslot.created_length())
    if dfslot.created_length() < self.mbk.n_clusters:
        # Should add more than k items per loop
        return self._return_run_step(self.state_blocked, steps_run=0)
    indices = dfslot.next_created(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = dfslot.data()
    X = self.filter_columns(input_df, fix_loc(indices)).values
    batch_size = self.mbk.batch_size or 100
    for batch in gen_batches(steps, batch_size):
        self.mbk.partial_fit(X[batch])
        if self._buffer is not None:
            df = pd.DataFrame({'labels': self.mbk.labels_})
            df[self.UPDATE_COLUMN] = run_number
            self._buffer.append(df)

    with self.lock:
        self._df = pd.DataFrame(self.mbk.cluster_centers_,
                                columns=self.columns)
        self._df[self.UPDATE_COLUMN] = run_number
        if self._buffer is not None:
            logger.debug('Setting the labels')
            self._labels = self._buffer.df()
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Update the t-digest with new rows and record the percentiles.

    An update or deletion on the 'df' slot resets the slot and replaces the
    digest with a fresh ``TDigest``.  The new rows (restricted by
    ``self.filter_columns``) are added to the digest, then one row holding
    each percentile of ``self._percentiles`` followed by ``run_number`` is
    stored at index ``run_number``.  Only the last ``self.params.history``
    rows are kept.

    Returns the value of ``self._return_run_step``; blocked when no created
    row is pending.
    """
    slot = self.get_input_slot('df')
    slot.update(run_number)
    if slot.has_updated() or slot.has_deleted():
        slot.reset()
        slot.update(run_number)
        self.tdigest = TDigest()  # reset

    created = slot.next_created(step_size)
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=steps)

    source = slot.data()
    with slot.lock:
        chunk = self.filter_columns(source, fix_loc(created))
        self.tdigest.batch_update(chunk)

    history_df = self._df
    row = [self.tdigest.percentile(p*100) for p in self._percentiles]
    row.append(run_number)
    with self.lock:
        history_df.loc[run_number] = row
        if len(history_df) > self.params.history:
            keep = history_df.index[-self.params.history:]
            self._df = history_df.loc[keep]
    return self._return_run_step(slot.next_state(),
                                 steps_run=steps,
                                 reads=steps,
                                 updates=len(self._df))
def run_step(self, run_number, step_size, howlong):
    """Append the running column-wise maximum of the 'table' input.

    An update or deletion on the slot resets it and empties the output
    table.  The maximum of the new rows is computed with ``keepdims=True``;
    when the output table already has rows, each column maximum is first
    combined with the value in the table's last row.  The result is then
    appended (or used to create the table on the first call).

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when no created row is pending.
    """
    slot = self.get_input_slot('table')
    slot.update(run_number)
    if slot.updated.any() or slot.deleted.any():
        slot.reset()
        if self._table is not None:
            self._table.resize(0)
        slot.update(run_number)

    created = slot.created.next(step_size)  # returns a slice
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    source = slot.data()
    op = self.filter_columns(source, fix_loc(created)).max(keepdims=True)

    if self._table is None:
        self._table = Table(self.generate_table_name('max'),
                            data=op,
                            create=True)
    else:
        if len(self._table) != 0:
            # Not just reset: fold the previous maxima into the new ones.
            previous = self._table.last()
            for name in previous:
                column_max = op[name]
                column_max[0] = np.maximum(column_max, previous[name])
        self._table.append(op)
    # TODO manage the history in a more efficient way
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step_progress(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Progressively select the ids created on the multiple input slots.

    All inputs must be views of the same physical table (asserted).  As
    soon as one slot reports an update or a deletion, that slot and every
    following one is reset and the current selection is emptied.  Ids
    created on every input are added to the selection directly; ids created
    on only some inputs are added when they belong to the index of every
    input table.

    Returns the value of ``self._return_run_step`` in the blocked state,
    with the number of steps performed (zero when nothing happened).
    """
    _b = bitmap.asbitmap
    # Must be a real list: a bare annotation leaves the name unbound and the
    # append below would raise UnboundLocalError.
    to_create: List[bitmap] = []
    steps = 0
    tables = []
    ph_table = None
    reset_ = False
    for name in self.get_input_slot_multiple():
        slot = self.get_input_slot(name)
        t = slot.data()
        assert isinstance(t, BaseTable)
        if ph_table is None:
            ph_table = _get_physical_table(t)
        else:
            assert ph_table is _get_physical_table(t)
        tables.append(t)
        if reset_ or slot.updated.any() or slot.deleted.any():
            slot.reset()
            reset_ = True
            steps += 1
        if slot.created.any():
            created = slot.created.next(step_size)
            to_create.append(_b(created))
            steps += indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    # Ids seen on every input are certain; the others need an index check.
    to_create_4sure = bitmap()
    if len(to_create) == len(tables):
        to_create_4sure = bitmap.intersection(*to_create)
    to_create_maybe = bitmap.union(*to_create)
    if not self.result:
        self.result = TableSelectedView(ph_table, bitmap([]))
    if reset_:
        self.selected.selection = bitmap([])
    self.selected.selection = self.selected.index | to_create_4sure
    to_create_maybe -= to_create_4sure
    eff_create = to_create_maybe
    for t in tables:
        eff_create &= t.index
    self.selected.selection = self.selected.index | eff_create
    return self._return_run_step(self.state_blocked, steps)
def to_array(
    self,
    locs: Indexer = None,
    columns: Optional[List[str]] = None,
    ret: Optional[np.ndarray[Any, Any]] = None,
) -> np.ndarray[Any, Any]:
    """Copy rows of this table into a 2-D numpy array.

    Parameters
    ----------
    locs:
        The rows to extract.  ``None`` means every row of ``self.index``; a
        slice is converted with ``self._slice_to_bitmap``; anything else is
        used as given.
    columns:
        The columns to extract, or ``None`` for all the table columns.
    ret:
        Optional output array.  It is reused only when it is a numpy array
        whose shape and dtype match the result exactly; otherwise it is
        ignored and a new array is allocated.

    Returns
    -------
    The filled array, one row per selected id; its width is the last value
    returned by ``self.column_offsets``.
    """
    if columns is None:
        columns = self.columns
    assert columns is not None
    shapes = [self[c].shape for c in columns]
    offsets = self.column_offsets(columns, shapes)
    dtype = self.columns_common_dtype(columns)

    # TODO split the copy in chunks
    if locs is None:
        indices = self.index
    elif isinstance(locs, slice):
        indices = self._slice_to_bitmap(locs)
    else:
        indices = locs

    wanted_shape = (indices_len(indices), offsets[-1])
    if (isinstance(ret, np.ndarray)
            and ret.shape == wanted_shape
            and ret.dtype == dtype):
        arr = ret
    else:
        arr = np.empty(wanted_shape, dtype=dtype)

    for i, (column, col_shape) in enumerate(zip(columns, shapes)):
        if len(col_shape) == 1:
            # Column with a 1-D shape: it fills a single output column.
            dest = np.s_[:, offsets[i]]
        else:
            dest = np.s_[:, offsets[i]:offsets[i + 1]]
        self._column(column).read_direct(arr, indices, dest_sel=dest)
    return arr
def run_step(self, run_number, step_size, howlong):
    """Track, per column, the index of the smallest value seen so far.

    ``self._min`` stores the running minimum values and ``self._table`` the
    matching row indices; both get one row per step, indexed by
    ``run_number``, and are trimmed to the last ``self.params.history``
    rows.  An update or deletion on the input slot restarts both tables.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when no created row is pending.
    """
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        # Drop both tables: keeping _min alone would send the next step
        # into the incremental branch with _table set to None.
        self._table = None
        self._min = None
        dfslot.update(run_number)
    indices = dfslot.created.next(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_table = dfslot.data()
    op = self.filter_columns(input_table, fix_loc(indices)).idxmin()
    if self._min is None:
        min_ = OrderedDict(zip(op.keys(), [np.nan] * len(op.keys())))
        for col, ix in op.items():
            # lookup value, is there a better way?
            min_[col] = input_table.at[ix, col]
        self._min = Table(self.generate_table_name('_min'),
                          dshape=input_table.dshape,
                          create=True)
        self._min.append(min_, indices=[run_number])
        self._table = Table(self.generate_table_name('_table'),
                            dshape=input_table.dshape,
                            create=True)
        self._table.append(op, indices=[run_number])
    else:
        prev_min = self._min.last()
        prev_idx = self._table.last()
        min_ = OrderedDict(prev_min.items())
        for col, ix in op.items():
            val = input_table.at[ix, col]
            improved = (not np.isnan(val)
                        and (np.isnan(min_[col]) or val < min_[col]))
            if improved:
                # The batch holds a new minimum: op[col] already names its
                # row, only the value needs recording.
                min_[col] = val
            else:
                # The previous minimum stands, so keep its row index.
                op[col] = prev_idx[col]
        with self.lock:
            self._table.append(op, indices=[run_number])
            self._min.append(min_, indices=[run_number])
            if len(self._table) > self.params.history:
                data = self._table.loc[
                    self._table.index[-self.params.history:]]
                self._table = Table(self.generate_table_name('_table'),
                                    data=data,
                                    create=True)
                data = self._min.loc[
                    self._min.index[-self.params.history:]]
                self._min = Table(self.generate_table_name('_min'),
                                  data=data,
                                  create=True)
    return self._return_run_step(self.next_state(dfslot), steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Filter the newly created rows of 'df' with the current query.

    The query expression is the value of column ``self._query_column`` in
    the last row of the optional 'query' input.  A newly created query
    resets the data slot and the output buffer so that everything is
    filtered again.  Without a query, or when evaluating it fails, the
    input data frame is passed through unchanged.

    Returns the value of ``self._return_run_step``, always in the blocked
    state, with the number of rows consumed.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3 has no `unicode` builtin

    query_slot = self.get_input_slot('query')
    df_slot = self.get_input_slot('df')
    if not query_slot:
        query = None
    else:
        query_df = query_slot.data()
        query_slot.update(run_number)
        if query_slot.has_created():  # ignore deleted and updated
            df_slot.reset()  # re-filter
            self._buffer.reset()
        query_slot.next_created()  # read it all
        with query_slot.lock:
            # get the query expression
            query = last_row(query_df)[self._query_column]
        if query is not None:
            if len(query) == 0:
                query = None
            else:
                query = text_type(query)  # make sure we have a string

    df_slot.update(run_number)
    if df_slot.has_deleted() or df_slot.has_updated():
        df_slot.reset()
        self._buffer.reset()
        df_slot.update(run_number)
    indices = df_slot.next_created(step_size)
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=steps)

    if query is None:  # nothing to query, just pass through
        logger.info('No query, passing data through')
        self._df = df_slot.data()
        return self._return_run_step(self.state_blocked, steps_run=steps)

    with df_slot.lock:
        new_df = df_slot.data().loc[fix_loc(indices)]
        try:
            # NOTE(review): the query text comes from the 'query' input and
            # is evaluated as an expression; confirm that input is trusted.
            selected_df = new_df.eval(query)
            if isinstance(selected_df, pd.Series):
                if selected_df.index.has_duplicates:
                    # Used to drop into pdb here, which stalls a
                    # non-interactive run; report it and carry on.
                    logger.warning('Query result has duplicate index labels')
                selected_df = new_df.loc[selected_df]
        except Exception as e:
            logger.error('Probably a syntax error in query expression: %s', e)
            self._df = df_slot.data()
            return self._return_run_step(self.state_blocked, steps_run=steps)
        selected_df.loc[:, self.UPDATE_COLUMN] = run_number
        self._buffer.append(selected_df)  # , ignore_index=False) TODO later
        self._df = self._buffer.df()
    return self._return_run_step(self.state_blocked, steps_run=steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Copy new input rows into the result table, then delete/update rows.

    The created rows of the 'table' input are appended to the result table
    (created on first use with the input's dshape).  Depending on
    ``self._delete_rows`` and ``self._update_rows``, rows are then deleted
    from the result and/or the column ``self._update_column`` of some rows
    is overwritten with random values.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when nothing was created.
    """
    # The trailing "and False" keeps this override switched off.
    if self.params.fixed_step_size and False:
        step_size = self.params.fixed_step_size
    input_slot = self.get_input_slot("table")
    assert input_slot is not None
    steps = 0
    if not input_slot.created.any():
        return self._return_run_step(self.state_blocked, steps_run=0)
    created = input_slot.created.next(length=step_size)
    steps = indices_len(created)
    self._steps += steps
    input_table = input_slot.data()
    if self.result is None:
        self.result = Table(
            self.generate_table_name("stirrer"),
            dshape=input_table.dshape,
        )
    raw_ids = self.table.index
    # NOTE(review): before_ is the object returned by self.table.index, not a
    # copy; whether the append below or the in-place "-=" further down
    # affects the table's own index depends on that property -- confirm.
    before_ = raw_ids  # bitmap(raw_ids[raw_ids >= 0])
    v = input_table.loc[fix_loc(created), :]
    self.table.append(v)  # indices=bitmap(created))
    delete = []
    if self._delete_rows and self.test_delete_threshold(before_):
        # Choose the victims: a random sample of the given size, a random
        # half, everything, or the explicit collection in _delete_rows.
        if isinstance(self._delete_rows, int):
            delete = random.sample(tuple(before_), min(self._delete_rows, len(before_)))
        elif self._delete_rows == "half":
            delete = random.sample(tuple(before_), len(before_) // 2)
        elif self._delete_rows == "all":
            delete = before_
        else:
            delete = self._delete_rows
        if delete and self.params.del_twice:
            # Delete in two halves; this path does not add to `steps`.
            mid = len(delete) // 2
            del self.table.loc[delete[:mid]]
            del self.table.loc[delete[mid:]]
        elif delete:
            steps += len(delete)
            del self.table.loc[delete]
    if self._update_rows and len(before_):
        # Rows deleted above are excluded from the update candidates.
        before_ -= bitmap(delete)
        if isinstance(self._update_rows, int):
            updated = random.sample(tuple(before_), min(self._update_rows, len(before_)))
        else:
            updated = self._update_rows
        v = np.random.rand(len(updated))
        if updated:
            steps += len(updated)
            self.table.loc[fix_loc(updated), [self._update_column]] = [v]
    return self._return_run_step(self.next_state(input_slot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Copy new input rows into the output table, then delete/update rows.

    The created rows of the 'table' input are appended to ``self._table``
    (created on first use with the input's dshape).  Depending on
    ``self._delete_rows`` and ``self._update_rows``, rows that existed
    before this step are then deleted and/or have the column
    ``self._update_column`` overwritten with random values.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when nothing was created.
    """
    # The trailing "and False" keeps this override switched off.
    if self.params.fixed_step_size and False:
        step_size = self.params.fixed_step_size
    input_slot = self.get_input_slot('table')
    input_slot.update(run_number)
    steps = 0
    if not input_slot.created.any():
        return self._return_run_step(self.state_blocked, steps_run=0)
    created = input_slot.created.next(step_size)
    steps = indices_len(created)
    with input_slot.lock:
        input_table = input_slot.data()
    p = self.params
    if self._table is None:
        self._table = Table(
            self.generate_table_name('dummy'),
            dshape=input_table.dshape,
        )
    raw_ids = self._table.index.values
    # Ids present before this step's append; negative raw ids are left out.
    before_ = bitmap(raw_ids[raw_ids >= 0])
    v = input_table.loc[fix_loc(created), :]
    self._table.append(v)  # indices=bitmap(created))
    delete = []
    if self._delete_rows and self.test_delete_threshold(before_):
        # Choose the victims: a random sample of the given size, a random
        # half, everything, or the explicit collection in _delete_rows.
        if isinstance(self._delete_rows, int):
            delete = random.sample(tuple(before_), min(self._delete_rows, len(before_)))
        elif self._delete_rows == 'half':
            delete = random.sample(tuple(before_), len(before_) // 2)
        elif self._delete_rows == 'all':
            delete = before_
        else:
            delete = self._delete_rows
        if self.params.del_twice:
            # Delete in two halves instead of a single statement.
            mid = len(delete) // 2
            del self._table.loc[delete[:mid]]
            del self._table.loc[delete[mid:]]
        else:
            del self._table.loc[delete]
    if self._update_rows and len(before_):
        # Rows deleted above are excluded from the update candidates.
        before_ -= bitmap(delete)
        if isinstance(self._update_rows, int):
            updated = random.sample(tuple(before_), min(self._update_rows, len(before_)))
        else:
            updated = self._update_rows
        v = np.random.rand(len(updated))
        if updated:
            self._table.loc[fix_loc(updated), [self._update_column]] = [v]
    return self._return_run_step(self.next_state(input_slot), steps_run=steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Expose the rows of 'table' selected by the 'labels' input.

    Pending created indices of both slots are consumed (at most
    ``step_size`` each) and their buffers cleared.  The result is a
    ``TableSelectedView`` over the table data whose selection is the
    selection of the labels data.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when neither slot had created indices.
    """
    assert self.context
    with self.context as ctx:
        table_slot = ctx.table
        labels_slot = ctx.labels

        table_ids = table_slot.created.next(length=step_size)  # a slice
        n_table = indices_len(table_ids)
        table_slot.clear_buffers()

        label_ids = labels_slot.created.next(length=step_size)  # a slice
        n_labels = indices_len(label_ids)
        labels_slot.clear_buffers()

        steps = n_table + n_labels
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=0)

        if self.result is not None:
            self.selected.selection = ctx.labels.data().selection
        else:
            self.result = TableSelectedView(ctx.table.data(),
                                            ctx.labels.data().selection)
        return self._return_run_step(self.next_state(ctx.table),
                                     steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Buffer created rows and propagate only large enough updates.

    Created rows of the 'df' input are appended to ``self._buffer`` and
    stamped with ``run_number``.  With the remaining step budget, updated
    rows are compared with the buffered copy: a row is rewritten only when
    the norm of the difference exceeds ``self._delta * self.get_scale()``.
    A deletion on the slot resets both the module and the slot.

    Returns the value of ``self._return_run_step`` with the number of
    created plus updated rows examined; blocked with zero steps when the
    input is missing or empty.
    """
    df_slot = self.get_input_slot('df')
    df_slot.update(run_number, buffer_created=True, buffer_updated=True)
    if df_slot.has_deleted():
        self.reset()
        df_slot.reset()
        df_slot.update(run_number)
    input_df = df_slot.data()
    # NOTE(review): the returned value is not used below; presumably the
    # call is kept for a side effect of get_columns -- confirm.
    columns = self.get_columns(input_df)
    if input_df is None or len(input_df)==0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    indices = df_slot.next_created(step_size)
    steps = indices_len(indices)
    # What is left of the budget can be spent on updated rows.
    step_size -= steps
    steps_run = steps
    if steps != 0:
        indices = fix_loc(indices)
        self._buffer.append(input_df.loc[indices])
        self._df = self._buffer.df()
        self._df.loc[indices,self.UPDATE_COLUMN] = run_number
    if step_size > 0 and df_slot.has_updated():
        indices = df_slot.next_updated(step_size,as_slice=False)
        steps = indices_len(indices)
        if steps != 0:
            steps_run += steps
            indices = fix_loc(indices) # no need, but stick to the stereotype
            updated = self.filter_columns(input_df, indices)
            df = self.filter_columns(self._df, indices)
            # Distance between the incoming and the buffered version of
            # each updated row.
            norms = row_norms(updated-df)
            selected = (norms > (self._delta*self.get_scale()))
            indices = indices[selected]
            if selected.any():
                logger.debug('updating at %d', run_number)
                self._df.loc[indices, self._columns] = updated.loc[indices, self._columns]
                self._df.loc[indices, self.UPDATE_COLUMN] = run_number
            else:
                logger.debug('Not updating at %d', run_number)
    return self._return_run_step(df_slot.next_state(), steps_run=steps_run)
def run_step(self,run_number,step_size,howlong):
    """Render the latest histogram of the 'array' input as an image.

    The 'array' cell of the last input row is passed through a cube root,
    converted with ``sp.misc.toimage`` and flipped top to bottom.  When
    ``self.params.filename`` is set the image is saved as a PNG (a ``%d``
    style pattern in the name receives ``run_number``) and only the file
    name is recorded; otherwise the image object itself is recorded.  One
    row ``[image, filename, run_number]`` is stored at index
    ``run_number`` and the history is trimmed to ``self.params.history``.

    Returns the value of ``self._return_run_step``, always blocked.

    Raises whatever saving the image raises, after logging it.
    """
    dfslot = self.get_input_slot('array')
    input_df = dfslot.data()
    dfslot.update(run_number)
    indices = dfslot.next_created()
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=1)
    with dfslot.lock:
        histo = input_df.at[input_df.index[-1], 'array']
    if histo is None:
        return self._return_run_step(self.state_blocked, steps_run=1)
    p = self.params
    # NaN bounds mean "not set" and become None for toimage.
    cmax = p.cmax
    if np.isnan(cmax):
        cmax = None
    cmin = p.cmin
    if np.isnan(cmin):
        cmin = None
    high = p.high
    low = p.low
    try:
        # NOTE(review): scipy.misc.toimage is absent from recent SciPy
        # releases -- confirm the pinned version still provides it.
        image = sp.misc.toimage(sp.special.cbrt(histo),
                                cmin=cmin, cmax=cmax,
                                high=high, low=low, mode='I')
        image = image.transpose(Image.FLIP_TOP_BOTTOM)
        filename = p.filename
    except Exception:
        # Still best-effort, but the failure is logged now and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        logger.exception('Cannot build image from histogram')
        image = None
        filename = None
    if filename is not None:
        try:
            if re.search(r'%(0[\d])?d', filename):
                filename = filename % (run_number)
            filename = self.storage.fullname(self, filename)
            image.save(filename, format='PNG', bits=16)
            logger.debug('Saved image %s', filename)
            image = None  # the file name stands in for the image
        except Exception:
            logger.error('Cannot save image %s', filename)
            raise

    values = [image, filename, run_number]
    with self.lock:
        df = self._df
        df.loc[run_number] = values
        if len(df) > p.history:
            self._df = df.loc[df.index[-p.history:]]
    return self._return_run_step(self.state_blocked,
                                 steps_run=1,
                                 reads=1,
                                 updates=1)
def to_array(self, locs=None, columns=None):
    """Convert this table to a numpy array

    Parameters
    ----------
    locs: a list of ids or None
        The rows to extract.  Locs can be specified with multiple formats:
        integer, list, numpy array, Iterable, or slice.  ``None`` selects
        the whole table.
    columns: a list or None
        the columns to extract; ``None`` selects all the table columns

    Returns
    -------
    A new 2-D array with one row per selected id, using the common dtype of
    the selected columns.
    """
    if columns is None:
        columns = self.columns

    shapes = [self[c].shape for c in columns]
    offsets = self.column_offsets(columns, shapes)
    dtypes = [self[c].dtype for c in columns]
    dtype = np.find_common_type(dtypes, [])
    indices = None
    # TODO split the copy in chunks
    if locs is None:
        if self._ids.has_freelist():
            indices = self._ids[:]
            # Shape comes from `indices` (locs is None in this branch),
            # `np.ones` is the real function name, and the builtin `bool`
            # replaces the removed `np.bool` alias.
            # NOTE(review): numpy masked arrays treat True as "masked", so
            # this hides every entry except the free-list ones -- confirm
            # the polarity is what read_direct expects.
            mask = np.ones(indices.shape, dtype=bool)
            mask[self._ids.freelist()] = False
            indices = np.ma.masked_array(indices, mask)
        else:
            indices = slice(0, self.size)
    elif isinstance(locs, (list, np.ndarray)):
        indices = np.asarray(locs, np.int64)
        indices = self.id_to_index(indices)
    elif isinstance(locs, Iterable):
        indices = self.id_to_index(locs)
    elif isinstance(locs, integer_types):
        indices = self.id_to_index(slice(locs, locs + 1, 1))
    elif isinstance(locs, slice):
        indices = self.id_to_index(locs)

    arr = np.empty((indices_len(indices), offsets[-1]), dtype=dtype)
    for i, column in enumerate(columns):
        col = self._column(column)
        shape = shapes[i]
        if len(shape) == 1:
            # Column with a 1-D shape: it fills a single output column.
            col.read_direct(arr, indices, dest_sel=np.s_[:, offsets[i]])
        else:
            col.read_direct(arr, indices,
                            dest_sel=np.s_[:, offsets[i]:offsets[i + 1]])
    return arr
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Keep the running maximum of the context's table in a ``PsDict``.

    The maximum of the newly created rows (``keepdims=False``) either
    initialises the result or is combined, key by key, with the values
    already stored.

    Returns the value of ``self._return_run_step`` with the number of rows
    consumed.
    """
    assert self.context
    with self.context as ctx:
        created = ctx.table.created.next(step_size)  # returns a slice
        steps = indices_len(created)
        source = ctx.table.data()
        batch_max = source.loc[fix_loc(created)].max(keepdims=False)
        if self.result is not None:
            for key, current in self.psdict.items():
                self.result[key] = np.maximum(batch_max[key], current)
        else:
            self.result = PsDict(batch_max)
        return self._return_run_step(self.next_state(ctx.table),
                                     steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Track, per column, the index of the smallest value seen so far.

    ``self._min`` stores the running minimum values and ``self._df`` the
    matching row indices; each step appends one row to both, stamped with
    ``run_number`` in the update column, and both are trimmed to the last
    ``self.params.history`` rows.  An update or deletion on the input slot
    restarts both frames.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when no created row is pending.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        # Drop both frames: keeping _min alone would send the next step
        # into the incremental branch with _df set to None.
        self._df = None
        self._min = None
        dfslot.update(run_number)
    indices = dfslot.next_created(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = dfslot.data()
    op = self.filter_columns(input_df, fix_loc(indices)).idxmin()
    if not op.index.equals(self._columns):
        # some columns are not numerical
        self._columns = op.index
    op[self.UPDATE_COLUMN] = run_number
    if self._min is None:
        # Scalar NaN broadcasts over the index; a one-element list does not
        # match an index with more than one label.
        min_ = pd.Series(np.nan, index=op.index)  # UPDATE_COLUMN included
        min_[self.UPDATE_COLUMN] = run_number
        for col in op.index:
            if col == self.UPDATE_COLUMN:
                continue
            # lookup value, is there a better way?
            min_[col] = input_df.loc[op[col], col]
        self._min = pd.DataFrame([min_], columns=op.index)
        self._df = pd.DataFrame([op], columns=op.index)
    else:
        prev_min = last_row(self._min)
        prev_idx = last_row(self._df)
        min_ = pd.Series(prev_min)
        min_[self.UPDATE_COLUMN] = run_number
        for col in op.index:
            if col == self.UPDATE_COLUMN:
                continue
            val = input_df.loc[op[col], col]
            improved = (not np.isnan(val)
                        and (np.isnan(min_[col]) or val < min_[col]))
            if improved:
                # The batch holds a new minimum: op[col] already names its
                # row, only the value needs recording.
                min_[col] = val
            else:
                # The previous minimum stands, so keep its row index.
                op[col] = prev_idx[col]
        op[self.UPDATE_COLUMN] = run_number
        with self.lock:
            self._df = self._df.append(op, ignore_index=True)
            self._min = self._min.append(min_, ignore_index=True)
            if len(self._df) > self.params.history:
                self._df = self._df.loc[
                    self._df.index[-self.params.history:]]
                self._min = self._min.loc[
                    self._min.index[-self.params.history:]]
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Feed the newly created rows to the mini-batch k-means estimator.

    The module resets itself when the 'table' slot reports deletions or
    updates.  New rows are converted to an array over the columns returned
    by ``self.get_columns`` and fitted batch by batch
    (``self.mbk.batch_size`` or 100 rows at a time); labels are appended to
    ``self._labels`` when it exists.  The cluster centers are written to
    the output table, which is created on first use.

    Returns the value of ``self._return_run_step``; blocked with zero steps
    when there is nothing to process.
    """
    slot = self.get_input_slot('table')
    slot.update(run_number)
    if slot.deleted.any() or slot.updated.any():
        logger.debug('has deleted or updated, reseting')
        self.reset()
        slot.update(run_number)

    table = slot.data()
    no_rows = table is None or len(table) == 0
    if no_rows and slot.created_length() < self.mbk.n_clusters:
        # Should add more than k items per loop
        return self._return_run_step(self.state_blocked, steps_run=0)

    created = slot.created.next(step_size)  # returns a slice
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    cols = self.get_columns(table)
    if len(cols) == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    locs = fix_loc(created)
    label_ids = created
    if self._labels is not None and isinstance(created, slice):
        # Labels are appended per batch, so the ids must be indexable.
        label_ids = np.arange(created.start, created.stop)

    X = table.to_array(columns=cols, locs=locs)
    batch_size = self.mbk.batch_size or 100
    for batch in gen_batches(steps, batch_size):
        self.mbk.partial_fit(X[batch])
        if self._labels is not None:
            self._labels.append({'labels': self.mbk.labels_},
                                indices=label_ids[batch])

    centers = self.mbk.cluster_centers_
    if self._table is None:
        dshape = self.dshape_from_columns(table, cols,
                                          dshape_from_dtype(X.dtype))
        self._table = Table(self.generate_table_name('centers'),
                            dshape=dshape,
                            create=True)
        self._table.resize(centers.shape[0])
    self._table[cols] = centers
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Copy created input rows to the output, then perturb the output once.

    Created rows of the ``"table"`` slot are appended to ``self.table``.
    Until ``self.done`` is set, the ids in the watched module's
    ``_sensitive_ids`` are looked up.  When ``self.proc_sensitive`` is true
    those ids are deleted (mode ``"delete"``) or get ``self.value`` written
    to column 0.  Otherwise, once the result holds more than 10 rows, the
    same is done to the non-sensitive ids among the first 10 index entries.

    Returns blocked with zero steps when nothing was created, otherwise the
    next state with the number of rows copied.
    """
    slot = self.get_input_slot("table")
    if not slot.created.any():
        return self._return_run_step(self.state_blocked, steps_run=0)

    fresh = slot.created.next(step_size)
    steps = indices_len(fresh)
    source = slot.data()
    if self.result is None:
        self.result = Table(
            self.generate_table_name("stirrer"),
            dshape=source.dshape,
        )
    self.table.append(source.loc[fix_loc(fresh), :])

    if not self.done:
        watched_module = self.scheduler()[self.watched]
        sensitive_ids = bitmap(
            getattr(watched_module, "_sensitive_ids").values())
        if sensitive_ids and self.proc_sensitive:
            # Act on the sensitive ids themselves, all at once.
            if self.mode == "delete":
                del self.table.loc[sensitive_ids]
            else:
                self.table.loc[sensitive_ids, 0] = self.value
            self.done = True
        elif sensitive_ids and len(self.result) > 10:
            # Non-sensitive case: touch only ids outside the sensitive set.
            for position in range(10):
                row_id = self.table.index[position]
                if row_id in sensitive_ids:
                    continue
                if self.mode == "delete":
                    del self.table.loc[row_id]
                else:
                    self.table.loc[row_id, 0] = self.value
                self.done = True
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Fold the next chunk of created rows into the running per-column maximum.

    On any update or deletion the slot is reset and the stored maxima are
    cleared so they get recomputed from the replayed rows.  The maximum of
    each column over the chunk is then merged into ``self.result`` with
    ``np.maximum``.

    Returns blocked with zero steps when no created rows are buffered,
    otherwise the next state with the number of rows consumed.
    """
    slot = self.get_input_slot("table")
    if slot.updated.any() or slot.deleted.any():
        slot.reset()
        if self.result is not None:
            self.psdict.clear()  # resize(0)
        slot.update(run_number)
    indices = slot.created.next(step_size)
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    data = slot.data()
    op = data.loc[fix_loc(indices)].max(keepdims=False)
    if self.result is None:
        self.result = PsDict(op)
    elif len(self.psdict) == 0:
        # The dict was emptied by a reset.  Iterating its (now absent) keys
        # would merge nothing and the maxima would never come back, so seed
        # it again from this chunk.
        # NOTE(review): assumes PsDict.update accepts the same argument as
        # the PsDict constructor -- confirm.
        self.psdict.update(op)
    else:
        for k, v in self.psdict.items():
            self.result[k] = np.maximum(op[k], v)
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Update the running min/max of ``self._column`` with created rows.

    The chunk's ``nanmin``/``nanmax`` are combined with the values read from
    the last row of ``self._table`` (ignored right after a reset) and stored
    in the row indexed by ``run_number``, which is overwritten if it already
    exists.

    Returns the next state with the number of rows consumed, reported as
    both ``steps_run`` and ``reads``; ``updates`` is the table length.
    """
    prev_min = prev_max = np.nan
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    # Bound on every path: the write below also runs after a reset.
    df = self._table
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        dfslot.update(run_number)
    else:
        prev = len(df) - 1
        # NOTE(review): ``prev > 0`` skips the previous values while the
        # table holds a single row; presumably row 0 is an initial
        # placeholder -- confirm before changing to ``>= 0``.
        if prev > 0:
            prev_min = df.iat[prev, self._min_column]
            prev_max = df.iat[prev, self._max_column]
    indices = dfslot.created.next(step_size)  # returns a slice
    input_df = dfslot.data()
    steps = indices_len(indices)
    if steps > 0:
        x = input_df.to_array(locs=fix_loc(indices), columns=[self._column])
        new_min = np.nanmin(x)
        new_max = np.nanmax(x)
        # nanmin/nanmax ignore the NaN placeholders used when there is no
        # previous value.
        row = {self._min_column: np.nanmin([prev_min, new_min]),
               self._max_column: np.nanmax([prev_max, new_max])}
        with self.lock:
            if run_number in df.index:
                df.loc[run_number] = row
            else:
                df.add(row, index=run_number)
    return self._return_run_step(self.next_state(dfslot),
                                 steps_run=steps, reads=steps,
                                 updates=len(self._table))
def run_step(self, run_number, step_size, howlong):
    """Apply ``self.op`` to the next created rows and record the result.

    The result of ``self.op`` on the filtered chunk is appended to
    ``self._table`` under index ``run_number``.  The table is created on
    first use with the input's dshape and trimmed to the last
    ``self.params.history`` rows.  Any update or deletion resets the slot
    and discards the table.

    Returns blocked with zero steps when no created rows are buffered,
    otherwise the next state with the number of rows consumed.
    """
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        self._table = None
        dfslot.update(run_number)
    indices = dfslot.created.next(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = dfslot.data()
    op = self.op(self.filter_columns(input_df, fix_loc(indices)))
    if self._table is None:
        self._table = Table(self.generate_table_name('var'),
                            dshape=input_df.dshape,
                            create=True)
    self._table.append(op, indices=[run_number])
    # The debug dump of the whole table to stdout on every step was removed.
    if len(self._table) > self.params.history:
        self._table = self._table.loc[
            self._table.index[-self.params.history:]]
    return self._return_run_step(self.next_state(dfslot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Apply ``self.op`` to the next created rows and record the result.

    The result, stamped with ``run_number`` in ``UPDATE_COLUMN``, is stored
    in ``self._df`` under index ``run_number``.  The frame is created on
    first use and trimmed to the last ``self.params.history`` rows.  Any
    update or deletion resets the slot and discards the frame.

    Returns blocked with zero steps when no created rows are buffered,
    otherwise the slot's next state with the number of rows consumed.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        self._df = None
        dfslot.update(run_number)
    indices = dfslot.next_created(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = dfslot.data()
    op = self.op(self.filter_columns(input_df, fix_loc(indices)))
    op[self.UPDATE_COLUMN] = run_number
    if self._df is None:
        self._df = pd.DataFrame([op], index=[run_number])
    else:
        self._df.loc[run_number] = op
    # The Python 2 ``print`` statement that dumped the frame here was a
    # syntax error on Python 3 and has been removed.
    if len(self._df) > self.params.history:
        self._df = self._df.loc[self._df.index[-self.params.history:]]
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Update the running least-squares fit with the next created rows.

    The cumulative sums kept in row 0 of ``self._df`` (``sum_x``,
    ``sum_x_sqr``, ``sum_y``, ``sum_xy``) are advanced with the chunk's
    ``self._x``/``self._y`` columns, both fitted values are recomputed, and
    row 0 is rewritten.

    Raises:
        ProgressiveError: when the input reports updated or deleted rows,
            which this module does not handle.

    Returns blocked when no created rows are buffered, otherwise the slot's
    next state with the number of rows read and one update.
    """
    dfslot = self.get_input_slot('df')
    input_df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        # Exceptions do not interpolate logging-style arguments; format here.
        raise ProgressiveError(
            '%s module does not manage updates or deletes'
            % self.__class__.__name__)
    indices = dfslot.next_created(step_size)
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=steps)

    # Select columns by name: tuple-unpacking a DataFrame yields its column
    # labels, not the column data.
    chunk = input_df.loc[fix_loc(indices), [self._x, self._y]]
    x = chunk[self._x]
    y = chunk[self._y]

    df = self._df
    sum_x = df.at[0, 'sum_x'] + x.sum()
    sum_x_sqr = df.at[0, 'sum_x_sqr'] + (x * x).sum()
    sum_y = df.at[0, 'sum_y'] + y.sum()
    sum_xy = df.at[0, 'sum_xy'] + (x * y).sum()

    # The sums span every row seen so far, so the sample count must too;
    # using only this chunk's length breaks the fit after the first step.
    count = getattr(self, '_total_count', 0) + len(x)
    self._total_count = count

    denom = count * sum_x_sqr - sum_x * sum_x
    # NOTE(review): in the usual closed form the first expression is the
    # intercept and the second the slope; the stored order is kept as it
    # was -- confirm it matches the column names of ``self._df``.
    coef = (sum_y * sum_x_sqr - sum_x * sum_xy) / denom
    intercept = (count * sum_xy - sum_x * sum_y) / denom
    df.loc[0] = [coef, intercept, sum_x, sum_x_sqr, sum_y, sum_xy, run_number]
    return self._return_run_step(dfslot.next_state(),
                                 steps_run=steps, reads=steps, updates=1)
def run_step(self, run_number, step_size, howlong):
    """Select the rows of the input table that fall inside a 2D range.

    Pending created/updated/deleted ids are taken from the ``'table'`` slot,
    and the requested bounds are read from the ``'lower'``/``'upper'`` slots
    for both watched keys.  A bound that is missing, NaN, outside the data's
    min/max or inconsistent with its counterpart is replaced by the data
    min/max.  The effective bounds are published through ``_set_min_out`` /
    ``_set_max_out`` and the selection of ``self._table`` is refreshed from
    ``self._impl``.

    Returns blocked with zero steps when bounds or min/max are not available
    yet, or when neither the table nor the limits changed; otherwise the
    next state with the number of ids consumed.
    """
    def drain(slot):
        """Empty ``slot``'s change buffers; True if a value was created or updated."""
        changed = False
        if slot.deleted.any():
            slot.deleted.next()
        if slot.updated.any():
            slot.updated.next()
            changed = True
        if slot.created.any():
            slot.created.next()
            changed = True
        return changed

    input_slot = self.get_input_slot('table')
    input_slot.update(run_number)
    steps = 0
    deleted = None
    if input_slot.deleted.any():
        deleted = input_slot.deleted.next(step_size)
        steps += indices_len(deleted)
    created = None
    if input_slot.created.any():
        created = input_slot.created.next(step_size)
        steps += indices_len(created)
    updated = None
    if input_slot.updated.any():
        updated = input_slot.updated.next(step_size)
        steps += indices_len(updated)
    with input_slot.lock:
        input_table = input_slot.data()
    if not self._table:
        self._table = TableSelectedView(input_table, bitmap([]))
    self._create_min_max()

    #
    # lower/upper
    #
    lower_slot = self.get_input_slot('lower')
    lower_slot.update(run_number)
    upper_slot = self.get_input_slot('upper')
    limit_changed = drain(lower_slot)
    if lower_slot is not upper_slot:
        upper_slot.update(run_number)
        # Drain first so the buffers are emptied even if the flag is set.
        limit_changed = drain(upper_slot) or limit_changed

    #
    # min/max
    #
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    min_slot.created.next()
    min_slot.updated.next()
    min_slot.deleted.next()
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)
    max_slot.created.next()
    max_slot.updated.next()
    max_slot.deleted.next()

    lower_data = lower_slot.data()
    upper_data = upper_slot.data()
    if (lower_data is None or upper_data is None
            or len(lower_data) == 0 or len(upper_data) == 0):
        return self._return_run_step(self.state_blocked, steps_run=0)
    lower_value_x = lower_data.last(self._watched_key_lower_x)
    upper_value_x = upper_data.last(self._watched_key_upper_x)
    lower_value_y = lower_data.last(self._watched_key_lower_y)
    upper_value_y = upper_data.last(self._watched_key_upper_y)

    # Test the min/max data itself for None before calling len() on it; the
    # earlier guard re-tested lower/upper here by mistake.
    min_data = min_slot.data()
    max_data = max_slot.data()
    if (min_data is None or max_data is None
            or len(min_data) == 0 or len(max_data) == 0):
        return self._return_run_step(self.state_blocked, steps_run=0)
    minv_x = min_data.last(self._watched_key_lower_x)
    maxv_x = max_data.last(self._watched_key_upper_x)
    minv_y = min_data.last(self._watched_key_lower_y)
    maxv_y = max_data.last(self._watched_key_upper_y)

    # X: fall back to the data range when the requested bound is unusable.
    if (lower_value_x is None or np.isnan(lower_value_x)
            or lower_value_x < minv_x or lower_value_x >= maxv_x):
        lower_value_x = minv_x
        limit_changed = True
    if (upper_value_x is None or np.isnan(upper_value_x)
            or upper_value_x > maxv_x or upper_value_x <= minv_x
            or upper_value_x <= lower_value_x):
        upper_value_x = maxv_x
        limit_changed = True
    # Y: same rule.
    if (lower_value_y is None or np.isnan(lower_value_y)
            or lower_value_y < minv_y or lower_value_y >= maxv_y):
        lower_value_y = minv_y
        limit_changed = True
    if (upper_value_y is None or np.isnan(upper_value_y)
            or upper_value_y > maxv_y or upper_value_y <= minv_y
            or upper_value_y <= lower_value_y):
        upper_value_y = maxv_y
        limit_changed = True
    self._set_min_out(lower_value_x, lower_value_y)
    self._set_max_out(upper_value_x, upper_value_y)

    if steps == 0 and not limit_changed:
        return self._return_run_step(self.state_blocked, steps_run=0)

    if not self._impl.is_started:
        self._impl.start(input_table,
                         lower_value_x, upper_value_x,
                         lower_value_y, upper_value_y,
                         limit_changed,
                         created=created,
                         updated=updated,
                         deleted=deleted)
    else:
        self._impl.resume(lower_value_x, upper_value_x,
                          lower_value_y, upper_value_y,
                          limit_changed,
                          created=created,
                          updated=updated,
                          deleted=deleted)
    self._table.selection = self._impl.result._values
    return self._return_run_step(self.next_state(input_slot), steps_run=steps)
def run_step_progress(self, run_number, step_size, howlong):
    """Progressively maintain the intersection of several selected views.

    Every input whose name starts with ``'table'`` must be a
    ``TableSelectedView`` over the same physical table.  Ids created in all
    of the inputs during this step are added to the output selection
    directly.  Ids created in only some inputs are added when they belong to
    the current selection of every input.  An update or deletion on any
    input resets that slot and every following one and empties the output
    selection first.

    Returns blocked with zero steps when nothing happened, otherwise the
    next state of the first input slot with the accumulated step count.
    """
    as_bitmap = bitmap.asbitmap
    removed_sets = []
    created_sets = []
    steps = 0
    views = []
    physical = None
    assert len(self.inputs) > 0
    must_reset = False
    for slot_name in self.inputs:
        if not slot_name.startswith('table'):
            continue
        slot = self.get_input_slot(slot_name)
        view = slot.data()
        assert isinstance(view, TableSelectedView)
        if physical is None:
            physical = _get_physical_table(view)
        else:
            assert physical is _get_physical_table(view)
        views.append(view)
        slot.update(run_number)
        if must_reset or slot.updated.any() or slot.deleted.any():
            # Once one input needs a reset, all later ones are reset too.
            slot.reset()
            must_reset = True
            steps += 1
        if slot.created.any():
            fresh = slot.created.next(step_size)
            created_sets.append(as_bitmap(fresh))
            steps += indices_len(fresh)

    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    # Deletions are not tracked individually (a reset is used instead), so
    # this union is built from an empty list.
    removed_sets = bitmap.union(*removed_sets)
    in_all = bitmap()
    if len(created_sets) == len(views):
        in_all = bitmap.intersection(*created_sets)
    in_some = bitmap.union(*created_sets)

    if not self._table:
        self._table = TableSelectedView(physical, bitmap([]))
    if must_reset:
        self._table.selection = bitmap([])
    self._table.selection |= in_all

    # Remaining candidates must already be selected in every input.
    in_some -= in_all
    for view in views:
        in_some &= view.selection
    self._table.selection |= in_some

    first_slot = self.get_input_slot(self.inputs[0])
    return self._return_run_step(self.next_state(first_slot),
                                 steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Accumulate a 2D histogram of the created rows within padded bounds.

    Bounds come from ``self.get_bounds(min_slot, max_slot)`` and are padded
    by ``self.get_delta``.  When the data range leaves the stored bounds, or
    shrinks by more than the deltas, the bounds are recomputed, the input
    slot is reset and the histogram restarts.  The histogram of the chunk is
    added to ``self._histo`` and a row ``[histo, 0, cmax, xmin, xmax, ymin,
    ymax, run_number]`` is stored in ``self._df`` under ``run_number``; the
    frame is trimmed to ``self.params.history`` rows.

    Returns blocked with zero steps when there is nothing new, no bounds yet
    or invalid bounds; otherwise the slot's next state with the rows read.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        logger.debug('reseting histogram')
        dfslot.reset()
        self._histo = None
        self._xedges = None
        self._yedges = None
        dfslot.update(run_number)

    if not (dfslot.has_created() or min_slot.has_created()
            or max_slot.has_created()):
        # nothing to do, just wait
        logger.info('Input buffers empty')
        return self._return_run_step(self.state_blocked, steps_run=0)

    bounds = self.get_bounds(min_slot, max_slot)
    if bounds is None:
        logger.debug('No bounds yet at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=0)
    xmin, xmax, ymin, ymax = bounds
    if self._bounds is None:
        (xdelta, ydelta) = self.get_delta(*bounds)
        self._bounds = (xmin - xdelta, xmax + xdelta,
                        ymin - ydelta, ymax + ydelta)
        logger.info('New bounds at run %d: %s', run_number, self._bounds)
    else:
        (dxmin, dxmax, dymin, dymax) = self._bounds
        (xdelta, ydelta) = self.get_delta(*bounds)
        # Either the min/max has extended, or it has shrunk beyond the deltas
        extended = (xmin < dxmin or xmax > dxmax
                    or ymin < dymin or ymax > dymax)
        shrunk = (xmin > (dxmin + xdelta) or xmax < (dxmax - xdelta)
                  or ymin > (dymin + ydelta) or ymax < (dymax - ydelta))
        if extended or shrunk:
            self._bounds = (xmin - xdelta, xmax + xdelta,
                            ymin - ydelta, ymax + ydelta)
            logger.info('Updated bounds at run %s: %s',
                        run_number, self._bounds)
            # The histogram must be recomputed from scratch on new bounds.
            dfslot.reset()
            dfslot.update(run_number)
            self._histo = None
            self._xedges = None
            self._yedges = None

    xmin, xmax, ymin, ymax = self._bounds
    if xmin >= xmax or ymin >= ymax:
        logger.error('Invalid bounds: %s', self._bounds)
        return self._return_run_step(self.state_blocked, steps_run=0)

    # Now, we know we have data and bounds, proceed to create a new histogram
    input_df = dfslot.data()
    indices = dfslot.next_created(step_size)
    steps = indices_len(indices)
    logger.info('Read %d rows', steps)
    self.total_read += steps

    filtered_df = input_df.loc[fix_loc(indices)]
    x = filtered_df[self.x_column]
    y = filtered_df[self.y_column]
    p = self.params
    if self._xedges is not None:
        # Reuse the edges of the previous chunk so the counts line up.
        bins = [self._xedges, self._yedges]
    else:
        bins = [p.ybins, p.xbins]
    histo = None
    if len(x) > 0:
        # ``normed`` is omitted: False was its default and the keyword no
        # longer exists in recent NumPy releases.
        histo, xedges, yedges = np.histogram2d(
            y, x, bins=bins, range=[[ymin, ymax], [xmin, xmax]])
        self._xedges = xedges
        self._yedges = yedges

    if self._histo is None:
        self._histo = histo
    elif histo is not None:
        self._histo += histo

    cmax = 0
    if self._histo is not None:
        cmax = self._histo.max()
    logger.debug('cmax=%d', cmax)
    values = [self._histo, 0, cmax, xmin, xmax, ymin, ymax, run_number]
    with self.lock:
        self._df.loc[run_number] = values
        if len(self._df) > p.history:
            self._df = self._df.loc[self._df.index[-p.history:]]
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Extend the pairwise-distance matrix with the next created rows.

    The new rows ``j`` come from the optional ``'array'`` slot when it
    carries data, otherwise from ``self.columns`` of the input table.
    Distances among the new rows, and between previously seen rows and the
    new ones, are computed with ``pairwise_distances`` and written into the
    buffer returned by ``self._buf.resize``.  ``self._last_index`` records
    the rows covered so far.

    Returns blocked with zero steps when no created rows are buffered,
    otherwise the next state with the number of rows consumed.
    """
    dfslot = self.get_input_slot('table')
    df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        logger.info(
            'Reseting history because of changes in the input table')
        dfslot.update(run_number)
        # TODO: be smarter with changed values

    m = step_size
    indices = dfslot.created.next(m)
    m = indices_len(indices)
    if m == 0:
        # Nothing new: computing distances over an empty selection would
        # fail, so wait for more input.
        return self._return_run_step(self.state_blocked, steps_run=0)

    i = None
    j = None
    Si = self._table['document']

    arrayslot = self.get_input_slot('array')
    if arrayslot is not None and arrayslot.data() is not None:
        array = arrayslot.data()
        logger.debug('Using array instead of DataFrame columns')
        if Si is not None:
            i = array[self._last_index]
        j = array[indices]
    if j is None:
        if self.columns is None:
            # Default to every column except the update-stamp column.
            self.columns = df.columns.delete(
                np.where(df.columns == UPDATE_COLUMN))
        elif not isinstance(self.columns, pd.Index):
            self.columns = pd.Index(self.columns)
        rows = df[self.columns]
        if Si is not None:
            i = rows.loc[self._last_index]
            assert len(i) == len(self._last_index)
        j = rows.loc[fix_loc(indices)]
        assert len(j) == indices_len(indices)

    Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
    if Si is None:
        mat = self._buf.resize(Sj.shape[0])
        mat[:, :] = Sj
        self._last_index = dfslot.last_index[indices]
    else:
        Sij = pairwise_distances(i, j,
                                 metric=self._metric, n_jobs=self._n_jobs)
        n0 = i.shape[0]
        n1 = n0 + j.shape[0]
        mat = self._buf.resize(n1)
        # Fill the two off-diagonal blocks and the new diagonal block.
        mat[0:n0, n0:n1] = Sij
        mat[n0:n1, 0:n0] = Sij.T
        mat[n0:n1, n0:n1] = Sj
        self._last_index = self._last_index.append(df.index[indices])
    return self._return_run_step(self.next_state(dfslot), steps_run=m)
def run_step(self, run_number, step_size, howlong):
    """Render the latest histogram array as a PNG and record where it went.

    The last ``'array'`` value of the input is cube-root scaled, turned into
    an image and flipped top to bottom.  When ``self.params.filename`` is
    set, the image is saved under that name (a ``%d``-style placeholder is
    filled with ``run_number``); otherwise it is encoded as a base64
    ``data:`` URI.  The resulting name/URI is added to ``self._table``
    unless the last row already belongs to this run.

    Always returns the blocked state with ``steps_run=1``.
    """
    dfslot = self.get_input_slot('array')
    input_df = dfslot.data()
    dfslot.update(run_number)
    indices = dfslot.created.next()
    steps = indices_len(indices)
    if steps == 0:
        indices = dfslot.updated.next()
        steps = indices_len(indices)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=1)

    with dfslot.lock:
        histo = input_df.last()['array']
    if histo is None:
        return self._return_run_step(self.state_blocked, steps_run=1)

    params = self.params
    cmax = params.cmax
    if np.isnan(cmax):
        cmax = None
    cmin = params.cmin
    if np.isnan(cmin):
        cmin = None
    high = params.high
    low = params.low

    try:
        image = sp.misc.toimage(sp.special.cbrt(histo),
                                cmin=cmin, cmax=cmax,
                                high=high, low=low, mode='I')
        image = image.transpose(Image.FLIP_TOP_BOTTOM)
    except Exception:
        # Without an image there is nothing to save or encode; carrying on
        # would only fail later on ``None.save``.
        logger.exception('Cannot render image at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=1)

    filename = params.filename
    if filename is not None:
        try:
            if re.search(r'%(0[\d])?d', filename):
                filename = filename % (run_number)
            filename = self.storage.fullname(self, filename)
            # TODO should do it atomically since it will be called 4 times
            # with the same fn
            image.save(filename, format='PNG')
            logger.debug('Saved image %s', filename)
            image = None
        except Exception:
            logger.error('Cannot save image %s', filename)
            raise
    else:
        buffered = six.BytesIO()
        image.save(buffered, format='PNG', bits=16)
        encoded = base64.b64encode(buffered.getvalue())
        if six.PY3:
            # b64encode returns bytes on Python 3; the URI needs text.
            encoded = str(encoded, "ascii")
        filename = "data:image/png;base64," + encoded

    if len(self._table) == 0 or self._table.last()['time'] != run_number:
        values = {'filename': filename, 'time': run_number}
        with self.lock:
            self._table.add(values)
    return self._return_run_step(self.state_blocked,
                                 steps_run=1, reads=1, updates=1)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Select the rows of the input table whose value lies in a 1D range.

    The requested bounds are read from the ``"lower"``/``"upper"`` slots.
    ``"*"`` means "use the data min/max"; a bound that is missing, NaN,
    outside the data range or inconsistent with its counterpart is replaced
    by the data min/max and flags a limit change.  The effective bounds are
    published through ``_set_min_out``/``_set_max_out``, pending table
    changes are handed to ``self._impl`` together with the histogram index
    module, and the selection is refreshed from the implementation result.

    Returns blocked with zero steps when bounds or min/max are unavailable,
    or when neither the table nor the limits changed; otherwise the next
    state with the number of ids consumed.
    """
    def drain(slot) -> bool:
        """Empty ``slot``'s change buffers; True if a value was created or updated."""
        changed = False
        if slot.deleted.any():
            slot.deleted.next()
        if slot.updated.any():
            slot.updated.next()
            changed = True
        if slot.created.any():
            slot.created.next()
            changed = True
        return changed

    input_slot = self.get_input_slot("table")
    self._create_min_max()

    # lower/upper
    lower_slot = self.get_input_slot("lower")
    upper_slot = self.get_input_slot("upper")
    limit_changed = drain(lower_slot)
    if lower_slot is not upper_slot:
        # Drain first so the buffers are emptied even if the flag is set.
        limit_changed = drain(upper_slot) or limit_changed

    # min/max
    min_slot = self.get_input_slot("min")
    min_slot.clear_buffers()
    max_slot = self.get_input_slot("max")
    max_slot.clear_buffers()

    lower_data = lower_slot.data()
    upper_data = upper_slot.data()
    if (lower_data is None or upper_data is None
            or len(lower_data) == 0 or len(upper_data) == 0):
        return self._return_run_step(self.state_blocked, steps_run=0)
    lower_value = lower_data.get(self.watched_key_lower)
    upper_value = upper_data.get(self.watched_key_upper)

    min_data = min_slot.data()
    max_data = max_slot.data()
    if (lower_data is None or upper_data is None
            or min_data is None or max_data is None
            or len(min_data) == 0 or len(max_data) == 0):
        return self._return_run_step(self.state_blocked, steps_run=0)
    minv = min_data.get(self.watched_key_lower)
    maxv = max_data.get(self.watched_key_upper)

    if lower_value == "*":
        lower_value = minv
    elif (lower_value is None or np.isnan(lower_value)
            or lower_value < minv or lower_value >= maxv):
        lower_value = minv
        limit_changed = True
    if upper_value == "*":
        upper_value = maxv
    elif (upper_value is None or np.isnan(upper_value)
            or upper_value > maxv or upper_value <= minv
            or upper_value <= lower_value):
        upper_value = maxv
        limit_changed = True
    self._set_min_out(lower_value)
    self._set_max_out(upper_value)

    if not input_slot.has_buffered() and not limit_changed:
        return self._return_run_step(self.state_blocked, steps_run=0)

    # Collect the pending changes of the table, in deleted/created/updated
    # order.
    steps = 0
    deleted: Optional[bitmap] = None
    if input_slot.deleted.any():
        deleted = input_slot.deleted.next(length=step_size, as_slice=False)
        steps += indices_len(deleted)
    created: Optional[bitmap] = None
    if input_slot.created.any():
        created = input_slot.created.next(length=step_size, as_slice=False)
        steps += indices_len(created)
    updated: Optional[bitmap] = None
    if input_slot.updated.any():
        updated = input_slot.updated.next(length=step_size, as_slice=False)
        steps += indices_len(updated)

    input_table = input_slot.data()
    if self.result is None:
        self.result = TableSelectedView(input_table, bitmap([]))
    assert self._impl
    hist_slot = self.get_input_slot("hist")
    hist_slot.clear_buffers()

    changes = dict(created=created, updated=updated, deleted=deleted)
    if self._impl.is_started:
        self._impl.resume(
            cast(HistogramIndex, hist_slot.output_module),
            lower_value,
            upper_value,
            limit_changed,
            **changes,
        )
    else:
        self._impl.start(
            input_table,
            cast(HistogramIndex, hist_slot.output_module),
            lower_value,
            upper_value,
            limit_changed,
            **changes,
        )
    assert self._impl.result
    self.selected.selection = self._impl.result._values
    return self._return_run_step(self.next_state(input_slot), steps)
def run_step(self, run_number, step_size, howlong):
    """Maintain a 2D histogram of the input table within padded bounds.

    Bounds come from ``self.get_bounds(min_slot, max_slot)`` and are padded
    by ``self.get_delta``; when the data range leaves the stored bounds, or
    shrinks by more than the deltas, the module is reset.  The histogram of
    deleted rows is subtracted from ``self._histo`` and the histogram of
    created rows is added.  The flipped array and its bounds are written to
    ``self._table`` (when ``self._with_output`` is set) and passed to
    ``self.build_heatmap``.

    Returns blocked with zero steps when there is nothing new, no bounds yet
    or invalid bounds; otherwise the next state with the rows processed.
    """
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)
    if dfslot.updated.any():
        logger.debug('reseting histogram')
        self.reset()
        dfslot.update(run_number)

    if not (dfslot.created.any() or min_slot.created.any()
            or max_slot.created.any()):
        logger.info('Input buffers empty')
        return self._return_run_step(self.state_blocked, steps_run=0)

    bounds = self.get_bounds(min_slot, max_slot)
    if bounds is None:
        logger.debug('No bounds yet at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=0)
    xmin, xmax, ymin, ymax = bounds
    if self._bounds is None:
        (xdelta, ydelta) = self.get_delta(*bounds)
        self._bounds = (xmin - xdelta, xmax + xdelta,
                        ymin - ydelta, ymax + ydelta)
        logger.info("New bounds at run %d: %s", run_number, self._bounds)
    else:
        (dxmin, dxmax, dymin, dymax) = self._bounds
        (xdelta, ydelta) = self.get_delta(*bounds)
        assert xdelta >= 0 and ydelta >= 0
        # Either the min/max has extended, or it has shrunk beyond the deltas
        extended = (xmin < dxmin or xmax > dxmax
                    or ymin < dymin or ymax > dymax)
        shrunk = (xmin > (dxmin + xdelta) or xmax < (dxmax - xdelta)
                  or ymin > (dymin + ydelta) or ymax < (dymax - ydelta))
        if extended or shrunk:
            self._bounds = (xmin - xdelta, xmax + xdelta,
                            ymin - ydelta, ymax + ydelta)
            logger.info('Updated bounds at run %s: %s',
                        run_number, self._bounds)
            self.reset()
            dfslot.update(run_number)

    xmin, xmax, ymin, ymax = self._bounds
    if xmin >= xmax or ymin >= ymax:
        logger.error('Invalid bounds: %s', self._bounds)
        return self._return_run_step(self.state_blocked, steps_run=0)

    # Now, we know we have data and bounds, proceed to create a new histogram
    # or to update the previous one if it still exists (i.e. no reset)
    p = self.params
    steps = 0

    # if there are new deletions, build the histogram of the deleted pairs
    # then subtract it from the main histogram
    if dfslot.deleted.any() and self._histo is not None:
        input_df = get_physical_base(dfslot.data())
        indices = dfslot.deleted.next(step_size)
        steps += indices_len(indices)
        logger.info('Read %d rows', steps)
        x = input_df[self.x_column]
        y = input_df[self.y_column]
        idx = input_df.id_to_index(fix_loc(indices))
        x = x[idx]
        y = y[idx]
        bins = [p.ybins, p.xbins]
        if len(x) > 0:
            histo = histogram2d(y, x, bins=bins,
                                range=[[ymin, ymax], [xmin, xmax]])
            self._histo -= histo

    # if there are new creations, build a partial histogram with them then
    # add it to the main histogram
    input_df = dfslot.data()
    indices = dfslot.created.next(step_size)
    steps += indices_len(indices)
    logger.info('Read %d rows', steps)
    self.total_read += steps
    x = input_df[self.x_column]
    y = input_df[self.y_column]
    idx = input_df.id_to_index(fix_loc(indices))
    x = x[idx]
    y = y[idx]
    if self._xedges is not None:
        bins = [self._xedges, self._yedges]
    else:
        bins = [p.ybins, p.xbins]
    histo = None
    if len(x) > 0:
        histo = histogram2d(y, x, bins=bins,
                            range=[[ymin, ymax], [xmin, xmax]])

    if self._histo is None:
        self._histo = histo
    elif histo is not None:
        self._histo += histo

    if self._histo is None:
        # No histogram exists yet (no rows were binned), so there is nothing
        # to flip or publish; ``np.flip(None, axis=0)`` would raise.
        return self._return_run_step(self.next_state(dfslot),
                                     steps_run=steps)

    cmax = self._histo.max()
    values = {
        'array': np.flip(self._histo, axis=0),
        'cmin': 0,
        'cmax': cmax,
        'xmin': xmin,
        'xmax': xmax,
        'ymin': ymin,
        'ymax': ymax,
        'time': run_number,
    }
    if self._with_output:
        with self.lock:
            table = self._table
            table['array'].set_shape([p.ybins, p.xbins])
            row_count = len(table)
            last = table.last()
            # One row per run: overwrite the row if it is from this run.
            if row_count == 0 or last['time'] != run_number:
                table.add(values)
            else:
                table.iloc[last.row] = values
    self.build_heatmap(values)
    return self._return_run_step(self.next_state(dfslot), steps_run=steps)
def run_step(
    self, run_number: int, step_size: int, howlong: float
) -> ReturnRunStep:
    """Refresh the selection of input rows according to the current limit.

    Pending deleted/created/updated ids are taken from the ``"table"`` slot.
    The limit is the last value of the ``"limit"`` slot, looked up under
    ``self.params.limit_key`` when that is set and at position 0 otherwise.
    The ids and the limit are handed to ``self._impl`` (``start`` on first
    use, ``resume`` afterwards) and the selection is copied from its result.

    Returns blocked with zero steps when the input table is missing, when no
    table change is pending or when the limit slot is empty; otherwise the
    next state with the step count.
    """
    self._run_once = True
    table_slot = self.get_input_slot("table")

    steps = 0
    deleted = created = updated = None
    if table_slot.deleted.any():
        deleted = table_slot.deleted.next(as_slice=False)
        steps += 1  # a batch of deletions counts as a single step
    if table_slot.created.any():
        created = table_slot.created.next(length=step_size, as_slice=False)
        steps += indices_len(created)
    if table_slot.updated.any():
        updated = table_slot.updated.next(length=step_size, as_slice=False)
        steps += indices_len(updated)

    input_table = table_slot.data()
    if input_table is None:
        return self._return_run_step(self.state_blocked, steps_run=0)
    if self.result is None:
        self.result = TableSelectedView(input_table, bitmap([]))
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    settings = self.params
    limit_slot = self.get_input_slot("limit")
    # Empty the limit buffers; only updates and creations count as a change.
    if limit_slot.deleted.any():
        limit_slot.deleted.next()
    limit_changed = False
    if limit_slot.updated.any():
        limit_slot.updated.next()
        limit_changed = True
    if limit_slot.created.any():
        limit_slot.created.next()
        limit_changed = True
    if len(limit_slot.data()) == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    if settings.limit_key:
        limit_value = limit_slot.data().last(settings.limit_key)
    else:
        limit_value = limit_slot.data().last()[0]

    changes = dict(created=created, updated=updated, deleted=deleted)
    if self._impl.is_started:
        self._impl.resume(limit_value, limit_changed, **changes)
    else:
        self._impl.start(input_table, limit_value, limit_changed, **changes)
    self.selected.selection = self._impl.result._values
    return self._return_run_step(self.next_state(table_slot), steps)
def run_step(self, run_number, step_size, howlong):
    """Accumulate a 1D histogram of ``self.column`` within padded bounds.

    Bounds come from ``self.get_bounds(min_slot, max_slot)`` and are padded
    by ``self.get_delta``.  When the data range leaves the stored bounds, or
    shrinks by more than the delta, the bounds are recomputed, the input
    slot is reset and the histogram restarts.  The chunk's histogram is
    added to ``self._histo`` and ``[histo, min, max, run_number]`` is stored
    in ``self._df``, of which only the last row is kept.

    Returns blocked with zero steps when there is nothing new, no bounds yet
    or invalid bounds; otherwise the slot's next state with the rows read.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        logger.debug('resetting histogram')
        dfslot.reset()
        self._histo = None
        self._edges = None
        dfslot.update(run_number)

    if not (dfslot.has_created() or min_slot.has_created()
            or max_slot.has_created()):
        logger.info('input buffers empty')
        return self._return_run_step(self.state_blocked, steps_run=0)

    bounds = self.get_bounds(min_slot, max_slot)
    if bounds is None:
        logger.debug('No bounds yet at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=0)
    bound_min, bound_max = bounds
    if self._bounds is None:
        delta = self.get_delta(*bounds)
        self._bounds = (bound_min - delta, bound_max + delta)
        logger.info("New bounds at run %d: %s", run_number, self._bounds)
    else:
        (old_min, old_max) = self._bounds
        delta = self.get_delta(*bounds)
        if (bound_min < old_min or bound_max > old_max) \
                or bound_min > (old_min + delta) \
                or bound_max < (old_max - delta):
            self._bounds = (bound_min - delta, bound_max + delta)
            logger.info('Updated bounds at run %d: %s',
                        run_number, self._bounds)
            dfslot.reset()
            dfslot.update(run_number)
            self._histo = None
            # Old edges describe the old bounds; explicit edges take
            # precedence over ``range`` in np.histogram, so they must go.
            self._edges = None

    (curr_min, curr_max) = self._bounds
    if curr_min >= curr_max:
        logger.error('Invalid bounds: %s', self._bounds)
        return self._return_run_step(self.state_blocked, steps_run=0)

    input_df = dfslot.data()
    indices = dfslot.next_created(step_size)  # returns a slice or ... ?
    steps = indices_len(indices)
    logger.info('Read %d rows', steps)
    self.total_read += steps

    filtered_df = input_df.loc[fix_loc(indices)]
    column = filtered_df[self.column]
    bins = self._edges if self._edges is not None else self.params.bins
    histo = None
    if len(column) > 0:
        # ``normed`` is omitted: False was its default and the keyword no
        # longer exists in recent NumPy releases.
        histo, self._edges = np.histogram(column, bins=bins,
                                          range=[curr_min, curr_max])

    if self._histo is None:
        self._histo = histo
    elif histo is not None:
        self._histo += histo

    values = [self._histo, curr_min, curr_max, run_number]
    with self.lock:
        self._df.loc[run_number] = values
        self._df = self._df.loc[self._df.index[-1:]]
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Accumulate a 1D histogram of ``self.column`` within padded bounds.

    Bounds come from ``self.get_bounds(min_slot, max_slot)`` and are padded
    by ``self.get_delta``.  When the data range leaves the stored bounds, or
    shrinks by more than the delta, the bounds are recomputed, the input
    slot is reset and the histogram and its edges restart.  The chunk's
    histogram is added to ``self._histo`` and a row with the array, the
    bounds and ``run_number`` is appended to ``self._table``.

    Returns blocked with zero steps when there is nothing new, no bounds yet
    or invalid bounds; otherwise the next state with the rows read.
    """
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)
    if dfslot.updated.any() or dfslot.deleted.any():
        logger.debug('reseting histogram')
        dfslot.reset()
        self._histo = None
        self._edges = None
        dfslot.update(run_number)

    if not (dfslot.created.any() or min_slot.created.any()
            or max_slot.created.any()):
        logger.info('Input buffers empty')
        return self._return_run_step(self.state_blocked, steps_run=0)

    bounds = self.get_bounds(min_slot, max_slot)
    if bounds is None:
        logger.debug('No bounds yet at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=0)
    bound_min, bound_max = bounds
    if self._bounds is None:
        delta = self.get_delta(*bounds)
        self._bounds = (bound_min - delta, bound_max + delta)
        logger.info("New bounds at run %d: %s", run_number, self._bounds)
    else:
        (old_min, old_max) = self._bounds
        delta = self.get_delta(*bounds)
        if (bound_min < old_min or bound_max > old_max) \
                or bound_min > (old_min + delta) \
                or bound_max < (old_max - delta):
            self._bounds = (bound_min - delta, bound_max + delta)
            logger.info('Updated bounds at run %d: %s',
                        run_number, self._bounds)
            dfslot.reset()
            dfslot.update(run_number)
            self._histo = None
            self._edges = None

    (curr_min, curr_max) = self._bounds
    if curr_min >= curr_max:
        logger.error('Invalid bounds: %s', self._bounds)
        return self._return_run_step(self.state_blocked, steps_run=0)

    input_df = dfslot.data()
    indices = dfslot.created.next(step_size)  # returns a slice or ... ?
    steps = indices_len(indices)
    logger.info('Read %d rows', steps)
    self.total_read += steps

    column = input_df[self.column]
    column = column.loc[fix_loc(indices)]
    bins = self._edges if self._edges is not None else self.params.bins
    histo = None
    if len(column) > 0:
        # Only ``density`` is passed: the ``normed`` keyword no longer
        # exists in recent NumPy releases and raises TypeError there.
        histo, self._edges = np.histogram(column, bins=bins,
                                          range=[curr_min, curr_max],
                                          density=False)

    if self._histo is None:
        self._histo = histo
    elif histo is not None:
        self._histo += histo

    values = {
        'array': [self._histo],
        'min': [curr_min],
        'max': [curr_max],
        'time': [run_number],
    }
    with self.lock:
        self._table['array'].set_shape((self.params.bins, ))
        self._table.append(values)
    return self._return_run_step(self.next_state(dfslot), steps_run=steps)