def test_buffered_matrix(self):
    """Check that a BufferedMatrix starts empty and keeps its contents when grown.

    The buffer is resized in steps of 10 (10, 20, ..., 90). After every
    resize the test verifies the reported length, that the returned matrix
    is a view on the buffer's backing array, and that the values written
    before the resize are still found in the top-left corner.
    """
    buf = BufferedMatrix()
    previous = buf.matrix()
    self.assertEqual(len(buf), 0)
    self.assertEqual(buf.allocated_size(), 0)
    for size in range(10, 100, 10):
        mat = buf.resize(size)
        self.assertEqual(len(buf), size)
        self.assertIs(mat.base, buf._base)
        if previous is not None:
            # The old contents must survive in the top-left block.
            rows = previous.shape[0]
            cols = previous.shape[1]
            self.assertTrue((previous == mat[0:rows, 0:cols]).all())
        # Overwrite with fresh random values to compare after the next resize.
        mat[:, :] = np.random.rand(size, size)
        previous = mat
def __init__(self, metric="euclidean", columns=None, n_jobs=1, **kwds):
    """Set up the slots, parameters and distance buffer of the module.

    Args:
        metric: a name listed in ``_VALID_METRICS``, a callable, or the
            string ``"precomputed"``.
        columns: stored as ``self.columns``.
        n_jobs: stored as ``self._n_jobs``.
        **kwds: extended with the input/output slot descriptors through
            ``_add_slots`` and then forwarded to the base constructor.

    Raises:
        ProgressiveError: if ``metric`` is none of the accepted values.
    """
    # Validate before touching any state so that a bad metric does not leave
    # a partially initialised object behind.
    if metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed":
        # Interpolate explicitly: passing ``metric`` as a second positional
        # argument left the "%s" placeholder unformatted in the message.
        raise ProgressiveError("Unknown distance %s" % (metric,))
    self._add_slots(
        kwds,
        "input_descriptors",
        [SlotDescriptor("df", type=pd.DataFrame), SlotDescriptor("array", required=False)],
    )
    self._add_slots(
        kwds,
        "output_descriptors",
        [SlotDescriptor("dist", type=np.ndarray, required=False)],
    )
    super(PairwiseDistances, self).__init__(dataframe_slot="distance", **kwds)
    # NOTE(review): the key is spelled "step_Size"; confirm whether
    # "step_size" was intended before changing it.
    self.default_step_size = kwds.get("step_Size", 100)  # initial guess
    self.columns = columns
    self._metric = metric
    self._n_jobs = n_jobs
    self._last_index = None
    self._buf = BufferedMatrix()
class PairwiseDistances(Module):
    """Module that grows a square pairwise-distance matrix step by step.

    Each call to ``run_step`` takes the rows newly created on the ``df``
    input slot, computes their distances to each other and to the rows
    already processed, and writes the results into a ``BufferedMatrix``.
    The current matrix is available through ``dist()`` and through
    ``get_data("dist")``.
    """

    def __init__(self, metric="euclidean", columns=None, n_jobs=1, **kwds):
        """Set up the slots, parameters and distance buffer of the module.

        Args:
            metric: a name listed in ``_VALID_METRICS``, a callable, or the
                string ``"precomputed"``; passed to ``pairwise_distances``.
            columns: columns of the input DataFrame to use. When ``None``,
                ``run_step`` uses every column except
                ``Module.UPDATE_COLUMN``.
            n_jobs: passed to ``pairwise_distances`` as ``n_jobs``.
            **kwds: extended with the input/output slot descriptors through
                ``_add_slots`` and then forwarded to the base constructor.

        Raises:
            ProgressiveError: if ``metric`` is none of the accepted values.
        """
        # Validate before touching any state so that a bad metric does not
        # leave a partially initialised module behind.
        if metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed":
            # Interpolate explicitly: passing ``metric`` as a second
            # positional argument left the "%s" placeholder unformatted.
            raise ProgressiveError("Unknown distance %s" % (metric,))
        self._add_slots(
            kwds,
            "input_descriptors",
            [SlotDescriptor("df", type=pd.DataFrame), SlotDescriptor("array", required=False)],
        )
        self._add_slots(
            kwds,
            "output_descriptors",
            [SlotDescriptor("dist", type=np.ndarray, required=False)],
        )
        super(PairwiseDistances, self).__init__(dataframe_slot="distance", **kwds)
        # NOTE(review): the key is spelled "step_Size"; confirm whether
        # "step_size" was intended before changing it.
        self.default_step_size = kwds.get("step_Size", 100)  # initial guess
        self.columns = columns
        self._metric = metric
        self._n_jobs = n_jobs
        self._last_index = None
        self._buf = BufferedMatrix()

    def is_ready(self):
        """Return True when the ``df`` slot reports created rows.

        Otherwise the decision is delegated to the base class.
        """
        if self.get_input_slot("df").has_created():
            return True
        return super(PairwiseDistances, self).is_ready()

    def dist(self):
        """Return the matrix currently held by the internal buffer."""
        return self._buf.matrix()

    def get_data(self, name):
        """Return the distance matrix for ``"dist"``, else defer to the base class."""
        if name == "dist":
            return self.dist()
        return super(PairwiseDistances, self).get_data(name)

    @synchronized
    def run_step(self, run_number, step_size, howlong):
        """Extend the distance matrix with up to ``step_size`` new rows.

        Args:
            run_number: passed to the ``df`` slot's ``update``.
            step_size: maximum number of newly created rows requested from
                the ``df`` slot.
            howlong: not used by this implementation.

        Returns:
            The value of ``self._return_run_step`` called with the slot's
            next state and the number of rows actually processed.
        """
        dfslot = self.get_input_slot("df")
        df = dfslot.data()
        dfslot.update(run_number)
        if dfslot.has_updated() or dfslot.has_deleted():
            dfslot.reset()
            logger.info("Reseting history because of changes in the input df")
            dfslot.update(run_number, df)
            # TODO: be smarter with changed values
            # NOTE(review): the slot history is reset here but self._buf and
            # self._last_index are left untouched; confirm whether the
            # previously computed distances should be discarded as well.
        indices = dfslot.next_created(step_size)
        m = indices_len(indices)

        i = None  # data of the rows already in the matrix
        j = None  # data of the rows added by this step
        Si = self._buf.matrix()  # None means nothing has been computed yet

        arrayslot = self.get_input_slot("array")
        if arrayslot is not None and arrayslot.data() is not None:
            array = arrayslot.data()
            logger.debug("Using array instead of DataFrame columns")
            if Si is not None:
                i = array[self._last_index]
            j = array[indices]

        if j is None:
            # No array input: take the values from the DataFrame columns.
            if self.columns is None:
                self.columns = df.columns.delete(np.where(df.columns == Module.UPDATE_COLUMN))
            elif not isinstance(self.columns, pd.Index):
                self.columns = pd.Index(self.columns)
            rows = df[self.columns]
            if Si is not None:
                i = rows.loc[self._last_index]
                assert len(i) == len(self._last_index)
            j = rows.loc[fix_loc(indices)]
            assert len(j) == indices_len(indices)

        # Distances among the new rows.
        Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
        if Si is None:
            # First batch: the matrix is just the new block.
            mat = self._buf.resize(Sj.shape[0])
            mat[:, :] = Sj
            self._last_index = dfslot.last_index[indices]
        else:
            # Distances between the old rows and the new rows.
            Sij = pairwise_distances(i, j, metric=self._metric, n_jobs=self._n_jobs)
            n0 = i.shape[0]
            n1 = n0 + j.shape[0]
            mat = self._buf.resize(n1)
            # Fill the two off-diagonal blocks and the new diagonal block;
            # the [0:n0, 0:n0] block is not written in this step.
            mat[0:n0, n0:n1] = Sij
            mat[n0:n1, 0:n0] = Sij.T
            mat[n0:n1, n0:n1] = Sj
            self._last_index = self._last_index.append(df.index[indices])
        return self._return_run_step(dfslot.next_state(), steps_run=m)