def test_oversized_initialization(self):
    buffer = RingBuffer([1, 2, 3, 4, 5, 6], shape=(5, ), dtype=int)
    npt.assert_equal(buffer.max_shape, (5, ))
    npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6]))
    self.assertEqual(buffer[0], 2)

    self.assertTrue(buffer.push([7]))
    npt.assert_equal(buffer.view, np.array([3, 4, 5, 6, 7]))
    self.assertEqual(buffer[0], 3)

    self.assertTrue(buffer.push([8, 9, 10]))
    npt.assert_equal(buffer.view, np.array([6, 7, 8, 9, 10]))
    self.assertEqual(buffer[0], 6)

def test_partial_initialization(self):
    buffer = RingBuffer([1, 2], shape=(5, ), dtype=int)
    npt.assert_equal(buffer.max_shape, (5, ))
    npt.assert_equal(buffer.view, np.array([1, 2]))
    self.assertEqual(buffer[0], 1)

    self.assertFalse(buffer.push([3]))
    npt.assert_equal(buffer.view, np.array([1, 2, 3]))
    self.assertEqual(buffer[0], 1)

    self.assertTrue(buffer.push([4, 5, 6]))
    npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6]))
    self.assertEqual(buffer[0], 2)

def test_empty_initialization(self):
    buffer = RingBuffer(None, shape=(5, ), dtype=int)
    npt.assert_equal(buffer.max_shape, (5, ))
    npt.assert_equal(buffer.view, np.array([]))

    self.assertEqual(buffer.push([1]), 0)
    npt.assert_equal(buffer.view, np.array([1]))
    self.assertEqual(buffer[0], 1)

    self.assertEqual(buffer.push([2, 3]), 0)
    npt.assert_equal(buffer.view, np.array([1, 2, 3]))
    self.assertEqual(buffer[0], 1)

    self.assertEqual(buffer.push([4, 5, 6]), 1)
    npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6]))
    self.assertEqual(buffer[0], 2)

def test_one_dimensional(self):
    buffer = RingBuffer([0, 1, 2, 3, 4])
    npt.assert_equal(buffer.view, np.array([0, 1, 2, 3, 4]))
    npt.assert_equal(buffer.max_shape, (5, ))

    self.assertFalse(buffer.push([]))
    npt.assert_equal(buffer.view, np.array([0, 1, 2, 3, 4]))
    self.assertEqual(buffer[0], 0)

    self.assertTrue(buffer.push(5))
    npt.assert_equal(buffer.view, np.array([1, 2, 3, 4, 5]))
    self.assertEqual(buffer[0], 1)

    self.assertTrue(buffer.push([6]))
    self.assertTrue(buffer.push([7]))
    npt.assert_equal(buffer.view, np.array([3, 4, 5, 6, 7]))
    self.assertEqual(buffer[0], 3)

    self.assertTrue(buffer.push([8, 9, 10]))
    npt.assert_equal(buffer.view, np.array([6, 7, 8, 9, 10]))
    self.assertEqual(buffer[0], 6)

    self.assertTrue(buffer.push([11, 12, 13, 14]))
    npt.assert_equal(buffer.view, np.array([10, 11, 12, 13, 14]))
    self.assertEqual(buffer[0], 10)

    self.assertTrue(buffer.push([15, 16, 17, 18, 19]))
    npt.assert_equal(buffer.view, np.array([15, 16, 17, 18, 19]))
    self.assertEqual(buffer[0], 15)

    self.assertTrue(buffer.push([20, 21, 22, 23, 24, 25]))
    npt.assert_equal(buffer.view, np.array([21, 22, 23, 24, 25]))
    self.assertEqual(buffer[0], 21)

def test_multi_dimensional(self):
    buffer = RingBuffer([[0, 1, 2, 3, 4], [0, -1, -2, -3, -4]])
    npt.assert_equal(buffer.view, np.array([[0, 1, 2, 3, 4], [0, -1, -2, -3, -4]]))
    npt.assert_equal(buffer.max_shape, (2, 5))

    self.assertEqual(buffer.push([[], []]), 0)
    npt.assert_equal(buffer.view, np.array([[0, 1, 2, 3, 4], [0, -1, -2, -3, -4]]))
    npt.assert_equal(buffer[:, 0], [0, 0])

    self.assertEqual(buffer.push([[5], [-5]]), 1)
    npt.assert_equal(buffer.view, np.array([[1, 2, 3, 4, 5], [-1, -2, -3, -4, -5]]))
    npt.assert_equal(buffer[:, 0], [1, -1])

    self.assertEqual(buffer.push([[6, 7], [-6, -7]]), 2)
    npt.assert_equal(buffer.view, np.array([[3, 4, 5, 6, 7], [-3, -4, -5, -6, -7]]))
    npt.assert_equal(buffer[:, 0], [3, -3])

    self.assertEqual(buffer.push([[8, 9, 10], [-8, -9, -10]]), 3)
    npt.assert_equal(buffer.view, np.array([[6, 7, 8, 9, 10], [-6, -7, -8, -9, -10]]))
    npt.assert_equal(buffer[:, 0], [6, -6])

    self.assertEqual(buffer.push([[11, 12, 13, 14], [-11, -12, -13, -14]]), 4)
    npt.assert_equal(buffer.view, np.array([[10, 11, 12, 13, 14], [-10, -11, -12, -13, -14]]))
    npt.assert_equal(buffer[:, 0], [10, -10])

    self.assertEqual(buffer.push([[15, 16, 17, 18, 19], [-15, -16, -17, -18, -19]]), 5)
    npt.assert_equal(buffer.view, np.array([[15, 16, 17, 18, 19], [-15, -16, -17, -18, -19]]))
    npt.assert_equal(buffer[:, 0], [15, -15])

    self.assertEqual(buffer.push([[20, 21, 22, 23, 24, 25], [-20, -21, -22, -23, -24, -25]]), 6)
    npt.assert_equal(buffer.view, np.array([[21, 22, 23, 24, 25], [-21, -22, -23, -24, -25]]))
    npt.assert_equal(buffer[:, 0], [21, -21])
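
# A minimal sketch of the RingBuffer contract pinned down by the tests above
# (inferred from the assertions; RingBuffer itself is defined elsewhere in the
# package):
#
#     rb = RingBuffer([1, 2], shape=(5,), dtype=int)
#     rb.push([3])        # returns 0/falsy: buffer not yet full, nothing dropped
#     rb.push([4, 5, 6])  # returns 1/truthy: oldest value (1) fell off the front
#     rb.view             # array([2, 3, 4, 5, 6]): a view on the current window
#     rb[0]               # 2: indexing is relative to the current window
#
# For multi-dimensional buffers, push() appends along the last axis and its
# return value counts how many columns were dropped.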

class ContextualMatrixProfile(AbstractStreamingConsumer):
    """
    A consumer that constructs the contextual matrix profile. The contextual matrix profile
    is formed by taking the minimum of rectangles across the full distance matrix
    (where the regular matrix profile takes the minimum across columns).

    This consumer supports streaming if the provided context manager does.
    """

    def __init__(self, context_manager: AbstractContextManager, rb_scale_factor=2.):
        """
        Creates a new consumer that calculates a contextual matrix profile,
        according to the contexts defined by the manager.

        :param context_manager: object responsible for defining the spans of each context
            over the query and series axis
        :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data
            (should be >= 1), this allows choosing a balance between less memory (low values)
            and reduced data copying (higher values)
        """
        if rb_scale_factor < 1.:
            raise ValueError("rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor))

        self._num_series_subseq = None
        self._num_query_subseq = None
        self._range = None

        self._contexts = context_manager
        self._query_shift = 0
        self._series_shift = 0

        self._distance_matrix = None
        self._match_index_series = None
        self._match_index_query = None

        self._rb_scale_factor = rb_scale_factor

    def initialise(self, dims, query_subseq, series_subseq):
        self._num_series_subseq = series_subseq
        self._num_query_subseq = query_subseq
        self._range = np.arange(0, max(series_subseq, query_subseq), dtype=int)

        num_query_contexts, num_series_contexts = self._contexts.context_matrix_shape()

        self._distance_matrix = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), np.inf, dtype=float),
            scaling_factor=self._rb_scale_factor)
        self._match_index_series = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), -1, dtype=int),
            scaling_factor=self._rb_scale_factor)
        self._match_index_query = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), -1, dtype=int),
            scaling_factor=self._rb_scale_factor)

    def process_diagonal(self, diag, values):
        values = values[0]
        num_values = len(values)

        if diag >= 0:
            values_idx1_start = diag
            context0_idxs = self._contexts.query_contexts(0, num_values)
        else:
            values_idx1_start = 0
            context0_idxs = self._contexts.query_contexts(-diag, self._num_query_subseq)

        for c0_start, c0_end, c0_identifier in context0_idxs:
            # We now have a sub-sequence (ss) defined by the first context on the query axis.
            # In absolute coordinates, start/end of this subsequence on the 2nd axis (series axis):
            ss1_start = min(max(0, c0_start + diag), self._num_series_subseq)
            ss1_end = min(self._num_series_subseq, min(self._num_query_subseq, c0_end) + diag)

            if ss1_start == ss1_end:
                continue

            context1_idxs = self._contexts.series_contexts(ss1_start, ss1_end)

            for c1_start, c1_end, c1_identifier in context1_idxs:
                # In absolute coordinates, start/end of the subsequence on the 2nd axis defined by both contexts
                sss1_start = max(ss1_start, c1_start)
                sss1_end = min(ss1_end, c1_end)

                # Values that belong to both contexts
                sss_values = values[sss1_start - values_idx1_start: sss1_end - values_idx1_start]

                # Compare if better than current
                min_sss_value = np.min(sss_values)
                is_better = min_sss_value < self._distance_matrix[c0_identifier, c1_identifier]

                if is_better:
                    self._distance_matrix[c0_identifier, c1_identifier] = min_sss_value
                    rel_indices = np.argmin(sss_values)
                    sss0_start = sss1_start - diag
                    self._match_index_query[c0_identifier, c1_identifier] = \
                        rel_indices + sss0_start + self._query_shift
                    self._match_index_series[c0_identifier, c1_identifier] = \
                        rel_indices + sss1_start + self._series_shift

    def process_column(self, column_index, values):
        values = values[0]

        context1_idxs = self._contexts.series_contexts(column_index, column_index + 1)

        for _, _, c1_identifier in context1_idxs:
            query_contexts = self._contexts.query_contexts(0, self._num_query_subseq)

            for c0_start, c0_end, c0_identifier in query_contexts:
                subseq = values[c0_start: c0_end]
                best_value = np.min(subseq)

                if best_value < self._distance_matrix[c0_identifier, c1_identifier]:
                    self._distance_matrix[c0_identifier, c1_identifier] = best_value
                    self._match_index_query[c0_identifier, c1_identifier] = \
                        np.argmin(subseq) + c0_start + self._query_shift
                    self._match_index_series[c0_identifier, c1_identifier] = \
                        column_index + self._series_shift

    def shift_series(self, amount):
        context_shift = self._contexts.shift_series(amount)
        self._series_shift += amount

        if context_shift > 0:
            height = self._distance_matrix.max_shape[0]
            self._distance_matrix.push(np.full((height, context_shift), np.inf, dtype=float))
            self._match_index_series.push(np.full((height, context_shift), -1, dtype=int))
            self._match_index_query.push(np.full((height, context_shift), -1, dtype=int))

    def shift_query(self, amount):
        context_shift = self._contexts.shift_query(amount)
        self._query_shift += amount

        if context_shift > 0:
            # Note: This could be more efficient using a 2D RingBuffer.
            # Roll the retired query contexts off the top and clear the freed rows at the bottom.
            height = min(context_shift, self._distance_matrix.max_shape[0])
            self._distance_matrix.view = np.roll(self._distance_matrix.view, -context_shift, axis=0)
            self._distance_matrix[-height:, :] = np.inf
            self._match_index_series.view = np.roll(self._match_index_series.view, -context_shift, axis=0)
            self._match_index_series[-height:, :] = -1
            self._match_index_query.view = np.roll(self._match_index_query.view, -context_shift, axis=0)
            self._match_index_query[-height:, :] = -1

    @property
    def match_index_query(self):
        return self._match_index_query.view

    @property
    def match_index_series(self):
        return self._match_index_series.view

    @property
    def distance_matrix(self):
        return self._distance_matrix.view
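
# Usage sketch for ContextualMatrixProfile (illustrative only; GeneralStaticManager
# stands in for any AbstractContextManager that defines the context spans, and the
# calculator feeding this consumer is assumed to exist elsewhere in the package):
#
#     contexts = GeneralStaticManager([range(0, 100), range(100, 200)])
#     cmp = ContextualMatrixProfile(contexts)
#     # ... after a distance matrix calculator has fed cmp via process_column/process_diagonal:
#     cmp.distance_matrix      # (num_query_contexts, num_series_contexts) minima
#     cmp.match_index_query    # absolute query index of each best match
#     cmp.match_index_series   # absolute series index of each best match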

class MultidimensionalMatrixProfileLR(AbstractStreamingConsumer):
    """
    A consumer that builds the multidimensional matrix profile. This consumer takes in
    distance measures from multiple channels (dimensions) at the same time and tracks
    the best distance, the index of this match and the dimensions used in this match.

    More specifically, if the input has N data channels, this consumer will select, for each
    number of channels (1, 2, ..., N), the channels containing the best match, index and
    dimensions. It will not track matches for every possible combination of channels.

    This consumer keeps track of the left and right multidimensional profile, and can be used
    to create the (normal) multidimensional profile from it. The left profile, index and
    dimensions at index i contain information about a match whose index is less than or equal
    to i, while the right profile, index and dimensions track information about a match whose
    index is larger than i.

    The profile is an array with shape (num_dimensions, num_distances). The value at row i,
    column j contains the best averaged distance encountered at index j for any i+1 dimensions.
    The index is similar, but tracks the index of the query series that had the best match.
    The dimensions being tracked is a list of length num_dimensions. Entry i of this list
    contains an (i+1, num_distances) array that lists the indices of the dimensions that
    contained the best match.

    This consumer supports streaming.
    """

    def __init__(self, rb_scale_factor=2.):
        """
        Creates a new consumer that calculates the left and right matrix profile, the
        corresponding indices and the used dimensions over multiple dimensions (data channels).

        :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data
            (should be >= 1), this allows choosing a balance between less memory (low values)
            and reduced data copying (higher values)
        """
        if rb_scale_factor < 1.:
            raise ValueError("rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor))

        self._num_subseq = None
        self._range = None
        self._n_dim = None

        self._md_matrix_profile_left = None
        self._md_profile_index_left = None
        self._md_profile_dimension_left = None

        self._md_matrix_profile_right = None
        self._md_profile_index_right = None
        self._md_profile_dimension_right = None

        self._series_shift = 0
        self._query_shift = 0

        self._rb_scale_factor = rb_scale_factor

    def initialise(self, dims, query_subseq, series_subseq):
        self._n_dim = dims
        self._num_subseq = series_subseq
        self._range = RingBuffer(np.arange(0, self._num_subseq, dtype=int),
                                 scaling_factor=self._rb_scale_factor)

        self._md_matrix_profile_left = RingBuffer(
            np.full((dims, self._num_subseq), np.inf, dtype=float),
            scaling_factor=self._rb_scale_factor)
        self._md_profile_index_left = RingBuffer(
            np.full((dims, self._num_subseq), -1, dtype=int),
            scaling_factor=self._rb_scale_factor)
        self._md_profile_dimension_left = \
            [RingBuffer(np.full((i + 1, self._num_subseq), -1, dtype=int),
                        scaling_factor=self._rb_scale_factor) for i in range(dims)]

        self._md_matrix_profile_right = RingBuffer(
            np.full((dims, self._num_subseq), np.inf, dtype=float),
            scaling_factor=self._rb_scale_factor)
        self._md_profile_index_right = RingBuffer(
            np.full((dims, self._num_subseq), -1, dtype=int),
            scaling_factor=self._rb_scale_factor)
        self._md_profile_dimension_right = \
            [RingBuffer(np.full((i + 1, self._num_subseq), -1, dtype=int),
                        scaling_factor=self._rb_scale_factor) for i in range(dims)]

    def process_diagonal(self, diag, values):
        n_dim, num_values = values.shape
        shift_diff = self._series_shift - self._query_shift

        values_sort_order = np.argsort(values, axis=0)
        values_sorted = np.sort(values, axis=0)
        values_cumsum = np.zeros(num_values)

        if diag + shift_diff >= 0:
            # left MP
            if diag >= 0:
                for dim in range(n_dim):
                    values_cumsum += values_sorted[dim, :]
                    values_mean_over_dim = values_cumsum / (dim + 1)
                    self._update_matrix_profile(
                        values_mean_over_dim,
                        self._range[:num_values],
                        values_sort_order[:dim + 1, :],
                        self._md_matrix_profile_left[dim, diag:diag + num_values],
                        self._md_profile_index_left[dim, diag:diag + num_values],
                        self._md_profile_dimension_left[dim][:, diag:diag + num_values])
            else:
                for dim in range(n_dim):
                    values_cumsum += values_sorted[dim, :]
                    values_mean_over_dim = values_cumsum / (dim + 1)
                    self._update_matrix_profile(
                        values_mean_over_dim,
                        self._range[-diag:-diag + num_values],
                        values_sort_order[:dim + 1, :],
                        self._md_matrix_profile_left[dim, :num_values],
                        self._md_profile_index_left[dim, :num_values],
                        self._md_profile_dimension_left[dim][:, :num_values])
        else:
            # right MP
            if diag >= 0:
                for dim in range(n_dim):
                    values_cumsum += values_sorted[dim, :]
                    values_mean_over_dim = values_cumsum / (dim + 1)
                    self._update_matrix_profile(
                        values_mean_over_dim,
                        self._range[:num_values],
                        values_sort_order[:dim + 1, :],
                        self._md_matrix_profile_right[dim, diag:diag + num_values],
                        self._md_profile_index_right[dim, diag:diag + num_values],
                        self._md_profile_dimension_right[dim][:, diag:diag + num_values])
            else:
                for dim in range(n_dim):
                    values_cumsum += values_sorted[dim, :]
                    values_mean_over_dim = values_cumsum / (dim + 1)
                    self._update_matrix_profile(
                        values_mean_over_dim,
                        self._range[-diag:-diag + num_values],
                        values_sort_order[:dim + 1, :],
                        self._md_matrix_profile_right[dim, :num_values],
                        self._md_profile_index_right[dim, :num_values],
                        self._md_profile_dimension_right[dim][:, :num_values])

    def _update_matrix_profile(self, new_distances, new_distance_indices, new_distance_dimensions,
                               matrix_profile, matrix_profile_index, matrix_profile_dims):
        update_pos = new_distances < matrix_profile
        matrix_profile[update_pos] = new_distances[update_pos]
        matrix_profile_index[update_pos] = new_distance_indices[update_pos]
        matrix_profile_dims[:, update_pos] = new_distance_dimensions[:, update_pos]

    def process_column(self, column_index, values):
        n_dim, num_values = values.shape
        shift_diff = self._series_shift - self._query_shift
        border = max(0, column_index + 1 + shift_diff)

        values_sorted = np.sort(values, axis=0)
        values_cumsum = np.zeros(num_values)

        for dim in range(n_dim):
            values_cumsum += values_sorted[dim, :]

            if border > 0:
                min_position_l = np.argmin(values_cumsum[:border])
                new_min_value = values_cumsum[min_position_l] / (dim + 1)
                if new_min_value < self._md_matrix_profile_left[dim, column_index]:
                    self._md_matrix_profile_left[dim, column_index] = new_min_value
                    self._md_profile_index_left[dim, column_index] = min_position_l + self._query_shift
                    self._md_profile_dimension_left[dim][:, column_index] = \
                        np.argsort(values[:, min_position_l])[:dim + 1]

            # Check if the column crosses into the lower triangle of the distance matrix
            if num_values > border:
                min_position_r = np.argmin(values_cumsum[border:]) + border
                new_min_value = values_cumsum[min_position_r] / (dim + 1)
                # In case of shifting, a lower value could already be present
                if new_min_value < self._md_matrix_profile_right[dim, column_index]:
                    self._md_matrix_profile_right[dim, column_index] = new_min_value
                    self._md_profile_index_right[dim, column_index] = min_position_r + self._query_shift
                    self._md_profile_dimension_right[dim][:, column_index] = \
                        np.argsort(values[:, min_position_r])[:dim + 1]

    def shift_query(self, amount):
        if amount == 0:
            return

        self._query_shift += amount
        self._range.push(np.arange(self._range[-1] + 1, self._range[-1] + 1 + amount))

    def shift_series(self, amount):
        if amount == 0:
            return

        self._series_shift += amount
        push_values = np.full((self._n_dim, amount), np.inf)
        self._md_matrix_profile_left.push(push_values)
        self._md_matrix_profile_right.push(push_values)

        push_values[:] = -1
        self._md_profile_index_left.push(push_values)
        self._md_profile_index_right.push(push_values)
        for dim in range(self._n_dim):
            self._md_profile_dimension_left[dim].push(push_values[:dim + 1, :])
            self._md_profile_dimension_right[dim].push(push_values[:dim + 1, :])

    def md_matrix_profile(self):
        """
        Merges the left and right multidimensional matrix profile, to create the
        multidimensional matrix profile.

        :return: ndarray of shape (num_dimensions, num_subsequences)
        """
        left_best = self._md_matrix_profile_left.view < self._md_matrix_profile_right.view
        return np.where(left_best,
                        self._md_matrix_profile_left.view,
                        self._md_matrix_profile_right.view)

    def md_profile_index(self):
        """
        Merges the left and right multidimensional matrix profile index, to create the
        multidimensional matrix profile index.

        :return: ndarray of shape (num_dimensions, num_subsequences)
        """
        left_best = self._md_matrix_profile_left.view < self._md_matrix_profile_right.view
        return np.where(left_best,
                        self._md_profile_index_left.view,
                        self._md_profile_index_right.view)

    def md_profile_dimensions(self):
        """
        Merges the left and right dimensions, to create the dimensions for the
        multidimensional matrix profile.

        :return: list of length num_dimensions, where the entry at index i is an ndarray
            of shape (i+1, num_subsequences)
        """
        profile_dimension = [
            np.full((i + 1, self._num_subseq), -1, dtype=int) for i in range(self._n_dim)
        ]

        for dim in range(self._n_dim):
            left_best = self._md_matrix_profile_left[dim, :] < self._md_matrix_profile_right[dim, :]
            profile_dimension[dim] = np.where(left_best,
                                              self._md_profile_dimension_left[dim].view,
                                              self._md_profile_dimension_right[dim].view)
        return profile_dimension

    @property
    def md_matrix_profile_left(self):
        return self._md_matrix_profile_left.view

    @property
    def md_matrix_profile_right(self):
        return self._md_matrix_profile_right.view

    @property
    def md_profile_index_left(self):
        return self._md_profile_index_left.view

    @property
    def md_profile_index_right(self):
        return self._md_profile_index_right.view

    @property
    def md_profile_dimension_left(self):
        return [buffer.view for buffer in self._md_profile_dimension_left]

    @property
    def md_profile_dimension_right(self):
        return [buffer.view for buffer in self._md_profile_dimension_right]
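
# The sort/cumsum loop in process_diagonal and process_column implements the usual
# multidimensional matrix profile aggregation: sort the distances over the channel
# axis, then the best k-channel distance is the mean of the k smallest values.
# A standalone illustration with made-up numbers (not library API):
#
#     values = np.array([[3., 1.],
#                        [1., 4.],
#                        [2., 0.]])                    # 3 channels, 2 columns
#     values_sorted = np.sort(values, axis=0)
#     k_dim = np.cumsum(values_sorted, axis=0) / np.arange(1, 4)[:, None]
#     # k_dim[k - 1, j] is the best mean distance at column j using k channels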

class StreamingStats(object):
    """
    Class that tracks a data stream and the corresponding mean and standard deviation
    of a window over this data.

    The data stream has to be updated by the user, after which the mean/std stream will
    be updated automatically.

    This class uses RingBuffers internally, so any old view (data, mean, std) should be
    considered unreliable after new data was pushed to this class.
    """

    def __init__(self, series, m) -> None:
        """
        Creates a new instance. This instance will keep track of a data stream
        (with dimensions matching those of series) and a stream of moving mean
        and standard deviation using a window of length m.

        :param series: starting data of the data stream
        :param m: window size for mean and variance
        """
        if m > series.shape[-1]:
            raise RuntimeError("m should be <= series.shape[-1].")

        self._data_buffer = RingBuffer(series)
        self._m = m

        sliding_avg, sliding_std = sliding_mean_std(series, m)
        self._mean_buffer = RingBuffer(sliding_avg)
        self._std_buffer = RingBuffer(sliding_std)

    def append(self, data):
        data_length = data.shape[-1]
        if data_length == 0:
            return

        self._data_buffer.push(data)
        new_means, new_stds = sliding_mean_std(self._data_buffer[max(-self._m - 1 - data_length, 0):], self._m)
        self._mean_buffer.push(new_means)
        self._std_buffer.push(new_stds)

    # Original implementation below; this approach might still be interesting if the current
    # approach proves to be too slow in practice. One issue that remains to be solved (and the
    # reason this method was replaced) is that a mid-signal constant window will not result in
    # a variance of 0. One approach might be to simply check for constant signals.
    # A starting point might be:
    # https://stackoverflow.com/questions/1066758/find-length-of-sequences-of-identical-values-in-a-numpy-array-run-length-encodi?rq=1
    # The numerical stability test gives a use case where this method fails.
    #
    # buffer_length = self._data_buffer.view.shape[-1]
    # if data_length >= buffer_length:
    #     sliding_avg, sliding_var = sliding_mean_var(data[..., -buffer_length:], self._m)
    #     self._mean_buffer.push(sliding_avg)
    #     self._var_buffer.push(sliding_var)
    # else:
    #     # Sliding variance formula: http://jonisalonen.com/2014/efficient-and-accurate-rolling-standard-deviation/
    #     # First steps of derivation: http://jonisalonen.com/2013/deriving-welfords-method-for-computing-variance/
    #     # (For non-online calculation, the formula used in sliding_mean_var is faster)
    #
    #     old_mean = self._mean_buffer.view[..., -1]
    #     old_var = self._var_buffer.view[..., -1]
    #     values_to_remove = self._data_buffer.view[..., -self._m: min(-1, -self._m + data_length)]
    #     values_to_add = data[..., :values_to_remove.shape[-1]]
    #     new_means = old_mean + np.cumsum(- values_to_remove + values_to_add) / self._m
    #     old_means = np.concatenate((np.atleast_1d(old_mean), new_means[..., :-1]))
    #     new_vars = old_var + np.cumsum((values_to_add - values_to_remove) * (
    #             values_to_add - new_means + values_to_remove - old_means) / self._m)
    #     new_vars[new_vars < 1e-12] = 0.  # Unreliable!
    #
    #     self._mean_buffer.push(new_means)
    #     self._var_buffer.push(new_vars)
    #
    # if data_length >= self._m:
    #     sliding_avg, sliding_var = sliding_mean_var(data, self._m)
    #     self._mean_buffer.push(sliding_avg)
    #     self._var_buffer.push(sliding_var)
    #
    # self._data_buffer.push(data)

    @property
    def data(self):
        return self._data_buffer.view

    @property
    def mean(self):
        return self._mean_buffer.view

    @property
    def std(self):
        return self._std_buffer.view
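
# Usage sketch (illustrative; relies only on the API defined above):
#
#     stats = StreamingStats(np.array([0., 1., 2., 3.]), m=2)
#     stats.mean               # means of all length-2 windows so far
#     stats.append(np.array([4., 5.]))
#     stats.std                # stds now cover the windows ending in the new data
#
# Per the class docstring, any view taken before append() may be invalidated,
# so re-read the data/mean/std properties after every append.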

class BoundStreamingEuclidean(AbstractBoundStreamingGenerator):
    def __init__(self, m, series, query, self_join):
        self.m = m
        self.series = series
        self.query = query
        self.self_join = self_join

        self.first_row = None
        self.first_row_backlog = 0  # The number of values not yet processed for the first row cache
        self.prev_calc_column_index = None
        self.prev_calc_column_sq_dist = None

    def append_series(self, values):
        if len(values) == 0:
            return

        data_dropped = self.series.push(values)
        num_dropped = len(values) - (self.series.max_shape[0] - self.series.view.shape[0])
        self.first_row_backlog += len(values)

        if self.prev_calc_column_index is not None and num_dropped > 0:
            self.prev_calc_column_index -= num_dropped

        if self.self_join:
            if data_dropped:
                self.first_row = None  # The first row was dropped by new data
                self.prev_calc_column_index = None

    def append_query(self, values):
        if self.self_join:
            raise RuntimeError("Cannot append query data in case of a self join.")
        if len(values) == 0:
            return

        if self.query.push(values):
            self.first_row = None  # The first row was dropped by new data
            self.prev_calc_column_index = None

    def calc_diagonal(self, diag):
        dl = diag_length(len(self.query.view), len(self.series.view), diag)
        cumsum = np.zeros(dl + 1, dtype=float)

        if diag >= 0:
            # Eg: for diag = 2:
            # D = (y0 - x2)², (y1 - x3)², (y2 - x4)²...
            # cumsum = 0, D0, D0+D1, D0+D1+D2, ...
            cumsum[1:] = np.cumsum(np.square(self.query[:dl] - self.series[diag: diag + dl]))
        else:
            # Eg: for diag = -2:
            # D = (y2 - x0)², (y3 - x1)², (y4 - x2)²...
            # cumsum = 0, D0, D0+D1, D0+D1+D2, ...
            cumsum[1:] = np.cumsum(np.square(self.query[-diag: -diag + dl] - self.series[:dl]))

        return np.sqrt(cumsum[self.m:] - cumsum[:len(cumsum) - self.m])

    def calc_column(self, column):
        if self.prev_calc_column_index != column - 1 or column == 0:
            # Previous column not cached or data for incremental calculation not available: full calculation
            sq_dist = _euclidean_distance_squared(self.query.view, self.series[column:column + self.m])
        else:
            # Previous column cached, reuse it
            if self.first_row is None:
                self.first_row = RingBuffer(
                    _euclidean_distance_squared(self.series.view, self.query[0: self.m]),
                    shape=(self.series.max_shape[0] - self.m + 1,))
                self.first_row_backlog = 0
            elif self.first_row_backlog > 0:
                # Series has been updated since the last calculation of first_row
                elems_to_recalc = self.first_row_backlog + self.m - 1
                self.first_row.push(
                    _euclidean_distance_squared(self.series[-elems_to_recalc:], self.query[0: self.m]))
                self.first_row_backlog = 0

            sq_dist = self.prev_calc_column_sq_dist  # work in the same array
            sq_dist[1:] = (self.prev_calc_column_sq_dist[:-1]
                           - np.square(self.series[column - 1] - self.query[:len(self.query.view) - self.m])
                           + np.square(self.series[column + self.m - 1] - self.query[self.m:]))
            sq_dist[0] = self.first_row[column]

        self.prev_calc_column_sq_dist = sq_dist
        self.prev_calc_column_index = column

        return np.sqrt(sq_dist)
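
# calc_diagonal above relies on a prefix-sum identity: with D_i the squared
# difference of aligned points and C the zero-padded cumulative sum of D, the
# squared distance of the window starting at i is C[i + m] - C[i]. A quick
# standalone check in plain NumPy (made-up inputs, not library API):
#
#     q = np.array([0., 1., 2., 3.])
#     s = np.array([1., 1., 2., 4.])
#     m = 2
#     c = np.concatenate(([0.], np.cumsum(np.square(q - s))))
#     np.sqrt(c[m:] - c[:-m])   # array([1., 0., 1.]): all m-length window distances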

class BoundStreamingFilterGenerator(BoundFilterGenerator, AbstractBoundStreamingGenerator):
    """
    Wrapper around other generators that replaces values in the distance matrix marked
    as invalid by positive infinity.

    It can also perform a data pre-processing step before data reaches the wrapped
    generator, by setting values marked as invalid to zero. This can be useful, for
    example, to remove nan values for a generator that does not support nan values.
    """

    def __init__(self, generator, m, num_s_subseq, num_q_subseq, invalid_data_function, rb_scale_factor):
        """
        Creates a new generator by wrapping another generator.

        :param generator: the generator whose results and input data will be filtered
        :param invalid_data_function: optional - a function that takes in the original data
            (series or query) and subsequence length and returns a boolean array of the same
            size that has a True value for any invalid values. These values will be replaced
            by zeros before reaching the wrapped generator. Any distance values that were
            calculated using invalid data points will be positive infinity.
        """
        self._invalid_data_function = invalid_data_function

        invalid_s_subseq_buffer = RingBuffer(None, shape=(num_s_subseq, ), dtype=bool,
                                             scaling_factor=rb_scale_factor)
        self.invalid_series = RingBuffer(None, shape=(num_s_subseq + m - 1, ), dtype=bool,
                                         scaling_factor=rb_scale_factor)

        if num_q_subseq is None:
            self.self_join = True
            invalid_q_subseq_buffer = invalid_s_subseq_buffer
            num_q_subseq = num_s_subseq
            self.invalid_query = self.invalid_series
        else:
            self.self_join = False
            invalid_q_subseq_buffer = RingBuffer(None, shape=(num_q_subseq, ), dtype=bool,
                                                 scaling_factor=rb_scale_factor)
            self.invalid_query = RingBuffer(None, shape=(num_q_subseq + m - 1, ), dtype=bool,
                                            scaling_factor=rb_scale_factor)

        super().__init__(generator, m, num_q_subseq, invalid_s_subseq_buffer, invalid_q_subseq_buffer)

    def append_series(self, values):
        invalid_points = _apply_data_validation(values, self.m, self._invalid_data_function)
        self.invalid_series.push(invalid_points)

        if np.any(invalid_points):
            values = values.copy()
            values[invalid_points] = 0

        if len(self.invalid_series.view) >= self.m:
            rel_values = self.invalid_series[-(len(values) + self.m - 1):]
            self.invalid_series_subseq.push(np.any(sliding_window_view(rel_values, (self.m, )), axis=-1))

        self.generator.append_series(values)

    def append_query(self, values):
        if self.self_join:
            raise RuntimeError("Cannot append to query for a self-join.")

        invalid_points = _apply_data_validation(values, self.m, self._invalid_data_function)
        self.invalid_query.push(invalid_points)

        if np.any(invalid_points):
            values = values.copy()
            values[invalid_points] = 0

        if len(self.invalid_query.view) >= self.m:
            rel_values = self.invalid_query[-(len(values) + self.m - 1):]
            self.invalid_query_subseq.push(np.any(sliding_window_view(rel_values, (self.m, )), axis=-1))

        self.generator.append_query(values)

    def calc_column(self, column):
        if self.invalid_series_subseq[column]:
            return np.full(len(self.invalid_query_subseq.view), np.inf)

        distances = self.generator.calc_column(column)
        distances[self.invalid_query_subseq.view] = np.inf
        return distances
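
# A typical invalid_data_function simply flags non-finite points, so nan gaps are
# zeroed before reaching the wrapped generator and every affected distance comes
# back as infinity (a sketch; the (data, m) signature matches the docstring above):
#
#     def flag_non_finite(data, m):
#         return ~np.isfinite(data)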

class BoundZNormEuclidean(AbstractBoundStreamingGenerator):
    def __init__(self, m, series, query, self_join, noise_std,
                 series_mu, series_std, series_std_nz, query_mu, query_std, query_std_nz):
        """
        :param m: subsequence length to consider for distance calculations
        :param series: empty ringbuffer, properly sized to contain the desired window for series
        :param query: empty ringbuffer, properly sized to contain the desired window for query,
            or the same buffer as series in case of a self-join
        :param self_join: whether or not a self-join should be done
        :param noise_std: standard deviation of noise on series/query, zero to disable noise cancellation
        """
        # Core values
        self.m = m
        self.series = series
        self.query = query
        self.noise_std = noise_std
        self.self_join = self_join

        # Derived values
        self.mu_s = series_mu
        self.std_s = series_std
        self.std_s_nonzero = series_std_nz
        self.mu_q = query_mu
        self.std_q = query_std
        self.std_q_nonzero = query_std_nz

        # Caching
        self.first_row = None
        self.first_row_backlog = 0
        self.prev_calc_column_index = None
        self.prev_calc_column_dot_prod = None

    def append_series(self, values):
        if len(values) == 0:
            return

        data_dropped = self.series.push(values)
        num_dropped = len(values) - (self.series.max_shape[0] - self.series.view.shape[0])
        self.first_row_backlog += len(values)

        if len(self.series.view) >= self.m:
            num_affected = len(values) + self.m - 1
            new_mu, new_std = sliding_mean_std(self.series[-num_affected:], self.m)
            self.mu_s.push(new_mu)
            self.std_s.push(new_std)
            self.std_s_nonzero.push(new_std != 0.)

        if self.prev_calc_column_index is not None and num_dropped > 0:
            self.prev_calc_column_index -= num_dropped

        if self.self_join:
            if data_dropped:
                self.first_row = None  # The first row was dropped by new data
                self.prev_calc_column_index = None

    def append_query(self, values):
        if self.self_join:
            raise RuntimeError("Cannot append query data in case of a self join.")
        if len(values) == 0:
            return

        if self.query.push(values):
            self.first_row = None  # The first row was dropped by new data
            self.prev_calc_column_index = None

        if len(self.query.view) >= self.m:
            num_affected = len(values) + self.m - 1
            new_mu, new_std = sliding_mean_std(self.query[-num_affected:], self.m)
            self.mu_q.push(new_mu)
            self.std_q.push(new_std)
            self.std_q_nonzero.push(new_std != 0.)

    def calc_diagonal(self, diag):
        dl = diag_length(len(self.query.view), len(self.series.view), diag)  # Number of affected data points
        dlr = dl - self.m + 1  # Number of entries in the diagonal
        cumsum = np.zeros(dl + 1, dtype=float)

        if diag >= 0:
            # Eg: for diag = 2:
            # D = (y0 * x2), (y1 * x3), (y2 * x4)...
            # cumsum = 0, D0, D0+D1, D0+D1+D2, ...
            cumsum[1:] = np.cumsum(self.query[:dl] * self.series[diag: diag + dl])
            q_range = slice(0, dlr)
            s_range = slice(diag, diag + dlr)
        else:
            # Eg: for diag = -2:
            # D = (y2 * x0), (y3 * x1), (y4 * x2)...
            # cumsum = 0, D0, D0+D1, D0+D1+D2, ...
            cumsum[1:] = np.cumsum(self.query[-diag: -diag + dl] * self.series[:dl])
            s_range = slice(0, dlr)
            q_range = slice(-diag, -diag + dlr)

        mean_q = self.mu_q[q_range]
        mean_s = self.mu_s[s_range]
        std_q = self.std_q[q_range]
        std_s = self.std_s[s_range]

        dot_prod = cumsum[self.m:] - cumsum[:dlr]

        dist_sq = np.zeros(dlr, dtype=float)
        non_zero_std_q = self.std_q_nonzero[q_range]
        non_zero_std_s = self.std_s_nonzero[s_range]

        # For subsequences where both signals are stable (std = 0), we define the distance as zero.
        # This is covered by the initialization of the dist array.
        # For subsequences where exactly one signal is stable, the distance is sqrt(m) by definition.
        dist_sq[np.logical_xor(non_zero_std_q, non_zero_std_s)] = self.m

        # Formula for regular (non-stable) subsequences
        mask = np.logical_and(non_zero_std_q, non_zero_std_s)
        dist_sq[mask] = (2 * (self.m - (dot_prod[mask] - self.m * mean_q[mask] * mean_s[mask]) /
                              (std_q[mask] * std_s[mask])))

        # Noise correction - see the paper "Eliminating noise in the matrix profile"
        if self.noise_std != 0.:
            mask = np.logical_or(non_zero_std_q, non_zero_std_s)
            dist_sq[mask] -= (2 * (self.m + 1) * np.square(self.noise_std) /
                              np.square(np.maximum(std_s[mask], std_q[mask])))

        # Before the noise correction, small negative values are possible due to rounding.
        # After the noise correction, larger negative values are also possible.
        # Correct all negative values to zero.
        dist_sq[dist_sq < _EPS] = 0

        return np.sqrt(dist_sq)

    def calc_column(self, column):
        dist_sq = np.zeros(len(self.query.view) - self.m + 1, dtype=float)
        series_subseq = self.series[column: column + self.m]

        if self.prev_calc_column_index != column - 1 or column == 0:
            # Previous column not cached or data for incremental calculation not available: full calculation
            dot_prod = fftconvolve(self.query.view, series_subseq[::-1], 'valid')
        else:
            # Previous column cached, reuse it
            if self.first_row is None:
                first_query = self.query[0:self.m]
                self.first_row = RingBuffer(fftconvolve(self.series.view, first_query[::-1], 'valid'),
                                            shape=(self.series.max_shape[0] - self.m + 1,))
                self.first_row_backlog = 0
            elif self.first_row_backlog > 0:
                # Series has been updated since the last calculation of first_row
                elems_to_recalc = self.first_row_backlog + self.m - 1
                first_query = self.query[0:self.m]
                self.first_row.push(fftconvolve(self.series[-elems_to_recalc:], first_query[::-1], 'valid'))
                self.first_row_backlog = 0

            dot_prod = self.prev_calc_column_dot_prod  # work in the same array
            dot_prod[1:] = (self.prev_calc_column_dot_prod[:-1]
                            - self.series[column - 1] * self.query[:len(self.query.view) - self.m]
                            + self.series[column + self.m - 1] * self.query[self.m:])
            dot_prod[0] = self.first_row[column]

        self.prev_calc_column_dot_prod = dot_prod
        self.prev_calc_column_index = column

        if self.std_s[column] != 0:
            q_valid = self.std_q.view != 0
            # Series subsequence is not stable; if a query subsequence is stable,
            # the distance is sqrt(m) by definition.
            dist_sq[~q_valid] = self.m
            dist_sq[q_valid] = 2 * (self.m - (dot_prod[q_valid] - self.m * self.mu_q[q_valid] * self.mu_s[column]) /
                                    (self.std_q[q_valid] * self.std_s[column]))
        else:
            # Series subsequence is stable: results are either sqrt(m) or 0, depending on
            # whether or not the query subsequences are stable as well.
            dist_sq[self.std_q.view != 0] = self.m
            # dist_sq[self.std_q.view == 0] = 0  # Covered by array initialization

        # Noise correction - see the paper "Eliminating noise in the matrix profile"
        if self.noise_std != 0.:
            if self.std_s[column] != 0:
                mask = slice(None)
            else:
                mask = self.std_q.view != 0
            dist_sq[mask] -= (2 * (self.m + 1) * np.square(self.noise_std) /
                              np.square(np.maximum(self.std_s[column], self.std_q[mask])))

        # Before the noise correction, small negative values are possible due to rounding.
        # After the noise correction, larger negative values are also possible.
        # Correct all negative values to zero.
        dist_sq[dist_sq < _EPS] = 0

        return np.sqrt(dist_sq)

    def calc_single(self, row, column, dot_prod=None):
        """
        Calculates a single point of the distance matrix.

        :param row: index of the subsequence in the query series
        :param column: index of the subsequence in the data series
        :param dot_prod: the dot product of the subsequences; if provided, this method
            can run in constant time
        :return: z-normalised distance of the 2 subsequences
        """
        std_q = self.std_q[row]
        std_s = self.std_s[column]

        if std_q == 0. and std_s == 0.:
            return 0.
        if std_q == 0. or std_s == 0.:
            return np.sqrt(self.m)

        if dot_prod is None:
            dot_prod = np.sum(self.query[row: row + self.m] * self.series[column: column + self.m])

        mean_q = self.mu_q[row]
        mean_s = self.mu_s[column]

        dist_sq = 2 * (self.m - (dot_prod - self.m * mean_q * mean_s) / (std_q * std_s))

        if self.noise_std != 0.:
            dist_sq -= (2 * (self.m + 1) * np.square(self.noise_std) /
                        np.square(np.maximum(std_s, std_q)))

        if dist_sq < _EPS:
            return 0.
        return np.sqrt(dist_sq)
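
# calc_single encodes the classic z-normalised identity dist² = 2m(1 - r), with r
# the Pearson correlation of the two windows, written via the dot product QT as
# 2 * (m - (QT - m*mu_q*mu_s) / (std_q*std_s)). A standalone sanity check with
# made-up values (noise_std = 0; assumes population std, ddof=0, which is
# np.std's default and presumably what sliding_mean_std computes):
#
#     q = np.array([0., 1., 2.])
#     s = np.array([1., 3., 5.])   # perfectly correlated with q
#     m = 3
#     zq = (q - q.mean()) / q.std()
#     zs = (s - s.mean()) / s.std()
#     np.sqrt(np.sum(np.square(zq - zs)))                       # 0.0 (direct)
#     2 * (m - (q @ s - m * q.mean() * s.mean())
#          / (q.std() * s.std()))                               # 0.0 (via dot product)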

class ShiftingMatrixProfileLR(MatrixProfileLR, AbstractStreamingConsumer):
    """
    Extension of MatrixProfileLR which supports streaming.

    The profile indices tracked in this consumer refer to positions in the complete query series.
    As an example, if the original query consisted of 10 subsequences, but has since shifted by
    5 subsequences, the profile indices can contain any value in [0..15], or -1 if no matrix
    profile value exists. These indices can be converted to indices local to the current window
    by subtracting the query_shift; keep in mind that some indices of the left matrix profile
    can refer to positions outside the current window.

    The concept of the left and right matrix profile is only useful when both query and series
    shift at the same time (distances are calculated over a self-join). Even if this is not the
    case, the values in this consumer are correct: the left matrix profile stores any values on
    or above the (unshifted) main diagonal, the right matrix profile stores any values below the
    (unshifted) main diagonal. (Since the diagonal shifts away when only the series is shifted,
    eventually only the left matrix profile will be used.)
    """

    def __init__(self, rb_scale_factor=2.):
        """
        Creates a new instance.

        :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data
            (should be >= 1), this allows choosing a balance between less memory (low values)
            and reduced data copying (higher values)
        """
        if rb_scale_factor < 1.:
            raise ValueError("rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor))

        super().__init__()
        self.series_shift = 0
        self.query_shift = 0
        self._rb_scale_factor = rb_scale_factor

    def initialise(self, dims, query_subseq, series_subseq):
        super().initialise(dims, query_subseq, series_subseq)
        self._range = RingBuffer(self._range, scaling_factor=self._rb_scale_factor)
        self._matrix_profile_left = RingBuffer(self._matrix_profile_left,
                                               scaling_factor=self._rb_scale_factor)
        self._profile_index_left = RingBuffer(self._profile_index_left,
                                              scaling_factor=self._rb_scale_factor)
        self._matrix_profile_right = RingBuffer(self._matrix_profile_right,
                                                scaling_factor=self._rb_scale_factor)
        self._profile_index_right = RingBuffer(self._profile_index_right,
                                               scaling_factor=self._rb_scale_factor)

    def process_diagonal(self, diag, values):
        values = values[0]
        num_values = len(values)
        shift_diff = self.series_shift - self.query_shift

        if diag + shift_diff >= 0:
            # left MP
            if diag >= 0:
                self._update_matrix_profile(
                    values,
                    self._range[:num_values],
                    self._matrix_profile_left[diag:diag + num_values],
                    self._profile_index_left[diag:diag + num_values])
            else:
                self._update_matrix_profile(
                    values,
                    self._range[-diag:-diag + num_values],
                    self._matrix_profile_left[:num_values],
                    self._profile_index_left[:num_values])
        else:
            # right MP
            if diag >= 0:
                self._update_matrix_profile(
                    values,
                    self._range[:num_values],
                    self._matrix_profile_right[diag:diag + num_values],
                    self._profile_index_right[diag:diag + num_values])
            else:
                self._update_matrix_profile(
                    values,
                    self._range[-diag:-diag + num_values],
                    self._matrix_profile_right[:num_values],
                    self._profile_index_right[:num_values])

    def process_column(self, column_index, values):
        values = values[0]
        shift_diff = self.series_shift - self.query_shift
        border = max(0, column_index + 1 + shift_diff)

        if border > 0:
            min_value = np.min(values[:border])
            # In case of shifting, a lower value could already be present
            if min_value < self._matrix_profile_left[column_index]:
                self._matrix_profile_left[column_index] = min_value
                self._profile_index_left[column_index] = np.argmin(values[:border]) + self.query_shift

        if len(values) > border:
            min_value = np.min(values[border:])
            # In case of shifting, a lower value could already be present
            if min_value < self._matrix_profile_right[column_index]:
                self._matrix_profile_right[column_index] = min_value
                self._profile_index_right[column_index] = np.argmin(values[border:]) + border + self.query_shift

    def shift_query(self, amount):
        if amount == 0:
            return

        self.query_shift += amount
        self._range.push(np.arange(self._range[-1] + 1, self._range[-1] + 1 + amount))

    def shift_series(self, amount):
        if amount == 0:
            return

        self.series_shift += amount
        push_values = np.full(amount, np.inf)
        self._matrix_profile_left.push(push_values)
        self._matrix_profile_right.push(push_values)
        push_values[:] = -1
        self._profile_index_left.push(push_values)
        self._profile_index_right.push(push_values)

    @property
    def matrix_profile_right(self):
        return self._matrix_profile_right.view

    @property
    def matrix_profile_left(self):
        return self._matrix_profile_left.view

    @property
    def profile_index_right(self):
        return self._profile_index_right.view

    @property
    def profile_index_left(self):
        return self._profile_index_left.view
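
# The border computed in process_column marks where the unshifted main diagonal
# crosses the column: rows [0, border) lie on or above it and feed the left
# profile, rows [border, len(values)) lie below it and feed the right profile.
# Worked example: with series_shift = 3 and query_shift = 1, column 0 of the
# current window is global column 3, and values[i] sits at global row i + 1, so
# border = 0 + 1 + (3 - 1) = 3: the first three values (global rows 1..3) are on
# or above the diagonal and belong to the left profile.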