def condense_around_knots(
    sorted_x: np.ndarray, y: np.ndarray, w: np.ndarray,
    sorted_knots: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Condenses (x, y, w) into few points that preserve PWLFit MSE differences.

  An arbitrary number of weighted points is compressed into at most
  2 * (len(sorted_knots) - 1) points. For any two PWLCurves whose x-knots are
  taken from sorted_knots, the difference between their MSEs is the same on
  the condensed points as on the original points. O(len(sorted_x)).

  Args:
    sorted_x: (numpy array) independent variable in sorted order.
    y: (numpy array) dependent variable.
    w: (numpy array) weights of the data points. Each weight must be positive.
    sorted_knots: (numpy array) x-values of the candidate knots, sorted.

  Returns:
    A tuple (x', y', w') of equal-length sequences, each no longer than
    2 * (len(sorted_knots) - 1), on which the MSE difference of any two
    PWLCurves with x-knots from sorted_knots matches the difference measured
    on the original x, y, w.
  """
  lowest_knot = sorted_knots[0]
  highest_knot = sorted_knots[-1]

  if sorted_x[0] < lowest_knot or highest_knot < sorted_x[-1]:
    # Some data lies outside the knot range: pull it onto the boundary knots.
    # Clamping can make several points share an x, so fuse them afterwards.
    clamped_x = np.clip(sorted_x, lowest_knot, highest_knot)
    sorted_x, y, w = utils.fuse_sorted_points(clamped_x, y, w)

  # Position of each knot within the (possibly clamped) x-values.
  boundaries = sorted_x.searchsorted(sorted_knots)
  # Make the final boundary run to the end of the data so that points sitting
  # exactly on the last knot are included.
  boundaries[-1] = len(sorted_x)
  return _condense_between_indices(sorted_x, y, w, boundaries)
def sample_condense_points(
    sorted_x: np.ndarray, y: np.ndarray, w: np.ndarray, num_knots: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
  """Chooses knot candidates and linearly condenses the data around them.

  Points sharing an x-value are fused first. If the fused data has no more
  points than num_knots, the fused points are returned as-is and their
  x-values double as the knots.

  Args:
    sorted_x: (numpy array) independent variable in sorted order.
    y: (numpy array) dependent variable.
    w: (numpy array) the weights on data points.
    num_knots: (int) Number of knot-x candidates to return. Must be >= 2.

  Returns:
    A tuple of 4 sequences: x_knots, condensed_x, condensed_y, condensed_w.
  """
  utils.expect(num_knots >= 2, 'num_knots must be at least 2.')
  utils.expect(len(sorted_x) == len(y) == len(w))

  fused_x, fused_y, fused_w = utils.fuse_sorted_points(sorted_x, y, w)

  # Too few distinct x-values to need condensing: every x is its own knot.
  if len(fused_x) <= num_knots:
    return fused_x, fused_x, fused_y, fused_w

  candidate_knots = _pick_knot_candidates(fused_x, fused_w, num_knots)
  condensed_x, condensed_y, condensed_w = condense_around_knots(
      fused_x, fused_y, fused_w, candidate_knots)
  return candidate_knots, condensed_x, condensed_y, condensed_w
def test_fuse_sorted_points_on_example(self):
  """Checks fuse_sorted_points against a small hand-computed example."""
  xs = np.array([0., 0., 1., 1., 1., 2.])
  ys = np.array([1., 4., 2., 3., 4., 5.])
  ws = np.array([2., 1., 1., 1., 1., 2.])

  # Per distinct x: weighted mean of y, and summed weight.
  expected_x = np.array([0., 1., 2.])
  expected_y = np.array([2., 3., 5.])
  expected_w = np.array([3., 3., 2.])

  actual_x, actual_y, actual_w = utils.fuse_sorted_points(xs, ys, ws)

  comparisons = (
      (expected_x, actual_x),
      (expected_y, actual_y),
      (expected_w, actual_w),
  )
  for expected, actual in comparisons:
    self.assert_allclose(expected, actual)
def test_fuse_sorted_points_maintains_totals(self):
  """Fusing keeps the unique xs, the total weight and the weighted y total."""
  num_points = 1000
  np.random.seed(954346)
  # Integer-valued xs in [0, 10) guarantee many repeated x-values.
  xs = np.sort(np.random.randint(10, size=num_points).astype(float))
  ys = xs + np.random.normal(size=num_points)
  ws = np.random.uniform(size=num_points)

  total_weight = ws.sum()
  total_weighted_y = (ys * ws).sum()

  fused_x, fused_y, fused_w = utils.fuse_sorted_points(xs, ys, ws)

  np.testing.assert_equal(np.unique(xs), fused_x)
  self.assertAlmostEqual(total_weight, fused_w.sum())
  self.assertAlmostEqual(total_weighted_y, (fused_y * fused_w).sum())
def test_fuse_sorted_points_does_nothing_on_unique_xs(self):
  """Fusing data whose x-values are already distinct returns it unchanged."""
  num_points = 100
  np.random.seed(954345)
  xs = np.arange(num_points)
  ys = np.random.normal(size=num_points)
  ws = np.random.uniform(size=num_points)

  fused_x, fused_y, fused_w = utils.fuse_sorted_points(xs, ys, ws)

  for original, fused in ((xs, fused_x), (ys, fused_y), (ws, fused_w)):
    self.assert_allclose(original, fused)
def test_fuse_sorted_points_maintains_best_line(self):
  """The best-fit line is the same before and after fusing the points."""
  num_points = 1000
  np.random.seed(954346)
  # Integer-valued xs in [0, 10) guarantee many repeated x-values.
  xs = np.sort(np.random.randint(10, size=num_points).astype(float))
  ys = xs + np.random.normal(size=num_points)
  ws = np.random.uniform(size=num_points)

  raw_slope, raw_intercept = _best_fit_line(xs, ys, ws)

  fused_points = utils.fuse_sorted_points(xs, ys, ws)
  fused_x, fused_y, fused_w = fused_points
  fused_slope, fused_intercept = _best_fit_line(fused_x, fused_y, fused_w)

  self.assertAlmostEqual(raw_slope, fused_slope)
  self.assertAlmostEqual(raw_intercept, fused_intercept)
def test_sample_condense_points_invariant_to_fusion(self):
  """sample_condense_points gives equal output on raw and pre-fused data."""
  # condense_around_knots doesn't care whether its x-values are unique,
  # or whether points with the same x-value have been fused ahead of time.
  num_points = 777
  num_knots = 100
  np.random.seed(5)
  xs = np.sort(np.random.randint(0, 100, size=num_points).astype(float))
  ys = np.random.uniform(size=num_points)
  ws = np.random.uniform(size=num_points)

  fused_x, fused_y, fused_w = utils.fuse_sorted_points(xs, ys, ws)

  from_raw = linear_condense.sample_condense_points(xs, ys, ws, num_knots)
  from_fused = linear_condense.sample_condense_points(
      fused_x, fused_y, fused_w, num_knots)
  np.testing.assert_equal(from_raw, from_fused)
def test_linear_condense_invariant_to_fusing_points(self):
  """linear_condense gives matching output on raw and pre-fused data."""
  num_points = 1000
  np.random.seed(954348)
  # Integer-valued xs in [0, 10) guarantee many repeated x-values.
  xs = np.sort(np.random.randint(10, size=num_points).astype(float))
  ys = xs + np.random.normal(size=num_points)
  ws = np.random.uniform(size=num_points)

  raw_cx, raw_cy, raw_cw = linear_condense.linear_condense(xs, ys, ws)

  fused_x, fused_y, fused_w = utils.fuse_sorted_points(xs, ys, ws)
  fused_cx, fused_cy, fused_cw = linear_condense.linear_condense(
      fused_x, fused_y, fused_w)

  condensed_pairs = (
      (raw_cx, fused_cx),
      (raw_cy, fused_cy),
      (raw_cw, fused_cw),
  )
  for from_raw, from_fused in condensed_pairs:
    self.assert_allclose(from_raw, from_fused)
def sort_and_sample(
    x: Sequence[float],
    y: Sequence[float],
    w: Optional[Sequence[float]],
    downsample_to: float = 1e6
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Samples and sorts the data to fit a PWLCurve on.

  Samples each point with equal likelihood, once or zero times per point.
  Weight is not considered when sampling. For performance reasons, the precise
  number of final points is not guaranteed. Sampling is deterministic (fixed
  seed) and does not touch NumPy's global random state.

  Args:
    x: (Sequence of floats) The independent variable.
    y: (Sequence of floats) The dependent variable.
    w: (None or Sequence of floats) The weights of data points. None means
      every point gets weight 1. Weights are NOT used in downsampling.
    downsample_to: (int or float) The approximate number of samples to take.
      Downsampling only happens when there are more than 1.01 * downsample_to
      points.

  Raises:
    ValueError: invalid input.

  Returns:
    A triple (sorted_x, y, w) of float numpy arrays representing the
    independent variable in sorted order, the dependent variable, and the
    weights respectively. Points sharing an x-value are fused, so the returned
    x-values are unique.
  """
  # np.asarray reuses an existing ndarray without copying and converts plain
  # sequences. np.array(..., copy=False) is avoided because NumPy >= 2.0 raises
  # ValueError whenever a copy is unavoidable (e.g. for list inputs).
  x = np.asarray(x)
  y = np.asarray(y)
  if w is None:
    w = np.ones_like(x)
  else:
    w = np.asarray(w)
    utils.expect((w > 0).all(), 'Weights must be positive.')

  utils.expect(len(x) == len(y) == len(w) >= 1)
  utils.expect(np.isfinite(x).all(), 'x-values must all be finite.')
  utils.expect(np.isfinite(y).all(), 'y-values must all be finite.')
  utils.expect(np.isfinite(w).all(), 'w-values must all be finite.')

  # Downsample to a manageable number of points to limit runtime.
  if len(x) > downsample_to * 1.01:
    # A private, fixed-seed generator keeps the selection reproducible without
    # reseeding the process-wide np.random state as a side effect. It yields
    # the same draws as np.random.seed(125) followed by np.random.sample.
    rng = np.random.RandomState(125)
    # Select each xyw with probability (downsample_to / len(x)) to yield
    # approximately downsample_to selections.
    fraction_kept = float(downsample_to) / len(x)
    mask = rng.random_sample(size=len(x)) < fraction_kept
    x, y, w = x[mask], y[mask], w[mask]

  # Sort the points by x if any are out of order.
  if (x[1:] < x[:-1]).any():
    point_order = np.argsort(x)
    x, y, w = x[point_order], y[point_order], w[point_order]

  # Use float64 for precision.
  x = x.astype(float, copy=False)
  y = y.astype(float, copy=False)
  w = w.astype(float, copy=False)

  # Fuse points with the same x, so that all xs become unique.
  x, y, w = utils.fuse_sorted_points(x, y, w)
  return x, y, w