def test_find_best_transform_log1p_transform(self):
    """Checks that log1p-shaped data yields `log1p_transform`.

    Two equivalent formulations are tried: y = log1p(x), and x = expm1(y).
    """
    np.random.seed(7)
    samples = np.sort(np.random.uniform(-1, 1, size=1000))
    weights = np.random.uniform(size=1000)

    cases = (
        (samples, np.log1p(samples)),   # y is log1p of x.
        (np.expm1(samples), samples),   # x is expm1 of y.
    )
    for inputs, targets in cases:
        best = transform.find_best_transform(inputs, targets, weights)
        self.assertEqual(log1p_transform, best)
def test_find_best_transform_does_not_mutate_inputs(self):
    """Checks that find_best_transform leaves its x, y, w arrays unchanged."""
    np.random.seed(5)
    xs = np.sort(np.random.normal(size=123))
    ys = np.random.normal(size=123)
    ws = np.random.uniform(size=123)

    # Snapshot the inputs before the call so they can be compared afterwards.
    originals = (xs, ys, ws)
    snapshots = tuple(arr.copy() for arr in originals)

    transform.find_best_transform(*originals)

    for arr, snapshot in zip(originals, snapshots):
        np.testing.assert_array_equal(arr, snapshot)
def test_find_best_transform_is_identity_for_constant_ys(self):
    """Checks that constant y values (no clipping) yield `identity_transform`."""
    np.random.seed(5)
    xs = np.arange(10)
    constant_ys = np.ones(10)
    weights = np.random.uniform(size=len(xs))

    best = transform.find_best_transform(
        xs, constant_ys, weights, pct_to_clip=0)

    self.assertEqual(identity_transform, best)
def test_find_best_transform_clips_by_weight(self):
    """Checks that clipping is driven by weight, not by point count.

    The data is linear except for the last five points, whose x values are
    replaced by 2**x so that the tail is logarithmic.
    """
    np.random.seed(5)
    tail = slice(-5, None)
    clip_fraction = .005

    xs = np.linspace(1, 10, num=1000)
    ys = xs.copy()
    xs[tail] = 2**xs[tail]

    # With uniform weights the log-tail is clipped away, leaving linear data.
    uniform_w = np.ones_like(xs)
    best = transform.find_best_transform(xs, ys, uniform_w, clip_fraction)
    self.assertEqual(identity_transform, best)

    # With a very heavy tail, the tail dominates even after clipping.
    heavy_tail_w = np.ones_like(xs)
    heavy_tail_w[tail] = 10000
    best = transform.find_best_transform(xs, ys, heavy_tail_w, clip_fraction)
    self.assertEqual(log_transform, best)
def test_find_best_transform_symmetriclog1p_transform(self):
    """Checks that symmetriclog1p-shaped data yields `symmetriclog1p_transform`.

    The x range spans negative and positive values, since symmetriclog1p
    extends the log distribution to include negative values.
    """
    np.random.seed(8)
    samples = np.sort(np.random.uniform(low=-4, high=5, size=1000))
    weights = np.random.uniform(size=1000)
    targets = transform.symmetriclog1p(samples)

    best = transform.find_best_transform(samples, targets, weights)

    self.assertEqual(symmetriclog1p_transform, best)
def test_find_best_transform_symlog1p_transform(self):
    """Checks that symlog1p-shaped data yields `symlog1p_transform`.

    The x range spans negative and positive values, since symlog1p extends
    the log distribution to allow negative inputs.
    """
    np.random.seed(8)
    samples = np.sort(np.random.uniform(low=-4, high=5, size=1000))
    weights = np.random.uniform(size=1000)
    targets = transform.symlog1p(samples)

    best = transform.find_best_transform(samples, targets, weights)

    self.assertEqual(symlog1p_transform, best)
def test_find_best_transform_is_identity_for_ys_constant_after_clipping(
        self):
    """Checks `identity_transform` is chosen when only the extreme ys differ.

    The y values are 2 everywhere except the first (1) and last (3) points;
    the call uses pct_to_clip=.01 with unit weights.
    """
    np.random.seed(5)
    ys = np.array([1.] + 100 * [2] + [3])
    xs = np.arange(ys.size)
    weights = np.ones_like(xs)

    best = transform.find_best_transform(xs, ys, weights, pct_to_clip=.01)

    self.assertEqual(identity_transform, best)
def test_find_best_transform_identity(self):
    """Checks that linearly related x and y yield `identity_transform`."""
    np.random.seed(5)
    base = np.sort(np.random.uniform(high=10, size=1000))
    weights = np.random.uniform(size=len(base))

    def expect_identity(inputs, targets, wts):
        # Shared assertion: the best transform for this data is the identity.
        best = transform.find_best_transform(inputs, targets, wts)
        self.assertEqual(identity_transform, best)

    expect_identity(base, base, weights)

    # Linear transforms are still best fit with the identity transform.
    expect_identity(base * 97 + 5, base - 60, weights)
    expect_identity(base / 1e5, -base / 52, weights * 99)

    # Other transforms maintain the relationship so long as they're applied
    # to both x and y.
    expect_identity(np.exp(base), np.exp(base), weights)
    expect_identity(np.log(base), np.log(base), weights)
    expect_identity(base**3, base**3, weights)
def fit_pwl(x: Sequence[float],
            y: Sequence[float],
            w: Optional[Sequence[float]] = None,
            num_segments: int = 3,
            num_samples: int = 100,
            mono: Union[MonoType, bool] = MonoType.mono,
            min_slope: Optional[float] = None,
            max_slope: Optional[float] = None,
            fx: Optional[Callable[[np.ndarray], np.ndarray]] = None,
            learn_ends: bool = True) -> pwlcurve.PWLCurve:
    """Fits a piecewise-linear curve from x to y by weighted least squares.

    The fit looks for the PWLCurve that is as close to y as possible in the
    weighted mean-squared-error sense.

    Time complexity is ~O(len(x) + qlog(q) + (num_samples^2)(num_segments^3)),
    with q ~ min(10**6, len(x)):
      * len(x): downsampling the data to q points.
      * qlog(q): sorting after downsampling.
      * the remaining term: fit_pwl_points, which greedily searches for the
        best combination of knots and solves a constrained linear least
        squares expression for each.

    Args:
      x: (Sequence of floats) independent variable.
      y: (Sequence of floats) dependent variable.
      w: (None or Sequence of floats) the weights on data points.
      num_segments: (positive int) Number of linear segments. More segments
        increases quality at the cost of complexity.
      num_samples: (positive int) Number of potential knot locations to try
        for the PWL curve. More samples improves fit quality, but slows
        fitting. At 100 samples, fit_pwl runs in 1-2 seconds. At 1000
        samples, it runs in under a minute. At 10,000 samples, expect an
        hour.
      mono: (MonoType enum) Restrictions to apply in curve fitting, with
        monotonicity as the default. See MonoType for all options.
      min_slope: (None or float) Minimum slope between each adjacent pair of
        knots. Set to 0 for a monotone increasing solution.
      max_slope: (None or float) Maximum slope between each adjacent pair of
        knots. Set to 0 for a monotone decreasing solution.
      fx: (None or a strictly increasing 1D function) User-specified
        transform on x, to apply before piecewise-linear curve fitting. If
        None, fit_pwl chooses a transform using a heuristic. To specify
        fitting with no transform, pass in transform.identity.
      learn_ends: (boolean) Whether to learn x-values for the curve's
        endpoints. Learning endpoints allows for better-fitting curves with
        the same number of segments. If False, fit_pwl forces the curve to
        use min(x) and max(x) as knots, which constrains the solution space.

    Returns:
      The fit curve.
    """
    utils.expect(num_segments > 0, 'Cannot fit %d segment PWL' % num_segments)
    utils.expect(num_samples > num_segments,
                 'num_samples must be at least num_segments + 1')

    sorted_x, sorted_y, sorted_w = sort_and_sample(x, y, w)
    if fx is None:
        # No transform supplied: pick one heuristically from the data.
        fx = transform.find_best_transform(sorted_x, sorted_y, sorted_w)

    transformed_x = fx(sorted_x)
    # Only the two end values are checked for finiteness.
    ends_are_finite = np.isfinite(transformed_x[[0, -1]]).all()
    utils.expect(ends_are_finite, 'Transform must be defined on x.')

    # Choose candidate knots from the transformed xs and condense the data
    # (x, y, w) around those candidates.
    knot_candidates, cond_x, cond_y, cond_w = (
        linear_condense.sample_condense_points(
            transformed_x, sorted_y, sorted_w, num_samples))

    # NOTE(review): `mono` is annotated as possibly a bool; whether True
    # matches MonoType.mono here depends on how MonoType is defined -- confirm.
    if mono == MonoType.mono:
        min_slope, max_slope = _get_mono_slope_bounds(
            cond_y, cond_w, min_slope, max_slope)

    bitonic_peak, bitonic_concave_down = _bitonic_peak_and_direction(
        cond_x, cond_y, cond_w, mono)

    # When the ends are not learned, the first and last candidate knots are
    # forced into the solution.
    if learn_ends:
        required_knots = None
    else:
        required_knots = knot_candidates[[0, -1]]

    # Fit the piecewise-linear curve in the transformed space.
    knot_xs, knot_ys = fit_pwl_points(
        knot_candidates, cond_x, cond_y, cond_w, num_segments, min_slope,
        max_slope, bitonic_peak, bitonic_concave_down, required_knots)

    # Map the knot xs back to the corresponding pre-transform x values.
    knot_xs = sorted_x[transformed_x.searchsorted(knot_xs)]

    if np.all(knot_ys == knot_ys[0]):
        # The curve is constant: represent it with two points at the same y.
        constant_y = knot_ys[0]
        curve_points = [(knot_xs[0] - 1, constant_y),
                        (knot_xs[0], constant_y)]
    else:
        curve_points = list(zip(knot_xs, knot_ys))

    return pwlcurve.PWLCurve(curve_points, fx)