示例#1
0
    def test_find_best_transform_log1p_transform(self):
        """find_best_transform picks log1p when y relates to x through log1p.

        Two equivalent setups are checked: y = log1p(x) on the raw x, and
        x replaced by expm1(x) with y equal to the raw x.
        """
        np.random.seed(7)
        # Draw order matters for reproducibility: xs first, then the weights.
        xs = np.sort(np.random.uniform(-1, 1, size=1000))
        weights = np.random.uniform(size=1000)

        # Case 1: y is log1p of x.
        best = transform.find_best_transform(xs, np.log1p(xs), weights)
        self.assertEqual(log1p_transform, best)

        # Case 2: x is expm1 of y, i.e. the same relationship inverted.
        best = transform.find_best_transform(np.expm1(xs), xs, weights)
        self.assertEqual(log1p_transform, best)
示例#2
0
    def test_find_best_transform_does_not_mutate_inputs(self):
        """find_best_transform leaves its x, y and w arguments unchanged."""
        np.random.seed(5)
        num_points = 123
        x = np.sort(np.random.normal(size=num_points))
        y = np.random.normal(size=num_points)
        w = np.random.uniform(size=num_points)

        # Snapshot every input before the call so we can compare afterwards.
        inputs = (x, y, w)
        snapshots = [arr.copy() for arr in inputs]

        transform.find_best_transform(x, y, w)

        # Compare in x, y, w order against the pre-call snapshots.
        for arr, snapshot in zip(inputs, snapshots):
            np.testing.assert_array_equal(arr, snapshot)
示例#3
0
 def test_find_best_transform_is_identity_for_constant_ys(self):
     """With constant y and no clipping, the identity transform is chosen."""
     np.random.seed(5)
     xs = np.arange(10)
     constant_ys = np.ones(10)
     weights = np.random.uniform(size=len(xs))

     # pct_to_clip=0 disables clipping, so every point participates.
     best = transform.find_best_transform(xs,
                                          constant_ys,
                                          weights,
                                          pct_to_clip=0)
     self.assertEqual(identity_transform, best)
示例#4
0
    def test_find_best_transform_clips_by_weight(self):
        """Clipping is weight-based: a heavy tail survives, a light one does not.

        The data is linear (y == x) except for the last five x values, which
        are replaced by 2**x so that y is logarithmic in x on that tail.
        """
        np.random.seed(5)
        tail = slice(-5, None)
        x = np.linspace(1, 10, num=1000)
        y = x.copy()  # Copy before the tail of x is overwritten below.
        x[tail] = 2**x[tail]

        # Uniform weights: the log-tail is clipped away, leaving linear data.
        uniform_w = np.ones_like(x)
        best = transform.find_best_transform(x, y, uniform_w, 0.005)
        self.assertEqual(identity_transform, best)

        # Heavily weighted tail: it dominates the fit even after clipping.
        tail_heavy_w = np.ones_like(x)
        tail_heavy_w[tail] = 10000
        best = transform.find_best_transform(x, y, tail_heavy_w, 0.005)
        self.assertEqual(log_transform, best)
示例#5
0
    def test_find_best_transform_symmetriclog1p_transform(self):
        """find_best_transform picks symmetriclog1p when y = symmetriclog1p(x).

        The x values span both negative and positive numbers, which is the
        range symmetriclog1p is meant to cover beyond plain log.
        """
        np.random.seed(8)
        # Draw order matters for reproducibility: xs first, then the weights.
        xs = np.sort(np.random.uniform(low=-4, high=5, size=1000))
        weights = np.random.uniform(size=1000)
        ys = transform.symmetriclog1p(xs)

        self.assertEqual(symmetriclog1p_transform,
                         transform.find_best_transform(xs, ys, weights))
示例#6
0
    def test_find_best_transform_symlog1p_transform(self):
        """find_best_transform picks symlog1p when y = symlog1p(x).

        The x values span both negative and positive numbers, which is the
        range symlog1p is meant to cover beyond plain log.
        """
        np.random.seed(8)
        # Draw order matters for reproducibility: xs first, then the weights.
        xs = np.sort(np.random.uniform(low=-4, high=5, size=1000))
        weights = np.random.uniform(size=1000)
        ys = transform.symlog1p(xs)

        self.assertEqual(symlog1p_transform,
                         transform.find_best_transform(xs, ys, weights))
示例#7
0
 def test_find_best_transform_is_identity_for_ys_constant_after_clipping(
         self):
     """Identity is chosen when y is constant apart from its two end values.

     y is 2 everywhere except the first (1) and last (3) entries; the call
     uses pct_to_clip=.01 with equal weights on every point.
     """
     np.random.seed(5)
     ys = np.array([1.] + [2] * 100 + [3])
     xs = np.arange(len(ys))
     weights = np.ones_like(xs)

     best = transform.find_best_transform(xs, ys, weights, pct_to_clip=.01)
     self.assertEqual(identity_transform, best)
示例#8
0
    def test_find_best_transform_identity(self):
        """find_best_transform picks identity whenever y is linear in x."""

        def expect_identity(xs, ys, weights):
            # Shared check: the best transform for (xs, ys, weights) is identity.
            self.assertEqual(identity_transform,
                             transform.find_best_transform(xs, ys, weights))

        np.random.seed(5)
        x = np.sort(np.random.uniform(high=10, size=1000))
        w = np.random.uniform(size=len(x))

        # The trivial case: y is exactly x.
        expect_identity(x, x, w)

        # Linear rescalings and shifts of x, y or w keep identity the best fit.
        expect_identity(x * 97 + 5, x - 60, w)
        expect_identity(x / 1e5, -x / 52, w * 99)

        # Applying the same nonlinear map to both x and y preserves the linear
        # relationship between them.
        expect_identity(np.exp(x), np.exp(x), w)
        expect_identity(np.log(x), np.log(x), w)
        expect_identity(x**3, x**3, w)
示例#9
0
def fit_pwl(x: Sequence[float],
            y: Sequence[float],
            w: Optional[Sequence[float]] = None,
            num_segments: int = 3,
            num_samples: int = 100,
            mono: Union[MonoType, bool] = MonoType.mono,
            min_slope: Optional[float] = None,
            max_slope: Optional[float] = None,
            fx: Optional[Callable[[np.ndarray], np.ndarray]] = None,
            learn_ends: bool = True) -> pwlcurve.PWLCurve:
    """Fits a PWLCurve mapping x to y that minimizes weighted squared error.

    The data is passed through sort_and_sample, x is mapped through the
    transform fx, and a piecewise-linear curve is fit in that transformed
    space over a limited set of candidate knots. The chosen knots are then
    mapped back to the original x space and returned as a PWLCurve that
    carries fx.

    Time complexity is roughly
    O(len(x) + q*log(q) + (num_samples^2)(num_segments^3)), where q is about
    min(10**6, len(x)). The len(x) term is the cost of downsampling to q
    points, the q*log(q) term is the sort after downsampling, and the last
    term is fit_pwl_points, which greedily searches knot combinations and
    solves a constrained linear least squares problem for each.

    Args:
        x: (Sequence of floats) independent variable.
        y: (Sequence of floats) dependent variable.
        w: (None or Sequence of floats) weights on the data points.
        num_segments: (positive int) Number of linear segments. More segments
            improve fit quality at the cost of a more complex curve.
        num_samples: (positive int) Number of candidate knot locations to
            consider; must exceed num_segments. More samples improve fit
            quality but slow fitting: about 1-2 seconds at 100 samples, under
            a minute at 1000, and on the order of an hour at 10,000.
        mono: (MonoType enum) Shape restriction applied while fitting;
            monotonic by default. See MonoType for all options.
        min_slope: (None or float) Minimum slope between adjacent knots. Use 0
            for a monotone increasing solution.
        max_slope: (None or float) Maximum slope between adjacent knots. Use 0
            for a monotone decreasing solution.
        fx: (None or a strictly increasing 1D function) Transform applied to x
            before fitting. When None, one is chosen heuristically via
            transform.find_best_transform. Pass transform.identity to fit
            without a transform.
        learn_ends: (boolean) Whether the x-values of the curve's endpoints
            are learned. Learning them allows better fits with the same number
            of segments. When False, the first and last candidate knots are
            forced into the solution, which constrains the solution space.

    Returns:
        The fit curve.
    """
    utils.expect(num_segments > 0, 'Cannot fit %d segment PWL' % num_segments)
    utils.expect(num_samples > num_segments,
                 'num_samples must be at least num_segments + 1')

    data_x, data_y, data_w = sort_and_sample(x, y, w)
    if fx is None:
        fx = transform.find_best_transform(data_x, data_y, data_w)

    # Only the two extreme transformed values are checked for finiteness.
    transformed_x = fx(data_x)
    ends_are_finite = np.isfinite(transformed_x[[0, -1]]).all()
    utils.expect(ends_are_finite, 'Transform must be defined on x.')

    # Choose candidate knots from the transformed xs and condense the data
    # (x, y, w) around them.
    candidate_knots, cond_x, cond_y, cond_w = (
        linear_condense.sample_condense_points(transformed_x, data_y, data_w,
                                               num_samples))

    if mono == MonoType.mono:
        min_slope, max_slope = _get_mono_slope_bounds(cond_y, cond_w,
                                                      min_slope, max_slope)

    peak, concave_down = _bitonic_peak_and_direction(cond_x, cond_y, cond_w,
                                                     mono)

    # When endpoints are not learned, pin the first and last candidate knots.
    if learn_ends:
        required_knots = None
    else:
        required_knots = candidate_knots[[0, -1]]

    # Fit the piecewise-linear curve in transformed space.
    knot_xs, knot_ys = fit_pwl_points(candidate_knots, cond_x, cond_y, cond_w,
                                      num_segments, min_slope, max_slope,
                                      peak, concave_down, required_knots)

    # Map each fitted knot back to pre-transform space: locate it in the
    # transformed xs and take the original x at that index.
    knot_xs = data_x[transformed_x.searchsorted(knot_xs)]

    if np.all(knot_ys == knot_ys[0]):
        # Constant curve: represent it with two points at the same height,
        # one unit apart in x.
        first_x, level = knot_xs[0], knot_ys[0]
        control_points = [(first_x - 1, level), (first_x, level)]
    else:
        control_points = list(zip(knot_xs, knot_ys))
    return pwlcurve.PWLCurve(control_points, fx)