Example #1
def _compute_tensor_to_feature_dict(input_fn, feature_columns, dtype):
    """Computes a feature_name-to-tensor dict for the given features.

  Args:
    input_fn: See the same argument in 'save_quantiles_for_keypoints'.
    feature_columns: See the same argument in 'save_quantiles_for_keypoints'.
    dtype: See the same argument in 'save_quantiles_for_keypoints'.

  Returns:
    A str->tensor dict mapping each feature name to the tensor containing its
    feature values for the current batch. The dict contains all the features
    returned by input_fn if feature_columns are none, or only those features
    included in 'feature_columns', otherwise. If a non-None label is returned by
    'input_fn', it will also be included in the dict.
  """
    if feature_columns is not None:
        transformed_columns_to_tensors, label = input_fn()
        features_to_tensors = {
            f_col.name:
            tools.input_from_feature_column(transformed_columns_to_tensors,
                                            f_col, dtype)
            for f_col in feature_columns
        }
    else:
        features_to_tensors, label = input_fn()
    if label is None:
        return features_to_tensors
    if _LABEL_FEATURE_NAME in features_to_tensors:
        raise ValueError(
            ("Can't save a label as there's already a feature named: '%s'."
             " Try renaming that feature. ") % _LABEL_FEATURE_NAME)
    features_to_tensors[_LABEL_FEATURE_NAME] = label
    return features_to_tensors
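
A minimal sketch of how this helper might be driven (the toy input_fn, its feature values, and the label are made up; _LABEL_FEATURE_NAME is the module-level constant referenced above):

import tensorflow as tf

def _toy_input_fn():
    # An Estimator-style input_fn: returns (features, label).
    features = {'age': tf.constant([21.0, 35.0, 40.0])}
    label = tf.constant([0.0, 1.0, 1.0])
    return features, label

# With feature_columns=None the result contains 'age' plus the label stored
# under _LABEL_FEATURE_NAME (assuming no feature already uses that name).
tensors = _compute_tensor_to_feature_dict(
    _toy_input_fn, feature_columns=None, dtype=tf.float32)
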
Example #2
  def _materialize_feature_column(self, feature_column, x):
    """Creates an input_fn from x, then transforms and materializes feature_column."""
    input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
        x={'x': x},
        y=None,
        batch_size=self._test_data.num_examples,
        num_epochs=1,
        shuffle=False)
    with tf.Graph().as_default():
      features = input_fn()
      input_tensor = tools.input_from_feature_column(features, feature_column)
      materialized = self._materialize_locally(input_tensor)
    return materialized
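
A sketch of a test that could exercise this helper; the surrounding tf.test.TestCase scaffolding (self._test_data, self._materialize_locally) is assumed rather than shown, and numeric_column is chosen because it passes values through unchanged:

import numpy as np
import tensorflow as tf

# Inside the same tf.test.TestCase subclass as the helper above:
def test_numeric_column_identity(self):
  x = np.array([1.0, 2.0, 3.0], dtype=np.float32)
  f_col = tf.feature_column.numeric_column('x')
  materialized = self._materialize_feature_column(f_col, x)
  # numeric_column is an identity transform, so the materialized batch
  # should match the input values.
  self.assertAllClose(x, np.asarray(materialized).flatten())
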
Example #3
def input_calibration_layer(columns_to_tensors,
                            num_keypoints,
                            feature_columns=None,
                            keypoints_initializers=None,
                            keypoints_initializer_fns=None,
                            bound=False,
                            monotonic=None,
                            missing_input_values=None,
                            missing_output_values=None,
                            dtype=dtypes.float32,
                            **regularizer_amounts):
    """Creates a calibration layer for the given input and feature_columns.

  Returns a tensor with the calibrated values of the given features, a list
  of the names of the features in the order they feature in the returned, and
  a list of projection ops, that must be applied at each step (or every so many
  steps) to project the model to a feasible space: used for bounding the outputs
  or for imposing monotonic -- the list will be empty if bound and
  monotonic are not set.

  Args:
    columns_to_tensors: A mapping from feature name to tensors. 'string' key
      means a base feature (not-transformed). If feature_columns is not set
      these are the features calibrated. Otherwise the transformed
      feature_columns are the ones calibrated.
    num_keypoints: Number of keypoints to use. Either a single int, or a dict
      mapping feature names to num_keypoints. If a value of the dict is 0 or
      None the correspondent feature won't be calibrated.
    feature_columns: Optional. If set to a set of FeatureColumns, these will
      be the features used and calibrated.
    keypoints_initializers: For evaluation or inference (or when resuming
      training from a checkpoint) the values will be loaded from disk, so they
      don't need to be given (leave it as None).
      Either a tuple of two tensors of shape [num_keypoints], or a dict mapping
      feature names to pair of tensors of shape [num_keypoints[feature_name]].
      See load_keypoints_from_quantiles or uniform_keypoints_for_signal on how
      to generate these (module keypoints_initialization).
    keypoints_initializer_fns: Like keypoints_initializers but using lambda
      initializers. They should be compatible with tf.get_variable. If this is
      set, then keypoints_initializers must be None.
    bound: boolean whether output of calibration must be bound. Alternatively
      a dict mapping feature name to boundness.
    monotonic: whether calibration has to be kept monotonic: None or 0 means
      no monotonic. Positive or negative values mean increasing or decreasing
      monotonic respectively. Alternatively a dict mapping feature name
      to monotonic.
    missing_input_values: If set, and if the input has this value it is assumed
      to be missing and the output will either be calibrated to some value
      between `[calibration_output_min, calibration_output_max]` or set to a
      fixed value set by missing_output_value. Limitation: it only works for
      scalars. Either one value for all inputs, or a dict mapping feature name
      to missing_input_value for the respective feature.
    missing_output_values: Requires missing_input_value also to be set. If set
      if will convert missing input to this value. Either one value for all
      inputs, or a dict mapping feature name to missing_input_value for the
      respective feature.
    dtype: If any of the scalars are not given as tensors, they are converted
      to tensors with this dtype.
    **regularizer_amounts: Keyword args of regularization amounts passed to
      regularizers.calibrator_regularization(). Keyword names should be among
      supported regularizers.CALIBRATOR_REGULARIZERS and values should be
      either float or {feature_name: float}. If float, then same value is
      applied to all features.

  Returns:
    A tuple of:
    * calibrated tensor of shape [batch_size, sum(features dimensions)].
    * list of the feature names in the order they feature in the calibrated
      tensor. A name may appear more than once if the feature is
      multi-dimension (for instance a multi-dimension embedding)
    * list of projection ops, that must be applied at each step (or every so
      many steps) to project the model to a feasible space: used for bounding
      the outputs or for imposing monotonicity. Empty if none are requested.
    * None or tensor with regularization loss.

  Raises:
    ValueError: if dtypes are incompatible.


  """
    with ops.name_scope('input_calibration_layer'):
        feature_names = tools.get_sorted_feature_names(columns_to_tensors,
                                                       feature_columns)
        num_keypoints = tools.cast_to_dict(num_keypoints, feature_names,
                                           'num_keypoints')
        bound = tools.cast_to_dict(bound, feature_names, 'bound')
        monotonic = tools.cast_to_dict(monotonic, feature_names, 'monotonic')
        keypoints_initializers = tools.cast_to_dict(keypoints_initializers,
                                                    feature_names,
                                                    'keypoints_initializers')
        keypoints_initializer_fns = tools.cast_to_dict(
            keypoints_initializer_fns, feature_names,
            'keypoints_initializer_fns')
        missing_input_values = tools.cast_to_dict(missing_input_values,
                                                  feature_names,
                                                  'missing_input_values')
        missing_output_values = tools.cast_to_dict(missing_output_values,
                                                   feature_names,
                                                   'missing_output_values')
        regularizer_amounts = {
            regularizer_name:
            tools.cast_to_dict(regularizer_amounts[regularizer_name],
                               feature_names, regularizer_name)
            for regularizer_name in regularizer_amounts
        }

        per_dimension_feature_names = []

        # Get uncalibrated tensors, either from columns_to_tensors, or using
        # feature_columns.
        if feature_columns is None:
            uncalibrated_features = [
                columns_to_tensors[name] for name in feature_names
            ]
        else:
            transformed_columns_to_tensors = columns_to_tensors.copy()
            dict_feature_columns = {
                f_col.name: f_col
                for f_col in feature_columns
            }
            uncalibrated_features = [
                tools.input_from_feature_column(transformed_columns_to_tensors,
                                                dict_feature_columns[name],
                                                dtype)
                for name in feature_names
            ]

        projection_ops = []
        calibrated_splits = []
        total_regularization = None
        for feature_idx in range(len(feature_names)):
            name = feature_names[feature_idx]
            uncalibrated_feature = uncalibrated_features[feature_idx]
            if uncalibrated_feature.shape.ndims == 1:
                feature_dim = 1
                uncalibrated_splits = [uncalibrated_feature]
            elif uncalibrated_feature.shape.ndims == 2:
                feature_dim = uncalibrated_feature.shape.dims[1].value
                uncalibrated_splits = array_ops.unstack(uncalibrated_feature,
                                                        axis=1)
            else:
                raise ValueError(
                    'feature {}: it has rank {}, but only ranks 1 or 2 are '
                    'supported; feature shape={}'.format(
                        name, uncalibrated_feature.shape.ndims,
                        uncalibrated_feature.shape))
            missing_input_value = missing_input_values[name]
            missing_output_value = missing_output_values[name]
            feature_regularizer_amounts = {
                regularizer_name: regularizer_amounts[regularizer_name][name]
                for regularizer_name in regularizer_amounts
            }

            # FutureWork: make the interpolation ops handle multi-dimensional
            #   values, so this step is not needed.
            for dim_idx in range(feature_dim):
                per_dimension_feature_names += [name]
                split_name = name
                if feature_dim > 1:
                    split_name = '{}_dim_{}'.format(name, dim_idx)
                uncalibrated = uncalibrated_splits[dim_idx]
                if not num_keypoints[name]:
                    # No calibration for this feature:
                    calibrated_splits += [uncalibrated]
                    if (missing_input_value is not None
                            or missing_output_value is not None):
                        raise ValueError(
                            'feature %s: cannot handle missing values if feature is not '
                            'calibrated, missing_input_value=%s, missing_output_value=%s'
                            %
                            (name, missing_input_value, missing_output_value))
                else:
                    calibrated, projection, reg = one_dimensional_calibration_layer(
                        uncalibrated,
                        num_keypoints[name],
                        signal_name=split_name,
                        keypoints_initializers=keypoints_initializers[name],
                        keypoints_initializer_fns=keypoints_initializer_fns[
                            name],
                        bound=bound[name],
                        monotonic=monotonic[name],
                        missing_input_value=missing_input_value,
                        missing_output_value=missing_output_value,
                        **feature_regularizer_amounts)
                    calibrated_splits += [calibrated]
                    if projection is not None:
                        projection_ops += [projection]
                    total_regularization = tools.add_if_not_none(
                        total_regularization, reg)

        all_calibrated = array_ops.stack(calibrated_splits,
                                         axis=1,
                                         name='stack_calibrated')
        return (all_calibrated, per_dimension_feature_names, projection_ops,
                total_regularization)
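
A minimal sketch of calling input_calibration_layer on a single raw feature in graph mode; the feature name and keypoint values are invented for illustration (see module keypoints_initialization for real initializers):

import tensorflow as tf

with tf.Graph().as_default():
    columns_to_tensors = {
        'distance': tf.compat.v1.placeholder(tf.float32, shape=[None])
    }
    # Input/output keypoints for a 4-keypoint piecewise-linear calibration.
    kp_inputs = tf.constant([0.0, 1.0, 5.0, 10.0])
    kp_outputs = tf.constant([0.0, 0.2, 0.8, 1.0])

    calibrated, names, projections, reg = input_calibration_layer(
        columns_to_tensors,
        num_keypoints=4,
        keypoints_initializers={'distance': (kp_inputs, kp_outputs)},
        bound=True,
        monotonic=+1)
    # 'calibrated' has shape [batch_size, 1]; 'projections' should be run
    # after each optimizer step to keep the calibration bounded and monotonic.
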
Example #4
    def _base_model_fn(self, features, labels, mode, config):  # pylint: disable=unused-argument
        """Creates the prediction, loss, and train ops.

    Args:
      features: A dictionary of tensors keyed by the feature name.
      labels: A tensor representing the label.
      mode: The execution mode, as defined in tf.estimator.ModeKeys.
      config: Optional configuration object. Will receive what is passed to
        Estimator in `config` parameter, or the default `config`. Allows
        updating things in your model_fn based on configuration such as
        `num_ps_replicas`.

    Returns:
      ModelFnOps, with the predictions, loss, and train_op.

    Raises:
      ValueError: if incompatible parameters are given.
    """
        with tf.compat.v1.variable_scope(self._name):
            if self._feature_columns is None:
                columns_to_tensors = features.copy()
            else:
                with tf.compat.v1.variable_scope(
                        "feature_column_transformation"):
                    columns_to_tensors = {
                        feature_column.name: tools.input_from_feature_column(
                            features.copy(), feature_column, self._dtype)
                        for feature_column in self._feature_columns
                    }
            (prediction, projection_ops,
             regularization) = self.prediction_builder(columns_to_tensors,
                                                       mode, self._hparams,
                                                       self._dtype)

            def _train_op_fn(loss):
                """Returns train_op tensor if TRAIN mode, or None."""
                train_op = None
                if mode == tf.estimator.ModeKeys.TRAIN:
                    if regularization is not None:
                        loss += regularization
                        tf.compat.v1.summary.scalar("loss_with_regularization",
                                                    loss)
                    optimizer = self._optimizer
                    if optimizer is None:
                        optimizer = tf.compat.v1.train.AdamOptimizer
                    if callable(optimizer):
                        optimizer = optimizer()
                    train_op = optimizer.minimize(
                        loss,
                        global_step=tf.compat.v1.train.get_global_step(),
                        name=_TRAIN_OP_NAME)
                    self._projection_hook.set_projection_ops(projection_ops)
                return train_op

            # Use head to generate model_fn outputs.
            estimator_spec = self._head.create_estimator_spec(
                features=features,
                labels=labels,
                mode=mode,
                train_op_fn=_train_op_fn,
                logits=prediction)

            # Update chief worker's training session run hooks to include
            # projection_hook. This means that in a distributed setting, only the
            # chief worker will run the projection op after its own update and without
            # synchronization with other workers. Thus, the parameters may temporarily
            # leave the feasible space.
            if mode == tf.estimator.ModeKeys.TRAIN:
                updated_training_chief_hooks = (
                    estimator_spec.training_chief_hooks +
                    (self._projection_hook, ))
                estimator_spec = estimator_spec._replace(
                    training_chief_hooks=updated_training_chief_hooks)

            return estimator_spec
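
The projection hook above is only attached to the chief worker. As an illustrative sketch (not the library's actual implementation), such a hook can be written as a SessionRunHook that runs the projection ops after every training step:

import tensorflow as tf

class _ProjectionHook(tf.estimator.SessionRunHook):
    """Runs projection ops after each training step (illustrative sketch)."""

    def __init__(self):
        self._projection_ops = []

    def set_projection_ops(self, projection_ops):
        # Called from _train_op_fn once the projection ops are known.
        self._projection_ops = projection_ops or []

    def after_run(self, run_context, run_values):
        # Pull parameters back into the feasible space (bounds/monotonicity).
        for op in self._projection_ops:
            run_context.session.run(op)
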
Example #5
def save_quantiles_for_keypoints(input_fn,
                                 save_dir,
                                 feature_columns=None,
                                 num_steps=1,
                                 override=True,
                                 num_quantiles=1000,
                                 dtype=dtypes.float32):
    """Calculates and saves quantiles for given features.

  These values can later be retrieved and used by keypoints_from_quantiles()
  below.

  Repeated values are discarded before the quantiles are calculated. That means
  that the quantiles of a very skewed distribution (for instance where 99%
  of the values are 0), will be different. But for the purpose of calibration
  this approach is more useful.

  Nothing is returned, the values are simply saved in the given location.

  This function can be called as a preprocessing step before actual training
  starts. Typically one will run this in a separate process locally, before
  starting training for instance.

  Args:
    input_fn: Similar to input_fn provided to Estimators. Typically one
      doesn't need to go over the full data to get good quantiles. Typically
      some 100 random examples per quantile is good enough for the purpose of
      calibration. If you don't have too much data, just use everything.
      If input_fn returns a target (used in training) it is ignored.
    save_dir: Where to save these quantiles. Since when optimizing
      hyper-parameters we train various models, we can share the quantiles
      information generated here. So this should be a directory that can be
      accessed by all training sessions. A subdirectory called "quantiles" will
      be created, and inside one file per feature is created: named after the
      feature name, and with the quantiles stored in JSON format.
    feature_columns: If set, quantiles are generated for these feature columns.
      The file name used to save the quantiles uses a hash of the names of the
      feature_columns, so it can support different quantiles sets for different
      parts of the model if needed. If not set quantiles will be generated for
      all features returned by input_fn.
    num_steps: number of steps to take over input_fn to gather enough data to
      create quantiles. Set to 0 or None to run until queue is exhausted,
      like if you used num_epochs in your input_fn.
    override: if False it won't regenerate quantiles for files that are already
      there. This works as long as the features definition/distribution hasn't
      change from one run to another.
    num_quantiles: This value should be larger than the maximum number of
      keypoints that will be considered for calibrating these features. If
      there are not enough quantiles for the keypoints, the system is robust and
      will simply interpolate the missing quantiles. Similarly if there are not
      enough examples to represent the quantiles, it will interpolate the
      quantiles from the examples given.
    dtype: Deafult dtype to use, in particular for categorical values.

  Returns: Nothing, results are saved to disk.

  Raises:
    errors.OpError: For I/O errors.

  FutureWork:
    * Use Munro-Paterson algorithm to calculate quantiles in a streaming
      fashion. See Squawd library.
    * Add support to weighted examples.
    * Handle cases where there are not enough different values in quantiles.
  """
    subdir = os.path.join(save_dir, _QUANTILES_SUBDIRECTORY)
    file_io.recursive_create_dir(subdir)
    with ops.Graph().as_default():
        tensors = None

        if feature_columns is not None:
            # Features from feature_columns.
            if not override:
                # Remove feature_columns for which we already have the quantiles.
                missing_feature_columns = []
                for f_col in feature_columns:
                    try:
                        _ = _load_quantiles(subdir, f_col.name)
                    except errors.NotFoundError:
                        missing_feature_columns += [f_col]
                feature_columns = missing_feature_columns
                if not feature_columns:
                    return

            transformed_columns_to_tensors, unused_label = input_fn()

            tensors = {
                f_col.name:
                tools.input_from_feature_column(transformed_columns_to_tensors,
                                                f_col, dtype)
                for f_col in feature_columns
            }

        else:
            # Features directly from columns_to_tensors.
            columns_to_tensors, unused_label = input_fn()
            tensors = {}

            if override:
                tensors = columns_to_tensors
            else:
                for name, tensor in columns_to_tensors.items():
                    try:
                        _ = _load_quantiles(subdir, name)
                    except errors.NotFoundError:
                        tensors[name] = tensor
                if not tensors:
                    # All features already calculated.
                    return

        # Here a dict of feature_name to tensor is in tensors.
        arrays = _materialize_locally(tensors, num_steps)

    percentiles = np.linspace(0., 100., num_quantiles)
    for key, values in arrays.items():
        values = np.unique(values)
        quantiles = np.percentile(values, percentiles, interpolation="nearest")
        quantiles = list(quantiles)
        _save_quantiles(subdir, key, quantiles)
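
A minimal sketch of running this preprocessing step before training; the save directory, feature name, and synthetic data are made up:

import tensorflow as tf

def _quantiles_input_fn():
    # Any Estimator-style input_fn works; the label (here None) is ignored.
    features = {'age': tf.random.uniform([1000], minval=0.0, maxval=100.0)}
    return features, None

save_quantiles_for_keypoints(
    input_fn=_quantiles_input_fn,
    save_dir='/tmp/quantiles_demo',
    num_steps=1,
    num_quantiles=100)
# The saved quantiles can later be turned into keypoint initializers, e.g.
# via load_keypoints_from_quantiles in module keypoints_initialization.
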