def _min_label_position(
    name=MIN_LABEL_POSITION_NAME,
    label_key: Optional[Text] = None,
    eval_config: Optional[config.EvalConfig] = None,
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    query_key: Text = '') -> metric_types.MetricComputations:
  """Returns metric computations for min label position."""
  if not query_key:
    raise ValueError('a query_key is required to use MinLabelPosition metric')
  if model_names is None:
    model_names = ['']
  if output_names is None:
    output_names = ['']
  keys = []
  computations = []
  preprocessor = None
  if label_key:
    preprocessor = metric_types.FeaturePreprocessor(feature_keys=[label_key])
  for model_name in model_names:
    for output_name in output_names:
      key = metric_types.MetricKey(
          name=name, model_name=model_name, output_name=output_name)
      keys.append(key)
      computations.append(
          metric_types.MetricComputation(
              keys=[key],
              preprocessor=preprocessor,
              combiner=_MinLabelPositionCombiner(key, eval_config, label_key)))
  return computations
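
# Illustrative only: a minimal sketch of the per-query value that the
# _MinLabelPositionCombiner above is assumed to aggregate, i.e. the 1-based
# rank of the first relevant (non-zero label) example after sorting a query's
# examples by descending prediction. `_min_label_position_sketch` is a
# hypothetical helper, not part of this module.
from typing import Optional, Sequence


def _min_label_position_sketch(labels: Sequence[float],
                               predictions: Sequence[float]) -> Optional[int]:
  """Returns the rank of the first positive label when sorted by prediction."""
  ranked = sorted(zip(predictions, labels), key=lambda pl: pl[0], reverse=True)
  for rank, (_, label) in enumerate(ranked, start=1):
    if label > 0:
      return rank
  return None  # The query has no relevant examples.


# Example: labels (1, 0, 1) with predictions (0.2, 0.9, 0.7) place the first
# relevant example at position 2, so the sketch returns 2.
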
def _fixed_size_sample(
    sampled_key: Text,
    size: int,
    name: Text,
    random_seed: Optional[int],
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    sub_keys: Optional[List[metric_types.SubKey]] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
  """Returns metric computations for FixedSizeSample metrics."""
  keys = []
  for model_name in model_names or ['']:
    for output_name in output_names or ['']:
      for sub_key in sub_keys or [None]:
        keys.append(
            metric_types.MetricKey(
                name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key,
                example_weighted=example_weighted))
  return [
      metric_types.MetricComputation(
          keys=keys,
          preprocessor=metric_types.FeaturePreprocessor(
              feature_keys=[sampled_key]),
          combiner=_FixedSizeSampleCombineFn(
              metric_keys=keys,
              sampled_key=sampled_key,
              size=size,
              example_weighted=example_weighted,
              random_seed=random_seed))
  ]
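
# Illustrative only: a minimal sketch of the kind of fixed-size (reservoir)
# sampling that _FixedSizeSampleCombineFn above is assumed to perform; the
# actual combiner additionally handles example weights and a random seed and
# runs as a combiner over the pipeline. `_reservoir_sample_sketch` is a
# hypothetical helper, not part of this module.
import random
from typing import Any, Iterable, List, Optional


def _reservoir_sample_sketch(values: Iterable[Any],
                             size: int,
                             random_seed: Optional[int] = None) -> List[Any]:
  """Keeps a uniform random sample of at most `size` values (Algorithm R)."""
  rng = random.Random(random_seed)
  reservoir = []
  for i, value in enumerate(values):
    if i < size:
      reservoir.append(value)
    else:
      # Replace an existing slot with probability size / (i + 1).
      j = rng.randint(0, i)
      if j < size:
        reservoir[j] = value
  return reservoir
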
def _ndcg(gain_key: str,
          top_k_list: Optional[List[int]] = None,
          name: str = NDCG_NAME,
          eval_config: Optional[config_pb2.EvalConfig] = None,
          model_names: Optional[List[str]] = None,
          output_names: Optional[List[str]] = None,
          sub_keys: Optional[List[metric_types.SubKey]] = None,
          example_weighted: bool = False,
          query_key: str = '') -> metric_types.MetricComputations:
  """Returns metric computations for NDCG."""
  if not query_key:
    raise ValueError('a query_key is required to use NDCG metric')
  # Guard against sub_keys being None (its default) before filtering.
  sub_keys = [k for k in (sub_keys or []) if k is not None]
  if top_k_list:
    for k in top_k_list:
      if not any(sub_key.top_k == k for sub_key in sub_keys):
        sub_keys.append(metric_types.SubKey(top_k=k))
  if not sub_keys or any(sub_key.top_k is None for sub_key in sub_keys):
    raise ValueError(
        'top_k values are required to use NDCG metric: {}'.format(sub_keys))
  computations = []
  for model_name in model_names if model_names else ['']:
    for output_name in output_names if output_names else ['']:
      keys = []
      for sub_key in sub_keys:
        keys.append(
            metric_types.MetricKey(
                name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key,
                example_weighted=example_weighted))
      computations.append(
          metric_types.MetricComputation(
              keys=keys,
              preprocessor=metric_types.FeaturePreprocessor(
                  feature_keys=[query_key, gain_key]),
              combiner=_NDCGCombiner(
                  metric_keys=keys,
                  eval_config=eval_config,
                  model_name=model_name,
                  output_name=output_name,
                  example_weighted=example_weighted,
                  query_key=query_key,
                  gain_key=gain_key)))
  return computations
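
# Illustrative only: a minimal sketch of the standard NDCG@k formula that the
# _NDCGCombiner above is assumed to implement per query (gains taken from
# `gain_key`, ordered by descending prediction, discounted by log2(rank + 1),
# and normalized by the ideal ordering). `_ndcg_at_k_sketch` is a hypothetical
# helper, not part of this module.
import math
from typing import Sequence


def _ndcg_at_k_sketch(gains_in_predicted_order: Sequence[float],
                      k: int) -> float:
  """Computes NDCG@k for one query from gains sorted by descending prediction."""

  def dcg(gains: Sequence[float]) -> float:
    # rank is 1-based, so the discount is log2(rank + 1).
    return sum(
        gain / math.log2(rank + 1)
        for rank, gain in enumerate(gains[:k], start=1))

  ideal = dcg(sorted(gains_in_predicted_order, reverse=True))
  return dcg(gains_in_predicted_order) / ideal if ideal > 0 else 0.0


# Example: gains (3, 1, 2) in predicted order give
# DCG@3 = 3/log2(2) + 1/log2(3) + 2/log2(4), normalized by the DCG of the
# ideal ordering (3, 2, 1).
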
def testPreprocessorsWithoutDefaults(self):
  preprocessor = metric_types.StandardMetricInputsPreprocessorList([
      metric_types.FeaturePreprocessor(
          feature_keys=['feature1', 'feature2'], include_default_inputs=False),
      metric_types.TransformedFeaturePreprocessor(
          feature_keys=['feature1'], include_default_inputs=False),
      metric_types.AttributionPreprocessor(
          feature_keys=['feature1'], include_default_inputs=False)
  ])
  self.assertEqual(
      preprocessor.include_filter, {
          'features': {
              'feature1': {},
              'feature2': {},
          },
          'transformed_features': {
              'feature1': {},
          },
          'attributions': {
              'feature1': {},
          },
      })
def _ndcg(gain_key: Text,
          name: Text = NDCG_NAME,
          eval_config: Optional[config.EvalConfig] = None,
          model_names: Optional[List[Text]] = None,
          output_names: Optional[List[Text]] = None,
          sub_keys: Optional[List[metric_types.SubKey]] = None,
          query_key: Text = '') -> metric_types.MetricComputations:
  """Returns metric computations for NDCG."""
  if not query_key:
    raise ValueError('a query_key is required to use NDCG metric')
  if sub_keys is None or any(sub_key.top_k is None for sub_key in sub_keys):
    raise ValueError(
        'top_k values are required to use NDCG metric: {}'.format(sub_keys))
  computations = []
  for model_name in model_names if model_names else ['']:
    for output_name in output_names if output_names else ['']:
      keys = []
      for sub_key in sub_keys:
        keys.append(
            metric_types.MetricKey(
                name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key))
      computations.append(
          metric_types.MetricComputation(
              keys=keys,
              preprocessor=metric_types.FeaturePreprocessor(
                  feature_keys=[query_key, gain_key]),
              combiner=_NDCGCombiner(
                  metric_keys=keys,
                  eval_config=eval_config,
                  model_name=model_name,
                  output_name=output_name,
                  query_key=query_key,
                  gain_key=gain_key)))
  return computations
def testPreprocessors(self):
  preprocessor = metric_types.StandardMetricInputsPreprocessorList([
      metric_types.FeaturePreprocessor(feature_keys=['feature1', 'feature2']),
      metric_types.TransformedFeaturePreprocessor(feature_keys=['feature1']),
      metric_types.AttributionPreprocessor(feature_keys=['feature1'])
  ])
  self.assertEqual(
      preprocessor.include_filter, {
          'labels': {},
          'predictions': {},
          'example_weights': {},
          'features': {
              'feature1': {},
              'feature2': {},
          },
          'transformed_features': {
              'feature1': {},
          },
          'attributions': {
              'feature1': {},
          },
      })
def metric_computations_using_keras_saved_model(
    model_name: str,
    model_loader: types.ModelLoader,
    eval_config: Optional[config_pb2.EvalConfig],
    batch_size: Optional[int] = None) -> metric_types.MetricComputations:
  """Returns computations for computing metrics natively using keras.

  Args:
    model_name: Name of model.
    model_loader: Loader for shared model containing keras saved model to use
      for metric computations.
    eval_config: Eval config.
    batch_size: Batch size to use during evaluation (testing only).
  """
  model = model_loader.load()
  # If metrics were only added using model.compile then use
  # model.compiled_metrics and model.compiled_loss to compute the metrics,
  # otherwise custom metrics added via model.add_metric were also used and we
  # need to call model.evaluate.
  if not model.metrics:
    return []
  elif (hasattr(model, 'compiled_metrics') and model.compiled_metrics and
        hasattr(model, 'compiled_loss') and model.compiled_loss and
        len(model.compiled_metrics.metrics) +
        len(model.compiled_loss.metrics) == len(model.metrics)):
    if hasattr(model, 'output_names') and model.output_names:
      output_names = model.output_names
    else:
      output_names = []
    keys = _metric_keys(
        chain(model.compiled_metrics.metrics, model.compiled_loss.metrics),
        model_name, output_names)
    return [
        metric_types.MetricComputation(
            keys=keys,
            preprocessor=None,
            combiner=_KerasCompiledMetricsCombiner(keys, model_name,
                                                   model_loader, eval_config,
                                                   batch_size))
    ]
  else:
    if hasattr(model, 'output_names') and model.output_names:
      output_names = model.output_names
    else:
      output_names = []
    keys = _metric_keys(model.metrics, model_name, output_names)
    specs = model_util.get_input_specs(model_name, signature_name=None)
    feature_keys = list(specs.keys()) if specs else []
    return [
        metric_types.MetricComputation(
            keys=keys,
            preprocessor=metric_types.StandardMetricInputsPreprocessorList([
                metric_types.FeaturePreprocessor(
                    feature_keys=feature_keys, model_names=[model_name]),
                metric_types.TransformedFeaturePreprocessor(
                    feature_keys=feature_keys, model_names=[model_name])
            ]),
            combiner=_KerasEvaluateCombiner(keys, model_name, model_loader,
                                            eval_config, batch_size))
    ]
def flip_count(
    counterfactual_prediction_key: Optional[str] = None,
    example_id_key: Optional[str] = None,
    example_ids_count: int = DEFAULT_NUM_EXAMPLE_IDS,
    name: str = FLIP_COUNT_NAME,
    thresholds: Sequence[float] = DEFAULT_THRESHOLDS,
    model_name: str = '',
    output_name: str = '',
    eval_config: Optional[config_pb2.EvalConfig] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
  """Returns metric computations for computing flip counts."""
  keys, metric_key_by_name_by_threshold = create_metric_keys(
      thresholds, METRICS_LIST, name, model_name, output_name,
      example_weighted)

  feature_keys = [counterfactual_prediction_key]
  if example_id_key:
    feature_keys.append(example_id_key)

  def extract_label_prediction_and_weight(
      inputs: metric_types.StandardMetricInputs,
      eval_config: Optional[config_pb2.EvalConfig] = None,
      model_name: str = '',
      output_name: str = '',
      sub_key: Optional[metric_types.SubKey] = None,
      aggregation_type: Optional[metric_types.AggregationType] = None,
      class_weights: Optional[Dict[int, float]] = None,
      example_weighted: bool = False,
      fractional_labels: bool = False,
      flatten: bool = True,
  ) -> Iterator[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Yields label, prediction, and example weights used in calculations.

    This is a customized version of
    metric_util.to_label_prediction_example_weight: it yields the original
    prediction as the label and the counterfactual prediction as the
    prediction, so that flip count metrics can be derived from the false
    positives and false negatives of the binary confusion matrix.

    Args:
      inputs: Standard metric inputs.
      eval_config: Eval config.
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key. (unused)
      aggregation_type: Optional aggregation type. (unused)
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions. (unused)
      example_weighted: True if example weights should be applied.
      fractional_labels: If true, each incoming tuple of (label, prediction,
        example weight) will be split into two tuples as follows (where l, p,
        w represent the resulting label, prediction, and example weight
        values): (1) l = 0.0, p = prediction, and w = example_weight * (1.0 -
        label) (2) l = 1.0, p = prediction, and w = example_weight * label.
        If enabled, an exception will be raised if labels are not within
        [0, 1]. The implementation is such that tuples associated with a
        weight of zero are not yielded. This means it is safe to enable
        fractional_labels even when the labels only take on the values of 0.0
        or 1.0. (unused)
      flatten: True to flatten the final label and prediction outputs so that
        the yielded values are always arrays of size 1. For example,
        multi-class / multi-label outputs would be converted into label and
        prediction pairs that could then be processed by a binary
        classification metric in order to compute a micro average over all
        classes. (unused)

    Yields:
      Tuple of (label, prediction, example_weight).

    Raises:
      ValueError: If the counterfactual prediction key is not found within
        either the features or predictions.
      ValueError: If predictions is None or empty.
    """
    del (sub_key, aggregation_type, class_weights, fractional_labels,
         flatten)  # unused

    # TODO(sokeefe): Look into removing the option to pass counterfactual
    # predictions in a feature and instead pass them as a baseline model.
    if (counterfactual_prediction_key is not None and
        counterfactual_prediction_key in inputs.features):
      counterfactual_prediction = inputs.features[
          counterfactual_prediction_key]
    elif eval_config is not None:
      counterfactual_model_spec = model_util.get_baseline_model_spec(
          eval_config)
      if counterfactual_model_spec is not None:
        _, counterfactual_prediction, _ = next(
            metric_util.to_label_prediction_example_weight(
                inputs,
                eval_config=eval_config,
                model_name=counterfactual_model_spec.name,
                output_name=output_name,
                example_weighted=example_weighted,
                fractional_labels=False,  # Labels are ignored for flip counts.
                flatten=False,  # Flattened below.
                allow_none=True,  # Allow None labels.
                require_single_example_weight=True))
      else:
        raise ValueError(
            'The Counterfactual model must be listed with '
            f'`is_baseline` equal to `True`. Found: {eval_config}')
    else:
      raise ValueError(
          '`counterfactual_prediction` was not found within the provided '
          'inputs. It must be included as either a feature key or within the '
          'predictions. Found:\n'
          f'`counterfactual_prediction_key`: {counterfactual_prediction_key}\n'
          f'`inputs.prediction`:{inputs.prediction}')

    if counterfactual_prediction is None:
      raise ValueError(
          '%s feature key is None (required for FlipCount metric)' %
          counterfactual_prediction_key)

    def get_by_keys(value: Any, keys: List[str]) -> Any:
      if isinstance(value, dict):
        new_value = util.get_by_keys(value, keys, optional=True)
        if new_value is not None:
          return new_value
      return value

    if model_name:
      counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                              [model_name])
    if output_name:
      counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                              [output_name])

    _, prediction, example_weight = next(
        metric_util.to_label_prediction_example_weight(
            inputs,
            eval_config=eval_config,
            model_name=model_name,
            output_name=output_name,
            example_weighted=example_weighted,
            fractional_labels=False,  # Labels are ignored for flip counts.
            flatten=False,  # Flattened below.
            allow_none=True,  # Allow None labels.
            require_single_example_weight=True))

    if prediction.size != counterfactual_prediction.size:
      raise ValueError(
          'prediction and counterfactual_prediction size should be same for '
          'FlipCount metric, %f != %f' %
          (prediction.size, counterfactual_prediction.size))

    if prediction.size == 0:
      raise ValueError('prediction is empty (required for FlipCount metric)')
    else:  # Always flatten.
      example_weight = np.array(
          [float(example_weight) for i in range(prediction.shape[-1])])
      for p, cfp, w in zip(prediction.flatten(),
                           counterfactual_prediction.flatten(),
                           example_weight.flatten()):
        yield np.array([p]), np.array([cfp]), np.array([w])

  # Setting fractional label to false, since prediction is being used as label
  # and it can be a non-binary value.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      thresholds=list(thresholds),
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      example_weighted=example_weighted,
      extract_label_prediction_and_weight=extract_label_prediction_and_weight,
      preprocessor=metric_types.FeaturePreprocessor(feature_keys=feature_keys),
      example_id_key=example_id_key,
      example_ids_count=example_ids_count,
      fractional_labels=False)
  examples_metric_key, matrices_metric_key = computations[-1].keys

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns flip count metric values."""
    matrix = metrics[matrices_metric_key]
    examples = metrics[examples_metric_key]

    output = {}
    for i, threshold in enumerate(matrix.thresholds):
      output[metric_key_by_name_by_threshold[threshold]
             ['positive_to_negative']] = matrix.fn[i]
      output[metric_key_by_name_by_threshold[threshold]
             ['negative_to_positive']] = matrix.fp[i]
      output[metric_key_by_name_by_threshold[threshold]
             ['positive_to_negative_examples_ids']] = np.array(
                 examples.fn_examples[i])
      output[metric_key_by_name_by_threshold[threshold]
             ['negative_to_positive_examples_ids']] = np.array(
                 examples.fp_examples[i])
      output[metric_key_by_name_by_threshold[threshold]
             ['positive_examples_count']] = matrix.fn[i] + matrix.tp[i]
      output[metric_key_by_name_by_threshold[threshold]
             ['negative_examples_count']] = matrix.fp[i] + matrix.tn[i]
    return output

  derived_computation = metric_types.DerivedMetricComputation(
      keys=keys, result=result)
  computations.append(derived_computation)
  return computations
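
# Illustrative only: because extract_label_prediction_and_weight above yields
# the original prediction as the "label" and the counterfactual prediction as
# the "prediction", a positive-to-negative flip shows up as a false negative
# in the binary confusion matrix and a negative-to-positive flip as a false
# positive, which is how result() maps matrix.fn / matrix.fp to the flip count
# keys. `_flip_direction_sketch` is a hypothetical helper, not part of this
# module, and it assumes values above the threshold count as positive.
from typing import Optional


def _flip_direction_sketch(prediction: float,
                           counterfactual_prediction: float,
                           threshold: float) -> Optional[str]:
  """Returns the flip direction for a single example at one threshold."""
  original_positive = prediction > threshold
  counterfactual_positive = counterfactual_prediction > threshold
  if original_positive and not counterfactual_positive:
    return 'positive_to_negative'  # Counted via matrix.fn in result().
  if not original_positive and counterfactual_positive:
    return 'negative_to_positive'  # Counted via matrix.fp in result().
  return None  # No flip at this threshold.


# Example: prediction 0.8 with counterfactual prediction 0.3 at threshold 0.5
# is a positive_to_negative flip.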