Example #1
def _WriteEvalConfig(  # pylint: disable=invalid-name
        evaluation: evaluator.Evaluation, eval_config: config.EvalConfig,
        output_path: Text, data_location: Text, file_format: Text,
        model_locations: Dict[Text,
                              Text], filename: Text) -> beam.pvalue.PDone:
    """Writes EvalConfig to file.

  Args:
    evaluation: Evaluation data. This transform only makes use of the pipeline.
    eval_config: EvalConfig.
    output_path: Output path.
    data_location: Path indicating where input data is read from.
    file_format: Format of the input data.
    model_locations: Dict of model locations keyed by model name.
    filename: Name of file to store the config as.

  Returns:
    beam.pvalue.PDone.
  """
    pipeline = list(evaluation.values())[0].pipeline

    # Skip writing file if its output is disabled
    if EVAL_CONFIG_FILE in eval_config.options.disabled_outputs.values:
        return beam.pvalue.PDone(pipeline)

    return (pipeline
            | 'CreateEvalConfig' >> beam.Create([
                _serialize_eval_run(eval_config, data_location, file_format,
                                    model_locations)
            ])
            | 'WriteEvalConfig' >> beam.io.WriteToText(
                os.path.join(output_path, filename), shard_name_template=''))
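A minimal usage sketch for the writer above, assuming it is invoked directly as a plain function; the empty EvalConfig, all paths, and the dummy PCollection are placeholders, and the single entry in the Evaluation dict exists only so the writer can obtain a pipeline handle:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    # Placeholder Evaluation dict: only the pipeline handle is used here.
    evaluation = {'metrics': pipeline | 'CreateDummy' >> beam.Create([None])}
    eval_config = config.EvalConfig()  # placeholder; normally carries model/metrics specs
    _WriteEvalConfig(
        evaluation=evaluation,
        eval_config=eval_config,
        output_path='/tmp/eval_output',
        data_location='/tmp/eval_data.tfrecord',
        file_format='tfrecords',
        model_locations={'candidate': '/tmp/saved_model'},
        filename='eval_config.json')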
Example #2

def _WriteMetricsPlotsAndValidations(  # pylint: disable=invalid-name
        evaluation: evaluator.Evaluation, output_paths: Dict[Text, Text],
        eval_config: config.EvalConfig,
        add_metrics_callbacks: List[types.AddMetricsCallbackType],
        metrics_key: Text, plots_key: Text, validations_key: Text,
        output_file_format: Text) -> beam.pvalue.PDone:
    """PTransform to write metrics and plots."""

    if output_file_format and output_file_format != 'tfrecord':
        raise ValueError(
            'only "{}" format is currently supported: output_file_format={}'.
            format('tfrecord', output_file_format))

    if metrics_key in evaluation:
        metrics = (evaluation[metrics_key] | 'ConvertSliceMetricsToProto' >>
                   beam.Map(convert_slice_metrics_to_proto,
                            add_metrics_callbacks=add_metrics_callbacks))

        if constants.METRICS_KEY in output_paths:
            _ = metrics | 'WriteMetrics' >> beam.io.WriteToTFRecord(
                file_path_prefix=output_paths[constants.METRICS_KEY],
                shard_name_template=None if output_file_format else '',
                file_name_suffix=('.' + output_file_format
                                  if output_file_format else ''),
                coder=beam.coders.ProtoCoder(
                    metrics_for_slice_pb2.MetricsForSlice))

    if plots_key in evaluation:
        plots = (evaluation[plots_key] | 'ConvertSlicePlotsToProto' >>
                 beam.Map(convert_slice_plots_to_proto,
                          add_metrics_callbacks=add_metrics_callbacks))

        if constants.PLOTS_KEY in output_paths:
            _ = plots | 'WritePlots' >> beam.io.WriteToTFRecord(
                file_path_prefix=output_paths[constants.PLOTS_KEY],
                shard_name_template=None if output_file_format else '',
                file_name_suffix=('.' + output_file_format
                                  if output_file_format else ''),
                coder=beam.coders.ProtoCoder(
                    metrics_for_slice_pb2.PlotsForSlice))

    if validations_key in evaluation:
        validations = (evaluation[validations_key]
                       | 'MergeValidationResults' >> beam.CombineGlobally(
                           _CombineValidations(eval_config)))

        if constants.VALIDATIONS_KEY in output_paths:
            # We only use a single shard here because validations are usually single
            # values.
            _ = validations | 'WriteValidations' >> beam.io.WriteToTFRecord(
                file_path_prefix=output_paths[constants.VALIDATIONS_KEY],
                shard_name_template='',
                file_name_suffix=('.' + output_file_format
                                  if output_file_format else ''),
                coder=beam.coders.ProtoCoder(
                    validation_result_pb2.ValidationResult))

    return beam.pvalue.PDone(list(evaluation.values())[0].pipeline)
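The shard_name_template=None if output_file_format else '' idiom used above switches between Beam's default sharded file naming (when a format suffix is requested) and a single unsharded output file. A hedged read-back sketch for the metrics written by this example, assuming the same metrics_for_slice_pb2 proto module and an illustrative file pattern:

import apache_beam as beam

def read_metrics(pipeline, file_pattern):
    # file_pattern is illustrative, e.g. '<output_path>/metrics*' when the
    # default sharded naming was used, or the exact prefix when
    # shard_name_template was set to ''.
    return (pipeline
            | 'ReadMetrics' >> beam.io.ReadFromTFRecord(
                file_pattern,
                coder=beam.coders.ProtoCoder(
                    metrics_for_slice_pb2.MetricsForSlice)))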
Example #3
def _WriteMetricsPlotsAndValidations(
        evaluation: evaluator.Evaluation, output_paths: Dict[Text, Text],
        add_metrics_callbacks: List[types.AddMetricsCallbackType],
        metrics_key: Text, plots_key: Text,
        validations_key: Text) -> beam.pvalue.PDone:
    """PTransform to write metrics and plots."""
    # Skip write if no metrics, plots, or validations are used.
    if (metrics_key not in evaluation and plots_key not in evaluation
            and validations_key not in evaluation):
        return beam.pvalue.PDone(list(evaluation.values())[0].pipeline)

    if metrics_key in evaluation:
        metrics = (evaluation[metrics_key] | 'SerializeMetrics' >>
                   metrics_and_plots_serialization.SerializeMetrics(
                       add_metrics_callbacks=add_metrics_callbacks))
        if constants.METRICS_KEY in output_paths:
            # We only use a single shard here because metrics are usually single
            # values so even with 1M slices and a handful of metrics the size
            # requirements will only be a few hundred MB.
            _ = metrics | 'WriteMetrics' >> beam.io.WriteToTFRecord(
                file_path_prefix=output_paths[constants.METRICS_KEY],
                shard_name_template='')

    if plots_key in evaluation:
        plots = (evaluation[plots_key]
                 | 'SerializePlots' >>
                 metrics_and_plots_serialization.SerializePlots(
                     add_metrics_callbacks=add_metrics_callbacks))
        if constants.PLOTS_KEY in output_paths:
            # We only use a single shard here because we are assuming that plots will
            # not be enabled when millions of slices are in use. By default plots are
            # stored with 1K thresholds with each plot entry taking up to 7 fields
            # (tp, fp, ... recall) so if this assumption is false the output can end
            # up in the hundreds of GB.
            _ = plots | 'WritePlots' >> beam.io.WriteToTFRecord(
                file_path_prefix=output_paths[constants.PLOTS_KEY],
                shard_name_template='')

    if validations_key in evaluation:
        validations = (evaluation[validations_key]
                       | 'MergeValidationResults' >> beam.CombineGlobally(
                           _CombineValidations())
                       | 'SerializeValidationResults' >> SerializeValidations())
        if constants.VALIDATIONS_KEY in output_paths:
            # We only use a single shard here because validations are usually single
            # values.
            _ = validations | 'WriteValidations' >> beam.io.WriteToTFRecord(
                file_path_prefix=output_paths[constants.VALIDATIONS_KEY],
                shard_name_template='')
    return beam.pvalue.PDone(list(evaluation.values())[0].pipeline)
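For context, a plausible output_paths mapping for the writer above; output_dir is a placeholder and the keys mirror the constants referenced in the code:

import os

output_dir = '/tmp/eval_output'  # placeholder
output_paths = {
    constants.METRICS_KEY: os.path.join(output_dir, 'metrics'),
    constants.PLOTS_KEY: os.path.join(output_dir, 'plots'),
    constants.VALIDATIONS_KEY: os.path.join(output_dir, 'validations'),
}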
Example #4
def _WriteMetricsPlotsAndValidations(  # pylint: disable=invalid-name
        evaluation: evaluator.Evaluation,
        output_paths: Dict[Text, Text],
        eval_config: config.EvalConfig,
        add_metrics_callbacks: List[types.AddMetricsCallbackType],
        metrics_key: Text,
        plots_key: Text,
        attributions_key: Text,
        validations_key: Text,
        output_file_format: Text,
        rubber_stamp: bool = False) -> beam.pvalue.PDone:
    """PTransform to write metrics and plots."""

    if output_file_format and output_file_format not in _SUPPORTED_FORMATS:
        raise ValueError('only "{}" formats are currently supported but got '
                         'output_file_format={}'.format(
                             _SUPPORTED_FORMATS, output_file_format))

    def convert_slice_key_to_parquet_dict(
            slice_key: metrics_for_slice_pb2.SliceKey
    ) -> _SliceKeyDictPythonType:
        single_slice_key_dicts = []
        for single_slice_key in slice_key.single_slice_keys:
            kind = single_slice_key.WhichOneof('kind')
            if not kind:
                continue
            single_slice_key_dicts.append(
                {kind: getattr(single_slice_key, kind)})
        return {_SINGLE_SLICE_KEYS_PARQUET_FIELD_NAME: single_slice_key_dicts}

    def convert_to_parquet_columns(
        value: Union[metrics_for_slice_pb2.MetricsForSlice,
                     metrics_for_slice_pb2.PlotsForSlice,
                     metrics_for_slice_pb2.AttributionsForSlice]
    ) -> Dict[Text, Union[_SliceKeyDictPythonType, bytes]]:
        return {
            _SLICE_KEY_PARQUET_COLUMN_NAME:
            convert_slice_key_to_parquet_dict(value.slice_key),
            _SERIALIZED_VALUE_PARQUET_COLUMN_NAME:
            value.SerializeToString()
        }

    if metrics_key in evaluation and constants.METRICS_KEY in output_paths:
        metrics = (evaluation[metrics_key] | 'ConvertSliceMetricsToProto' >>
                   beam.Map(convert_slice_metrics_to_proto,
                            add_metrics_callbacks=add_metrics_callbacks))

        file_path_prefix = output_paths[constants.METRICS_KEY]
        if output_file_format == _PARQUET_FORMAT:
            _ = (metrics
                 | 'ConvertToParquetColumns' >>
                 beam.Map(convert_to_parquet_columns)
                 | 'WriteMetricsToParquet' >> beam.io.WriteToParquet(
                     file_path_prefix=file_path_prefix,
                     schema=_SLICED_PARQUET_SCHEMA,
                     file_name_suffix='.' + output_file_format))
        elif not output_file_format or output_file_format == _TFRECORD_FORMAT:
            _ = metrics | 'WriteMetrics' >> beam.io.WriteToTFRecord(
                file_path_prefix=file_path_prefix,
                shard_name_template=None if output_file_format else '',
                file_name_suffix=('.' + output_file_format
                                  if output_file_format else ''),
                coder=beam.coders.ProtoCoder(
                    metrics_for_slice_pb2.MetricsForSlice))

    if plots_key in evaluation and constants.PLOTS_KEY in output_paths:
        plots = (evaluation[plots_key] | 'ConvertSlicePlotsToProto' >>
                 beam.Map(convert_slice_plots_to_proto,
                          add_metrics_callbacks=add_metrics_callbacks))

        file_path_prefix = output_paths[constants.PLOTS_KEY]
        if output_file_format == _PARQUET_FORMAT:
            _ = (plots
                 | 'ConvertPlotsToParquetColumns' >>
                 beam.Map(convert_to_parquet_columns)
                 | 'WritePlotsToParquet' >> beam.io.WriteToParquet(
                     file_path_prefix=file_path_prefix,
                     schema=_SLICED_PARQUET_SCHEMA,
                     file_name_suffix='.' + output_file_format))
        elif not output_file_format or output_file_format == _TFRECORD_FORMAT:
            _ = plots | 'WritePlotsToTFRecord' >> beam.io.WriteToTFRecord(
                file_path_prefix=file_path_prefix,
                shard_name_template=None if output_file_format else '',
                file_name_suffix=('.' + output_file_format
                                  if output_file_format else ''),
                coder=beam.coders.ProtoCoder(
                    metrics_for_slice_pb2.PlotsForSlice))

    if (attributions_key in evaluation
            and constants.ATTRIBUTIONS_KEY in output_paths):
        attributions = (evaluation[attributions_key]
                        | 'ConvertSliceAttributionsToProto' >>
                        beam.Map(convert_slice_attributions_to_proto))

        file_path_prefix = output_paths[constants.ATTRIBUTIONS_KEY]
        if output_file_format == _PARQUET_FORMAT:
            _ = (attributions
                 | 'ConvertAttributionsToParquetColumns' >>
                 beam.Map(convert_to_parquet_columns)
                 | 'WriteAttributionsToParquet' >> beam.io.WriteToParquet(
                     file_path_prefix=file_path_prefix,
                     schema=_SLICED_PARQUET_SCHEMA,
                     file_name_suffix='.' + output_file_format))
        elif not output_file_format or output_file_format == _TFRECORD_FORMAT:
            _ = (attributions
                 | 'WriteAttributionsToTFRecord' >> beam.io.WriteToTFRecord(
                     file_path_prefix=file_path_prefix,
                     shard_name_template=None if output_file_format else '',
                     file_name_suffix=('.' + output_file_format
                                       if output_file_format else ''),
                     coder=beam.coders.ProtoCoder(
                         metrics_for_slice_pb2.AttributionsForSlice)))

    if (validations_key in evaluation
            and constants.VALIDATIONS_KEY in output_paths):
        validations = (
            evaluation[validations_key]
            | 'MergeValidationResults' >> beam.CombineGlobally(
                CombineValidations(eval_config, rubber_stamp=rubber_stamp)))

        file_path_prefix = output_paths[constants.VALIDATIONS_KEY]
        # We only use a single shard here because validations are usually single
        # values. Setting the shard_name_template to the empty string forces this.
        shard_name_template = ''
        if output_file_format == _PARQUET_FORMAT:
            _ = (
                validations
                | 'ConvertValidationsToParquetColumns' >>
                beam.Map(lambda v:  # pylint: disable=g-long-lambda
                         {
                             _SERIALIZED_VALUE_PARQUET_COLUMN_NAME:
                             v.SerializeToString()
                         })
                | 'WriteValidationsToParquet' >> beam.io.WriteToParquet(
                    file_path_prefix=file_path_prefix,
                    shard_name_template=shard_name_template,
                    schema=_UNSLICED_PARQUET_SCHEMA,
                    file_name_suffix='.' + output_file_format))
        elif not output_file_format or output_file_format == _TFRECORD_FORMAT:
            _ = (validations
                 | 'WriteValidationsToTFRecord' >> beam.io.WriteToTFRecord(
                     file_path_prefix=file_path_prefix,
                     shard_name_template=shard_name_template,
                     file_name_suffix=('.' + output_file_format
                                       if output_file_format else ''),
                     coder=beam.coders.ProtoCoder(
                         validation_result_pb2.ValidationResult)))

    return beam.pvalue.PDone(list(evaluation.values())[0].pipeline)
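A hedged sketch for reading the Parquet metrics written above back into MetricsForSlice protos; the column-name constant and proto module are the same ones referenced by the writer, and the file pattern is illustrative:

import apache_beam as beam

def read_parquet_metrics(pipeline, file_pattern):
    # Each Parquet row is a dict keyed by column name; the serialized proto
    # bytes live under the serialized-value column used by the writer above.
    return (pipeline
            | 'ReadParquetRows' >> beam.io.ReadFromParquet(file_pattern)
            | 'ParseMetricsProto' >> beam.Map(
                lambda row: metrics_for_slice_pb2.MetricsForSlice.FromString(
                    row[_SERIALIZED_VALUE_PARQUET_COLUMN_NAME])))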