Example #1
    def testReadTransformFn(self):
        path = self.get_temp_dir()
        # NOTE: we don't need to create or write to the transform_fn directory since
        # ReadTransformFn never inspects this directory.
        transform_fn_dir = os.path.join(path, 'transform_fn')
        transformed_metadata_dir = os.path.join(path, 'transformed_metadata')
        metadata_io.write_metadata(_TEST_METADATA, transformed_metadata_dir)

        with beam.Pipeline() as pipeline:
            saved_model_dir_pcoll, metadata = (
                pipeline | transform_fn_io.ReadTransformFn(path))
            beam_test_util.assert_that(saved_model_dir_pcoll,
                                       beam_test_util.equal_to(
                                           [transform_fn_dir]),
                                       label='AssertSavedModelDir')
            # NOTE: metadata is currently read in a non-deferred manner.
            self.assertEqual(metadata, _TEST_METADATA)
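
For reference, the directory names used in this test follow the standard tf.Transform output layout that ReadTransformFn(path) expects: the SavedModel under path/transform_fn and the schema under path/transformed_metadata. A minimal sketch, assuming the transform_fn_io.TRANSFORM_FN_DIR and transform_fn_io.TRANSFORMED_METADATA_DIR constants carry those values:

import os

from tensorflow_transform.beam.tft_beam_io import transform_fn_io


def expected_transform_output_layout(path):
    # Sketch only: the two subdirectories ReadTransformFn(path) reads from.
    return (os.path.join(path, transform_fn_io.TRANSFORM_FN_DIR),
            os.path.join(path, transform_fn_io.TRANSFORMED_METADATA_DIR))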
Example #2
    def testTransformFnExportAndImportRoundtrip(self):
        transform_fn_dir = os.path.join(self.get_temp_dir(),
                                        'export_transform_fn')
        metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

        with beam.Pipeline() as p:

            def preprocessing_fn(inputs):
                return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

            metadata = self.toMetadata(
                {'x': tf.FixedLenFeature((), tf.float32, 0)})
            columns = p | 'CreateTrainingData' >> beam.Create([{
                'x': v
            } for v in [4, 1, 5, 2]])
            with beam_impl.Context(temp_dir=self.get_temp_dir()):
                _, transform_fn = (
                    (columns, metadata)
                    | 'Analyze and Transform' >>
                    beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = transform_fn | transform_fn_io.WriteTransformFn(
                transform_fn_dir)
            _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir,
                                                          pipeline=p)

        with beam.Pipeline() as p:
            transform_fn = p | transform_fn_io.ReadTransformFn(transform_fn_dir)
            metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)
            # Run transform_columns on some eval dataset.
            eval_data = p | 'CreateEvalData' >> beam.Create([{
                'x': v
            } for v in [6, 3]])
            transformed_eval_data, _ = (
                ((eval_data, metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())
            expected_transformed_eval_data = [{
                'x_scaled': v
            } for v in [1.25, 0.5]]
            beam_test_util.assert_that(
                transformed_eval_data,
                beam_test_util.equal_to(expected_transformed_eval_data))
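
A quick worked check of the expected eval values above: scale_to_0_1 learns the training range during analysis (min 1, max 5 from [4, 1, 5, 2]), so each eval point is mapped with (x - min) / (max - min); 6 becomes 1.25 and 3 becomes 0.5.

# Sketch only: the arithmetic behind expected_transformed_eval_data.
train_x = [4, 1, 5, 2]
lo, hi = min(train_x), max(train_x)
for x in [6, 3]:
    print((x - lo) / float(hi - lo))  # -> 1.25, then 0.5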
Example #3
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with beam_impl.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder(schema)
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1)
                            | 'ParseCSV' >> beam.Map(csv_coder.decode))
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (
                    pipeline
                    | 'ReadBigQuery' >> beam.io.Read(
                        beam.io.BigQuerySource(query=query,
                                               use_standard_sql=True))
                    | 'CleanData' >> beam.Map(
                        taxi.clean_raw_data_dict,
                        raw_feature_spec=raw_feature_spec))

            if transform_dir is None:
                transform_fn = (
                    (raw_data, raw_data_metadata)
                    | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))

                _ = (transform_fn
                     | ('WriteTransformFn' >>
                        transform_fn_io.WriteTransformFn(working_dir)))
            else:
                transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                    transform_dir)

            # Shuffling the data before materialization will improve training
            # effectiveness downstream.
            shuffled_data = (
                raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())

            (transformed_data, transformed_metadata) = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'SerializeExamples' >> beam.Map(coder.encode)
                 | 'WriteExamples' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, outfile_prefix),
                     file_name_suffix='.gz'))
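
A hypothetical invocation sketch for transform_data (all paths are placeholders, not from the source): the first call analyzes the training data and writes the transform_fn under working_dir; the second call passes transform_dir so the saved transform_fn is reloaded with ReadTransformFn and applied to the eval data, as the docstring's hint suggests.

# Hypothetical usage sketch; every path and filename here is a placeholder.
transform_data(
    input_handle='/data/taxi/train.csv',
    outfile_prefix='train_transformed',
    working_dir='/data/taxi/working_dir',
    schema_file='/data/taxi/schema.pbtxt')

transform_data(
    input_handle='/data/taxi/eval.csv',
    outfile_prefix='eval_transformed',
    working_dir='/data/taxi/working_dir',
    schema_file='/data/taxi/schema.pbtxt',
    transform_dir='/data/taxi/working_dir')  # reuse the transform_fn written above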
Example #4
    def _RunBeamImpl(self, analyze_data_list: List[executor._Dataset],
                     transform_data_list: List[executor._Dataset],
                     transform_graph_uri: Text,
                     input_dataset_metadata: dataset_metadata.DatasetMetadata,
                     transform_output_path: Text,
                     raw_examples_data_format: int, temp_path: Text,
                     compute_statistics: bool,
                     per_set_stats_output_paths: Sequence[Text],
                     materialization_format: Optional[Text],
                     analyze_paths_count: int) -> executor._Status:
        """Perform data preprocessing with TFT.

    Args:
      analyze_data_list: List of datasets for analysis.
      transform_data_list: List of datasets for transform.
      transform_graph_uri: URI of a previously computed transform graph
        (TFTransformOutput) whose transform_fn is loaded instead of computing
        a new one.
      input_dataset_metadata: A DatasetMetadata object for the input data.
      transform_output_path: An absolute path to write the output to.
      raw_examples_data_format: The data format of the raw examples. One of the
        enums from example_gen_pb2.PayloadFormat.
      temp_path: A path to a temporary dir.
      compute_statistics: A bool indicating whether or not to compute
        statistics.
      per_set_stats_output_paths: Paths to per-set statistics output. If empty,
        per-set statistics are not produced.
      materialization_format: A string describing the format of the materialized
        data or None if materialization is not enabled.
      analyze_paths_count: An integer, the number of paths that should be used
        for analysis.

    Returns:
      Status of the execution.
    """
        self._AssertSameTFXIOSchema(analyze_data_list)
        unprojected_typespecs = (
            analyze_data_list[0].tfxio.TensorAdapter().OriginalTypeSpecs())

        tf_transform_output = tft.TFTransformOutput(transform_graph_uri)

        analyze_input_columns = tft.get_analyze_input_columns(
            tf_transform_output.transform_raw_features, unprojected_typespecs)
        transform_input_columns = tft.get_transform_input_columns(
            tf_transform_output.transform_raw_features, unprojected_typespecs)
        # Use the same dataset (same columns) for AnalyzeDataset and computing
        # pre-transform stats so that the data will only be read once for these
        # two operations.
        if compute_statistics:
            analyze_input_columns = list(
                set(
                    list(analyze_input_columns) +
                    list(transform_input_columns)))

        for d in analyze_data_list:
            d.tfxio = d.tfxio.Project(analyze_input_columns)

        self._AssertSameTFXIOSchema(analyze_data_list)
        analyze_data_tensor_adapter_config = (
            analyze_data_list[0].tfxio.TensorAdapterConfig())

        for d in transform_data_list:
            d.tfxio = d.tfxio.Project(transform_input_columns)

        desired_batch_size = self._GetDesiredBatchSize(
            raw_examples_data_format)

        with self._CreatePipeline(transform_output_path) as pipeline:
            with tft_beam.Context(
                    temp_dir=temp_path,
                    desired_batch_size=desired_batch_size,
                    passthrough_keys=self._GetTFXIOPassthroughKeys(),
                    use_deep_copy_optimization=True,
                    use_tfxio=True):
                # pylint: disable=expression-not-assigned
                # pylint: disable=no-value-for-parameter
                # _ = (
                #     pipeline
                #     | 'IncrementPipelineMetrics' >> self._IncrementPipelineMetrics(
                #         len(unprojected_typespecs), len(analyze_input_columns),
                #         len(transform_input_columns), analyze_paths_count))
                #
                # # (new_analyze_data_dict, input_cache) = (
                # #     pipeline
                # #     | 'OptimizeRun' >> self._OptimizeRun(
                # #         input_cache_dir, output_cache_dir, analyze_data_list,
                # #         unprojected_typespecs, preprocessing_fn,
                # #         self._GetCacheSource()))
                #
                # # if input_cache:
                # #   absl.logging.debug('Analyzing data with cache.')
                #
                # full_analyze_dataset_keys_list = [
                #     dataset.dataset_key for dataset in analyze_data_list
                # ]
                #
                # # Removing unneeded datasets if they won't be needed for statistics or
                # # materialization.
                # # if materialization_format is None and not compute_statistics:
                # #   if None in new_analyze_data_dict.values():
                # #     absl.logging.debug(
                # #         'Not reading the following datasets due to cache: %s', [
                # #             dataset.file_pattern
                # #             for dataset in analyze_data_list
                # #             if new_analyze_data_dict[dataset.dataset_key] is None
                # #         ])
                # #   analyze_data_list = [
                # #       d for d in new_analyze_data_dict.values() if d is not None
                # #   ]
                #
                # input_analysis_data = {}
                # for dataset in analyze_data_list:
                #   infix = 'AnalysisIndex{}'.format(dataset.index)
                #   dataset.standardized = (
                #       pipeline
                #       | 'TFXIOReadAndDecode[{}]'.format(infix) >>
                #       dataset.tfxio.BeamSource(desired_batch_size))
                #
                #   input_analysis_data[dataset.dataset_key] = dataset.standardized
                # # input_analysis_data = {}
                # # for key, dataset in new_analyze_data_dict.items():
                # #   input_analysis_data[key] = (
                # #       None if dataset is None else dataset.standardized)
                #
                # # transform_fn, cache_output = (
                # #     (input_analysis_data, input_cache,
                # #      analyze_data_tensor_adapter_config)
                #     # | 'Analyze' >> tft_beam.AnalyzeDatasetWithCache(
                #     #     preprocessing_fn, pipeline=pipeline))
                # transform_fn = (
                #     (input_analysis_data, analyze_data_tensor_adapter_config)
                #     | 'Analyze' >> tft_beam.AnalyzeDataset(
                #         tf_transform_output.transform_raw_features, pipeline=pipeline))

                # WriteTransformFn writes transform_fn and metadata to subdirectories
                # tensorflow_transform.SAVED_MODEL_DIR and
                # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
                # (transform_fn
                #  | 'WriteTransformFn'
                #  >> tft_beam.WriteTransformFn(transform_output_path))

                if compute_statistics or materialization_format is not None:
                    transform_fn = (
                        pipeline
                        | transform_fn_io.ReadTransformFn(transform_graph_uri))

                    # Do not compute pre-transform stats if the input format is raw proto,
                    # as StatsGen would treat any input as tf.Example. Note that
                    # tf.SequenceExamples are wire-format compatible with tf.Examples.
                    if (compute_statistics and not self._IsDataFormatProto(
                            raw_examples_data_format)):
                        # Aggregated feature stats before transformation.
                        pre_transform_feature_stats_path = os.path.join(
                            transform_output_path, tft.TFTransformOutput.
                            PRE_TRANSFORM_FEATURE_STATS_PATH)

                        if self._IsDataFormatSequenceExample(
                                raw_examples_data_format):
                            schema_proto = None
                        else:
                            schema_proto = executor._GetSchemaProto(
                                input_dataset_metadata)

                        if self._IsDataFormatSequenceExample(
                                raw_examples_data_format):

                            def _ExtractRawExampleBatches(record_batch):
                                return record_batch.column(
                                    record_batch.schema.get_field_index(
                                        RAW_EXAMPLE_KEY)).flatten().to_pylist(
                                        )

                            # Make use of the fact that tf.SequenceExample is wire-format
                            # compatible with tf.Example
                            stats_input = []
                            for dataset in analyze_data_list:
                                infix = 'AnalysisIndex{}'.format(dataset.index)
                                stats_input.append(
                                    dataset.standardized
                                    | 'ExtractRawExampleBatches[{}]'.format(
                                        infix) >> beam.Map(
                                            _ExtractRawExampleBatches)
                                    |
                                    'DecodeSequenceExamplesAsExamplesIntoRecordBatches[{}]'
                                    .format(infix) >> beam.ParDo(
                                        self._ToArrowRecordBatchesFn(
                                            schema_proto)))
                        else:
                            stats_input = [
                                dataset.standardized
                                for dataset in analyze_data_list
                            ]

                        pre_transform_stats_options = (
                            transform_stats_options.
                            get_pre_transform_stats_options())
                        (stats_input
                         | 'FlattenAnalysisDatasets' >>
                         beam.Flatten(pipeline=pipeline)
                         | 'GenerateStats[FlattenedAnalysisDataset]' >>
                         self._GenerateStats(
                             pre_transform_feature_stats_path,
                             schema_proto,
                             stats_options=pre_transform_stats_options))

                    # transform_data_list is a superset of analyze_data_list; we pay the
                    # cost of reading the same dataset (analyze_data_list) again here to
                    # prevent certain Beam runners from doing large temp materializations.
                    for dataset in transform_data_list:
                        infix = 'TransformIndex{}'.format(dataset.index)
                        dataset.standardized = (
                            pipeline | 'TFXIOReadAndDecode[{}]'.format(infix)
                            >> dataset.tfxio.BeamSource(desired_batch_size))
                        (dataset.transformed,
                         metadata) = (((dataset.standardized,
                                        dataset.tfxio.TensorAdapterConfig()),
                                       transform_fn)
                                      | 'Transform[{}]'.format(infix) >>
                                      tft_beam.TransformDataset())

                        dataset.transformed_and_serialized = (
                            dataset.transformed
                            | 'EncodeAndSerialize[{}]'.format(infix) >>
                            beam.ParDo(self._EncodeAsSerializedExamples(),
                                       executor._GetSchemaProto(metadata)))

                    if compute_statistics:
                        # Aggregated feature stats after transformation.
                        _, metadata = transform_fn

                        # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
                        # schema. Currently input dataset schema only contains dtypes,
                        # and other metadata is dropped due to roundtrip to tensors.
                        transformed_schema_proto = executor._GetSchemaProto(
                            metadata)

                        for dataset in transform_data_list:
                            infix = 'TransformIndex{}'.format(dataset.index)
                            dataset.transformed_and_standardized = (
                                dataset.transformed_and_serialized
                                | 'FromTransformedToArrowRecordBatches[{}]'.
                                format(infix) >> self._ToArrowRecordBatches(
                                    schema=transformed_schema_proto))

                        post_transform_feature_stats_path = os.path.join(
                            transform_output_path, tft.TFTransformOutput.
                            POST_TRANSFORM_FEATURE_STATS_PATH)

                        post_transform_stats_options = (
                            transform_stats_options.
                            get_post_transform_stats_options())
                        ([
                            dataset.transformed_and_standardized
                            for dataset in transform_data_list
                        ]
                         | 'FlattenTransformedDatasets' >> beam.Flatten()
                         | 'GenerateStats[FlattenedTransformedDatasets]' >>
                         self._GenerateStats(
                             post_transform_feature_stats_path,
                             transformed_schema_proto,
                             stats_options=post_transform_stats_options))

                        if per_set_stats_output_paths:
                            # TODO(b/130885503): Remove duplicate stats gen compute that is
                            # done both on a flattened view of the data, and on each span
                            # below.
                            for dataset in transform_data_list:
                                infix = 'TransformIndex{}'.format(
                                    dataset.index)
                                (dataset.transformed_and_standardized
                                 | 'GenerateStats[{}]'.format(infix) >>
                                 self._GenerateStats(
                                     dataset.stats_output_path,
                                     transformed_schema_proto,
                                     stats_options=post_transform_stats_options
                                 ))

                    if materialization_format is not None:
                        for dataset in transform_data_list:
                            infix = 'TransformIndex{}'.format(dataset.index)
                            (dataset.transformed_and_serialized
                             | 'Materialize[{}]'.format(infix) >>
                             self._WriteExamples(
                                 materialization_format,
                                 dataset.materialize_output_path))

        return executor._Status.OK()
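
One detail worth calling out: the `_, metadata = transform_fn` unpacking above works because the value ReadTransformFn produces behaves as a two-element pair, a deferred PCollection holding the SavedModel directory plus the transform output's metadata (the same pair asserted on in Example #1). A minimal sketch of that shape, with placeholder names:

# Sketch only: the pair consumed by TransformDataset and unpacked above.
transform_fn = pipeline | transform_fn_io.ReadTransformFn(transform_graph_uri)
saved_model_dir_pcoll, transform_output_metadata = transform_fn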
Example #5
def write_to_tfrecord(args):
    """
    This function is supposed to be called as a script.
    """
    # Decode arguments
    (current_index, num_shards, train_split_fname_out, eval_split_fname_out,
     exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord,
     working_dir, data_formatter_module_path) = args

    # num_shards = "32"
    current_index, num_shards = int(current_index), int(num_shards)

    split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
        train_split_fname_out, current_index, num_shards) + '*'
    split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
        eval_split_fname_out, current_index, num_shards)

    log.info('exp_log_data_file_train_tfrecord {}'.format(
        exp_log_data_file_train_tfrecord))
    log.info('exp_log_data_file_eval_tfrecord {}'.format(
        exp_log_data_file_eval_tfrecord))
    log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
    log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

    data_formatter = import_from_uri(
        data_formatter_module_path).DataFormatter()

    # Set up the preprocessing pipeline.
    pipeline = beam.Pipeline(runner=DirectRunner())

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # Read the raw data files: CSV rows ordered according to the
        # `data_formatter`, which are then decoded into a cleaned-up format.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                split_train_file_pattern, skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        raw_eval_data = (
            pipeline
            | 'ReadEvalDataFile' >> textio.ReadFromText(
                split_eval_file_pattern, skip_header_lines=0)
            | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        # Examples in tf-example format (for model analysis purposes).
        # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
        # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
        #
        # _ = (
        #         raw_eval_data
        #         | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        #         | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
        #     '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
        #     shard_name_template='', num_shards=1)
        # )

        # Read the SavedModel and metadata from the two subdirectories of working_dir, given by
        # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
        transform_fn = (pipeline
                        | 'ReadTransformGraph' >>
                        transform_fn_io.ReadTransformFn(working_dir))

        # Applies the transformation `transform_fn` to the raw train dataset.
        (transformed_train_data, transformed_metadata) = (
            ((raw_train_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformTrainData' >> beam_impl.TransformDataset())

        # Applies the transformation `transform_fn` to the raw eval dataset
        (transformed_eval_data, transformed_metadata) = (
            ((raw_eval_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformEvalData' >> beam_impl.TransformDataset())

        # The schema of the transformed data is used to build a coder that encodes
        # transformed examples as serialized tf.Example protos for writing to
        # TFRecord files (the tf binary data format).
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (transformed_train_data
             | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteTrainDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_train_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

        _ = (transformed_eval_data
             | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteEvalDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_eval_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

    result = pipeline.run()
    result.wait_until_finish()
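
A quick illustration of the shard-name pattern built near the top of this function, e.g. for shard index 3 of 32:

# Sketch only: what the '{}-{:05}-of-{:05}' pattern expands to.
print('{}-{:05}-of-{:05}'.format('train_split', 3, 32))
# -> train_split-00003-of-00032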
Example #6
def tftransform(
    pipeline_args,  # type: List[str]
    temp_location,  # type: str
    schema_file,  # type: str
    output_dir,  # type: str
    preprocessing_fn,  # type: Any
    training_data=None,  # type: Union[None, str]
    evaluation_data=None,  # type: Union[None, str]
    transform_fn_dir=None,  # type: Union[None, str]
    compression_type=None  # type: str
):  # type: (...) -> PipelineState
    """
    Generic tf.transform pipeline that takes tf.{example, record} training and evaluation
    datasets and outputs transformed data together with transform function Saved Model.

    :param pipeline_args: un-parsed Dataflow arguments
    :param temp_location: temporary location for dataflow job working dir
    :param schema_file: path to the raw feature schema text file
    :param output_dir: output dir for transformed data and function
    :param preprocessing_fn: tf.transform preprocessing function
    :param training_data: path to the training data
    :param evaluation_data: path to the evaluation data
    :param transform_fn_dir: dir to previously saved transformation function to apply
    :param compression_type: compression type for writing of tf.records
    :return: the final state of the Beam pipeline
    """
    assert_not_empty_string(temp_location)
    assert_not_empty_string(schema_file)
    assert_not_empty_string(output_dir)
    assert_not_none(preprocessing_fn)

    if compression_type is None:
        compression_type = CompressionTypes.AUTO

    raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
    raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

    transformed_train_output_dir = os.path.join(output_dir, "training")
    transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

    if not any(i.startswith("--job_name") for i in pipeline_args):
        pipeline_args.append("--job_name=tf-transform-{}-{}".format(
            getpass.getuser(), int(time.time())))

    pipeline = beam.Pipeline(argv=pipeline_args)
    with beam_impl.Context(temp_dir=temp_location):
        if training_data is not None:
            # if training data is provided, transform_fn_dir will be ignored
            if transform_fn_dir is not None:
                warnings.warn(
                    "Transform_fn_dir is ignored because training_data is provided"
                )

            transform_fn_output = os.path.join(output_dir, "transform_fn",
                                               "saved_model.pb")
            if FileSystems.exists(transform_fn_output):
                raise ValueError("Transform fn already exists at %s!" %
                                 transform_fn_output)

            # compute the transform_fn and apply to the training data
            raw_train_data = (pipeline
                              | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
                                  training_data, coder=raw_data_coder))

            ((transformed_train_data, transformed_train_metadata),
             transform_fn) = (
                 (raw_train_data, raw_data_metadata)
                 | ("AnalyzeAndTransformTrainData" >>
                    beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
             )  # noqa: E501

            _ = (  # noqa: F841
                transform_fn
                | "WriteTransformFn" >>
                transform_fn_io.WriteTransformFn(output_dir))

            transformed_train_coder = ExampleProtoCoder(
                transformed_train_metadata.schema)
            _ = (  # noqa: F841
                transformed_train_data
                | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_train_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_train_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
        else:
            if transform_fn_dir is None:
                raise ValueError(
                    "Either training_data or transformed_fn needs to be provided"
                )
            # load the transform_fn
            transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                transform_fn_dir)

        if evaluation_data is not None:
            # if evaluation_data exists, apply the transform_fn to the evaluation data
            raw_eval_data = (pipeline
                             | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
                                 evaluation_data, coder=raw_data_coder))

            (transformed_eval_data, transformed_eval_metadata) = (
                ((raw_eval_data, raw_data_metadata), transform_fn)
                | "TransformEvalData" >> beam_impl.TransformDataset())

            transformed_eval_coder = ExampleProtoCoder(
                transformed_eval_metadata.schema)
            _ = (  # noqa: F841
                transformed_eval_data
                | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_eval_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_eval_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
    result = pipeline.run().wait_until_finish()

    return result
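
A hypothetical call sketch for the apply-only branch above (all arguments are placeholders, and my_preprocessing_fn is assumed to be defined elsewhere): with training_data omitted, the previously written transform_fn_dir is loaded via ReadTransformFn and applied to the evaluation data. Note that preprocessing_fn is still required by the asserts at the top of the function.

# Hypothetical usage sketch; arguments are placeholders, not from the source.
state = tftransform(
    pipeline_args=['--runner=DirectRunner'],
    temp_location='/tmp/tft-temp',
    schema_file='/path/to/schema.txt',
    output_dir='/tmp/tft-output',
    preprocessing_fn=my_preprocessing_fn,
    evaluation_data='/data/eval/part-*.tfrecords',
    transform_fn_dir='/path/to/previous/output_dir')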
Example #7
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   setup_file,
                   ts1,
                   ts2,
                   project=None,
                   max_rows=None,
                   mode=None,
                   stage=None,
                   preprocessing_fn=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as
      DATASET.TABLE or path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples.
    working_dir: Directory in which transformed examples and transform
      function will be emitted.
    setup_file: Path to the setup.py file passed to the Dataflow job when
      running in 'cloud' mode.
    ts1: First timestamp passed to make_sql when building the BigQuery query.
    ts2: Second timestamp passed to make_sql when building the BigQuery query.
    project: GCP project used to run the pipeline and the BigQuery query.
    max_rows: Number of rows to query from BigQuery.
    mode: 'local' to run with DirectRunner or 'cloud' to run with
      DataflowRunner.
    stage: Stage name used in the job name, query, and output file names;
      defaults to 'train'.
    preprocessing_fn: Optional preprocessing_fn; defaults to the built-in
      def_preprocessing_fn.
  """
    def def_preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    preprocessing_fn = preprocessing_fn or def_preprocessing_fn

    print('ts1 %s, ts2 %s' % (ts1, ts2))

    schema = taxi.read_schema('./schema.pbtxt')
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    transform_dir = None

    temp_dir = os.path.join(working_dir, 'tmp')
    if stage is None:
        stage = 'train'

    if mode == 'local':
        options = {'project': project}
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DirectRunner'
    elif mode == 'cloud':
        options = {
            'job_name': 'tft-' + stage + '-' + str(uuid.uuid4()),
            'temp_location': temp_dir,
            'project': project,
            'save_main_session': True,
            'setup_file': setup_file
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=temp_dir):
            csv_coder = taxi.make_csv_coder(schema)
            # temp tft bug workaround
            mcsv_coder = make_mcsv_coder(schema)
            if 'csv' in input_handle.lower():
                # if input_handle.lower().endswith('csv'):
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1)
                            | 'ParseCSV' >> beam.Map(csv_coder.decode))
            else:
                query = make_sql(input_handle,
                                 ts1,
                                 ts2,
                                 stage,
                                 max_rows=max_rows,
                                 for_eval=False)
                raw_data1 = (pipeline
                             | 'ReadBigQuery' >> beam.io.Read(
                                 beam.io.BigQuerySource(
                                     query=query, use_standard_sql=True)))
                raw_data = (
                    raw_data1
                    | 'CleanData' >>
                    beam.Map(lambda x:
                             (taxi.clean_raw_data_dict(x, raw_feature_spec))))

            if transform_dir is None:
                transform_fn = (
                    (raw_data, raw_data_metadata)
                    | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))

                _ = (transform_fn
                     | ('WriteTransformFn' >>
                        transform_fn_io.WriteTransformFn(working_dir)))
            else:
                transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                    transform_dir)

            # Shuffling the data before materialization will improve training
            # effectiveness downstream.
            shuffled_data = (
                raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())

            (transformed_data, transformed_metadata) = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            if 'csv' not in input_handle.lower():  # if querying BQ
                _ = (raw_data
                     | beam.Map(mcsv_coder.encode)
                     | beam.io.WriteToText(os.path.join(
                         working_dir, '{}.csv'.format(stage)),
                                           num_shards=1))

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'SerializeExamples' >> beam.Map(coder.encode)
                 | 'WriteExamples' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, outfile_prefix),
                     file_name_suffix='.gz'))
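
A short worked check of the label rule in the preprocessing_fn above: the label is 1 when the tip exceeds 20% of the fare and 0 otherwise, with missing (NaN) fares forced to 0. The sketch below mirrors the tf.where/tf.greater logic in plain Python.

# Sketch only: the label arithmetic, mirroring the tf.where/tf.greater logic.
import math


def big_tipper_label(fare, tip):
    if math.isnan(fare):
        return 0
    return int(tip > 0.2 * fare)


print(big_tipper_label(10.0, 2.5))          # 25% tip -> 1
print(big_tipper_label(10.0, 1.5))          # 15% tip -> 0
print(big_tipper_label(float('nan'), 2.0))  # missing fare -> 0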