Example No. 1
  def test_non_training(self):
    """Tests case where dataset contains non-training (e.g. test) data."""

    with self.pipeline as p:
      with tft_beam.Context(temp_dir=os.path.join(self.test_dir, 'tmp')):

        df = self.raw_df[self.raw_df.split == 'TEST']
        dataset = self._get_dataset(p, df)
        transform_fn = p | tft_beam.ReadTransformFn(self.transform_fn_path)
        beam_pipeline._transform_and_write_tfr(
            dataset, self.tfr_writer, transform_fn=transform_fn,
            raw_metadata=self.raw_metadata, label='Test')

    self.assertFalse(glob.glob(os.path.join(self.test_dir, 'train*.gz')))
    self.assertFalse(glob.glob(os.path.join(self.test_dir, 'validation*.gz')))
    self.assertTrue(glob.glob(os.path.join(self.test_dir, 'test*.gz')))
Example No. 2
def _main(argv=None):
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_examples_path', required=True)
    parser.add_argument('--raw_examples_schema_path', required=True)
    parser.add_argument('--transform_fn_dir', required=True)
    parser.add_argument('--transformed_examples_path_prefix', required=True)
    known_args, pipeline_args = parser.parse_known_args(argv)

    raw_examples_schema = load_schema(known_args.raw_examples_schema_path)
    raw_examples_coder = tft.coders.ExampleProtoCoder(raw_examples_schema)
    raw_examples_metadata = dataset_metadata.DatasetMetadata(
        raw_examples_schema)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=get_beam_temp_dir(pipeline_options)):
            transform_fn = pipeline | tft_beam.ReadTransformFn(
                known_args.transform_fn_dir)
            raw_examples = (
                pipeline
                | 'ReadRawExamples' >> beam.io.ReadFromTFRecord(
                    known_args.raw_examples_path, coder=raw_examples_coder))
            raw_examples_dataset = (raw_examples, raw_examples_metadata)
            transformed_examples, transform_examples_metadata = (
                (raw_examples_dataset, transform_fn)
                | tft_beam.TransformDataset())
            transformed_examples_coder = tft.coders.ExampleProtoCoder(
                transform_examples_metadata.schema)
            _ = (
                transformed_examples
                | 'WriteTransformedExamples' >> beam.io.WriteToTFRecord(
                    known_args.transformed_examples_path_prefix,
                    file_name_suffix='.tfrecord.gz',
                    coder=transformed_examples_coder))
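A minimal sketch of driving this entry point directly; a module like this would normally end with a `__main__` guard, and the flag values below are hypothetical placeholders rather than paths from the original source.

if __name__ == '__main__':
    # Hypothetical arguments for a local run; extra flags such as --runner
    # are not parsed by argparse and are forwarded to the Beam pipeline
    # through pipeline_args.
    _main([
        '--raw_examples_path=/tmp/raw/examples-*.tfrecord.gz',
        '--raw_examples_schema_path=/tmp/raw/schema.pbtxt',
        '--transform_fn_dir=/tmp/transform_output',
        '--transformed_examples_path_prefix=/tmp/transformed/examples',
        '--runner=DirectRunner',
    ])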
Example No. 3
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder(schema)
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1))
                decode_transform = beam.Map(csv_coder.decode)
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (pipeline
                            | 'ReadBigQuery' >> beam.io.Read(
                                beam.io.BigQuerySource(query=query,
                                                       use_standard_sql=True)))
                decode_transform = beam.Map(taxi.clean_raw_data_dict,
                                            raw_feature_spec=raw_feature_spec)

            if transform_dir is None:
                decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
                transform_fn = (
                    (decoded_data, raw_data_metadata) |
                    ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

                _ = (transform_fn
                     | ('WriteTransformFn' >>
                        tft_beam.WriteTransformFn(working_dir)))
            else:
                transform_fn = pipeline | tft_beam.ReadTransformFn(
                    transform_dir)

            # Shuffling the data before materialization will improve training
            # effectiveness downstream. Here we shuffle the raw_data (as opposed
            # to decoded data) since it has a compact representation.
            shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

            decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
            (transformed_data, transformed_metadata) = (
                ((decoded_data, raw_data_metadata), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'SerializeExamples' >> beam.Map(coder.encode)
                 | 'WriteExamples' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, outfile_prefix),
                     file_name_suffix='.gz'))
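The preprocessing_fn above calls a _fill_in_missing helper that is not included in this excerpt. A typical TF 1.x implementation, in the style of the TFX taxi example this code appears to follow (an assumption, not shown in the original snippet), looks like this:

def _fill_in_missing(x):
    """Replaces missing values in a SparseTensor and densifies it.

    Fills in missing values of `x` with '' or 0, and converts the result to a
    dense tensor of shape [None], dropping the trailing dimension of size 1.
    """
    default_value = '' if x.dtype == tf.string else 0
    return tf.squeeze(
        tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value),
        axis=1)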
Example No. 4
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def transform_ngrams(input, ngram_range):
    """Helper function to transform ngrams and optionally print the output."""
    # This tf.Print statement causes the output to concatenate with itself!
    # input = tf.Print(input, [input], "raw input:", first_n=-1, summarize=100)

    transformed = transform.ngrams(
      tf.string_split(input, delimiter=" "),
      ngram_range=ngram_range,
      separator=' ')

    # A SparseTensor cannot be printed directly because it is made up of three
    # tensors. The trick below prints the values column, but without the
    # indices it is not very meaningful.
    #
    # values = tf.Print(transformed.values, [transformed.values], "ngram output:")
    # transformed = tf.SparseTensor(
    #       indices=transformed.indices,
    #       values=values,
    #       dense_shape=transformed.dense_shape)
    return transformed

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.
    https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      print('processing key', key)
      print('input:', inputs[key])
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    # for key in taxi.FEATURE_NGRAM:
    #   # Extract ngrams and build a vocab.
    #   outputs[
    #       taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
    #           transform.ngrams(
    #             tf.string_split(_fill_in_missing(inputs[key])),
    #             ngram_range=taxi.NGRAM_RANGE,
    #             separator=' '),
    #           top_k=512,
    #           num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.FEATURE_NGRAM:
      # Extract ngrams and build a vocab.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            transform_ngrams(_fill_in_missing(inputs[key]), taxi.NGRAM_RANGE),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema, input_handle.lower())
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata) |
            ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >>
               tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix), file_name_suffix='.gz')
      )
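A hypothetical pair of calls showing how transform_data might be driven, first analyzing the training CSV and then re-using the resulting transform_fn for eval data via transform_dir. All paths and prefixes below are illustrative placeholders, not taken from the original source.

# Analyze and transform the training data; writes the transform_fn to
# working_dir alongside the transformed examples.
transform_data(
    input_handle='/tmp/taxi/train/data.csv',
    outfile_prefix='train_transformed',
    working_dir='/tmp/taxi/transform_output',
    schema_file='/tmp/taxi/schema.pbtxt',
    pipeline_args=['--runner=DirectRunner'])

# Transform the eval data using the transform_fn computed above instead of
# re-analyzing it.
transform_data(
    input_handle='/tmp/taxi/eval/data.csv',
    outfile_prefix='eval_transformed',
    working_dir='/tmp/taxi/eval_output',
    schema_file='/tmp/taxi/schema.pbtxt',
    transform_dir='/tmp/taxi/transform_output',
    pipeline_args=['--runner=DirectRunner'])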
Example No. 5
def transform_data(bq_table,
                   step,
                   schema_file,
                   working_dir,
                   outfile_prefix,
                   max_rows=None,
                   transform_dir=None,
                   pipeline_args=None):
    """Analyzes and transforms data read from BigQuery with tf.transform.

    Args:
      bq_table: BigQuery table to process, specified as DATASET.TABLE.
      step: Dataset split to process (e.g. train or test); forwarded to the
        SQL query builder.
      schema_file: A file path that contains a text-serialized TensorFlow
        metadata schema of the input data.
      working_dir: Directory in which transformed examples and the transform
        function will be emitted.
      outfile_prefix: Filename prefix for emitted transformed examples.
      max_rows: Number of rows to query from BigQuery.
      transform_dir: Directory in which the transform output is located. If
        provided, this will load the transform_fn from disk instead of
        computing it over the data. Hint: this is useful for transforming
        eval data.
      pipeline_args: Additional DataflowRunner or DirectRunner args passed to
        the Beam pipeline.
    """

    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in my_metadata.NUMERIC_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[my_metadata.transformed_name(key)] = transform.scale_to_z_score(_fill_in_missing(inputs[key]))

        for key in my_metadata.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[my_metadata.transformed_name(key)] = transform.compute_and_apply_vocabulary(
                _fill_in_missing(inputs[key]),
                vocab_filename=my_metadata.transformed_name(key),
                num_oov_buckets=my_metadata.OOV_SIZE,
                top_k=my_metadata.VOCAB_SIZE
            )

        for key, hash_buckets in my_metadata.HASH_STRING_FEATURE_KEYS.items():
            outputs[my_metadata.transformed_name(key)] = transform.hash_strings(
                _fill_in_missing(inputs[key]),
                hash_buckets=hash_buckets
            )

        for key, nb_buckets in my_metadata.TO_BE_BUCKETIZED_FEATURE.items():
            outputs[my_metadata.transformed_name(key + '_bucketized')] = transform.bucketize(
                _fill_in_missing(inputs[key]), nb_buckets)

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[my_metadata.FARE_KEY])
        tips = _fill_in_missing(inputs[my_metadata.LABEL_KEY])
        outputs[my_metadata.transformed_name(my_metadata.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(
                tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                tf.int64))

        return outputs

    schema = my_metadata.read_schema(schema_file)
    raw_feature_spec = my_metadata.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            query = sql_queries.get_train_test_sql_query(bq_table, step, max_rows)
            raw_data = (
                pipeline
                | 'ReadBigQuery' >> beam.io.Read(
                    beam.io.BigQuerySource(query=query, use_standard_sql=True))
                | 'CleanData' >> beam.Map(
                    my_metadata.clean_raw_data_dict,
                    raw_feature_spec=raw_feature_spec))

            if transform_dir is None:
                transform_fn = (
                    (raw_data, raw_data_metadata)
                    | 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn))

                _ = (
                    transform_fn
                    | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
            else:
                transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

            # Shuffling the data before materialization will improve training
            # effectiveness downstream.
            shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

            (transformed_data, transformed_metadata) = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
            _ = (
                transformed_data
                | 'SerializeExamples' >> beam.Map(coder.encode)
                | 'WriteExamples' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, outfile_prefix)))
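The transformed examples written by these pipelines are typically read back for training. The sketch below is an assumption rather than part of the original source; it uses tft.TFTransformOutput to recover the transformed feature spec from the transform output written to working_dir.

import os

import tensorflow as tf
import tensorflow_transform as tft


def make_input_fn(working_dir, outfile_prefix, batch_size=64):
    """Returns an input_fn that reads the transformed TFRecords for training."""
    tf_transform_output = tft.TFTransformOutput(working_dir)
    feature_spec = tf_transform_output.transformed_feature_spec()
    file_pattern = os.path.join(working_dir, outfile_prefix + '*')

    def input_fn():
        # This example writes uncompressed records (no '.gz' suffix); pass
        # compression_type='GZIP' when reading the gzipped output of the
        # earlier examples.
        dataset = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
        dataset = dataset.batch(batch_size)
        return dataset.map(
            lambda serialized: tf.parse_example(serialized, feature_spec))

    return input_fn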