Example #1
def _run_tft_fn(raw_data, tft_fn, transform_fn_path, user_freq, item_freq):
    """Applys the TensorFlow Transform function to the given data.

  Args:
    raw_data: a dict of shape {$user_key: $user_id, $item_key: ...}.
    tft_fn: a TensorFlow Transform function.
    transform_fn_path: the location to save transformation outputs to.
    user_freq: minimum frequency of a user to include it in the user vocab.
    item_freq: minimum frequency of an item to include it in the item vocab.

  Returns:
    A PCollection of dicts, where each dict is an element of raw_data with
      tft_fn applied to it:
      {$user_key: $user_id, $item_key: $item_id, $count_key: $count}.
  """
    raw_data_metadata = tft.tf_metadata.dataset_metadata.DatasetMetadata(
        tft.tf_metadata.dataset_schema.from_feature_spec(constants.TRAIN_SPEC))
    transformed_dataset, transform_fn = (
        (raw_data, raw_data_metadata)
        | beam_impl.AnalyzeAndTransformDataset(
            lambda x: tft_fn(x, user_freq, item_freq)))
    (transform_fn | "WriteTransformFn" >>
     tft.beam.tft_beam_io.transform_fn_io.WriteTransformFn(
         os.path.join(transform_fn_path, "transform_fn")))
    return transformed_dataset[0]
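
A minimal sketch of how a helper like _run_tft_fn might be driven from a Beam pipeline. Everything below is an assumption for illustration: the record fields, the stand-in tft_fn, the frequency thresholds and the paths are placeholders, and the real raw_data would have to match constants.TRAIN_SPEC.

import apache_beam as beam
from tensorflow_transform.beam import impl as beam_impl

def example_tft_fn(inputs, user_freq, item_freq):
    # Hypothetical stand-in for the real tft_fn supplied by the caller.
    return inputs

with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir='/tmp/tft_tmp'):  # hypothetical temp dir
        raw_data = pipeline | 'CreateRawData' >> beam.Create(
            [{'user': 'u1', 'item': 'i1'}, {'user': 'u1', 'item': 'i2'}])  # hypothetical records
        transformed = _run_tft_fn(raw_data,
                                  tft_fn=example_tft_fn,
                                  transform_fn_path='/tmp/tft_output',  # hypothetical path
                                  user_freq=1,
                                  item_freq=1)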
Example #2
    def testComposedTransforms(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {
                'a(b+c)':
                tft.map(tf.multiply, inputs['a'],
                        tft.map(tf.add, inputs['b'], inputs['c']))
            }

        input_data = [{'a': 4, 'b': 3, 'c': 3}, {'a': 1, 'b': 2, 'c': 1}]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.float32, 0),
            'b':
            tf.FixedLenFeature((), tf.float32, 0),
            'c':
            tf.FixedLenFeature((), tf.float32, 0)
        })
        transformed_dataset, _ = ((input_data, input_metadata)
                                  | beam_impl.AnalyzeAndTransformDataset(
                                      preprocessing_fn,
                                      os.path.join(self.get_temp_dir(),
                                                   'composed')))

        expected_transformed_data = [{'a(b+c)': 24}, {'a(b+c)': 3}]
        expected_transformed_metadata = self.toMetadata(
            {'a(b+c)': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #3
    def testMultipleLevelsOfAnalysis(self):
        # Test a preprocessing function similar to scale_to_0_1 except that it
        # involves multiple interleavings of analyzers and transforms.
        def preprocessing_fn(inputs):
            scaled_to_0 = tft.map(lambda x, y: x - y, inputs['x'],
                                  tft.min(inputs['x']))
            scaled_to_0_1 = tft.map(lambda x, y: x / y, scaled_to_0,
                                    tft.max(scaled_to_0))
            return {'x_scaled': scaled_to_0_1}

        metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        input_columns = [{'x': v} for v in [4, 1, 5, 2]]
        input_dataset = (input_columns, metadata)

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed, _ = (
                input_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        output_columns, _ = transformed

        self.assertEqual(output_columns, [{
            'x_scaled': v
        } for v in [0.75, 0.0, 1.0, 0.25]])
Example #4
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--cloud', type=str,
                        help="Pass 'y' to run the pipeline on Cloud Dataflow.")
    args = parser.parse_args(argv)  # Parse the command-line arguments.
    if args.cloud == "y":
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = beam.pipeline.PipelineOptions(
            flags=[], **{'project': "iotpubsub-1536350750202"})
    with beam_impl.Context(temp_dir="gs://relation_extraction/beam"):
        p = beam.Pipeline(options=pipeline_options)
        train_data, test_data = (p | "Read from bigquery" >> ReadBigQuery())

        (test_data | "test it" >> beam.Map(printy))
        train_data = (train_data, train_metadata)
        train_dataset, transform_fn = (
            train_data
            | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
        test_data = (test_data, train_metadata)
        test_data, _ = ((test_data, transform_fn)
                        | 'Transform test data' >> beam_impl.TransformDataset())
        train_data, transformed_metadata = train_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        _ = (train_data
                | 'Encode train data to save it' >> beam.Map(transformed_data_coder.encode)
                | 'Write the train data to tfrecords' >> tfrecordio.WriteToTFRecord(os.path.join("gs://relation_extraction/beam/Train","TRAIN"))
                )
        _ = (test_data
                | 'Encode test data to save it' >> beam.Map(transformed_data_coder.encode)
                | 'Write the test data to tfrecords' >> tfrecordio.WriteToTFRecord(os.path.join("gs://relation_extraction/beam/Test","TEST"))
                )
        _ = (transform_fn | "WriteTransformFn" >> transform_fn_io.WriteTransformFn("gs://relation_extraction/beam/"))

        p.run().wait_until_finish()
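
get_cloud_pipeline_options() is not shown in this snippet; a plausible sketch is below, assuming it simply builds Dataflow pipeline options. Only the project id is taken from the snippet above, the job name and bucket paths are placeholders.

from apache_beam.options.pipeline_options import PipelineOptions

def get_cloud_pipeline_options():
    # Assumed implementation: run on Cloud Dataflow instead of the local runner.
    options = {
        'runner': 'DataflowRunner',
        'project': 'iotpubsub-1536350750202',
        'job_name': 'relation-extraction-preprocess',                 # hypothetical
        'temp_location': 'gs://relation_extraction/beam/tmp',         # hypothetical path
        'staging_location': 'gs://relation_extraction/beam/staging',  # hypothetical path
    }
    return PipelineOptions(flags=[], **options)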
Example #5
    def testTransformUnicode(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            def tito_string_join(*tensors):
                return tf.string_join(tensors, separator=' ')

            return {'a b': tft.map(tito_string_join, inputs['a'], inputs['b'])}

        input_data = [{
            'a': 'Hello',
            'b': 'world'
        }, {
            'a': 'Hello',
            'b': u'κόσμε'
        }]
        input_metadata = self.toMetadata({
            'a': tf.FixedLenFeature((), tf.string),
            'b': tf.FixedLenFeature((), tf.string)
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'a b': 'Hello world'
        }, {
            'a b': u'Hello κόσμε'.encode('utf-8')
        }]
        expected_transformed_metadata = self.toMetadata(
            {'a b': tf.FixedLenFeature((), tf.string, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #6
    def testUniquesAnalyzerWithTokenization(self):
        # User defined transform_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {
                'index':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']))
            }

        input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string, ''),
        })

        transformed_dataset, _ = ((input_data, input_metadata)
                                  | beam_impl.AnalyzeAndTransformDataset(
                                      preprocessing_fn,
                                      os.path.join(self.get_temp_dir(),
                                                   'uniques_tokenize')))

        expected_transformed_data = [{
            'index': [0, 0, 1],
        }, {
            'index': [0, 2, 1]
        }]
        expected_transformed_metadata = self.toMetadata(
            {'index': tf.VarLenFeature(tf.int64)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #7
    def testUniquesAnalyzerWithFrequencyThreshold(self):
        # User defined transform_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {
                'index1':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-99,
                                  frequency_threshold=2),

                # As above but using a string for frequency_threshold (and changing
                # the default_value to showcase things).
                'index2':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-9,
                                  frequency_threshold='2')
            }

        input_data = [{
            'a': 'hello hello world'
        }, {
            'a': 'hello goodbye world'
        }, {
            'a': 'hello goodbye foo'
        }]
        input_schema = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string, ''),
        })

        transformed_dataset, _ = (
            (input_data, input_schema)
            | beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn,
                os.path.join(self.get_temp_dir(),
                             'uniques_tokenize_frequency_threshold')))

        # Generated vocab (ordered by frequency, then value) should be:
        # ["hello", "world", "goodbye", "foo"]. After applying frequency_threshold=2
        # this becomes
        # ["hello", "world", "goodbye"].
        expected_transformed_data = [{
            'index1': [0, 0, 1],
            'index2': [0, 0, 1]
        }, {
            'index1': [0, 2, 1],
            'index2': [0, 2, 1]
        }, {
            'index1': [0, 2, -99],
            'index2': [0, 2, -9]
        }]
        expected_transformed_schema = self.toMetadata({
            'index1':
            tf.VarLenFeature(tf.int64),
            'index2':
            tf.VarLenFeature(tf.int64)
        })
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_schema))
Example #8
    def testAnalyzeBeforeTransform(self):
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 4}, {'x': 1}, {'x': 5}, {'x': 2}]
        input_metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        transformed_dataset, transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn,
                os.path.join(self.get_temp_dir(),
                             'analyze_before_transform_at')))

        expected_transformed_data = [{
            'x_scaled': 0.75
        }, {
            'x_scaled': 0.0
        }, {
            'x_scaled': 1.0
        }, {
            'x_scaled': 0.25
        }]
        expected_transformed_metadata = self.toMetadata(
            {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))

        # Take the transform function and use TransformDataset to apply it to
        # some eval data, and compare with expected output.
        eval_data = [{'x': 6}, {'x': 3}]
        transformed_eval_dataset = (((eval_data, input_metadata), transform_fn)
                                    | beam_impl.TransformDataset())

        expected_transformed_eval_data = [{
            'x_scaled': 1.25
        }, {
            'x_scaled': 0.5
        }]
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_data, expected_transformed_metadata))

        # Redo test with eval data, using AnalyzeDataset instead of
        # AnalyzeAndTransformDataset to generate transform_fn.
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(
                            preprocessing_fn,
                            os.path.join(self.get_temp_dir(),
                                         'analyze_before_transform_a')))
        transformed_eval_dataset = (((eval_data, input_metadata), transform_fn)
                                    | beam_impl.TransformDataset())
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_data, expected_transformed_metadata))
Example #9
def main(p=None):
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.string_to_int(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    # raw_data_p = p | beam.Create(raw_data)

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema({
            's':
            dataset_schema.ColumnSchema(
                tf.string, [], dataset_schema.FixedColumnRepresentation()),
            'y':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation()),
            'x':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation())
        }))

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (raw_data, raw_data_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

        pprint.pprint(transformed_data)
        (transformed_data
         | beam.io.WriteToText(
             '/Users/luoshixin/Personal/GCPStudy/src/tensorflow/tftransform/tmp'
         ))
Example #10
def data_transform():
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (dict_features, data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        
        transformed_data, transformed_metadata = transformed_dataset
        
        for i in range(len(transformed_data)):
            print('Raw: ', dict_features[i])
            print('Transformed: ', transformed_data[i])
Example #11
def data_transform():

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (dict_features, data_metadata) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))  # preprocessing_fn is called automatically with the correct inputs

    transformed_data, transformed_metadata = transformed_dataset # Break down the dataset

    for i in range(len(transformed_data)):
        print("Raw: ", dict_features[i]) # See the raw data
        print("Transformed:", transformed_data[i]) # Compare with the transformed data
Example #12
def main():
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.compute_and_apply_vocabulary(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema({
            's':
            dataset_schema.ColumnSchema(
                tf.string, [], dataset_schema.FixedColumnRepresentation()),
            'y':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation()),
            'x':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation())
        }))

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (raw_data, raw_data_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

    pprint.pprint(transformed_data)
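
For reference, the pprint output of this example should look roughly like the following: mean(x) = 2, so x_centered is [-1, 0, 1]; y is scaled to [0, 0.5, 1]; and 'hello' gets vocabulary index 0 because it is the most frequent string.

[{'s_integerized': 0,
  'x_centered': -1.0,
  'x_centered_times_y_normalized': -0.0,
  'y_normalized': 0.0},
 {'s_integerized': 1,
  'x_centered': 0.0,
  'x_centered_times_y_normalized': 0.0,
  'y_normalized': 0.5},
 {'s_integerized': 0,
  'x_centered': 1.0,
  'x_centered_times_y_normalized': 1.0,
  'y_normalized': 1.0}]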
Example #13
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.
    Args:
        argv (list): list of arguments
    """
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = None

    p = beam.Pipeline(options=pipeline_options)
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # read data and join by key
        raw_data_input = (p
                          | 'ReadInputData' >> beam.io.ReadFromText(
                              TRAIN_INPUT_DATA, skip_header_lines=1)
                          | 'ParseInputCSV' >> beam.Map(converter_input.decode)
                          | 'ExtractBatchKeyIn' >> beam.Map(extract_batchkey))

        raw_data_output = (
            p
            | 'ReadOutputData' >> beam.io.ReadFromText(TRAIN_OUTPUT_DATA,
                                                       skip_header_lines=1)
            | 'ParseOutputCSV' >> beam.Map(converter_output.decode)
            | 'ExtractBatchKeyOut' >> beam.Map(extract_batchkey))

        raw_data = ((raw_data_input, raw_data_output)
                    | 'JoinData' >> beam.CoGroupByKey()
                    | 'RemoveKeys' >> beam.FlatMap(remove_keys))

        # analyse and transform dataset
        raw_dataset = (raw_data, input_metadata)
        transformed_dataset, transform_fn = (
            raw_dataset
            | 'AnalyzeAndTransform' >>
            beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_data, transformed_metadata = transformed_dataset

        # save data and serialize TransformFn
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'EncodeData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteData' >> tfrecordio.WriteToTFRecord(
                 os.path.join(TFRECORD_DIR, 'records')))
        _ = (transform_fn
             |
             "WriteTransformFn" >> transform_fn_io.WriteTransformFn(MODEL_DIR))

        p.run().wait_until_finish()
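
converter_input and converter_output are defined outside this snippet; a plausible construction is sketched below, assuming they are tft.coders.CsvCoder instances whose decode method turns one CSV line into a feature dict. The column names are placeholders; only input_metadata is taken from the snippet above.

import tensorflow_transform as tft

# Assumed helper: CsvCoder.decode maps a raw CSV line to a dict of features
# matching the given schema.
converter_input = tft.coders.CsvCoder(
    ['batch_key', 'feature_1', 'feature_2'],  # hypothetical column order
    input_metadata.schema,
    delimiter=',')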
Example #14
    def testNumericAnalyzersWithNDInputs(self):
        def preprocessing_fn(inputs):
            def repeat(in_tensor, value):
                batch_size = tf.shape(in_tensor)[0]
                return tf.ones([batch_size], value.dtype) * value

            return {
                'min': tft.map(repeat, inputs['a'], tft.min(inputs['a'])),
                'max': tft.map(repeat, inputs['a'], tft.max(inputs['a'])),
                'sum': tft.map(repeat, inputs['a'], tft.sum(inputs['a'])),
                'size': tft.map(repeat, inputs['a'], tft.size(inputs['a'])),
                'mean': tft.map(repeat, inputs['a'], tft.mean(inputs['a']))
            }

        input_data = [{'a': [[4, 5], [6, 7]]}, {'a': [[1, 2], [3, 4]]}]
        input_metadata = self.toMetadata(
            {'a': tf.FixedLenFeature((2, 2), tf.int64)})
        transformed_dataset, _ = ((input_data, input_metadata)
                                  | beam_impl.AnalyzeAndTransformDataset(
                                      preprocessing_fn,
                                      os.path.join(self.get_temp_dir(),
                                                   'ndarray')))

        expected_transformed_data = [{
            'min': 1,
            'max': 7,
            'sum': 32,
            'size': 8,
            'mean': 4.0
        }, {
            'min': 1,
            'max': 7,
            'sum': 32,
            'size': 8,
            'mean': 4.0
        }]
        expected_transformed_metadata = self.toMetadata({
            'min':
            tf.FixedLenFeature((), tf.int64, None),
            'max':
            tf.FixedLenFeature((), tf.int64, None),
            'sum':
            tf.FixedLenFeature((), tf.int64, None),
            'size':
            tf.FixedLenFeature((), tf.int64, None),
            'mean':
            tf.FixedLenFeature((), tf.float64, None)
        })
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #15
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data,
                                         expected_metadata=None):
        """Assert that input data and metadata is transformed as expected.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: A dataset with the same type constraints as input_data,
          but representing the output after transformation.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
    """
        temp_dir = self.get_temp_dir()
        with beam_impl.Context(temp_dir=temp_dir):
            # Note: we don't separately test AnalyzeDataset and TransformDataset as
            # AnalyzeAndTransformDataset currently simply composes these two
            # transforms.  If in future versions of the code, the implementation
            # differs, we should also run AnalyzeDataset and TransformDataset composed.
            #
            # Also, the dataset_metadata that is returned along with
            # `transformed_data` is incomplete as it does not contain the deferred
            # components, so we instead inspect the metadata returned along with the
            # transform function.
            (transformed_data,
             _), (_, (transformed_metadata, deferred_metadata)) = (
                 (input_data, input_metadata)
                 | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        self.assertDataCloseOrEqual(expected_data, transformed_data)
        if expected_metadata:
            # deferred_metadata should be a singleton PCollection.
            self.assertEqual(len(deferred_metadata), 1)
            unresolved_futures = transformed_metadata.substitute_futures(
                deferred_metadata[0])
            self.assertEqual(unresolved_futures, [])
            # Use extra assertEqual for schemas, since full metadata assertEqual error
            # message is not conducive to debugging.
            self.assertEqual(expected_metadata.schema.column_schemas,
                             transformed_metadata.schema.column_schemas)
            self.assertEqual(expected_metadata, transformed_metadata)
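
A minimal sketch of how a test would typically call this helper, reusing the scale_to_0_1 data from the earlier examples on this page (the test name is illustrative):

    def testScaleTo01(self):
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        input_data = [{'x': 4}, {'x': 1}, {'x': 5}, {'x': 2}]
        input_metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        expected_data = [{'x_scaled': 0.75}, {'x_scaled': 0.0},
                         {'x_scaled': 1.0}, {'x_scaled': 0.25}]
        expected_metadata = self.toMetadata(
            {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
        self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                              preprocessing_fn, expected_data,
                                              expected_metadata)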
Example #16
def preprocess(p, args):
    """Run preprocessing as pipeline."""
    train_eval_schema = _make_input_schema()

    train_eval_metadata = dataset_metadata.DatasetMetadata(
        schema=train_eval_schema)

    _ = (train_eval_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(os.path.join(
             args.output_dir, constants.RAW_METADATA_DIR),
                                                             pipeline=p))

    train_eval_data = (p | 'ReadDataFromBQ' >> beam.io.Read(
        beam.io.BigQuerySource(query=_get_query('bigquery-public-data',
                                                'samples', 'gsod'),
                               use_standard_sql=True)))

    train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo(
        DataValidator())

    (transformed_train_eval_data,
     transformed_train_eval_metadata), transform_fn = (
         (train_eval_data, train_eval_metadata)
         | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
             get_preprocessing_fn()))

    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

    transformed_train_eval_coder = coders.ExampleProtoCoder(
        transformed_train_eval_metadata.schema)

    transformed_train_data, transformed_eval_data = (
        transformed_train_eval_data
        | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2))

    (transformed_train_data
     |
     'SerializeTrainExamples' >> beam.Map(transformed_train_eval_coder.encode)
     | 'WriteTraining' >>
     beam.io.WriteToTFRecord(os.path.join(
         args.output_dir, constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
                             file_name_suffix=constants.DATA_FILE_SUFFIX))

    (transformed_eval_data
     | 'SerializeEvalExamples' >> beam.Map(transformed_train_eval_coder.encode)
     | 'WriteEval' >>
     beam.io.WriteToTFRecord(os.path.join(
         args.output_dir, constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
                             file_name_suffix=constants.DATA_FILE_SUFFIX))
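
get_partition_fn is defined elsewhere; a plausible sketch is below, assuming it returns a callable with the (element, num_partitions) signature that beam.Partition expects, sending roughly 70% of examples to partition 0 (train) and the rest to partition 1 (eval).

import random

def get_partition_fn(train_fraction):
    # Assumed helper: beam.Partition calls the returned function with
    # (element, num_partitions) and routes the element to the returned index.
    def partition_fn(example, num_partitions):
        # 0 -> train, 1 -> eval; the split is random per element.
        return 0 if random.random() < train_fraction else 1
    return partition_fn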
Example #17
    def testNumericAnalyzersWithScalarInputs(self):
        def preprocessing_fn(inputs):
            def repeat(in_tensor, value):
                batch_size = tf.shape(in_tensor)[0]
                return tf.ones([batch_size], dtype=value.dtype) * value

            return {
                'min': tft.map(repeat, inputs['a'], tft.min(inputs['a'])),
                'max': tft.map(repeat, inputs['a'], tft.max(inputs['a'])),
                'sum': tft.map(repeat, inputs['a'], tft.sum(inputs['a'])),
                'size': tft.map(repeat, inputs['a'], tft.size(inputs['a'])),
                'mean': tft.map(repeat, inputs['a'], tft.mean(inputs['a']))
            }

        input_data = [{'a': 4}, {'a': 1}]
        input_metadata = self.toMetadata(
            {'a': tf.FixedLenFeature((), tf.int64, 0)})
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'min': 1,
            'max': 4,
            'sum': 5,
            'size': 2,
            'mean': 2.5
        }, {
            'min': 1,
            'max': 4,
            'sum': 5,
            'size': 2,
            'mean': 2.5
        }]
        expected_transformed_metadata = self.toMetadata({
            'min':
            tf.FixedLenFeature((), tf.int64, None),
            'max':
            tf.FixedLenFeature((), tf.int64, None),
            'sum':
            tf.FixedLenFeature((), tf.int64, None),
            'size':
            tf.FixedLenFeature((), tf.int64, None),
            'mean':
            tf.FixedLenFeature((), tf.float64, None)
        })
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #18
    def testUniquesAnalyzer(self):
        # User defined transform_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {'index': tft.string_to_int(inputs['a'])}

        input_data = [{
            'a': 'hello'
        }, {
            'a': 'world'
        }, {
            'a': 'hello'
        }, {
            'a': 'hello'
        }, {
            'a': 'goodbye'
        }, {
            'a': 'world'
        }, {
            'a': 'aaaaa'
        }]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string),
        })
        transformed_dataset, _ = ((input_data, input_metadata)
                                  | beam_impl.AnalyzeAndTransformDataset(
                                      preprocessing_fn,
                                      os.path.join(self.get_temp_dir(),
                                                   'uniques')))

        expected_transformed_data = [{
            'index': 0
        }, {
            'index': 1
        }, {
            'index': 0
        }, {
            'index': 0
        }, {
            'index': 2
        }, {
            'index': 1
        }, {
            'index': 3
        }]
        expected_transformed_metadata = self.toMetadata(
            {'index': tf.FixedLenFeature((), tf.int64)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #19
def run_tft_pipeline(args):
    """
    This is where all the data we have available in our database is processed and 
    transformed into Tensorflow tfrecords for later training and testing.

    The code runs in distributed manner automatically in the engine choosen by
    the `runner` argument in input.
    """
    pipeline_options = build_pipeline_options(args)
    temp_tft_folder = (tempfile.mkdtemp(
        dir='/tmp/') if not args.tft_temp else args.tft_temp)
    tft_transform_folder = (tempfile.mkdtemp(
        dir='/tmp/') if not args.tft_transform else args.tft_transform)

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=temp_tft_folder):

            train_data = read_input_data(args, pipeline, 'train')

            write_total_distinct_keys_to_file(train_data, args.nitems_filename,
                                              'sku')

            train_dataset = (train_data, metadata.RAW_DATA_METADATA)
            (train_data, transformed_train_metadata), transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))

            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(tft_transform_folder))

            train_data = aggregate_transformed_data(train_data, 'train')

            write_tfrecords(train_data, metadata.OUTPUT_TRAIN_SCHEMA,
                            args.output_train_filename, 'output train')

            test_data = read_input_data(args, pipeline, 'test')

            test_dataset = (test_data, metadata.RAW_DATA_METADATA)

            (test_data,
             _) = ((test_dataset, transform_fn) | beam_impl.TransformDataset())

            test_data = aggregate_transformed_data(test_data, 'test')

            test_data = aggregate_final_test_data(train_data, test_data)

            write_tfrecords(test_data, metadata.OUTPUT_TEST_SCHEMA,
                            args.output_test_filename, 'output test')
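
read_input_data, write_tfrecords and the other helpers are defined outside this snippet. As a rough sketch, write_tfrecords presumably follows the same encode-then-WriteToTFRecord pattern used by the other examples on this page; the signature below is an assumption.

def write_tfrecords(pcollection, schema, file_path_prefix, label):
    # Assumed helper: serialize each dict to a tf.Example and write TFRecord shards.
    coder = tft.coders.ExampleProtoCoder(schema)
    return (pcollection
            | 'Encode %s' % label >> beam.Map(coder.encode)
            | 'Write %s' % label >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=file_path_prefix,
                file_name_suffix='.tfrecords'))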
Example #20
def main():
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.compute_and_apply_vocabulary(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec({
            's':
            tf.io.FixedLenFeature([], tf.string),
            'y':
            tf.io.FixedLenFeature([], tf.float32),
            'x':
            tf.io.FixedLenFeature([], tf.float32),
        }))

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (raw_data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

    pprint.pprint(transformed_data)
Example #21
    def _make_transform_fn(self, p, output_path):
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        schema = dataset_schema.from_feature_spec(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        metadata = dataset_metadata.DatasetMetadata(schema=schema)
        columns = p | 'CreateTrainingData' >> beam.Create([{
            'x': v
        } for v in [4, 1, 5, 2]])
        _, result = (
            (columns, metadata)
            | 'AnalyzeAndTransform'
            >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn, output_path))
        coder = coders.CsvCoder(['x'], schema, delimiter='\t')
        return result, coder
Example #22
def main():
    def preprocessing_fn(inputs):
        return {
            'x_centered': x - tft.mean(inputs['x']),
            'y_normalized': tft.scale_to_0_1(inputs['y']),
            's_integerized': tft.compute_and_apply_vocabulary(inputs['s'])
        }

    ...
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (raw_data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset
Example #23
    def testTransformMoreThanDesiredBatchSize(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {
                'ab': tft.map(tf.multiply, inputs['a'], inputs['b']),
                'i': tft.string_to_int(inputs['c'])
            }

        input_data = [
            {
                'a': 2,
                'b': i,
                'c': '%.10i' %
                i,  # Front-padded to facilitate lexicographic sorting.
            } for i in range(beam_impl._DEFAULT_DESIRED_BATCH_SIZE + 1)
        ]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.float32, 0),
            'b':
            tf.FixedLenFeature((), tf.float32, 0),
            'c':
            tf.FixedLenFeature((), tf.string, ''),
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [
            {
                'ab': 2 * i,
                'i': (len(input_data) - 1) -
                i,  # Due to reverse lexicographic sorting.
            } for i in range(len(input_data))
        ]
        expected_transformed_metadata = self.toMetadata({
            'ab':
            tf.FixedLenFeature((), tf.float32, None),
            'i':
            tf.FixedLenFeature((), tf.int64, None),
        })
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #24
def run(pipeline_options, known_args):

    pipeline = beam.Pipeline(options=pipeline_options)
    gcp_project = pipeline_options.get_all_options()['project']

    with impl.Context(known_args.transform_temp_dir):
        articles = (pipeline
                    | 'Read articles from BigQuery' >> beam.io.Read(
                        beam.io.BigQuerySource(project=gcp_project,
                                               query=get_source_query(
                                                   known_args.limit),
                                               use_standard_sql=True)))

        articles_dataset = (articles, get_metadata())
        embeddings_dataset, _ = (
            articles_dataset
            | 'Extract embeddings' >>
            impl.AnalyzeAndTransformDataset(preprocess_fn))

        embeddings, transformed_metadata = embeddings_dataset

        embeddings | 'Write embeddings to TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix='{0}'.format(known_args.output_dir),
            file_name_suffix='.tfrecords',
            coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema),
            num_shards=int(known_args.limit / 25000))

        (articles
         |
         'Convert to entity' >> beam.Map(lambda input_features: create_entity(
             input_features, known_args.kind))
         | 'Write to Datastore' >> WriteToDatastore(project=gcp_project))

        if known_args.enable_debug:
            embeddings | 'Debug Output' >> beam.io.textio.WriteToText(
                file_path_prefix=known_args.debug_output_prefix,
                file_name_suffix='.txt')

    job = pipeline.run()

    if pipeline_options.get_all_options()['runner'] == 'DirectRunner':
        job.wait_until_finish()
Example #25
    def testTransformFnExportAndImportRoundtrip(self):
        tranform_fn_dir = os.path.join(self.get_temp_dir(),
                                       'export_transform_fn')
        metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

        with beam.Pipeline() as p:

            def preprocessing_fn(inputs):
                return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

            metadata = self.toMetadata(
                {'x': tf.FixedLenFeature((), tf.float32, 0)})
            columns = p | 'CreateTrainingData' >> beam.Create([{
                'x': v
            } for v in [4, 1, 5, 2]])
            _, transform_fn = (
                (columns, metadata)
                | 'Analyze and Transform' >>
                beam_impl.AnalyzeAndTransformDataset(
                    preprocessing_fn,
                    os.path.join(self.get_temp_dir(), 'no_automaterialize')))

            _ = transform_fn | transform_fn_io.WriteTransformFn(
                tranform_fn_dir)
            _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir,
                                                          pipeline=p)

        with beam.Pipeline() as p:
            transform_fn = p | transform_fn_io.ReadTransformFn(tranform_fn_dir)
            metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)
            # Run transform_columns on some eval dataset.
            eval_data = p | 'CreateEvalData' >> beam.Create([{
                'x': v
            } for v in [6, 3]])
            transformed_eval_data, _ = (
                ((eval_data, metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())
            expected_transformed_eval_data = [{
                'x_scaled': v
            } for v in [1.25, 0.5]]
            beam_test_util.assert_that(
                transformed_eval_data,
                beam_test_util.equal_to(expected_transformed_eval_data))
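
In more recent tf.Transform releases, the artifacts written by WriteTransformFn can also be loaded outside Beam via tft.TFTransformOutput. A minimal sketch under that assumption, reusing the tranform_fn_dir written by the test above:

import tensorflow_transform as tft

# Assumes a tf.Transform version that provides TFTransformOutput.
tft_output = tft.TFTransformOutput(tranform_fn_dir)
raw_feature_spec = tft_output.raw_feature_spec()                  # spec of the raw inputs
transformed_feature_spec = tft_output.transformed_feature_spec()  # spec after preprocessing_fn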
Example #26
    def testTransform(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {'ab': tft.map(tf.multiply, inputs['a'], inputs['b'])}

        input_data = [{
            'a': 4,
            'b': 3
        }, {
            'a': 1,
            'b': 2
        }, {
            'a': 5,
            'b': 6
        }, {
            'a': 2,
            'b': 3
        }]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.float32, 0),
            'b':
            tf.FixedLenFeature((), tf.float32, 0)
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'ab': 12
        }, {
            'ab': 2
        }, {
            'ab': 30
        }, {
            'ab': 6
        }]
        expected_transformed_metadata = self.toMetadata(
            {'ab': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #27
    def testUniquesAnalyzerWithNDInputs(self):
        # User defined transform_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {'index': tft.string_to_int(inputs['a'])}

        input_data = [
            {
                'a': [['some', 'say'], ['the', 'world']]
            },
            {
                'a': [['will', 'end'], ['in', 'fire']]
            },
            {
                'a': [['some', 'say'], ['in', 'ice']]
            },
        ]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((2, 2), tf.string),
        })

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [
            {
                'index': [[0, 1], [5, 3]]
            },
            {
                'index': [[4, 8], [2, 7]]
            },
            {
                'index': [[0, 1], [2, 6]]
            },
        ]
        expected_transformed_metadata = self.toMetadata(
            {'index': tf.FixedLenFeature((2, 2), tf.int64)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example #28
    def testPassthroughKeys(self):
        passthrough_key = '__passthrough__'

        def preprocessing_fn(inputs):
            self.assertNotIn(passthrough_key, inputs)
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        x_data = [0., 1., 2.]
        passthrough_data = [1, None, 3]
        input_record_batch = pa.RecordBatch.from_arrays([
            pa.array([[x] for x in x_data], type=pa.list_(pa.float32())),
            pa.array([None if p is None else [p] for p in passthrough_data],
                     type=pa.list_(pa.int64())),
        ], ['x', passthrough_key])
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            input_record_batch.schema, {
                'x':
                text_format.Parse('dense_tensor { column_name: "x" shape {} }',
                                  schema_pb2.TensorRepresentation())
            })
        expected_data = [{
            'x_scaled': x / 2.0,
            passthrough_key: p
        } for x, p in zip(x_data, passthrough_data)]

        with self._makeTestPipeline() as pipeline:
            input_data = (pipeline | beam.Create([input_record_batch]))
            with beam_impl.Context(temp_dir=self.get_temp_dir(),
                                   passthrough_keys=set([passthrough_key])):
                (transformed_data, _), _ = (
                    (input_data, tensor_adapter_config)
                    | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

                def _assert_fn(output_data):
                    self.assertCountEqual(expected_data, output_data)

                beam_test_util.assert_that(transformed_data, _assert_fn)
Example #29
    def testPipelineWithoutAutomaterialization(self):
        # The tests in BaseTFTransformImplTest, when run with the beam
        # implementation, pass lists instead of PCollections and thus invoke
        # automaterialization where each call to a beam PTransform will implicitly
        # run its own pipeline.
        #
        # In order to test the case where PCollections are not materialized in
        # between calls to the tf.Transform PTransforms, we include a test that is
        # not based on automaterialization.
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        p = beam.Pipeline()
        metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        columns = p | 'CreateTrainingData' >> beam.Create([{
            'x': v
        } for v in [4, 1, 5, 2]])
        _, transform_fn = (
            (columns, metadata)
            | 'Analyze and Transform' >> beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn,
                os.path.join(self.get_temp_dir(), 'no_automaterialize')))

        # Run transform_columns on some eval dataset.
        eval_data = p | 'CreateEvalData' >> beam.Create([{
            'x': v
        } for v in [6, 3]])
        transformed_eval_data, _ = (
            ((eval_data, metadata), transform_fn)
            | 'Transform' >> beam_impl.TransformDataset())
        p.run()
        expected_transformed_eval_data = [{'x_scaled': v} for v in [1.25, 0.5]]
        beam_test_util.assert_that(
            transformed_eval_data,
            beam_test_util.equal_to(expected_transformed_eval_data))
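
For contrast with the explicit pipeline above, the "automaterialized" form described in the opening comment looks roughly like this: plain Python lists are passed in and each PTransform implicitly runs its own pipeline. The sketch reuses preprocessing_fn and metadata from the test, but assumes a tempfile temp dir instead of the test fixture.

import tempfile

with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    (transformed_data, _), transform_fn = (
        ([{'x': v} for v in [4, 1, 5, 2]], metadata)
        | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
# transformed_data is now a plain list of dicts with x_scaled values 0.75, 0.0, 1.0, 0.25.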
Example #30
            orders_by_date_train = (
                raw_data_train | "Merge - train" >> beam.CombineGlobally(
                    GroupItemsByDate(community_area_list,
                                     (start_datetime, split_datetime))))

            ts_windows_train = (
                orders_by_date_train | "Extract timeseries windows - train" >>
                beam.ParDo(ExtractRawTimeseriesWindow(known_args.window_size))
                | "Fusion breaker train" >> beam.Reshuffle())

            ts_windows_schema = _get_feature_spec(known_args.window_size)
            norm_ts_windows_train, transform_fn = (
                (ts_windows_train, ts_windows_schema)
                | "Analyze and Transform - train" >>
                impl.AnalyzeAndTransformDataset(lambda t: _preprocess_fn(
                    t, known_args.window_size, znorm_stats)))
            norm_ts_windows_train_data, norm_ts_windows_train_metadata = norm_ts_windows_train

            _ = norm_ts_windows_train_data | 'Write TFrecords - train' >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=train_tfrecord_path,
                file_name_suffix=".tfrecords",
                coder=example_proto_coder.ExampleProtoCoder(
                    norm_ts_windows_train_metadata.schema))

            # Process evaluation data
            raw_data_eval = _read_data_from_bq(pipeline, known_args.split_date,
                                               known_args.end_date)

            orders_by_date_eval = (
                raw_data_eval | "Merge SKUs - eval" >> beam.CombineGlobally(
                    GroupItemsByDate(community_area_list,