Example No. 1
    def testAnalyzeBeforeTransform(self):
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 4}, {'x': 1}, {'x': 5}, {'x': 2}]
        input_metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, transform_fn = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'x_scaled': 0.75
        }, {
            'x_scaled': 0.0
        }, {
            'x_scaled': 1.0
        }, {
            'x_scaled': 0.25
        }]
        expected_transformed_metadata = self.toMetadata(
            {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))

        # Take the transform function and use TransformDataset to apply it to
        # some eval data, and compare with expected output.
        eval_data = [{'x': 6}, {'x': 3}]
        transformed_eval_dataset = (((eval_data, input_metadata), transform_fn)
                                    | beam_impl.TransformDataset())

        expected_transformed_eval_data = [{
            'x_scaled': 1.25
        }, {
            'x_scaled': 0.5
        }]
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_data, expected_transformed_metadata))

        # Redo test with eval data, using AnalyzeDataset instead of
        # AnalyzeAndTransformDataset to generate transform_fn.
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transform_fn = ((input_data, input_metadata)
                            | beam_impl.AnalyzeDataset(preprocessing_fn))
            transformed_eval_dataset = ((
                (eval_data, input_metadata), transform_fn)
                                        | beam_impl.TransformDataset())
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_data, expected_transformed_metadata))
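The tests in these excerpts call a small self.toMetadata helper that is not shown; a plausible sketch, assuming the usual `from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema` import, is:

    def toMetadata(self, feature_spec):
        # Hypothetical helper: wrap a TF feature spec in DatasetMetadata so it
        # can be paired with in-memory data for the Analyze/Transform PTransforms.
        return dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(feature_spec))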
Example No. 2
    def testComposedTransforms(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {
                'a(b+c)':
                tft.map(tf.multiply, inputs['a'],
                        tft.map(tf.add, inputs['b'], inputs['c']))
            }

        input_data = [{'a': 4, 'b': 3, 'c': 3}, {'a': 1, 'b': 2, 'c': 1}]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.float32, 0),
            'b':
            tf.FixedLenFeature((), tf.float32, 0),
            'c':
            tf.FixedLenFeature((), tf.float32, 0)
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{'a(b+c)': 24}, {'a(b+c)': 3}]
        expected_transformed_metadata = self.toMetadata(
            {'a(b+c)': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 3
    def testTransformUnicode(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            def tito_string_join(*tensors):
                return tf.string_join(tensors, separator=' ')

            return {'a b': tft.map(tito_string_join, inputs['a'], inputs['b'])}

        input_data = [{
            'a': 'Hello',
            'b': 'world'
        }, {
            'a': 'Hello',
            'b': u'κόσμε'
        }]
        input_metadata = self.toMetadata({
            'a': tf.FixedLenFeature((), tf.string),
            'b': tf.FixedLenFeature((), tf.string)
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'a b': 'Hello world'
        }, {
            'a b': u'Hello κόσμε'.encode('utf-8')
        }]
        expected_transformed_metadata = self.toMetadata(
            {'a b': tf.FixedLenFeature((), tf.string, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 4
    def testTransformWithExcludedOutputs(self):
        def preprocessing_fn(inputs):
            return {
                'x_scaled': tft.scale_to_0_1(inputs['x']),
                'y_scaled': tft.scale_to_0_1(inputs['y'])
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 5, 'y': 1}, {'x': 1, 'y': 2}]
        input_metadata = self.toMetadata({
            'x':
            tf.FixedLenFeature((), tf.float32, 0),
            'y':
            tf.FixedLenFeature((), tf.float32, 0)
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transform_fn = ((input_data, input_metadata)
                            | beam_impl.AnalyzeDataset(preprocessing_fn))

        # Take the transform function and use TransformDataset to apply it to
        # some eval data, with missing 'y' column.
        eval_data = [{'x': 6}]
        eval_metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        transformed_eval_dataset = (
            ((eval_data, eval_metadata), transform_fn)
            | beam_impl.TransformDataset(exclude_outputs=['y_scaled']))

        expected_transformed_eval_data = [{'x_scaled': 1.25}]
        expected_transformed_eval_schema = self.toMetadata(
            {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_data, expected_transformed_eval_schema))
Example No. 5
  def testNestedContextCreateBaseTempDir(self):
    level_1_dir = self.get_temp_dir()
    with beam_impl.Context(temp_dir=level_1_dir):
      self.assertEqual(
          os.path.join(level_1_dir, beam_impl.Context._TEMP_SUBDIR),
          beam_impl.Context.create_base_temp_dir())
      level_2_dir = self.get_temp_dir()
      with beam_impl.Context(temp_dir=level_2_dir):
        self.assertEqual(
            os.path.join(level_2_dir, beam_impl.Context._TEMP_SUBDIR),
            beam_impl.Context.create_base_temp_dir())
      self.assertEqual(
          os.path.join(level_1_dir, beam_impl.Context._TEMP_SUBDIR),
          beam_impl.Context.create_base_temp_dir())
    with self.assertRaises(ValueError):
      beam_impl.Context.create_base_temp_dir()
Example No. 6
def run(files_pattern,
        table_name,
        table_schema,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
    """Runs the whole preprocessing step.
    This runs the feature extraction from CSV files in the given Google Storage entrypoint and upload them in a BigQuery Table.
    """
    tft_temp_dir = os.path.join(work_dir, 'tft-temp')

    transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
    if tf.gfile.Exists(transform_fn_dir):
        tf.gfile.DeleteRecursively(transform_fn_dir)

    # Build and run a Beam Pipeline
    with beam.Pipeline(options=beam_options) as p, \
            beam_impl.Context(temp_dir=tft_temp_dir):

        # Extract records from sources
        dataset = p | 'Read Record' >> beam.io.Read(
            dataflow_tutorial.ParseRecords(files_pattern))

        # Write the Table on BigQuery
        dataset | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)
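For reference, beam.io.WriteToBigQuery accepts table_schema either as a bigquery TableSchema object or as a comma-separated 'field:TYPE' string, so a hypothetical call to run() (paths, table and field names are made up here) could look like:

run(files_pattern='gs://my-bucket/data/*.csv',
    table_name='my-project:my_dataset.my_table',
    table_schema='trip_id:STRING,fare:FLOAT,passenger_count:INTEGER',
    work_dir='gs://my-bucket/work')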
Example No. 7
def main(argv=None):
    """Run Preprocessing as a Dataflow."""
    args = parse_arguments(sys.argv if argv is None else argv)
    if args.cloud:
        pipeline_name = 'DataflowRunner'
        options = {
            'job_name': ('cloud-ml-sample-criteo-preprocess-{}'.format(
                datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
            'temp_location':
            os.path.join(args.output_dir, 'tmp'),
            'project':
            args.project_id,
            # TODO(b/35727492): Remove this.
            'max_num_workers':
            1000,
            'setup_file':
            os.path.abspath(os.path.join(os.path.dirname(__file__),
                                         'setup.py')),
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    else:
        pipeline_name = 'DirectRunner'
        pipeline_options = None

    temp_dir = os.path.join(args.output_dir, 'tmp')
    with beam.Pipeline(pipeline_name, options=pipeline_options) as p:
        with tft.Context(temp_dir=temp_dir):
            preprocess(pipeline=p,
                       training_data=args.training_data,
                       eval_data=args.eval_data,
                       predict_data=args.predict_data,
                       output_dir=args.output_dir,
                       frequency_threshold=args.frequency_threshold,
                       delimiter=args.delimiter)
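Several of these examples call a module-level parse_arguments helper that is not included; a minimal sketch matching the flags referenced above (flag names inferred from usage, so treat them as assumptions) might be:

import argparse

def parse_arguments(argv):
    # Hypothetical argument parser for the example above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--cloud', action='store_true',
                        help='Run the pipeline on Cloud Dataflow.')
    parser.add_argument('--project_id', type=str)
    parser.add_argument('--output_dir', type=str)
    parser.add_argument('--training_data', type=str)
    parser.add_argument('--eval_data', type=str)
    parser.add_argument('--predict_data', type=str)
    parser.add_argument('--frequency_threshold', type=int, default=5)
    parser.add_argument('--delimiter', type=str, default=',')
    return parser.parse_args(argv[1:])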
Example No. 8
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--cloud', type=str, help="Pass 'y' to run on Cloud Dataflow.")
    args = parser.parse_args(argv)  # Parse the arguments.
    if args.cloud == "y":
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = beam.pipeline.PipelineOptions(
            flags=[], **{'project': "iotpubsub-1536350750202"})
    with beam_impl.Context(temp_dir="gs://relation_extraction/beam"):
        p = beam.Pipeline(options=pipeline_options)
        train_data, test_data = (p | "Read from bigquery" >> ReadBigQuery())

        (test_data | "test it" >> beam.Map(printy))
        train_data = (train_data, train_metadata)
        train_dataset, transform_fn = (train_data
                                            | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)
                                            )
        test_data = (test_data, train_metadata)
        test_data, _ = ((test_data, transform_fn) | 'Transform test data' >> beam_impl.TransformDataset())
        train_data, transformed_metadata = train_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        _ = (train_data
                | 'Encode train data to save it' >> beam.Map(transformed_data_coder.encode)
                | 'Write the train data to tfrecords' >> tfrecordio.WriteToTFRecord(os.path.join("gs://relation_extraction/beam/Train","TRAIN"))
                )
        _ = (test_data
                | 'Encode test data to save it' >> beam.Map(transformed_data_coder.encode)
                | 'Write the test data to tfrecords' >> tfrecordio.WriteToTFRecord(os.path.join("gs://relation_extraction/beam/Test","TEST"))
                )
        _ = (transform_fn | "WriteTransformFn" >> transform_fn_io.WriteTransformFn("gs://relation_extraction/beam/"))

        p.run().wait_until_finish()
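The snippet above also relies on module-level names (preprocessing_fn, train_metadata, printy, ReadBigQuery) defined elsewhere in the source; a purely illustrative stand-in for the two simplest of them, assuming a single string feature, could be:

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

# Hypothetical metadata for the training data; the feature name and spec
# are illustrative, not from the original source.
train_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        'sentence': tf.FixedLenFeature([], tf.string),
    }))

def printy(element):
    # Debug helper: print each element flowing through the pipeline.
    print(element)
    return element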
Example No. 9
def main(argv=None):
  """Run Preprocessing as a Dataflow."""
  args = parse_arguments(sys.argv if argv is None else argv)
  temp_dir = os.path.join(args.output_dir, 'tmp')

  if args.cloud:
    pipeline_name = 'DataflowRunner'
  else:
    pipeline_name = 'DirectRunner'

  options = {
      'job_name': args.job_name,
      'temp_location': temp_dir,
      'project': args.project_id,
      'setup_file':
          os.path.abspath(os.path.join(
              os.path.dirname(__file__),
              'setup.py')),
  }
  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)

  with beam.Pipeline(pipeline_name, options=pipeline_options) as p:
    with tft.Context(temp_dir=temp_dir):
      preprocess(
          pipeline=p,
          args=args)
Example No. 10
  def setUp(self):
    super(VocabularyIntegrationV2Test, self).setUp()
    tft_unit.skip_if_not_tf2('Tensorflow 2.x required')
    tf.compat.v1.logging.info('Starting test case: %s', self._testMethodName)
    self._force_tf_compat_v1_context = beam_impl.Context(
        force_tf_compat_v1=False)
    self._force_tf_compat_v1_context.__enter__()
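The excerpt shows only setUp entering the Context; the matching tearDown is not included here, so the following is just an assumed counterpart that exits the context entered above:

  def tearDown(self):
    # Assumed counterpart to setUp: leave the force_tf_compat_v1 context
    # before the base class tears the test down.
    self._force_tf_compat_v1_context.__exit__(None, None, None)
    super(VocabularyIntegrationV2Test, self).tearDown()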
Example No. 11
    def testUniquesAnalyzerWithTokenization(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {
                'index':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']))
            }

        input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string, ''),
        })

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'index': [0, 0, 1],
        }, {
            'index': [0, 2, 1]
        }]
        expected_transformed_metadata = self.toMetadata(
            {'index': tf.VarLenFeature(tf.int64)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 12
def main(argv=None):
  """Run Preprocessing as a Dataflow."""

  args = parse_arguments(sys.argv if argv is None else argv)
  runner = get_pipeline_name(args.runner, args.cloud)
  if args.cloud:
    options = {
        'job_name': ('cloud-ml-sample-movielens-preprocess-{}'.format(
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
        'temp_location':
            os.path.join(args.output_dir, 'tmp'),
        'project':
            args.project_id,
        # TODO(b/35727492) Remove this.
        'max_num_workers':
            250,
        'setup_file':
            os.path.abspath(os.path.join(
                os.path.dirname(__file__),
                'setup.py')),
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
  else:
    pipeline_options = None

  temp_dir = os.path.join(args.output_dir, 'tmp')
  with beam.Pipeline(runner, options=pipeline_options) as pipeline:
    with tft.Context(temp_dir=temp_dir):
      preprocess(pipeline, args)
Example No. 13
    def testMultipleLevelsOfAnalysis(self):
        # Test a preprocessing function similar to scale_to_0_1 except that it
        # involves multiple interleavings of analyzers and transforms.
        def preprocessing_fn(inputs):
            scaled_to_0 = tft.map(lambda x, y: x - y, inputs['x'],
                                  tft.min(inputs['x']))
            scaled_to_0_1 = tft.map(lambda x, y: x / y, scaled_to_0,
                                    tft.max(scaled_to_0))
            return {'x_scaled': scaled_to_0_1}

        metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        input_columns = [{'x': v} for v in [4, 1, 5, 2]]
        input_dataset = (input_columns, metadata)

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed, _ = (
                input_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        output_columns, _ = transformed

        self.assertEqual(output_columns, [{
            'x_scaled': v
        } for v in [0.75, 0.0, 1.0, 0.25]])
Example No. 14
    def testNumericAnalyzersWithSparseInputs(self):
        def repeat(in_tensor, value):
            batch_size = tf.shape(in_tensor)[0]
            return tf.ones([batch_size], value.dtype) * value

        input_data = [{'a': [4, 5, 6]}, {'a': [1, 2]}]
        input_metadata = self.toMetadata({'a': tf.VarLenFeature(tf.int64)})
        input_dataset = (input_data, input_metadata)

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with self.assertRaises(TypeError):

                def min_fn(inputs):
                    return {
                        'min': tft.map(repeat, inputs['a'],
                                       tft.min(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(min_fn)

            with self.assertRaises(TypeError):

                def max_fn(inputs):
                    return {
                        'max': tft.map(repeat, inputs['a'],
                                       tft.max(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(max_fn)

            with self.assertRaises(TypeError):

                def sum_fn(inputs):
                    return {
                        'sum': tft.map(repeat, inputs['a'],
                                       tft.sum(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(sum_fn)

            with self.assertRaises(TypeError):

                def size_fn(inputs):
                    return {
                        'size': tft.map(repeat, inputs['a'],
                                        tft.size(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(size_fn)

            with self.assertRaises(TypeError):

                def mean_fn(inputs):
                    return {
                        'mean': tft.map(repeat, inputs['a'],
                                        tft.mean(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(mean_fn)
Example No. 15
def main(p=None):
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.string_to_int(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    # raw_data_p = p | beam.Create(raw_data)

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema({
            's':
            dataset_schema.ColumnSchema(
                tf.string, [], dataset_schema.FixedColumnRepresentation()),
            'y':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation()),
            'x':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation())
        }))

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (raw_data, raw_data_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

        pprint.pprint(transformed_data)
        (transformed_data
         | beam.io.WriteToText(
             '/Users/luoshixin/Personal/GCPStudy/src/tensorflow/tftransform/tmp'
         ))
Example No. 16
  def setUp(self):
    super(CachedImplTest, self).setUp()
    self.base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    self._cache_dir = os.path.join(self.base_test_dir, 'cache')

    self._context = beam_impl.Context(temp_dir=self.get_temp_dir())
    self._context.__enter__()
Example No. 17
    def testUniquesAnalyzerWithFrequencyThresholdTooHigh(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        # The generated vocabulary is expected to be empty because the
        # frequency threshold is too high.
        def preprocessing_fn(inputs):
            return {
                'index1':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-99,
                                  frequency_threshold=77),

                # As above but using a string for frequency_threshold (and changing
                # the default_value to showcase things).
                'index2':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-9,
                                  frequency_threshold='77')
            }

        input_data = [{
            'a': 'hello hello world'
        }, {
            'a': 'hello goodbye world'
        }, {
            'a': 'hello goodbye foo'
        }]
        input_schema = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string, ''),
        })

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_schema)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        # The generated vocab (ordered by frequency, then value) should be:
        # ["hello", "world", "goodbye", "foo"]. After applying
        # frequency_threshold=77 this becomes empty.
        expected_transformed_data = [{
            'index1': [-99, -99, -99],
            'index2': [-9, -9, -9]
        }, {
            'index1': [-99, -99, -99],
            'index2': [-9, -9, -9]
        }, {
            'index1': [-99, -99, -99],
            'index2': [-9, -9, -9]
        }]
        expected_transformed_schema = self.toMetadata({
            'index1':
            tf.VarLenFeature(tf.int64),
            'index2':
            tf.VarLenFeature(tf.int64)
        })
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_schema))
Example No. 18
def data_transform():
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = ((dict_features, data_metadata) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        
        transformed_data, transformed_metadata = transformed_dataset
        
        for i in range(len(transformed_data)):
            print('Raw: ', dict_features[i])
            print('Transformed: ', transformed_data[i])
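data_transform() assumes module-level dict_features, data_metadata and preprocessing_fn; a minimal illustrative stand-in (feature name and values are assumptions, not from the original source) might be:

import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

# A toy in-memory dataset and its metadata.
dict_features = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({'x': tf.io.FixedLenFeature([], tf.float32)}))

def preprocessing_fn(inputs):
    # Scale the single numeric feature to [0, 1].
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}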
Example No. 19
def run(pipeline_options, known_args):
  pipeline = beam.Pipeline(options=pipeline_options)

  with impl.Context(known_args.transform_temp_dir):
    articles = (
        pipeline
        | 'Get Paths' >> beam.Create(get_paths(known_args.file_pattern))
        | 'Get Articles' >> beam.Map(get_articles)
        | 'Get Article' >> beam.FlatMap(lambda x: x)
    )

    dataset = (articles, get_metadata())

    transform_fn = (
        dataset
        | 'Analyse dataset' >> impl.AnalyzeDataset(preprocess_fn)
    )

    transformed_data_with_meta = (
        (dataset, transform_fn)
        | 'Transform dataset' >> impl.TransformDataset()
    )

    transformed_data, transformed_metadata = transformed_data_with_meta

    transform_fn | 'Export Transform Fn' >> transform_fn_io.WriteTransformFn(
        known_args.transform_export_dir)

    (
        transformed_data
        | 'Convert to Insertable data' >> beam.Map(to_bq_row)
        | 'Write to BigQuery table' >> beam.io.WriteToBigQuery(
            project=known_args.bq_project,
            dataset=known_args.bq_dataset,
            table=known_args.bq_table,
            schema=get_bigquery_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )

    if known_args.enable_tfrecord:
      transformed_data | 'Write TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
          file_path_prefix='{0}/{1}'.format(known_args.tfrecord_export_dir, 'reuter'),
          file_name_suffix='.tfrecords',
          coder=tft_coders.example_proto_coder.ExampleProtoCoder(transformed_metadata.schema))

    if known_args.enable_debug:
      transformed_data | 'Debug Output' >> beam.io.textio.WriteToText(
          file_path_prefix=known_args.debug_output_prefix, file_name_suffix='.txt')


  job = pipeline.run()

  if pipeline_options.get_all_options()['runner'] == 'DirectRunner':
    job.wait_until_finish()
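run() also depends on helpers such as get_paths, get_articles, get_metadata, preprocess_fn and to_bq_row defined elsewhere; as one illustrative example, a to_bq_row compatible with beam.io.WriteToBigQuery (which expects one dict per row) might be sketched as:

def to_bq_row(element):
    # Illustrative helper, not from the original source: transformed examples
    # are already dicts keyed by column name, so just coerce any numpy values
    # to plain Python types the BigQuery sink can serialize.
    return {key: value.tolist() if hasattr(value, 'tolist') else value
            for key, value in element.items()}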
Example No. 20
def main():
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.string_to_int(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema({
            's':
            dataset_schema.ColumnSchema(
                tf.string, [], dataset_schema.FixedColumnRepresentation()),
            'y':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation()),
            'x':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation())
        }))

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        transform_fn = ((raw_data, raw_data_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        transformed_dataset = (((raw_data, raw_data_metadata), transform_fn)
                               | beam_impl.TransformDataset())

    # pylint: disable=unused-variable
    transformed_data, transformed_metadata = transformed_dataset

    pprint.pprint(transformed_data)
Example No. 21
def data_transform():

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (dict_features, data_metadata) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)) # Preprocessing_fn automatically called with the correct inputs

    transformed_data, transformed_metadata = transformed_dataset # Break down the dataset

    for i in range(len(transformed_data)):
        print("Raw: ", dict_features[i]) # See the raw data
        print("Transformed:", transformed_data[i]) # Compare with the transformed data
Example No. 22
def main():
    """Parses execution arguments, creates and runs processing pipeline.

  Cheks current OS. Posix OS is required for local and GCP paths consistency.

  Raises:
    OSError: Posix OS required.
    ValueError: Train validation and test size dont add up to 1.0.
  """

    if os.name != 'posix':
        raise OSError('Posix OS required.')

    args = parse_arguments(sys.argv)

    if args.train_size + args.validation_size + args.test_size != 1.0:
        raise ValueError("Train, validation and test sizes don't add up to 1.0.")

    output_dir = args.output_dir
    if args.cloud:
        output_dir = posixpath.join('gs://', args.bucket_id, output_dir)
        runner = 'DataflowRunner'
    else:
        output_dir = posixpath.join('.', output_dir)
        runner = 'DirectRunner'

    temp_dir = posixpath.join(output_dir, 'tmp')

    options = {
        'project':
        args.project_id,
        'job_name':
        '{}-{}'.format(args.project_id,
                       datetime.now().strftime('%Y%m%d%H%M%S')),
        'setup_file':
        posixpath.abspath(
            posixpath.join(posixpath.dirname(__file__), 'setup.py')),
        'temp_location':
        temp_dir,
        'save_main_session':
        True
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        with beam_impl.Context(temp_dir=temp_dir):
            preprocess(p=p,
                       output_dir=output_dir,
                       check_path=args.check_path,
                       data_size=(args.train_size, args.validation_size,
                                  args.test_size),
                       bq_table=args.bq_table,
                       split_data_path=args.split_data_path,
                       project_id=args.project_id)
Example No. 23
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.
    Args:
        argv (list): list of arguments
    """
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = None

    p = beam.Pipeline(options=pipeline_options)
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # read data and join by key
        raw_data_input = (p
                          | 'ReadInputData' >> beam.io.ReadFromText(
                              TRAIN_INPUT_DATA, skip_header_lines=1)
                          | 'ParseInputCSV' >> beam.Map(converter_input.decode)
                          | 'ExtractBatchKeyIn' >> beam.Map(extract_batchkey))

        raw_data_output = (
            p
            | 'ReadOutputData' >> beam.io.ReadFromText(TRAIN_OUTPUT_DATA,
                                                       skip_header_lines=1)
            | 'ParseOutputCSV' >> beam.Map(converter_output.decode)
            | 'ExtractBatchKeyOut' >> beam.Map(extract_batchkey))

        raw_data = ((raw_data_input, raw_data_output)
                    | 'JoinData' >> beam.CoGroupByKey()
                    | 'RemoveKeys' >> beam.FlatMap(remove_keys))

        # analyse and transform dataset
        raw_dataset = (raw_data, input_metadata)
        transformed_dataset, transform_fn = (
            raw_dataset
            | 'AnalyzeAndTransform' >>
            beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_data, transformed_metadata = transformed_dataset

        # save data and serialize TransformFn
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'EncodeData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteData' >> tfrecordio.WriteToTFRecord(
                 os.path.join(TFRECORD_DIR, 'records')))
        _ = (transform_fn
             |
             "WriteTransformFn" >> transform_fn_io.WriteTransformFn(MODEL_DIR))

        p.run().wait_until_finish()
Example No. 24
def main():
    """Configures and runs a pipeline."""
    args = parse_arguments(sys.argv)
    config = parse_config("CLOUD" if args.cloud else "LOCAL",
                          get_relative_path("config.ini"))
    set_logging(config.get("log_level"))
    options = get_pipeline_options(args, config)
    runner = str(config.get("runner"))

    with beam.Pipeline(runner, options=options) as pipeline:
        with beam_impl.Context(
                temp_dir=os.path.join(args.tft_dir, constants.TMP_DIR)):
            preprocess.run(pipeline, args)
Example No. 25
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data,
                                         expected_metadata=None):
        """Assert that input data and metadata is transformed as expected.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: A dataset with the same type constraints as input_data,
          but representing the output after transformation.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
    """
        temp_dir = self.get_temp_dir()
        with beam_impl.Context(temp_dir=temp_dir):
            # Note: we don't separately test AnalyzeDataset and TransformDataset as
            # AnalyzeAndTransformDataset currently simply composes these two
            # transforms.  If, in future versions of the code, the implementation
            # differs, we should also run AnalyzeDataset and TransformDataset composed.
            #
            # Also, the dataset_metadata that is returned along with
            # `transformed_data` is incomplete as it does not contain the deferred
            # components, so we instead inspect the metadata returned along with the
            # transform function.
            (transformed_data,
             _), (_, (transformed_metadata, deferred_metadata)) = (
                 (input_data, input_metadata)
                 | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        self.assertDataCloseOrEqual(expected_data, transformed_data)
        if expected_metadata:
            # deferred_metadata should be a singleton PCollection.
            self.assertEqual(len(deferred_metadata), 1)
            unresolved_futures = transformed_metadata.substitute_futures(
                deferred_metadata[0])
            self.assertEqual(unresolved_futures, [])
            # Use extra assertEqual for schemas, since full metadata assertEqual error
            # message is not conducive to debugging.
            self.assertEqual(expected_metadata.schema.column_schemas,
                             transformed_metadata.schema.column_schemas)
            self.assertEqual(expected_metadata, transformed_metadata)
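As a usage illustration (not part of the original source), a test built on this helper, reusing the data from Example No. 1, might look like:

    def testScaleTo01(self):
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        self.assertAnalyzeAndTransformResults(
            input_data=[{'x': 4}, {'x': 1}, {'x': 5}, {'x': 2}],
            input_metadata=self.toMetadata(
                {'x': tf.FixedLenFeature((), tf.float32, 0)}),
            preprocessing_fn=preprocessing_fn,
            expected_data=[{'x_scaled': 0.75}, {'x_scaled': 0.0},
                           {'x_scaled': 1.0}, {'x_scaled': 0.25}])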
Example No. 26
    def build_graph(self):
        # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for graph building.
        # num_lines = 0
        # for i in range(DATASET_NUM_SHARDS):
        #     _fname = '{}-{:05}-of-{:05}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
        #     num_lines += sum(1 for _ in open(_fname))
        #     _fname_marked = '{}-{:05}-of-{:05}.{}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS,
        #                                                   PPGRAPH_EXT)
        #     shutil.move(_fname, _fname_marked)
        #     if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
        #         break

        # Set up the preprocessing pipeline for analyzing the dataset. The analyze
        # call is not combined with the transform call because we will parallelize
        # the transform call later; the combined process runs on a single core and
        # tends to cause OOM issues.
        pipeline = beam.Pipeline(runner=DirectRunner())

        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # TODO: maybe only use train data (or a percentage of it) to build the graph.
            raw_train_data = (
                pipeline
                | 'ReadTrainDataFile' >> textio.ReadFromText(
                    'data/features' + '*' + 'shard' + '*', skip_header_lines=0)
                | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                    tft_coders.CsvCoder(
                        self.data_formatter.get_ordered_columns(),
                        self.data_formatter.get_raw_data_metadata().schema).
                    decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            # That is when to use vocabulary, scale_to_0_1 or sparse_to_dense ...
            transform_fn = (
                (raw_train_data, self.data_formatter.get_raw_data_metadata())
                | beam_impl.AnalyzeDataset(
                    PreprocessingFunction().transform_to_tfrecord))

            # Write SavedModel and metadata to two subdirectories of working_dir, given by
            # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
            _ = (transform_fn
                 | 'WriteTransformGraph' >>
                 transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

        # Run the Beam preprocessing pipeline.
        st = time.time()
        result = pipeline.run()
        result.wait_until_finish()
        self.logger.info(
            'Transformation graph built and written in {:.2f} sec'.format(
                time.time() - st))
Example No. 27
    def testNumericAnalyzersWithNDInputs(self):
        def preprocessing_fn(inputs):
            def repeat(in_tensor, value):
                batch_size = tf.shape(in_tensor)[0]
                return tf.ones([batch_size], value.dtype) * value

            return {
                'min': tft.map(repeat, inputs['a'], tft.min(inputs['a'])),
                'max': tft.map(repeat, inputs['a'], tft.max(inputs['a'])),
                'sum': tft.map(repeat, inputs['a'], tft.sum(inputs['a'])),
                'size': tft.map(repeat, inputs['a'], tft.size(inputs['a'])),
                'mean': tft.map(repeat, inputs['a'], tft.mean(inputs['a']))
            }

        input_data = [{'a': [[4, 5], [6, 7]]}, {'a': [[1, 2], [3, 4]]}]
        input_metadata = self.toMetadata(
            {'a': tf.FixedLenFeature((2, 2), tf.int64)})
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'min': 1,
            'max': 7,
            'sum': 32,
            'size': 8,
            'mean': 4.0
        }, {
            'min': 1,
            'max': 7,
            'sum': 32,
            'size': 8,
            'mean': 4.0
        }]
        expected_transformed_metadata = self.toMetadata({
            'min':
            tf.FixedLenFeature((), tf.int64, None),
            'max':
            tf.FixedLenFeature((), tf.int64, None),
            'sum':
            tf.FixedLenFeature((), tf.int64, None),
            'size':
            tf.FixedLenFeature((), tf.int64, None),
            'mean':
            tf.FixedLenFeature((), tf.float64, None)
        })
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 28
    def testUniquesAnalyzer(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {'index': tft.string_to_int(inputs['a'])}

        input_data = [{
            'a': 'hello'
        }, {
            'a': 'world'
        }, {
            'a': 'hello'
        }, {
            'a': 'hello'
        }, {
            'a': 'goodbye'
        }, {
            'a': 'world'
        }, {
            'a': 'aaaaa'
        }]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string),
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'index': 0
        }, {
            'index': 1
        }, {
            'index': 0
        }, {
            'index': 0
        }, {
            'index': 2
        }, {
            'index': 1
        }, {
            'index': 3
        }]
        expected_transformed_metadata = self.toMetadata(
            {'index': tf.FixedLenFeature((), tf.int64)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 29
def run_tft_pipeline(args):
    """
    This is where all the data we have available in our database is processed and 
    transformed into Tensorflow tfrecords for later training and testing.

    The code runs in distributed manner automatically in the engine choosen by
    the `runner` argument in input.
    """
    pipeline_options = build_pipeline_options(args)
    temp_tft_folder = (tempfile.mkdtemp(
        dir='/tmp/') if not args.tft_temp else args.tft_temp)
    tft_transform_folder = (tempfile.mkdtemp(
        dir='/tmp/') if not args.tft_transform else args.tft_transform)

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=temp_tft_folder):

            train_data = read_input_data(args, pipeline, 'train')

            write_total_distinct_keys_to_file(train_data, args.nitems_filename,
                                              'sku')

            train_dataset = (train_data, metadata.RAW_DATA_METADATA)
            (train_data, transformed_train_metadata), transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))

            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(tft_transform_folder))

            train_data = aggregate_transformed_data(train_data, 'train')

            write_tfrecords(train_data, metadata.OUTPUT_TRAIN_SCHEMA,
                            args.output_train_filename, 'output train')

            test_data = read_input_data(args, pipeline, 'test')

            test_dataset = (test_data, metadata.RAW_DATA_METADATA)

            (test_data,
             _) = ((test_dataset, transform_fn) | beam_impl.TransformDataset())

            test_data = aggregate_transformed_data(test_data, 'test')

            test_data = aggregate_final_test_data(train_data, test_data)

            write_tfrecords(test_data, metadata.OUTPUT_TEST_SCHEMA,
                            args.output_test_filename, 'output test')
Example No. 30
def main():
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.compute_and_apply_vocabulary(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            's':
            tf.io.FixedLenFeature([], tf.string),
            'y':
            tf.io.FixedLenFeature([], tf.float32),
            'x':
            tf.io.FixedLenFeature([], tf.float32),
        }))

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (raw_data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

    pprint.pprint(transformed_data)
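Working through the arithmetic for the three rows above: mean(x) is 2, so x_centered becomes [-1, 0, 1]; y is scaled to [0, 0.5, 1]; the vocabulary maps 'hello' (most frequent) to 0 and 'world' to 1; and x_centered_times_y_normalized is the elementwise product. The final pprint output is therefore approximately:

    [{'s_integerized': 0, 'x_centered': -1.0,
      'x_centered_times_y_normalized': 0.0, 'y_normalized': 0.0},
     {'s_integerized': 1, 'x_centered': 0.0,
      'x_centered_times_y_normalized': 0.0, 'y_normalized': 0.5},
     {'s_integerized': 0, 'x_centered': 1.0,
      'x_centered_times_y_normalized': 1.0, 'y_normalized': 1.0}]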