Example No. 1
    def testAnalyzeBeforeTransform(self):
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 4}, {'x': 1}, {'x': 5}, {'x': 2}]
        input_metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, transform_fn = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'x_scaled': 0.75
        }, {
            'x_scaled': 0.0
        }, {
            'x_scaled': 1.0
        }, {
            'x_scaled': 0.25
        }]
        expected_transformed_metadata = self.toMetadata(
            {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))

        # Take the transform function and use TransformDataset to apply it to
        # some eval data, and compare with expected output.
        eval_data = [{'x': 6}, {'x': 3}]
        transformed_eval_dataset = (((eval_data, input_metadata), transform_fn)
                                    | beam_impl.TransformDataset())

        expected_transformed_eval_data = [{
            'x_scaled': 1.25
        }, {
            'x_scaled': 0.5
        }]
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_data, expected_transformed_metadata))

        # Redo test with eval data, using AnalyzeDataset instead of
        # AnalyzeAndTransformDataset to generate transform_fn.
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transform_fn = ((input_data, input_metadata)
                            | beam_impl.AnalyzeDataset(preprocessing_fn))
            transformed_eval_dataset = ((
                (eval_data, input_metadata), transform_fn)
                                        | beam_impl.TransformDataset())
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_data, expected_transformed_metadata))
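The tests in these excerpts call a small self.toMetadata helper that is not shown; a plausible sketch, assuming the usual `from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema` import, is:

    def toMetadata(self, feature_spec):
        # Hypothetical helper: wrap a TF feature spec in DatasetMetadata so it
        # can be paired with in-memory data for the Analyze/Transform PTransforms.
        return dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(feature_spec))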
Example No. 2
    def testComposedTransforms(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {
                'a(b+c)':
                tft.map(tf.multiply, inputs['a'],
                        tft.map(tf.add, inputs['b'], inputs['c']))
            }

        input_data = [{'a': 4, 'b': 3, 'c': 3}, {'a': 1, 'b': 2, 'c': 1}]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.float32, 0),
            'b':
            tf.FixedLenFeature((), tf.float32, 0),
            'c':
            tf.FixedLenFeature((), tf.float32, 0)
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{'a(b+c)': 24}, {'a(b+c)': 3}]
        expected_transformed_metadata = self.toMetadata(
            {'a(b+c)': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 3
    def testTransformUnicode(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            def tito_string_join(*tensors):
                return tf.string_join(tensors, separator=' ')

            return {'a b': tft.map(tito_string_join, inputs['a'], inputs['b'])}

        input_data = [{
            'a': 'Hello',
            'b': 'world'
        }, {
            'a': 'Hello',
            'b': u'κόσμε'
        }]
        input_metadata = self.toMetadata({
            'a': tf.FixedLenFeature((), tf.string),
            'b': tf.FixedLenFeature((), tf.string)
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'a b': 'Hello world'
        }, {
            'a b': u'Hello κόσμε'.encode('utf-8')
        }]
        expected_transformed_metadata = self.toMetadata(
            {'a b': tf.FixedLenFeature((), tf.string, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 4
    def testTransformWithExcludedOutputs(self):
        def preprocessing_fn(inputs):
            return {
                'x_scaled': tft.scale_to_0_1(inputs['x']),
                'y_scaled': tft.scale_to_0_1(inputs['y'])
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 5, 'y': 1}, {'x': 1, 'y': 2}]
        input_metadata = self.toMetadata({
            'x':
            tf.FixedLenFeature((), tf.float32, 0),
            'y':
            tf.FixedLenFeature((), tf.float32, 0)
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transform_fn = ((input_data, input_metadata)
                            | beam_impl.AnalyzeDataset(preprocessing_fn))

        # Take the transform function and use TransformDataset to apply it to
        # some eval data, with missing 'y' column.
        eval_data = [{'x': 6}]
        eval_metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        transformed_eval_dataset = (
            ((eval_data, eval_metadata), transform_fn)
            | beam_impl.TransformDataset(exclude_outputs=['y_scaled']))

        expected_transformed_eval_data = [{'x_scaled': 1.25}]
        expected_transformed_eval_schema = self.toMetadata(
            {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_data, expected_transformed_eval_schema))
Example No. 5
  def testNestedContextCreateBaseTempDir(self):
    level_1_dir = self.get_temp_dir()
    with beam_impl.Context(temp_dir=level_1_dir):
      self.assertEqual(
          os.path.join(level_1_dir, beam_impl.Context._TEMP_SUBDIR),
          beam_impl.Context.create_base_temp_dir())
      level_2_dir = self.get_temp_dir()
      with beam_impl.Context(temp_dir=level_2_dir):
        self.assertEqual(
            os.path.join(level_2_dir, beam_impl.Context._TEMP_SUBDIR),
            beam_impl.Context.create_base_temp_dir())
      self.assertEqual(
          os.path.join(level_1_dir, beam_impl.Context._TEMP_SUBDIR),
          beam_impl.Context.create_base_temp_dir())
    with self.assertRaises(ValueError):
      beam_impl.Context.create_base_temp_dir()
Example No. 6
def run(files_pattern,
        table_name,
        table_schema,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
    """Runs the whole preprocessing step.
    This runs the feature extraction from CSV files in the given Google Storage entrypoint and upload them in a BigQuery Table.
    """
    tft_temp_dir = os.path.join(work_dir, 'tft-temp')

    transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
    if tf.gfile.Exists(transform_fn_dir):
        tf.gfile.DeleteRecursively(transform_fn_dir)

    # Build and run a Beam Pipeline
    with beam.Pipeline(options=beam_options) as p, \
            beam_impl.Context(temp_dir=tft_temp_dir):

        # Extract records from sources
        dataset = p | 'Read Record' >> beam.io.Read(
            dataflow_tutorial.ParseRecords(files_pattern))

        # Write the Table on BigQuery
        dataset | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)
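For reference, beam.io.WriteToBigQuery accepts table_schema either as a bigquery TableSchema object or as a comma-separated 'field:TYPE' string, so a hypothetical call to run() (paths, table and field names are made up here) could look like:

run(files_pattern='gs://my-bucket/data/*.csv',
    table_name='my-project:my_dataset.my_table',
    table_schema='trip_id:STRING,fare:FLOAT,passenger_count:INTEGER',
    work_dir='gs://my-bucket/work')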
Example No. 7
def main(argv=None):
    """Run Preprocessing as a Dataflow."""
    args = parse_arguments(sys.argv if argv is None else argv)
    if args.cloud:
        pipeline_name = 'DataflowRunner'
        options = {
            'job_name': ('cloud-ml-sample-criteo-preprocess-{}'.format(
                datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
            'temp_location':
            os.path.join(args.output_dir, 'tmp'),
            'project':
            args.project_id,
            # TODO(b/35727492): Remove this.
            'max_num_workers':
            1000,
            'setup_file':
            os.path.abspath(os.path.join(os.path.dirname(__file__),
                                         'setup.py')),
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    else:
        pipeline_name = 'DirectRunner'
        pipeline_options = None

    temp_dir = os.path.join(args.output_dir, 'tmp')
    with beam.Pipeline(pipeline_name, options=pipeline_options) as p:
        with tft.Context(temp_dir=temp_dir):
            preprocess(pipeline=p,
                       training_data=args.training_data,
                       eval_data=args.eval_data,
                       predict_data=args.predict_data,
                       output_dir=args.output_dir,
                       frequency_threshold=args.frequency_threshold,
                       delimiter=args.delimiter)
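Several of these examples call a module-level parse_arguments helper that is not included; a minimal sketch matching the flags referenced above (flag names inferred from usage, so treat them as assumptions) might be:

import argparse

def parse_arguments(argv):
    # Hypothetical argument parser for the example above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--cloud', action='store_true',
                        help='Run the pipeline on Cloud Dataflow.')
    parser.add_argument('--project_id', type=str)
    parser.add_argument('--output_dir', type=str)
    parser.add_argument('--training_data', type=str)
    parser.add_argument('--eval_data', type=str)
    parser.add_argument('--predict_data', type=str)
    parser.add_argument('--frequency_threshold', type=int, default=5)
    parser.add_argument('--delimiter', type=str, default=',')
    return parser.parse_args(argv[1:])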
Example No. 8
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--cloud', type=str, help="Pass 'y' to run on Cloud Dataflow.")
    args = parser.parse_args(argv)  # Parse the arguments.
    if args.cloud == "y":
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = beam.pipeline.PipelineOptions(
            flags=[], **{'project': "iotpubsub-1536350750202"})
    with beam_impl.Context(temp_dir="gs://relation_extraction/beam"):
        p = beam.Pipeline(options=pipeline_options)
        train_data, test_data = (p | "Read from bigquery" >> ReadBigQuery())

        (test_data | "test it" >> beam.Map(printy))
        train_data = (train_data, train_metadata)
        train_dataset, transform_fn = (train_data
                                            | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)
                                            )
        test_data = (test_data, train_metadata)
        test_data, _ = ((test_data, transform_fn) | 'Transform test data' >> beam_impl.TransformDataset())
        train_data, transformed_metadata = train_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        _ = (train_data
                | 'Encode train data to save it' >> beam.Map(transformed_data_coder.encode)
                | 'Write the train data to tfrecords' >> tfrecordio.WriteToTFRecord(os.path.join("gs://relation_extraction/beam/Train","TRAIN"))
                )
        _ = (test_data
                | 'Encode test data to save it' >> beam.Map(transformed_data_coder.encode)
                | 'Write the test data to tfrecords' >> tfrecordio.WriteToTFRecord(os.path.join("gs://relation_extraction/beam/Test","TEST"))
                )
        _ = (transform_fn | "WriteTransformFn" >> transform_fn_io.WriteTransformFn("gs://relation_extraction/beam/"))

        p.run().wait_until_finish()
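The snippet above also relies on module-level names (preprocessing_fn, train_metadata, printy, ReadBigQuery) defined elsewhere in the source; a purely illustrative stand-in for the two simplest of them, assuming a single string feature, could be:

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

# Hypothetical metadata for the training data; the feature name and spec
# are illustrative, not from the original source.
train_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        'sentence': tf.FixedLenFeature([], tf.string),
    }))

def printy(element):
    # Debug helper: print each element flowing through the pipeline.
    print(element)
    return element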
Example No. 9
def main(argv=None):
  """Run Preprocessing as a Dataflow."""
  args = parse_arguments(sys.argv if argv is None else argv)
  temp_dir = os.path.join(args.output_dir, 'tmp')

  if args.cloud:
    pipeline_name = 'DataflowRunner'
  else:
    pipeline_name = 'DirectRunner'

  options = {
      'job_name': args.job_name,
      'temp_location': temp_dir,
      'project': args.project_id,
      'setup_file':
          os.path.abspath(os.path.join(
              os.path.dirname(__file__),
              'setup.py')),
  }
  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)

  with beam.Pipeline(pipeline_name, options=pipeline_options) as p:
    with tft.Context(temp_dir=temp_dir):
      preprocess(
          pipeline=p,
          args=args)
Example No. 10
  def setUp(self):
    super(VocabularyIntegrationV2Test, self).setUp()
    tft_unit.skip_if_not_tf2('Tensorflow 2.x required')
    tf.compat.v1.logging.info('Starting test case: %s', self._testMethodName)
    self._force_tf_compat_v1_context = beam_impl.Context(
        force_tf_compat_v1=False)
    self._force_tf_compat_v1_context.__enter__()
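The excerpt shows only setUp entering the Context; the matching tearDown is not included here, so the following is just an assumed counterpart that exits the context entered above:

  def tearDown(self):
    # Assumed counterpart to setUp: leave the force_tf_compat_v1 context
    # before the base class tears the test down.
    self._force_tf_compat_v1_context.__exit__(None, None, None)
    super(VocabularyIntegrationV2Test, self).tearDown()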
Example No. 11
    def testUniquesAnalyzerWithTokenization(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {
                'index':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']))
            }

        input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string, ''),
        })

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'index': [0, 0, 1],
        }, {
            'index': [0, 2, 1]
        }]
        expected_transformed_metadata = self.toMetadata(
            {'index': tf.VarLenFeature(tf.int64)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 12
def main(argv=None):
  """Run Preprocessing as a Dataflow."""

  args = parse_arguments(sys.argv if argv is None else argv)
  runner = get_pipeline_name(args.runner, args.cloud)
  if args.cloud:
    options = {
        'job_name': ('cloud-ml-sample-movielens-preprocess-{}'.format(
            datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
        'temp_location':
            os.path.join(args.output_dir, 'tmp'),
        'project':
            args.project_id,
        # TODO(b/35727492) Remove this.
        'max_num_workers':
            250,
        'setup_file':
            os.path.abspath(os.path.join(
                os.path.dirname(__file__),
                'setup.py')),
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
  else:
    pipeline_options = None

  temp_dir = os.path.join(args.output_dir, 'tmp')
  with beam.Pipeline(runner, options=pipeline_options) as pipeline:
    with tft.Context(temp_dir=temp_dir):
      preprocess(pipeline, args)
Example No. 13
    def testMultipleLevelsOfAnalysis(self):
        # Test a preprocessing function similar to scale_to_0_1 except that it
        # involves multiple interleavings of analyzers and transforms.
        def preprocessing_fn(inputs):
            scaled_to_0 = tft.map(lambda x, y: x - y, inputs['x'],
                                  tft.min(inputs['x']))
            scaled_to_0_1 = tft.map(lambda x, y: x / y, scaled_to_0,
                                    tft.max(scaled_to_0))
            return {'x_scaled': scaled_to_0_1}

        metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        input_columns = [{'x': v} for v in [4, 1, 5, 2]]
        input_dataset = (input_columns, metadata)

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed, _ = (
                input_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        output_columns, _ = transformed

        self.assertEqual(output_columns, [{
            'x_scaled': v
        } for v in [0.75, 0.0, 1.0, 0.25]])
Example No. 14
    def testNumericAnalyzersWithSparseInputs(self):
        def repeat(in_tensor, value):
            batch_size = tf.shape(in_tensor)[0]
            return tf.ones([batch_size], value.dtype) * value

        input_data = [{'a': [4, 5, 6]}, {'a': [1, 2]}]
        input_metadata = self.toMetadata({'a': tf.VarLenFeature(tf.int64)})
        input_dataset = (input_data, input_metadata)

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with self.assertRaises(TypeError):

                def min_fn(inputs):
                    return {
                        'min': tft.map(repeat, inputs['a'],
                                       tft.min(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(min_fn)

            with self.assertRaises(TypeError):

                def max_fn(inputs):
                    return {
                        'max': tft.map(repeat, inputs['a'],
                                       tft.max(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(max_fn)

            with self.assertRaises(TypeError):

                def sum_fn(inputs):
                    return {
                        'sum': tft.map(repeat, inputs['a'],
                                       tft.sum(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(sum_fn)

            with self.assertRaises(TypeError):

                def size_fn(inputs):
                    return {
                        'size': tft.map(repeat, inputs['a'],
                                        tft.size(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(size_fn)

            with self.assertRaises(TypeError):

                def mean_fn(inputs):
                    return {
                        'mean': tft.map(repeat, inputs['a'],
                                        tft.mean(inputs['a']))
                    }

                _ = input_dataset | beam_impl.AnalyzeDataset(mean_fn)
Example No. 15
def main(p=None):
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.string_to_int(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    # raw_data_p = p | beam.Create(raw_data)

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema({
            's':
            dataset_schema.ColumnSchema(
                tf.string, [], dataset_schema.FixedColumnRepresentation()),
            'y':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation()),
            'x':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation())
        }))

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (raw_data, raw_data_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

        pprint.pprint(transformed_data)
        (transformed_data
         | beam.io.WriteToText(
             '/Users/luoshixin/Personal/GCPStudy/src/tensorflow/tftransform/tmp'
         ))
Example No. 16
  def setUp(self):
    super(CachedImplTest, self).setUp()
    self.base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    self._cache_dir = os.path.join(self.base_test_dir, 'cache')

    self._context = beam_impl.Context(temp_dir=self.get_temp_dir())
    self._context.__enter__()
Example No. 17
    def testUniquesAnalyzerWithFrequencyThresholdTooHigh(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        # The generated vocabulary is expected to be empty because the
        # frequency threshold is too high.
        def preprocessing_fn(inputs):
            return {
                'index1':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-99,
                                  frequency_threshold=77),

                # As above but using a string for frequency_threshold (and changing
                # the default_value to showcase things).
                'index2':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-9,
                                  frequency_threshold='77')
            }

        input_data = [{
            'a': 'hello hello world'
        }, {
            'a': 'hello goodbye world'
        }, {
            'a': 'hello goodbye foo'
        }]
        input_schema = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string, ''),
        })

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_schema)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        # The generated vocab (ordered by frequency, then value) should be:
        # ["hello", "world", "goodbye", "foo"]. After applying
        # frequency_threshold=77 this becomes empty.
        expected_transformed_data = [{
            'index1': [-99, -99, -99],
            'index2': [-9, -9, -9]
        }, {
            'index1': [-99, -99, -99],
            'index2': [-9, -9, -9]
        }, {
            'index1': [-99, -99, -99],
            'index2': [-9, -9, -9]
        }]
        expected_transformed_schema = self.toMetadata({
            'index1':
            tf.VarLenFeature(tf.int64),
            'index2':
            tf.VarLenFeature(tf.int64)
        })
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_schema))
Example No. 18
def data_transform():
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = ((dict_features, data_metadata) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        
        transformed_data, transformed_metadata = transformed_dataset
        
        for i in range(len(transformed_data)):
            print('Raw: ', dict_features[i])
            print('Transformed: ', transformed_data[i])
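data_transform() assumes module-level dict_features, data_metadata and preprocessing_fn; a minimal illustrative stand-in (feature name and values are assumptions, not from the original source) might be:

import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

# A toy in-memory dataset and its metadata.
dict_features = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({'x': tf.io.FixedLenFeature([], tf.float32)}))

def preprocessing_fn(inputs):
    # Scale the single numeric feature to [0, 1].
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}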
Example No. 19
def run(pipeline_options, known_args):
  pipeline = beam.Pipeline(options=pipeline_options)

  with impl.Context(known_args.transform_temp_dir):
    articles = (
        pipeline
        | 'Get Paths' >> beam.Create(get_paths(known_args.file_pattern))
        | 'Get Articles' >> beam.Map(get_articles)
        | 'Get Article' >> beam.FlatMap(lambda x: x)
    )

    dataset = (articles, get_metadata())

    transform_fn = (
        dataset
        | 'Analyse dataset' >> impl.AnalyzeDataset(preprocess_fn)
    )

    transformed_data_with_meta = (
        (dataset, transform_fn)
        | 'Transform dataset' >> impl.TransformDataset()
    )

    transformed_data, transformed_metadata = transformed_data_with_meta

    transform_fn | 'Export Transform Fn' >> transform_fn_io.WriteTransformFn(
        known_args.transform_export_dir)

    (
        transformed_data
        | 'Convert to Insertable data' >> beam.Map(to_bq_row)
        | 'Write to BigQuery table' >> beam.io.WriteToBigQuery(
            project=known_args.bq_project,
            dataset=known_args.bq_dataset,
            table=known_args.bq_table,
            schema=get_bigquery_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )

    if known_args.enable_tfrecord:
      transformed_data | 'Write TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
          file_path_prefix='{0}/{1}'.format(known_args.tfrecord_export_dir, 'reuter'),
          file_name_suffix='.tfrecords',
          coder=tft_coders.example_proto_coder.ExampleProtoCoder(transformed_metadata.schema))

    if known_args.enable_debug:
      transformed_data | 'Debug Output' >> beam.io.textio.WriteToText(
          file_path_prefix=known_args.debug_output_prefix, file_name_suffix='.txt')


  job = pipeline.run()

  if pipeline_options.get_all_options()['runner'] == 'DirectRunner':
    job.wait_until_finish()
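run() also depends on helpers such as get_paths, get_articles, get_metadata, preprocess_fn and to_bq_row defined elsewhere; as one illustrative example, a to_bq_row compatible with beam.io.WriteToBigQuery (which expects one dict per row) might be sketched as:

def to_bq_row(element):
    # Illustrative helper, not from the original source: transformed examples
    # are already dicts keyed by column name, so just coerce any numpy values
    # to plain Python types the BigQuery sink can serialize.
    return {key: value.tolist() if hasattr(value, 'tolist') else value
            for key, value in element.items()}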
Example No. 20
def main():
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.string_to_int(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema({
            's':
            dataset_schema.ColumnSchema(
                tf.string, [], dataset_schema.FixedColumnRepresentation()),
            'y':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation()),
            'x':
            dataset_schema.ColumnSchema(
                tf.float32, [], dataset_schema.FixedColumnRepresentation())
        }))

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        transform_fn = ((raw_data, raw_data_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        transformed_dataset = (((raw_data, raw_data_metadata), transform_fn)
                               | beam_impl.TransformDataset())

    # pylint: disable=unused-variable
    transformed_data, transformed_metadata = transformed_dataset

    pprint.pprint(transformed_data)
Example No. 21
def data_transform():

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (dict_features, data_metadata) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)) # Preprocessing_fn automatically called with the correct inputs

    transformed_data, transformed_metadata = transformed_dataset # Break down the dataset

    for i in range(len(transformed_data)):
        print("Raw: ", dict_features[i]) # See the raw data
        print("Transformed:", transformed_data[i]) # Compare with the transformed data
Example No. 22
def main():
    """Parses execution arguments, creates and runs processing pipeline.

  Cheks current OS. Posix OS is required for local and GCP paths consistency.

  Raises:
    OSError: Posix OS required.
    ValueError: Train validation and test size dont add up to 1.0.
  """

    if os.name != 'posix':
        raise OSError('Posix OS required.')

    args = parse_arguments(sys.argv)

    if args.train_size + args.validation_size + args.test_size != 1.0:
        raise ValueError("Train, validation and test sizes don't add up to 1.0.")

    output_dir = args.output_dir
    if args.cloud:
        output_dir = posixpath.join('gs://', args.bucket_id, output_dir)
        runner = 'DataflowRunner'
    else:
        output_dir = posixpath.join('.', output_dir)
        runner = 'DirectRunner'

    temp_dir = posixpath.join(output_dir, 'tmp')

    options = {
        'project':
        args.project_id,
        'job_name':
        '{}-{}'.format(args.project_id,
                       datetime.now().strftime('%Y%m%d%H%M%S')),
        'setup_file':
        posixpath.abspath(
            posixpath.join(posixpath.dirname(__file__), 'setup.py')),
        'temp_location':
        temp_dir,
        'save_main_session':
        True
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        with beam_impl.Context(temp_dir=temp_dir):
            preprocess(p=p,
                       output_dir=output_dir,
                       check_path=args.check_path,
                       data_size=(args.train_size, args.validation_size,
                                  args.test_size),
                       bq_table=args.bq_table,
                       split_data_path=args.split_data_path,
                       project_id=args.project_id)
Example No. 23
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.
    Args:
        argv (list): list of arguments
    """
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = None

    p = beam.Pipeline(options=pipeline_options)
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # read data and join by key
        raw_data_input = (p
                          | 'ReadInputData' >> beam.io.ReadFromText(
                              TRAIN_INPUT_DATA, skip_header_lines=1)
                          | 'ParseInputCSV' >> beam.Map(converter_input.decode)
                          | 'ExtractBatchKeyIn' >> beam.Map(extract_batchkey))

        raw_data_output = (
            p
            | 'ReadOutputData' >> beam.io.ReadFromText(TRAIN_OUTPUT_DATA,
                                                       skip_header_lines=1)
            | 'ParseOutputCSV' >> beam.Map(converter_output.decode)
            | 'ExtractBatchKeyOut' >> beam.Map(extract_batchkey))

        raw_data = ((raw_data_input, raw_data_output)
                    | 'JoinData' >> beam.CoGroupByKey()
                    | 'RemoveKeys' >> beam.FlatMap(remove_keys))

        # analyse and transform dataset
        raw_dataset = (raw_data, input_metadata)
        transformed_dataset, transform_fn = (
            raw_dataset
            | 'AnalyzeAndTransform' >>
            beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_data, transformed_metadata = transformed_dataset

        # save data and serialize TransformFn
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'EncodeData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteData' >> tfrecordio.WriteToTFRecord(
                 os.path.join(TFRECORD_DIR, 'records')))
        _ = (transform_fn
             |
             "WriteTransformFn" >> transform_fn_io.WriteTransformFn(MODEL_DIR))

        p.run().wait_until_finish()
Example No. 24
def main():
    """Configures and runs a pipeline."""
    args = parse_arguments(sys.argv)
    config = parse_config("CLOUD" if args.cloud else "LOCAL",
                          get_relative_path("config.ini"))
    set_logging(config.get("log_level"))
    options = get_pipeline_options(args, config)
    runner = str(config.get("runner"))

    with beam.Pipeline(runner, options=options) as pipeline:
        with beam_impl.Context(
                temp_dir=os.path.join(args.tft_dir, constants.TMP_DIR)):
            preprocess.run(pipeline, args)
Example No. 25
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data,
                                         expected_metadata=None):
        """Assert that input data and metadata is transformed as expected.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: A dataset with the same type constraints as input_data,
          but representing the output after transformation.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
    """
        temp_dir = self.get_temp_dir()
        with beam_impl.Context(temp_dir=temp_dir):
            # Note: we don't separately test AnalyzeDataset and TransformDataset as
            # AnalyzeAndTransformDataset currently simply composes these two
            # transforms.  If, in future versions of the code, the implementation
            # differs, we should also run AnalyzeDataset and TransformDataset composed.
            #
            # Also, the dataset_metadata that is returned along with
            # `transformed_data` is incomplete as it does not contain the deferred
            # components, so we instead inspect the metadata returned along with the
            # transform function.
            (transformed_data,
             _), (_, (transformed_metadata, deferred_metadata)) = (
                 (input_data, input_metadata)
                 | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        self.assertDataCloseOrEqual(expected_data, transformed_data)
        if expected_metadata:
            # deferred_metadata should be a singleton PCollection.
            self.assertEqual(len(deferred_metadata), 1)
            unresolved_futures = transformed_metadata.substitute_futures(
                deferred_metadata[0])
            self.assertEqual(unresolved_futures, [])
            # Use extra assertEqual for schemas, since full metadata assertEqual error
            # message is not conducive to debugging.
            self.assertEqual(expected_metadata.schema.column_schemas,
                             transformed_metadata.schema.column_schemas)
            self.assertEqual(expected_metadata, transformed_metadata)
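As a usage illustration (not part of the original source), a test built on this helper, reusing the data from Example No. 1, might look like:

    def testScaleTo01(self):
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        self.assertAnalyzeAndTransformResults(
            input_data=[{'x': 4}, {'x': 1}, {'x': 5}, {'x': 2}],
            input_metadata=self.toMetadata(
                {'x': tf.FixedLenFeature((), tf.float32, 0)}),
            preprocessing_fn=preprocessing_fn,
            expected_data=[{'x_scaled': 0.75}, {'x_scaled': 0.0},
                           {'x_scaled': 1.0}, {'x_scaled': 0.25}])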
Example No. 26
    def build_graph(self):
        # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for graph building.
        # num_lines = 0
        # for i in range(DATASET_NUM_SHARDS):
        #     _fname = '{}-{:05}-of-{:05}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
        #     num_lines += sum(1 for _ in open(_fname))
        #     _fname_marked = '{}-{:05}-of-{:05}.{}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS,
        #                                                   PPGRAPH_EXT)
        #     shutil.move(_fname, _fname_marked)
        #     if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
        #         break

        # Set up the preprocessing pipeline for analyzing the dataset. The analyze
        # call is not combined with the transform call because we will parallelize
        # the transform call later; the combined process runs on a single core and
        # tends to cause OOM issues.
        pipeline = beam.Pipeline(runner=DirectRunner())

        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # TODO: maybe only use train data (or a percentage of it) to build the graph.
            raw_train_data = (
                pipeline
                | 'ReadTrainDataFile' >> textio.ReadFromText(
                    'data/features' + '*' + 'shard' + '*', skip_header_lines=0)
                | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                    tft_coders.CsvCoder(
                        self.data_formatter.get_ordered_columns(),
                        self.data_formatter.get_raw_data_metadata().schema).
                    decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            # That is when to use vocabulary, scale_to_0_1 or sparse_to_dense ...
            transform_fn = (
                (raw_train_data, self.data_formatter.get_raw_data_metadata())
                | beam_impl.AnalyzeDataset(
                    PreprocessingFunction().transform_to_tfrecord))

            # Write SavedModel and metadata to two subdirectories of working_dir, given by
            # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
            _ = (transform_fn
                 | 'WriteTransformGraph' >>
                 transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

        # Run the Beam preprocessing pipeline.
        st = time.time()
        result = pipeline.run()
        result.wait_until_finish()
        self.logger.info(
            'Transformation graph built and written in {:.2f} sec'.format(
                time.time() - st))
Example No. 27
    def testNumericAnalyzersWithNDInputs(self):
        def preprocessing_fn(inputs):
            def repeat(in_tensor, value):
                batch_size = tf.shape(in_tensor)[0]
                return tf.ones([batch_size], value.dtype) * value

            return {
                'min': tft.map(repeat, inputs['a'], tft.min(inputs['a'])),
                'max': tft.map(repeat, inputs['a'], tft.max(inputs['a'])),
                'sum': tft.map(repeat, inputs['a'], tft.sum(inputs['a'])),
                'size': tft.map(repeat, inputs['a'], tft.size(inputs['a'])),
                'mean': tft.map(repeat, inputs['a'], tft.mean(inputs['a']))
            }

        input_data = [{'a': [[4, 5], [6, 7]]}, {'a': [[1, 2], [3, 4]]}]
        input_metadata = self.toMetadata(
            {'a': tf.FixedLenFeature((2, 2), tf.int64)})
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'min': 1,
            'max': 7,
            'sum': 32,
            'size': 8,
            'mean': 4.0
        }, {
            'min': 1,
            'max': 7,
            'sum': 32,
            'size': 8,
            'mean': 4.0
        }]
        expected_transformed_metadata = self.toMetadata({
            'min':
            tf.FixedLenFeature((), tf.int64, None),
            'max':
            tf.FixedLenFeature((), tf.int64, None),
            'sum':
            tf.FixedLenFeature((), tf.int64, None),
            'size':
            tf.FixedLenFeature((), tf.int64, None),
            'mean':
            tf.FixedLenFeature((), tf.float64, None)
        })
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 28
    def testUniquesAnalyzer(self):
        # User defined preprocessing_fn accepts and returns a dict of Columns.
        def preprocessing_fn(inputs):
            return {'index': tft.string_to_int(inputs['a'])}

        input_data = [{
            'a': 'hello'
        }, {
            'a': 'world'
        }, {
            'a': 'hello'
        }, {
            'a': 'hello'
        }, {
            'a': 'goodbye'
        }, {
            'a': 'world'
        }, {
            'a': 'aaaaa'
        }]
        input_metadata = self.toMetadata({
            'a':
            tf.FixedLenFeature((), tf.string),
        })
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transformed_dataset, _ = (
                (input_data, input_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        expected_transformed_data = [{
            'index': 0
        }, {
            'index': 1
        }, {
            'index': 0
        }, {
            'index': 0
        }, {
            'index': 2
        }, {
            'index': 1
        }, {
            'index': 3
        }]
        expected_transformed_metadata = self.toMetadata(
            {'index': tf.FixedLenFeature((), tf.int64)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))
Example No. 29
def run_tft_pipeline(args):
    """
    This is where all the data we have available in our database is processed and 
    transformed into Tensorflow tfrecords for later training and testing.

    The code runs in distributed manner automatically in the engine choosen by
    the `runner` argument in input.
    """
    pipeline_options = build_pipeline_options(args)
    temp_tft_folder = (tempfile.mkdtemp(
        dir='/tmp/') if not args.tft_temp else args.tft_temp)
    tft_transform_folder = (tempfile.mkdtemp(
        dir='/tmp/') if not args.tft_transform else args.tft_transform)

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=temp_tft_folder):

            train_data = read_input_data(args, pipeline, 'train')

            write_total_distinct_keys_to_file(train_data, args.nitems_filename,
                                              'sku')

            train_dataset = (train_data, metadata.RAW_DATA_METADATA)
            (train_data, transformed_train_metadata), transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))

            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(tft_transform_folder))

            train_data = aggregate_transformed_data(train_data, 'train')

            write_tfrecords(train_data, metadata.OUTPUT_TRAIN_SCHEMA,
                            args.output_train_filename, 'output train')

            test_data = read_input_data(args, pipeline, 'test')

            test_dataset = (test_data, metadata.RAW_DATA_METADATA)

            (test_data,
             _) = ((test_dataset, transform_fn) | beam_impl.TransformDataset())

            test_data = aggregate_transformed_data(test_data, 'test')

            test_data = aggregate_final_test_data(train_data, test_data)

            write_tfrecords(test_data, metadata.OUTPUT_TEST_SCHEMA,
                            args.output_test_filename, 'output test')
Example No. 30
def main():
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        x = inputs['x']
        y = inputs['y']
        s = inputs['s']
        x_centered = x - tft.mean(x)
        y_normalized = tft.scale_to_0_1(y)
        s_integerized = tft.compute_and_apply_vocabulary(s)
        x_centered_times_y_normalized = (x_centered * y_normalized)
        return {
            'x_centered': x_centered,
            'y_normalized': y_normalized,
            'x_centered_times_y_normalized': x_centered_times_y_normalized,
            's_integerized': s_integerized
        }

    raw_data = [{
        'x': 1,
        'y': 1,
        's': 'hello'
    }, {
        'x': 2,
        'y': 2,
        's': 'world'
    }, {
        'x': 3,
        'y': 3,
        's': 'hello'
    }]

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            's':
            tf.io.FixedLenFeature([], tf.string),
            'y':
            tf.io.FixedLenFeature([], tf.float32),
            'x':
            tf.io.FixedLenFeature([], tf.float32),
        }))

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (raw_data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

    pprint.pprint(transformed_data)
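Working through the arithmetic for the three rows above: mean(x) is 2, so x_centered becomes [-1, 0, 1]; y is scaled to [0, 0.5, 1]; the vocabulary maps 'hello' (most frequent) to 0 and 'world' to 1; and x_centered_times_y_normalized is the elementwise product. The final pprint output is therefore approximately:

    [{'s_integerized': 0, 'x_centered': -1.0,
      'x_centered_times_y_normalized': 0.0, 'y_normalized': 0.0},
     {'s_integerized': 1, 'x_centered': 0.0,
      'x_centered_times_y_normalized': 0.0, 'y_normalized': 0.5},
     {'s_integerized': 0, 'x_centered': 1.0,
      'x_centered_times_y_normalized': 1.0, 'y_normalized': 1.0}]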