Example #1
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--cloud', type=str,
                        help="Run on Google Cloud Dataflow if set to 'y'")
    args = parser.parse_args(argv)  # Parse the arguments
    if args.cloud == "y":
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = beam.pipeline.PipelineOptions(
            flags=[], **{'project': "iotpubsub-1536350750202"})
    with beam_impl.Context(temp_dir="gs://relation_extraction/beam"):
        p = beam.Pipeline(options=pipeline_options)
        train_data, test_data = (p | "Read from bigquery" >> ReadBigQuery())

        (test_data | "test it" >> beam.Map(printy))
        train_data = (train_data, train_metadata)
        train_dataset, transform_fn = (train_data
                                            | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn)
                                            )
        test_data = (test_data, train_metadata)
        test_data, _ = ((test_data, transform_fn) | 'Transform test data' >> beam_impl.TransformDataset())
        train_data, transformed_metadata = train_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        _ = (train_data
                | 'Encode train data to save it' >> beam.Map(transformed_data_coder.encode)
                | 'Write the train data to tfrecords' >> tfrecordio.WriteToTFRecord(os.path.join("gs://relation_extraction/beam/Train","TRAIN"))
                )
        _ = (test_data
                | 'Encode test data to save it' >> beam.Map(transformed_data_coder.encode)
                | 'Write the test data to tfrecords' >> tfrecordio.WriteToTFRecord(os.path.join("gs://relation_extraction/beam/Test","TEST"))
                )
        _ = (transform_fn | "WriteTransformFn" >> transform_fn_io.WriteTransformFn("gs://relation_extraction/beam/"))

        p.run().wait_until_finish()
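Example #1 calls a get_cloud_pipeline_options() helper that is not shown on this page. A minimal sketch of such a helper, assuming a Dataflow run with placeholder project, bucket and region values, could look like this:

import datetime

import apache_beam as beam


def get_cloud_pipeline_options():
    """Hypothetical helper: build PipelineOptions for a Dataflow run.

    The project ID, bucket and region below are placeholders, not values
    taken from the original example.
    """
    options = {
        'runner': 'DataflowRunner',
        'project': 'my-gcp-project',                   # assumption
        'job_name': 'preprocess-{}'.format(
            datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
        'staging_location': 'gs://my-bucket/staging',  # assumption
        'temp_location': 'gs://my-bucket/tmp',         # assumption
        'region': 'europe-west1',                      # assumption
        'save_main_session': True,
    }
    return beam.pipeline.PipelineOptions(flags=[], **options)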
Example #2
def read_and_shuffle_data(train_neg_filepattern, train_pos_filepattern,
                          test_neg_filepattern, test_pos_filepattern,
                          working_dir):
    """Read and shuffle the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, shuffle it
  and write it out in TFRecord format.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    working_dir: Directory to write shuffled data to
  """
    with beam.Pipeline() as pipeline:
        coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)

        # pylint: disable=no-value-for-parameter
        _ = (pipeline
             | 'ReadAndShuffleTrain' >> ReadAndShuffleData(
                 (train_neg_filepattern, train_pos_filepattern))
             | 'EncodeTrainData' >> beam.Map(coder.encode)
             | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                 os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE)))

        _ = (pipeline
             | 'ReadAndShuffleTest' >> ReadAndShuffleData(
                 (test_neg_filepattern, test_pos_filepattern))
             | 'EncodeTestData' >> beam.Map(coder.encode)
             | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                 os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE)))
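Examples #2 and #3 both rely on a ReadAndShuffleData transform defined elsewhere. A simplified sketch of what such a composite transform might do, assuming the element dictionaries use 'review' and 'label' keys, is shown below:

import apache_beam as beam


@beam.ptransform_fn
def ReadAndShuffleData(pcoll, filepatterns):
    """Hypothetical sketch: read negative and positive example files,
    attach integer labels, and shuffle the combined collection."""
    neg_filepattern, pos_filepattern = filepatterns

    negative = (pcoll.pipeline
                | 'ReadNegative' >> beam.io.ReadFromText(neg_filepattern)
                | 'LabelNegative' >> beam.Map(
                    lambda line: {'review': line, 'label': 0}))  # keys are assumptions
    positive = (pcoll.pipeline
                | 'ReadPositive' >> beam.io.ReadFromText(pos_filepattern)
                | 'LabelPositive' >> beam.Map(
                    lambda line: {'review': line, 'label': 1}))

    return ((negative, positive)
            | 'Flatten' >> beam.Flatten()
            | 'Shuffle' >> beam.Reshuffle())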
Example #3
def read_and_shuffle_data(train_neg_filepattern, train_pos_filepattern,
                          test_neg_filepattern, test_pos_filepattern,
                          shuffled_train_filebase, shuffled_test_filebase):
    """Read and shuffle the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, shuffle it
  and write it out in TFRecord format.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    shuffled_train_filebase: Base filename for shuffled training data shards
    shuffled_test_filebase: Base filename for shuffled test data shards
  """
    with beam.Pipeline() as pipeline:
        # pylint: disable=no-value-for-parameter
        _ = (pipeline
             | 'ReadAndShuffleTrain' >> ReadAndShuffleData(
                 (train_neg_filepattern, train_pos_filepattern))
             | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                 shuffled_train_filebase,
                 coder=example_proto_coder.ExampleProtoCoder(
                     RAW_DATA_METADATA.schema)))
        _ = (pipeline
             | 'ReadAndShuffleTest' >> ReadAndShuffleData(
                 (test_neg_filepattern, test_pos_filepattern))
             | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                 shuffled_test_filebase,
                 coder=example_proto_coder.ExampleProtoCoder(
                     RAW_DATA_METADATA.schema)))
Example #4
def expand(self, features):
    return (
        features
        | 'Write to %s' % self._file_path_prefix.replace('/', '_') >>
        tfrecordio.WriteToTFRecord(file_path_prefix=self._file_path_prefix,
                                   file_name_suffix='.tfrecord.gz',
                                   coder=ExampleProtoCoder()))
Example #5
def run_pipeline():
  '''
  Apache Beam pipeline.
  Args: None
  '''
  args, pipeline_args = get_args()
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  read_textline_from_csv = beam.io.ReadFromText(
      args.csv_path, skip_header_lines=1)
  load_img_from_path = LoadImageDoFn()

  augment_data = PreprocessImagesDoFn()

  img_to_tfexample = ImageToTfExampleDoFn()

  write_to_tf_record = tfrecordio.WriteToTFRecord(
      file_path_prefix='gs://bucket_name/Apache_beam_records/Test_records/',
      num_shards=20)

  with beam.Pipeline(options=pipeline_options) as pipe:
    _ = (pipe
         | 'ReadCSVFromText' >> read_textline_from_csv
         | 'LoadImageData' >> beam.ParDo(load_img_from_path)
         | 'PreprocessImages' >> beam.ParDo(augment_data)
         | 'ImageToTfExample' >> beam.ParDo(img_to_tfexample)
         | 'SerializeProto' >> beam.Map(lambda x: x.SerializeToString())
         | 'WriteTfRecord' >> write_to_tf_record)
    print('Done running')
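Example #5 wires together several DoFns (LoadImageDoFn, PreprocessImagesDoFn, ImageToTfExampleDoFn) whose definitions are not included. Purely as an illustration, a minimal DoFn in the spirit of ImageToTfExampleDoFn, assuming each element is a dict with encoded image bytes and an integer label, might be:

import apache_beam as beam
import tensorflow as tf


class ImageToTfExampleDoFn(beam.DoFn):
    """Hypothetical sketch: pack image bytes and a label into a tf.train.Example."""

    def process(self, element):
        # The 'image_bytes' and 'label' keys are assumptions about the
        # upstream DoFns, not taken from the original example.
        feature = {
            'image': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[element['image_bytes']])),
            'label': tf.train.Feature(
                int64_list=tf.train.Int64List(value=[element['label']])),
        }
        yield tf.train.Example(features=tf.train.Features(feature=feature))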
Example #6
def expand(self, features):
    return (features
            | 'Write to %s' % self._file_path_prefix.replace('/', '_') >>
            tfrecordio.WriteToTFRecord(
                file_path_prefix=self._file_path_prefix,
                file_name_suffix='.tfrecord.gz',
                shard_name_template=fileio.DEFAULT_SHARD_NAME_TEMPLATE,
                coder=ExampleProtoCoder(),
                compression_type=fileio.CompressionTypes.AUTO))
Example #7
def main(argv=None):
    '''Run Preprocessing as a Dataflow pipeline.'''
    args = parse_arguments(sys.argv if argv is None else argv)
    if args.cloud:
        logging.info('Start running in the cloud')
        options = {
            'runner': 'DataflowRunner',
            'job_name': ('mlengine-boilerplate-{}'.format(
                datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
            'staging_location': os.path.join(args.output_dir, 'staging'),
            'temp_location': os.path.join(args.output_dir, 'tmp'),
            'project': args.project_id,
            'zone': 'europe-west1-d',
            'autoscaling_algorithm': 'THROUGHPUT_BASED',
            'save_main_session': True,
            'setup_file': './setup.py',
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        print(pipeline_options)
    else:
        pipeline_options = None

    train_coder = coders.ExampleProtoCoder(schema)

    p = beam.Pipeline(options=pipeline_options)

    examples = (p
                | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '/*',
                                                     skip_header_lines=1)
                | 'buildExamples' >> beam.FlatMap(buildExample))

    examples_split = examples | beam.Partition(partition_fn, 3)
    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    for part, examples in example_dict.items():
        _ = examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join(args.output_dir, part + '_examples'),
            compression_type=filesystem.CompressionTypes.GZIP,
            coder=train_coder,
            file_name_suffix='.gz')

    p.run()
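Example #7 (like Example #12 further down) splits the examples with a partition_fn that is not shown. A deterministic sketch, assuming a roughly 80/10/10 train/validation/test split keyed on a hash of the element, could be:

import hashlib


def partition_fn(example, num_partitions):
    """Hypothetical sketch: assign ~80% of examples to train (0),
    ~10% to validation (1) and ~10% to test (2)."""
    assert num_partitions == 3
    # Hash the element's string representation so the split is deterministic.
    bucket = int(hashlib.md5(str(example).encode('utf-8')).hexdigest(), 16) % 10
    if bucket < 8:
        return 0   # train
    if bucket == 8:
        return 1   # validation
    return 2       # test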
Example #8
def run(argv=None):
    """Runs the revise preprocessed data pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
    pipeline_options = PipelineOptions(flags=argv)
    revise_options = pipeline_options.view_as(ReviseOptions)
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    output_dir = os.path.join(
        revise_options.output,
        datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
    cloud_options.temp_location = os.path.join(output_dir, 'tmp')
    cloud_options.job_name = 'relabel-examples-%s' % (
        datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    metadata_query = str(
        Template(open(revise_options.metadata,
                      'r').read()).render(METADATA_QUERY_REPLACEMENTS))
    logging.info('metadata query : %s', metadata_query)

    with beam.Pipeline(options=pipeline_options) as p:
        # Gather our sample metadata into a python dictionary.
        samples_metadata = (
            p
            | 'ReadSampleMetadata' >> beam.io.Read(
                beam.io.BigQuerySource(query=metadata_query,
                                       use_standard_sql=True))
            | 'TableToDictionary' >> beam.CombineGlobally(
                util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

        # Read the tf.Example protos into a PCollection.
        examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
            file_pattern=revise_options.input,
            compression_type=CompressionTypes.GZIP)

        # Filter the TensorFlow Example Protocol Buffers.
        filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
            filter_and_revise_example,
            beam.pvalue.AsSingleton(samples_metadata)))

        # Write the subset of tf.Example protos to Cloud Storage.
        _ = (filtered_examples
             | 'SerializeExamples' >>
             beam.Map(lambda example: example.SerializeToString())
             | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
                 file_path_prefix=os.path.join(output_dir, 'examples'),
                 compression_type=CompressionTypes.GZIP,
                 file_name_suffix='.tfrecord.gz'))
Example #9
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.
    Args:
        argv (list): list of arguments
    """
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = None

    p = beam.Pipeline(options=pipeline_options)
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # read data and join by key
        raw_data_input = (p
                          | 'ReadInputData' >> beam.io.ReadFromText(
                              TRAIN_INPUT_DATA, skip_header_lines=1)
                          | 'ParseInputCSV' >> beam.Map(converter_input.decode)
                          | 'ExtractBatchKeyIn' >> beam.Map(extract_batchkey))

        raw_data_output = (
            p
            | 'ReadOutputData' >> beam.io.ReadFromText(TRAIN_OUTPUT_DATA,
                                                       skip_header_lines=1)
            | 'ParseOutputCSV' >> beam.Map(converter_output.decode)
            | 'ExtractBatchKeyOut' >> beam.Map(extract_batchkey))

        raw_data = ((raw_data_input, raw_data_output)
                    | 'JoinData' >> beam.CoGroupByKey()
                    | 'RemoveKeys' >> beam.FlatMap(remove_keys))

        # analyse and transform dataset
        raw_dataset = (raw_data, input_metadata)
        transformed_dataset, transform_fn = (
            raw_dataset
            | 'AnalyzeAndTransform' >>
            beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_data, transformed_metadata = transformed_dataset

        # save data and serialize TransformFn
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'EncodeData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteData' >> tfrecordio.WriteToTFRecord(
                 os.path.join(TFRECORD_DIR, 'records')))
        _ = (transform_fn
             | "WriteTransformFn" >> transform_fn_io.WriteTransformFn(MODEL_DIR))

        p.run().wait_until_finish()
Example #10
def store_transformed_data(data, schema, path, name=''):
    """Stores data from input pipeline into TFRecord in the specified path.

  Args:
    data: `PCollection`, input pipeline.
    schema: `DatasetMetadata` object, describes schema of the input pipeline.
    path: string, where to write output.
    name: string, name describing the pipeline to be written.

  Returns:
    PCollection
  """

    p = (data
         | 'WriteData{}'.format(name) >> tfrecordio.WriteToTFRecord(
             path, coder=example_proto_coder.ExampleProtoCoder(schema.schema)))
    return p
Example #11
def pipeline(root):
  """Method to pass into flume runner."""
  _ = (
      root
      | 'Read RecordIO TSV' >> beam.io.ReadFromText(FLAGS.input_path)
      | 'Validate sentence pair' >> beam.ParDo(
          ValidateSentencePair(FLAGS.min_edit_distance))
      | 'Select TSV columns' >> beam.ParDo(
          SelectTSVColumns(
              source_column=FLAGS.tsv_source_column,
              target_column=FLAGS.tsv_target_column))
      | 'Reshuffle' >> beam.Reshuffle()
      | 'Batch elements' >> beam.BatchElements(
          min_batch_size=1024, max_batch_size=1024)
      | 'Make tf.Examples' >> beam.ParDo(
          PrepareTfExamples(
              spm_path=FLAGS.spm_path,
              packed_length=FLAGS.packed_length,
              pad_length=FLAGS.pad_length))
      | 'Write to tf.Record' >> tfrecordio.WriteToTFRecord(FLAGS.output_path))
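The pipeline(root) function above only builds the graph; it expects to be handed a root pipeline by a runner. A minimal, hypothetical driver (the DirectRunner option is a placeholder for the flume runner mentioned in the docstring, and the original relies on FLAGS defined elsewhere) could be:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

if __name__ == '__main__':
    # Placeholder options; swap in the real runner and flags as needed.
    with beam.Pipeline(options=PipelineOptions(['--runner=DirectRunner'])) as root:
        pipeline(root)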
Example #12
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.

    Args:
        argv (list): list of arguments

    """
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options(args.project_id,
                                                      args.output_dir)
    else:
        pipeline_options = None

    pipeline = beam.Pipeline(options=pipeline_options)

    examples = (
        pipeline
        # | 'ReadData' >> beam.Create(open('data/test.csv')
        #                             .readlines()[1:])
        | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '*',
                                             skip_header_lines=1)
        | 'BuildExamples' >> beam.FlatMap(build_example))

    examples_split = examples | beam.Partition(partition_fn, 3)

    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    for part, examples in example_dict.items():
        examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join(TFRECORD_DIR, part + '_examples'),
            compression_type=filesystem.CompressionTypes.GZIP,
            coder=coders.ExampleProtoCoder(schema),
            file_name_suffix='.tfrecord.gz')

    pipeline.run().wait_until_finish()
Example #13
def run(argv=None):
    """Runs the sparse measurements preprocess pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
    pipeline_options = PipelineOptions(flags=argv)
    preprocess_options = pipeline_options.view_as(PreprocessOptions)
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    output_dir = os.path.join(
        preprocess_options.output,
        datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
    cloud_options.temp_location = os.path.join(output_dir, 'tmp')
    cloud_options.job_name = 'preprocess-measurements-%s' % (
        datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    data_query = str(
        Template(open(preprocess_options.input,
                      'r').read()).render(DATA_QUERY_REPLACEMENTS))
    logging.info('data query : %s', data_query)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the table rows into a PCollection.
        rows = p | 'ReadMeasurements' >> beam.io.Read(
            beam.io.BigQuerySource(query=data_query, use_standard_sql=True))

        # Convert the data into TensorFlow Example Protocol Buffers.
        examples = measurements_to_examples(rows)

        # Write the serialized compressed protocol buffers to Cloud Storage.
        _ = (examples
             | 'EncodeExamples' >>
             beam.Map(lambda example: example.SerializeToString())
             | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
                 file_path_prefix=os.path.join(output_dir, 'examples'),
                 compression_type=CompressionTypes.GZIP,
                 file_name_suffix='.tfrecord.gz'))
Example #14
def pipeline(root):
    """Method to pass into flume runner."""
    for i, tsv_in in enumerate(
            tf.io.gfile.glob(os.path.join(FLAGS.input_path, '*.tsv'))):
        print('Processing tsv input: %s' % tsv_in)
        tfr_out = tsv_in.replace('.tsv', '.tfr')
        num_output_shards = (FLAGS.num_train_shards
                             if 'train' in tsv_in else FLAGS.num_guide_shards)
        _ = (root
             | 'Read RecordIO TSV__%s' % i >> beam.io.ReadFromText(tsv_in)
             | 'Validate sentence pair__%s' % i >> beam.ParDo(
                 ValidateSentencePair(FLAGS.min_edit_distance))
             | 'Select TSV columns__%s' % i >> beam.ParDo(
                 SelectTSVColumns(source_column=FLAGS.tsv_source_column,
                                  target_column=FLAGS.tsv_target_column))
             | 'Reshuffle__%s' % i >> beam.Reshuffle()
             | 'Batch elements__%s' % i >> beam.BatchElements(
                 min_batch_size=1024, max_batch_size=1024)
             | 'Make tf.Examples__%s' % i >> beam.ParDo(
                 PrepareTfExamples(spm_path=FLAGS.spm_path,
                                   packed_length=FLAGS.packed_length,
                                   pad_length=FLAGS.pad_length))
             | 'Write to tf.Record__%s' % i >> tfrecordio.WriteToTFRecord(
                 tfr_out, num_shards=num_output_shards))
Example #15
def run(p, params):
    """Defines Beam preprocessing pipeline.

  Performs the following:
    - Reads text files from pattern.
    - Splits text files into train and validation sets.

  Args:
    p: beam.Pipeline, root pipeline object.
    params: Object holding a set of parameters as name-value pairs.
  """

    path_pattern = os.path.join(params.input_dir, '*',
                                '*{}'.format(constants.FILE_EXTENSION))
    data = (p
            | 'ListFiles' >> beam.Create(gfile.Glob(path_pattern))
            | 'ReadFiles' >> beam.ParDo(ReadFile())
            | 'SplitData' >> beam.ParDo(
                _SplitData(),
                train_size=params.train_size,
                val_label=_DatasetType.VAL.name).with_outputs(
                    _DatasetType.VAL.name, main=_DatasetType.TRAIN.name))

    schema = dataset_schema.from_feature_spec(
        utils.get_processed_data_schema())
    for dataset in _DatasetType:
        if not dataset.value:
            continue
        _ = (
            data[dataset.name]
            | 'Shuffle{}'.format(dataset.name) >> shuffle()  # pylint: disable=no-value-for-parameter
            | 'WriteFiles{}'.format(dataset.name) >> tfrecordio.WriteToTFRecord(
                os.path.join(params.output_dir,
                             dataset.name + constants.TFRECORD),
                coder=example_proto_coder.ExampleProtoCoder(schema)))
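Example #15 routes validation rows through a tagged output of a _SplitData DoFn that is defined elsewhere. A simplified sketch of such a DoFn, assuming train_size is a fraction in [0, 1] and the split is random, might look like this:

import random

import apache_beam as beam
from apache_beam import pvalue


class _SplitData(beam.DoFn):
    """Hypothetical sketch: send a fraction of elements to the main (train)
    output and the rest to a tagged validation output."""

    def process(self, element, train_size, val_label):
        if random.random() < train_size:
            yield element                                  # main output: train
        else:
            yield pvalue.TaggedOutput(val_label, element)  # tagged output: validation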
Example #16
def preprocess(query, in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = 'preprocess-babyweight-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/babyweight/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata  
  raw_data_schema = {
    colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'key,is_male,mother_race,mother_married,cigarette_use,alcohol_use'.split(',')
  }
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'weight_pounds,mother_age,plurality,gestation_weeks'.split(',')
    })
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  def read_rawdata(p, step, test_mode):
    if step == 'train':
        selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) < 3'.format(query)
    else:
        selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) = 3'.format(query)
    if test_mode:
        selquery = selquery + ' LIMIT 100'
    #print 'Processing {} data from {}'.format(step, selquery)
    return (p 
          | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query=selquery, use_standard_sql=True))
          | '{}_cleanup'.format(step) >> beam.FlatMap(cleanup)
                   )
  
  # run Beam  
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):

      # analyze and transform training       
      raw_data = read_rawdata(p, 'train', in_test_mode)
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      
      # transform eval data
      raw_test_data = read_rawdata(p, 'eval', in_test_mode)
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))

Example #17
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes, and maps tokens to int64 indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
            train_data = (pipeline
                          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
                              os.path.join(working_dir,
                                           SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
                          | 'DecodeTrain' >> beam.Map(coder.decode))

            test_data = (pipeline
                         | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
                             os.path.join(working_dir,
                                          SHUFFLED_TEST_DATA_FILEBASE + '*'))
                         | 'DecodeTest' >> beam.Map(coder.decode))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                review_tokens = tf.string_split(review, DELIMITERS)
                review_indices = tft.string_to_int(review_tokens,
                                                   top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by string_to_int.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, RAW_DATA_METADATA)
                | 'AnalyzeAndTransform' >>
                beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            transformed_test_data, _ = (
                ((test_data, RAW_DATA_METADATA), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
            # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))
Example #18
def run(input_feature_spec,
        labels,
        feature_extraction,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
    """Runs the whole preprocessing step.

  This runs the feature extraction PTransform, validates that the data conforms
  to the schema provided, normalizes the features, and splits the dataset into
  a training and evaluation dataset.
  """

    # Populate optional arguments
    if not feature_scaling:
        feature_scaling = lambda inputs: inputs

    # Type checking
    if not isinstance(labels, list):
        raise ValueError('`labels` must be list(str). '
                         'Given: {} {}'.format(labels, type(labels)))

    if not isinstance(feature_extraction, beam.PTransform):
        raise ValueError('`feature_extraction` must be {}. '
                         'Given: {} {}'.format(beam.PTransform,
                                               feature_extraction,
                                               type(feature_extraction)))

    if not callable(feature_scaling):
        raise ValueError('`feature_scaling` must be callable. '
                         'Given: {} {}'.format(feature_scaling,
                                               type(feature_scaling)))

    if beam_options and not isinstance(beam_options, PipelineOptions):
        raise ValueError('`beam_options` must be {}. '
                         'Given: {} {}'.format(PipelineOptions, beam_options,
                                               type(beam_options)))

    if not work_dir:
        work_dir = tempfile.mkdtemp(prefix='tensorflow-preprocessing')

    tft_temp_dir = os.path.join(work_dir, 'tft-temp')
    train_dataset_dir = os.path.join(work_dir, 'train-dataset')
    eval_dataset_dir = os.path.join(work_dir, 'eval-dataset')

    transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
    if tf.io.gfile.exists(transform_fn_dir):
        tf.io.gfile.rmtree(transform_fn_dir)

    # [START dataflow_molecules_create_pipeline]
    # Build and run a Beam Pipeline
    with beam.Pipeline(options=beam_options) as p, \
         beam_impl.Context(temp_dir=tft_temp_dir):
        # [END dataflow_molecules_create_pipeline]

        # [START dataflow_molecules_feature_extraction]
        # Transform and validate the input data matches the input schema
        dataset = (
            p
            | 'Feature extraction' >> feature_extraction
            # [END dataflow_molecules_feature_extraction]
            # [START dataflow_molecules_validate_inputs]
            | 'Validate inputs' >> beam.ParDo(
                ValidateInputData(input_feature_spec)))
        # [END dataflow_molecules_validate_inputs]

        # [START dataflow_molecules_analyze_and_transform_dataset]
        # Apply the tf.Transform preprocessing_fn
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(input_feature_spec))

        dataset_and_metadata, transform_fn = (
            (dataset, input_metadata)
            | 'Feature scaling' >>
            beam_impl.AnalyzeAndTransformDataset(feature_scaling))
        dataset, metadata = dataset_and_metadata
        # [END dataflow_molecules_analyze_and_transform_dataset]

        # [START dataflow_molecules_split_to_train_and_eval_datasets]
        # Split the dataset into a training set and an evaluation set
        assert 0 < eval_percent < 100, 'eval_percent must be in the range (0, 100)'
        train_dataset, eval_dataset = (
            dataset
            | 'Split dataset' >> beam.Partition(
                lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
        # [END dataflow_molecules_split_to_train_and_eval_datasets]

        # [START dataflow_molecules_write_tfrecords]
        # Write the datasets as TFRecords
        coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

        train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
        _ = (train_dataset
             | 'Write train dataset' >> tfrecordio.WriteToTFRecord(
                 train_dataset_prefix, coder))

        eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
        _ = (eval_dataset
             | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(
                 eval_dataset_prefix, coder))

        # Write the transform_fn
        _ = (transform_fn
             | 'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
        # [END dataflow_molecules_write_tfrecords]

    return PreprocessData(input_feature_spec, labels,
                          train_dataset_prefix + '*',
                          eval_dataset_prefix + '*')
Example #19
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_KEY] = tft.apply_function(convert_label,
                                                inputs[LABEL_KEY])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with tft.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = tft.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset | tft.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
                coder=tft.ExampleProtoCoder(transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(
                    lambda line: line and line != '|1x3 Cross validator')
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
                coder=tft.ExampleProtoCoder(transformed_metadata.schema))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >> tft.WriteTransformFn(working_dir))
Example #20
def write_to_tfrecord(args):
    """
    This function is supposed to be called as a script.
    """
    # Decode arguments
    current_index, num_shards, train_split_fname_out, eval_split_fname_out, \
    exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord, working_dir, data_formatter_module_path = args

    # num_shards = "32"
    current_index, num_shards = int(current_index), int(num_shards)

    split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
        train_split_fname_out, current_index, num_shards) + '*'
    split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
        eval_split_fname_out, current_index, num_shards)

    log.info('exp_log_data_file_train_tfrecord {}'.format(
        exp_log_data_file_train_tfrecord))
    log.info('exp_log_data_file_eval_tfrecord {}'.format(
        exp_log_data_file_eval_tfrecord))
    log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
    log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

    data_formatter = import_from_uri(
        data_formatter_module_path).DataFormatter()

    # Set up the preprocessing pipeline.
    pipeline = beam.Pipeline(runner=DirectRunner())

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # Read raw data files: CSV format ordered according to the `data_formatter`, that are then converted
        # into a cleaned up format.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                split_train_file_pattern, skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        raw_eval_data = (
            pipeline
            | 'ReadEvalDataFile' >> textio.ReadFromText(
                split_eval_file_pattern, skip_header_lines=0)
            | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        # Examples in tf-example format (for model analysis purposes).
        # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
        # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
        #
        # _ = (
        #         raw_eval_data
        #         | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        #         | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
        #     '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
        #     shard_name_template='', num_shards=1)
        # )

        # Write SavedModel and metadata to two subdirectories of working_dir, given by
        # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
        transform_fn = (pipeline
                        | 'ReadTransformGraph' >>
                        transform_fn_io.ReadTransformFn(working_dir))

        # Applies the transformation `transform_fn` to the raw eval dataset
        (transformed_train_data, transformed_metadata) = (
            ((raw_train_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformTrainData' >> beam_impl.TransformDataset())

        # Applies the transformation `transform_fn` to the raw eval dataset
        (transformed_eval_data, transformed_metadata) = (
            ((raw_eval_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformEvalData' >> beam_impl.TransformDataset())

        # The data schema of the transformed data gets used to build a signature to create
        # a TFRecord (tf binary data format). This signature is a wrapper function used to
        # encode transformed data.
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (transformed_train_data
             | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteTrainDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_train_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

        _ = (transformed_eval_data
             | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteEvalDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_eval_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

    result = pipeline.run()
    result.wait_until_finish()
Example #21
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(outputs[key])

    # For all categorical columns except the label column, we generate a
    # vocabulary but do not modify the feature.  This vocabulary is instead
    # used in the trainer, by means of a feature column, to convert the feature
    # from a string to an integer id.
    for key in CATEGORICAL_FEATURE_KEYS:
      tft.vocabulary(inputs[key], vocab_filename=key)

    # For the label column we provide the mapping from string to index.
    table = tf.contrib.lookup.index_table_from_tensor(['>50K', '<=50K'])
    outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema.  To do this we
      # need to list all columns in order since the schema doesn't specify the
      # order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = tft.coders.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)

      # Read in raw data and convert using CSV converter.  Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
      # to get data into a format that the CSV converter can read, in particular
      # removing spaces after commas.
      #
      # We use MapAndFilterErrors instead of Map to filter out decode errors in
      # convert.decode which should only occur for the trailing blank line.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

      # Combine data and schema into a dataset tuple.  Note that we already used
      # the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      # Now apply transform function to test data.  In this case we remove the
      # trailing period at the end of each line, and also ignore the header line
      # that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file,
                                                  skip_header_lines=1)
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >>
          transform_fn_io.WriteTransformFn(working_dir))
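Examples #20 and #21 decode CSV lines with MapAndFilterErrors instead of a plain beam.Map, so that rows that fail to decode are dropped rather than failing the job. A simplified sketch of such a transform (the metric namespace and counter name are assumptions) is:

import apache_beam as beam


class MapAndFilterErrors(beam.PTransform):
    """Hypothetical sketch: like beam.Map, but drops elements for which the
    mapped function raises, counting them in a Beam metric."""

    class _MapAndFilterErrorsDoFn(beam.DoFn):
        def __init__(self, fn):
            self._fn = fn
            self._bad_elements = beam.metrics.Metrics.counter(
                'preprocessing', 'bad_elements')  # names are assumptions

        def process(self, element):
            try:
                yield self._fn(element)
            except Exception:  # pylint: disable=broad-except
                self._bad_elements.inc()

    def __init__(self, fn):
        super().__init__()
        self._fn = fn

    def expand(self, pcoll):
        return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))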
Example #22
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "column_names" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "schema" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes as input a SparseTensor and return a Tensor with correct default value
        Args:
          tensor: tf.SparseTensor
        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values, default_value=default_value), axis=1)
        # TODO: Update to below version
        # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor, vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(
                to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(tf.as_string(outputs[key]),
                                                                vocab_filename='vocab_' + key)

        return outputs
    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata) | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using previously created transform_fn function)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: inspect the transform function (transform_fn) produced by the previous
            # step (see the illustrative sketch at the end of this example).
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE TRANSFORMED METADATA
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(trns_output, os.path.join(
            _kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
Exemplo n.º 23
0
        beam.FlatMap(lambda stream: [stream.detrend('demean')])
        | 'Remove trend' >>
        beam.FlatMap(lambda stream: [stream.detrend('linear')])
        | 'Resample to 100 Hz' >>
        beam.FlatMap(lambda stream: [stream.resample(100)])
        | 'Trim traces' >> beam.ParDo(TrimTrace(points=3001)))

    station_location = location | '(sta, loc)' >> beam.FlatMap(
        lambda loc: [(loc['station'], loc)])
    station_pick = picks | '(sta, pick)' >> beam.FlatMap(
        lambda pick: [(pick.waveform_id.station_code, pick)])
    station_stream = streams | '(sta, stream)' >> beam.FlatMap(
        lambda stream: [(stream[0].stats.station, stream)])

    dataset = (
        {
            'pick': station_pick,
            'stream': station_stream,
            'location': station_location
        }
        | 'Join by station' >> beam.CoGroupByKey()
        | 'Drop empty station' >> beam.ParDo(DropEmptyStation())
        | 'Group stream pick by time' >> GroupStreamPick()
        | 'Generate stream PDFs' >> beam.ParDo(GeneratePDF(sigma=0.1))
        | 'Extract stream features' >> beam.ParDo(StreamFeatureExtraction()))

    transform = (dataset
                 | 'Feature to Example' >> beam.ParDo(FeatureToExample())
                 | 'Write dataset' >> tfrecordio.WriteToTFRecord(
                     tfrecord_dir,
                     # ProtoCoder needs the proto message class; the elements written
                     # here are assumed to be tf.train.Example protos.
                     coder=beam.coders.ProtoCoder(tf.train.Example)))
Exemplo n.º 24
0
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # pylint: disable=no-value-for-parameter
            train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
                (train_neg_filepattern, train_pos_filepattern))
            # pylint: disable=no-value-for-parameter
            test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
                (test_neg_filepattern, test_pos_filepattern))

            metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.Schema({
                    REVIEW_COLUMN:
                    dataset_schema.ColumnSchema(
                        tf.string, [],
                        dataset_schema.FixedColumnRepresentation()),
                    LABEL_COLUMN:
                    dataset_schema.ColumnSchema(
                        tf.int64, [],
                        dataset_schema.FixedColumnRepresentation()),
                }))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_COLUMN]

                review_tokens = tf.string_split(review, DELIMITERS)
                review_indices = tft.string_to_int(review_tokens,
                                                   top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by string_to_int.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_COLUMN: review_bow_indices,
                    REVIEW_WEIGHT: review_weight,
                    LABEL_COLUMN: inputs[LABEL_COLUMN]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, metadata)
                | 'AnalyzeAndTransform' >>
                beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            transformed_test_data, _ = (
                ((test_data, metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     transformed_train_filebase,
                     coder=example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

            _ = (transformed_test_data
                 | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                     transformed_test_filebase,
                     coder=example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
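
# Illustrative usage sketch (hypothetical paths, not from the original example):
# transform_data('data/train/neg/*.txt', 'data/train/pos/*.txt',
#                'data/test/neg/*.txt', 'data/test/pos/*.txt',
#                '/tmp/transformed/train', '/tmp/transformed/test',
#                '/tmp/transformed/metadata')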
Exemplo n.º 25
0
def preprocess_data(train_neg_file_pattern,
                    train_pos_file_pattern,
                    test_neg_file_pattern,
                    test_pos_file_pattern,
                    transformed_train_file_pattern,
                    transformed_test_file_pattern,
                    transformed_metadata_dir,
                    raw_metadata_dir,
                    transform_func_dir,
                    temp_dir,
                    vocab_size,
                    delimiters):
    """Transform the data and write out as a TFRecord of Example protos.
    Read in the data from the positive and negative examples on disk, and
    transform it using a preprocessing pipeline that removes punctuation,
    tokenizes and maps tokens to int64 values indices.

    Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data should be written


    raw_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
        REVIEW_COLUMN: dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation()),
        LABEL_COLUMN: dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation()),
    }))
    """
    pipeline_name = 'DataflowRunner'
    options = {
        'job_name': ('cloud-ml-hazmat-preprocess-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
        'temp_location': temp_dir,
        'project': "stone-outpost-636",
        'max_num_workers': 8
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    #with beam.Pipeline(pipeline_name, options=pipeline_options) as pipeline:
    #    with beam_impl.Context(temp_dir=temp_dir):
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):

            train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData((train_neg_file_pattern, train_pos_file_pattern))
            test_data = pipeline | 'ReadTest' >> ReadAndShuffleData((test_neg_file_pattern, test_pos_file_pattern))
            preprocessing_fn = generate_preprocessing_fn(vocab_size, delimiters)

            (transformed_train_data, transformed_metadata), transform_fn = ((train_data, const.RAW_METADATA)
              | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = (transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(transform_func_dir))

            transformed_test_data, _ = (((test_data, const.RAW_METADATA), transform_fn)
              | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
              | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(transformed_train_file_pattern,
                  coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))

            _ = (transformed_test_data
              | 'WriteTestData' >> tfrecordio.WriteToTFRecord(transformed_test_file_pattern,
                  coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))

            _ = (transformed_metadata
              | 'WriteTransformedMetadata' >> beam_metadata_io.WriteMetadata(transformed_metadata_dir, pipeline=pipeline))

            _ = (const.RAW_METADATA
              | 'WriteRawMetadata' >> beam_metadata_io.WriteMetadata(raw_metadata_dir, pipeline=pipeline))
Exemplo n.º 26
0
def run_transform(output_dir,
                  schema,
                  train_data_file,
                  eval_data_file,
                  project,
                  mode,
                  preprocessing_fn=None):
    """Writes a tft transform fn, and metadata files.
  Args:
    output_dir: output folder
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: 'local' to run with the DirectRunner, or 'cloud' to run on Dataflow.
    preprocessing_fn: a function used to preprocess the raw data. If not
                      specified, a function will be automatically inferred
                      from the schema.
  """

    tft_input_metadata = make_tft_input_metadata(schema)
    temp_dir = os.path.join(output_dir, 'tmp')
    preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        options = {
            'job_name':
            'pipeline-tft-' +
            datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
            'temp_location':
            temp_dir,
            'project':
            project,
            'extra_packages': [
                'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
            ]
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        with beam_impl.Context(temp_dir=temp_dir):
            names = [x['name'] for x in schema]
            converter = CsvCoder(names, tft_input_metadata.schema)
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            train_dataset = (train_data, tft_input_metadata)
            transformed_dataset, transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # Writes the transformed_metadata and transform_fn folders
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(output_dir))

            # Write the raw_metadata
            metadata_io.write_metadata(metadata=tft_input_metadata,
                                       path=os.path.join(
                                           output_dir, 'metadata'))

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            eval_data = (p
                         |
                         'ReadEvalData' >> textio.ReadFromText(eval_data_file)
                         | 'DecodeEvalData' >> beam.Map(converter.decode))

            eval_dataset = (eval_data, tft_input_metadata)

            transformed_eval_dataset = ((eval_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            transformed_eval_data, transformed_metadata = transformed_eval_dataset

            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))
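
# Illustrative usage sketch; the values are hypothetical and the exact schema dict
# format is whatever make_tft_input_metadata expects (only the 'name' key is used
# directly above):
# run_transform(output_dir='/tmp/tft_output',
#               schema=[{'name': 'age', 'type': 'NUMBER'},
#                       {'name': 'gender', 'type': 'CATEGORY'}],
#               train_data_file='/tmp/train*.csv',
#               eval_data_file='/tmp/eval*.csv',
#               project='my-gcp-project',
#               mode='local')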
Exemplo n.º 27
0
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """

  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))

      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        def remove_character(s, char):
          """Remove a character from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string
            char: A string of length 1

          Returns:
            The string `s` with the given character removed (i.e. replaced by
            '')
          """
          # Hacky implementation where we split and rejoin.
          split = tf.string_split(s, char)
          rejoined = tf.reduce_join(
              tf.sparse_to_dense(
                  split.indices, split.dense_shape, split.values, ''),
              1)
          return rejoined

        def remove_punctuation(s):
          """Remove puncuation from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string

          Returns:
            The string `s` with punctuation removed.
          """
          for char in PUNCTUATION_CHARACTERS:
            s = remove_character(s, char)
          return s
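
          # Illustrative alternative, not part of the original: current TF can do a
          # similar cleanup in one regex pass over the sparse string values, e.g.
          #   cleaned = tf.SparseTensor(
          #       s.indices,
          #       tf.strings.regex_replace(s.values, r'[[:punct:]]', ''),
          #       s.dense_shape)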

        cleaned_review = tft.map(remove_punctuation, review)
        review_tokens = tft.map(tf.string_split, cleaned_review)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        return {
            REVIEW_COLUMN: review_indices,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_metadata
          | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
              transformed_metadata_dir, pipeline=pipeline))
Exemplo n.º 28
0
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 values indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """
    raw_data_schema = {
        key:
        dataset_schema.ColumnSchema(tf.string, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in CATEGORICAL_COLUMNS
    }
    raw_data_schema.update({
        key:
        dataset_schema.ColumnSchema(tf.float32, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in NUMERIC_COLUMNS
    })
    raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation())
    raw_data_schema = dataset_schema.Schema(raw_data_schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                transformed_train_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(
                    lambda line: line and line != '|1x3 Cross validator')
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                transformed_test_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
Exemplo n.º 29
0
 # split into train and test
 train, test = (
     transformed_data
     | 'Partition train/test' >> beam.Partition(
         split_train_test, 2, split_dt=datetime(2020, 1, 1)))
 # encoder for TFRecords
 transformed_data_coder = tft.coders.ExampleProtoCoder(
     transformed_metadata.schema)
 # write train dataset
 _ = (
     train
     | 'Encode & write train -> TFRecords' >>
     tfrecordio.WriteToTFRecord(
         file_path_prefix=os.path.join(args.data_dir, 'tfrecords',
                                       args.output_dir,
                                       TRAIN_FILES_PATTERN),
         coder=transformed_data_coder,
         file_name_suffix='.gz',
         num_shards=4,
         compression_type=beam.io.filesystem.CompressionTypes.GZIP))
 # write validation dataset
 _ = (
     test
     | 'Encode & write test -> TFRecords' >>
     tfrecordio.WriteToTFRecord(
         file_path_prefix=os.path.join(args.data_dir, 'tfrecords',
                                       args.output_dir,
                                       EVAL_FILES_PATTERN),
         coder=transformed_data_coder,
         file_name_suffix='.gz',
         num_shards=1,
         compression_type=beam.io.filesystem.CompressionTypes.GZIP))
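
 # Illustrative sketch, not from the original snippet: a partition function compatible
 # with the beam.Partition call above; the 'timestamp' field name is hypothetical.
 # def split_train_test(element, num_partitions, split_dt):
 #     # Beam calls this with each element plus the extra args given to beam.Partition.
 #     return 1 if element['timestamp'] >= split_dt else 0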
def preprocess(in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = 'preprocess-taxi-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata
  raw_data_schema = {
    colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
    })
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'hourofday,passengers'.split(',')
    })
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  # run Beam  
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # save the raw data metadata
      _ = (raw_data_metadata
        | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
            os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
            pipeline=p))
      
      # analyze and transform training       
      raw_data = (p 
        | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(1, EVERY_N), use_standard_sql=True))
        | 'train_filter' >> beam.Filter(is_valid))

      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      
      # transform eval data
      raw_test_data = (p 
        | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(2, EVERY_N), use_standard_sql=True))
        | 'eval_filter' >> beam.Filter(is_valid))
      
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))
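
  # Illustrative follow-up, not part of the original example: the gzipped TFRecords
  # written above can be read back with tf.data, e.g.
  #   ds = tf.data.TFRecordDataset(
  #       tf.io.gfile.glob(os.path.join(OUTPUT_DIR, 'train*.gz')),
  #       compression_type='GZIP')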