Example no. 1
def build_pipeline(p, flags):
    """Sets up Apache Beam pipeline for execution."""

    raw_data = (
        p | 'QueryTable' >> beam.io.Read(
            beam.io.BigQuerySource(query=query.get_query(flags.bq_table),
                                   project=flags.project_id,
                                   use_standard_sql=True))
        # omit 'Generate data' step if working with real data
        | 'Generate data' >> beam.Map(_generate_fake_data)
        | 'Extract lifetime' >> beam.Map(append_lifetime_duration)
        | 'Extract label' >> beam.Map(append_label)
        | 'Generate label array' >> beam.Map(combine_censorship_duration))
    raw_train, raw_eval, raw_test = (
        raw_data | 'RandomlySplitData' >> randomly_split(
            train_size=.7, validation_size=.15, test_size=.15))
    raw_metadata = features.get_raw_dataset_metadata()
    preprocess_fn = features.preprocess_fn
    transform_fn = ((raw_train, raw_metadata)
                    | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(preprocess_fn))
    (transform_fn
     | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(flags.output_dir))

    for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                  ('Test', raw_test)]:
        transform_label = 'Transform{}'.format(dataset_type)
        t, metadata = (((dataset, raw_metadata), transform_fn)
                       | transform_label >> tft_beam.TransformDataset())
        if dataset_type == 'Train':
            (metadata | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
                os.path.join(flags.output_dir, 'transformed_metadata'),
                pipeline=p))
        write_label = 'Write{}TFRecord'.format(dataset_type)
        t | write_label >> write_tfrecord(dataset_type, flags.output_dir,
                                          metadata)
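How this function would typically be driven is not shown above. A minimal driver sketch, assuming the same flags object (with project_id and output_dir) and that the tf.Transform analyzers run inside a tft_beam.Context; the runner and option values here are placeholders, not part of the original:

import os

import apache_beam as beam
import tensorflow_transform.beam as tft_beam
from apache_beam.options.pipeline_options import PipelineOptions


def main(flags):
    # DirectRunner for local testing; a cloud run would use DataflowRunner.
    options = PipelineOptions(flags=[], runner='DirectRunner',
                              project=flags.project_id)
    with beam.Pipeline(options=options) as p:
        # tf.Transform analyzers need a temp directory for intermediate output.
        with tft_beam.Context(temp_dir=os.path.join(flags.output_dir, 'tmp')):
            build_pipeline(p, flags)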
Example no. 2
def preprocess(p, args):
    """Run preprocessing as pipeline."""
    train_eval_schema = _make_input_schema()

    train_eval_metadata = dataset_metadata.DatasetMetadata(
        schema=train_eval_schema)

    _ = (train_eval_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(os.path.join(
             args.output_dir, constants.RAW_METADATA_DIR),
                                                             pipeline=p))

    train_eval_data = (p | 'ReadDataFromBQ' >> beam.io.Read(
        beam.io.BigQuerySource(query=_get_query('bigquery-public-data',
                                                'samples', 'gsod'),
                               use_standard_sql=True)))

    train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo(
        DataValidator())

    (transformed_train_eval_data,
     transformed_train_eval_metadata), transform_fn = (
         (train_eval_data, train_eval_metadata)
         | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
             get_preprocessing_fn()))

    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

    transformed_train_eval_coder = coders.ExampleProtoCoder(
        transformed_train_eval_metadata.schema)

    transformed_train_data, transformed_eval_data = (
        transformed_train_eval_data
        | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2))

    (transformed_train_data
     | 'SerializeTrainExamples' >> beam.Map(
         transformed_train_eval_coder.encode)
     | 'WriteTraining' >> beam.io.WriteToTFRecord(
         os.path.join(args.output_dir,
                      constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
         file_name_suffix=constants.DATA_FILE_SUFFIX))

    (transformed_eval_data
     | 'SerializeEvalExamples' >> beam.Map(transformed_train_eval_coder.encode)
     | 'WriteEval' >> beam.io.WriteToTFRecord(
         os.path.join(args.output_dir,
                      constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
         file_name_suffix=constants.DATA_FILE_SUFFIX))
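get_partition_fn is defined elsewhere in this project. A minimal sketch of what it could look like, assuming a purely random 70/30 split (Beam calls the partition function with each element and the number of partitions):

import random


def get_partition_fn(train_fraction):
    """Returns a partition function sending ~train_fraction of elements to 0."""
    def partition_fn(element, num_partitions):
        # The split here is purely random; element contents are ignored.
        del element, num_partitions
        return 0 if random.random() < train_fraction else 1
    return partition_fn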
Example no. 3
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold, delimiter):
    """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
    delimiter: the column delimiter for the CSV format.
  """
    # 1) The schema can be either defined in-memory or read from a configuration
    #    file; in this case we create the schema in-memory.
    input_schema = criteo.make_input_schema()

    # 2) Configure the coder to map the source file column names to a dictionary
    #    of key -> tensor_proto with the appropriate type derived from the
    #    input_schema.
    coder = criteo.make_csv_coder(input_schema, delimiter)

    # 3) Read from text using the coder.
    train_data = (pipeline
                  | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
                  | 'ParseTrainingCsv' >> beam.Map(coder.decode))

    evaluate_data = (pipeline
                     | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
                     | 'ParseEvalCsv' >> beam.Map(coder.decode))

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
    (train_dataset, train_metadata), transform_fn = (
        (train_data, input_metadata)
        | 'AnalyzeAndTransform' >>
        tft.AnalyzeAndTransformDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
    # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    # TODO(b/34231369) Remember to eventually also save the statistics.

    (evaluate_dataset,
     evaluate_metadata) = (((evaluate_data, input_metadata), transform_fn)
                           | 'TransformEval' >> tft.TransformDataset())

    train_coder = coders.ExampleProtoCoder(train_metadata.schema)
    _ = (
        train_dataset
        | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
        | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteTraining' >>
        beam.io.WriteToTFRecord(os.path.join(
            output_dir, path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
                                file_name_suffix='.tfrecord.gz'))

    evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
    _ = (
        evaluate_dataset
        | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
        | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteEval' >>
        beam.io.WriteToTFRecord(os.path.join(
            output_dir, path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
                                file_name_suffix='.tfrecord.gz'))

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = criteo.make_input_schema(mode=predict_mode)
        csv_coder = criteo.make_csv_coder(predict_schema, mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
            | 'ParsePredictCsv' >> beam.Map(csv_coder.decode)
            # TODO(b/35194257) Obviate the need for this explicit serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
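_Shuffle is another helper not shown in the snippet; the pylint comment suggests it is a ptransform_fn. A common way to implement such a shuffle (a sketch under that assumption, not necessarily the author's version) is to key each record randomly, group, and drop the keys:

import random

import apache_beam as beam


@beam.ptransform_fn
def _Shuffle(pcoll):
    """Shuffles a PCollection by grouping records under random keys."""
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda value: (random.random(), value))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))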
Example no. 4
      movie_rating_history=args.movie_rating_history)

  movies_sideinput = beam.pvalue.AsDict(movies_data)
  eval_data |= 'BuildEvalFeatures' >> beam.ParDo(
      BuildExampleFn(args.random_seed),
      movies_data=movies_sideinput,
      rating_threshold=args.eval_score_threshold,
      is_ranking_problem=(args.eval_type == RANKING),
      is_train=False,
      num_ranking_candidate_movie_ids=args.num_ranking_candidate_movie_ids)

  # TFTransform based preprocessing.
  raw_metadata = dataset_metadata.DatasetMetadata(
      schema=movielens.make_examples_schema())
  _ = (raw_metadata
       | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, 'raw_metadata'), pipeline))

  preprocessing_fn = movielens.make_preprocessing_fn()
  train_features_transformed, transform_fn = (
      (train_data, raw_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  eval_features_transformed = (
      ((eval_data, raw_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  train_dataset_transformed, train_metadata = train_features_transformed
  training_coder = tft_coders.ExampleProtoCoder(train_metadata.schema)
  _ = (train_dataset_transformed
       | 'EncodeTraining' >> beam.Map(training_coder.encode)
Example no. 5
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """

  # 1) The schema can be either defined in-memory or read from a configuration
  #    file; in this case we create the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    serialized_examples = (pipeline
                           | 'ReadPredictData' >> _ReadData(
                               predict_data, mode=predict_mode)
                           # TODO(b/35194257) Obviate the need for this explicit
                           # serialization.
                           | 'EncodePredictData' >> beam.Map(
                               predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
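_encode_as_b64_json is used by Examples no. 3 and no. 5 but defined elsewhere. Since the text output is presumably meant for a prediction service that accepts base64-encoded serialized tf.Example protos, a plausible sketch (the exact JSON shape is an assumption) is:

import base64
import json


def _encode_as_b64_json(serialized_example):
    """Wraps a serialized tf.Example proto as {"b64": <base64 string>} JSON."""
    encoded = base64.b64encode(serialized_example)
    if isinstance(encoded, bytes):  # Python 3 returns bytes.
        encoded = encoded.decode('utf-8')
    return json.dumps({'b64': encoded})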
Example no. 6
def preprocess(in_test_mode):
  import datetime
  import os
  import os.path
  import tempfile
  import apache_beam as beam
  import tensorflow as tf
  from apache_beam.io import tfrecordio
  from tensorflow_transform.beam import impl as beam_impl
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = 'preprocess-taxi-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata
  raw_data_schema = {
    colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
    })
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'hourofday,passengers'.split(',')
    })
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  # run Beam  
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # save the raw data metadata
      _ = (raw_data_metadata
        | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
            os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
            pipeline=p))
      
      # analyze and transform training       
      raw_data = (p 
        | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(1, EVERY_N), use_standard_sql=True))
        | 'train_filter' >> beam.Filter(is_valid))

      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      
      # transform eval data
      raw_test_data = (p 
        | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(2, EVERY_N), use_standard_sql=True))
        | 'eval_filter' >> beam.Filter(is_valid))
      
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))
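A quick usage sketch for this notebook-style function, assuming BUCKET, PROJECT, create_query, is_valid and preprocess_tft are already defined in the surrounding notebook:

# Small local run with the DirectRunner (writes to ./preproc_tft):
preprocess(in_test_mode=True)

# Full Dataflow run (writes under gs://$BUCKET/taxifare/preproc_tft/):
# preprocess(in_test_mode=False)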
Example no. 7
def run(flags, pipeline_args):
    """Run Apache Beam pipeline to generate TFRecords for Survival Analysis"""
    options = PipelineOptions(flags=[], **pipeline_args)
    options.view_as(WorkerOptions).machine_type = flags.machine_type
    temp_dir = os.path.join(flags.output_dir, 'tmp')
    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

    files = tf.gfile.Glob(flags.input_dir + "*")
    if not flags.cloud:
        files = files[0:20]  # if running locally for testing, process fewer files

    logging.warning("Number of files: " + str(len(files)))
    labels = get_labels_array(
        "gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv"
    )

    with beam.Pipeline(runner, options=options) as p:
        with tft_beam.Context(temp_dir=temp_dir):

            input_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

            filenames = (p | 'Create filenames' >> beam.Create(files))
            nii = (filenames | 'Read NII' >> beam.Map(read_nii))
            nii_with_labels = (
                nii
                | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

            raw_train, raw_eval, raw_test = (
                nii_with_labels | 'RandomlySplitData' >> randomly_split(
                    train_size=.7, validation_size=.15, test_size=.15))

            raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(
                lambda x: x[1])
            raw_eval = (raw_eval
                        | 'FlattenEval' >> beam.FlatMap(lambda x: x[1]))
            raw_test = (raw_test
                        | 'FlattenTest' >> beam.FlatMap(lambda x: x[1]))

            raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

            # A single AnalyzeDataset pass over the training data produces the
            # transform_fn; each dataset is transformed with it in the loop below.
            transform_fn = (
                (raw_train, input_metadata)
                | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))
            _ = (transform_fn
                 | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
                     flags.output_dir))
            for dataset_type, dataset in [('Train', raw_train),
                                          ('Eval', raw_eval),
                                          ('Predict', raw_test)]:

                transform_label = 'Transform{}'.format(dataset_type)
                t, metadata = (
                    ((dataset, input_metadata), transform_fn)
                    | transform_label >> tft_beam.TransformDataset())
                if dataset_type == 'Train':
                    _ = (metadata
                         | 'WriteMetadata' >>
                         tft_beam_io.WriteMetadata(os.path.join(
                             flags.output_dir, 'transformed_metadata'),
                                                   pipeline=p))
                write_label = 'Write{}TFRecord'.format(dataset_type)
                _ = t | write_label >> WriteTFRecord(
                    dataset_type, flags.output_dir, metadata)
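WriteTFRecord (and write_tfrecord in Example no. 1) is another project-specific transform not shown here. Following the pattern that Examples no. 2, 3 and 5 spell out explicitly, a minimal sketch could encode with an ExampleProtoCoder built from the transformed metadata and write compressed TFRecords; the output file naming is an assumption:

import os

import apache_beam as beam
from tensorflow_transform import coders as tft_coders


@beam.ptransform_fn
def WriteTFRecord(pcoll, dataset_type, output_dir, metadata):
    """Encodes transformed examples and writes them as gzipped TFRecords."""
    coder = tft_coders.ExampleProtoCoder(metadata.schema)
    return (pcoll
            | 'Encode{}'.format(dataset_type) >> beam.Map(coder.encode)
            | 'Write{}'.format(dataset_type) >> beam.io.WriteToTFRecord(
                os.path.join(output_dir, dataset_type.lower()),
                file_name_suffix='.tfrecord.gz'))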