Example #1
def preprocess(pipeline, args):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline.
    args: parsed command line arguments.
  """
  from preproc import movielens  # pylint: disable=g-import-not-at-top

  # 1) Read the data into pcollections.
  movies_coder = tft_coders.CsvCoder(movielens.MOVIE_COLUMNS,
                                     movielens.make_movies_schema(),
                                     secondary_delimiter='|',
                                     multivalent_columns=['genres'])
  movies_data = (pipeline
                 | 'ReadMoviesData' >> beam.io.ReadFromText(
                     os.path.join(args.input_dir, 'movies.csv'),
                     # TODO(b/35653662): Obviate the need for setting this.
                     coder=beam.coders.BytesCoder(),
                     skip_header_lines=args.skip_header_lines)
                 | 'DecodeMovies' >> beam.Map(movies_coder.decode)
                 | 'KeyByMovie' >> beam.Map(lambda x: (x['movie_id'], x)))
  ratings_coder = tft_coders.CsvCoder(movielens.RATING_COLUMNS,
                                      movielens.make_ratings_schema())
  ratings_data = (pipeline
                  | 'ReadRatingsData' >> beam.io.ReadFromText(
                      os.path.join(args.input_dir, 'ratings*'),
                      skip_header_lines=args.skip_header_lines)
                  | 'DecodeRatings' >> beam.Map(ratings_coder.decode)
                  | 'KeyByUser' >> beam.Map(lambda x: (x['user_id'], x))
                  | 'GroupByUser' >> beam.GroupByKey())
  def train_eval_partition_fn(keyed_ratings, unused_num_partitions):
    # Tuple unpacking in function signatures is not valid Python 3;
    # unpack the (user_id, grouped_ratings) pair explicitly.
    user_id, _ = keyed_ratings
    return movielens.partition_fn(
        user_id, args.partition_random_seed, args.percent_eval)
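  # The snippet is cut off here; a hedged sketch of how such a partition
  # function is typically applied (the step label and output names are assumed):
  # beam.Partition calls train_eval_partition_fn(element, num_partitions) and
  # uses the returned index (0 or 1) to pick the output PCollection.
  train_data, eval_data = (
      ratings_data
      | 'TrainEvalPartition' >> beam.Partition(train_eval_partition_fn, 2))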
Example #2
def make_csv_coder(schema_file, mode):
    """Creates instance of CsvCoder.

  Args:
    schema_file: Serialized Schema proto file.
    mode: One of tf.estimator.ModeKeys.{TRAIN, EVAL, PREDICT}.

  Returns:
    Instance of CsvCoder.
  """
    schema = make_dataset_schema(schema_file, mode)
    if mode == tf.estimator.ModeKeys.PREDICT:
        features = list(features_config.ALL_FEATURES)
        features.remove(features_config.TARGET_FEATURE)
        return tft_coders.CsvCoder(features, schema)
    return tft_coders.CsvCoder(features_config.ALL_FEATURES, schema)
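For orientation, here is a self-contained hedged sketch of what a CsvCoder does, using a made-up two-column schema rather than this project's features_config:

import tensorflow as tf
from tensorflow_transform import coders as tft_coders
from tensorflow_transform.tf_metadata import dataset_schema

# Toy schema and column list, purely illustrative.
toy_schema = dataset_schema.from_feature_spec({
    'age': tf.FixedLenFeature([], tf.int64),
    'city': tf.FixedLenFeature([], tf.string),
})
toy_coder = tft_coders.CsvCoder(['age', 'city'], toy_schema)
decoded = toy_coder.decode('42,Berlin')  # -> dict keyed by column name
encoded = toy_coder.encode(decoded)      # -> roughly '42,Berlin' again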
Example #3
def make_csv_coder(schema):
    """Return a coder for tf.transform to read csv files."""
    raw_feature_spec = get_raw_feature_spec(schema)
    parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    return tft_coders.CsvCoder(_CSV_COLUMNS_NAMES,
                               parsing_schema,
                               delimiter='|')
Example #4
def make_csv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN):
    """Produces a CsvCoder from a data schema.
  Args:
    schema: A tf.Transform `Schema` object.
    mode: tf.contrib.learn.ModeKeys specifying if the source is being used for
      train/eval or prediction.
  Returns:
    A tf.Transform CsvCoder.
  """
    # Note: the original snippet is truncated; `column_names` must be
    # initialized (e.g. with the mode-dependent label column) before extending.
    column_names = []
    column_names += ['score', 'subreddit', 'example_id']
    return coders.CsvCoder(column_names, schema)
Example #5
def make_csv_coder():
    """Return a coder for tf.transform to read csv files."""
    column_names = [
        'pickup_community_area', 'fare', 'trip_start_month', 'trip_start_hour',
        'trip_start_day', 'trip_start_timestamp', 'pickup_latitude',
        'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
        'trip_miles', 'pickup_census_tract', 'dropoff_census_tract',
        'payment_type', 'company', 'dropoff_community_area', 'tips',
        'trip_seconds'
    ]
    parsing_feature_spec = get_raw_feature_spec()
    parsing_schema = dataset_schema.from_feature_spec(parsing_feature_spec)
    return tft_coders.CsvCoder(column_names, parsing_schema)
Example #6
    def build_graph(self):
        # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for graph building.
        # num_lines = 0
        # for i in range(DATASET_NUM_SHARDS):
        #     _fname = '{}-{:05}-of-{:05}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
        #     num_lines += sum(1 for _ in open(_fname))
        #     _fname_marked = '{}-{:05}-of-{:05}.{}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS,
        #                                                   PPGRAPH_EXT)
        #     shutil.move(_fname, _fname_marked)
        #     if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
        #         break

        # Set up the preprocessing pipeline for analyzing the dataset. The analyze call is kept separate from the
        # transform call because the transform will be parallelized later; running both in this single-core
        # process tends to cause OOM issues.
        pipeline = beam.Pipeline(runner=DirectRunner())

        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # TODO: consider using only the train data (or a percentage of it) to build the graph.
            raw_train_data = (
                pipeline
                | 'ReadTrainDataFile' >> textio.ReadFromText(
                    'data/features*shard*', skip_header_lines=0)
                | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                    tft_coders.CsvCoder(
                        self.data_formatter.get_ordered_columns(),
                        self.data_formatter.get_raw_data_metadata().schema).
                    decode))

            # Combine data and schema into a dataset tuple. Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data. This is where analyzers such as vocabulary, scale_to_0_1 or
            # sparse_to_dense are applied.
            transform_fn = (
                (raw_train_data, self.data_formatter.get_raw_data_metadata())
                | beam_impl.AnalyzeDataset(
                    PreprocessingFunction().transform_to_tfrecord))

            # Write SavedModel and metadata to two subdirectories of working_dir, given by
            # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
            _ = (transform_fn
                 | 'WriteTransformGraph' >>
                 transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

        # Run the Beam preprocessing pipeline.
        st = time.time()
        result = pipeline.run()
        result.wait_until_finish()
        self.logger.info(
            'Transformation graph built and written in {:.2f} sec'.format(
                time.time() - st))
Example #7
def run_analysis(output_dir, model_dir, eval_path, schema, project, mode,
                 slice_columns):
    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        tmp_location = os.path.join(output_dir, 'tmp')
        options = {
            'job_name':
            'pipeline-tfma-' +
            datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
            'setup_file':
            './analysis/setup.py',
            'project':
            project,
            'temp_location':
            tmp_location,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    column_names = [x['name'] for x in schema]
    for slice_column in slice_columns:
        if slice_column not in column_names:
            raise ValueError("Unknown slice column: %s" % slice_column)

    slice_spec = [
        slicer.SingleSliceSpec(
        ),  # An empty spec is required for the 'Overall' slice
        slicer.SingleSliceSpec(columns=slice_columns)
    ]

    with beam.Pipeline(runner=runner, options=pipeline_options) as pipeline:
        raw_feature_spec = get_raw_feature_spec(schema)
        raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        example_coder = tft_coders.example_proto_coder.ExampleProtoCoder(
            raw_schema)
        csv_coder = tft_coders.CsvCoder(column_names, raw_schema)

        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(eval_path)
            | 'ParseCSV' >> beam.Map(csv_coder.decode)
            | 'CleanData' >> beam.Map(clean_raw_data_dict(raw_feature_spec))
            | 'ToSerializedTFExample' >> beam.Map(example_coder.encode)
            | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
                eval_saved_model_path=model_dir,
                slice_spec=slice_spec,
                output_path=output_dir))
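    # After the pipeline completes, the written analysis can be loaded back for
    # inspection. Hedged follow-up sketch, not part of the original function:
    eval_result = tfma.load_eval_result(output_path=output_dir)
    for slice_key, metrics in eval_result.slicing_metrics:
        print(slice_key, metrics)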
Example #8
  def _make_transform_fn(self, p, output_path):
    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}
    schema = dataset_schema.from_feature_spec(
        {'x': tf.FixedLenFeature((), tf.float32, 0)})
    metadata = dataset_metadata.DatasetMetadata(schema=schema)
    columns = p | 'CreateTrainingData' >> beam.Create([{
        'x': v
    } for v in [4, 1, 5, 2]])
    _, result = (
        (columns, metadata)
        | 'AnalyzeAndTransform'
        >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn, output_path))
    coder = coders.CsvCoder(['x'], schema, delimiter='\t')
    return result, coder
Example #9
def make_csv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN):
    """Produces a CsvCoder from a data schema.
  Args:
    schema: A tf.Transform `Schema` object.
    mode: tf.contrib.learn.ModeKeys specifying if the source is being used for
      train/eval or prediction.
  Returns:
    A tf.Transform CsvCoder.
  """
    column_names = [] if mode == tf.contrib.learn.ModeKeys.INFER else ['score']
    column_names += [
        'created_utc', 'subreddit', 'author', 'comment_body',
        'comment_parent_body', 'toplevel', 'example_id'
    ]
    return coders.CsvCoder(column_names, schema)
Example #10
def make_csv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Produces a CsvCoder (with tab as the delimiter) from a data schema.

  Args:
    schema: A tf.Transform `Schema` object.
    mode: tf.contrib.learn.ModeKeys specifying if the source is being used for
      train/eval or prediction.

  Returns:
    A tf.Transform CsvCoder.
  """

  column_names = list(CSV_ORDERED_COLUMNS)  # copy so the shared list is not mutated
  if mode == tf.contrib.learn.ModeKeys.INFER:
    column_names.remove(LABEL_COLUMN)

  return coders.CsvCoder(column_names, schema, delimiter=',')
Example #11
def make_tsv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Produces a CsvCoder (with tab as the delimiter) from a data schema.

  Args:
    schema: A tf.Transform `Schema` object.
    mode: tf.contrib.learn.ModeKeys specifying if the source is being used for
      train/eval or prediction.

  Returns:
    A tf.Transform CsvCoder.
  """
  column_names = [] if mode == tf.contrib.learn.ModeKeys.INFER else ['clicked']
  for name in INTEGER_COLUMN_NAMES:
    column_names.append(name)
  for name in CATEGORICAL_COLUMN_NAMES:
    column_names.append(name)

  return coders.CsvCoder(column_names, schema, delimiter='\t')
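A hedged sketch of wiring this coder into a Beam read step; `pipeline`, `schema` and `train_file_pattern` are assumed to exist elsewhere:

# Sketch (assumed names): read tab-separated training data and decode it.
tsv_coder = make_tsv_coder(schema, mode=tf.contrib.learn.ModeKeys.TRAIN)
raw_data = (
    pipeline
    | 'ReadTrainTSV' >> beam.io.ReadFromText(train_file_pattern)
    | 'DecodeTrainTSV' >> beam.Map(tsv_coder.decode))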
Example #12
def preprocess(pipeline, args):
  input_metadata = metadata_io.read_metadata(
      os.path.join(args.analyze_output_dir, RAW_METADATA_DIR))

  schema = json.loads(file_io.read_file_to_string(
      os.path.join(args.analyze_output_dir, SCHEMA_FILE)).decode())
  features = json.loads(file_io.read_file_to_string(
      os.path.join(args.analyze_output_dir, FEATURES_FILE)).decode())

  column_names = [col['name'] for col in schema]

  exclude_outputs = None
  if not args.target:
    for name, transform in six.iteritems(features):
      if transform['transform'] == TARGET_TRANSFORM:
        target_name = name
        column_names.remove(target_name)
        exclude_outputs = [target_name]
        del input_metadata.schema.column_schemas[target_name]
        break

  if args.csv_file_pattern:
    coder = coders.CsvCoder(column_names, input_metadata.schema, delimiter=',')
    raw_data = (
        pipeline
        | 'ReadCsvData' >> beam.io.ReadFromText(args.csv_file_pattern)
        | 'ParseCsvData' >> beam.Map(coder.decode))
  else:
    columns = ', '.join(column_names)
    query = 'SELECT {columns} FROM `{table}`'.format(columns=columns,
                                                     table=args.bigquery_table)
    raw_data = (
        pipeline
        | 'ReadBigQueryData'
        >> beam.io.Read(beam.io.BigQuerySource(query=query,
                                               use_standard_sql=True)))

  # Note that prepare_image_transforms does not make embeddings; it just reads
  # the image files and converts them to base64 strings. tft.TransformDataset()
  # will apply the saved model that makes the image embeddings.
  image_columns = image_transform_columns(features)
  raw_data = (
      raw_data
      | 'PreprocessTransferredLearningTransformations'
      >> beam.Map(prepare_image_transforms, image_columns))

  if args.shuffle:
    raw_data = raw_data | 'ShuffleData' >> shuffle()

  transform_fn = (
      pipeline
      | 'ReadTransformFn'
      >> tft_beam_io.ReadTransformFn(args.analyze_output_dir))

  (transformed_data, transform_metadata) = (
      ((raw_data, input_metadata), transform_fn)
      | 'ApplyTensorflowPreprocessingGraph' 
      >> tft.TransformDataset(exclude_outputs))

  tfexample_coder = coders.ExampleProtoCoder(transform_metadata.schema)
  _ = (transformed_data
       | 'SerializeExamples' >> beam.Map(tfexample_coder.encode)
       | 'WriteExamples'
       >> beam.io.WriteToTFRecord(
           os.path.join(args.output_dir, args.output_filename_prefix),
           file_name_suffix='.tfrecord.gz'))
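A hedged sketch of consuming the gzipped TFRecords that this pipeline writes; the feature spec and file pattern are assumed, not taken from this project:

import tensorflow as tf

# Sketch: read the .tfrecord.gz files back for training with tf.data.
def make_input_fn(file_pattern, feature_spec, batch_size=64):
  def input_fn():
    filenames = tf.gfile.Glob(file_pattern)  # e.g. the output_dir/prefix*.tfrecord.gz files
    dataset = tf.data.TFRecordDataset(filenames, compression_type='GZIP')
    dataset = dataset.batch(batch_size)
    # feature_spec would typically come from transform_metadata.schema.as_feature_spec().
    return dataset.map(lambda records: tf.parse_example(records, feature_spec))
  return input_fn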
Example #13
import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
from apache_beam.io import tfrecordio
from tensorflow_transform import coders
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata

from trainer.config import PROJECT_ID, BUCKET, TRAIN_INPUT_DATA, TRAIN_OUTPUT_DATA, TFRECORD_DIR, MODEL_DIR, \
    INPUT_SCHEMA, OUTPUT_SCHEMA, EXAMPLE_SCHEMA

delimiter = ';'
converter_input = coders.CsvCoder([
    'BatchId', 'ButterMass', 'ButterTemperature', 'SugarMass', 'SugarHumidity',
    'FlourMass', 'FlourHumidity', 'HeatingTime', 'MixingSpeed', 'MixingTime'
],
                                  INPUT_SCHEMA,
                                  delimiter=delimiter)
converter_output = coders.CsvCoder([
    'BatchId', 'TotalVolume', 'Density', 'Temperature', 'Humidity', 'Energy',
    'Problems'
],
                                   OUTPUT_SCHEMA,
                                   delimiter=delimiter)
input_metadata = dataset_metadata.DatasetMetadata(schema=EXAMPLE_SCHEMA)


def extract_batchkey(record):
    """Extracts the BatchId out of the record
        Args:
            record (dict): record of decoded CSV line
Example #14
def _make_csv_coder(schema, column_names):
    """Return a coder for tf.transform to read csv files."""
    raw_feature_spec = _get_raw_feature_spec(schema)
    parsing_schema = schema_utils.schema_from_feature_spec(raw_feature_spec)
    return tft_coders.CsvCoder(column_names, parsing_schema)
Example #15
def make_csv_coder(schema):
    """Return a coder for tf.transform to read csv files."""
    raw_feature_spec = get_raw_feature_spec(schema)
    parsing_schema = schema_utils.schema_from_feature_spec(raw_feature_spec)
    return tft_coders.CsvCoder(CSV_COLUMN_NAMES, parsing_schema)
Example #16
def write_to_tfrecord(args):
    """
    This function is supposed to be called as a script.
    """
    # Decode arguments
    current_index, num_shards, train_split_fname_out, eval_split_fname_out, \
    exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord, working_dir, data_formatter_module_path = args

    # num_shards = "32"
    current_index, num_shards = int(current_index), int(num_shards)

    split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
        train_split_fname_out, current_index, num_shards) + '*'
    split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
        eval_split_fname_out, current_index, num_shards)

    log.info('exp_log_data_file_train_tfrecord {}'.format(
        exp_log_data_file_train_tfrecord))
    log.info('exp_log_data_file_eval_tfrecord {}'.format(
        exp_log_data_file_eval_tfrecord))
    log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
    log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

    data_formatter = import_from_uri(
        data_formatter_module_path).DataFormatter()

    # Set up the preprocessing pipeline.
    pipeline = beam.Pipeline(runner=DirectRunner())

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # Read the raw data files: CSV rows ordered according to the `data_formatter`, which are then converted
        # into a cleaned-up format.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                split_train_file_pattern, skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        raw_eval_data = (
            pipeline
            | 'ReadEvalDataFile' >> textio.ReadFromText(
                split_eval_file_pattern, skip_header_lines=0)
            | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        # Examples in tf-example format (for model analysis purposes).
        # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
        # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
        #
        # _ = (
        #         raw_eval_data
        #         | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        #         | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
        #     '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
        #     shard_name_template='', num_shards=1)
        # )

        # Write SavedModel and metadata to two subdirectories of working_dir, given by
        # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
        transform_fn = (pipeline
                        | 'ReadTransformGraph' >>
                        transform_fn_io.ReadTransformFn(working_dir))

        # Applies the transformation `transform_fn` to the raw train dataset
        (transformed_train_data, transformed_metadata) = (
            ((raw_train_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformTrainData' >> beam_impl.TransformDataset())

        # Applies the transformation `transform_fn` to the raw eval dataset
        (transformed_eval_data, transformed_metadata) = (
            ((raw_eval_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformEvalData' >> beam_impl.TransformDataset())

        # The data schema of the transformed data gets used to build a signature to create
        # a TFRecord (tf binary data format). This signature is a wrapper function used to
        # encode transformed data.
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (transformed_train_data
             | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteTrainDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_train_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

        _ = (transformed_eval_data
             | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteEvalDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_eval_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

    result = pipeline.run()
    result.wait_until_finish()