Exemplo n.º 1
0
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir):
  """Wire iris CSV reading, ml.Preprocess and output writers into a pipeline.

  Args:
    pipeline: beam pipeline that the read, preprocess and save steps are
      applied to.
    training_data: path to the training csv input; decoded with target
      columns.
    eval_data: path to the eval csv input; decoded with the same coder
      instance as the training input.
    predict_data: path to the prediction csv input; decoded with
      has_target_columns=False.
    output_dir: directory that receives the metadata file and the
      features_train / features_eval / features_predict outputs.

  Returns:
    The tuple (metadata, train_features, eval_features, predict_features)
    obtained by applying ml.Preprocess to the three inputs.
  """
  features = iris.IrisFeatures()

  # Train and eval share one coder; prediction rows carry no target columns.
  labeled_coder = io.CsvCoder.from_feature_set(features, features.csv_columns)
  unlabeled_coder = io.CsvCoder.from_feature_set(
      features, features.csv_columns, has_target_columns=False)

  def _read(label, path, coder):
    # Attach one labelled text-read step to the pipeline.
    return pipeline | label >> beam.io.textio.ReadFromText(path, coder=coder)

  train_rows = _read('ReadTrainingData', training_data, labeled_coder)
  eval_rows = _read('ReadEvalData', eval_data, labeled_coder)
  predict_rows = _read('ReadPredictData', predict_data, unlabeled_coder)

  # TODO(b/32726166) Update input_format and format_metadata to read these
  # values directly from the coder.
  preprocess_step = ml.Preprocess(
      features,
      input_format='csv',
      format_metadata={'headers': features.csv_columns})
  metadata, train_features, eval_features, predict_features = (
      (train_rows, eval_rows, predict_rows) | 'Preprocess' >> preprocess_step)

  # Outputs: the metadata file (named by METADATA_FILE_NAME) plus
  # features_train, features_eval and features_predict.
  metadata_path = os.path.join(output_dir, METADATA_FILE_NAME)
  # pylint: disable=expression-not-assigned
  metadata | 'SaveMetadata' >> io.SaveMetadata(metadata_path)

  # NOTE(review): the original comment says sharding of the feature files is
  # turned off because the dataset is very small, but no sharding argument is
  # passed here -- presumably that is the SaveFeatures default; confirm.
  feature_outputs = (
      ('SaveTrain', train_features, 'features_train'),
      ('SaveEval', eval_features, 'features_eval'),
      ('SavePredict', predict_features, 'features_predict'),
  )
  for label, pcoll, basename in feature_outputs:
    pcoll | label >> io.SaveFeatures(os.path.join(output_dir, basename))
  # pylint: enable=expression-not-assigned

  return metadata, train_features, eval_features, predict_features
Exemplo n.º 2
0
def preprocess(pipeline, training_data, eval_data, output_dir,
               frequency_threshold, metadata_file_name):
  """Attach the criteo pre-processing steps to a pipeline.

  Reads tab-delimited training and eval inputs, applies ml.Preprocess to
  both, and adds writers for the metadata and the two feature sets. Nothing
  is returned.

  Args:
    pipeline: beam pipeline the steps are applied to.
    training_data: file paths to the training input.
    eval_data: file paths to the eval input.
    output_dir: directory under which all output files are written.
    frequency_threshold: frequency threshold forwarded to
      criteo.criteo_features for categorical values.
    metadata_file_name: file name of the metadata output, one of
      metadata.{json|yaml}.
  """
  delimiter = '\t'
  features, columns = criteo.criteo_features(
      frequency_threshold=frequency_threshold)

  # A single coder instance decodes both inputs.
  tsv_coder = io.CsvCoder.from_feature_set(
      features, columns, delimiter=delimiter)

  def _read(label, path):
    # Attach one labelled text-read step to the pipeline.
    return pipeline | label >> beam.io.ReadFromText(
        path, strip_trailing_newlines=True, coder=tsv_coder)

  train_rows = _read('ReadTrainingData', training_data)
  eval_rows = _read('ReadEvalData', eval_data)

  # TODO(b/32726166) Update input_format and format_metadata to read these
  # values directly from the coder.
  preprocess_step = ml.Preprocess(
      features,
      input_format='csv',
      format_metadata={'headers': columns, 'delimiter': delimiter})
  metadata, train_features, evaluate_features = (
      (train_rows, eval_rows) | 'Preprocess' >> preprocess_step)

  # Outputs: the metadata file, features_train and features_eval.
  # pylint: disable=expression-not-assigned
  metadata | 'SaveMetadata' >> io.SaveMetadata(
      os.path.join(output_dir, metadata_file_name))

  feature_outputs = (
      ('WriteTraining', train_features, 'features_train'),
      ('WriteEval', evaluate_features, 'features_eval'),
  )
  for label, pcoll, basename in feature_outputs:
    pcoll | label >> io.SaveFeatures(os.path.join(output_dir, basename))
  # pylint: enable=expression-not-assigned
Exemplo n.º 3
0
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               metadata_file_name):
    """Attach the iris read / preprocess / save steps to a pipeline.

    Args:
        pipeline: beam pipeline the steps are applied to.
        training_data: path of the training csv input.
        eval_data: path of the eval csv input.
        predict_data: path of the prediction csv input; it is decoded with
            has_target_columns=False.
        output_dir: directory under which every output is written.
        metadata_file_name: file name used for the saved metadata.

    Returns:
        The tuple (metadata, train_features, eval_features,
        predict_features) obtained by applying ml.Preprocess.
    """
    features = iris.IrisFeatures()

    def _csv_reader(path, **coder_options):
        # Each input gets its own coder instance, built just before its
        # read transform.
        coder = io.CsvCoder.from_feature_set(
            features, features.csv_columns, **coder_options)
        return beam.io.ReadFromText(
            path, strip_trailing_newlines=True, coder=coder)

    # All three read transforms are built before any is applied.
    train_reader = _csv_reader(training_data)
    eval_reader = _csv_reader(eval_data)
    predict_reader = _csv_reader(predict_data, has_target_columns=False)

    train_rows = pipeline | 'ReadTrainingData' >> train_reader
    eval_rows = pipeline | 'ReadEvalData' >> eval_reader
    predict_rows = pipeline | 'ReadPredictData' >> predict_reader

    # TODO(b/32726166) Update input_format and format_metadata to read these
    # values directly from the coder.
    preprocess_step = ml.Preprocess(
        features,
        input_format='csv',
        format_metadata={'headers': features.csv_columns})
    metadata, train_features, eval_features, predict_features = (
        (train_rows, eval_rows, predict_rows)
        | 'Preprocess' >> preprocess_step)

    # pylint: disable=expression-not-assigned
    metadata | 'SaveMetadata' >> io.SaveMetadata(
        os.path.join(output_dir, metadata_file_name))

    # NOTE(review): the original comment says sharding of these feature
    # files is turned off because the dataset is very small, but no sharding
    # argument is passed here -- confirm against io.SaveFeatures.
    feature_outputs = (
        ('SaveTrain', train_features, 'features_train'),
        ('SaveEval', eval_features, 'features_eval'),
        ('SavePredict', predict_features, 'features_predict'),
    )
    for label, pcoll, basename in feature_outputs:
        pcoll | label >> io.SaveFeatures(os.path.join(output_dir, basename))
    # pylint: enable=expression-not-assigned

    return metadata, train_features, eval_features, predict_features
Exemplo n.º 4
0
def preprocess(pipeline):
    """Attach the iris read / preprocess / save steps to a pipeline.

    Input paths and the output directory are taken from the module-level
    ``args`` object (training_data, eval_data, predict_data, output_dir).

    Args:
        pipeline: beam pipeline the steps are applied to.

    Returns:
        The tuple (metadata, train_features, eval_features,
        predict_features) obtained by applying ml.Preprocess.
    """
    features = iris.IrisFeatures()

    def _csv_source(path, **coder_options):
        # Each input gets its own coder instance, built just before its
        # source.
        coder = io.CsvCoder.from_feature_set(
            features, features.csv_columns, **coder_options)
        return beam.io.TextFileSource(
            path, strip_trailing_newlines=True, coder=coder)

    # NOTE(review): TextFileSource and the labelled beam.Read(label, source)
    # form look like an older SDK API -- confirm the SDK version this targets.
    train_source = _csv_source(args.training_data)
    eval_source = _csv_source(args.eval_data)
    # Prediction input is decoded without target columns.
    predict_source = _csv_source(args.predict_data, has_target_columns=False)

    train_rows = pipeline | beam.Read('ReadTrainingData', train_source)
    eval_rows = pipeline | beam.Read('ReadEvalData', eval_source)
    predict_rows = pipeline | beam.Read('ReadPredictData', predict_source)

    preprocess_step = ml.Preprocess(
        features,
        input_format='csv',
        format_metadata={'headers': features.csv_columns})
    metadata, train_features, eval_features, predict_features = (
        (train_rows, eval_rows, predict_rows)
        | 'Preprocess' >> preprocess_step)

    # Outputs: metadata.yaml, features_train, features_eval and
    # features_predict.
    # pylint: disable=expression-not-assigned
    metadata | 'SaveMetadata' >> io.SaveMetadata(
        os.path.join(args.output_dir, 'metadata.yaml'))

    # NOTE(review): the original comment says sharding of these feature
    # files is turned off because the dataset is very small, but no sharding
    # argument is passed here -- confirm against io.SaveFeatures.
    feature_outputs = (
        ('SaveTrain', train_features, 'features_train'),
        ('SaveEval', eval_features, 'features_eval'),
        ('SavePredict', predict_features, 'features_predict'),
    )
    for label, pcoll, basename in feature_outputs:
        pcoll | label >> io.SaveFeatures(
            os.path.join(args.output_dir, basename))
    # pylint: enable=expression-not-assigned

    return metadata, train_features, eval_features, predict_features