def _build_keras_model(tf_transform_output, hidden_units, learning_rate):
  """Creates a DNN Keras model for classifying taxi data.

  Args:
    tf_transform_output: A TFTransformOutput.
    hidden_units: [int], the layer sizes of the DNN (input layer first).
    learning_rate: float, learning rate for the optimizer.

  Returns:
    A keras Model.
  """
  numeric_columns = [
      tf.feature_column.numeric_column(
          key=features.transformed_name(key), shape=())
      for key in features.NUMERIC_FEATURE_KEYS
  ]
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key=features.transformed_name(key),
          num_buckets=tf_transform_output.num_buckets_for_transformed_feature(
              features.transformed_name(key)),
          default_value=0)
      for key in features.CATEGORICAL_FEATURE_KEYS
  ]
  indicator_columns = [
      tf.feature_column.indicator_column(categorical_column)
      for categorical_column in categorical_columns
  ]

  model = _wide_and_deep_classifier(
      # TODO(b/139668410) replace with premade wide_and_deep keras model
      wide_columns=indicator_columns,
      deep_columns=numeric_columns,
      dnn_hidden_units=hidden_units,
      learning_rate=learning_rate)
  return model
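# `_wide_and_deep_classifier` is referenced above but not shown in these
# snippets. A minimal sketch of what it might look like, assuming the input
# layers can be derived from the feature-column keys as in the Keras models
# further below (a hedged reconstruction, not the source's exact helper):
def _wide_and_deep_classifier(wide_columns, deep_columns, dnn_hidden_units,
                              learning_rate):
  # One Input per feature column, keyed by the transformed feature name.
  input_layers = {
      column.key: tf.keras.layers.Input(name=column.key, shape=(),
                                        dtype=tf.float32)
      for column in deep_columns
  }
  input_layers.update({
      column.categorical_column.key: tf.keras.layers.Input(
          name=column.categorical_column.key, shape=(), dtype=tf.int32)
      for column in wide_columns
  })

  # Deep tower over the numeric features, wide tower over the indicators.
  deep = tf.keras.layers.DenseFeatures(deep_columns)(input_layers)
  for numnodes in dnn_hidden_units:
    deep = tf.keras.layers.Dense(numnodes)(deep)
  wide = tf.keras.layers.DenseFeatures(wide_columns)(input_layers)

  # Binary head, matching the tips > 20% label built in preprocessing_fn.
  output = tf.keras.layers.Dense(1, activation='sigmoid')(
      tf.keras.layers.concatenate([deep, wide]))

  model = tf.keras.Model(input_layers, output)
  model.compile(
      loss='binary_crossentropy',
      optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
      metrics=[tf.keras.metrics.BinaryAccuracy()])
  return model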
def preprocessing_fn(inputs): """Preprocesses input columns into transformed columns. Preprocesses Covertype Dataset features using Tensorflow Transform library. Args: inputs(dict): A `dict` of `string` to `Tensor` or `SparseTensor`, where key is a Features key in Example proto, value is a Tensor containing the Feature proto's value. Returns: outputs(dict): A `dict` of `string` to `Tensor` or `SparseTensor`, where key is a new set of Feature keys, and values are possibly transformed `Tensor` or `SparseTensor`. """ outputs = {} # Scale numerical features for key in features.NUMERIC_FEATURE_KEYS: # TODO: your code here to scale numeric features with z-score with Tensorflow Transform. outputs[features.transformed_name(key)] = # Generate vocabularies and maps categorical features for key in features.CATEGORICAL_FEATURE_KEYS: # TODO: your code here to integerize categorical features and generate vocabulary file with Tensorflow Transform. outputs[features.transformed_name(key)] = # Convert Cover_Type to dense tensor outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing( inputs[features.LABEL_KEY]) return outputs
def _build_keras_model(hparams: kerastuner.HyperParameters,
                       tf_transform_output: tft.TFTransformOutput
                      ) -> tf.keras.Model:
  """Creates a Keras WideDeep Classifier model.

  Args:
    hparams: Holds HyperParameters for tuning.
    tf_transform_output: A TFTransformOutput.

  Returns:
    A keras Model.
  """
  # Defines deep feature columns and input layers.
  deep_columns = [
      tf.feature_column.numeric_column(
          key=features.transformed_name(key), shape=())
      for key in features.NUMERIC_FEATURE_KEYS
  ]
  input_layers = {
      column.key: tf.keras.layers.Input(name=column.key, shape=(),
                                        dtype=tf.float32)
      for column in deep_columns
  }

  # Defines wide feature columns and input layers.
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key=features.transformed_name(key),
          num_buckets=tf_transform_output.num_buckets_for_transformed_feature(
              features.transformed_name(key)),
          default_value=0)
      for key in features.CATEGORICAL_FEATURE_KEYS
  ]
  wide_columns = [
      tf.feature_column.indicator_column(categorical_column)
      for categorical_column in categorical_columns
  ]
  input_layers.update({
      column.categorical_column.key: tf.keras.layers.Input(
          name=column.categorical_column.key, shape=(), dtype=tf.int32)
      for column in wide_columns
  })

  # Build the Keras model using hparams.
  deep = tf.keras.layers.DenseFeatures(deep_columns)(input_layers)
  for n in range(int(hparams.get('n_layers'))):
    deep = tf.keras.layers.Dense(
        units=hparams.get('n_units_' + str(n + 1)))(deep)
  wide = tf.keras.layers.DenseFeatures(wide_columns)(input_layers)
  output = tf.keras.layers.Dense(features.NUM_CLASSES, activation='softmax')(
      tf.keras.layers.concatenate([deep, wide]))

  model = tf.keras.Model(input_layers, output)
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=tf.keras.optimizers.Adam(
          learning_rate=hparams.get('learning_rate')),
      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
  model.summary(print_fn=absl.logging.info)

  return model
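# A sketch of how the hyperparameter space consumed above might be declared
# with Keras Tuner. The names match the hparams.get(...) calls in the model;
# the ranges and defaults are illustrative assumptions, not values from the
# source:
def _get_hyperparameters() -> kerastuner.HyperParameters:
  """Returns the search space for _build_keras_model."""
  hp = kerastuner.HyperParameters()
  # Learning rate searched over a small grid.
  hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4], default=1e-3)
  # Number of hidden layers in the deep tower, with a width per layer.
  hp.Int('n_layers', 1, 2, default=1)
  with hp.conditional_scope('n_layers', 1):
    hp.Int('n_units_1', min_value=8, max_value=128, step=8, default=8)
  with hp.conditional_scope('n_layers', 2):
    hp.Int('n_units_1', min_value=8, max_value=128, step=8, default=8)
    hp.Int('n_units_2', min_value=8, max_value=128, step=8, default=8)
  return hp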
def _input_fn(file_pattern: List[Text],
              data_accessor: DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    batch_size: The number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
    dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  dataset = data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size,
          label_key=features.transformed_name(features.LABEL_KEY)),
      tf_transform_output.transformed_metadata.schema)

  return dataset
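# A sketch of how this _input_fn is typically wired up inside a TFX Trainer
# run_fn. The fn_args attributes follow TFX's FnArgs; the batch size and the
# fallback to _get_hyperparameters (sketched earlier) are assumptions:
def run_fn(fn_args):
  """Trains the model and writes a serving SavedModel."""
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  train_dataset = _input_fn(fn_args.train_files, fn_args.data_accessor,
                            tf_transform_output, batch_size=64)
  eval_dataset = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                           tf_transform_output, batch_size=64)

  # Use tuned hyperparameters when the Tuner component supplied them.
  if fn_args.hyperparameters:
    hparams = kerastuner.HyperParameters.from_config(fn_args.hyperparameters)
  else:
    hparams = _get_hyperparameters()

  model = _build_keras_model(hparams, tf_transform_output)
  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps)
  model.save(fn_args.serving_model_dir, save_format='tf')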
def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of gzipped TFRecord files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of Tensors,
    and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = tf.compat.v1.data.make_one_shot_iterator(
      dataset).get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      features.transformed_name(features.LABEL_KEY))
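# `_gzip_reader_fn` is referenced above but not shown. A minimal sketch,
# assuming the materialized Transform outputs are gzipped TFRecord files (the
# canonical TFX examples define it exactly this way):
def _gzip_reader_fn(filenames):
  """Small utility returning a record reader that can read gzipped files."""
  return tf.data.TFRecordDataset(filenames, compression_type='GZIP')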
def preprocessing_fn(inputs): """tf.transform's callback function for preprocessing inputs. Args: inputs: map from feature keys to raw not-yet-transformed features. Returns: Map from string feature key to transformed feature operations. """ outputs = {} for key in features.DENSE_FLOAT_FEATURE_KEYS: # Preserve this feature as a dense float, setting nan's to the mean. outputs[features.transformed_name(key)] = tft.scale_to_z_score( _fill_in_missing(inputs[key])) for key in features.VOCAB_FEATURE_KEYS: # Build a vocabulary for this feature. outputs[features.transformed_name( key)] = tft.compute_and_apply_vocabulary( _fill_in_missing(inputs[key]), top_k=features.VOCAB_SIZE, num_oov_buckets=features.OOV_SIZE) for key, num_buckets in zip(features.BUCKET_FEATURE_KEYS, features.BUCKET_FEATURE_BUCKET_COUNT): outputs[features.transformed_name(key)] = tft.bucketize( _fill_in_missing(inputs[key]), num_buckets, always_return_num_quantiles=False) for key in features.CATEGORICAL_FEATURE_KEYS: outputs[features.transformed_name(key)] = _fill_in_missing(inputs[key]) # Was this passenger a big tipper? fare_key = 'fare' taxi_fare = _fill_in_missing(inputs[fare_key]) tips = _fill_in_missing(inputs[features.LABEL_KEY]) outputs[features.transformed_name( features.LABEL_KEY)] = tf.compat.v1.where( tf.math.is_nan(taxi_fare), tf.cast(tf.zeros_like(taxi_fare), tf.int64), # Test if the tip was > 20% of the fare. tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64)) return outputs
def serve_tf_examples_fn(serialized_tf_examples):
  """Returns the output to be used in the serving signature."""
  feature_spec = tf_transform_output.raw_feature_spec()
  feature_spec.pop(features.LABEL_KEY)
  parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)

  transformed_features = model.tft_layer(parsed_features)
  transformed_features.pop(features.transformed_name(features.LABEL_KEY))

  return model(transformed_features)
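# `serve_tf_examples_fn` closes over `model` and `tf_transform_output`, so in
# the canonical TFX Keras modules it is produced by a factory, wrapped in
# @tf.function, and attached to the SavedModel as the serving signature. A
# sketch of that wiring (the factory name and `serving_model_dir` are
# assumptions):
def _get_serve_tf_examples_fn(model, tf_transform_output):
  # Attach the transform graph to the model so it is exported with it.
  model.tft_layer = tf_transform_output.transform_features_layer()

  @tf.function
  def serve_tf_examples_fn(serialized_tf_examples):
    # Body exactly as in the snippet above.
    ...

  return serve_tf_examples_fn

signatures = {
    'serving_default':
        _get_serve_tf_examples_fn(model, tf_transform_output)
        .get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string, name='examples')),
}
model.save(serving_model_dir, save_format='tf', signatures=signatures)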
def preprocessing_fn(inputs): """Preprocesses Covertype Dataset.""" outputs = {} # Scale numerical features for key in features.NUMERIC_FEATURE_KEYS: outputs[features.transformed_name(key)] = tft.scale_to_z_score( _fill_in_missing(inputs[key])) # Generate vocabularies and maps categorical features for key in features.CATEGORICAL_FEATURE_KEYS: outputs[features.transformed_name( key)] = tft.compute_and_apply_vocabulary(x=_fill_in_missing( inputs[key]), num_oov_buckets=1, vocab_filename=key) # Convert Cover_Type to dense tensor outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing( inputs[features.LABEL_KEY]) return outputs
def _eval_input_receiver_fn(tf_transform_output, schema):
  """Builds everything needed for tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - TensorFlow graph that parses raw untransformed features and applies
        the tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  raw_features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(
      raw_features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  raw_features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=raw_features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[features.transformed_name(
          features.LABEL_KEY)])
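# A sketch of how this receiver is typically exported for TFMA in the
# Estimator-based workflow. tfma.export.export_eval_savedmodel is a real TFMA
# API; the `estimator` and `eval_model_dir` names are illustrative
# assumptions:
tfma.export.export_eval_savedmodel(
    estimator=estimator,
    export_dir_base=eval_model_dir,
    eval_input_receiver_fn=lambda: _eval_input_receiver_fn(
        tf_transform_output, schema))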
def _input_fn(file_pattern, tf_transform_output, batch_size=200):
  """Generates features and label for tuning/training.

  Args:
    file_pattern: input tfrecord file pattern.
    tf_transform_output: A TFTransformOutput.
    batch_size: The number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
    dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=features.transformed_name(features.LABEL_KEY))

  return dataset
def preprocessing_fn(inputs): """Preprocesses Titanic Dataset.""" outputs = {} # Scale numerical features for key in features.NUMERIC_FEATURE_KEYS: mean_value = compute_mean_ignore_nan(inputs[key].values) absl.logging.info(f'TFT preprocessing. Mean value for {key} = {mean_value}') outputs[features.transformed_name(key)] = tft.scale_to_z_score( _fill_in_missing_with_impute(inputs[key], mean_value)) for key in features.VOCAB_FEATURE_KEYS: # Build a vocabulary for this feature. outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary( _fill_in_missing(inputs[key]), top_k=features.VOCAB_SIZE_MAP.get(key, features.VOCAB_SIZE), num_oov_buckets=features.OOV_SIZE) for key in features.BUCKET_FEATURE_KEYS: if key in features.FEATURE_BUCKET_BOUNDARIES: bucket_boundaries = tf.constant(features.FEATURE_BUCKET_BOUNDARIES.get(key)) # tf.print("bucket_boundaries:", bucket_boundaries, output_stream=absl.logging.info) outputs[features.transformed_name(key)] = tft.apply_buckets(_fill_in_missing(inputs[key]), bucket_boundaries) else: outputs[features.transformed_name(key)] = tft.bucketize( _fill_in_missing(inputs[key]), features.FEATURE_BUCKET_COUNT_MAP.get(key, features.FEATURE_BUCKET_COUNT)) # Generate vocabularies and maps categorical features for key in features.CATEGORICAL_FEATURE_KEYS: outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary( x=_fill_in_missing(inputs[key]), num_oov_buckets=1, vocab_filename=key) # Convert Cover_Type to dense tensor outputs[features.transformed_name(features.LABEL_KEY)] = _fill_in_missing( inputs[features.LABEL_KEY]) return outputs
def _build_keras_model(
    hparams: kerastuner.HyperParameters,
    tf_transform_output: tft.TFTransformOutput) -> tf.keras.Model:
  """Creates a Keras WideDeep Classifier model.

  Args:
    hparams: Holds HyperParameters for tuning.
    tf_transform_output: A TFTransformOutput.

  Returns:
    A keras Model.
  """
  real_keys = features.NUMERIC_FEATURE_KEYS
  sparse_keys = (
      features.VOCAB_FEATURE_KEYS + features.BUCKET_FEATURE_KEYS +
      features.CATEGORICAL_FEATURE_KEYS)

  # Defines deep feature columns and input layers.
  deep_columns = [
      tf.feature_column.numeric_column(
          key=features.transformed_name(key), shape=())
      for key in features.NUMERIC_FEATURE_KEYS
  ]
  input_layers = {
      column.key: tf.keras.layers.Input(name=column.key, shape=(),
                                        dtype=tf.float32)
      for column in deep_columns
  }

  # Defines wide feature columns and input layers.
  categorical_columns = [
      tf.feature_column.categorical_column_with_identity(
          key=features.transformed_name(key),
          num_buckets=tf_transform_output.num_buckets_for_transformed_feature(
              features.transformed_name(key)),
          default_value=0)
      for key in features.CATEGORICAL_FEATURE_KEYS
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(  # pylint: disable=g-complex-comprehension
          key,
          num_buckets=features.VOCAB_SIZE + features.OOV_SIZE,
          default_value=0)
      for key in features.transformed_names(features.VOCAB_FEATURE_KEYS)
  ]
  categorical_columns += [
      tf.feature_column.categorical_column_with_identity(  # pylint: disable=g-complex-comprehension
          key, num_buckets=num_buckets, default_value=0)
      for key, num_buckets in zip(
          features.transformed_names(features.BUCKET_FEATURE_KEYS),
          features.BUCKET_FEATURE_BUCKET_COUNT)
  ]
  wide_columns = [
      tf.feature_column.indicator_column(categorical_column)
      for categorical_column in categorical_columns
  ]
  input_layers.update({
      column.categorical_column.key: tf.keras.layers.Input(
          name=column.categorical_column.key, shape=(), dtype=tf.int32)
      for column in wide_columns
  })

  # Build the Keras model using hparams.
  deep = tf.keras.layers.DenseFeatures(deep_columns)(input_layers)
  for n in range(int(hparams.get('n_layers'))):
    deep = tf.keras.layers.Dense(
        units=hparams.get('n_units_' + str(n + 1)))(deep)
  wide = tf.keras.layers.DenseFeatures(wide_columns)(input_layers)

  # Binary classification head: a single sigmoid unit instead of the
  # multi-class softmax used in the Covertype variant above.
  output = tf.keras.layers.Dense(1, activation='sigmoid')(
      tf.keras.layers.concatenate([deep, wide]))
  output = tf.squeeze(output, -1)

  model = tf.keras.Model(input_layers, output)
  model.compile(
      loss='binary_crossentropy',
      optimizer=tf.keras.optimizers.Adam(
          learning_rate=hparams.get('learning_rate')),
      metrics=[
          tf.keras.metrics.TruePositives(name='tp'),
          tf.keras.metrics.FalsePositives(name='fp'),
          tf.keras.metrics.TrueNegatives(name='tn'),
          tf.keras.metrics.FalseNegatives(name='fn'),
          tf.keras.metrics.BinaryAccuracy(name='binary_accuracy'),
          tf.keras.metrics.Precision(name='precision'),
          tf.keras.metrics.Recall(name='recall'),
          tf.keras.metrics.AUC(name='auc'),
      ])
  model.summary(print_fn=absl.logging.info)

  return model