# Imports assumed by the snippets below (not shown in the excerpt); data_lib,
# models_lib, and array_utils are local modules from the same package, and
# _BATCH_SIZE is a module-level constant whose value the excerpt does not show.
from absl import logging
import six
import tensorflow as tf
from tensorflow import keras


def build_and_train_model(opts, data_config_train, data_config_eval,
                          output_dir, num_epochs, fake_training=False):
  """Compiles and fits a Keras Criteo model."""
  model = build_model(opts)

  logging.info('Compiling model...')
  model.compile(
      keras.optimizers.Adam(opts.learning_rate),
      loss=keras.losses.BinaryCrossentropy(from_logits=False),
      metrics=[keras.metrics.BinaryAccuracy(), keras.metrics.AUC()])

  logging.info('Building datasets...')
  bs = opts.batch_size
  # Under fake_training, run just two steps per epoch as a smoke test.
  steps_per_epoch = 2 if fake_training else data_lib.NUM_TRAIN_EXAMPLES // bs
  dataset_eval = data_lib.build_dataset(data_config_eval, batch_size=bs)
  dataset_train = data_lib.build_dataset(
      data_config_train, batch_size=bs, is_training=True)

  tensorboard_cb = keras.callbacks.TensorBoard(
      update_freq=int(1e5), log_dir=output_dir, write_graph=False)

  logging.info('Starting training...')
  model.fit(
      dataset_train,
      validation_data=dataset_eval,
      epochs=1 if fake_training else num_epochs,
      validation_steps=100,
      callbacks=[tensorboard_cb],
      steps_per_epoch=steps_per_epoch)
  return model
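# Usage sketch, not from the source: a minimal, hypothetical invocation of
# build_and_train_model on fake data. `ModelOptions` is an assumed stand-in
# for the real options object, which the code above only requires to expose
# `learning_rate` and `batch_size`; the output path and the 'test' split
# name are likewise assumptions.
import collections

ModelOptions = collections.namedtuple('ModelOptions',
                                      ['learning_rate', 'batch_size'])


def _example_train():
  opts = ModelOptions(learning_rate=1e-3, batch_size=1024)
  config_train = data_lib.DataConfig(split='train', fake_data=True)
  config_eval = data_lib.DataConfig(split='test', fake_data=True)
  # fake_training=True caps the run at two steps for a single epoch.
  return build_and_train_model(
      opts, config_train, config_eval,
      output_dir='/tmp/criteo_model',
      num_epochs=1,
      fake_training=True)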
def run(model_dir, dataset_name, predictions_per_example, max_examples,
        output_dir, fake_data=False):
  """Runs predictions on the given dataset using the specified model."""
  tf.io.gfile.makedirs(output_dir)
  data_config = data_lib.DataConfig.from_name(dataset_name, fake_data=fake_data)
  dataset = data_lib.build_dataset(data_config, batch_size=_BATCH_SIZE)
  if max_examples:
    dataset = dataset.take(max_examples // _BATCH_SIZE)

  model = models_lib.load_trained_model(model_dir)

  logging.info('Starting predictions.')
  predictions = models_lib.make_predictions(
      model, dataset, predictions_per_example)
  array_utils.write_npz(
      output_dir, 'predictions_%s.npz' % dataset_name, predictions)
  del predictions['probs_samples']
  array_utils.write_npz(
      output_dir, 'predictions_small_%s.npz' % dataset_name, predictions)
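# Usage sketch, not from the source: paths, dataset name, and the sampling
# count are illustrative assumptions. Note that dataset.take() above counts
# batches, so max_examples is rounded down to a whole number of batches. On
# completion, output_dir holds predictions_<name>.npz (including the large
# per-sample 'probs_samples' array) and predictions_small_<name>.npz (the
# same dictionary minus that array).
def _example_predict():
  run(model_dir='/tmp/criteo_model',
      dataset_name='test',
      predictions_per_example=32,
      max_examples=10000,
      output_dir='/tmp/criteo_predictions',
      fake_data=True)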
def test_build_dataset(self):
  config = data_lib.DataConfig(split='train', fake_data=True)
  dataset = data_lib.build_dataset(
      config, batch_size=8, is_training=False, fake_training=False)

  # Check output_shapes (via the compat helper, matching the types check
  # below; DatasetV2 objects have no output_shapes attribute).
  features_shapes, label_shape = tf.compat.v1.data.get_output_shapes(dataset)
  self.assertEqual([None], label_shape.as_list())
  expected_keys = [
      data_lib.feature_name(i)
      for i in range(1, data_lib.NUM_TOTAL_FEATURES + 1)
  ]
  self.assertSameElements(expected_keys, list(features_shapes.keys()))
  for key, shape in six.iteritems(features_shapes):
    self.assertEqual([None], shape.as_list(), 'Unexpected shape at key=' + key)

  # Check output_types.
  features_types, label_type = tf.compat.v1.data.get_output_types(dataset)
  self.assertEqual(tf.float32, label_type)
  for idx in data_lib.INT_FEATURE_INDICES:
    self.assertEqual(tf.float32, features_types[data_lib.feature_name(idx)])
  for idx in data_lib.CAT_FEATURE_INDICES:
    self.assertEqual(tf.string, features_types[data_lib.feature_name(idx)])
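# Assumed harness, not shown in the excerpt: in the original test file,
# test_build_dataset presumably sits inside a tf.test.TestCase subclass,
# which supplies assertEqual and (via absl's googletest base) the
# assertSameElements used above. The class name here is hypothetical; the
# assignment binds the module-level function above as a test method.
class DataLibTest(tf.test.TestCase):
  test_build_dataset = test_build_dataset


if __name__ == '__main__':
  tf.test.main()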