def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
    """Fit a KerasEstimator straight from prepared Parquet data.

    The estimator is built and fitted once per reader pool type
    ('process' and 'thread'); each time the fitted transformer must emit
    exactly one prediction row per input row.

    Args:
        mock_fit_fn: Mock whose ``return_value`` is replaced with
            ``get_mock_fit_fn()``. Presumably injected by a ``mock.patch``
            decorator outside this view -- confirm against the decorators.
        mock_pin_gpu_fn: Mock whose ``return_value`` is replaced with a
            plain ``mock.Mock()``; presumably injected the same way.
    """
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_direct_parquet_train') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            # Override the path getters so that any call, with or without
            # an argument, returns the store's own train/val paths.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                # `learning_rate` is the supported keyword; `lr` is a
                # deprecated alias that newer Keras releases warn about
                # or reject.
                optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
                loss = 'binary_crossentropy'

                for reader_pool_type in ['process', 'thread']:
                    est = hvd.KerasEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        feature_cols=['features'],
                        label_cols=['y'],
                        batch_size=1,
                        epochs=3,
                        reader_pool_type=reader_pool_type,
                        verbose=2)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
def test_torch_direct_parquet_train(self):
    """Train a TorchEstimator directly on prepared Parquet data.

    The estimator is fitted with ``inmemory_cache_all`` off and then on.
    In both cases the loss is supplied through ``setLoss`` as a single
    (non-list) object, and the fitted transformer must return as many
    rows as the input DataFrame holds.
    """
    with spark_session('test_torch_direct_parquet_train') as spark:
        xor_df = create_xor_data(spark)

        callback_backend = CallbackBackend()
        with local_store() as store:
            # Route both data-path getters to the store's own paths,
            # whatever argument they are called with.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            prepared = util.prepare_data(
                callback_backend.num_processes(),
                store,
                xor_df,
                feature_columns=['features'],
                label_columns=['y'])
            with prepared:
                net = create_xor_model()
                sgd = torch.optim.SGD(net.parameters(), lr=0.1)
                bce = nn.BCELoss()

                for cache_everything in (False, True):
                    estimator = hvd_spark.TorchEstimator(
                        backend=callback_backend,
                        store=store,
                        model=net,
                        optimizer=sgd,
                        input_shapes=[[2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        batch_size=1,
                        epochs=3,
                        verbose=2,
                        inmemory_cache_all=cache_everything)

                    # setLoss has to accept a bare loss, not only a list.
                    estimator.setLoss(bce)

                    fitted = estimator.fit_on_parquet()
                    scored = fitted.transform(xor_df)
                    assert scored.count() == xor_df.count()
def test_direct_parquet_train(self):
    """Train a TorchEstimator from prepared Parquet data across settings.

    For each validation setting (``None`` and ``'val'``) the data is
    prepared once. The estimator is then fitted for every combination of
    ``inmemory_cache_all`` (False/True) and ``reader_pool_type``
    ('process'/'thread'). Each fitted transformer must return one row per
    input row.
    """
    with spark_session('test_direct_parquet_train') as spark:
        noisy_df = create_noisy_xor_data_with_val(spark)

        callback_backend = CallbackBackend()
        with local_store() as store:
            # Route both data-path getters to the store's own paths,
            # whatever argument they are called with.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            # Exercise the data loaders both without and with validation.
            for val_setting in (None, 'val'):
                prepared = util.prepare_data(
                    callback_backend.num_processes(),
                    store,
                    noisy_df,
                    feature_columns=['features'],
                    label_columns=['y'],
                    validation=val_setting)
                with prepared:
                    xor_model = create_xor_model()

                    # Same iteration order as nesting the pool-type loop
                    # inside the cache loop.
                    combos = [(cache_all, pool_type)
                              for cache_all in (False, True)
                              for pool_type in ('process', 'thread')]

                    for cache_all, pool_type in combos:
                        estimator = hvd_spark.TorchEstimator(
                            backend=callback_backend,
                            store=store,
                            model=xor_model,
                            input_shapes=[[-1, 2]],
                            feature_cols=['features'],
                            label_cols=['y'],
                            validation=val_setting,
                            batch_size=1,
                            epochs=3,
                            verbose=2,
                            inmemory_cache_all=cache_all,
                            reader_pool_type=pool_type)

                        fitted = estimator.fit_on_parquet()
                        scored = fitted.transform(noisy_df)
                        assert scored.count() == noisy_df.count()
def test_restore_from_checkpoint(self, mock_fit_fn, mock_pin_gpu_fn):
    """Check that a second Keras fit with the same run id loads the checkpoint.

    The first ``fit`` starts with no checkpoint in the store and must not
    call ``_load_model_from_checkpoint``. After it, the checkpoint path
    must exist, and a second ``fit`` on the same estimator must call the
    loader.

    Args:
        mock_fit_fn: Mock whose ``return_value`` is replaced with
            ``get_mock_fit_fn()``. Presumably injected by a ``mock.patch``
            decorator outside this view -- confirm against the decorators.
        mock_pin_gpu_fn: Mock whose ``return_value`` is replaced with a
            plain ``mock.Mock()``; presumably injected the same way.
    """
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    model = create_xor_model()
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases warn about or reject.
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    loss = 'binary_crossentropy'

    with spark_session('test_restore_from_checkpoint') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()

        run_id = 'run01'
        with local_store() as store:
            keras_estimator = hvd.KerasEstimator(
                backend=backend,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2,
                run_id=run_id)

            # Wrap the real loader in a Mock so calls are recorded while
            # the original behaviour still runs via side_effect.
            keras_estimator._load_model_from_checkpoint = mock.Mock(
                side_effect=keras_estimator._load_model_from_checkpoint)

            ckpt_path = store.get_checkpoint_path(run_id)
            assert not store.exists(ckpt_path)
            keras_estimator._load_model_from_checkpoint.assert_not_called()
            keras_model = keras_estimator.fit(df)

            trained_model = keras_model.getModel()
            pred = trained_model.predict([np.ones([1, 2], dtype=np.float64)])
            assert len(pred) == 1

            # The first fit must have left a checkpoint behind ...
            assert store.exists(ckpt_path)

            # ... which the second fit is expected to load.
            keras_estimator.fit(df)
            keras_estimator._load_model_from_checkpoint.assert_called()
def test_legacy_restore_from_checkpoint(self):
    """Check that a second fit on the legacy model reads the checkpoint.

    Currently skipped unconditionally. Assuming ``self`` is a
    ``unittest.TestCase``, ``skipTest`` raises, so nothing below the call
    runs until the skip is removed. The body is kept so the test can be
    re-enabled later.

    When enabled: the first ``fit`` starts with no checkpoint in the store
    and must not call ``_read_checkpoint``. After it, the checkpoint path
    must exist, and a second ``fit`` must call ``_read_checkpoint``.
    """
    # The adjacent literals are concatenated implicitly, so the first one
    # needs a trailing space; otherwise the message reads
    # "checkpointcall back".
    self.skipTest(
        'There is a bug in current lightning version for checkpoint '
        'call back. Will add this test back when it is solved.')

    model = create_legacy_xor_model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss = nn.BCELoss()

    with spark_session('test_restore_from_checkpoint') as spark:
        df = create_noisy_xor_data(spark)

        ctx = CallbackBackend()

        run_id = 'run01'
        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                backend=ctx,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                run_id=run_id)

            # Wrap the real reader in a Mock so calls are recorded while
            # the original behaviour still runs via side_effect.
            torch_estimator._read_checkpoint = mock.Mock(
                side_effect=torch_estimator._read_checkpoint)

            ckpt_path = store.get_checkpoint_path(run_id)
            assert not store.exists(ckpt_path)
            torch_estimator._read_checkpoint.assert_not_called()
            torch_estimator.fit(df)

            assert store.exists(ckpt_path)
            torch_estimator.fit(df)
            torch_estimator._read_checkpoint.assert_called()
def test_model_serialization(self, mock_remote_trainer):
    """Save a KerasEstimator, load it back, and fit the loaded copy.

    The estimator is created without backend or store, saved to a
    temporary path and reloaded with ``KerasEstimator.load``. Backend and
    store are then supplied through ``fit(..., params=...)``. The trained
    model must yield a single float32 prediction for a 1x2 float32 input.

    Args:
        mock_remote_trainer: Mock whose ``return_value`` is replaced with a
            stub trainer that returns the serialized model unchanged.
            Presumably injected by a ``mock.patch`` decorator outside this
            view -- confirm against the decorators.
    """
    model = create_xor_model()
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases warn about or reject.
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    loss = 'binary_crossentropy'

    def train(serialized_model, train_rows, val_rows, avg_row_size):
        # Stub trainer: no real training, just return the model it was
        # given inside the 3-tuple the mocked trainer is expected to yield.
        return None, serialized_model, 2
    mock_remote_trainer.return_value = train

    with spark_session('test_model_serialization') as spark:
        df = create_xor_data(spark)

        keras_estimator = hvd.KerasEstimator(
            model=model,
            optimizer=optimizer,
            loss=loss,
            feature_cols=['features'],
            label_cols=['y'],
            batch_size=1,
            epochs=3,
            verbose=2)

        backend = CallbackBackend()
        with local_store() as store:
            with temppath() as saved_path:
                keras_estimator.save(saved_path)
                keras_estimator_loaded = hvd.KerasEstimator.load(saved_path)

            # Backend and store were not set on the saved estimator, so
            # they are passed as per-fit parameter overrides.
            keras_model = keras_estimator_loaded.fit(df, params={
                keras_estimator_loaded.backend: backend,
                keras_estimator_loaded.store: store
            })

            trained_model = keras_model.getModel()
            pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
            assert len(pred) == 1
            assert pred.dtype == np.float32
def test_restore_from_checkpoint(self):
    """Check that a second Torch fit with the same run id loads the checkpoint.

    The first ``fit`` starts with no checkpoint in the store and must not
    call ``_load_checkpoint``. After it, the checkpoint path must exist,
    and a second ``fit`` on the same estimator must call
    ``_load_checkpoint``.
    """
    net = create_xor_model()
    sgd = torch.optim.SGD(net.parameters(), lr=0.1)
    bce = nn.BCELoss()

    with spark_session('test_restore_from_checkpoint') as spark:
        xor_df = create_xor_data(spark)
        callback_backend = CallbackBackend()
        run_id = 'run01'

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                backend=callback_backend,
                store=store,
                model=net,
                optimizer=sgd,
                loss=bce,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=1,
                verbose=2,
                run_id=run_id)

            # Spy on the loader: the Mock records calls and forwards them
            # to the original bound method.
            real_loader = estimator._load_checkpoint
            estimator._load_checkpoint = mock.Mock(side_effect=real_loader)

            checkpoint_location = store.get_checkpoint_path(run_id)

            # First run: nothing to restore from yet.
            assert not store.exists(checkpoint_location)
            estimator._load_checkpoint.assert_not_called()
            estimator.fit(xor_df)

            # Second run: the checkpoint exists and has to be loaded.
            assert store.exists(checkpoint_location)
            estimator.fit(xor_df)
            estimator._load_checkpoint.assert_called()
def test_restore_from_checkpoint(self):
    """Check that a second fit with the same run id reads the checkpoint.

    Currently skipped unconditionally via ``skipTest``. The body is kept
    so the test can be re-enabled later.

    When enabled: the first ``fit`` starts with no checkpoint in the store
    and must not call ``_read_checkpoint``. After it, the checkpoint path
    must exist, and a second ``fit`` must call ``_read_checkpoint``.
    """
    skip_reason = ('There is a deadlock bug for checkpoint call back. '
                   'Will add this test back when it is solved.')
    self.skipTest(skip_reason)

    xor_model = create_xor_model()

    with spark_session('test_restore_from_checkpoint') as spark:
        noisy_df = create_noisy_xor_data(spark)
        callback_backend = CallbackBackend()
        run_id = 'run01'

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                backend=callback_backend,
                store=store,
                model=xor_model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                run_id=run_id)

            # Spy on the reader: the Mock records calls and forwards them
            # to the original bound method.
            real_reader = estimator._read_checkpoint
            estimator._read_checkpoint = Mock(side_effect=real_reader)

            checkpoint_location = store.get_checkpoint_path(run_id)

            # First run: nothing to restore from yet.
            assert not store.exists(checkpoint_location)
            estimator._read_checkpoint.assert_not_called()
            estimator.fit(noisy_df)

            # Second run: the checkpoint exists and has to be read.
            assert store.exists(checkpoint_location)
            estimator.fit(noisy_df)
            estimator._read_checkpoint.assert_called()
def test_keras_model_checkpoint_callback(self, mock_fit_fn, mock_pin_gpu_fn):
    """Check which checkpoint callback reaches the (mocked) fit function.

    Two scenarios are run through a fake ``fit`` that inspects the
    ``callbacks`` it receives:

    * No ``checkpoint_callback`` given to the estimator: the callbacks must
      include a ``tf.keras.callbacks.ModelCheckpoint`` that monitors
      ``'val_loss'`` with ``save_best_only`` disabled.
    * A ``BestModelCheckpoint`` given: the callbacks must include a
      ``BestModelCheckpoint`` with a non-None ``filepath``,
      ``save_best_only`` enabled and ``'binary_crossentropy'`` as monitor.

    In both scenarios the fitted transformer must return one prediction
    row per input row.

    Args:
        mock_fit_fn: Mock whose ``return_value`` is set to the fake ``fit``
            for each scenario. Presumably injected by a ``mock.patch``
            decorator outside this view -- confirm against the decorators.
        mock_pin_gpu_fn: Mock whose ``return_value`` is replaced with a
            plain ``mock.Mock()``; presumably injected the same way.
    """
    from horovod.tensorflow.keras.callbacks import BestModelCheckpoint

    def _get_mock_fit_fn(checkpoint_callback_provided):
        """Build a fake fit function that asserts on the callbacks it gets."""
        def fit(model, train_data, val_data, steps_per_epoch, validation_steps, callbacks, verbose):
            returned_model_checkpoint_present = False
            model_checkpoint_present = False
            for callback in callbacks:
                callback.set_model(model)
                # Every callback gets the same epoch-end event in both
                # scenarios, so no branching is needed here.
                callback.on_epoch_end(0, logs={'binary_crossentropy': 0.3})

                if checkpoint_callback_provided and isinstance(callback, BestModelCheckpoint):
                    self.assertIsNotNone(callback.filepath)
                    self.assertTrue(callback.save_best_only)
                    self.assertEqual(callback.monitor, 'binary_crossentropy')
                    returned_model_checkpoint_present = True

                if not checkpoint_callback_provided and isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                    self.assertFalse(callback.save_best_only)
                    self.assertEqual(callback.monitor, 'val_loss')
                    model_checkpoint_present = True

            # Exactly the checkpoint flavour matching the scenario must
            # have been seen among the callbacks.
            if checkpoint_callback_provided:
                self.assertTrue(returned_model_checkpoint_present)
                self.assertFalse(model_checkpoint_present)
            else:
                self.assertFalse(returned_model_checkpoint_present)
                self.assertTrue(model_checkpoint_present)

            return mock.Mock()

        return fit

    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_model_chekcpoint_callbacks') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            # Override the path getters so that any call, with or without
            # an argument, returns the store's own train/val paths.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                # `learning_rate` is the supported keyword; `lr` is a
                # deprecated alias that newer Keras releases warn about
                # or reject.
                optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
                loss = 'binary_crossentropy'

                # Test when the checkpoint callback is not set, the correct one is created
                mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=False)
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()

                # Test if checkpoint call back is correctly set to the model
                mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=True)
                checkpoint_callback = BestModelCheckpoint(monitor='binary_crossentropy')
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2,
                    checkpoint_callback=checkpoint_callback)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
def test_direct_parquet_train_with_no_val_column(self):
    """Train a TorchEstimator on Parquet files the test writes itself.

    Two noisy-XOR DataFrames are created, converted with the petastorm
    helper when they contain a vector column, and written to the store's
    train and val paths. Validation is then switched on through
    ``setValidation`` with a plain string. The fitted transformer must
    return one row per training row.

    Skipped when ``skip_lightning_tests`` is truthy.
    """
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')

    with spark_session(
            'test_direct_parquet_train_with_no_val_column') as spark:
        train_df = create_noisy_xor_data(spark)
        val_df = create_noisy_xor_data(spark)

        def to_petastorm(df):
            # Only frames with a vector column are converted; all others
            # are returned untouched.
            if not util._has_vector_column(df):
                return df
            metadata = None
            row_converter = util.to_petastorm_fn(['features', 'y'], metadata)
            return df.rdd.map(row_converter).toDF()

        train_df = to_petastorm(train_df)
        val_df = to_petastorm(val_df)

        # Debug output: one sample row and the row count of each frame.
        train_df.show(1)
        print(train_df.count())
        val_df.show(1)
        print(val_df.count())

        callback_backend = CallbackBackend()
        with local_store() as store:
            # Route both data-path getters to the store's own paths,
            # whatever argument they are called with.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            print(store.get_train_data_path())
            print(store.get_val_data_path())

            # Write the train frame first, then the val frame, each to
            # its own store path.
            outputs = ((train_df, store.get_train_data_path),
                       (val_df, store.get_val_data_path))
            for frame, path_getter in outputs:
                writer = frame.coalesce(4).write.mode('overwrite')
                writer.parquet(path_getter())

            xor_model = create_xor_model()

            estimator = hvd_spark.TorchEstimator(
                backend=callback_backend,
                store=store,
                model=xor_model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=64,
                epochs=2,
                verbose=2,
                inmemory_cache_all=True,
                reader_pool_type='process')

            # Per the original note, any arbitrary string works as the
            # validation value here.
            estimator.setValidation("True")

            fitted = estimator.fit_on_parquet()
            scored = fitted.transform(train_df)
            assert scored.count() == train_df.count()