def test_transform_multi_class(self):
    # Set output_dim to 2 to mock a multi-class model.
    model = create_xor_model(output_dim=2)

    with spark_session('test_transform_multi_class') as spark:
        df = create_xor_data(spark)
        metadata = util._get_metadata(df)

        torch_model = hvd_spark.TorchModel(history=None,
                                           model=model,
                                           input_shapes=[[2]],
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           _metadata=metadata)
        out_df = torch_model.transform(df)

        # In a multi-class model, the model output is a vector but the label is a scalar.
        expected_types = {
            'x1': IntegerType,
            'x2': IntegerType,
            'features': VectorUDT,
            'weight': FloatType,
            'y': FloatType,
            'y__output': VectorUDT
        }

        for field in out_df.schema.fields:
            assert type(field.dataType) == expected_types[field.name]
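# The tests in this file rely on shared XOR fixtures (create_xor_model, create_xor_data)
# defined in the test utilities. A minimal sketch of what those helpers are assumed to
# provide is shown below; names ending in _sketch are hypothetical illustrations, not the
# project's actual implementation.
def create_xor_data_sketch(spark):
    # Four XOR rows with raw inputs x1/x2, a sample weight, the label 'y',
    # and an assembled 'features' vector column.
    from pyspark.ml.feature import VectorAssembler
    rows = [(0, 0, 1.0, 0.0), (0, 1, 1.0, 1.0), (1, 0, 1.0, 1.0), (1, 1, 1.0, 0.0)]
    df = spark.createDataFrame(rows, ['x1', 'x2', 'weight', 'y'])
    return VectorAssembler(inputCols=['x1', 'x2'], outputCol='features').transform(df)

def create_xor_model_sketch(output_dim=1):
    # Tiny feed-forward network mapping two inputs to `output_dim` sigmoid outputs.
    import torch.nn as nn
    return nn.Sequential(nn.Linear(2, 8), nn.Tanh(), nn.Linear(8, output_dim), nn.Sigmoid())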
def test_fit_model(self):
    model = create_xor_model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss = F.binary_cross_entropy

    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)

        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                random_seed=1,
                verbose=2,
                sample_weight_col='weight')

            torch_model = torch_estimator.fit(df)

            trained_model = torch_model.getModel()
            pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
            assert len(pred) == 1
            assert pred.dtype == torch.float32
def test_torch_direct_parquet_train(self):
    with spark_session('test_torch_direct_parquet_train') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                loss = nn.BCELoss()

                est = hvd_spark.TorchEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                # Make sure that setLoss works with a non-list loss.
                est.setLoss(loss)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
def test_fit_model(self):
    model = create_xor_model()
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    loss = 'binary_crossentropy'

    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)

        with local_store() as store:
            keras_estimator = hvd.KerasEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2)

            keras_model = keras_estimator.fit(df)

            trained_model = keras_model.getModel()
            pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
            assert len(pred) == 1
            assert pred.dtype == np.float32
def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_direct_parquet_train') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = tf.keras.optimizers.SGD(lr=0.1)
                loss = 'binary_crossentropy'

                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
def test_transform_multi_class(self):
    model = create_xor_model(output_dim=2)

    with spark_session('test_transform_multi_class') as spark:
        df = create_xor_data(spark)
        metadata = util._get_metadata(df)

        torch_model = hvd_spark.TorchModel(history=None,
                                           model=model,
                                           input_shapes=[[2]],
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           _metadata=metadata)
        out_df = torch_model.transform(df)

        expected_types = {
            'x1': LongType,
            'x2': LongType,
            'features': VectorUDT,
            'weight': DoubleType,
            'y': DoubleType,
            'y__output': VectorUDT
        }

        for field in out_df.schema.fields:
            assert type(field.dataType) == expected_types[field.name]
def test_fit_model(self):
    if sys.version_info < (3, 0, 0) and is_gloo_used():
        self.skipTest('Horovod on Spark over Gloo only supported on Python3')

    model = create_xor_model()
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    loss = 'binary_crossentropy'

    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)

        with local_store() as store:
            keras_estimator = hvd.KerasEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2)

            keras_model = keras_estimator.fit(df)

            trained_model = keras_model.getModel()
            pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
            assert len(pred) == 1
            assert pred.dtype == np.float32
def test_fit_model(self):
    if sys.version_info < (3, 0, 0) and is_gloo_used():
        self.skipTest('Horovod on Spark over Gloo only supported on Python3')

    model = create_xor_model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss = F.binary_cross_entropy

    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)

        with local_store() as store:
            torch_estimator = hvd.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2,
                sample_weight_col='weight')

            torch_model = torch_estimator.fit(df)

            trained_model = torch_model.getModel()
            pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
            assert len(pred) == 1
            assert pred.dtype == torch.float32
def test_prepare_data(self):
    with spark_session('test_prepare_data') as spark:
        df = create_xor_data(spark)

        train_rows = df.count()
        schema_cols = ['features', 'y']
        metadata = util._get_metadata(df)
        assert metadata['features']['intermediate_format'] == constants.ARRAY

        to_petastorm = util.to_petastorm_fn(schema_cols, metadata)
        modified_df = df.rdd.map(to_petastorm).toDF()
        data = modified_df.collect()

        prepare_data = remote._prepare_data_fn(metadata)
        features = torch.tensor([data[i].features for i in range(train_rows)])
        features_prepared = prepare_data('features', features)
        assert np.array_equal(features_prepared, features)
def test_restore_from_checkpoint(self, mock_fit_fn, mock_pin_gpu_fn):
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    model = create_xor_model()
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    loss = 'binary_crossentropy'

    with spark_session('test_restore_from_checkpoint') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        run_id = 'run01'

        with local_store() as store:
            keras_estimator = hvd.KerasEstimator(
                backend=backend,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2,
                run_id=run_id)

            keras_estimator._load_model_from_checkpoint = mock.Mock(
                side_effect=keras_estimator._load_model_from_checkpoint)

            ckpt_path = store.get_checkpoint_path(run_id)
            assert not store.exists(ckpt_path)
            keras_estimator._load_model_from_checkpoint.assert_not_called()
            keras_model = keras_estimator.fit(df)

            trained_model = keras_model.getModel()
            pred = trained_model.predict([np.ones([1, 2], dtype=np.float64)])
            assert len(pred) == 1

            assert store.exists(ckpt_path)
            keras_estimator.fit(df)
            keras_estimator._load_model_from_checkpoint.assert_called()
def test_model_serialization(self, mock_remote_trainer):
    model = create_xor_model()
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    loss = 'binary_crossentropy'

    def train(serialized_model, train_rows, val_rows, avg_row_size):
        return None, serialized_model, 2
    mock_remote_trainer.return_value = train

    with spark_session('test_model_serialization') as spark:
        df = create_xor_data(spark)

        keras_estimator = hvd.KerasEstimator(
            model=model,
            optimizer=optimizer,
            loss=loss,
            feature_cols=['features'],
            label_cols=['y'],
            batch_size=1,
            epochs=3,
            verbose=2)

        backend = CallbackBackend()
        with local_store() as store:
            with temppath() as saved_path:
                keras_estimator.save(saved_path)
                keras_estimator_loaded = hvd.KerasEstimator.load(saved_path)

                keras_model = keras_estimator_loaded.fit(df, params={
                    keras_estimator_loaded.backend: backend,
                    keras_estimator_loaded.store: store
                })

                trained_model = keras_model.getModel()
                pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32
def test_restore_from_checkpoint(self):
    model = create_xor_model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss = nn.BCELoss()

    with spark_session('test_restore_from_checkpoint') as spark:
        df = create_xor_data(spark)

        ctx = CallbackBackend()
        run_id = 'run01'

        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                backend=ctx,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=1,
                verbose=2,
                run_id=run_id)

            torch_estimator._load_checkpoint = mock.Mock(
                side_effect=torch_estimator._load_checkpoint)

            ckpt_path = store.get_checkpoint_path(run_id)
            assert not store.exists(ckpt_path)
            torch_estimator._load_checkpoint.assert_not_called()
            torch_estimator.fit(df)

            assert store.exists(ckpt_path)
            torch_estimator.fit(df)
            torch_estimator._load_checkpoint.assert_called()
def test_fit_model(self):
    model = create_xor_model()
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
    loss = 'binary_crossentropy'

    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)

        with local_store() as store:
            keras_estimator = hvd.KerasEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                random_seed=1,
                epochs=3,
                verbose=2,
                use_gpu=False,
                mp_start_method='spawn')

            assert not keras_estimator.getUseGpu()
            assert 'spawn' == keras_estimator.getMpStartMethod()
            keras_estimator.setMpStartMethod('forkserver')
            assert 'forkserver' == keras_estimator.getMpStartMethod()

            keras_model = keras_estimator.fit(df)

            trained_model = keras_model.getModel()
            pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
            assert len(pred) == 1
            assert pred.dtype == np.float32
def test_df_cache(self):
    # Clean the cache before starting the test
    util.clear_training_cache()
    util._training_cache.get_dataset = mock.Mock(
        side_effect=util._training_cache.get_dataset)

    with spark_session('test_df_cache') as spark:
        with local_store() as store:
            df = create_xor_data(spark)
            df2 = create_xor_data(spark)
            df3 = create_xor_data(spark)

            key = util._training_cache.create_key(df, store, None)
            key2 = util._training_cache.create_key(df2, store, None)
            key3 = util._training_cache.create_key(df3, store, None)

            # All keys are distinct
            assert key != key2
            assert key != key3
            assert key2 != key3

            # The cache should be empty to start
            assert not util._training_cache.is_cached(key, store)
            assert not util._training_cache.is_cached(key2, store)
            assert not util._training_cache.is_cached(key3, store)

            # First insertion into the cache
            with util.prepare_data(num_processes=2,
                                   store=store,
                                   df=df,
                                   feature_columns=['features'],
                                   label_columns=['y']) as dataset_idx:
                train_rows, val_rows, metadata, avg_row_size = \
                    util.get_dataset_properties(dataset_idx)
                util._training_cache.get_dataset.assert_not_called()
                assert len(util._training_cache._key_to_dataset) == 1
                assert util._training_cache.is_cached(key, store)
                assert dataset_idx == 0

                # The first dataset is still in use, so we assign the next integer
                # in sequence to this dataset
                assert not util._training_cache.is_cached(key2, store)
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df2,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx2:
                    util._training_cache.get_dataset.assert_not_called()
                    assert len(util._training_cache._key_to_dataset) == 2
                    assert util._training_cache.is_cached(key2, store)
                    assert dataset_idx2 == 1

            # Even though the first dataset is no longer in use, it is still cached
            with util.prepare_data(num_processes=2,
                                   store=store,
                                   df=df,
                                   feature_columns=['features'],
                                   label_columns=['y']) as dataset_idx1:
                train_rows1, val_rows1, metadata1, avg_row_size1 = \
                    util.get_dataset_properties(dataset_idx1)
                util._training_cache.get_dataset.assert_called()
                assert train_rows == train_rows1
                assert val_rows == val_rows1
                assert metadata == metadata1
                assert avg_row_size == avg_row_size1
                assert dataset_idx1 == 0

            # The first dataset is no longer in use, so we can reclaim its dataset index
            assert not util._training_cache.is_cached(key3, store)
            with util.prepare_data(num_processes=2,
                                   store=store,
                                   df=df3,
                                   feature_columns=['features'],
                                   label_columns=['y']) as dataset_idx3:
                train_rows3, val_rows3, metadata3, avg_row_size3 = \
                    util.get_dataset_properties(dataset_idx3)
                assert train_rows == train_rows3
                assert val_rows == val_rows3
                assert metadata == metadata3
                assert avg_row_size == avg_row_size3
                assert dataset_idx3 == 0

            # Same dataframe, different validation
            bad_key = util._training_cache.create_key(df, store, 0.1)
            assert not util._training_cache.is_cached(bad_key, store)
def test_keras_model_checkpoint_callback(self, mock_fit_fn, mock_pin_gpu_fn):
    from horovod.tensorflow.keras.callbacks import BestModelCheckpoint

    def _get_mock_fit_fn(checkpoint_callback_provided):
        def fit(model, train_data, val_data, steps_per_epoch, validation_steps,
                callbacks, verbose):
            returned_model_checkpoint_present = False
            model_checkpoint_present = False
            for callback in callbacks:
                callback.set_model(model)
                callback.on_epoch_end(0, logs={'binary_crossentropy': 0.3})

                if checkpoint_callback_provided and isinstance(callback, BestModelCheckpoint):
                    self.assertIsNotNone(callback.filepath)
                    self.assertTrue(callback.save_best_only)
                    self.assertEqual(callback.monitor, 'binary_crossentropy')
                    returned_model_checkpoint_present = True

                if not checkpoint_callback_provided and \
                        isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                    self.assertFalse(callback.save_best_only)
                    self.assertEqual(callback.monitor, 'val_loss')
                    model_checkpoint_present = True

            if checkpoint_callback_provided:
                self.assertTrue(returned_model_checkpoint_present)
                self.assertFalse(model_checkpoint_present)
            else:
                self.assertFalse(returned_model_checkpoint_present)
                self.assertTrue(model_checkpoint_present)

            return mock.Mock()
        return fit

    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_model_checkpoint_callback') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = tf.keras.optimizers.SGD(lr=0.1)
                loss = 'binary_crossentropy'

                # When no checkpoint callback is provided, the default one should be created
                mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=False)
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()

                # A provided checkpoint callback should be correctly set on the model
                mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=True)
                checkpoint_callback = BestModelCheckpoint(monitor='binary_crossentropy')
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2,
                    checkpoint_callback=checkpoint_callback)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()