def test_fit_model(self):
    """Fit a Lightning TorchEstimator on noisy XOR data and sanity-check a prediction."""
    # Lightning-on-Spark tests clash with TF 2.5.x in the same environment.
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')
    model = create_xor_model()
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                random_seed=1,
                verbose=2)
            fitted = estimator.fit(df)
            trained = fitted.getModel()
            out = trained(torch.ones([1, 2], dtype=torch.int32))
            # One prediction row, float32 output.
            assert len(out) == 1
            assert out.dtype == torch.float32
def test_fit_model(self):
    """Fit a TorchEstimator with explicit optimizer/loss and a sample-weight column."""
    model = create_xor_model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss = F.binary_cross_entropy
    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)
        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                random_seed=1,
                verbose=2,
                sample_weight_col='weight')
            fitted = estimator.fit(df)
            trained = fitted.getModel()
            out = trained(torch.ones([1, 2], dtype=torch.int32))
            # One prediction row, float32 output.
            assert len(out) == 1
            assert out.dtype == torch.float32
def test_legacy_fit_model(self):
    """Fit the legacy (non-Lightning) model variant through TorchEstimator."""
    # Lightning-on-Spark tests clash with TF 2.5.x in the same environment.
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')
    model = create_legacy_xor_model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss = F.binary_cross_entropy
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=4,
                epochs=2,
                verbose=2,
                sample_weight_col='weight')
            fitted = estimator.fit(df)
            trained = fitted.getModel()
            out = trained(torch.ones([1, 2], dtype=torch.int32))
            # One prediction row, float32 output.
            assert len(out) == 1
            assert out.dtype == torch.float32
def test_fit_model(self):
    """Fit a TorchEstimator end-to-end; Gloo-backed runs need Python 3."""
    if sys.version_info < (3, 0, 0) and is_gloo_used():
        self.skipTest('Horovod on Spark over Gloo only supported on Python3')
    model = create_xor_model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss = F.binary_cross_entropy
    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)
        with local_store() as store:
            estimator = hvd.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2,
                sample_weight_col='weight')
            fitted = estimator.fit(df)
            trained = fitted.getModel()
            out = trained(torch.ones([1, 2], dtype=torch.int32))
            # One prediction row, float32 output.
            assert len(out) == 1
            assert out.dtype == torch.float32
def test_torch_direct_parquet_train(self):
    """Train directly from prepared parquet files via fit_on_parquet()."""
    with spark_session('test_torch_direct_parquet_train') as spark:
        df = create_xor_data(spark)
        backend = CallbackBackend()
        with local_store() as store:
            # Point the store at the already-prepared train/val parquet paths.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path
            with util.prepare_data(backend.num_processes(), store, df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                loss = nn.BCELoss()
                estimator = hvd_spark.TorchEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)
                # To make sure that setLoss works with non-list loss.
                estimator.setLoss(loss)
                transformer = estimator.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
def test_transform_multi_class(self):
    """transform() on a two-output model yields a vector-typed output column."""
    model = create_xor_model(output_dim=2)
    with spark_session('test_transform_multi_class') as spark:
        df = create_xor_data(spark)
        metadata = util._get_metadata(df)
        torch_model = hvd_spark.TorchModel(history=None,
                                           model=model,
                                           input_shapes=[[2]],
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           _metadata=metadata)
        out_df = torch_model.transform(df)
        # Multi-class output column is a vector; inputs keep their types.
        expected = {
            'x1': LongType,
            'x2': LongType,
            'features': VectorUDT,
            'weight': DoubleType,
            'y': DoubleType,
            'y__output': VectorUDT,
        }
        for field in out_df.schema.fields:
            assert type(field.dataType) == expected[field.name]
def test_model_override_trainer_args(self):
    """Fit with extra pytorch_lightning Trainer kwargs passed via trainer_args.

    Fixes: removed an unused function-local ModelCheckpoint import and renamed
    the context variable `dir`, which shadowed the builtin.
    """
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()
        # Temp dir kept for parity with sibling tests, though not referenced.
        with tempdir() as tmp_dir:
            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    trainer_args={'stochastic_weight_avg': True})
                torch_model = torch_estimator.fit(df)
                # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
def test_restore_from_checkpoint(self):
    """A second fit() with the same run_id resumes from the stored checkpoint."""
    model = create_xor_model()
    with spark_session('test_restore_from_checkpoint') as spark:
        df = create_noisy_xor_data(spark)
        backend = CallbackBackend()
        run_id = 'run01'
        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                backend=backend,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                run_id=run_id)
            # Wrap the checkpoint reader so its invocations can be asserted.
            estimator._read_checkpoint = Mock(
                side_effect=estimator._read_checkpoint)
            ckpt_path = store.get_checkpoint_path(run_id)
            # Before any fit: no checkpoint on disk, reader never called.
            assert not store.exists(ckpt_path)
            estimator._read_checkpoint.assert_not_called()
            estimator.fit(df)
            # First fit wrote a checkpoint; the next fit must load it.
            assert store.exists(ckpt_path)
            estimator.fit(df)
            estimator._read_checkpoint.assert_called()
def test_fit_model(self):
    """Fit a KerasEstimator on XOR data and sanity-check a prediction.

    Fix: `lr` is a deprecated alias in tf.keras optimizers; use
    `learning_rate` to avoid deprecation warnings (same behavior).
    """
    model = create_xor_model()
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    loss = 'binary_crossentropy'
    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)
        with local_store() as store:
            keras_estimator = hvd.KerasEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2)
            keras_model = keras_estimator.fit(df)
            trained_model = keras_model.getModel()
            pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
            # One prediction row, float32 output.
            assert len(pred) == 1
            assert pred.dtype == np.float32
def test_happy_run_elastic_fault_tolerant_fails(self):
    """Elastic run where failures exhaust Spark's max task retries.

    NOTE: the unconditional skipTest() below disables this test, so the
    remainder of the body does not currently execute.
    """
    self.skipTest(
        'elastic horovod does not support shutdown from the spark driver '
        'while elastic driver is waiting for hosts to come up')
    if not gloo_built():
        self.skipTest("Gloo is not available")
    with spark_session('test_happy_run_elastic_fault_tolerant_fails',
                       max_failures=2):
        with tempdir() as work_dir:
            # These marker files make the training fn fail at the given
            # rank/epoch/batch; there are exactly max_failures of them
            # per task index.
            with open(os.path.sep.join([work_dir, 'rank_1_epoch_2_batch_4_fail']), 'w'), \
                    open(os.path.sep.join([work_dir, 'rank_1_epoch_3_batch_1_fail']), 'w'):
                pass
            res = horovod.spark.run_elastic(
                fn,
                args=(2, 5, 5, work_dir),
                env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                num_proc=2,
                min_num_proc=2,
                max_num_proc=2,
                start_timeout=5,
                verbose=2)
            self.assertListEqual([([0, 4, 0, 4, 1, 4, 0, 4], 0),
                                  ([0, 4, 0, 4, 1, 4, 0, 4], 1)], res)
def test_train_with_inmemory_cache_all(self):
    """Fit with inmemory_cache_all=True on a single worker."""
    # Lightning-on-Spark tests clash with TF 2.5.x in the same environment.
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()
        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                # The in-memory dataloader targets single-worker training
                # with small data.
                num_proc=1,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                inmemory_cache_all=True)
            fitted = estimator.fit(df)
            # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
            trained = fitted.getModel()
            out = trained(torch.ones([1, 2], dtype=torch.int32))
            assert len(out) == 1
            assert out.dtype == torch.float32
def test_fit_model_multiclass(self):
    """Fit a multi-class Keras model across core counts and check argmax consistency."""
    model = create_mnist_model()
    optimizer = tf.keras.optimizers.Adadelta(1.0)
    loss = tf.keras.losses.categorical_crossentropy
    # The second core count deliberately exceeds the buffer-memory cap.
    for num_cores in [2, constants.TOTAL_BUFFER_MEMORY_CAP_GIB + 1]:
        with spark_session('test_fit_model_multiclass',
                           cores=num_cores) as spark:
            df = create_mnist_data(spark)
            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(
                    num_proc=num_cores,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    metrics=['accuracy'],
                    feature_cols=['features'],
                    label_cols=['label_vec'],
                    batch_size=2,
                    epochs=2,
                    verbose=2)
                keras_model = keras_estimator.fit(df).setOutputCols(
                    ['label_prob'])
                pred_df = keras_model.transform(df)
                argmax = udf(lambda v: float(np.argmax(v)),
                             returnType=T.DoubleType())
                pred_df = pred_df.withColumn('label_pred',
                                             argmax(pred_df.label_prob))
                preds = pred_df.collect()
                assert len(preds) == df.count()
                row = preds[0]
                label_prob = row.label_prob.toArray().tolist()
                # The predicted class must carry the maximal probability.
                assert label_prob[int(row.label_pred)] == max(label_prob)
def test_direct_parquet_train(self):
    """fit_on_parquet() with and without the in-memory cache."""
    with spark_session('test_direct_parquet_train') as spark:
        df = create_noisy_xor_data(spark)
        backend = CallbackBackend()
        with local_store() as store:
            # Point the store at the already-prepared train/val parquet paths.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path
            with util.prepare_data(backend.num_processes(), store, df,
                                   feature_columns=['features'],
                                   label_columns=['y'],
                                   validation=0.2):
                model = create_xor_model()
                for inmemory_cache_all in [False, True]:
                    estimator = hvd_spark.TorchEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=1,
                        epochs=3,
                        verbose=2,
                        inmemory_cache_all=inmemory_cache_all)
                    transformer = estimator.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
def test_happy_run_elastic_fault_tolerant(self):
    """Elastic run that survives injected per-rank failures within max_failures."""
    # Lightning-on-Spark tests clash with TF 2.5.x in the same environment.
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')
    if not gloo_built():
        self.skipTest("Gloo is not available")
    with spark_session('test_happy_run_elastic_fault_tolerant',
                       max_failures=3):
        with tempdir() as work_dir:
            # These marker files make the training fn fail at the given
            # rank/epoch/batch.
            with open(os.path.sep.join([work_dir, 'rank_1_epoch_2_batch_4_fail']), 'w'), \
                    open(os.path.sep.join([work_dir, 'rank_0_epoch_3_batch_1_fail']), 'w'), \
                    open(os.path.sep.join([work_dir, 'rank_1_epoch_4_batch_2_fail']), 'w'):
                pass
            res = horovod.spark.run_elastic(
                fn,
                args=(2, 5, 5, work_dir),
                env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                num_proc=2,
                min_num_proc=2,
                max_num_proc=2,
                start_timeout=5,
                verbose=2)
            self.assertListEqual([([0, 4, 0, 4, 1, 4, 0, 4], 0),
                                  ([0, 4, 0, 4, 1, 4, 0, 4], 1)], res)
def test_train_with_pytorch_infinite_async_data_loader(self):
    """Fit with the infinite async data loader supplied via data_loader_class."""
    from horovod.spark.data_loaders.pytorch_data_loaders import PytorchInfiniteAsyncDataLoader
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()
        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                data_loader_class=PytorchInfiniteAsyncDataLoader)
            fitted = estimator.fit(df)
            # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
            trained = fitted.getModel()
            out = trained(torch.ones([1, 2], dtype=torch.int32))
            assert len(out) == 1
            assert out.dtype == torch.float32
def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
    """Train a KerasEstimator straight from prepared parquet files.

    Fix: `lr` is a deprecated alias in tf.keras optimizers; use
    `learning_rate` to avoid deprecation warnings (same behavior).
    """
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()
    with spark_session('test_keras_direct_parquet_train') as spark:
        df = create_xor_data(spark)
        backend = CallbackBackend()
        with local_store() as store:
            # Point the store at the already-prepared train/val parquet paths.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path
            with util.prepare_data(backend.num_processes(), store, df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
                loss = 'binary_crossentropy'
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)
                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
def test_early_stop_callback(self):
    """Fit with a Lightning EarlyStopping callback attached.

    Fix: val_loss is minimized, so the callback must use mode='min'.
    The original mode='max' would treat a decreasing loss as "no
    improvement" (harmless here only because patience=3 > epochs=2).
    """
    from pytorch_lightning.callbacks.early_stopping import EarlyStopping
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()
        early_stop_callback = EarlyStopping(monitor='val_loss',
                                            min_delta=0.00,
                                            patience=3,
                                            verbose=True,
                                            mode='min')
        callbacks = [early_stop_callback]
        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                callbacks=callbacks)
            torch_model = torch_estimator.fit(df)
            # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
            trained_model = torch_model.getModel()
            pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
            assert len(pred) == 1
            assert pred.dtype == torch.float32
def test_model_checkpoint_callback(self):
    """Fit with a Lightning ModelCheckpoint callback writing into a temp dir."""
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()
        with tempdir() as ckpt_dir:
            checkpoint_callback = ModelCheckpoint(dirpath=ckpt_dir)
            callbacks = [checkpoint_callback]
            with local_store() as store:
                estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    callbacks=callbacks)
                fitted = estimator.fit(df)
                # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                trained = fitted.getModel()
                out = trained(torch.ones([1, 2], dtype=torch.int32))
                assert len(out) == 1
                assert out.dtype == torch.float32
def test_fit_model(self):
    """Fit a TorchEstimator on noisy XOR data and sanity-check a prediction."""
    model = create_xor_model()
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2)
            fitted = estimator.fit(df)
            trained = fitted.getModel()
            out = trained(torch.ones([1, 2], dtype=torch.int32))
            # One prediction row, float32 output.
            assert len(out) == 1
            assert out.dtype == torch.float32
def test_transform_multi_class(self):
    """transform() on a mocked multi-class model (output_dim=2).

    In the multi-class case the model output column is a vector while the
    label stays numeric.
    """
    model = create_xor_model(output_dim=2)
    with spark_session('test_transform_multi_class') as spark:
        df = create_xor_data(spark)
        metadata = util._get_metadata(df)
        torch_model = hvd_spark.TorchModel(history=None,
                                           model=model,
                                           input_shapes=[[2]],
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           _metadata=metadata)
        out_df = torch_model.transform(df)
        expected = {
            'x1': IntegerType,
            'x2': IntegerType,
            'features': VectorUDT,
            'weight': FloatType,
            'y': FloatType,
            'y__output': VectorUDT,
        }
        for field in out_df.schema.fields:
            assert type(field.dataType) == expected[field.name]
def test_fit_model(self):
    """Fit a KerasEstimator end-to-end; Gloo-backed runs need Python 3.

    Fix: `lr` is a deprecated alias in tf.keras optimizers; use
    `learning_rate` to avoid deprecation warnings (same behavior).
    """
    if sys.version_info < (3, 0, 0) and is_gloo_used():
        self.skipTest('Horovod on Spark over Gloo only supported on Python3')
    model = create_xor_model()
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    loss = 'binary_crossentropy'
    with spark_session('test_fit_model') as spark:
        df = create_xor_data(spark)
        with local_store() as store:
            keras_estimator = hvd.KerasEstimator(
                num_proc=2,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2)
            keras_model = keras_estimator.fit(df)
            trained_model = keras_model.getModel()
            pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
            # One prediction row, float32 output.
            assert len(pred) == 1
            assert pred.dtype == np.float32
def test_get_col_info_error_bad_size(self):
    """_get_col_info must reject rows whose vectors disagree in length."""
    with spark_session('test_get_col_info_error_bad_size') as spark:
        rows = [[DenseVector([1.0, 1.0])],
                [DenseVector([1.0])]]
        schema = StructType([StructField('data', VectorUDT())])
        df = create_test_data_from_schema(spark, rows, schema)
        with pytest.raises(ValueError):
            util._get_col_info(df)
def test_happy_run(self):
    """horovod.spark.run gathers ranks from both workers."""
    def fn():
        hvd.init()
        gathered = hvd.allgather(torch.tensor([hvd.rank()])).tolist()
        return gathered, hvd.rank()

    with spark_session('test_happy_run'):
        res = horovod.spark.run(fn,
                                env={'PATH': os.environ.get('PATH')},
                                verbose=0)
        self.assertListEqual([([0, 1], 0), ([0, 1], 1)], res)
def test_get_col_info_error_bad_shape(self):
    """_get_col_info must reject sparse vectors with inconsistent shapes."""
    with spark_session('test_get_col_info_error_bad_shape') as spark:
        rows = [[SparseVector(2, {0: 1.0})],
                [SparseVector(1, {0: 1.0})]]
        schema = StructType([StructField('data', VectorUDT())])
        df = create_test_data_from_schema(spark, rows, schema)
        with pytest.raises(ValueError):
            util._get_col_info(df)
def test_dummy_callback(self):
    """Fit across proc/epoch combinations with a counting Lightning callback.

    Fix: corrected the misspelled counter attributes
    ('epcoh_end_counter' -> 'epoch_end_counter', likewise for the train
    counter); both are internal to the locally defined callback class.
    """
    from pytorch_lightning.callbacks import Callback
    model = create_xor_model()
    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        for num_proc in [1, 2]:
            for epochs in [2, 3]:

                class MyDummyCallback(Callback):
                    def __init__(self):
                        self.epoch_end_counter = 0
                        self.train_epoch_end_counter = 0

                    def on_init_start(self, trainer):
                        print('Starting to init trainer!')

                    def on_init_end(self, trainer):
                        print('Trainer is initialized.')

                    def on_epoch_end(self, trainer, model):
                        print('A epoch ended.')
                        self.epoch_end_counter += 1

                    def on_train_epoch_end(self, trainer, model, unused=None):
                        print('A train epoch ended.')
                        self.train_epoch_end_counter += 1

                    def on_train_end(self, trainer, model):
                        print('Training ends')
                        # Each configured epoch must have completed.
                        assert self.train_epoch_end_counter == epochs

                dm_callback = MyDummyCallback()
                callbacks = [dm_callback]
                with local_store() as store:
                    torch_estimator = hvd_spark.TorchEstimator(
                        num_proc=num_proc,
                        store=store,
                        model=model,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=4,
                        epochs=epochs,
                        verbose=2,
                        callbacks=callbacks)
                    torch_model = torch_estimator.fit(df)
                    # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                    trained_model = torch_model.getModel()
                    pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                    assert len(pred) == 1
                    assert pred.dtype == torch.float32
def test_get_available_devices(self):
    """Each worker sees exactly its own GPU device id."""
    def fn():
        hvd.init()
        devices = get_available_devices()
        return devices, hvd.local_rank()

    with spark_session('test_get_available_devices', gpus=2):
        res = horovod.spark.run(fn,
                                env={'PATH': os.environ.get('PATH')},
                                verbose=0)
        self.assertListEqual([(['0'], 0), (['1'], 1)], res)
def test_spark_run_func_with_non_zero_exit(self):
    """A run_func returning a non-zero exit code surfaces as an exception."""
    run_func = MagicMock(return_value=1)

    def fn():
        return 1

    with spark_session('test_spark_run_func', cores=4):
        with pytest.raises(Exception,
                           match='^mpirun failed with exit code 1$') as e:
            horovod.spark.run(fn, verbose=0, run_func=run_func)
def test_get_metadata(self):
    """_get_metadata classifies float, dense, sparse and mixed vector columns."""
    expected = {
        'float': {
            'spark_data_type': FloatType,
            'is_sparse_vector_only': False,
            'intermediate_format': constants.NOCHANGE,
            'max_size': 1,
            'shape': 1,
        },
        'dense': {
            'spark_data_type': DenseVector,
            'is_sparse_vector_only': False,
            'intermediate_format': constants.ARRAY,
            'max_size': 2,
            'shape': 2,
        },
        'sparse': {
            'spark_data_type': SparseVector,
            'is_sparse_vector_only': True,
            'intermediate_format': constants.CUSTOM_SPARSE,
            'max_size': 1,
            'shape': 2,
        },
        # A column mixing dense and sparse rows is treated as dense/ARRAY.
        'mixed': {
            'spark_data_type': DenseVector,
            'is_sparse_vector_only': False,
            'intermediate_format': constants.ARRAY,
            'max_size': 2,
            'shape': 2,
        },
    }
    with spark_session('test_get_metadata') as spark:
        rows = [
            [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {0: 1.0}),
             DenseVector([1.0, 1.0])],
            [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
             SparseVector(2, {1: 1.0})],
        ]
        schema = StructType([
            StructField('float', FloatType()),
            StructField('dense', VectorUDT()),
            StructField('sparse', VectorUDT()),
            StructField('mixed', VectorUDT()),
        ])
        df = create_test_data_from_schema(spark, rows, schema)
        metadata = util._get_metadata(df)
        self.assertDictEqual(metadata, expected)
def test_timeout(self):
    """Requesting more procs than Spark can start trips the start timeout."""
    with spark_session('test_timeout'):
        with pytest.raises(
                Exception,
                match='^Timed out waiting for Spark tasks to start.'):
            horovod.spark.run(None,
                              num_proc=4,
                              start_timeout=5,
                              env={'PATH': os.environ.get('PATH')},
                              verbose=0)
def test_mpirun_not_found(self):
    """A missing mpirun binary fails fast with exit code 127."""
    started_at = time.time()
    with spark_session('test_mpirun_not_found'):
        with pytest.raises(Exception,
                           match='^mpirun failed with exit code 127$'):
            horovod.spark.run(None, env={'PATH': '/nonexistent'}, verbose=0)
        # Failure must propagate quickly rather than hang.
        self.assertLessEqual(time.time() - started_at, 10,
                             'Failure propagation took too long')