Пример #1
0
    def test_direct_parquet_train(self):
        with spark_session('test_direct_parquet_train') as spark:
            df = create_noisy_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y'],
                                       validation=0.2):
                    model = create_xor_model()

                    for inmemory_cache_all in [False, True]:
                        est = hvd_spark.TorchEstimator(
                            backend=backend,
                            store=store,
                            model=model,
                            input_shapes=[[-1, 2]],
                            feature_cols=['features'],
                            label_cols=['y'],
                            validation=0.2,
                            batch_size=1,
                            epochs=3,
                            verbose=2,
                            inmemory_cache_all=inmemory_cache_all)

                        transformer = est.fit_on_parquet()
                        predictions = transformer.transform(df)
                        assert predictions.count() == df.count()
Пример #2
0
    def test_fit_model(self):
        model = create_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = F.binary_cross_entropy

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    random_seed=1,
                    verbose=2,
                    sample_weight_col='weight')

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #3
0
    def test_fit_model(self):
        if sys.version_info < (3, 0, 0) and is_gloo_used():
            self.skipTest(
                'Horovod on Spark over Gloo only supported on Python3')

        model = create_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = F.binary_cross_entropy

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2,
                    sample_weight_col='weight')

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #4
0
    def test_torch_direct_parquet_train(self):
        with spark_session('test_torch_direct_parquet_train') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                    loss = nn.BCELoss()

                    est = hvd_spark.TorchEstimator(backend=backend,
                                                   store=store,
                                                   model=model,
                                                   optimizer=optimizer,
                                                   input_shapes=[[2]],
                                                   feature_cols=['features'],
                                                   label_cols=['y'],
                                                   batch_size=1,
                                                   epochs=3,
                                                   verbose=2)

                    # To make sure that setLoss works with non-list loss.
                    est.setLoss(loss)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
Пример #5
0
    def test_fit_model(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        model = create_xor_model()

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    random_seed=1,
                    verbose=2)

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #6
0
    def test_fit_model(self):
        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                keras_model = keras_estimator.fit(df)

                trained_model = keras_model.getModel()
                pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32
Пример #7
0
    def test_train_with_pytorch_infinite_async_data_loader(self):
        from horovod.spark.data_loaders.pytorch_data_loaders import PytorchInfiniteAsyncDataLoader

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    data_loader_class=PytorchInfiniteAsyncDataLoader)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #8
0
    def test_fit_model(self):
        if sys.version_info < (3, 0, 0) and is_gloo_used():
            self.skipTest(
                'Horovod on Spark over Gloo only supported on Python3')

        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(num_proc=2,
                                                     store=store,
                                                     model=model,
                                                     optimizer=optimizer,
                                                     loss=loss,
                                                     feature_cols=['features'],
                                                     label_cols=['y'],
                                                     batch_size=1,
                                                     epochs=3,
                                                     verbose=2)

                keras_model = keras_estimator.fit(df)

                trained_model = keras_model.getModel()
                pred = trained_model.predict(
                    [np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32
Пример #9
0
    def test_model_override_trainer_args(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            with tempdir() as dir:

                with local_store() as store:
                    torch_estimator = hvd_spark.TorchEstimator(
                        num_proc=2,
                        store=store,
                        model=model,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=4,
                        epochs=2,
                        verbose=2,
                        trainer_args={'stochastic_weight_avg': True})

                    torch_model = torch_estimator.fit(df)

                    # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                    trained_model = torch_model.getModel()
                    pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                    assert len(pred) == 1
                    assert pred.dtype == torch.float32
Пример #10
0
    def test_train_with_inmemory_cache_all(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=
                    1,  # Normally inmem dataloader is for single worker training with small data
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    inmemory_cache_all=True)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #11
0
    def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
        mock_fit_fn.return_value = get_mock_fit_fn()
        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_direct_parquet_train') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = tf.keras.optimizers.SGD(lr=0.1)
                    loss = 'binary_crossentropy'

                    est = hvd.KerasEstimator(backend=backend,
                                             store=store,
                                             model=model,
                                             optimizer=optimizer,
                                             loss=loss,
                                             feature_cols=['features'],
                                             label_cols=['y'],
                                             batch_size=1,
                                             epochs=3,
                                             verbose=2)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                assert predictions.count() == df.count()
Пример #12
0
    def test_fit_model_multiclass(self):
        model = create_mnist_model()
        optimizer = tf.keras.optimizers.Adadelta(1.0)
        loss = tf.keras.losses.categorical_crossentropy

        for num_cores in [2, constants.TOTAL_BUFFER_MEMORY_CAP_GIB + 1]:
            with spark_session('test_fit_model_multiclass', cores=num_cores) as spark:
                df = create_mnist_data(spark)

                with local_store() as store:
                    keras_estimator = hvd.KerasEstimator(
                        num_proc=num_cores,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        metrics=['accuracy'],
                        feature_cols=['features'],
                        label_cols=['label_vec'],
                        batch_size=2,
                        epochs=2,
                        verbose=2)

                    keras_model = keras_estimator.fit(df).setOutputCols(['label_prob'])
                    pred_df = keras_model.transform(df)

                    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
                    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))

                    preds = pred_df.collect()
                    assert len(preds) == df.count()

                    row = preds[0]
                    label_prob = row.label_prob.toArray().tolist()
                    assert label_prob[int(row.label_pred)] == max(label_prob)
Пример #13
0
    def test_early_stop_callback(self):
        from pytorch_lightning.callbacks.early_stopping import EarlyStopping

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            early_stop_callback = EarlyStopping(monitor='val_loss',
                                                min_delta=0.00,
                                                patience=3,
                                                verbose=True,
                                                mode='max')
            callbacks = [early_stop_callback]

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    callbacks=callbacks)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #14
0
    def test_model_checkpoint_callback(self):
        from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            with tempdir() as dir:
                checkpoint_callback = ModelCheckpoint(dirpath=dir)
                callbacks = [checkpoint_callback]

                with local_store() as store:
                    torch_estimator = hvd_spark.TorchEstimator(
                        num_proc=2,
                        store=store,
                        model=model,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=4,
                        epochs=2,
                        verbose=2,
                        callbacks=callbacks)

                    torch_model = torch_estimator.fit(df)

                    # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                    trained_model = torch_model.getModel()
                    pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                    assert len(pred) == 1
                    assert pred.dtype == torch.float32
Пример #15
0
    def test_fit_model(self):
        model = create_xor_model()

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2)

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #16
0
    def test_restore_from_checkpoint(self):

        model = create_xor_model()

        with spark_session('test_restore_from_checkpoint') as spark:
            df = create_noisy_xor_data(spark)

            ctx = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    backend=ctx,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    run_id=run_id)

                torch_estimator._read_checkpoint = Mock(
                    side_effect=torch_estimator._read_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                torch_estimator._read_checkpoint.assert_not_called()
                torch_estimator.fit(df)

                assert store.exists(ckpt_path)
                torch_estimator.fit(df)
                torch_estimator._read_checkpoint.assert_called()
Пример #17
0
    def test_legacy_fit_model(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        model = create_legacy_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = F.binary_cross_entropy

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    sample_weight_col='weight')

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #18
0
    def test_dummy_callback(self):
        from pytorch_lightning.callbacks import Callback
        model = create_xor_model()

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            for num_proc in [1, 2]:
                for epochs in [2, 3]:

                    class MyDummyCallback(Callback):
                        def __init__(self):
                            self.epcoh_end_counter = 0
                            self.train_epcoh_end_counter = 0

                        def on_init_start(self, trainer):
                            print('Starting to init trainer!')

                        def on_init_end(self, trainer):
                            print('Trainer is initialized.')

                        def on_epoch_end(self, trainer, model):
                            print('A epoch ended.')
                            self.epcoh_end_counter += 1

                        def on_train_epoch_end(self, trainer, model, unused=None):
                            print('A train epoch ended.')
                            self.train_epcoh_end_counter += 1

                        def on_train_end(self, trainer, model):
                            print('Training ends')
                            assert self.train_epcoh_end_counter == epochs

                    dm_callback = MyDummyCallback()
                    callbacks = [dm_callback]

                    with local_store() as store:
                        torch_estimator = hvd_spark.TorchEstimator(
                            num_proc=num_proc,
                            store=store,
                            model=model,
                            input_shapes=[[-1, 2]],
                            feature_cols=['features'],
                            label_cols=['y'],
                            validation=0.2,
                            batch_size=4,
                            epochs=epochs,
                            verbose=2,
                            callbacks=callbacks)

                        torch_model = torch_estimator.fit(df)

                        # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                        trained_model = torch_model.getModel()
                        pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                        assert len(pred) == 1
                        assert pred.dtype == torch.float32
Пример #19
0
    def test_lr_scheduler_callback(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        from pytorch_lightning.callbacks import LearningRateMonitor

        class LRTestingModel(XOR):
            def configure_optimizers(self):
                optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

                def lambda_func(epoch):
                    return epoch // 30

                lr_scheduler = {
                    'scheduler':
                    torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_func),
                    'name':
                    'my_logging_name'
                }
                return [optimizer], [lr_scheduler]

        model = LRTestingModel()

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            lr_monitor = LearningRateMonitor(logging_interval='step')
            callbacks = [lr_monitor]

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    callbacks=callbacks)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from remote, and assert base on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Пример #20
0
    def test_torch_direct_parquet_train(self):
        with spark_session('test_torch_direct_parquet_train') as spark:
            df = create_xor_data_with_val(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                # Make sure we cover validation dataloader as well
                for validation in [None, 'val']:
                    # Need validation ratio to split data
                    with util.prepare_data(backend.num_processes(),
                                           store,
                                           df,
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           validation=validation):
                        model = create_xor_model()
                        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                        loss = nn.BCELoss()

                        for inmemory_cache_all in [False, True]:
                            for reader_pool_type in ['process', 'thread']:
                                est = hvd_spark.TorchEstimator(
                                    backend=backend,
                                    store=store,
                                    model=model,
                                    optimizer=optimizer,
                                    input_shapes=[[2]],
                                    feature_cols=['features'],
                                    label_cols=['y'],
                                    batch_size=1,
                                    epochs=3,
                                    verbose=2,
                                    reader_pool_type=reader_pool_type,
                                    inmemory_cache_all=inmemory_cache_all,
                                    validation=validation)

                                # To make sure that setLoss works with non-list loss.
                                est.setLoss(loss)

                                transformer = est.fit_on_parquet()
                                predictions = transformer.transform(df)
                                assert predictions.count() == df.count()
Пример #21
0
    def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
        mock_fit_fn.return_value = get_mock_fit_fn()
        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_direct_parquet_train') as spark:
            df = create_xor_data_with_val(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                # Make sure we cover val dataloader cases
                for validation in [None, 'val']:
                    with util.prepare_data(backend.num_processes(),
                                           store,
                                           df,
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           validation=validation):
                        model = create_xor_model()
                        optimizer = tf.keras.optimizers.SGD(lr=0.1)
                        loss = 'binary_crossentropy'

                        for inmemory_cache_all in [False, True]:
                            for reader_pool_type in ['process', 'thread']:
                                est = hvd.KerasEstimator(
                                    backend=backend,
                                    store=store,
                                    model=model,
                                    optimizer=optimizer,
                                    loss=loss,
                                    feature_cols=['features'],
                                    label_cols=['y'],
                                    batch_size=1,
                                    epochs=3,
                                    reader_pool_type=reader_pool_type,
                                    validation=validation,
                                    inmemory_cache_all=inmemory_cache_all,
                                    verbose=2)

                                transformer = est.fit_on_parquet()
                                predictions = transformer.transform(df)
                                assert predictions.count() == df.count()
Пример #22
0
    def test_restore_from_checkpoint(self, mock_fit_fn, mock_pin_gpu_fn):
        mock_fit_fn.return_value = get_mock_fit_fn()
        mock_pin_gpu_fn.return_value = mock.Mock()

        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        with spark_session('test_restore_from_checkpoint') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2,
                    run_id=run_id)

                keras_estimator._load_model_from_checkpoint = mock.Mock(
                    side_effect=keras_estimator._load_model_from_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                keras_estimator._load_model_from_checkpoint.assert_not_called()
                keras_model = keras_estimator.fit(df)

                trained_model = keras_model.getModel()
                pred = trained_model.predict([np.ones([1, 2], dtype=np.float64)])
                assert len(pred) == 1

                assert store.exists(ckpt_path)

                keras_estimator.fit(df)
                keras_estimator._load_model_from_checkpoint.assert_called()
Пример #23
0
    def test_direct_parquet_train(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        with spark_session('test_direct_parquet_train') as spark:
            df = create_noisy_xor_data_with_val(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                # Make sure to cover val dataloader cases
                for validation in [None, 'val']:
                    with util.prepare_data(backend.num_processes(),
                                           store,
                                           df,
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           validation=validation):
                        model = create_xor_model()

                        for inmemory_cache_all in [False, True]:
                            for reader_pool_type in ['process', 'thread']:
                                est = hvd_spark.TorchEstimator(
                                    backend=backend,
                                    store=store,
                                    model=model,
                                    input_shapes=[[-1, 2]],
                                    feature_cols=['features'],
                                    label_cols=['y'],
                                    validation=validation,
                                    batch_size=1,
                                    epochs=3,
                                    verbose=2,
                                    inmemory_cache_all=inmemory_cache_all,
                                    reader_pool_type=reader_pool_type)

                                transformer = est.fit_on_parquet()
                                predictions = transformer.transform(df)
                                assert predictions.count() == df.count()
Пример #24
0
    def test_legacy_restore_from_checkpoint(self):
        self.skipTest(
            'There is a bug in current lightning version for checkpoint'
            'call back. Will add this test back when it is solved.')

        model = create_legacy_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = nn.BCELoss()

        with spark_session('test_restore_from_checkpoint') as spark:
            df = create_noisy_xor_data(spark)

            ctx = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    backend=ctx,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    run_id=run_id)

                torch_estimator._read_checkpoint = mock.Mock(
                    side_effect=torch_estimator._read_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                torch_estimator._read_checkpoint.assert_not_called()
                torch_estimator.fit(df)

                assert store.exists(ckpt_path)
                torch_estimator.fit(df)
                torch_estimator._read_checkpoint.assert_called()
Пример #25
0
    def test_model_serialization(self, mock_remote_trainer):
        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        def train(serialized_model, train_rows, val_rows, avg_row_size):
            return None, serialized_model, 2

        mock_remote_trainer.return_value = train

        with spark_session('test_model_serialization') as spark:
            df = create_xor_data(spark)

            keras_estimator = hvd.KerasEstimator(model=model,
                                                 optimizer=optimizer,
                                                 loss=loss,
                                                 feature_cols=['features'],
                                                 label_cols=['y'],
                                                 batch_size=1,
                                                 epochs=3,
                                                 verbose=2)

            backend = CallbackBackend()
            with local_store() as store:
                with temppath() as saved_path:
                    keras_estimator.save(saved_path)
                    keras_estimator_loaded = hvd.KerasEstimator.load(
                        saved_path)

                keras_model = keras_estimator_loaded.fit(
                    df,
                    params={
                        keras_estimator_loaded.backend: backend,
                        keras_estimator_loaded.store: store
                    })

                trained_model = keras_model.getModel()
                pred = trained_model.predict(
                    [np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32
Пример #26
0
    def test_terminate_on_nan_flag(self):
        model = create_xor_model()

        with spark_session('test_terminate_on_nan_flag') as spark:
            df = create_noisy_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    terminate_on_nan=True,
                    profiler="pytorch")
                assert torch_estimator.getTerminateOnNan() == True
Пример #27
0
    def test_restore_from_checkpoint(self):
        model = create_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = nn.BCELoss()

        with spark_session('test_restore_from_checkpoint') as spark:
            df = create_xor_data(spark)

            ctx = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    backend=ctx,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=1,
                    verbose=2,
                    run_id=run_id)

                torch_estimator._load_checkpoint = mock.Mock(
                    side_effect=torch_estimator._load_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                torch_estimator._load_checkpoint.assert_not_called()
                torch_estimator.fit(df)

                assert store.exists(ckpt_path)
                torch_estimator.fit(df)
                torch_estimator._load_checkpoint.assert_called()
Пример #28
0
    def test_fit_model(self):
        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(num_proc=2,
                                                     store=store,
                                                     model=model,
                                                     optimizer=optimizer,
                                                     loss=loss,
                                                     feature_cols=['features'],
                                                     label_cols=['y'],
                                                     batch_size=1,
                                                     random_seed=1,
                                                     epochs=3,
                                                     verbose=2,
                                                     use_gpu=False,
                                                     mp_start_method='spawn')

                assert not keras_estimator.getUseGpu()
                assert 'spawn' == keras_estimator.getMpStartMethod()

                keras_estimator.setMpStartMethod('forkserver')
                assert 'forkserver' == keras_estimator.getMpStartMethod()

                keras_model = keras_estimator.fit(df)

                trained_model = keras_model.getModel()
                pred = trained_model.predict(
                    [np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32
Пример #29
0
    def test_df_cache(self):
        # Clean the cache before starting the test
        util.clear_training_cache()
        util._training_cache.get_dataset = mock.Mock(
            side_effect=util._training_cache.get_dataset)

        with spark_session('test_df_cache') as spark:
            with local_store() as store:
                df = create_xor_data(spark)
                df2 = create_xor_data(spark)
                df3 = create_xor_data(spark)

                key = util._training_cache.create_key(df, store, None)
                key2 = util._training_cache.create_key(df2, store, None)
                key3 = util._training_cache.create_key(df3, store, None)

                # All keys are distinct
                assert key != key2
                assert key != key3
                assert key2 != key3

                # The cache should be empty to start
                assert not util._training_cache.is_cached(key, store)
                assert not util._training_cache.is_cached(key2, store)
                assert not util._training_cache.is_cached(key3, store)

                # First insertion into the cache
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx:
                    train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(
                        dataset_idx)
                    util._training_cache.get_dataset.assert_not_called()
                    assert len(util._training_cache._key_to_dataset) == 1
                    assert util._training_cache.is_cached(key, store)
                    assert dataset_idx == 0

                    # The first dataset is still in use, so we assign the next integer in sequence to this
                    # dataset
                    assert not util._training_cache.is_cached(key2, store)
                    with util.prepare_data(num_processes=2,
                                           store=store,
                                           df=df2,
                                           feature_columns=['features'],
                                           label_columns=['y'
                                                          ]) as dataset_idx2:
                        util._training_cache.get_dataset.assert_not_called()
                        assert len(util._training_cache._key_to_dataset) == 2
                        assert util._training_cache.is_cached(key2, store)
                        assert dataset_idx2 == 1

                # Even though the first dataset is no longer in use, it is still cached
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx1:
                    train_rows1, val_rows1, metadata1, avg_row_size1 = util.get_dataset_properties(
                        dataset_idx1)
                    util._training_cache.get_dataset.assert_called()
                    assert train_rows == train_rows1
                    assert val_rows == val_rows1
                    assert metadata == metadata1
                    assert avg_row_size == avg_row_size1
                    assert dataset_idx1 == 0

                # The first dataset is no longer in use, so we can reclaim its dataset index
                assert not util._training_cache.is_cached(key3, store)
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df3,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx3:
                    train_rows3, val_rows3, metadata3, avg_row_size3 = util.get_dataset_properties(
                        dataset_idx3)
                    assert train_rows == train_rows3
                    assert val_rows == val_rows3
                    assert metadata == metadata3
                    assert avg_row_size == avg_row_size3
                    assert dataset_idx3 == 0

                # Same dataframe, different validation
                bad_key = util._training_cache.create_key(df, store, 0.1)
                assert not util._training_cache.is_cached(bad_key, store)
Пример #30
0
    def test_prepare_data_compress_sparse(self):
        util.clear_training_cache()

        expected_metadata = \
            {
                'float': {
                    'spark_data_type': FloatType,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.NOCHANGE,
                    'max_size': 1,
                    'shape': 1
                },
                'dense': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
                'sparse': {
                    'spark_data_type': SparseVector,
                    'is_sparse_vector_only': True,
                    'intermediate_format': constants.CUSTOM_SPARSE,
                    'max_size': 1,
                    'shape': 2
                },
                'mixed': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
            }

        with mock.patch('horovod.spark.common.util._get_metadata',
                        side_effect=util._get_metadata) as mock_get_metadata:
            with spark_session('test_prepare_data') as spark:
                data = [[
                    0.0,
                    DenseVector([1.0, 1.0]),
                    SparseVector(2, {1: 1.0}),
                    DenseVector([1.0, 1.0])
                ],
                        [
                            1.0,
                            DenseVector([1.0, 1.0]),
                            SparseVector(2, {1: 1.0}),
                            SparseVector(2, {1: 1.0})
                        ]]

                schema = StructType([
                    StructField('float', FloatType()),
                    StructField('dense', VectorUDT()),
                    StructField('sparse', VectorUDT()),
                    StructField('mixed', VectorUDT())
                ])

                df = create_test_data_from_schema(spark, data, schema)

                with local_store() as store:
                    with util.prepare_data(
                            num_processes=2,
                            store=store,
                            df=df,
                            feature_columns=['dense', 'sparse', 'mixed'],
                            label_columns=['float'],
                            compress_sparse=True) as dataset_idx:
                        mock_get_metadata.assert_called()
                        assert dataset_idx == 0

                        train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(
                            dataset_idx)
                        self.assertDictEqual(metadata, expected_metadata)