Example #1
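A Keras estimator is trained directly on prepared Parquet data via fit_on_parquet, covering both the 'process' and 'thread' reader pool types; the remote fit function and GPU pinning are mocked out.
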
    def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
        mock_fit_fn.return_value = get_mock_fit_fn()
        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_direct_parquet_train') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = tf.keras.optimizers.SGD(lr=0.1)
                    loss = 'binary_crossentropy'

                    for reader_pool_type in ['process', 'thread']:
                        est = hvd.KerasEstimator(
                            backend=backend,
                            store=store,
                            model=model,
                            optimizer=optimizer,
                            loss=loss,
                            feature_cols=['features'],
                            label_cols=['y'],
                            batch_size=1,
                            epochs=3,
                            reader_pool_type=reader_pool_type,
                            verbose=2)

                        transformer = est.fit_on_parquet()
                        predictions = transformer.transform(df)
                        assert predictions.count() == df.count()
Example #2
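A Torch estimator is trained directly on Parquet, toggling inmemory_cache_all; setLoss is also exercised with a non-list loss.
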
    def test_torch_direct_parquet_train(self):
        with spark_session('test_torch_direct_parquet_train') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                    loss = nn.BCELoss()

                    for inmemory_cache_all in [False, True]:
                        est = hvd_spark.TorchEstimator(
                            backend=backend,
                            store=store,
                            model=model,
                            optimizer=optimizer,
                            input_shapes=[[2]],
                            feature_cols=['features'],
                            label_cols=['y'],
                            batch_size=1,
                            epochs=3,
                            verbose=2,
                            inmemory_cache_all=inmemory_cache_all)

                        # Make sure that setLoss works with a non-list loss.
                        est.setLoss(loss)

                        transformer = est.fit_on_parquet()
                        predictions = transformer.transform(df)
                        assert predictions.count() == df.count()
Example #3
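A Torch estimator is trained directly on Parquet across all combinations of validation setting, in-memory caching, and reader pool type.
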
    def test_direct_parquet_train(self):
        with spark_session('test_direct_parquet_train') as spark:
            df = create_noisy_xor_data_with_val(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                # Make sure to cover val dataloader cases
                for validation in [None, 'val']:
                    with util.prepare_data(backend.num_processes(),
                                           store,
                                           df,
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           validation=validation):
                        model = create_xor_model()

                        for inmemory_cache_all in [False, True]:
                            for reader_pool_type in ['process', 'thread']:
                                est = hvd_spark.TorchEstimator(
                                    backend=backend,
                                    store=store,
                                    model=model,
                                    input_shapes=[[-1, 2]],
                                    feature_cols=['features'],
                                    label_cols=['y'],
                                    validation=validation,
                                    batch_size=1,
                                    epochs=3,
                                    verbose=2,
                                    inmemory_cache_all=inmemory_cache_all,
                                    reader_pool_type=reader_pool_type)

                                transformer = est.fit_on_parquet()
                                predictions = transformer.transform(df)
                                assert predictions.count() == df.count()
Example #4
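A Keras estimator is fit twice against the same run_id: the first fit creates a checkpoint in the store, and the second fit is expected to load the model from it.
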
    def test_restore_from_checkpoint(self, mock_fit_fn, mock_pin_gpu_fn):
        mock_fit_fn.return_value = get_mock_fit_fn()
        mock_pin_gpu_fn.return_value = mock.Mock()

        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        with spark_session('test_restore_from_checkpoint') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2,
                    run_id=run_id)

                keras_estimator._load_model_from_checkpoint = mock.Mock(
                    side_effect=keras_estimator._load_model_from_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                keras_estimator._load_model_from_checkpoint.assert_not_called()
                keras_model = keras_estimator.fit(df)

                trained_model = keras_model.getModel()
                pred = trained_model.predict([np.ones([1, 2], dtype=np.float64)])
                assert len(pred) == 1

                assert store.exists(ckpt_path)

                keras_estimator.fit(df)
                keras_estimator._load_model_from_checkpoint.assert_called()
Example #5
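A checkpoint-restore test for the legacy Torch model, currently skipped because of a checkpoint-callback bug in the lightning version in use.
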
    def test_legacy_restore_from_checkpoint(self):
        self.skipTest(
            'There is a bug in the current lightning version for checkpoint '
            'callback. Will add this test back when it is solved.')

        model = create_legacy_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = nn.BCELoss()

        with spark_session('test_legacy_restore_from_checkpoint') as spark:
            df = create_noisy_xor_data(spark)

            ctx = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    backend=ctx,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    run_id=run_id)

                torch_estimator._read_checkpoint = mock.Mock(
                    side_effect=torch_estimator._read_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                torch_estimator._read_checkpoint.assert_not_called()
                torch_estimator.fit(df)

                assert store.exists(ckpt_path)
                torch_estimator.fit(df)
                torch_estimator._read_checkpoint.assert_called()
Example #6
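A Keras estimator is saved to a temporary path, reloaded, and fit with the backend and store supplied through fit params; the prediction dtype is then checked.
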
    def test_model_serialization(self, mock_remote_trainer):
        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        def train(serialized_model, train_rows, val_rows, avg_row_size):
            return None, serialized_model, 2

        mock_remote_trainer.return_value = train

        with spark_session('test_model_serialization') as spark:
            df = create_xor_data(spark)

            keras_estimator = hvd.KerasEstimator(model=model,
                                                 optimizer=optimizer,
                                                 loss=loss,
                                                 feature_cols=['features'],
                                                 label_cols=['y'],
                                                 batch_size=1,
                                                 epochs=3,
                                                 verbose=2)

            backend = CallbackBackend()
            with local_store() as store:
                with temppath() as saved_path:
                    keras_estimator.save(saved_path)
                    keras_estimator_loaded = hvd.KerasEstimator.load(
                        saved_path)

                keras_model = keras_estimator_loaded.fit(
                    df,
                    params={
                        keras_estimator_loaded.backend: backend,
                        keras_estimator_loaded.store: store
                    })

                trained_model = keras_model.getModel()
                pred = trained_model.predict(
                    [np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32
Example #7
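A Torch estimator checkpoint-restore test: once a checkpoint exists in the store, a second fit must trigger _load_checkpoint.
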
    def test_restore_from_checkpoint(self):
        model = create_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = nn.BCELoss()

        with spark_session('test_restore_from_checkpoint') as spark:
            df = create_xor_data(spark)

            ctx = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    backend=ctx,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=1,
                    verbose=2,
                    run_id=run_id)

                torch_estimator._load_checkpoint = mock.Mock(
                    side_effect=torch_estimator._load_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                torch_estimator._load_checkpoint.assert_not_called()
                torch_estimator.fit(df)

                assert store.exists(ckpt_path)
                torch_estimator.fit(df)
                torch_estimator._load_checkpoint.assert_called()
Example #8
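Another checkpoint-restore test for the Torch estimator, currently skipped because of a deadlock bug in the checkpoint callback.
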
    def test_restore_from_checkpoint(self):
        self.skipTest('There is a deadlock bug for checkpoint callback. ' +
                      'Will add this test back when it is solved.')

        model = create_xor_model()

        with spark_session('test_restore_from_checkpoint') as spark:
            df = create_noisy_xor_data(spark)

            ctx = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    backend=ctx,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    run_id=run_id)

                torch_estimator._read_checkpoint = mock.Mock(
                    side_effect=torch_estimator._read_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                torch_estimator._read_checkpoint.assert_not_called()
                torch_estimator.fit(df)

                assert store.exists(ckpt_path)
                torch_estimator.fit(df)
                torch_estimator._read_checkpoint.assert_called()
Example #9
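Verifies checkpoint-callback wiring in the Keras estimator: when none is provided, a default ModelCheckpoint monitoring 'val_loss' must be created; when a BestModelCheckpoint is supplied, it must be passed through with its settings intact.
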
    def test_keras_model_checkpoint_callback(self, mock_fit_fn, mock_pin_gpu_fn):
        from horovod.tensorflow.keras.callbacks import BestModelCheckpoint

        def _get_mock_fit_fn(checkpoint_callback_provided):
            def fit(model, train_data, val_data, steps_per_epoch, validation_steps, callbacks,
                    verbose):
                returned_model_checkpoint_present = False
                model_checkpoint_present = False
                for callback in callbacks:
                    callback.set_model(model)
                    callback.on_epoch_end(0, logs={'binary_crossentropy': 0.3})

                    if checkpoint_callback_provided and isinstance(callback, BestModelCheckpoint):
                        self.assertIsNotNone(callback.filepath)
                        self.assertTrue(callback.save_best_only)
                        self.assertEqual(callback.monitor, 'binary_crossentropy')
                        returned_model_checkpoint_present = True

                    if not checkpoint_callback_provided and isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                        self.assertFalse(callback.save_best_only)
                        self.assertEqual(callback.monitor, 'val_loss')
                        model_checkpoint_present = True

                if checkpoint_callback_provided:
                    self.assertTrue(returned_model_checkpoint_present)
                    self.assertFalse(model_checkpoint_present)
                else:
                    self.assertFalse(returned_model_checkpoint_present)
                    self.assertTrue(model_checkpoint_present)

                return mock.Mock()

            return fit

        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_model_checkpoint_callbacks') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = tf.keras.optimizers.SGD(lr=0.1)
                    loss = 'binary_crossentropy'

                    # Test that when no checkpoint callback is provided, the correct default one is created
                    mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=False)
                    est = hvd.KerasEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        feature_cols=['features'],
                        label_cols=['y'],
                        batch_size=1,
                        epochs=3,
                        verbose=2)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()

                    # Test that a user-provided checkpoint callback is correctly passed through to the model
                    mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=True)
                    checkpoint_callback = BestModelCheckpoint(monitor='binary_crossentropy')
                    est = hvd.KerasEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        feature_cols=['features'],
                        label_cols=['y'],
                        batch_size=1,
                        epochs=3,
                        verbose=2,
                        checkpoint_callback=checkpoint_callback)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
Example #10
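A Torch estimator is trained on manually written train and validation Parquet paths whose data has no validation column; setValidation is given an arbitrary string to enable the validation dataloader.
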
    def test_direct_parquet_train_with_no_val_column(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        with spark_session(
                'test_direct_parquet_train_with_no_val_column') as spark:
            df_train = create_noisy_xor_data(spark)
            df_val = create_noisy_xor_data(spark)

            def to_petastorm(df):
                metadata = None
                if util._has_vector_column(df):
                    mapper = util.to_petastorm_fn(["features", "y"], metadata)
                    df = df.rdd.map(mapper).toDF()
                return df

            df_train = to_petastorm(df_train)
            df_val = to_petastorm(df_val)

            df_train.show(1)
            print(df_train.count())
            df_val.show(1)
            print(df_val.count())

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                print(store.get_train_data_path())
                print(store.get_val_data_path())

                df_train \
                    .coalesce(4) \
                    .write \
                    .mode('overwrite') \
                    .parquet(store.get_train_data_path())

                df_val \
                    .coalesce(4) \
                    .write \
                    .mode('overwrite') \
                    .parquet(store.get_val_data_path())

                model = create_xor_model()

                inmemory_cache_all = True
                reader_pool_type = 'process'
                est = hvd_spark.TorchEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=64,
                    epochs=2,
                    verbose=2,
                    inmemory_cache_all=inmemory_cache_all,
                    reader_pool_type=reader_pool_type)

                # Setting validation to any random string would work.
                est.setValidation("True")

                transformer = est.fit_on_parquet()

                predictions = transformer.transform(df_train)
                assert predictions.count() == df_train.count()