Example #1
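This example exercises util._get_col_info from horovod.spark.common: a vector column whose rows hold dense vectors of different sizes (2 and 1) should raise a ValueError.

The snippets in this section are methods excerpted from a unittest.TestCase subclass and omit their module preamble. A plausible shared preamble is sketched below: the PySpark and horovod.spark.common imports follow those libraries' public layouts, while the spark_common module path for the test helpers (spark_session, create_test_data_from_schema, local_store) is an assumption.

import pytest
from unittest import mock

from pyspark.ml.linalg import DenseVector, SparseVector, VectorUDT
from pyspark.sql.types import (ArrayType, BooleanType, DoubleType, FloatType,
                               IntegerType, NullType, StructField, StructType)

from horovod.spark.common import constants, util

# Test helpers; this module path is an assumption -- in Horovod's repository
# these utilities live alongside the Spark tests.
from spark_common import create_test_data_from_schema, local_store, spark_session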
    def test_get_col_info_error_bad_size(self):
        with spark_session('test_get_col_info_error_bad_size') as spark:
            data_bad_size = [[DenseVector([1.0, 1.0])], [DenseVector([1.0])]]
            schema = StructType([StructField('data', VectorUDT())])
            df = create_test_data_from_schema(spark, data_bad_size, schema)

            with pytest.raises(ValueError):
                util._get_col_info(df)
Example #2
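The sparse counterpart of Example #1: two SparseVectors with different declared sizes (2 and 1) in the same column should likewise make util._get_col_info raise a ValueError.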
    def test_get_col_info_error_bad_shape(self):
        with spark_session('test_get_col_info_error_bad_shape') as spark:
            data_bad_shape = [[SparseVector(2, {0: 1.0})],
                              [SparseVector(1, {0: 1.0})]]
            schema = StructType([StructField('data', VectorUDT())])
            df = create_test_data_from_schema(spark, data_bad_shape, schema)

            with pytest.raises(ValueError):
                util._get_col_info(df)
Example #3
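This example checks util._get_metadata end to end: from float, dense-vector, sparse-vector, and mixed columns it should infer each column's Spark data type, intermediate storage format, maximum element count, and shape. The sparse-only column gets constants.CUSTOM_SPARSE with a max_size that counts non-zero entries (1) rather than the declared vector size (2), while the mixed column falls back to the dense constants.ARRAY format.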
    def test_get_metadata(self):
        expected_metadata = {
            'float': {
                'spark_data_type': FloatType,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.NOCHANGE,
                'max_size': 1,
                'shape': 1
            },
            'dense': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
            'sparse': {
                'spark_data_type': SparseVector,
                'is_sparse_vector_only': True,
                'intermediate_format': constants.CUSTOM_SPARSE,
                'max_size': 1,
                'shape': 2
            },
            'mixed': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
        }

        with spark_session('test_get_metadata') as spark:
            data = [
                [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {0: 1.0}),
                 DenseVector([1.0, 1.0])],
                [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
                 SparseVector(2, {1: 1.0})],
            ]
            schema = StructType([
                StructField('float', FloatType()),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])
            df = create_test_data_from_schema(spark, data, schema)

            metadata = util._get_metadata(df)
            self.assertDictEqual(metadata, expected_metadata)
Example #4
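Here util._train_val_split is driven by a float ratio. Only the returned validation ratio is asserted; as the in-code comment notes, random splitting cannot be relied on to yield exactly 4 training and 1 validation rows.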
    def test_train_val_split_ratio(self):
        with spark_session('test_train_val_split_ratio') as spark:
            data = [[1.0], [1.0], [1.0], [1.0], [1.0]]
            schema = StructType([StructField('data', FloatType())])
            df = create_test_data_from_schema(spark, data, schema)

            validation = 0.2
            train_df, val_df, validation_ratio = util._train_val_split(
                df, validation)

            # Only check the validation ratio; random splitting cannot be
            # relied on to produce exactly 4 training and 1 validation samples.
            assert validation_ratio == validation
Example #5
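This example validates util.check_shape_compatibility against metadata inferred from a DataFrame: shapes whose fixed dimensions multiply out to the column sizes pass (a -1 dimension is treated as a wildcard), while mismatched shapes raise ValueError.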
    def test_check_shape_compatibility(self):
        feature_columns = ['x1', 'x2', 'features']
        label_columns = ['y1', 'y_embedding']

        schema = StructType([
            StructField('x1', DoubleType()),
            StructField('x2', IntegerType()),
            StructField('features', VectorUDT()),
            StructField('y1', FloatType()),
            StructField('y_embedding', VectorUDT())
        ])
        data = [[
            1.0, 1,
            DenseVector([1.0] * 12), 1.0,
            DenseVector([1.0] * 12)
        ]] * 10

        with spark_session('test_check_shape_compatibility') as spark:
            df = create_test_data_from_schema(spark, data, schema)
            metadata = util._get_metadata(df)

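            # Compatible shapes: fixed dimensions multiply out to the column
            # sizes, and a -1 dimension is treated as a wildcard.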
            input_shapes = [[1], [1], [-1, 3, 4]]
            output_shapes = [[1], [-1, 3, 4]]
            util.check_shape_compatibility(metadata, feature_columns,
                                           label_columns, input_shapes,
                                           output_shapes)

            input_shapes = [[1], [1], [3, 2, 2]]
            output_shapes = [[1, 1], [-1, 2, 3, 2]]
            util.check_shape_compatibility(metadata, feature_columns,
                                           label_columns, input_shapes,
                                           output_shapes)

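            # Incompatible: 3 * 5 = 15 does not match the 12-element 'features' column.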
            bad_input_shapes = [[1], [1], [-1, 3, 5]]
            with pytest.raises(ValueError):
                util.check_shape_compatibility(metadata, feature_columns,
                                               label_columns, bad_input_shapes,
                                               output_shapes)

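            # Incompatible: shape [2] does not match the scalar 'x1' column (size 1).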
            bad_input_shapes = [[2], [1], [-1, 3, 4]]
            with pytest.raises(ValueError):
                util.check_shape_compatibility(metadata, feature_columns,
                                               label_columns, bad_input_shapes,
                                               output_shapes)

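            # Incompatible: output shape [7] does not match the scalar 'y1' label.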
            bad_output_shapes = [[7], [-1, 3, 4]]
            with pytest.raises(ValueError):
                util.check_shape_compatibility(metadata, feature_columns,
                                               label_columns, input_shapes,
                                               bad_output_shapes)
Example #6
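In contrast to Example #4, the validation argument here names a boolean column: rows where 'val' is True form the validation split, so exact counts (4 training, 1 validation) can be asserted.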
    def test_train_val_split_col_boolean(self):
        with spark_session('test_train_val_split_col_boolean') as spark:
            data = [
                [1.0, False], [1.0, False], [1.0, False], [1.0, False], [1.0, True]
            ]
            schema = StructType([StructField('data', FloatType()), StructField('val', BooleanType())])
            df = create_test_data_from_schema(spark, data, schema)

            validation = 'val'
            train_df, val_df, validation_ratio = util._train_val_split(df, validation)

            # Only check counts; the validation ratio cannot be guaranteed
            # because the split is computed approximately.
            assert train_df.count() == 4
            assert val_df.count() == 1
Example #7
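The happy path for util._get_col_info: for each column it reports the set of observed Python value types, the column shape, and the maximum element count. The float column's type set includes NullType because one of its rows is None, and the sparse column's max size (1) counts non-zero entries rather than the declared vector size (2).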
    def test_get_col_info(self):
        with spark_session('test_get_col_info') as spark:
            data = [[
                0,
                0.0,
                None,
                [1, 1],
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                DenseVector([1.0, 1.0])
            ], [
                1,
                None,
                None,
                [1, 1],
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                SparseVector(2, {1: 1.0})
            ]]

            schema = StructType([
                StructField('int', IntegerType()),
                StructField('float', FloatType()),
                StructField('null', NullType()),
                StructField('array', ArrayType(IntegerType())),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])

            df = create_test_data_from_schema(spark, data, schema)
            all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

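            # Each entry: (column name, expected value types, shape, max size).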
            expected = [
                ('int', {int}, 1, 1),
                ('float', {float, NullType}, 1, 1),
                ('null', {NullType}, 1, 1),
                ('array', {list}, 2, 2),
                ('dense', {DenseVector}, 2, 2),
                ('sparse', {SparseVector}, 2, 1),
                ('mixed', {DenseVector, SparseVector}, 2, 2)
            ]

            for expected_col_info in expected:
                col_name, col_types, col_shape, col_size = expected_col_info
                assert all_col_types[col_name] == col_types, col_name
                assert col_shapes[col_name] == col_shape, col_name
                assert col_max_sizes[col_name] == col_size, col_name
Example #8
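An end-to-end util.prepare_data run with compress_sparse=True against a local store. clear_training_cache() guarantees a clean cache so that dataset_idx == 0 holds, and _get_metadata is patched with a pass-through side_effect so the test can both assert that it was called and compare the cached dataset's metadata against the same expectations as in Example #3.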
    def test_prepare_data_compress_sparse(self):
        util.clear_training_cache()

        expected_metadata = {
            'float': {
                'spark_data_type': FloatType,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.NOCHANGE,
                'max_size': 1,
                'shape': 1
            },
            'dense': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
            'sparse': {
                'spark_data_type': SparseVector,
                'is_sparse_vector_only': True,
                'intermediate_format': constants.CUSTOM_SPARSE,
                'max_size': 1,
                'shape': 2
            },
            'mixed': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
        }

        with mock.patch('horovod.spark.common.util._get_metadata',
                        side_effect=util._get_metadata) as mock_get_metadata:
            with spark_session('test_prepare_data') as spark:
                data = [
                    [0.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
                     DenseVector([1.0, 1.0])],
                    [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
                     SparseVector(2, {1: 1.0})],
                ]

                schema = StructType([
                    StructField('float', FloatType()),
                    StructField('dense', VectorUDT()),
                    StructField('sparse', VectorUDT()),
                    StructField('mixed', VectorUDT())
                ])

                df = create_test_data_from_schema(spark, data, schema)

                with local_store() as store:
                    with util.prepare_data(
                            num_processes=2,
                            store=store,
                            df=df,
                            feature_columns=['dense', 'sparse', 'mixed'],
                            label_columns=['float'],
                            compress_sparse=True) as dataset_idx:
                        mock_get_metadata.assert_called()
                        assert dataset_idx == 0

                        train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(
                            dataset_idx)
                        self.assertDictEqual(metadata, expected_metadata)