def test_get_col_info_error_bad_size(self):
    with spark_session('test_get_col_info_error_bad_size') as spark:
        data_bad_size = [[DenseVector([1.0, 1.0])], [DenseVector([1.0])]]
        schema = StructType([StructField('data', VectorUDT())])
        df = create_test_data_from_schema(spark, data_bad_size, schema)

        # Dense vectors of different sizes in the same column are invalid.
        with pytest.raises(ValueError):
            util._get_col_info(df)
def test_get_col_info_error_bad_shape(self):
    with spark_session('test_get_col_info_error_bad_shape') as spark:
        data_bad_shape = [[SparseVector(2, {0: 1.0})], [SparseVector(1, {0: 1.0})]]
        schema = StructType([StructField('data', VectorUDT())])
        df = create_test_data_from_schema(spark, data_bad_shape, schema)

        # Sparse vectors with different declared shapes in the same column are invalid.
        with pytest.raises(ValueError):
            util._get_col_info(df)
def test_get_metadata(self):
    expected_metadata = \
        {
            'float': {
                'spark_data_type': FloatType,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.NOCHANGE,
                'max_size': 1,
                'shape': 1
            },
            'dense': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
            'sparse': {
                'spark_data_type': SparseVector,
                'is_sparse_vector_only': True,
                'intermediate_format': constants.CUSTOM_SPARSE,
                'max_size': 1,
                'shape': 2
            },
            # A column mixing dense and sparse vectors is treated as dense.
            'mixed': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
        }

    with spark_session('test_get_metadata') as spark:
        data = [
            [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {0: 1.0}),
             DenseVector([1.0, 1.0])],
            [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
             SparseVector(2, {1: 1.0})]
        ]
        schema = StructType([
            StructField('float', FloatType()),
            StructField('dense', VectorUDT()),
            StructField('sparse', VectorUDT()),
            StructField('mixed', VectorUDT())
        ])
        df = create_test_data_from_schema(spark, data, schema)

        metadata = util._get_metadata(df)
        self.assertDictEqual(metadata, expected_metadata)
def test_train_val_split_ratio(self):
    with spark_session('test_train_val_split_ratio') as spark:
        data = [[1.0], [1.0], [1.0], [1.0], [1.0]]
        schema = StructType([StructField('data', FloatType())])
        df = create_test_data_from_schema(spark, data, schema)

        validation = 0.2
        train_df, val_df, validation_ratio = util._train_val_split(df, validation)

        # Only check validation ratio, as we can't rely on random splitting to produce
        # an exact result of 4 training and 1 validation samples.
        assert validation_ratio == validation
def test_check_shape_compatibility(self):
    feature_columns = ['x1', 'x2', 'features']
    label_columns = ['y1', 'y_embedding']

    schema = StructType([
        StructField('x1', DoubleType()),
        StructField('x2', IntegerType()),
        StructField('features', VectorUDT()),
        StructField('y1', FloatType()),
        StructField('y_embedding', VectorUDT())
    ])
    data = [[1.0, 1, DenseVector([1.0] * 12), 1.0, DenseVector([1.0] * 12)]] * 10

    with spark_session('test_check_shape_compatibility') as spark:
        df = create_test_data_from_schema(spark, data, schema)
        metadata = util._get_metadata(df)

        # Shapes whose elements multiply out to the column sizes are compatible.
        input_shapes = [[1], [1], [-1, 3, 4]]
        output_shapes = [[1], [-1, 3, 4]]
        util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                       input_shapes, output_shapes)

        input_shapes = [[1], [1], [3, 2, 2]]
        output_shapes = [[1, 1], [-1, 2, 3, 2]]
        util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                       input_shapes, output_shapes)

        # Shapes that cannot match the column sizes raise a ValueError.
        bad_input_shapes = [[1], [1], [-1, 3, 5]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           bad_input_shapes, output_shapes)

        bad_input_shapes = [[2], [1], [-1, 3, 4]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           bad_input_shapes, output_shapes)

        bad_output_shapes = [[7], [-1, 3, 4]]
        with pytest.raises(ValueError):
            util.check_shape_compatibility(metadata, feature_columns, label_columns,
                                           input_shapes, bad_output_shapes)
def test_train_val_split_col_boolean(self):
    with spark_session('test_train_val_split_col_boolean') as spark:
        data = [
            [1.0, False],
            [1.0, False],
            [1.0, False],
            [1.0, False],
            [1.0, True]
        ]
        schema = StructType([
            StructField('data', FloatType()),
            StructField('val', BooleanType())
        ])
        df = create_test_data_from_schema(spark, data, schema)

        validation = 'val'
        train_df, val_df, validation_ratio = util._train_val_split(df, validation)

        # Only check the row counts; the exact validation ratio cannot be guaranteed
        # because it is computed approximately.
        assert train_df.count() == 4
        assert val_df.count() == 1
def test_get_col_info(self):
    with spark_session('test_get_col_info') as spark:
        data = [
            [0, 0.0, None, [1, 1], DenseVector([1.0, 1.0]),
             SparseVector(2, {1: 1.0}), DenseVector([1.0, 1.0])],
            [1, None, None, [1, 1], DenseVector([1.0, 1.0]),
             SparseVector(2, {1: 1.0}), SparseVector(2, {1: 1.0})]
        ]
        schema = StructType([
            StructField('int', IntegerType()),
            StructField('float', FloatType()),
            StructField('null', NullType()),
            StructField('array', ArrayType(IntegerType())),
            StructField('dense', VectorUDT()),
            StructField('sparse', VectorUDT()),
            StructField('mixed', VectorUDT())
        ])
        df = create_test_data_from_schema(spark, data, schema)

        all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

        # Each tuple is (column name, observed value types, shape, max size).
        expected = [
            ('int', {int}, 1, 1),
            ('float', {float, NullType}, 1, 1),
            ('null', {NullType}, 1, 1),
            ('array', {list}, 2, 2),
            ('dense', {DenseVector}, 2, 2),
            ('sparse', {SparseVector}, 2, 1),
            ('mixed', {DenseVector, SparseVector}, 2, 2)
        ]
        for expected_col_info in expected:
            col_name, col_types, col_shape, col_size = expected_col_info
            assert all_col_types[col_name] == col_types, col_name
            assert col_shapes[col_name] == col_shape, col_name
            assert col_max_sizes[col_name] == col_size, col_name
def test_prepare_data_compress_sparse(self):
    util.clear_training_cache()

    # With compress_sparse=True, a column containing only sparse vectors keeps the
    # CUSTOM_SPARSE intermediate format, while a mixed column falls back to a dense array.
    expected_metadata = \
        {
            'float': {
                'spark_data_type': FloatType,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.NOCHANGE,
                'max_size': 1,
                'shape': 1
            },
            'dense': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
            'sparse': {
                'spark_data_type': SparseVector,
                'is_sparse_vector_only': True,
                'intermediate_format': constants.CUSTOM_SPARSE,
                'max_size': 1,
                'shape': 2
            },
            'mixed': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
        }

    with mock.patch('horovod.spark.common.util._get_metadata',
                    side_effect=util._get_metadata) as mock_get_metadata:
        with spark_session('test_prepare_data') as spark:
            data = [
                [0.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
                 DenseVector([1.0, 1.0])],
                [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
                 SparseVector(2, {1: 1.0})]
            ]
            schema = StructType([
                StructField('float', FloatType()),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])
            df = create_test_data_from_schema(spark, data, schema)

            with local_store() as store:
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['dense', 'sparse', 'mixed'],
                                       label_columns=['float'],
                                       compress_sparse=True) as dataset_idx:
                    mock_get_metadata.assert_called()
                    assert dataset_idx == 0

                    train_rows, val_rows, metadata, avg_row_size = \
                        util.get_dataset_properties(dataset_idx)
                    self.assertDictEqual(metadata, expected_metadata)