def get_petastorm_schema(name, column_list):
    petastorm_column_list = []
    for _column in column_list:
        petastorm_column = SchemaUtils.get_petastorm_column(_column)
        petastorm_column_list.append(petastorm_column)

    petastorm_schema = Unischema(name, petastorm_column_list)
    return petastorm_schema
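# A minimal usage sketch (not from the original source): SchemaUtils.get_petastorm_column and
# the column objects it consumes are specific to the surrounding codebase, so this builds the
# equivalent Unischema by hand for a single hypothetical int32 column named 'id' and converts
# it to a Spark schema, which is what downstream Spark code typically needs.
def _example_build_unischema_by_hand():
    import numpy as np
    from petastorm.codecs import ScalarCodec
    from petastorm.unischema import Unischema, UnischemaField
    from pyspark.sql.types import IntegerType

    # Equivalent of what get_petastorm_schema would return for one scalar integer column.
    schema = Unischema('MySchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    ])
    return schema.as_spark_schema()  # StructType with a single non-nullable 'id' field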
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)
def test_create_schema_view_using_regex_and_unischema_fields_with_duplicates():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('other_string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    view = TestSchema.create_schema_view(['int.*$', TestSchema.int_field])

    assert set(view.fields.keys()) == {'int_field'}
def test_create_schema_view_using_invalid_type():
    """Exercises the ValueError code path of Unischema.create_schema_view (and Unischema.__str__)."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    with pytest.raises(ValueError, match='must be either a string'):
        TestSchema.create_schema_view([42])
def test_field_name_conflict_with_unischema_attribute(self):
    # 'fields' is an existing attribute of Unischema
    with pytest.warns(UserWarning, match='Can not create dynamic property'):
        Unischema('TestSchema', [UnischemaField('fields', np.int32, (), ScalarCodec(StringType()), True)])
def test_as_spark_schema_unspecified_codec_type_for_non_scalars_raises():
    """Choosing a Spark type automatically is not currently supported for non-scalar types."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_vector_unspecified_codec', np.int8, (1,)),
    ])

    with pytest.raises(ValueError, match='has codec set to None'):
        TestSchema.as_spark_schema()
def test_decode_numpy_scalar_with_unknown_dtype():
    """If numpy_dtype is None, then the value is not decoded, just passed through."""
    MatrixSchema = Unischema('TestSchema', [UnischemaField('scalar', None, ())])
    row = {'scalar': [4, 2]}
    decoded_value = decode_row(row, MatrixSchema)['scalar']

    assert decoded_value == [4, 2]
def test_create_schema_view_fails_validate():
    """Exercises the ValueError code path of Unischema.create_schema_view (and Unischema.__str__)."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    with pytest.raises(ValueError, match='does not belong to the schema'):
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
def main(source, target, test_size, under_sampling):
    source_data_dir_path = Path(source)
    target_data_dir_path = Path(target)

    # prepare output directories for the two datasets
    application_data_dir_path = target_data_dir_path / 'application_classification'
    traffic_data_dir_path = target_data_dir_path / 'traffic_classification'

    # initialise local spark
    os.environ['PYSPARK_PYTHON'] = sys.executable
    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
    memory_gb = psutil.virtual_memory().available // 1024 // 1024 // 1024
    spark = (SparkSession.builder
             .master('local[*]')
             .config('spark.driver.memory', f'{memory_gb}g')
             .config('spark.driver.host', '127.0.0.1')
             .getOrCreate())

    # prepare final schema
    schema = Unischema('data_schema', [
        UnischemaField('feature', np.float32, (1, 1500), CompressedNdarrayCodec(), False),
        UnischemaField('flow_feature', np.float32, (1, 76), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.int32, (), ScalarCodec(LongType()), False),
    ])

    # read data
    df = spark.read.parquet(f'{source_data_dir_path.absolute().as_uri()}/*.parquet')

    # prepare data for application classification and traffic classification
    print('processing application classification dataset')
    create_train_test_for_task(df=df, label_col='app_label', spark=spark, schema=schema,
                               test_size=test_size, under_sampling=under_sampling,
                               data_dir_path=application_data_dir_path)

    print('processing traffic classification dataset')
    create_train_test_for_task(df=df, label_col='traffic_label', spark=spark, schema=schema,
                               test_size=test_size, under_sampling=under_sampling,
                               data_dir_path=traffic_data_dir_path)

    # stats
    print_df_label_distribution(spark, schema, application_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema, application_data_dir_path / 'test.parquet')
    print_df_label_distribution(spark, schema, traffic_data_dir_path / 'train.parquet')
    print_df_label_distribution(spark, schema, traffic_data_dir_path / 'test.parquet')
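# A hedged sketch, not the project's actual implementation: one plausible shape for how a
# helper like create_train_test_for_task could persist a split under this Unischema.
# petastorm's materialize_dataset context manager writes the extra metadata that petastorm
# readers need alongside the parquet files, and dict_to_spark_row encodes each row dict
# according to the schema's codecs. The helper name and arguments here are illustrative.
def _example_write_petastorm_parquet(spark, schema, rows_rdd, output_url, rowgroup_size_mb=256):
    from petastorm.etl.dataset_metadata import materialize_dataset
    from petastorm.unischema import dict_to_spark_row

    with materialize_dataset(spark, output_url, schema, rowgroup_size_mb):
        spark.createDataFrame(rows_rdd.map(lambda r: dict_to_spark_row(schema, r)),
                              schema.as_spark_schema()) \
             .write.mode('overwrite').parquet(output_url)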
def test_create_schema_view_using_regex(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    view = TestSchema.create_schema_view(['int.*$'])

    self.assertEqual(set(view.fields.keys()), {'int_field'})
def test_partial_application(self):
    unischema = Unischema('foo', [])
    func = partial(dict_to_spark_row, unischema)
    func({})

    # Must pass as positional arg in the right order
    func = partial(dict_to_spark_row, {})
    with self.assertRaises(AssertionError):
        func(Unischema)
def test_as_spark_schema_unspecified_codec_type_unknown_scalar_type_raises():
    """We have a limited list of scalar types we can automatically map from numpy (+Decimal) types to spark
    types. Make sure that a ValueError is raised if an unknown type is used."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_vector_unspecified_codec', object, ()),
    ])

    with pytest.raises(ValueError, match='Was not able to map type'):
        TestSchema.as_spark_schema()
def test_create_schema_view_fails_validate(self):
    """Exercises the ValueError code path of Unischema.create_schema_view (and Unischema.__str__)."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    with self.assertRaises(ValueError) as ex:
        TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
    self.assertIn('does not belong to the schema', str(ex.exception))
def test_create_schema_view_using_invalid_type(self):
    """Exercises the ValueError code path of Unischema.create_schema_view (and Unischema.__str__)."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    with self.assertRaises(ValueError) as ex:
        TestSchema.create_schema_view([42])
    self.assertIn('must be either a string', str(ex.exception))
def test_create_schema_view_no_field_matches_regex():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    view = TestSchema.create_schema_view(['bogus'])

    assert not view.fields
def test_create_schema_view_using_unischema_fields():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    view = TestSchema.create_schema_view([TestSchema.int_field])

    assert set(view.fields.keys()) == {'int_field'}
def test_decode_numpy_scalar_when_codec_is_none():
    """Decoding a row that has a field with the codec set to None. The type should be deduced automatically
    from UnischemaField's numpy_dtype attribute."""
    MatrixSchema = Unischema('TestSchema', [UnischemaField('scalar', np.float64, ())])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']

    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
def test_fields():
    """Try using 'fields' getter"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert len(TestSchema.fields) == 2
    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
def test_fields(self):
    """Try using 'fields' getter"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    self.assertEqual(len(TestSchema.fields), 2)
    self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
    self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
def test_match_unischema_fields():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    assert match_unischema_fields(TestSchema, ['.*nt.*6']) == [TestSchema.uint16]
    assert match_unischema_fields(TestSchema, ['nomatch']) == []
    assert set(match_unischema_fields(TestSchema, ['.*'])) == set(TestSchema.fields.values())
    assert set(match_unischema_fields(TestSchema, ['int32', 'uint8'])) == {TestSchema.int32, TestSchema.uint8}
def main(train: str, test: str, target_train: str, target_test: str):
    # initialise logger
    logger = logging.getLogger(__file__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel('INFO')

    logger.info('Initialising local spark')
    spark = init_local_spark()

    logger.info('Preparing schema')
    # petastorm schema (np.str_ replaces the deprecated np.str alias, which was removed in NumPy 1.24)
    schema = Unischema('data_schema', [
        UnischemaField('time_window', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('src_ip', np.str_, (), ScalarCodec(StringType()), False),
        UnischemaField('feature', np.float32, (1, 69), CompressedNdarrayCodec(), False),
        UnischemaField('label', np.str_, (), ScalarCodec(StringType()), True),
    ])

    # processing train
    logger.info('Processing train parquet files')
    logger.info('Read parquet')
    train_feature_df = spark.read.parquet(train)

    logger.info('Composing features...')
    train_input = FeatureComposer(spark, train_feature_df).transform(remove_malicious=True, remove_null_label=True)

    logger.info('Changing schema...')
    train_input = change_df_schema(spark, schema, train_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, train_input, target_train, schema)

    logger.info('Train input done')

    # processing test
    logger.info('Processing test parquet files')
    logger.info('Read parquet')
    test_feature_df = spark.read.parquet(test)

    logger.info('Composing features...')
    test_input = FeatureComposer(spark, test_feature_df).transform(remove_malicious=False, remove_null_label=True)

    logger.info('Changing schema...')
    test_input = change_df_schema(spark, schema, test_input)

    logger.info('Persisting...')
    save_parquet_for_petastorm_parquet(spark, test_input, target_test, schema)

    logger.info('Test input done')
def test_filter_schema_fields_from_url(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    assert match_unischema_fields(TestSchema, ['.*nt.*6']) == [TestSchema.uint16]
    assert match_unischema_fields(TestSchema, ['nomatch']) == []
    assert match_unischema_fields(TestSchema, ['.*']) == list(TestSchema.fields.values())
    assert match_unischema_fields(TestSchema, ['int32', 'uint8']) == [TestSchema.int32, TestSchema.uint8]
def test_decode_numpy_scalar_with_explicit_scalar_codec():
    """Decoding a row that has a field with the codec set explicitly"""
    MatrixSchema = Unischema('TestSchema', [
        UnischemaField('scalar', np.float64, (), ScalarCodec(DoubleType()), False),
    ])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']

    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
def test_dict_to_spark_row_order():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('float_col', np.float64, ()),
        UnischemaField('int_col', np.int64, ()),
    ])
    row_dict = {
        TestSchema.int_col.name: 3,
        TestSchema.float_col.name: 2.0,
    }
    spark_row = dict_to_spark_row(TestSchema, row_dict)
    schema_field_names = list(TestSchema.fields)

    assert spark_row[0] == row_dict[schema_field_names[0]]
    assert spark_row[1] == row_dict[schema_field_names[1]]
def test_as_spark_schema(self):
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    spark_schema = TestSchema.as_spark_schema()
    self.assertEqual(spark_schema.fields[0].name, 'int_field')
    self.assertEqual(spark_schema.fields[1].name, 'string_field')

    self.assertEqual(TestSchema.fields['int_field'].name, 'int_field')
    self.assertEqual(TestSchema.fields['string_field'].name, 'string_field')
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We would expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False),
    ])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
               shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e.value)
def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32, (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8, (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32, (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal, (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc', int32_scalar=10, uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)), decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None, int32_scalar=10, uint8_scalar=20,
                               int32_matrix=None, decimal_scalar=Decimal(123) / Decimal(10))
def test_schema_to_dtype_list():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
        UnischemaField('Decimal', Decimal, (), None, False),
    ])

    actual_tf_dtype_list = _schema_to_tf_dtypes(TestSchema)
    # Note that the order of the fields is defined by alphabetical order of keys and always sorted by Unischema
    # to avoid ambiguity:
    #   [Decimal, int32, uint16, uint8]            <- alphabetical order
    #   [tf.string, tf.int32, tf.int32, tf.uint8]
    np.testing.assert_equal(actual_tf_dtype_list, [tf.string, tf.int32, tf.int32, tf.uint8])
def test_dict_to_spark_row_field_validation_ndarrays():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row)

    # Null value into not nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'tensor3d': None}), Row)

    # Wrong dimensions
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((1, 2, 3), dtype=np.float32)}), Row)
def test_dict_to_spark_row_field_validation_scalar_types():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': 'abc'}), Row)

    # Not a nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong field type
    with pytest.raises(TypeError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': []}), Row)