def test_build_lambda_numbers(self):
    st = SyntaxTree()
    st.operation = "_"
    st.children = [13]  # as if it parsed
    parsed_transformations = [FieldTransformation("a", st)]
    operations = TransformationOperations(self.config)
    transformations_validator = TransformationsValidator(
        operations, self.data_structure)
    _ = transformations_validator.validate(parsed_transformations)
    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")
    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [(13, ), (13, ), (13, ), (13, ), (13, )],
                         "List of tuples should be equal")
    spark.stop()
def test_build_lambda_with_nested_literals(self):
    st = SyntaxTree()
    st.operation = "concat"  # should cast int to str and concat
    st.children = ["'6'", "packet_size"]  # packet_size [74, 68]
    st2 = SyntaxTree()
    st2.operation = "concat"
    st2.children = [2E+2, st]
    parsed_transformations = [FieldTransformation("nested", st2)]
    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")
    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [('200.0674', ), ('200.0668', ), ('200.061510', ),
                          ('200.06185', ), ('200.06185', )],
                         "List of tuples should be equal")
    spark.stop()
def test_build_lambda_add_scientific(self):
    st = SyntaxTree()
    st.operation = "add"
    st.children = [1.2E+5, 1.0]
    parsed_transformations = [FieldTransformation("sum", st)]
    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")
    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [(120001.0, ), (120001.0, ), (120001.0, ),
                          (120001.0, ), (120001.0, )],
                         "List of tuples should be equal")
    spark.stop()
def test_build_lambda_truncate(self):
    st = SyntaxTree()
    st.operation = "truncate"
    st.children = ["'test'", 2]
    parsed_transformations = [
        FieldTransformation("cut_upto_2_symbols", st)
    ]
    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")
    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [('te', ), ('te', ), ('te', ), ('te', ), ('te', )],
                         "List of tuples should be equal")
    spark.stop()
def test_validate_raise_field_not_exists_when_rename_field(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    with self.assertRaises(errors.FieldNotExists):
        validator.validate(
            [FieldTransformation("size", "not_exists_field"), "dst_ip"])
def test_validate_with_correct_two_level_subtree(self):
    validator = TransformationsValidator(
        TransformationOperations({
            "country": "./GeoLite2-Country.mmdb",
            "city": "./GeoLite2-City.mmdb",
            "asn": "./GeoLite2-ASN.mmdb"
        }), self.data_structure_pyspark)
    syntaxtree = SyntaxTree()
    syntaxtree.operation = "sum"
    syntaxtree.children = ["sampling_rate", "packet_size"]
    main_syntax_tree = SyntaxTree()
    main_syntax_tree.operation = "mult"
    main_syntax_tree.children = [syntaxtree, "sampling_rate"]
    fields = validator.validate(
        [FieldTransformation("result", main_syntax_tree), "dst_ip"])
    self.assertEqual(
        fields,
        types.StructType([
            types.StructField('result', types.LongType()),
            types.StructField('dst_ip', types.StringType())
        ]))
def test_build_lambda_with_literals(self):
    st = SyntaxTree()
    st.operation = "concat"
    st.children = ["'6 - '", "packet_size"]  # packet_size [74, 68]
    parsed_transformations = [FieldTransformation("ephemer", st)]
    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")
    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [("6 - 74", ), ("6 - 68", ), ("6 - 1510", ),
                          ("6 - 185", ), ("6 - 185", )],
                         "List of tuples should be equal")
    spark.stop()
def test_build_lambda_with_nested_operations(self):
    mult_syntax_tree = SyntaxTree()
    mult_syntax_tree.operation = "mult"
    mult_syntax_tree.children = ["packet_size", "sampling_rate"]
    root_mult_st = SyntaxTree()
    root_mult_st.operation = "mult"
    root_mult_st.children = [mult_syntax_tree, "10"]
    parsed_transformations = [
        "src_ip",
        FieldTransformation("destination_ip", "dst_ip"),
        FieldTransformation("traffic", root_mult_st)
    ]
    creator = TransformationCreator(
        self.data_structure, parsed_transformations,
        TransformationOperations({
            "country": "./GeoLite2-Country.mmdb",
            "city": "./GeoLite2-City.mmdb",
            "asn": "./GeoLite2-ASN.mmdb"
        }))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")
    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [("217.69.143.60", "91.221.61.183", 378880),
                          ("91.221.61.168", "90.188.114.141", 348160),
                          ("91.226.13.80", "5.136.78.36", 7731200),
                          ("192.168.30.2", "192.168.30.1", 947200),
                          ("192.168.30.2", "192.168.30.1", 947200)],
                         "List of tuples should be equal")
    spark.stop()
def test_validate_raise_operation_not_supported_error(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    syntaxtree = SyntaxTree()
    syntaxtree.operation = "not_exists_operation"
    with self.assertRaises(errors.OperationNotSupportedError):
        validator.validate(
            [FieldTransformation("size", syntaxtree), "dst_ip"])
def test_validate_raise_incorrect_argument_type_for_operation_error(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    syntaxtree = SyntaxTree()
    syntaxtree.operation = "mul"
    syntaxtree.children = ["src_ip", "packet_size"]
    with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
        validator.validate(
            [FieldTransformation("traffic", syntaxtree), "dst_ip"])
def test_validate_correct_arguments_amount_for_operation_add(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    syntaxtree = SyntaxTree()
    syntaxtree.operation = "add"
    syntaxtree.children = [1, 2]
    fields = validator.validate([FieldTransformation("add", syntaxtree)])
    self.assertEqual(
        fields,
        types.StructType([types.StructField("add", types.LongType())]))
def test_validate_raise_error_for_function_with_different_arguments_type(
        self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    main_syntax_tree = SyntaxTree()
    main_syntax_tree.operation = "truncate"
    main_syntax_tree.children = ["src_ip", "dst_ip"]
    with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
        validator.validate(
            [FieldTransformation("result", main_syntax_tree)])
def test_validate_with_transformation_primitives(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    # test "{foo: 'bar'}"
    tree = SyntaxTree()
    tree.operation = "concat"
    tree.children = [1, 2]
    fields = validator.validate([FieldTransformation("foo", tree)])
    self.assertEqual(
        fields,
        types.StructType([types.StructField("foo", types.StringType())]))
    tree.children = ["'1'", "'2'"]
    fields = validator.validate([FieldTransformation("foo", tree)])
    self.assertEqual(
        fields,
        types.StructType([types.StructField("foo", types.StringType())]))
    tree.children = [1E+2, "'1'"]
    fields = validator.validate([FieldTransformation("foo", tree)])
    self.assertEqual(
        fields,
        types.StructType([types.StructField("foo", types.StringType())]))
    tree.children = ["'foo\'bar'", "'2'"]
    fields = validator.validate([FieldTransformation("foo", tree)])
    self.assertEqual(
        fields,
        types.StructType([types.StructField("foo", types.StringType())]))
    tree.children = ["'foo\"bar'", 2]
    fields = validator.validate([FieldTransformation("foo", tree)])
    self.assertEqual(
        fields,
        types.StructType([types.StructField("foo", types.StringType())]))
def test_validate_raise_incorrect_arguments_amount_for_operation_error(
        self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    syntaxtree = SyntaxTree()
    syntaxtree.operation = "add"
    syntaxtree.children = [1, 2, 3]
    with self.assertRaises(
            errors.IncorrectArgumentsAmountForOperationError):
        validator.validate(
            [FieldTransformation("add", syntaxtree), "dst_ip"])
def test_validate_rename_field(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    fields = validator.validate(
        [FieldTransformation("size", "packet_size"), "dst_ip"])
    self.assertEqual(
        fields,
        types.StructType([
            types.StructField('size', types.LongType()),
            types.StructField('dst_ip', types.StringType())
        ]))
def test_validate_function_with_different_arguments_type(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    tree = SyntaxTree()
    tree.operation = "truncate"
    tree.children = ["src_ip", 5]
    fields = validator.validate([FieldTransformation("result", tree)])
    self.assertEqual(
        fields,
        types.StructType([types.StructField("result", types.StringType())]))
def test_validate_raise_operation_not_supported_error_for_subtree(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    syntaxtree = SyntaxTree()
    syntaxtree.operation = "not_exists_operator"
    syntaxtree.children = ["1", "2"]
    main_syntax_tree = SyntaxTree()
    main_syntax_tree.operation = "mul"
    main_syntax_tree.children = [syntaxtree, "1"]
    with self.assertRaises(errors.OperationNotSupportedError):
        validator.validate(
            [FieldTransformation("result", main_syntax_tree), "dst_ip"])
def test_validate_with_correct_one_level_subtree(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    syntaxtree = SyntaxTree()
    syntaxtree.operation = "mul"
    syntaxtree.children = ["packet_size", "sampling_rate"]
    fields = validator.validate(
        [FieldTransformation("traffic", syntaxtree), "dst_ip"])
    self.assertEqual(
        fields,
        types.StructType([
            types.StructField('traffic', types.LongType()),
            types.StructField('dst_ip', types.StringType())
        ]))
def test_validate_config_operation(self):
    validator = TransformationsValidator(
        TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
    syntaxtree = SyntaxTree()
    syntaxtree.operation = "config"
    syntaxtree.children = ["'input.options.port'"]
    main_syntax_tree = SyntaxTree()
    main_syntax_tree.operation = "concat"
    main_syntax_tree.children = [syntaxtree, "'sampling_rate'"]
    fields = validator.validate(
        [FieldTransformation("result", main_syntax_tree)])
    self.assertEqual(
        fields,
        types.StructType([types.StructField('result', types.StringType())]))
def test_build_lambda_concat_with_nested_mul(self):
    mult_syntax_tree = SyntaxTree()
    mult_syntax_tree.operation = "mul"
    mult_syntax_tree.children = [6, "packet_size"]
    mult_syntax_tree_root = SyntaxTree()
    mult_syntax_tree_root.operation = "concat"
    mult_syntax_tree_root.children = [
        mult_syntax_tree, "' -- xe \' 2/3 mul(3,3) FooBar'"
    ]
    parsed_transformations = [
        FieldTransformation("traffic", mult_syntax_tree_root)
    ]
    creator = TransformationCreator(self.data_structure,
                                    parsed_transformations,
                                    TransformationOperations(self.config))
    transformation = creator.build_lambda()
    self.assertIsInstance(transformation, types.LambdaType,
                          "Transformation type should be lambda")
    spark = SparkSession.builder.getOrCreate()
    file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)
    result = file.rdd.map(transformation)
    result = result.collect()
    self.assertListEqual(result,
                         [('444 -- xe \' 2/3 mul(3,3) FooBar', ),
                          ('408 -- xe \' 2/3 mul(3,3) FooBar', ),
                          ('9060 -- xe \' 2/3 mul(3,3) FooBar', ),
                          ('1110 -- xe \' 2/3 mul(3,3) FooBar', ),
                          ('1110 -- xe \' 2/3 mul(3,3) FooBar', )],
                         "List of tuples should be equal")
    spark.stop()
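# Fixture sketch for reference: the tests above rely on a setUp defined
# outside this section. The version below is an assumption reconstructed
# from the assertions, not the original fixture. The CSV at DATA_PATH must
# hold five rows with columns src_ip, dst_ip, packet_size and sampling_rate;
# the traffic expectations in test_build_lambda_with_nested_operations
# (e.g. 74 * 512 * 10 == 378880) imply packet_size values of 74, 68, 1510,
# 185, 185 and a constant sampling_rate of 512. Here `types` is assumed to
# resolve to pyspark.sql.types, matching the schema assertions above, and
# the shape of self.data_structure is a hypothetical field -> type mapping.
def setUp(self):
    # Path to the operations config consumed by TransformationOperations
    # (assumed to be the same CONFIG_PATH used directly by several tests).
    self.config = CONFIG_PATH
    # Spark schema for reading DATA_PATH; field types follow the validator
    # assertions (packet_size renames to a LongType field, IPs are strings).
    self.data_structure_pyspark = types.StructType([
        types.StructField("src_ip", types.StringType()),
        types.StructField("dst_ip", types.StringType()),
        types.StructField("packet_size", types.LongType()),
        types.StructField("sampling_rate", types.LongType()),
    ])
    # Hypothetical plain-Python description of the same schema, as passed
    # to TransformationCreator; the exact representation is a guess.
    self.data_structure = {
        "src_ip": "string",
        "dst_ip": "string",
        "packet_size": "long",
        "sampling_rate": "long",
    }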