    def test_build_lambda_numbers(self):
        st = SyntaxTree()
        st.operation = "_"
        st.children = [13]  # as if the parser had produced the literal 13

        parsed_transformations = [FieldTransformation("a", st)]

        operations = TransformationOperations(self.config)

        transformations_validator = TransformationsValidator(
            operations, self.data_structure)
        _ = transformations_validator.validate(parsed_transformations)

        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [(13, ), (13, ), (13, ), (13, ), (13, )],
                             "List of tuples should be equal")

        spark.stop()
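
    # These excerpts arrive without their imports or setUp; a hedged sketch of
    # what they appear to assume (names inferred from usage in this file, not
    # from the real project):
    #
    #     import types                          # stdlib; used for types.LambdaType
    #     from pyspark.sql import SparkSession
    #     from pyspark.sql import types         # the validator tests instead bind
    #                                           # 'types' to pyspark.sql.types, so
    #                                           # the snippets evidently come from
    #                                           # separate test modules
    #
    #     DATA_PATH   = ...  # CSV sample with src_ip, dst_ip, packet_size, sampling_rate
    #     CONFIG_PATH = ...  # pipeline config path; some tests instead pass an
    #                        # inline dict of GeoLite2 .mmdb paths
    #
    # self.config, self.data_structure and self.data_structure_pyspark are
    # fixtures from the original TestCase's setUp, which is not shown here.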

    def test_build_lambda_with_nested_literals(self):
        st = SyntaxTree()
        st.operation = "concat"
        # concat should cast the numeric packet_size values to str and join
        st.children = ["'6'", "packet_size"]  # packet_size [74, 68]

        st2 = SyntaxTree()
        st2.operation = "concat"
        st2.children = [2E+2, st]
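        # 2E+2 is the float literal 200.0; concat evidently stringifies numeric
        # children, which is why each expected result below starts with '200.0'.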

        parsed_transformations = [FieldTransformation("nested", st2)]
        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))
        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [('200.0674', ), ('200.0668', ),
                                      ('200.061510', ), ('200.06185', ),
                                      ('200.06185', )],
                             "List of tuples should be equal")
        spark.stop()

    def test_build_lambda_add_scientific(self):
        st = SyntaxTree()
        st.operation = "add"
        st.children = [1.2E+5, 1.0]
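        # 1.2E+5 + 1.0 = 120001.0, the value asserted for every row below.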
        parsed_transformations = [FieldTransformation("sum", st)]
        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [(120001.0, ), (120001.0, ), (120001.0, ),
                                      (120001.0, ), (120001.0, )],
                             "List of tuples should be equal")

        spark.stop()

    def test_build_lambda_truncate(self):
        st = SyntaxTree()
        st.operation = "truncate"
        st.children = ["'test'", 2]

        parsed_transformations = [
            FieldTransformation("cut_upto_2_symbols", st)
        ]
        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [('te', ), ('te', ), ('te', ), ('te', ),
                                      ('te', )],
                             "List of tuples should be equal")

        spark.stop()
    def test_validate_raise_field_not_exists_when_rename_field(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        with self.assertRaises(errors.FieldNotExists):
            validator.validate(
                [FieldTransformation("size", "not_exists_field"), "dst_ip"])

    def test_validate_with_correct_two_level_subtree(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "sum"
        syntaxtree.children = ["sampling_rate", "packet_size"]

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "mult"
        main_syntax_tree.children = [syntaxtree, "sampling_rate"]

        fields = validator.validate(
            [FieldTransformation("result", main_syntax_tree), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('result', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))

    def test_build_lambda_with_literals(self):
        st = SyntaxTree()
        st.operation = "concat"
        st.children = ["'6 - '", "packet_size"]  # packet_size [74, 68]

        parsed_transformations = [FieldTransformation("ephemer", st)]

        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))
        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [("6 - 74", ), ("6 - 68", ),
                                      ("6 - 1510", ), ("6 - 185", ),
                                      ("6 - 185", )],
                             "List of tuples should be equal")

        spark.stop()

    def test_build_lambda_with_nested_operations(self):
        mult_syntax_tree = SyntaxTree()
        mult_syntax_tree.operation = "mult"
        mult_syntax_tree.children = ["packet_size", "sampling_rate"]

        root_mult_st = SyntaxTree()
        root_mult_st.operation = "mult"
        root_mult_st.children = [mult_syntax_tree, "10"]
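        # The expected 'traffic' values below imply sampling_rate = 512 in the
        # sample data (e.g. 74 * 512 * 10 = 378880). Note the outer multiplier
        # is the string "10"; mult evidently coerces numeric strings.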

        parsed_transformations = [
            "src_ip",
            FieldTransformation("destination_ip", "dst_ip"),
            FieldTransformation("traffic", root_mult_st)
        ]

        creator = TransformationCreator(
            self.data_structure, parsed_transformations,
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()

        self.assertListEqual(result,
                             [("217.69.143.60", "91.221.61.183", 378880),
                              ("91.221.61.168", "90.188.114.141", 348160),
                              ("91.226.13.80", "5.136.78.36", 7731200),
                              ("192.168.30.2", "192.168.30.1", 947200),
                              ("192.168.30.2", "192.168.30.1", 947200)],
                             "List of tuples should be equal")

        spark.stop()
    def test_validate_raise_operation_not_supported_error(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "not_exists_operation"

        with self.assertRaises(errors.OperationNotSupportedError):
            validator.validate(
                [FieldTransformation("size", syntaxtree), "dst_ip"])
    def test_validate_raise_incorrect_argument_type_for_operation_error(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "mul"
        syntaxtree.children = ["src_ip", "packet_size"]

        with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
            validator.validate(
                [FieldTransformation("traffic", syntaxtree), "dst_ip"])

    def test_validate_raise_field_not_exists_when_rename_field(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        with self.assertRaises(errors.FieldNotExists):
            validator.validate(
                [FieldTransformation("size", "not_exists_field"), "dst_ip"])
    def test_validate_correct_arguments_amount_for_operation_add(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "add"
        syntaxtree.children = [1, 2]

        fields = validator.validate([FieldTransformation("add", syntaxtree)])
        self.assertEqual(
            fields,
            types.StructType([types.StructField("add", types.LongType())]))
    def test_validate_raise_error_for_function_with_different_arguments_type(
            self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "truncate"
        main_syntax_tree.children = ["src_ip", "dst_ip"]

        with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
            validator.validate(
                [FieldTransformation("result", main_syntax_tree)])
    def test_validate_with_transformation_primitives(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
        # test "{foo: 'bar'}""
        tree = SyntaxTree()
        tree.operation = "concat"
        tree.children = [1, 2]
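        # The variants below mix ints, a float in scientific notation, and quoted
        # string literals (including escaped quotes); concat is expected to coerce
        # every child to a string, so the field always validates as StringType.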
        fields = validator.validate([FieldTransformation("foo", tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField("foo", types.StringType())]))

        tree.children = ["'1'", "'2'"]
        fields = validator.validate([FieldTransformation("foo", tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField("foo", types.StringType())]))

        tree.children = [1E+2, "'1'"]
        fields = validator.validate([FieldTransformation("foo", tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField("foo", types.StringType())]))

        tree.children = ["'foo\'bar'", "'2'"]
        fields = validator.validate([FieldTransformation("foo", tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField("foo", types.StringType())]))

        tree.children = ["'foo\"bar'", 2]
        fields = validator.validate([FieldTransformation("foo", tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField("foo", types.StringType())]))
    def test_validate_raise_incorrect_arguments_amount_for_operation_error(
            self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "add"
        syntaxtree.children = [1, 2, 3]

        with self.assertRaises(
                errors.IncorrectArgumentsAmountForOperationError):
            validator.validate(
                [FieldTransformation("add", syntaxtree), "dst_ip"])
    def test_validate_rename_field(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        fields = validator.validate(
            [FieldTransformation("size", "packet_size"), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('size', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))
    def test_validate_function_with_different_arguments_type(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        tree = SyntaxTree()
        tree.operation = "truncate"
        tree.children = ["src_ip", 5]

        fields = validator.validate([FieldTransformation("result", tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField("result",
                                                types.StringType())]))

    def test_validate_raise_operation_not_supported_error(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "not_exists_operation"

        with self.assertRaises(errors.OperationNotSupportedError):
            validator.validate(
                [FieldTransformation("size", syntaxtree), "dst_ip"])
    def test_validate_raise_operation_not_supported_error_for_subtree(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "not_exists_operator"
        syntaxtree.children = ["1", "2"]

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "mul"
        main_syntax_tree.children = [syntaxtree, "1"]

        with self.assertRaises(errors.OperationNotSupportedError):
            validator.validate(
                [FieldTransformation("result", main_syntax_tree), "dst_ip"])

    def test_validate_raise_incorrect_argument_type_for_operation_error(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "mult"
        syntaxtree.children = ["src_ip", "packet_size"]

        with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
            validator.validate(
                [FieldTransformation("traffic", syntaxtree), "dst_ip"])

    def test_validate_raise_error_for_function_with_different_arguments_type(
            self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "truncate"
        main_syntax_tree.children = ["src_ip", "dst_ip"]

        with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
            validator.validate(
                [FieldTransformation("result", main_syntax_tree)])
    def test_validate_with_correct_one_level_subtree(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "mul"
        syntaxtree.children = ["packet_size", "sampling_rate"]

        fields = validator.validate(
            [FieldTransformation("traffic", syntaxtree), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('traffic', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))

    def test_validate_rename_field(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        fields = validator.validate(
            [FieldTransformation("size", "packet_size"), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('size', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))

    def test_validate_raise_incorrect_arguments_amount_for_operation_error(
            self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "sum"
        syntaxtree.children = ["1", "2", "3"]

        with self.assertRaises(
                errors.IncorrectArgumentsAmountForOperationError):
            validator.validate(
                [FieldTransformation("sum", syntaxtree), "dst_ip"])

    def test_validate_raise_operation_not_supported_error_for_subtree(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "not_exists_operator"
        syntaxtree.children = ["1", "2"]

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "mult"
        main_syntax_tree.children = [syntaxtree, "1"]

        with self.assertRaises(errors.OperationNotSupportedError):
            validator.validate(
                [FieldTransformation("result", main_syntax_tree), "dst_ip"])

    def test_validate_function_with_different_arguments_type(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "truncate"
        main_syntax_tree.children = ["src_ip", "5"]

        fields = validator.validate(
            [FieldTransformation("result", main_syntax_tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField("result",
                                                types.StringType())]))
    def test_validate_config_operation(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "config"
        syntaxtree.children = ["'input.options.port'"]

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "concat"
        main_syntax_tree.children = [syntaxtree, "'sampling_rate'"]
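        # The config operation presumably resolves the dotted key
        # 'input.options.port' against the pipeline config; concatenated with a
        # string literal it validates as a StringType field.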

        fields = validator.validate(
            [FieldTransformation("result", main_syntax_tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField('result',
                                                types.StringType())]))

    def test_build_lambda_concat_with_nested_mul(self):
        mult_syntax_tree = SyntaxTree()
        mult_syntax_tree.operation = "mul"
        mult_syntax_tree.children = [6, "packet_size"]
        mult_syntax_tree_root = SyntaxTree()
        mult_syntax_tree_root.operation = "concat"
        mult_syntax_tree_root.children = [
            mult_syntax_tree, "' -- xe \' 2/3 mul(3,3) FooBar'"
        ]
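        # First row: 6 * packet_size = 6 * 74 = 444, then the literal (escaped
        # quote included) is appended, matching the expected tuples below.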

        parsed_transformations = [
            FieldTransformation("traffic", mult_syntax_tree_root)
        ]
        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [('444 -- xe \' 2/3 mul(3,3) FooBar', ),
                                      ('408 -- xe \' 2/3 mul(3,3) FooBar', ),
                                      ('9060 -- xe \' 2/3 mul(3,3) FooBar', ),
                                      ('1110 -- xe \' 2/3 mul(3,3) FooBar', ),
                                      ('1110 -- xe \' 2/3 mul(3,3) FooBar', )],
                             "List of tuples should be equal")

        spark.stop()
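
    # A minimal end-to-end sketch of the pipeline these tests exercise, under the
    # fixture assumptions noted near the top of this file (field names and config
    # shape are taken from the tests above, not from the real project):
    #
    #     st = SyntaxTree()
    #     st.operation = "mult"
    #     st.children = ["packet_size", "sampling_rate"]
    #     transformations = [FieldTransformation("traffic", st), "dst_ip"]
    #
    #     operations = TransformationOperations(self.config)
    #     TransformationsValidator(operations,
    #                              self.data_structure_pyspark).validate(transformations)
    #     transform = TransformationCreator(self.data_structure, transformations,
    #                                       operations).build_lambda()
    #
    #     spark = SparkSession.builder.getOrCreate()
    #     rows = spark.read.csv(DATA_PATH, self.data_structure_pyspark) \
    #                 .rdd.map(transform).collect()
    #     spark.stop()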