    def test_build_lambda_numbers(self):
        st = SyntaxTree()
        st.operation = "_"
        st.children = [13]  # as if it parsed

        parsed_transformations = [FieldTransformation("a", st)]

        operations = TransformationOperations(self.config)

        transformations_validator = TransformationsValidator(
            operations, self.data_structure)
        _ = transformations_validator.validate(parsed_transformations)

        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
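        # the syntax tree wraps only the literal 13, so every input row maps to (13,)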
        self.assertListEqual(result, [(13, ), (13, ), (13, ), (13, ), (13, )],
                             "List of tuples should be equal")

        spark.stop()
    def test_build_lambda_processor_config(self):
        parser = TransformationsParser(["a: config('input.options.port')"])
        parser.run()
        operations = TransformationOperations(self.config)

        transformations_validator = TransformationsValidator(
            operations, self.data_structure)

        _ = transformations_validator.validate(parser.expanded_transformation)
        creator = TransformationCreator(self.data_structure,
                                        parser.expanded_transformation,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
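        # config('input.options.port') resolves to 29092 in the test config, so every row maps to (29092,)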

        self.assertListEqual(result,
                             [(29092, ), (29092, ), (29092, ), (29092, ),
                              (29092, )], "List of tuples should be equal")

        spark.stop()
    def test_build_lambda_with_nested_literals(self):
        st = SyntaxTree()
        st.operation = "concat"
        # should cast int to str and concat
        st.children = ["'6'", "packet_size"]  # packet_size [74, 68]

        st2 = SyntaxTree()
        st2.operation = "concat"
        st2.children = [2E+2, st]
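        # str(2E+2) gives '200.0', which concat joins with '6' and packet_size, e.g. '200.0674'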

        parsed_transformations = [FieldTransformation("nested", st2)]
        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))
        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [('200.0674', ), ('200.0668', ),
                                      ('200.061510', ), ('200.06185', ),
                                      ('200.06185', )],
                             "List of tuples should be equal")
        spark.stop()
    def test_validate_raise_field_not_exists_error(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        with self.assertRaises(errors.FieldNotExists):
            validator.validate(
                ["src_ip", "dst_ip", "packet_size", "sample_rate"])
    def test_validate_raise_field_not_exists_when_rename_field(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        with self.assertRaises(errors.FieldNotExists):
            validator.validate(
                [FieldTransformation("size", "not_exists_field"), "dst_ip"])
    def test_build_lambda_add_scientific(self):
        st = SyntaxTree()
        st.operation = "add"
        st.children = [1.2E+5, 1.0]
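        # both operands are literals: 1.2E+5 + 1.0 = 120001.0 for every row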
        parsed_transformations = [FieldTransformation("sum", st)]
        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [(120001.0, ), (120001.0, ), (120001.0, ),
                                      (120001.0, ), (120001.0, )],
                             "List of tuples should be equal")

        spark.stop()
    def test_build_lambda_truncate(self):
        st = SyntaxTree()
        st.operation = "truncate"
        st.children = ["'test'", 2]

        parsed_transformations = [
            FieldTransformation("cut_upto_2_symbols", st)
        ]
        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [('te', ), ('te', ), ('te', ), ('te', ),
                                      ('te', )],
                             "List of tuples should be equal")

        spark.stop()
    def test_validate_with_correct_two_level_subtree(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "sum"
        syntaxtree.children = ["sampling_rate", "packet_size"]

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "mult"
        main_syntax_tree.children = [syntaxtree, "sampling_rate"]

        fields = validator.validate(
            [FieldTransformation("result", main_syntax_tree), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('result', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))
    def test_build_lambda_with_literals(self):
        st = SyntaxTree()
        st.operation = "concat"
        st.children = ["'6 - '", "packet_size"]  # packet_size [74, 68]

        parsed_transformations = [FieldTransformation("ephemer", st)]

        creator = TransformationCreator(self.data_structure,
                                        parsed_transformations,
                                        TransformationOperations(self.config))
        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()
        self.assertListEqual(result, [("6 - 74", ), ("6 - 68", ),
                                      ("6 - 1510", ), ("6 - 185", ),
                                      ("6 - 185", )],
                             "List of tuples should be equal")
    def test_validate_raise_operation_not_supported_error(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "not_exists_operation"

        with self.assertRaises(errors.OperationNotSupportedError):
            validator.validate(
                [FieldTransformation("size", syntaxtree), "dst_ip"])
    def test_validate_raise_field_not_exists_when_rename_field(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        with self.assertRaises(errors.FieldNotExists):
            validator.validate(
                [FieldTransformation("size", "not_exists_field"), "dst_ip"])
    def test_validate_raise_incorrect_argument_type_for_operation_error(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "mul"
        syntaxtree.children = ["src_ip", "packet_size"]

        with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
            validator.validate(
                [FieldTransformation("traffic", syntaxtree), "dst_ip"])
    def test_validate_raise_field_not_exists_error(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        with self.assertRaises(errors.FieldNotExists):
            validator.validate(
                ["src_ip", "dst_ip", "packet_size", "sample_rate"])
    def test_validate_raise_error_for_function_with_different_arguments_type(
            self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "truncate"
        main_syntax_tree.children = ["src_ip", "dst_ip"]

        with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
            validator.validate(
                [FieldTransformation("result", main_syntax_tree)])
    def test_validate_correct_arguments_amount_for_operation_add(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "add"
        syntaxtree.children = [1, 2]
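        # two integer literals are the correct arity for 'add' and validate to a LongType field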

        fields = validator.validate([FieldTransformation("add", syntaxtree)])
        self.assertEqual(
            fields,
            types.StructType([types.StructField("add", types.LongType())]))
    def test_validate_rename_field(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        fields = validator.validate(
            [FieldTransformation("size", "packet_size"), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('size', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))
    def test_validate_raise_incorrect_arguments_amount_for_operation_error(
            self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "add"
        syntaxtree.children = [1, 2, 3]
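        # 'add' takes exactly two arguments, so three children should raise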

        with self.assertRaises(
                errors.IncorrectArgumentsAmountForOperationError):
            validator.validate(
                [FieldTransformation("add", syntaxtree), "dst_ip"])
    def test_validate_work_success(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
        fields = validator.validate(
            ["src_ip", "dst_ip", "packet_size", "sampling_rate"])
        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('src_ip', types.StringType()),
                types.StructField('dst_ip', types.StringType()),
                types.StructField('packet_size', types.LongType()),
                types.StructField('sampling_rate', types.LongType())
            ]), 'StructType should be equal')
    def test_validate_function_with_different_arguments_type(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        tree = SyntaxTree()
        tree.operation = "truncate"
        tree.children = ["src_ip", 5]

        fields = validator.validate([FieldTransformation("result", tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField("result",
                                                types.StringType())]))
    def test_validate_raise_operation_not_supported_error(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "not_exists_operation"

        with self.assertRaises(errors.OperationNotSupportedError):
            validator.validate(
                [FieldTransformation("size", syntaxtree), "dst_ip"])
    def test_build_lambda_processor_add(self):
        self.maxDiff = None
        parser = TransformationsParser([
            "dst_ip: add(-13.5, 2)", "src_ip:add(-13.5,2)",
            "foobar: 'add(-13.5,2)'", "foobar2: 'add\\'(-13.5,2)'"
        ])
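        # unquoted add(-13.5, 2) is evaluated to -11.5; the quoted variants stay string literals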
        parser.run()
        operations = TransformationOperations(self.config)

        transformations_validator = TransformationsValidator(
            operations, self.data_structure)
        _ = transformations_validator.validate(parser.expanded_transformation)
        creator = TransformationCreator(self.data_structure,
                                        parser.expanded_transformation,
                                        TransformationOperations(self.config))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()

        self.assertListEqual(result,
                             [(-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)"),
                              (-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)"),
                              (-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)"),
                              (-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)"),
                              (-11.5, -11.5, 'add(-13.5,2)', "add'(-13.5,2)")],
                             "List of tuples should be equal")

        spark.stop()
    def test_validate_raise_incorrect_argument_type_for_operation_error(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "mult"
        syntaxtree.children = ["src_ip", "packet_size"]

        with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
            validator.validate(
                [FieldTransformation("traffic", syntaxtree), "dst_ip"])
    def test_validate_raise_operation_not_supported_error_for_subtree(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "not_exists_operator"
        syntaxtree.children = ["1", "2"]

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "mul"
        main_syntax_tree.children = [syntaxtree, "1"]

        with self.assertRaises(errors.OperationNotSupportedError):
            validator.validate(
                [FieldTransformation("result", main_syntax_tree), "dst_ip"])
    def test_validate_raise_error_for_function_with_different_arguments_type(
            self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "truncate"
        main_syntax_tree.children = ["src_ip", "dst_ip"]

        with self.assertRaises(errors.IncorrectArgumentTypeForOperationError):
            validator.validate(
                [FieldTransformation("result", main_syntax_tree)])
    def test_validate_rename_field(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        fields = validator.validate(
            [FieldTransformation("size", "packet_size"), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('size', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))
    def test_validate_with_correct_one_level_subtree(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "mul"
        syntaxtree.children = ["packet_size", "sampling_rate"]

        fields = validator.validate(
            [FieldTransformation("traffic", syntaxtree), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('traffic', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))
    def test_validate_work_success(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)
        fields = validator.validate(
            ["src_ip", "dst_ip", "packet_size", "sampling_rate"])
        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('src_ip', types.StringType()),
                types.StructField('dst_ip', types.StringType()),
                types.StructField('packet_size', types.LongType()),
                types.StructField('sampling_rate', types.LongType())
            ]), 'StructType should be equal')
    def test_validate_raise_incorrect_arguments_amount_for_operation_error(
            self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "sum"
        syntaxtree.children = ["1", "2", "3"]

        with self.assertRaises(
                errors.IncorrectArgumentsAmountForOperationError):
            validator.validate(
                [FieldTransformation("sum", syntaxtree), "dst_ip"])
    def test_build_lambda_with_nested_operations(self):
        mult_syntax_tree = SyntaxTree()
        mult_syntax_tree.operation = "mult"
        mult_syntax_tree.children = ["packet_size", "sampling_rate"]

        root_mult_st = SyntaxTree()
        root_mult_st.operation = "mult"
        root_mult_st.children = [mult_syntax_tree, "10"]

        parsed_transformations = [
            "src_ip",
            FieldTransformation("destination_ip", "dst_ip"),
            FieldTransformation("traffic", root_mult_st)
        ]

        creator = TransformationCreator(
            self.data_structure, parsed_transformations,
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }))

        transformation = creator.build_lambda()

        self.assertIsInstance(transformation, types.LambdaType,
                              "Transformation type should be lambda")

        spark = SparkSession.builder.getOrCreate()
        file = spark.read.csv(DATA_PATH, self.data_structure_pyspark)

        result = file.rdd.map(transformation)

        result = result.collect()

        self.assertListEqual(result,
                             [("217.69.143.60", "91.221.61.183", 378880),
                              ("91.221.61.168", "90.188.114.141", 348160),
                              ("91.226.13.80", "5.136.78.36", 7731200),
                              ("192.168.30.2", "192.168.30.1", 947200),
                              ("192.168.30.2", "192.168.30.1", 947200)],
                             "List of tuples should be equal")

        spark.stop()
    def test_validate_config_operation(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "config"
        syntaxtree.children = ["'input.options.port'"]

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "concat"
        main_syntax_tree.children = [syntaxtree, "'sampling_rate'"]

        fields = validator.validate(
            [FieldTransformation("result", main_syntax_tree)])

        self.assertEqual(
            fields,
            types.StructType([types.StructField('result',
                                                types.StringType())]))