Exemplo n.º 1
0
    def test_simple_data_validate_column_name_number_in_string(self):
        simple_schema = {
            "1col_1string": "string",
            "2col_2integer": "integer",
            "3col_3float": "double",
            "4col_4date": "date",
            "5col_5datetime": "timestamp",
            "6col_6boolean": "boolean"
        }

        data = [{
            "1col_1string": "string",
            "2col_2integer": "99-01-01",
            "3col_3float": 5566.5566,
            "4col_4date": "NULL",
            "5col_5datetime": "1995-01-01 00:01:01",
            "6col_6boolean": True
        }]

        df = spark.createDataFrame(data)
        validate_data = Validator.validate_data(df, simple_schema)
        validate_data_schema = Validator.validate_schema(validate_data,
                                                         'bq').collect()
        self.assertEqual(validate_data_schema, [
            Row(_6col_6boolean=True,
                _4col_4date=None,
                _5col_5datetime=datetime.datetime(1995, 1, 1, 0, 1, 1),
                _3col_3float=5566.5566,
                _2col_2integer=None,
                _1col_1string='string')
        ])
Exemplo n.º 2
0
    def validate(self, validate_schema=True, rule='bq'):
        self._valid_df = Validator.validate_data(self.df, self.schema)

        if validate_schema is True:
            self._valid_df = Validator.validate_schema(self._valid_df, rule=rule)

        return self
Exemplo n.º 3
0
    def test_simple_data_validate_column_name_not_support_error(self):
        data = [{
            "1col_string": "string",
            "2col_integer": "99-01-01",
            "3col_float": 5566.5566,
            "4col_date": "NULL",
            "5col_datetime": "1995-01-01 00:01:01",
            "6col_boolean": True
        }]
        df = spark.createDataFrame(data)
        validate_data = Validator.validate_data(df, self.simple_schema)

        with self.assertRaises(ValueError):
            validate_data_schema = Validator.validate_schema(
                validate_data, rule='test').collect()
Exemplo n.º 4
0
 def test_simple_data_validate_column_name(self):
     data = [{
         "1col_string": "string",
         "2col_integer": "99-01-01",
         "3col_float": 5566.5566,
         "4col_date": "NULL",
         "5col_datetime": "1995-01-01 00:01:01",
         "6col_boolean": True
     }]
     df = spark.createDataFrame(data)
     validate_data = Validator.validate_data(df, self.simple_schema)
     validate_data_schema = Validator.validate_schema(validate_data,
                                                      'bq').collect()
     self.assertEqual(validate_data_schema, [
         Row(_6col_boolean=True,
             _4col_date=None,
             _5col_datetime=datetime.datetime(1995, 1, 1, 0, 1, 1),
             _3col_float=5566.5566,
             _2col_integer=None,
             _1col_string='string')
     ])
Exemplo n.º 5
0
    def test_simple_data_validate_invalid(self):
        data_for_simple_schema_path = data_path + 'simple_invalid.json'

        with open(data_for_simple_schema_path) as f:
            data = json.load(f)
        simple_invalid_df = spark.createDataFrame(data)

        validate_data = Validator.validate_data(simple_invalid_df,
                                                self.simple_schema).collect()
        self.assertEqual(validate_data, [
            Row(col_boolean=True,
                col_date=datetime.date(1995, 1, 1),
                col_datetime=datetime.datetime(1995, 1, 1, 0, 1, 1),
                col_float=5566.5566,
                col_integer=None,
                col_string='string')
        ])
Exemplo n.º 6
0
    def test_simple_data_validate_invalidate_date_format(self):
        data = [{
            "col_string": "string",
            "col_integer": "99-01-01",
            "col_float": 5566.5566,
            "col_date": "NULL",
            "col_datetime": "1995-01-01 00:01:01",
            "col_boolean": True
        }]
        df = spark.createDataFrame(data)

        validate_data = Validator.validate_data(df,
                                                self.simple_schema).collect()
        self.assertEqual(validate_data, [
            Row(col_boolean=True,
                col_date=None,
                col_datetime=datetime.datetime(1995, 1, 1, 0, 1, 1),
                col_float=5566.5566,
                col_integer=None,
                col_string='string')
        ])