def test_wrong_ints(self, no_header_schema, test_input):
    """A non-integer value in an int column is reported via ParseErrors.

    The error entry carries the zero-based row, the stringified column
    index (no header), and the underlying int() conversion message.
    """
    with pytest.raises(ParseErrors) as e:
        parse_csv([[test_input, "Foo", "true"]], json.loads(no_header_schema))
    # Bug fix: `pytest.raises` binds an ExceptionInfo; the raised exception
    # lives on `.value`. The original `e.errors` raised AttributeError.
    assert e.value.errors == [{
        "row": 0,
        "column": "0",
        "message": f"invalid literal for int() with base 10: '{test_input}'",
    }]
def test_wrong_floats(self, boligpriser_schema, boligpriser_header, test_input):
    """A non-numeric value in a float column is reported via ParseErrors.

    With a header supplied, the error entry references the column by its
    header name ("pris") rather than by index.
    """
    with pytest.raises(ParseErrors) as e:
        parse_csv(
            [["0001", "Trøndelag", test_input]],
            json.loads(boligpriser_schema),
            header=boligpriser_header,
        )
    # Bug fix: `pytest.raises` binds an ExceptionInfo; the raised exception
    # lives on `.value`. The original `e.errors` raised AttributeError.
    assert e.value.errors == [{
        "row": 0,
        "column": "pris",
        "message": f"could not convert string to float: '{test_input}'",
    }]
def test_incorrect_date_colum(self, dates_header, dates_schema):
    """A garbage value in the date column produces exactly one schema error."""
    rows = [["1", "2020", "garbish data", "2020-01-01T12:01:01"]]
    parsed = parse_csv(rows, dates_schema, header=dates_header)
    errors = JsonSchemaValidator(dates_schema).validate(parsed)
    assert len(errors) == 1
def test_incorrect_year_colum(self, dates_header, dates_schema):
    """Non-numeric and empty year values each yield one validation error."""
    for bad_year in ["abc", ""]:
        parsed = parse_csv(
            [["1", bad_year, "2020-12-30", "2020-12-01T12:01:01"]],
            dates_schema,
            header=dates_header,
        )
        errors = JsonSchemaValidator(dates_schema).validate(parsed)
        assert len(errors) == 1
def test_valid_year_colum(self, dates_header, dates_schema):
    """Accepted year values (incl. negative and boundary) validate cleanly."""
    for year in ["2020", "-100", "9999"]:
        parsed = parse_csv(
            [["1", year, "2020-12-30", "2020-12-01T12:01:01"]],
            dates_schema,
            header=dates_header,
        )
        errors = JsonSchemaValidator(dates_schema).validate(parsed)
        assert len(errors) == 0
def validate_csv(event, context):
    """Lambda handler: validate the first CSV object under the event's input prefix.

    Reads step configuration from the task config, fetches the first object
    listed under the input prefix from S3, parses it with `parse_csv`, and
    runs JSON-schema validation. Returns the step-data dict with status
    VALIDATION_SUCCESS on success, or an error payload via `_with_error`.
    Validation is skipped (and treated as success) when no schema is given.
    """
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_task_config(config.task_config)
    output_prefix = config.payload.output_dataset.s3_prefix

    log_add(
        header_row=step_config.header_row,
        delimiter=step_config.delimiter,
        quote=step_config.quote,
        schema=step_config.schema,
        output_prefix=output_prefix,
    )

    if not step_config.schema:
        log_add(notice="No Schema provided for validation")
        config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
        # 2020.06: Validation done optionally - we now return ok if we don't
        # supply a schema for the validation step
        return asdict(config.payload.step_data)

    # Only the first input prefix and the first listed object are considered.
    input_prefix = next(iter(config.payload.step_data.s3_input_prefixes.values()))
    log_add(s3_input_prefix=input_prefix)

    listing = s3.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)
    input_key = next(iter(listing["Contents"]))["Key"]
    log_add(s3_input_path=input_key)

    response = s3.get_object(Bucket=BUCKET, Key=input_key)
    reader = csv.reader(
        string_reader.from_response(response),
        dialect="unix",
        delimiter=step_config.delimiter,
        quotechar=step_config.quote,
    )

    # Consume the header row before parsing, when configured.
    header = next(reader) if step_config.header_row else None

    try:
        csv_data = parse_csv(reader, step_config.schema, header)
    except ParseErrors as p:
        return _with_error(config, p.errors)

    validation_errors = JsonSchemaValidator(step_config.schema).validate(csv_data)
    if validation_errors:
        return _with_error(config, errors=validation_errors)

    config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
    return asdict(config.payload.step_data)
def test_correct_dates(self, dates_header, dates_schema):
    """Rows using the accepted date/date-time formats validate cleanly.

    NOTE(review): "2020-12-01T12-01" is accepted here — presumably the
    schema's date-time handling is lenient; confirm against the schema.
    """
    rows = [
        ["1", "2020", "2020-01-01", "2020-01-01T12:01:01"],
        ["1", "2020", "2020-01-01", "2020-01-01 12:01:01"],
        ["1", "2020", "2020-01-01", "2020-12-01T12-01"],
    ]
    parsed = parse_csv(rows, dates_schema, header=dates_header)
    errors = JsonSchemaValidator(dates_schema).validate(parsed)
    assert len(errors) == 0
def test_valid_date_time_colum(self, dates_header, dates_schema):
    """Well-formed ISO date-time values pass validation."""
    for good_date in ["0009-12-01T12:01:01", "2020-12-12T12:01:01"]:
        parsed = parse_csv(
            [["1", "2020", "2020-12-30", good_date]],
            dates_schema,
            header=dates_header,
        )
        errors = JsonSchemaValidator(dates_schema).validate(parsed)
        assert len(errors) == 0
def test_incorrect_date_time_colum(self, dates_header, dates_schema):
    """Malformed date-time values each produce exactly one validation error."""
    bad_dates = [
        "2020-13-01T12:01:01",
        "2020 12 32T12:01:01",
        "garbish data",
    ]
    for bad_date in bad_dates:
        parsed = parse_csv(
            [["1", "2020", "2020-12-30", bad_date]],
            dates_schema,
            header=dates_header,
        )
        errors = JsonSchemaValidator(dates_schema).validate(parsed)
        assert len(errors) == 1
def test_parse_no_headers(self, no_header_schema):
    """Without a header, columns are keyed by stringified index and typed per schema."""
    rows = [["120", "Foo", "true"], ["999199", "Bar", "false"]]
    parsed = parse_csv(rows, json.loads(no_header_schema))
    assert parsed == [
        {"0": 120, "1": "Foo", "2": True},
        {"0": 999199, "1": "Bar", "2": False},
    ]
def test_parse_with_headers(self, boligpriser_schema, boligpriser_header):
    """With a header, columns are keyed by header name; floats accept both
    '.' and ',' as decimal separator."""
    parsed = parse_csv(
        [
            ["001", "Østre byflak", "1010.01", "true"],
            ["002", "Hønse-Lovisaløkka", "5001,10", "false"],
        ],
        json.loads(boligpriser_schema),
        header=boligpriser_header,
    )
    expected = [
        {
            "delbydel_id": "001",
            "navn": "Østre byflak",
            "pris": 1010.01,
            "til_salg": True,
        },
        {
            "delbydel_id": "002",
            "navn": "Hønse-Lovisaløkka",
            "pris": 5001.10,
            "til_salg": False,
        },
    ]
    assert parsed == expected
def test_empty_schema():
    """An empty schema leaves the rows untouched (pass-through)."""
    rows = [["1", "foo"], ["2", "bar"]]
    assert parse_csv(rows, {}) == rows
def test_simple_array():
    """A bare array schema (no item typing) is also a pass-through."""
    rows = [["1", "foo"], ["2", "bar"]]
    assert parse_csv(rows, {"type": "array"}) == rows
def test_parse_empty_values(self, no_header_schema):
    """Empty cells are dropped from the resulting row dict entirely."""
    parsed = parse_csv([["55", "", "true"]], json.loads(no_header_schema))
    assert parsed == [{"0": 55, "2": True}]