def _validate_csv(csv_file, output_file=None):
    """
    Check the header and record lengths of a CSV file.

    The expected field names come from ``_get_header(csv_file)``. The file is
    then read with ``csv.reader`` and run through a ``CSVValidator`` that has
    a header check (code 'EX1') and a record length check (code 'EX2').

    :param csv_file: The CSV file to validate (handed to ``open``)
    :param output_file: Optional destination passed to ``write_problems``;
        ``sys.stdout`` is used when this is omitted or otherwise falsy
    :returns: True if no problems were reported, False otherwise
    """
    expected_fields = _get_header(csv_file)

    checker = CSVValidator(expected_fields)
    # basic header and record length checks
    checker.add_header_check('EX1', 'bad header')
    checker.add_record_length_check('EX2', 'unexpected record length')

    with open(csv_file) as handle:
        found = checker.validate(csv.reader(handle))

        # Nothing reported: the file is valid and there is nothing to write.
        if not found:
            return True

        write_problems(found, output_file or sys.stdout)
        return False
def create_validator():
    """Create an example CSV validator for patient demographic data.

    The validator expects the columns ``study_id``, ``patient_id``,
    ``gender``, ``age_years``, ``age_months`` and ``date_inclusion`` and
    registers the following checks:

    * EX1 / EX2 -- header and record length checks;
    * EX3 - EX7 -- value checks on individual fields;
    * EX8 -- a record check that ``age_months`` is consistent with
      ``age_years``.

    :returns: the configured ``CSVValidator`` instance
    """
    field_names = (
        'study_id',
        'patient_id',
        'gender',
        'age_years',
        'age_months',
        'date_inclusion',
    )
    validator = CSVValidator(field_names)

    # basic header and record length checks
    validator.add_header_check('EX1', 'bad header')
    validator.add_record_length_check('EX2', 'unexpected record length')

    # some simple value checks
    validator.add_value_check('study_id', int,
                              'EX3', 'study id must be an integer')
    validator.add_value_check('patient_id', int,
                              'EX4', 'patient id must be an integer')
    validator.add_value_check('gender', enumeration('M', 'F'),
                              'EX5', 'invalid gender')
    validator.add_value_check('age_years', number_range_inclusive(0, 120, int),
                              'EX6', 'invalid age in years')
    validator.add_value_check('date_inclusion', datetime_string('%Y-%m-%d'),
                              'EX7', 'invalid date')

    # a more complicated record check
    def check_age_variables(r):
        """Raise RecordError('EX8') unless age_months matches age_years."""
        age_years = int(r['age_years'])
        age_months = int(r['age_months'])
        # age_months must lie in [12 * age_years, 12 * age_years + 12).
        # Using a difference rather than a modulo by age_years keeps the
        # check defined for infants (age_years == 0) and actually bounds the
        # number of months above.
        months_past_birthday = age_months - age_years * 12
        if not 0 <= months_past_birthday < 12:
            raise RecordError('EX8', 'invalid age variables')

    validator.add_record_check(check_age_variables)

    return validator
def generate(self):
    """Build a ``CSVValidator`` from this object's stored configuration.

    Uses ``self.field_names`` for the validator's fields, adds the default
    header and record length checks, registers one value check per
    ``(field, check)`` pair in ``self.value_checks``, and finally passes
    ``self.unique_checks`` to ``add_unique_check``.

    :returns: the configured ``CSVValidator``
    """
    built = CSVValidator(self.field_names)

    # structural checks, using the library's default codes and messages
    built.add_header_check()
    built.add_record_length_check()

    # one value check per configured (field, check) pair
    for pair in self.value_checks:
        field, checker = pair
        built.add_value_check(field, checker)

    built.add_unique_check(self.unique_checks)
    return built
def test_record_length_checks():
    """Record length checks flag short and long rows under each code.

    Two record length checks are registered, one with the default code and
    message and one with a custom code and message. Both must report the
    same two bad rows.
    """
    validator = CSVValidator(('foo', 'bar'))
    validator.add_record_length_check()  # test default code and message
    validator.add_record_length_check('X2', 'custom message')

    too_short = ('12', )  # be careful with syntax for singleton tuples
    too_long = ('12', '3.4', 'spong')
    rows = (('foo', 'bar'), ('12', '3.4'), too_short, too_long)

    problems = validator.validate(rows)
    assert len(problems) == 4, len(problems)

    # Each check reports the same rows; only code and message differ.
    expected_by_code = (
        (RECORD_LENGTH_CHECK_FAILED, MESSAGES[RECORD_LENGTH_CHECK_FAILED]),
        ('X2', 'custom message'),
    )
    bad_rows = ((3, too_short), (4, too_long))

    for code, message in expected_by_code:
        reported = [p for p in problems if p['code'] == code]
        assert len(reported) == 2
        for problem, (row_number, record) in zip(reported, bad_rows):
            assert problem['message'] == message
            assert problem['row'] == row_number
            assert problem['record'] == record
            assert problem['length'] == len(record)
def test_record_length_checks():
    """Check problems reported by default and custom record length checks."""
    validator = CSVValidator(('foo', 'bar'))
    # first check uses the default code and message, second a custom pair
    validator.add_record_length_check()
    validator.add_record_length_check('X2', 'custom message')

    problems = validator.validate((
        ('foo', 'bar'),
        ('12', '3.4'),
        ('12',),  # singleton tuple: the trailing comma is required
        ('12', '3.4', 'spong'),
    ))
    assert len(problems) == 4, len(problems)

    def with_code(code):
        """Return the reported problems carrying the given code."""
        return [p for p in problems if p['code'] == code]

    def verify(problem, message, row, record, length):
        """Assert every field of a single record-length problem."""
        assert problem['message'] == message
        assert problem['row'] == row
        assert problem['record'] == record
        assert problem['length'] == length

    # problems reported under the default code
    default_problems = with_code(RECORD_LENGTH_CHECK_FAILED)
    assert len(default_problems) == 2
    verify(default_problems[0], MESSAGES[RECORD_LENGTH_CHECK_FAILED],
           3, ('12',), 1)
    verify(default_problems[1], MESSAGES[RECORD_LENGTH_CHECK_FAILED],
           4, ('12', '3.4', 'spong'), 3)

    # problems reported under the custom code
    custom_problems = with_code('X2')
    assert len(custom_problems) == 2
    verify(custom_problems[0], 'custom message', 3, ('12',), 1)
    verify(custom_problems[1], 'custom message',
           4, ('12', '3.4', 'spong'), 3)
def test_skips():
    """Rows matched by a skip function must produce no problems.

    The '##' row has three cells and a non-integer first cell, so it would
    fail both registered checks if it were not skipped.
    """
    validator = CSVValidator(('foo', 'bar'))
    validator.add_record_length_check()
    validator.add_value_check('foo', int)

    def is_pragma_row(row):
        # a row is a pragma when its first cell begins with '##'
        first_cell = row[0]
        return first_cell.startswith('##')

    validator.add_skip(is_pragma_row)

    rows = (
        ('foo', 'bar'),
        ('1', 'X'),
        ('## this row', 'should be', 'skipped'),
        ('3', 'Y'),
    )
    problems = validator.validate(rows)
    assert len(problems) == 0, problems
def create_validator():
    """Create an example CSV validator for patient demographic data.

    Expected columns, in order: ``study_id``, ``patient_id``, ``gender``,
    ``age_years``, ``age_months``, ``date_inclusion``.

    Registered checks:

    * EX1 -- header check;
    * EX2 -- record length check;
    * EX3, EX4 -- ``study_id`` and ``patient_id`` must parse as ``int``;
    * EX5 -- ``gender`` must be 'M' or 'F';
    * EX6 -- ``age_years`` must be an integer from 0 to 120 inclusive;
    * EX7 -- ``date_inclusion`` must match ``%Y-%m-%d``;
    * EX8 -- ``age_months`` must be consistent with ``age_years``.

    :returns: the configured ``CSVValidator`` instance
    """
    field_names = (
        'study_id',
        'patient_id',
        'gender',
        'age_years',
        'age_months',
        'date_inclusion'
    )
    validator = CSVValidator(field_names)

    # basic header and record length checks
    validator.add_header_check('EX1', 'bad header')
    validator.add_record_length_check('EX2', 'unexpected record length')

    # some simple value checks
    validator.add_value_check('study_id', int,
                              'EX3', 'study id must be an integer')
    validator.add_value_check('patient_id', int,
                              'EX4', 'patient id must be an integer')
    validator.add_value_check('gender', enumeration('M', 'F'),
                              'EX5', 'invalid gender')
    validator.add_value_check('age_years', number_range_inclusive(0, 120, int),
                              'EX6', 'invalid age in years')
    validator.add_value_check('date_inclusion', datetime_string('%Y-%m-%d'),
                              'EX7', 'invalid date')

    # a more complicated record check
    def check_age_variables(r):
        """Raise RecordError('EX8') if age_months contradicts age_years."""
        age_years = int(r['age_years'])
        age_months = int(r['age_months'])
        first_month = age_years * 12
        # Valid months are the twelve starting at the last birthday. A
        # modulo by age_years is avoided on purpose: it divides by zero for
        # age_years == 0 and does not cap the number of months.
        valid = first_month <= age_months < first_month + 12
        if not valid:
            raise RecordError('EX8', 'invalid age variables')

    validator.add_record_check(check_age_variables)

    return validator
def test_skips():
    """Verify that a registered skip function suppresses checks on a row."""
    columns = ('foo', 'bar')
    validator = CSVValidator(columns)
    validator.add_record_length_check()
    validator.add_value_check('foo', int)

    def starts_with_double_hash(record):
        """Return True for rows whose first value begins with '##'."""
        return record[0].startswith('##')

    validator.add_skip(starts_with_double_hash)

    # Only the '##' row is malformed (wrong length, non-integer 'foo'); with
    # the skip in place the validator should report nothing at all.
    header_row = ('foo', 'bar')
    skipped_row = ('## this row', 'should be', 'skipped')
    data = (header_row, ('1', 'X'), skipped_row, ('3', 'Y'))

    problems = validator.validate(data)
    assert len(problems) == 0, problems
def create_validator():
    """Create an example CSV validator for customer contact data.

    Expected columns, in order: ``CUSTID``, ``FIRSTNAME``, ``LASTNAME``,
    ``CUSTNM``, ``ADDRESS1``, ``ADDRESS2``, ``POSTCODE``, ``CITY``,
    ``STATE``, ``WORKPHONE``, ``WORKFAX``, ``PHONE``, ``MOBILE``, ``EMAIL``,
    ``ORGANISATION``, ``EOL``.

    Registered checks:

    * EX1 / EX2 -- header and record length checks;
    * EX3 - EX18 -- one value check per column (``int`` for identifiers,
      postcode and phone numbers, ``CheckAlpha`` for ``FIRSTNAME``, ``str``
      for the remaining columns).

    :returns: the configured ``CSVValidator`` instance
    """

    def CheckAlpha(s=''):
        """Value check accepting only letters and spaces.

        An empty string is accepted. Any other value must consist solely of
        alphabetic characters once spaces are removed.

        :raises ValueError: if the value contains non-alphabetic characters
        """
        # Value checks signal failure by raising ValueError (the same way
        # ``int`` does), not by returning False.
        if s and not s.replace(" ", "").isalpha():
            raise ValueError('not alphabetic: %r' % (s,))
        return True

    field_names = (
        'CUSTID',
        'FIRSTNAME',
        'LASTNAME',
        'CUSTNM',
        'ADDRESS1',
        'ADDRESS2',
        'POSTCODE',
        'CITY',
        'STATE',
        'WORKPHONE',
        'WORKFAX',
        'PHONE',
        'MOBILE',
        'EMAIL',
        'ORGANISATION',
        'EOL'
    )
    validator = CSVValidator(field_names)

    # basic header and record length checks
    validator.add_header_check('EX1', 'bad header')
    validator.add_record_length_check('EX2', 'unexpected record length')

    # some simple value checks
    validator.add_value_check('CUSTID', int,
                              'EX3', 'CUSTID must be an integer')
    validator.add_value_check('FIRSTNAME', CheckAlpha,
                              'EX4', 'FIRSTNAME must be alphabetic')
    validator.add_value_check('LASTNAME', str,
                              'EX5', 'invalid LASTNAME')
    validator.add_value_check('CUSTNM', str,
                              'EX6', 'invalid CUSTNM')
    validator.add_value_check('ADDRESS1', str,
                              'EX7', 'invalid ADDRESS1')
    validator.add_value_check('ADDRESS2', str,
                              'EX8', 'invalid ADDRESS2')
    validator.add_value_check('POSTCODE', int,
                              'EX9', 'invalid POSTCODE')
    validator.add_value_check('CITY', str,
                              'EX10', 'invalid CITY')
    validator.add_value_check('STATE', str,
                              'EX11', 'invalid STATE')
    validator.add_value_check('WORKPHONE', int,
                              'EX12', 'invalid WORKPHONE')
    validator.add_value_check('WORKFAX', int,
                              'EX13', 'invalid WORKFAX')
    validator.add_value_check('PHONE', int,
                              'EX14', 'invalid PHONE')
    validator.add_value_check('MOBILE', int,
                              'EX15', 'invalid MOBILE')
    validator.add_value_check('EMAIL', str,
                              'EX16', 'invalid EMAIL')
    validator.add_value_check('ORGANISATION', str,
                              'EX17', 'invalid ORGANISATION')
    validator.add_value_check('EOL', str,
                              'EX18', 'invalid EOL')

    # No record-level check is registered: an arithmetic comparison between
    # CUSTNM and ADDRESS1 would require casting them to int, which fails on
    # ordinary address text and would flag every record.

    return validator