def test_parse_fields(database):
    """parse_fields returns the expected headers and grouped field metadata."""
    sess = database.session
    fields = [
        FileColumn(name_short='string', field_types_id=FIELD_TYPE_DICT['STRING'], length=5),
        FileColumn(name_short='bool', field_types_id=FIELD_TYPE_DICT['BOOLEAN'], required=True),
        FileColumn(name_short='dec', field_types_id=FIELD_TYPE_DICT['DECIMAL']),
        FileColumn(name_short='int', field_types_id=FIELD_TYPE_DICT['INT'], padded_flag=True, length=4,
                   required=True)
    ]
    sess.add_all(fields)

    headers, parsed = validation_helper.parse_fields(sess, fields)

    # Fields are grouped by the properties validation cares about
    assert parsed == {
        'required': ['bool', 'int'],
        'number': ['dec', 'int'],
        'boolean': ['bool'],
        'length': ['string', 'int'],
        'padded': ['int']
    }
    # Header order is not guaranteed, so compare as sets
    assert set(headers) == {'bool', 'int', 'dec', 'string'}
def test_check_length():
    """check_length flags values that exceed a column's configured max length."""
    data = pd.DataFrame({
        'row_number': ['1', '2', '3', '4', '5'],
        'unique_id': ['ID1', 'ID2', 'ID3', 'ID4', 'ID5'],
        'has_length': ['1', '12', '123', '1234', '12345'],
        'no_length': ['', '1', 'no', '1232', '4.3']
    })
    report_headers = ValidationManager.report_headers
    csv_schema = {'has_length': FileColumn(length=3), 'no_length': FileColumn()}
    flex_data = pd.DataFrame({'row_number': ['1', '2', '3', '4', '5'],
                              'concatted': ['A', 'B', 'C', 'D', 'E']})

    # Row 5 already failed a type check, so the length check must skip it.
    error_df = validation_helper.check_length(data, ['has_length'], report_headers, csv_schema,
                                              {'has_length': 'len'}, flex_data, ['5'])

    # report_headers = ['Unique ID', 'Field Name', 'Error Message', 'Value Provided', 'Expected Value',
    # 'Difference', 'Flex Field', 'Row Number', 'Rule Label'] + ['error_type']
    # Only row 4 exceeds the max length of 3 (row 5 is excluded above).
    expected_error_df = pd.DataFrame(
        [['ID4', 'len', ValidationError.lengthErrorMsg, 'len: 1234', 'Max length: 3', '', 'D', '4', '',
          ValidationError.lengthError]],
        columns=report_headers + ['error_type'])
    assert_frame_equal(error_df, expected_error_df)
def test_valid_type():
    """valid_type checks a row's provided value against its schema column type."""
    csv_schema = {
        'str_field': FileColumn(field_types_id=FIELD_TYPE_DICT['STRING']),
        'int_field': FileColumn(field_types_id=FIELD_TYPE_DICT['INT'])
    }
    # For more detailed tests, see is_valid_type
    bad_row = {'Field Name': 'int_field', 'Value Provided': 'this is a string'}
    good_row = {'Field Name': 'int_field', 'Value Provided': '1000'}
    assert validation_helper.valid_type(bad_row, csv_schema) is False
    assert validation_helper.valid_type(good_row, csv_schema) is True
def test_expected_type():
    """expected_type builds a human-readable description of the column's type."""
    csv_schema = {
        'bool_field': FileColumn(field_types_id=FIELD_TYPE_DICT['BOOLEAN']),
        'dec_field': FileColumn(field_types_id=FIELD_TYPE_DICT['DECIMAL'])
    }
    assert validation_helper.expected_type({'Field Name': 'bool_field'}, csv_schema) == \
        'This field must be a boolean'
    assert validation_helper.expected_type({'Field Name': 'dec_field'}, csv_schema) == \
        'This field must be a decimal'
def test_expected_length():
    """expected_length reports the column's max length ('None' when unset)."""
    csv_schema = {'length_field': FileColumn(length=5), 'non_length_field': FileColumn()}
    assert validation_helper.expected_length({'Field Name': 'length_field'}, csv_schema) == \
        'Max length: 5'
    assert validation_helper.expected_length({'Field Name': 'non_length_field'}, csv_schema) == \
        'Max length: None'
def test_valid_length():
    """valid_length accepts values within the max length; no max means any length."""
    csv_schema = {'length_field': FileColumn(length=5), 'non_length_field': FileColumn()}

    too_long = {'Field Name': 'length_field', 'Value Provided': 'this is more than five characters'}
    assert validation_helper.valid_length(too_long, csv_schema) is False

    short_enough = {'Field Name': 'length_field', 'Value Provided': 'four'}
    assert validation_helper.valid_length(short_enough, csv_schema) is True

    unbounded = {'Field Name': 'non_length_field', 'Value Provided': 'can be any length'}
    assert validation_helper.valid_length(unbounded, csv_schema) is True
def addColumnByFileType(self, fileType, fieldName, required, field_type):
    """ Adds a new column to the schema

    Args:
        fileType: One of the set of valid types of files (e.g. Award, AwardFinancial)
        fieldName: The name of the schema column
        required: "TRUE"/"FALSE" string marking whether data is always required
        field_type: the type of data allowed in the column

    Returns:
        ID of new column

    Raises:
        ValueError: if the file type or field type is unknown, or required
            is not a boolean string
    """
    fileId = self.getFileId(fileType)
    if fileId is None:
        raise ValueError("Filetype does not exist")

    newColumn = FileColumn()
    newColumn.required = False
    newColumn.name = fieldName
    newColumn.file_id = fileId

    # Allow for alternate names of the canonical field types
    aliases = {"STR": "STRING", "FLOAT": "DECIMAL", "BOOL": "BOOLEAN"}
    field_type = field_type.upper()
    field_type = aliases.get(field_type, field_type)

    # Check types (message typo fixed: "vaild" -> "valid")
    types = self.getDataTypes()
    if field_type not in types:
        raise ValueError("Type " + field_type + " is not valid for " + str(fieldName))
    newColumn.field_types_id = types[field_type]

    # Check Required: must be the string "TRUE" or "FALSE"
    required = required.upper()
    if required not in ("TRUE", "FALSE"):
        raise ValueError("Required is not boolean for " + str(fieldName))
    newColumn.required = (required == "TRUE")

    # Save
    self.session.add(newColumn)
    self.session.commit()
    return newColumn.file_column_id
def addColumnByFileType(self, fileType, fieldName, required, field_type):
    """ Adds a new column to the schema

    Args:
        fileType: One of the set of valid types of files (e.g. Award, AwardFinancial)
        fieldName: The name of the schema column
        required: "TRUE"/"FALSE" string marking whether data is always required
        field_type: sets the type of data allowed in the column

    Returns:
        ID of new column

    Raises:
        ValueError: for an unknown file type or field type, or a non-boolean
            required value
    """
    fileId = self.getFileId(fileType)
    if fileId is None:
        raise ValueError("Filetype does not exist")

    newColumn = FileColumn()
    newColumn.required = False
    newColumn.name = fieldName
    newColumn.file_id = fileId

    # Normalize the type name, allowing common aliases
    field_type = field_type.upper()
    if field_type == "STR":
        field_type = "STRING"
    elif field_type == "FLOAT":
        field_type = "DECIMAL"
    elif field_type == "BOOL":
        field_type = "BOOLEAN"

    # Check types (message typo fixed: "vaild" -> "valid")
    types = self.getDataTypes()
    if field_type in types:
        newColumn.field_types_id = types[field_type]
    else:
        raise ValueError("Type " + field_type + " is not valid for " + str(fieldName))

    # Check Required: only "TRUE" or "FALSE" are accepted
    required = required.upper()
    if required not in ("TRUE", "FALSE"):
        raise ValueError("Required is not boolean for " + str(fieldName))
    newColumn.required = (required == "TRUE")

    # Save
    self.session.add(newColumn)
    self.session.commit()
    return newColumn.file_column_id
def addFileColumn(fileId, fieldTypeId, columnName, description, required, session):
    """ Add information for one field

    Args:
        fileId: Which file this field is part of
        fieldTypeId: Data type found in this field
        columnName: Name of field
        description: Description of field
        required: True if field is required
        session: session object to be used for queries

    Returns:
        The persisted FileColumn object
    """
    new_column = FileColumn(
        file_id=fileId,
        field_types_id=fieldTypeId,
        name=columnName,
        description=description,
        required=required,
    )
    session.add(new_column)
    session.commit()
    return new_column
def setUpClass(cls):
    """Set up class-wide resources (test data)"""
    super(JobTests, cls).setUpClass()
    user = cls.userId
    # Flag for testing a million+ errors (can take ~30 min to run)
    cls.includeLongTests = False
    with createApp().app_context():
        # get the submission test user
        sess = GlobalDB.db().session
        # Create test submissions and jobs, also uploading
        # the files needed for each job.
        # Each scenario below gets its own submission plus one job whose
        # id is stashed in jobDict under a scenario name.
        jobDict = {}

        # Scenario: upload job with no file attached
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(job_status_id=JOB_STATUS_DICT['ready'],
                       job_type_id=JOB_TYPE_DICT['file_upload'],
                       file_type_id=FILE_TYPE_DICT['appropriations'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['bad_upload'] = job_info.job_id

        # Scenario: validation job whose prerequisite (bad_upload) will fail
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(job_status_id=JOB_STATUS_DICT['ready'],
                       job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                       file_type_id=FILE_TYPE_DICT['appropriations'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['bad_prereq'] = job_info.job_id

        # Scenario: job with a type the validator does not handle
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(job_status_id=JOB_STATUS_DICT['ready'],
                       job_type_id=JOB_TYPE_DICT['external_validation'],
                       file_type_id=FILE_TYPE_DICT['appropriations'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['wrong_type'] = job_info.job_id

        # Scenario: job already finished, so not in a runnable state
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(job_status_id=JOB_STATUS_DICT['finished'],
                       job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                       file_type_id=FILE_TYPE_DICT['appropriations'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['not_ready'] = job_info.job_id

        # Scenario: validation job pointing at an uploaded empty CSV
        submissionId = cls.insertSubmission(sess, user)
        job_info = Job(filename=cls.uploadFile('testEmpty.csv', user),
                       job_status_id=JOB_STATUS_DICT['ready'],
                       job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                       file_type_id=FILE_TYPE_DICT['appropriations'],
                       submission_id=submissionId)
        sess.add(job_info)
        sess.flush()
        jobDict['empty'] = job_info.job_id

        # create dependency
        dependency = JobDependency(job_id=jobDict["bad_prereq"],
                                   prerequisite_id=jobDict["bad_upload"])
        sess.add(dependency)

        # Build a 5-column schema (2 INT, 3 STRING) for each of file types 1-4
        colIdDict = {}
        for fileId in range(1, 5):
            for columnId in range(1, 6):
                if columnId < 3:
                    fieldType = FIELD_TYPE_DICT['INT']
                else:
                    fieldType = FIELD_TYPE_DICT['STRING']
                columnName = "header_{}".format(columnId)
                # NOTE(review): comparing the column index against a field-type
                # id looks like it was meant to mark some columns optional, but
                # the intent is unclear — confirm whether this should compare
                # fieldType (not columnId) to FIELD_TYPE_DICT['STRING'].
                fileCol = FileColumn(
                    file_id=fileId,
                    field_types_id=fieldType,
                    name=columnName,
                    required=(columnId != FIELD_TYPE_DICT['STRING']))
                sess.add(fileCol)
                sess.flush()
                colIdDict["header_{}_file_type_{}".format(
                    columnId, fileId)] = fileCol.file_column_id

        # commit submissions/jobs and output IDs
        sess.commit()
        for job_type, job_id in jobDict.items():
            print('{}: {}'.format(job_type, job_id))
        cls.jobDict = jobDict
def addColumnByFileType(sess, types, fileType, fieldName, fieldNameShort, required, field_type,
                        paddedFlag="False", fieldLength=None):
    """ Adds a new column to the schema

    Args:
        sess -- database session the new column is added to
        types -- List of field types (name -> id mapping)
        fileType -- FileTypeValidation object this column belongs to
        fieldName -- The name of the schema column
        fieldNameShort -- The machine-friendly, short column name
        required -- marks the column if data is allways required ("true"/"false")
        field_type -- sets the type of data allowed in the column
        paddedFlag -- "True" if this column should be padded
        fieldLength -- Maximum allowed length for this field

    Raises:
        ValueError -- for an unknown field type or a non-boolean required value
    """
    newColumn = FileColumn()
    newColumn.file = fileType
    newColumn.required = False
    newColumn.name = fieldName
    newColumn.name_short = fieldNameShort

    # Allow for other names
    aliases = {"STR": "STRING", "FLOAT": "DECIMAL", "BOOL": "BOOLEAN"}
    field_type = field_type.upper()
    field_type = aliases.get(field_type, field_type)

    # Translate padded flag: only the string "true" (any case) enables padding
    newColumn.padded_flag = bool(paddedFlag) and paddedFlag.lower() == "true"

    # Check types (message fixed: "not value for" -> "not valid for")
    if field_type not in types:
        raise ValueError('Type {} not valid for {}'.format(field_type, fieldName))
    newColumn.field_types_id = types[field_type]

    # Check Required
    if required.lower() not in ['true', 'false']:
        raise ValueError('Required field is not boolean for {}'.format(fieldName))
    newColumn.required = required.lower() == 'true'

    # Add length if present
    if fieldLength is not None and str(fieldLength).strip() != "":
        newColumn.length = int(str(fieldLength).strip())

    sess.add(newColumn)
def add_column_by_file_type(sess, types, file_type, field_name, field_name_short, required, field_type,
                            padded_flag="False", field_length=None):
    """ Adds a new column to the schema

    Args:
        sess -- database session the new column is added to
        types -- List of field types (name -> id mapping)
        file_type -- FileType object this column belongs to
        field_name -- The name of the schema column
        field_name_short -- The machine-friendly, short column name
        required -- marks the column if data is allways required ("true"/"false")
        field_type -- sets the type of data allowed in the column
        padded_flag -- "True" if this column should be padded
        field_length -- Maximum allowed length for this field

    Raises:
        ValueError -- for an unknown field type or a non-boolean required value
    """
    new_column = FileColumn()
    new_column.file = file_type
    new_column.required = False
    # Normalize names: lowercase with underscores instead of spaces
    new_column.name = field_name.lower().strip().replace(' ', '_')
    new_column.name_short = field_name_short.lower().strip().replace(' ', '_')

    # Allow for other names
    aliases = {"STR": "STRING", "FLOAT": "DECIMAL", "BOOL": "BOOLEAN"}
    field_type = field_type.upper()
    field_type = aliases.get(field_type, field_type)

    # Translate padded flag: only the string "true" (any case) enables padding
    new_column.padded_flag = bool(padded_flag) and padded_flag.lower() == "true"

    # Check types (message fixed: "not value for" -> "not valid for")
    if field_type not in types:
        raise ValueError('Type {} not valid for {}'.format(field_type, field_name))
    new_column.field_types_id = types[field_type]

    # Check Required
    if required.lower() not in ['true', 'false']:
        raise ValueError('Required field is not boolean for {}'.format(field_name))
    new_column.required = required.lower() == 'true'

    # Add length if present
    if field_length is not None and str(field_length).strip() != "":
        new_column.length = int(str(field_length).strip())

    sess.add(new_column)
def setUpClass(cls):
    """Set up class-wide resources (test data).

    Builds an in-memory schema of FieldType and FileColumn objects and
    stores it on the class as ``cls.schema`` for the validator tests.
    """
    super(ValidatorTests, cls).setUpClass()
    # TODO: refactor into a pytest fixture

    # create test schema: the five field types referenced by the columns
    field_types = {}
    for type_id, type_name in [(1, "STRING"), (2, "INT"), (3, "DECIMAL"),
                               (4, "BOOLEAN"), (5, "LONG")]:
        field_type = FieldType()
        field_type.field_type_id = type_id
        field_type.name = type_name
        field_types[type_name] = field_type

    # (file_column_id, name, required, field type name)
    # BUGFIX: test4/test5 previously reused file_column_id 3 (copy-paste);
    # ids now follow the sequential pattern used by every other column.
    column_specs = [
        (1, "test1", True, "STRING"),
        (2, "test2", True, "DECIMAL"),
        (3, "test3", True, "BOOLEAN"),
        (4, "test4", True, "INT"),
        (5, "test5", False, "INT"),
        (6, "test6", False, "STRING"),
        (7, "test7", False, "LONG"),
    ]
    schema = {}
    for col_id, name, required, type_name in column_specs:
        column = FileColumn()
        column.file_column_id = col_id
        column.name = name
        column.required = required
        column.field_type = field_types[type_name]
        column.field_types_id = FIELD_TYPE_DICT[type_name]
        column.file_id = 1
        schema[name] = column
    cls.schema = schema
def test_check_type():
    """check_type flags values that fail their column's type check.

    Runs the same input twice: once as non-FABS (no rule labels) and once
    as FABS (rule labels pulled from type_labels).
    """
    data = pd.DataFrame({
        'row_number': ['1', '2', '3', '4', '5'],
        'unique_id': ['ID1', 'ID2', 'ID3', 'ID4', 'ID5'],
        'int': ['1', '2', '3', 'no', '5'],
        'dec': ['1.3', '1', 'no', '1232', '4.3'],
        'bool': ['no', 'Yes', 'TRUE', 'false', '4'],
        'string': ['this', 'row', 'should', 'be', 'ignored']
    })
    type_fields = ['int', 'bool', 'dec']
    type_labels = {'int': 'Integer', 'dec': 'Decimal'}
    report_headers = ValidationManager.report_headers
    csv_schema = {
        'int': FileColumn(field_types_id=FIELD_TYPE_DICT['INT']),
        'bool': FileColumn(field_types_id=FIELD_TYPE_DICT['BOOLEAN']),
        'dec': FileColumn(field_types_id=FIELD_TYPE_DICT['DECIMAL'])
    }
    short_cols = {'int': 'i', 'bool': 'b'}
    flex_data = pd.DataFrame({'row_number': ['1', '2', '3', '4', '5'],
                              'concatted': ['A', 'B', 'C', 'D', 'E']})
    error_msg = ValidationError.typeErrorMsg
    error_type = ValidationError.typeError

    # report_headers = ['Unique ID', 'Field Name', 'Error Message', 'Value Provided', 'Expected Value',
    # 'Difference', 'Flex Field', 'Row Number', 'Rule Label'] + ['error_type']
    def build_expected(rule_labels):
        """Expected error frame; rule_labels maps error-row index -> label."""
        rows = [
            ['ID4', 'i', error_msg, 'i: no', 'This field must be a int', '', 'D', '4',
             rule_labels.get(0, ''), error_type],
            ['ID5', 'b', error_msg, 'b: 4', 'This field must be a boolean', '', 'E', '5',
             rule_labels.get(1, ''), error_type],
            ['ID3', 'dec', error_msg, 'dec: no', 'This field must be a decimal', '', 'C', '3',
             rule_labels.get(2, ''), error_type],
        ]
        return pd.DataFrame(rows, columns=report_headers + ['error_type'])

    # Non-FABS: no rule labels are attached
    error_df = validation_helper.check_type(data, type_fields, type_labels, report_headers, csv_schema,
                                            short_cols, flex_data, False)
    assert_frame_equal(error_df, build_expected({}))

    # FABS: labels come from type_labels ('bool' has no label configured)
    error_df = validation_helper.check_type(data, type_fields, type_labels, report_headers, csv_schema,
                                            short_cols, flex_data, True)
    assert_frame_equal(error_df, build_expected({0: 'Integer', 2: 'Decimal'}))