Example #1
def test_parse_fields(database):
    sess = database.session
    fields = [
        FileColumn(name_short='string',
                   field_types_id=FIELD_TYPE_DICT['STRING'],
                   length=5),
        FileColumn(name_short='bool',
                   field_types_id=FIELD_TYPE_DICT['BOOLEAN'],
                   required=True),
        FileColumn(name_short='dec',
                   field_types_id=FIELD_TYPE_DICT['DECIMAL']),
        FileColumn(name_short='int',
                   field_types_id=FIELD_TYPE_DICT['INT'],
                   padded_flag=True,
                   length=4,
                   required=True)
    ]
    sess.add_all(fields)

    expected_parsed_fields = {
        'required': ['bool', 'int'],
        'number': ['dec', 'int'],
        'boolean': ['bool'],
        'length': ['string', 'int'],
        'padded': ['int']
    }
    expected_expected_headers = ['bool', 'int', 'dec', 'string']
    expected_headers, parsed_fields = validation_helper.parse_fields(
        sess, fields)
    assert parsed_fields == expected_parsed_fields
    assert set(expected_headers) == set(expected_expected_headers)
Example #2
def test_check_length():
    data = pd.DataFrame({
        'row_number': ['1', '2', '3', '4', '5'],
        'unique_id': ['ID1', 'ID2', 'ID3', 'ID4', 'ID5'],
        'has_length': ['1', '12', '123', '1234', '12345'],
        'no_length': ['', '1', 'no', '1232', '4.3']
    })
    length_fields = ['has_length']
    report_headers = ValidationManager.report_headers
    csv_schema = {
        'has_length': FileColumn(length=3),
        'no_length': FileColumn()
    }
    short_cols = {'has_length': 'len'}
    flex_data = pd.DataFrame({
        'row_number': ['1', '2', '3', '4', '5'],
        'concatted': ['A', 'B', 'C', 'D', 'E']
    })
    type_error_rows = ['5']

    error_msg = ValidationError.lengthErrorMsg
    error_type = ValidationError.lengthError
    # report_headers = ['Unique ID', 'Field Name', 'Error Message', 'Value Provided', 'Expected Value', 'Difference',
    #                   'Flex Field', 'Row Number', 'Rule Label'] + ['error_type']
    expected_data = [[
        'ID4', 'len', error_msg, 'len: 1234', 'Max length: 3', '', 'D', '4',
        '', error_type
    ]]
    expected_error_df = pd.DataFrame(expected_data,
                                     columns=report_headers + ['error_type'])
    error_df = validation_helper.check_length(data, length_fields,
                                              report_headers, csv_schema,
                                              short_cols, flex_data,
                                              type_error_rows)
    assert_frame_equal(error_df, expected_error_df)
Example #3
def test_valid_type():
    str_field = FileColumn(field_types_id=FIELD_TYPE_DICT['STRING'])
    int_field = FileColumn(field_types_id=FIELD_TYPE_DICT['INT'])
    csv_schema = {'str_field': str_field, 'int_field': int_field}

    # For more detailed tests, see is_valid_type
    row = {'Field Name': 'int_field', 'Value Provided': 'this is a string'}
    assert validation_helper.valid_type(row, csv_schema) is False
    row = {'Field Name': 'int_field', 'Value Provided': '1000'}
    assert validation_helper.valid_type(row, csv_schema) is True
Example #4
def test_expected_type():
    bool_field = FileColumn(field_types_id=FIELD_TYPE_DICT['BOOLEAN'])
    dec_field = FileColumn(field_types_id=FIELD_TYPE_DICT['DECIMAL'])
    csv_schema = {'bool_field': bool_field, 'dec_field': dec_field}

    row = {'Field Name': 'bool_field'}
    assert validation_helper.expected_type(
        row, csv_schema) == 'This field must be a boolean'
    row = {'Field Name': 'dec_field'}
    assert validation_helper.expected_type(
        row, csv_schema) == 'This field must be a decimal'
Example #5
def test_expected_length():
    length_field = FileColumn(length=5)
    non_length_field = FileColumn()
    csv_schema = {
        'length_field': length_field,
        'non_length_field': non_length_field
    }

    row = {'Field Name': 'length_field'}
    assert validation_helper.expected_length(row,
                                             csv_schema) == 'Max length: 5'
    row = {'Field Name': 'non_length_field'}
    assert validation_helper.expected_length(row,
                                             csv_schema) == 'Max length: None'
Example #6
def test_valid_length():
    length_field = FileColumn(length=5)
    non_length_field = FileColumn()
    csv_schema = {
        'length_field': length_field,
        'non_length_field': non_length_field
    }

    row = {
        'Field Name': 'length_field',
        'Value Provided': 'this is more than five characters'
    }
    assert validation_helper.valid_length(row, csv_schema) is False
    row = {'Field Name': 'length_field', 'Value Provided': 'four'}
    assert validation_helper.valid_length(row, csv_schema) is True
    row = {
        'Field Name': 'non_length_field',
        'Value Provided': 'can be any length'
    }
    assert validation_helper.valid_length(row, csv_schema) is True
Example #7
    def addColumnByFileType(self, fileType, fieldName, required, field_type):
        """
        Adds a new column to the schema

        Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

        fieldName -- The name of the schema column
        required -- marks the column as required if data is always required
        field_type -- sets the type of data allowed in the column

        Returns:
            ID of new column
        """
        fileId = self.getFileId(fileType)
        if (fileId is None):
            raise ValueError("Filetype does not exist")
        newColumn = FileColumn()
        newColumn.required = False
        newColumn.name = fieldName
        newColumn.file_id = fileId
        field_type = field_type.upper()

        types = self.getDataTypes()
        #Allow for other names
        if (field_type == "STR"):
            field_type = "STRING"
        elif (field_type == "FLOAT"):
            field_type = "DECIMAL"
        elif (field_type == "BOOL"):
            field_type = "BOOLEAN"

        #Check types
        if field_type in types:
            newColumn.field_types_id = types[field_type]
        else:
            raise ValueError("".join(
                ["Type ", field_type, " is not vaild for  ",
                 str(fieldName)]))
        #Check Required
        required = required.upper()
        if (required in ["TRUE", "FALSE"]):
            if (required == "TRUE"):
                newColumn.required = True
        else:
            raise ValueError("".join(
                ["Required is not boolean for ",
                 str(fieldName)]))
        # Save
        self.session.add(newColumn)
        self.session.commit()
        return newColumn.file_column_id
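
A minimal usage sketch for the method above. The interface object and the column name are hypothetical; the argument conventions (required passed as the string "TRUE"/"FALSE", type aliases such as "STR") come from the code itself.

# Hypothetical: 'interface' is any object exposing addColumnByFileType with a populated file-type table.
column_id = interface.addColumnByFileType(
    fileType="Award",         # must resolve via getFileId, otherwise ValueError("Filetype does not exist")
    fieldName="agency_name",  # hypothetical column name
    required="TRUE",          # string flag; anything other than TRUE/FALSE raises ValueError
    field_type="STR")         # alias, normalized to "STRING" before the type lookup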
Example #8
    def addColumnByFileType(self, fileType, fieldName, required, field_type):
        """
        Adds a new column to the schema

        Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

        fieldName -- The name of the schema column
        required -- marks the column as required if data is always required
        field_type -- sets the type of data allowed in the column

        Returns:
            ID of new column
        """
        fileId = self.getFileId(fileType)
        if (fileId is None):
            raise ValueError("Filetype does not exist")
        newColumn = FileColumn()
        newColumn.required = False
        newColumn.name = fieldName
        newColumn.file_id = fileId
        field_type = field_type.upper()

        types = self.getDataTypes()
        #Allow for other names
        if (field_type == "STR"):
            field_type = "STRING"
        elif (field_type == "FLOAT"):
            field_type = "DECIMAL"
        elif (field_type == "BOOL"):
            field_type = "BOOLEAN"

        #Check types
        if field_type in types:
            newColumn.field_types_id = types[field_type]
        else:
            raise ValueError("".join(
                ["Type ", field_type, " is not valid for ",
                 str(fieldName)]))
        #Check Required
        required = required.upper()
        if (required in ["TRUE", "FALSE"]):
            if (required == "TRUE"):
                newColumn.required = True
        else:
            raise ValueError("".join(
                ["Required is not boolean for ",
                 str(fieldName)]))
        # Save
        self.session.add(newColumn)
        self.session.commit()
        return newColumn.file_column_id
Example #9
    def addFileColumn(fileId, fieldTypeId, columnName, description, required,
                      session):
        """ Add information for one field

        Args:
            fileId: Which file this field is part of
            fieldTypeId: Data type found in this field
            columnName: Name of field
            description: Description of field
            required: True if field is required
            session: session object to be used for queries

        Returns:
            The FileColumn object that was created and committed
        """
        column = FileColumn(file_id=fileId,
                            field_types_id=fieldTypeId,
                            name=columnName,
                            description=description,
                            required=required)
        session.add(column)
        session.commit()
        return column
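
A hedged usage sketch for addFileColumn; the id values and the open session are assumptions borrowed from the other examples in this section (FIELD_TYPE_DICT and an SQLAlchemy session named sess).

# Assumed context: sess is an open SQLAlchemy session and the file/type ids already exist.
column = addFileColumn(fileId=1,                               # id of the target file type (assumed)
                       fieldTypeId=FIELD_TYPE_DICT['STRING'],
                       columnName='header_1',
                       description='First header of the file',
                       required=True,
                       session=sess)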
Example #10
    def setUpClass(cls):
        """Set up class-wide resources (test data)"""
        super(JobTests, cls).setUpClass()
        user = cls.userId

        # Flag for testing a million+ errors (can take ~30 min to run)
        cls.includeLongTests = False

        with createApp().app_context():
            # get the submission test user
            sess = GlobalDB.db().session

            # Create test submissions and jobs, also uploading
            # the files needed for each job.
            jobDict = {}

            submissionId = cls.insertSubmission(sess, user)
            job_info = Job(job_status_id=JOB_STATUS_DICT['ready'],
                           job_type_id=JOB_TYPE_DICT['file_upload'],
                           file_type_id=FILE_TYPE_DICT['appropriations'],
                           submission_id=submissionId)
            sess.add(job_info)
            sess.flush()
            jobDict['bad_upload'] = job_info.job_id

            submissionId = cls.insertSubmission(sess, user)
            job_info = Job(job_status_id=JOB_STATUS_DICT['ready'],
                           job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                           file_type_id=FILE_TYPE_DICT['appropriations'],
                           submission_id=submissionId)
            sess.add(job_info)
            sess.flush()
            jobDict['bad_prereq'] = job_info.job_id

            submissionId = cls.insertSubmission(sess, user)
            job_info = Job(job_status_id=JOB_STATUS_DICT['ready'],
                           job_type_id=JOB_TYPE_DICT['external_validation'],
                           file_type_id=FILE_TYPE_DICT['appropriations'],
                           submission_id=submissionId)
            sess.add(job_info)
            sess.flush()
            jobDict['wrong_type'] = job_info.job_id

            submissionId = cls.insertSubmission(sess, user)
            job_info = Job(job_status_id=JOB_STATUS_DICT['finished'],
                           job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                           file_type_id=FILE_TYPE_DICT['appropriations'],
                           submission_id=submissionId)
            sess.add(job_info)
            sess.flush()
            jobDict['not_ready'] = job_info.job_id

            submissionId = cls.insertSubmission(sess, user)
            job_info = Job(filename=cls.uploadFile('testEmpty.csv', user),
                           job_status_id=JOB_STATUS_DICT['ready'],
                           job_type_id=JOB_TYPE_DICT['csv_record_validation'],
                           file_type_id=FILE_TYPE_DICT['appropriations'],
                           submission_id=submissionId)
            sess.add(job_info)
            sess.flush()
            jobDict['empty'] = job_info.job_id

            # create dependency
            dependency = JobDependency(job_id=jobDict["bad_prereq"],
                                       prerequisite_id=jobDict["bad_upload"])
            sess.add(dependency)

            colIdDict = {}
            for fileId in range(1, 5):
                for columnId in range(1, 6):
                    if columnId < 3:
                        fieldType = FIELD_TYPE_DICT['INT']
                    else:
                        fieldType = FIELD_TYPE_DICT['STRING']
                    columnName = "header_{}".format(columnId)

                    fileCol = FileColumn(
                        file_id=fileId,
                        field_types_id=fieldType,
                        name=columnName,
                        required=(columnId != FIELD_TYPE_DICT['STRING']))
                    sess.add(fileCol)
                    sess.flush()
                    colIdDict["header_{}_file_type_{}".format(
                        columnId, fileId)] = fileCol.file_column_id

            # commit submissions/jobs and output IDs
            sess.commit()
            for job_type, job_id in jobDict.items():
                print('{}: {}'.format(job_type, job_id))

            cls.jobDict = jobDict
Example #11
    def addColumnByFileType(sess,
                            types,
                            fileType,
                            fieldName,
                            fieldNameShort,
                            required,
                            field_type,
                            paddedFlag="False",
                            fieldLength=None):
        """
        Adds a new column to the schema

        Args:
        fileType -- FileTypeValidation object this column belongs to
        fieldName -- The name of the schema column
        types -- List of field types
        fieldNameShort -- The machine-friendly, short column name
        required -- marks the column as required if data is always required
        field_type -- sets the type of data allowed in the column
        paddedFlag -- True if this column should be padded
        fieldLength -- Maximum allowed length for this field

        """
        newColumn = FileColumn()
        newColumn.file = fileType
        newColumn.required = False
        newColumn.name = fieldName
        newColumn.name_short = fieldNameShort
        field_type = field_type.upper()

        # Allow for other names
        if field_type == "STR":
            field_type = "STRING"
        elif field_type == "FLOAT":
            field_type = "DECIMAL"
        elif field_type == "BOOL":
            field_type = "BOOLEAN"

        # Translate padded flag to true or false
        if not paddedFlag:
            newColumn.padded_flag = False
        elif paddedFlag.lower() == "true":
            newColumn.padded_flag = True
        else:
            newColumn.padded_flag = False

        # Check types
        if field_type in types:
            newColumn.field_types_id = types[field_type]
        else:
            raise ValueError('Type {} is not valid for {}'.format(
                field_type, fieldName))

        # Check Required
        if required.lower() in ['true', 'false']:
            if required.lower() == 'true':
                newColumn.required = True
        else:
            raise ValueError(
                'Required field is not boolean for {}'.format(fieldName))

        # Add length if present
        if fieldLength is not None and str(fieldLength).strip() != "":
            lengthInt = int(str(fieldLength).strip())
            newColumn.length = lengthInt

        sess.add(newColumn)
Example #12
    def add_column_by_file_type(sess, types, file_type, field_name, field_name_short, required, field_type,
                                padded_flag="False", field_length=None):
        """
        Adds a new column to the schema

        Args:
        file_type -- FileType object this column belongs to
        field_name -- The name of the schema column
        types -- List of field types
        field_name_short -- The machine-friendly, short column name
        required -- marks the column as required if data is always required
        field_type -- sets the type of data allowed in the column
        padded_flag -- True if this column should be padded
        field_length -- Maximum allowed length for this field

        """
        new_column = FileColumn()
        new_column.file = file_type
        new_column.required = False
        new_column.name = field_name.lower().strip().replace(' ', '_')
        new_column.name_short = field_name_short.lower().strip().replace(' ', '_')
        field_type = field_type.upper()

        # Allow for other names
        if field_type == "STR":
            field_type = "STRING"
        elif field_type == "FLOAT":
            field_type = "DECIMAL"
        elif field_type == "BOOL":
            field_type = "BOOLEAN"

        # Translate padded flag to true or false
        if not padded_flag:
            new_column.padded_flag = False
        elif padded_flag.lower() == "true":
            new_column.padded_flag = True
        else:
            new_column.padded_flag = False

        # Check types
        if field_type in types:
            new_column.field_types_id = types[field_type]
        else:
            raise ValueError('Type {} is not valid for {}'.format(field_type, field_name))

        # Check Required
        if required.lower() in ['true', 'false']:
            if required.lower() == 'true':
                new_column.required = True
        else:
            raise ValueError('Required field is not boolean for {}'.format(field_name))

        # Add length if present
        if field_length is not None and str(field_length).strip() != "":
            length_int = int(str(field_length).strip())
            new_column.length = length_int

        sess.add(new_column)
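
A hedged sketch of calling add_column_by_file_type; the types mapping and the FileType object are assumptions (the mapping mirrors the ids set up in Example #13 below), and the function is shown as if called directly.

# Assumed: 'types' maps upper-case type names to ids, as the function expects.
types = {'STRING': 1, 'INT': 2, 'DECIMAL': 3, 'BOOLEAN': 4, 'LONG': 5}
add_column_by_file_type(sess, types,
                        file_type=appropriations_file_type,    # hypothetical FileType object
                        field_name='Budget Authority Amount',  # lower-cased and underscored by the function
                        field_name_short='budget_auth_amt',
                        required='true',                       # string flag checked against 'true'/'false'
                        field_type='float',                    # alias, upper-cased and mapped to DECIMAL
                        padded_flag='False',
                        field_length='21')                     # stripped and cast to int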
Example #13
    def setUpClass(cls):
        """Set up class-wide resources (test data)"""
        super(ValidatorTests, cls).setUpClass()
        #TODO: refactor into a pytest fixture

        # create test schema
        stringType = FieldType()
        stringType.field_type_id = 1
        stringType.name = "STRING"

        intType = FieldType()
        intType.field_type_id = 2
        intType.name = "INT"

        floatType = FieldType()
        floatType.field_type_id = 3
        floatType.name = "DECIMAL"

        booleanType = FieldType()
        booleanType.field_type_id = 4
        booleanType.name = "BOOLEAN"

        longType = FieldType()
        longType.field_type_id = 5
        longType.name = "LONG"

        column1 = FileColumn()
        column1.file_column_id = 1
        column1.name = "test1"
        column1.required = True
        column1.field_type = stringType
        column1.field_types_id = FIELD_TYPE_DICT[stringType.name]
        column1.file_id = 1

        column2 = FileColumn()
        column2.file_column_id = 2
        column2.name = "test2"
        column2.required = True
        column2.field_type = floatType
        column2.field_types_id = FIELD_TYPE_DICT[floatType.name]
        column2.file_id = 1

        column3 = FileColumn()
        column3.file_column_id = 3
        column3.name = "test3"
        column3.required = True
        column3.field_type = booleanType
        column3.field_types_id = FIELD_TYPE_DICT[booleanType.name]
        column3.file_id = 1

        column4 = FileColumn()
        column4.file_column_id = 3
        column4.name = "test4"
        column4.required = True
        column4.field_type = intType
        column4.field_types_id = FIELD_TYPE_DICT[intType.name]
        column4.file_id = 1

        column5 = FileColumn()
        column5.file_column_id = 3
        column5.name = "test5"
        column5.required = False
        column5.field_type = intType
        column5.field_types_id = FIELD_TYPE_DICT[intType.name]
        column5.file_id = 1

        column6 = FileColumn()
        column6.file_column_id = 6
        column6.name = "test6"
        column6.required = False
        column6.field_type = stringType
        column6.field_types_id = FIELD_TYPE_DICT[stringType.name]
        column6.file_id = 1

        column7 = FileColumn()
        column7.file_column_id = 7
        column7.name = "test7"
        column7.required = False
        column7.field_type = longType
        column7.field_types_id = FIELD_TYPE_DICT[longType.name]
        column7.file_id = 1

        cls.schema = {
            "test1": column1,
            "test2": column2,
            "test3": column3,
            "test4": column4,
            "test5": column5,
            "test6": column6,
            "test7": column7
        }
Example #14
def test_check_type():
    data = pd.DataFrame({
        'row_number': ['1', '2', '3', '4', '5'],
        'unique_id': ['ID1', 'ID2', 'ID3', 'ID4', 'ID5'],
        'int': ['1', '2', '3', 'no', '5'],
        'dec': ['1.3', '1', 'no', '1232', '4.3'],
        'bool': ['no', 'Yes', 'TRUE', 'false', '4'],
        'string': ['this', 'row', 'should', 'be', 'ignored']
    })
    type_fields = ['int', 'bool', 'dec']
    type_labels = {'int': 'Integer', 'dec': 'Decimal'}
    report_headers = ValidationManager.report_headers
    csv_schema = {
        'int': FileColumn(field_types_id=FIELD_TYPE_DICT['INT']),
        'bool': FileColumn(field_types_id=FIELD_TYPE_DICT['BOOLEAN']),
        'dec': FileColumn(field_types_id=FIELD_TYPE_DICT['DECIMAL'])
    }
    short_cols = {'int': 'i', 'bool': 'b'}
    flex_data = pd.DataFrame({
        'row_number': ['1', '2', '3', '4', '5'],
        'concatted': ['A', 'B', 'C', 'D', 'E']
    })
    is_fabs = False

    error_msg = ValidationError.typeErrorMsg
    error_type = ValidationError.typeError
    # report_headers = ['Unique ID', 'Field Name', 'Error Message', 'Value Provided', 'Expected Value', 'Difference',
    #                   'Flex Field', 'Row Number', 'Rule Label'] + ['error_type']
    expected_data = [[
        'ID4', 'i', error_msg, 'i: no', 'This field must be a int', '', 'D',
        '4', '', error_type
    ],
                     [
                         'ID5', 'b', error_msg, 'b: 4',
                         'This field must be a boolean', '', 'E', '5', '',
                         error_type
                     ],
                     [
                         'ID3', 'dec', error_msg, 'dec: no',
                         'This field must be a decimal', '', 'C', '3', '',
                         error_type
                     ]]
    expected_error_df = pd.DataFrame(expected_data,
                                     columns=report_headers + ['error_type'])
    error_df = validation_helper.check_type(data, type_fields, type_labels,
                                            report_headers, csv_schema,
                                            short_cols, flex_data, is_fabs)
    assert_frame_equal(error_df, expected_error_df)

    is_fabs = True
    expected_data = [[
        'ID4', 'i', error_msg, 'i: no', 'This field must be a int', '', 'D',
        '4', 'Integer', error_type
    ],
                     [
                         'ID5', 'b', error_msg, 'b: 4',
                         'This field must be a boolean', '', 'E', '5', '',
                         error_type
                     ],
                     [
                         'ID3', 'dec', error_msg, 'dec: no',
                         'This field must be a decimal', '', 'C', '3',
                         'Decimal', error_type
                     ]]
    expected_error_df = pd.DataFrame(expected_data,
                                     columns=report_headers + ['error_type'])
    error_df = validation_helper.check_type(data, type_fields, type_labels,
                                            report_headers, csv_schema,
                                            short_cols, flex_data, is_fabs)
    assert_frame_equal(error_df, expected_error_df)