def test_type_conversion(self):
    """Verify Validator.getType converts raw strings to the expected Python types."""
    # (input value, schema type name, expected Python type)
    conversions = [
        ("1234.0", "STRING", basestring),
        ("10", "INT", int),
        ("YES", "BOOLEAN", basestring),
        ("1234.2", "DECIMAL", Decimal),
        ("400000000001", "LONG", long),
    ]
    for value, type_name, expected_type in conversions:
        self.assertIsInstance(Validator.getType(value, type_name), expected_type)
def test_schema_optional_field(self):
    """Records with optional fields blank or altered still validate against the schema."""
    schema = self.schema
    base = {
        "test1": "hello",
        "test2": "1.0",
        "test3": "YES",
        "test4": "1",
        "test5": "1",
    }
    # Each overlay reproduces one state of the record from the original
    # cumulative mutations; all of them are expected to validate.
    overlays = [
        {},                            # fully populated record
        {"test5": ""},                 # optional field blank
        {"test5": "s"},                # optional field with non-numeric value
        {"test5": "", "test3": ""},    # two fields blank
    ]
    for overlay in overlays:
        record = dict(base)
        record.update(overlay)
        self.assertTrue(Validator.validate(record, schema))
def test_schema_optional_field(self):
    """Check validity of records as optional/required fields are blanked or mistyped."""
    schema = self.schema
    interfaces = self.interfaces
    base = {
        "test1": "hello",
        "test2": "1.0",
        "test3": "YES",
        "test4": "1",
        "test5": "1",
    }
    # (overlay applied to the base record, expected validity)
    cases = [
        ({}, True),
        ({"test5": ""}, True),
        ({"test5": "s"}, False),
        ({"test5": "", "test3": ""}, False),
    ]
    for overlay, expected in cases:
        record = dict(base)
        record.update(overlay)
        valid = Validator.validate(record, [], schema, "award", interfaces)[0]
        self.assertEqual(valid, expected)
def runCrossValidation(self, jobId, interfaces):
    """ Cross file validation job, test all rules with matching rule_timing """
    # Pull every multi-field rule tagged with cross-file timing
    rules = interfaces.validationDb.getMultiFieldRulesByTiming("cross-file")
    submissionId = interfaces.jobDb.getSubmissionId(jobId)
    # Evaluate those rules against the whole submission
    failures = Validator.crossValidate(rules, submissionId)
    errorDb = interfaces.errorDb
    errorFileName = self.getFileName(
        interfaces.jobDb.getCrossFileReportPath(submissionId))
    # Stream each failure into the cross-file error report and record it
    with self.getWriter(CONFIG_BROKER['aws_region'], CONFIG_BROKER['aws_bucket'],
                        errorFileName, self.crossFileReportHeaders) as writer:
        for failure in failures:
            writer.write(failure)
            errorDb.recordRowError(jobId, "cross_file", failure[0], failure[1], None)
        writer.finishBatch()
    errorDb.writeAllRowErrors(jobId)
def runCrossValidation(self, jobId, interfaces):
    """ Cross file validation job, test all rules with matching rule_timing """
    submissionId = interfaces.jobDb.getSubmissionId(jobId)
    # Evaluate every rule tagged with cross_file timing against this submission
    failures = Validator.crossValidate(
        interfaces.validationDb.getRulesByTiming("cross_file"), submissionId)
    errorDb = interfaces.errorDb
    reportPath = interfaces.jobDb.getCrossFileReportPath(submissionId)
    # Stream each failure into the cross-file error report and record it
    with self.getWriter(CONFIG_BROKER['aws_region'], CONFIG_BROKER['aws_bucket'],
                        self.getFileName(reportPath),
                        self.crossFileReportHeaders) as writer:
        for failure in failures:
            writer.write(failure)
            errorDb.recordRowError(jobId, "cross_file", failure[0], failure[1], None)
        writer.finishBatch()
    errorDb.writeAllRowErrors(jobId)
    # Cross-file job is complete once all failures are recorded
    interfaces.jobDb.markJobStatus(jobId, "finished")
def loadCsv(cls, filename, model, interface, fieldMap, fieldOptions):
    """ Loads a table based on a csv, replacing any existing contents.

    Args:
        filename: CSV to load
        model: ORM object for table to be loaded
        interface: interface to DB table is in
        fieldMap: dict that maps columns of the csv to attributes of the ORM object
        fieldOptions: dict with keys of attribute names, value contains a dict with
            options for that attribute. Current options are "pad_to_length" which if
            present will pad the field with leading zeros up to specified length, and
            "skip_duplicates" which ignores subsequent lines that repeat values.

    Raises:
        KeyError: if the CSV header contains a column not present in fieldMap
        ValueError: if a fieldMap column is missing from the CSV header
    """
    # Delete all records currently in table
    interface.session.query(model).delete()
    interface.session.commit()
    # Tracks values already seen for each "skip_duplicates" field
    valuePresent = {}
    # Open csv
    # NOTE(review): 'rU' mode is deprecated in Python 3 and removed in 3.11 —
    # confirm target interpreter before changing.
    with open(filename, 'rU') as csvfile:
        # Read header
        header = csvfile.readline()
        # Split header into fieldnames
        rawFieldNames = header.split(",")
        fieldNames = []
        # Clean field names
        for field in rawFieldNames:
            fieldNames.append(FieldCleaner.cleanString(field))
        # Map fieldnames to attribute names
        attributeNames = []
        for field in fieldNames:
            if field in fieldMap:
                attributeNames.append(fieldMap[field])
                if fieldMap[field] in fieldOptions and "skip_duplicates" in fieldOptions[fieldMap[field]]:
                    # Create empty dict for this field
                    valuePresent[fieldMap[field]] = {}
            else:
                raise KeyError("".join(["Found unexpected field ", str(field)]))
        # Check that all fields are present
        for field in fieldMap:
            if not field in fieldNames:
                raise ValueError("".join([str(field), " is required for loading table ", str(type(model))]))
        # Open DictReader with attribute names
        reader = csv.DictReader(csvfile, fieldnames=attributeNames)
        # For each row, create instance of model and add it
        for row in reader:
            skipInsert = False
            for field in fieldOptions:
                # For each field with options present, modify according to those options
                options = fieldOptions[field]
                if "pad_to_length" in options:
                    padLength = options["pad_to_length"]
                    row[field] = Validator.padToLength(row[field], padLength)
                if "skip_duplicates" in options:
                    if len(row[field].strip()) == 0 or row[field] in valuePresent[field]:
                        # Value not provided or already exists, skip it
                        skipInsert = True
                    else:
                        # Insert new value
                        valuePresent[field][row[field]] = True
            record = model(**row)
            if not skipInsert:
                try:
                    interface.session.merge(record)
                except IntegrityError as e:
                    # Hit a duplicate value that violates index, skip this one
                    print("".join(["Warning: Skipping this row: ", str(row)]))
                    print("".join(["Due to error: ", str(e)]))
                    interface.session.rollback()
                    continue
        # Commit all merged rows in one transaction
        interface.session.commit()
def loadCsv(cls, filename, model, interface, fieldMap, fieldOptions):
    """ Loads a table based on a csv, replacing any existing contents.

    Args:
        filename: CSV to load
        model: ORM object for table to be loaded
        interface: interface to DB table is in
        fieldMap: dict that maps columns of the csv to attributes of the ORM object
        fieldOptions: dict with keys of attribute names, value contains a dict with
            options for that attribute. Current options are "pad_to_length" which if
            present will pad the field with leading zeros up to specified length, and
            "skip_duplicates" which ignores subsequent lines that repeat values.

    Raises:
        KeyError: if the CSV header contains a column not present in fieldMap
        ValueError: if a fieldMap column is missing from the CSV header
    """
    # Delete all records currently in table
    interface.session.query(model).delete()
    interface.session.commit()
    # Tracks values already seen for each "skip_duplicates" field
    valuePresent = {}
    # Open csv
    # NOTE(review): 'rU' mode is deprecated in Python 3 and removed in 3.11 —
    # confirm target interpreter before changing.
    with open(filename, 'rU') as csvfile:
        # Read header
        header = csvfile.readline()
        # Split header into fieldnames
        rawFieldNames = header.split(",")
        fieldNames = []
        # Clean field names
        for field in rawFieldNames:
            fieldNames.append(FieldCleaner.cleanString(field))
        # Map fieldnames to attribute names
        attributeNames = []
        for field in fieldNames:
            if field in fieldMap:
                attributeNames.append(fieldMap[field])
                if fieldMap[field] in fieldOptions and "skip_duplicates" in fieldOptions[fieldMap[field]]:
                    # Create empty dict for this field
                    valuePresent[fieldMap[field]] = {}
            else:
                raise KeyError("".join(["Found unexpected field ", str(field)]))
        # Check that all fields are present
        for field in fieldMap:
            if not field in fieldNames:
                raise ValueError("".join([
                    str(field), " is required for loading table ", str(type(model))
                ]))
        # Open DictReader with attribute names
        reader = csv.DictReader(csvfile, fieldnames=attributeNames)
        # For each row, create instance of model and add it
        for row in reader:
            skipInsert = False
            for field in fieldOptions:
                # For each field with options present, modify according to those options
                options = fieldOptions[field]
                if "pad_to_length" in options:
                    padLength = options["pad_to_length"]
                    row[field] = Validator.padToLength(row[field], padLength)
                if "skip_duplicates" in options:
                    if len(row[field].strip()) == 0 or row[field] in valuePresent[field]:
                        # Value not provided or already exists, skip it
                        skipInsert = True
                    else:
                        # Insert new value
                        valuePresent[field][row[field]] = True
            record = model(**row)
            if not skipInsert:
                try:
                    interface.session.merge(record)
                except IntegrityError as e:
                    # Hit a duplicate value that violates index, skip this one
                    print("".join(["Warning: Skipping this row: ", str(row)]))
                    print("".join(["Due to error: ", str(e)]))
                    interface.session.rollback()
                    continue
        # Commit all merged rows in one transaction
        interface.session.commit()
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job: stream the file row by row, write
    passing rows to a staging table and failures to an error report.

    Args:
        jobId: Job to be validated
        interfaces: InterfaceHolder providing jobDb, errorDb, validationDb, stagingDb

    Returns:
        True if successful
    """
    jobTracker = interfaces.jobDb
    # Row 1 is the header; data rows are counted from 2
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if(self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    if(CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize("errors/" + jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        reader.openFile(regionName, bucketName, fileName, fieldList, bucketName, errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType, fileName, jobId, tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer:
            while(not reader.isFinished):
                rowNumber += 1
                #if (rowNumber % 1000) == 0:
                #    print("Validating row " + str(rowNumber))
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB)
                    record["row"] = rowNumber
                    if(reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1  # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        # Last line may be blank, don't record an error;
                        # reader.extraLine indicates a case where the last valid
                        # line has extra line breaks
                        # Don't count last row if empty
                        rowNumber -= 1
                    else:
                        writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""])
                        errorInterface.recordRowError(jobId, self.filename, "Formatting Error", ValidationError.readError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(record, rules, csvSchema, fileType, interfaces)
                if(valid):
                    try:
                        tableObject.insert(record, fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write(["Formatting Error", ValidationError.writeErrorMsg, str(rowNumber), ""])
                        errorInterface.recordRowError(jobId, self.filename, "Formatting Error", ValidationError.writeError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([fieldName, errorMsg, str(rowNumber), failedValue])
                        errorInterface.recordRowError(jobId, self.filename, fieldName, error, rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId, rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        # ensure the file always closes
        reader.close()
    return True
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job: stream the file row by row, write
    passing rows to a staging table and failures to an error report.

    Args:
        jobId: Job to be validated
        interfaces: InterfaceHolder providing jobDb, errorDb, validationDb, stagingDb

    Returns:
        True if successful
    """
    jobTracker = interfaces.jobDb
    # Row 1 is the header; data rows are counted from 2
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if (self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    if (CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize(
            "errors/" + jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        reader.openFile(regionName, bucketName, fileName, fieldList,
                        bucketName, errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType, fileName, jobId, tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName,
                            self.reportHeaders) as writer:
            while (not reader.isFinished):
                rowNumber += 1
                #if (rowNumber % 1000) == 0:
                #    print("Validating row " + str(rowNumber))
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(),
                                                   fileType, validationDB)
                    record["row"] = rowNumber
                    if (reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1  # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        # Last line may be blank, don't record an error;
                        # reader.extraLine indicates a case where the last valid
                        # line has extra line breaks
                        # Don't count last row if empty
                        rowNumber -= 1
                    else:
                        writer.write([
                            "Formatting Error", ValidationError.readErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.readError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(
                    record, rules, csvSchema, fileType, interfaces)
                if (valid):
                    try:
                        tableObject.insert(record, fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write([
                            "Formatting Error", ValidationError.writeErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.writeError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([
                            fieldName, errorMsg, str(rowNumber), failedValue
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, fieldName, error, rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId, rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        # ensure the file always closes
        reader.close()
    return True
def runCrossValidation(self, jobId, interfaces):
    """ Cross file validation job: run every SQL cross-file rule against each
    valid pair of file types in the submission and write error/warning reports.

    Args:
        jobId: ID of the cross-file validation job
        interfaces: InterfaceHolder providing jobDb, errorDb, validationDb
    """
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId)
    validationDb = interfaces.validationDb
    errorDb = interfaces.errorDb
    submissionId = interfaces.jobDb.getSubmissionId(jobId)
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    CloudLogger.logError(
        "VALIDATOR_INFO: ",
        "Beginning runCrossValidation on submissionID: " + str(submissionId), "")
    # Delete existing cross file errors for this submission
    errorDb.resetErrorsByJobId(jobId)
    # use db to get a list of the cross-file combinations
    # (file_order < file_order gives each unordered pair exactly once)
    targetFiles = validationDb.session.query(FileTypeValidation).subquery()
    crossFileCombos = validationDb.session.query(
        FileTypeValidation.name.label('first_file_name'),
        FileTypeValidation.file_id.label('first_file_id'),
        targetFiles.c.name.label('second_file_name'),
        targetFiles.c.file_id.label('second_file_id')).filter(
            FileTypeValidation.file_order < targetFiles.c.file_order)
    # get all cross file rules from db
    crossFileRules = validationDb.session.query(RuleSql).filter(
        RuleSql.rule_cross_file_flag == True)
    # for each cross-file combo, run associated rules and create error report
    for row in crossFileCombos:
        # Rules applying to this pair in either direction
        comboRules = crossFileRules.filter(
            or_(
                and_(RuleSql.file_id == row.first_file_id,
                     RuleSql.target_file_id == row.second_file_id),
                and_(RuleSql.file_id == row.second_file_id,
                     RuleSql.target_file_id == row.first_file_id)))
        # send comboRules to validator.crossValidate sql
        failures = Validator.crossValidateSql(comboRules.all(), submissionId)
        # get error file name
        reportFilename = self.getFileName(
            getCrossReportName(submissionId, row.first_file_name,
                               row.second_file_name))
        warningReportFilename = self.getFileName(
            getCrossWarningReportName(submissionId, row.first_file_name,
                                      row.second_file_name))
        # loop through failures to create the error report
        # failure[9] is the rule severity id; failure[0:7] are the report columns
        with self.getWriter(regionName, bucketName, reportFilename,
                            self.crossFileReportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningReportFilename,
                               self.crossFileReportHeaders) as warningWriter:
            for failure in failures:
                if failure[9] == interfaces.validationDb.getRuleSeverityId("fatal"):
                    writer.write(failure[0:7])
                if failure[9] == interfaces.validationDb.getRuleSeverityId("warning"):
                    warningWriter.write(failure[0:7])
                errorDb.recordRowError(jobId, "cross_file", failure[0],
                                       failure[3], failure[5], failure[6],
                                       failure[7], failure[8],
                                       severity_id=failure[9])
            writer.finishBatch()
            warningWriter.finishBatch()
    errorDb.writeAllRowErrors(jobId)
    interfaces.jobDb.markJobStatus(jobId, "finished")
    CloudLogger.logError(
        "VALIDATOR_INFO: ",
        "Completed runCrossValidation on submissionID: " + str(submissionId), "")
    # Update error info for submission
    interfaces.jobDb.populateSubmissionErrorInfo(submissionId)
    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their submissions
    # as publishable
    # Publish only if no errors are present
    if interfaces.jobDb.getSubmissionById(submissionId).number_of_errors == 0:
        interfaces.jobDb.setPublishableFlag(submissionId, True)
    # Mark validation complete
    interfaces.errorDb.markFileComplete(jobId)
def runSqlValidations(self, interfaces, jobId, fileType, shortColnames, writer,
                      warningWriter, rowNumber):
    """ Run all SQL rules for this file type

    Args:
        interfaces: InterfaceHolder object
        jobId: ID of current job
        fileType: Type of file for current job
        shortColnames: Dict mapping short field names to long
        writer: CsvWriter object
        warningWriter: CsvWriter for warnings
        rowNumber: Current row number

    Returns:
        a list of the row numbers that failed one of the sql-based validations
    """
    errorInterface = interfaces.errorDb
    errorRows = []
    # Look up the severity IDs once instead of hitting the validation DB
    # for every failure inside the loop
    fatalSeverityId = interfaces.validationDb.getRuleSeverityId("fatal")
    warningSeverityId = interfaces.validationDb.getRuleSeverityId("warning")
    sqlFailures = Validator.validateFileBySql(
        interfaces.jobDb.getSubmissionId(jobId), fileType, interfaces)
    for failure in sqlFailures:
        # convert shorter, machine friendly column names used in the
        # SQL validation queries back to their long names
        if failure[0] in shortColnames:
            fieldName = shortColnames[failure[0]]
        else:
            fieldName = failure[0]
        error = failure[1]
        failedValue = failure[2]
        row = failure[3]
        original_label = failure[4]
        fileTypeId = failure[5]
        targetFileId = failure[6]
        severityId = failure[7]
        # Only fatal failures count toward the failed-row list
        if severityId == fatalSeverityId:
            errorRows.append(row)
        try:
            # If error is an int, it's one of our prestored messages
            errorType = int(error)
            errorMsg = ValidationError.getErrorMessage(errorType)
        except ValueError:
            # If not, treat it literally
            errorMsg = error
        if severityId == fatalSeverityId:
            writer.write([fieldName, errorMsg, str(row), failedValue, original_label])
        elif severityId == warningSeverityId:
            # write to warnings file
            warningWriter.write([fieldName, errorMsg, str(row), failedValue, original_label])
        errorInterface.recordRowError(jobId, self.filename, fieldName, error,
                                      rowNumber, original_label,
                                      file_type_id=fileTypeId,
                                      target_file_id=targetFileId,
                                      severity_id=severityId)
    return errorRows
def test_schema_rules(self):
    """Test schema rules."""
    def _rule_type(name):
        # Build a RuleType with the given name
        rt = RuleType()
        rt.name = name
        return rt

    def _rule(type_name, column, text1, text2=None):
        # Build a single-field Rule with standard timing
        r = Rule()
        r.rule_type = _rule_type(type_name)
        r.file_column = column
        r.rule_text_1 = text1
        if text2 is not None:
            r.rule_text_2 = text2
        r.rule_timing_id = 1
        return r

    schema = self.schema
    interfaces = self.interfaces
    rules = [
        _rule("EQUAL", schema["test1"], "hello"),
        _rule("NOT EQUAL", schema["test1"], "bye"),
        _rule("LENGTH", schema["test1"], "6"),
        _rule("EQUAL", schema["test3"], "YES"),
        _rule("EQUAL", schema["test4"], "44"),
        _rule("LESS", schema["test4"], "45"),
        _rule("GREATER", schema["test2"], ".5"),
        _rule("IN_SET", schema["test6"], "X, F, A"),
        _rule("SUM", schema["test2"], "test7", "test2,test4,test5"),
    ]
    # Multi-field rule: sum of listed fields compared against a fixed value
    sumToValueType = MultiFieldRuleType()
    sumToValueType.name = "SUM_TO_VALUE"
    rule10 = MultiFieldRule()
    rule10.rule_type = sumToValueType
    rule10.rule_text_1 = "46"
    rule10.rule_text_2 = "test2,test4,test5"
    rule10.rule_timing_id = 1
    vvi = ValidatorValidationInterface()
    fileId = vvi.getFileId("award")
    vvi.addMultiFieldRule(fileId, "SUM_TO_VALUE", rule10.rule_text_1,
                          rule10.rule_text_2,
                          "Evaluates the sum of fields to a number")
    # A record satisfying every rule
    record = {
        "test1": "hello", "test2": "1.0", "test3": "YES", "test4": "44",
        "test5": "1", "test6": "X", "test7": "46"
    }
    self.assertTrue(Validator.validate(
        record, rules, schema, "award", self.interfaces)[0])
    # A record violating each of rules 3-9, and therefore the full set too
    record = {
        "test1": "goodbye", "test2": ".4", "test3": "NO", "test4": "45",
        "test5": "1", "test6": "Q", "test7": "46.5"
    }
    failing_rule_sets = [[rules[2]], [rules[3]], [rules[4]], [rules[5]],
                         [rules[6]], [rules[7]], [rules[8]], rules]
    for rule_set in failing_rule_sets:
        self.assertFalse(Validator.validate(
            record, rule_set, schema, "award", interfaces)[0])
def run_validation(self, job):
    """ Run validations for specified job: clear old results, stream the file
    row by row through formatting/schema checks into the staging table, then
    run the SQL-based rules and record all errors.

    Args:
        job: Job to be validated

    Returns:
        True if successful

    Raises:
        ResponseException: on a bad file extension or a row-count mismatch
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Accumulates row-level errors; flushed to the DB at the end of the run
    error_list = ErrorInterface()
    submission_id = job.submission_id
    # Row 1 is the header; data rows are counted from 2
    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()
    logger.info({
        'message': 'Beginning run_validation on submission_id: ' +
                   str(submission_id) + ', job_id: ' + str(job_id) +
                   ', file_type: ' + file_type,
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validations',
        'status': 'start',
        'start_time': validation_start})
    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]
    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()
    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()
    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()
    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']
    error_file_name = self.get_file_name(
        report_file_name(job.submission_id, False, job.file_type.name))
    warning_file_name = self.get_file_name(
        report_file_name(job.submission_id, True, job.file_type.name))
    # Create File Status object
    create_file_if_needed(job_id, file_name)
    reader = self.get_reader()
    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()
    # Get fields for this file
    fields = sess.query(FileColumn).filter(
        FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()
    # Detach field rows from the session so later commits don't expire them
    for field in fields:
        sess.expunge(field)
    csv_schema = {row.name_short: row for row in fields}
    try:
        extension = os.path.splitext(file_name)[1]
        if not extension or extension not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None,
                                    ValidationError.fileTypeError)
        # Count file rows: throws a File Level Error for non-UTF8 characters
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name),
                         encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass
        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields,
                         bucket_name, error_file_name,
                         self.long_to_short_dict, is_local=self.isLocal)
        # list to keep track of rows that fail validations
        error_rows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        loading_start = datetime.now()
        logger.info({
            'message': 'Beginning data loading on submission_id: ' +
                       str(submission_id) + ', job_id: ' + str(job_id) +
                       ', file_type: ' + file_type,
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'data_loading',
            'status': 'start',
            'start_time': loading_start})
        with self.get_writer(region_name, bucket_name, error_file_name,
                             self.reportHeaders) as writer, \
                self.get_writer(region_name, bucket_name, warning_file_name,
                                self.reportHeaders) as warning_writer:
            while not reader.is_finished:
                row_number += 1
                if row_number % 100 == 0:
                    # Periodic progress logging
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({
                        'message': 'Loading row: ' + str(row_number) +
                                   ' on submission_id: ' + str(submission_id) +
                                   ', job_id: ' + str(job_id) +
                                   ', file_type: ' + file_type,
                        'message_type': 'ValidatorInfo',
                        'submission_id': submission_id,
                        'job_id': job_id,
                        'file_type': file_type,
                        'action': 'data_loading',
                        'status': 'loading',
                        'rows_loaded': row_number,
                        'start_time': loading_start,
                        'elapsed_time': elapsed_time})
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, writer, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type in ["detached_award"]:
                        # Derived unique key for detached award records
                        record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + \
                            (record['awarding_sub_tier_agency_c'] or '-none-') + \
                            (record['fain'] or '-none-') + (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(
                        record, csv_schema, file_type in ["detached_award"])
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type in ["detached_award"]:
                        record["is_valid"] = True
                    model_instance = model(job_id=job_id,
                                           submission_id=submission_id,
                                           valid_record=passed_validations,
                                           **record)
                    skip_row = not insert_staging_model(model_instance, job, writer, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()
                    if skip_row:
                        error_rows.append(row_number)
                        continue
                if not passed_validations:
                    fatal = write_errors(failures, job, self.short_to_long_dict,
                                         writer, warning_writer, row_number,
                                         error_list)
                    if fatal:
                        error_rows.append(row_number)
            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({
                'message': 'Completed data loading on submission_id: ' +
                           str(submission_id) + ', job_id: ' + str(job_id) +
                           ', file_type: ' + file_type,
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'data_loading',
                'status': 'finish',
                'start_time': loading_start,
                'end_time': datetime.now(),
                'duration': loading_duration,
                'total_rows': row_number})
            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(
                job, file_type, self.short_to_long_dict, writer,
                warning_writer, row_number, error_list)
            error_rows.extend(sql_error_rows)
            # Write unfinished batch
            writer.finish_batch()
            warning_writer.finish_batch()
        # Calculate total number of rows in file
        # that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)
        # Update detached_award is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type in ["detached_award"]:
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date,
                        "reporting_end_date": max_action_date},
                       synchronize_session=False)
        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None,
                                    ValidationError.rowCountError)
        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()
        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)
        if file_type in ["detached_award"]:
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)
        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the file always closes
        reader.close()
        validation_duration = (datetime.now() - validation_start).total_seconds()
        logger.info({
            'message': 'Completed run_validation on submission_id: ' +
                       str(submission_id) + ', job_id: ' + str(job_id) +
                       ', file_type: ' + file_type,
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'run_validation',
            'status': 'finish',
            'start_time': validation_start,
            'end_time': datetime.now(),
            'duration': validation_duration})
    return True
def test_types(self):
    """Exercise Validator.checkType across representative value/type pairs.

    Covers a non-numeric string, the empty string (accepted for every
    type), a zero-padded integer, and decimal-formatted numbers.
    """
    # (value, declared data type, expected checkType result)
    cases = [
        ("1234Test", "STRING", True),
        ("1234Test", "INT", False),
        ("1234Test", "DECIMAL", False),
        ("1234Test", "BOOLEAN", False),
        ("1234Test", "LONG", False),
        ("", "STRING", True),
        ("", "INT", True),
        ("", "DECIMAL", True),
        ("", "BOOLEAN", True),
        ("", "LONG", True),
        ("01234", "STRING", True),
        ("01234", "INT", True),
        ("01234", "DECIMAL", True),
        ("01234", "LONG", True),
        ("01234", "BOOLEAN", False),
        ("1234.0", "STRING", True),
        ("1234.0", "INT", False),
        ("1234.00", "DECIMAL", True),
        ("1234.0", "LONG", False),
        ("1234.0", "BOOLEAN", False),
    ]
    for value, data_type, expected in cases:
        outcome = Validator.checkType(value, data_type)
        if expected:
            self.assertTrue(outcome)
        else:
            self.assertFalse(outcome)
def runValidation(self, job):
    """ Run validations for specified job

    Reads the job's submitted file row by row and applies three phases of
    validation: (1) read/formatting checks, (2) basic schema checks
    (skipped for D files), and (3) SQL-based rules. Failing rows are
    written to error/warning reports and the job's row counts are updated.

    Args:
        job: Job to be validated

    Returns:
        True if successful
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    error_list = ErrorInterface()
    _exception_logger.info(
        'VALIDATOR_INFO: Beginning runValidation on job_id: %s', job_id)
    submission_id = job.submission_id
    # Row 1 is the header; the counter is incremented before each data row
    # (see totalRowsExcludingHeader below).
    rowNumber = 1
    fileType = job.file_type.name
    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == fileType][0]
    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()
    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = job.filename
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(get_report_path(job, 'error'))
    warningFileName = self.getFileName(get_report_path(job, 'warning'))
    # Create File Status object
    createFileIfNeeded(job_id, fileName)
    reader = self.getReader()
    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        fileSize = s3UrlHandler.getFileSize(errorFileName)
    else:
        fileSize = os.path.getsize(fileName)
    job.file_size = fileSize
    sess.commit()
    # Get fields for this file
    fields = sess.query(FileColumn). \
        filter(FileColumn.file_id == FILE_TYPE_DICT[fileType]). \
        all()
    # Detach field rows from the session so later commits don't touch them
    for field in fields:
        sess.expunge(field)
    csvSchema = {row.name_short: row for row in fields}
    try:
        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(regionName, bucketName, fileName, fields,
                         bucketName, errorFileName, self.long_to_short_dict)
        # list to keep track of rows that fail validations
        errorRows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningFileName, self.reportHeaders) as warningWriter:
            while not reader.is_finished:
                rowNumber += 1
                if rowNumber % 10 == 0:
                    logger.info('loading row %s', rowNumber)
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skipRow, doneReading, rowErrorHere, flex_cols) = \
                    self.readRecord(reader, writer, fileType, rowNumber, job, fields, error_list)
                if reduceRow:
                    rowNumber -= 1
                if rowErrorHere:
                    errorRows.append(rowNumber)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skipRow:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic validations,
                # so these validations are not repeated here
                if fileType in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passedValidations = True
                    valid = True
                else:
                    passedValidations, failures, valid = Validator.validate(record, csvSchema)
                if valid:
                    skipRow = self.writeToStaging(record, job, submission_id, passedValidations,
                                                  writer, rowNumber, model, error_list)
                    if flex_cols:
                        self.write_to_flex(flex_cols, job_id, submission_id, fileType)
                    if skipRow:
                        errorRows.append(rowNumber)
                        continue
                if not passedValidations:
                    # writeErrors returns truthy when a fatal error was recorded
                    if self.writeErrors(failures, job, self.short_to_long_dict, writer,
                                        warningWriter, rowNumber, error_list):
                        errorRows.append(rowNumber)
            _exception_logger.info(
                'VALIDATOR_INFO: Loading complete on job_id: %s. '
                'Total rows added to staging: %s', job_id, rowNumber)
            if fileType in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sqlErrorRows = self.runSqlValidations(job, fileType, self.short_to_long_dict,
                                                  writer, warningWriter, rowNumber, error_list)
            errorRows.extend(sqlErrorRows)
            # Write unfinished batch
            writer.finishBatch()
            warningWriter.finishBatch()
        # Calculate total number of rows in file
        # that passed validations
        errorRowsUnique = set(errorRows)
        totalRowsExcludingHeader = rowNumber - 1
        validRows = totalRowsExcludingHeader - len(errorRowsUnique)
        # Update job metadata
        job.number_of_rows = rowNumber
        job.number_of_rows_valid = validRows
        sess.commit()
        error_list.writeAllRowErrors(job_id)
        # Update error info for submission
        populateSubmissionErrorInfo(submission_id)
        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        markFileComplete(job_id, fileName)
    finally:
        # Ensure the file always closes
        reader.close()
        _exception_logger.info(
            'VALIDATOR_INFO: Completed L1 and SQL rule validations on '
            'job_id: %s', job_id)
    return True
def runCrossValidation(self, job):
    """ Cross file validation job, test all rules with matching rule_timing

    For each pair of related files in the submission, runs the cross-file
    SQL rules that reference that pair, writes fatal failures to the error
    report and warnings to the warning report, then updates the
    submission's error/warning totals.

    Args:
        job: cross-file validation Job for this submission
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Create File Status object
    createFileIfNeeded(job_id)
    error_list = ErrorInterface()
    submission_id = job.submission_id
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    _exception_logger.info(
        'VALIDATOR_INFO: Beginning runCrossValidation on submission_id: '
        '%s', submission_id)
    # Delete existing cross file errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()
    # get all cross file rules from db
    crossFileRules = sess.query(RuleSql).filter(RuleSql.rule_cross_file_flag == True)
    # for each cross-file combo, run associated rules and create error report
    for c in get_cross_file_pairs():
        first_file = c[0]
        second_file = c[1]
        # rules may reference the pair in either direction (A->B or B->A)
        comboRules = crossFileRules.filter(
            or_(and_(RuleSql.file_id == first_file.id,
                     RuleSql.target_file_id == second_file.id),
                and_(RuleSql.file_id == second_file.id,
                     RuleSql.target_file_id == first_file.id)))
        # send comboRules to validator.crossValidate sql
        failures = Validator.crossValidateSql(comboRules.all(), submission_id, self.short_to_long_dict)
        # get error file name
        reportFilename = self.getFileName(
            get_cross_report_name(submission_id, first_file.name, second_file.name))
        warningReportFilename = self.getFileName(
            get_cross_warning_report_name(submission_id, first_file.name, second_file.name))
        # loop through failures to create the error report
        with self.getWriter(regionName, bucketName, reportFilename, self.crossFileReportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningReportFilename,
                               self.crossFileReportHeaders) as warningWriter:
            for failure in failures:
                # failure[9] carries the rule's severity id
                if failure[9] == RULE_SEVERITY_DICT['fatal']:
                    writer.write(failure[0:7])
                if failure[9] == RULE_SEVERITY_DICT['warning']:
                    warningWriter.write(failure[0:7])
                error_list.recordRowError(job_id, "cross_file", failure[0], failure[3],
                                          failure[5], failure[6], failure[7], failure[8],
                                          severity_id=failure[9])
            writer.finishBatch()
            warningWriter.finishBatch()
    error_list.writeAllRowErrors(job_id)
    mark_job_status(job_id, "finished")
    _exception_logger.info(
        'VALIDATOR_INFO: Completed runCrossValidation on submission_id: '
        '%s', submission_id)
    submission = sess.query(Submission).filter_by(submission_id=submission_id).one()
    # Update error info for submission
    submission.number_of_errors = sumNumberOfErrorsForJobList(submission_id)
    submission.number_of_warnings = sumNumberOfErrorsForJobList(submission_id, errorType="warning")
    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their submissions
    # as publishable
    # Publish only if no errors are present
    if submission.number_of_errors == 0:
        submission.publishable = True
    sess.commit()
    # Mark validation complete
    markFileComplete(job_id)
def runSqlValidations(self, job, file_type, short_colnames, writer, warning_writer, row_number, error_list):
    """ Run all SQL rules for this file type

    Executes the SQL-based validation rules against the staged data for
    this job's submission, routes each failure to the error or warning
    report based on its severity, and records it in the error interface.

    Args:
        job: Current job
        file_type: Type of file for current job
        short_colnames: Dict mapping short field names to long
        writer: CsvWriter object
        warning_writer: CsvWriter for warnings
        row_number: Current row number
        error_list: instance of ErrorInterface to keep track of errors

    Returns:
        a list of the row numbers that failed one of the sql-based validations
    """
    # Fix: removed an unused local (`sess = GlobalDB.db().session`) that was
    # assigned but never referenced in this method.
    job_id = job.job_id
    error_rows = []
    sql_failures = Validator.validateFileBySql(job.submission_id, file_type, self.short_to_long_dict)
    for failure in sql_failures:
        # convert shorter, machine friendly column names used in the
        # SQL validation queries back to their long names
        if failure[0] in short_colnames:
            field_name = short_colnames[failure[0]]
        else:
            field_name = failure[0]
        error = failure[1]
        failed_value = failure[2]
        row = failure[3]
        original_label = failure[4]
        file_type_id = failure[5]
        target_file_id = failure[6]
        severity_id = failure[7]
        # only fatal failures count against the row totals
        if severity_id == RULE_SEVERITY_DICT['fatal']:
            error_rows.append(row)
        try:
            # If error is an int, it's one of our prestored messages
            error_type = int(error)
            error_msg = ValidationError.getErrorMessage(error_type)
        except ValueError:
            # If not, treat it literally
            error_msg = error
        if severity_id == RULE_SEVERITY_DICT['fatal']:
            writer.write([field_name, error_msg, str(row), failed_value, original_label])
        elif severity_id == RULE_SEVERITY_DICT['warning']:
            # write to warnings file
            warning_writer.write([field_name, error_msg, str(row), failed_value, original_label])
        # NOTE(review): this passes the aggregate `row_number` parameter, not
        # the per-failure `row` extracted above — confirm that is intended.
        error_list.recordRowError(job_id, job.filename, field_name, error, row_number,
                                  original_label, file_type_id=file_type_id,
                                  target_file_id=target_file_id, severity_id=severity_id)
    return error_rows
def test_schema_rules(self):
    """Test schema rules."""
    schema = self.schema
    interfaces = self.interfaces

    # One shared RuleType instance per rule name used below.
    rule_types = {}
    for type_name in ("LESS", "GREATER", "LENGTH", "EQUAL",
                      "NOT EQUAL", "IN_SET", "SUM", "SUM_TO_VALUE"):
        rt = RuleType()
        rt.name = type_name
        rule_types[type_name] = rt

    def build_rule(type_name, column, text_one, timing, text_two=None):
        # Assemble a Rule from its type, optional target column, and texts.
        new_rule = Rule()
        new_rule.rule_type = rule_types[type_name]
        if column is not None:
            new_rule.file_column = column
        new_rule.rule_text_1 = text_one
        if text_two is not None:
            new_rule.rule_text_2 = text_two
        new_rule.rule_timing_id = timing
        return new_rule

    rules = [
        build_rule("EQUAL", schema["test1"], "hello", 1),
        build_rule("NOT EQUAL", schema["test1"], "bye", 1),
        build_rule("LENGTH", schema["test1"], "6", 1),
        build_rule("EQUAL", schema["test3"], "YES", 1),
        build_rule("EQUAL", schema["test4"], "44", 1),
        build_rule("LESS", schema["test4"], "45", 1),
        build_rule("GREATER", schema["test2"], ".5", 1),
        build_rule("IN_SET", schema["test6"], "X, F, A", 1),
        build_rule("SUM", schema["test2"], "test7", 1, text_two="test2,test4,test5"),
    ]
    # SUM_TO_VALUE rule has no file column and is stored via the validation
    # interface rather than passed in the rules list.
    sum_to_value_rule = build_rule("SUM_TO_VALUE", None, "46", 4, text_two="test2,test4,test5")

    vvi = ValidatorValidationInterface()
    file_id = vvi.getFileId("award")
    vvi.addRule(None, "SUM_TO_VALUE", sum_to_value_rule.rule_text_1, sum_to_value_rule.rule_text_2,
                "Evaluates the sum of fields to a number", sum_to_value_rule.rule_timing_id,
                fileId=file_id)

    # A record that satisfies every rule.
    passing_record = {
        "test1": "hello", "test2": "1.0", "test3": "YES",
        "test4": "44", "test5": "1", "test6": "X", "test7": "46"
    }
    self.assertTrue(Validator.validate(
        passing_record, rules, schema, "award", self.interfaces)[0])

    # A record that violates each of the rules checked below.
    failing_record = {
        "test1": "goodbye", "test2": ".4", "test3": "NO",
        "test4": "45", "test5": "1", "test6": "Q", "test7": "46.5"
    }
    # Rules 3-9 should each individually reject the record...
    for single_rule in rules[2:]:
        self.assertFalse(Validator.validate(
            failing_record, [single_rule], schema, "award", interfaces)[0])
    # ...and so should the full rule list.
    self.assertFalse(Validator.validate(
        failing_record, rules, schema, "award", interfaces)[0])
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job

    Interface-based revision: reads the job's file row by row and applies
    the read/formatting phase, basic schema checks (skipped for D files),
    and SQL-based rules, writing error/warning reports as it goes.

    Args:
        jobId: Job to be validated
        interfaces: InterfaceHolder providing jobDb, validationDb,
            stagingDb and errorDb access

    Returns:
        True if successful
    """
    sess = GlobalDB.db().session
    # get the job object here so we can call the refactored getReportPath
    # todo: replace other db access functions with job object attributes
    job = sess.query(Job).filter(Job.job_id == jobId).one()
    CloudLogger.logError("VALIDATOR_INFO: ", "Beginning runValidation on jobID: " + str(jobId), "")
    jobTracker = interfaces.jobDb
    submissionId = jobTracker.getSubmissionId(jobId)
    # Row 1 is the header; counter is incremented before each data row
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # Clear existing records for this submission
    interfaces.stagingDb.clearFileBySubmission(submissionId, fileType)
    # Get short to long colname dictionary
    shortColnames = interfaces.validationDb.getShortToLongColname()
    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(getReportPath(job, 'error'))
    warningFileName = self.getFileName(getReportPath(job, 'warning'))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType, shortCols=True)
    reader = self.getReader()
    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        fileSize = s3UrlHandler.getFileSize(errorFileName)
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    fields = interfaces.validationDb.getFileColumnsByFile(fileType)
    try:
        # Pull file and return info on whether it's using short or long col headers
        reader.openFile(regionName, bucketName, fileName, fieldList,
                        bucketName, errorFileName)
        errorInterface = interfaces.errorDb
        self.longToShortDict = interfaces.validationDb.getLongToShortColname()
        # rowErrorPresent becomes true if any row error occurs, used for determining file status
        rowErrorPresent = False
        # list to keep track of rows that fail validations
        errorRows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningFileName, self.reportHeaders) as warningWriter:
            while not reader.isFinished:
                rowNumber += 1
                if (rowNumber % 100) == 0:
                    CloudLogger.logError(
                        "VALIDATOR_INFO: ",
                        "JobId: " + str(jobId) + " loading row " + str(rowNumber), "")
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skipRow, doneReading, rowErrorHere) = \
                    self.readRecord(reader, writer, fileType, interfaces, rowNumber, jobId, fields)
                if reduceRow:
                    rowNumber -= 1
                if rowErrorHere:
                    rowErrorPresent = True
                    errorRows.append(rowNumber)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skipRow:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic validations,
                # so these validations are not repeated here
                if fileType in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passedValidations = True
                    valid = True
                else:
                    passedValidations, failures, valid = Validator.validate(record, csvSchema)
                if valid:
                    skipRow = self.writeToStaging(record, jobId, submissionId, passedValidations,
                                                  interfaces, writer, rowNumber, fileType)
                    if skipRow:
                        errorRows.append(rowNumber)
                        continue
                if not passedValidations:
                    # writeErrors returns truthy when a fatal error was recorded
                    if self.writeErrors(failures, interfaces, jobId, shortColnames, writer,
                                        warningWriter, rowNumber):
                        errorRows.append(rowNumber)
            CloudLogger.logError(
                "VALIDATOR_INFO: ",
                "Loading complete on jobID: " + str(jobId) + ". Total rows added to staging: " + str(rowNumber), "")
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sqlErrorRows = self.runSqlValidations(interfaces, jobId, fileType, shortColnames,
                                                  writer, warningWriter, rowNumber)
            errorRows.extend(sqlErrorRows)
            # Write unfinished batch
            writer.finishBatch()
            warningWriter.finishBatch()
        # Calculate total number of rows in file
        # that passed validations
        errorRowsUnique = set(errorRows)
        totalRowsExcludingHeader = rowNumber - 1
        validRows = totalRowsExcludingHeader - len(errorRowsUnique)
        # Update job metadata
        jobTracker.setJobRowcounts(jobId, rowNumber, validRows)
        errorInterface.writeAllRowErrors(jobId)
        # Update error info for submission
        jobTracker.populateSubmissionErrorInfo(submissionId)
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        interfaces.errorDb.markFileComplete(jobId, self.filename)
    finally:
        # Ensure the file always closes
        reader.close()
        CloudLogger.logError(
            "VALIDATOR_INFO: ",
            "Completed L1 and SQL rule validations on jobID: " + str(jobId), "")
    return True
def run_validation(self, job):
    """ Run validations for specified job

    Newest revision: counts the file's rows up front, streams the file row
    by row through read/formatting, basic schema, and SQL-based validation
    phases, writes error/warning CSVs locally (streaming them to S3 when
    not local), and finally updates job and submission metadata.

    Args:
        job: Job to be validated

    Returns:
        True if successful

    Raises:
        ResponseException: on an unsupported file extension or when the
            validated row count does not match the initial file row count
    """
    sess = GlobalDB.db().session
    error_list = ErrorInterface()
    job_id = job.job_id
    submission_id = job.submission_id
    # Row 1 is the header; counter is incremented before each data row
    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()
    log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(
        str(submission_id), str(job_id), file_type)
    logger.info({
        'message': 'Beginning run_validation {}'.format(log_str),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validations',
        'status': 'start',
        'start_time': validation_start})
    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]
    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()
    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()
    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()
    # If local, make the error report directory
    if self.is_local and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']
    error_file_name = report_file_name(job.submission_id, False, job.file_type.name)
    error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
    warning_file_name = report_file_name(job.submission_id, True, job.file_type.name)
    warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])
    # Create File Status object
    create_file_if_needed(job_id, file_name)
    reader = CsvReader()
    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()
    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()
    # Detach field rows from the session so later commits don't touch them
    for field in fields:
        sess.expunge(field)
    csv_schema = {row.name_short: row for row in fields}
    try:
        extension = os.path.splitext(file_name)[1]
        if not extension or extension.lower() not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)
        # Count file rows: throws a File Level Error for non-UTF8 characters
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass
        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields, bucket_name,
                         self.get_file_name(error_file_name),
                         self.long_to_short_dict[job.file_type_id], is_local=self.is_local)
        # list to keep track of rows that fail validations
        error_rows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        loading_start = datetime.now()
        logger.info({
            'message': 'Beginning data loading {}'.format(log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'data_loading',
            'status': 'start',
            'start_time': loading_start})
        with open(error_file_path, 'w', newline='') as error_file,\
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            required_list = None
            type_list = None
            if file_type == "fabs":
                # create a list of all required/type labels for FABS
                labels = sess.query(ValidationLabel).all()
                required_list = {}
                type_list = {}
                for label in labels:
                    if label.label_type == "requirement":
                        required_list[label.column_name] = label.label
                    else:
                        type_list[label.column_name] = label.label
            # write headers to file
            error_csv.writerow(self.reportHeaders)
            warning_csv.writerow(self.reportHeaders)
            while not reader.is_finished:
                row_number += 1
                if row_number % 100 == 0:
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({
                        'message': 'Loading row: {} {}'.format(str(row_number), log_str),
                        'message_type': 'ValidatorInfo',
                        'submission_id': submission_id,
                        'job_id': job_id,
                        'file_type': file_type,
                        'action': 'data_loading',
                        'status': 'loading',
                        'rows_loaded': row_number,
                        'start_time': loading_start,
                        'elapsed_time': elapsed_time})
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, error_csv, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type == "fabs":
                        # derive the FABS unique key from its component fields,
                        # substituting '-none-' for any empty component
                        record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + "_" +\
                            (record['awarding_sub_tier_agency_c'] or '-none-') + \
                            "_" + (record['fain'] or '-none-') + "_" + \
                            (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(
                        record, csv_schema, file_type == "fabs", required_list, type_list)
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type == "fabs":
                        record["is_valid"] = True
                    model_instance = model(job_id=job_id, submission_id=submission_id,
                                           valid_record=passed_validations, **record)
                    skip_row = not insert_staging_model(model_instance, job, error_csv, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()
                    if skip_row:
                        error_rows.append(row_number)
                        continue
                if not passed_validations:
                    # write_errors returns True when a fatal error was recorded
                    fatal = write_errors(failures, job, self.short_to_long_dict[job.file_type_id],
                                         error_csv, warning_csv, row_number, error_list, flex_cols)
                    if fatal:
                        error_rows.append(row_number)
            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({
                'message': 'Completed data loading {}'.format(log_str),
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'data_loading',
                'status': 'finish',
                'start_time': loading_start,
                'end_time': datetime.now(),
                'duration': loading_duration,
                'total_rows': row_number})
            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(job, file_type,
                                                      self.short_to_long_dict[job.file_type_id],
                                                      error_csv, warning_csv, row_number, error_list)
            error_rows.extend(sql_error_rows)
            # close explicitly so the reports can be re-opened for streaming below
            error_file.close()
            warning_file.close()
        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)), 'w')\
                        as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if chunk:
                            writer.write(chunk)
                        else:
                            break
            csv_file.close()
            os.remove(error_file_path)
            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)), 'w')\
                        as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if chunk:
                            warning_writer.write(chunk)
                        else:
                            break
            warning_csv_file.close()
            os.remove(warning_file_path)
        # Calculate total number of rows in file
        # that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)
        # Update fabs is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type == "fabs":
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date,
                        "reporting_end_date": max_action_date}, synchronize_session=False)
        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)
        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()
        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)
        if file_type == "fabs":
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)
        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the files always close
        reader.close()
        validation_duration = (datetime.now() - validation_start).total_seconds()
        logger.info({
            'message': 'Completed run_validation {}'.format(log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'run_validation',
            'status': 'finish',
            'start_time': validation_start,
            'end_time': datetime.now(),
            'duration': validation_duration})
    return True