def writeErrors(self, failures, job, short_colnames, writer, warning_writer, row_number, error_list):
    """ Write errors to error database

        Args:
            failures: List of errors to be written; each failure is a 5-tuple of
                (field, error, failed value, original rule label, severity name)
            job: Current job
            short_colnames: Dict mapping short names to long names
            writer: CsvWriter object for fatal errors
            warning_writer: CsvWriter object for warnings
            row_number: Current row number
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            True if any fatal errors were found, False if only warnings are present
    """
    job_id = job.job_id
    fatal_error_found = False
    # For each failure, record it in error report and metadata
    for failure in failures:
        # map short column names back to long names; fall back to the name as given
        field_name = short_colnames.get(failure[0], failure[0])
        error = failure[1]
        failed_value = failure[2]
        original_rule_label = failure[3]
        severity = failure[4]
        # renamed from camelCase `severityId` for consistency with this function's snake_case style
        severity_id = RULE_SEVERITY_DICT[severity]
        try:
            # If error is an int, it's one of our prestored messages
            error_type = int(error)
            error_msg = ValidationError.getErrorMessage(error_type)
        except ValueError:
            # If not, treat it literally
            error_msg = error
        if severity == "fatal":
            fatal_error_found = True
            writer.write([field_name, error_msg, str(row_number), failed_value, original_rule_label])
        elif severity == "warning":
            # write to warnings file
            warning_writer.write([field_name, error_msg, str(row_number), failed_value, original_rule_label])
        # Recorded regardless of severity; the severity id distinguishes fatal vs warning downstream
        error_list.recordRowError(job_id, job.filename, field_name, error, row_number,
                                  original_rule_label, severity_id=severity_id)
    return fatal_error_found
def writeErrors(self, failures, interfaces, jobId, shortColnames, writer, warningWriter, rowNumber):
    """ Record a batch of validation failures in the error report files and error database.

        Args:
            failures: List of errors to be written
            interfaces: InterfaceHolder object
            jobId: ID of current job
            shortColnames: Dict mapping short names to long names
            writer: CsvWriter object for fatal errors
            warningWriter: CsvWriter object for warnings
            rowNumber: Current row number

        Returns:
            True if any fatal errors were found, False if only warnings are present
    """
    fatalErrorFound = False
    errorInterface = interfaces.errorDb
    for failure in failures:
        # Translate the short (machine-friendly) column name back to its long form
        fieldName = shortColnames.get(failure[0], failure[0])
        error = failure[1]
        failedValue = failure[2]
        originalRuleLabel = failure[3]
        severityId = interfaces.validationDb.getRuleSeverityId(failure[4])
        try:
            # An integer error code refers to one of our prestored messages
            errorType = int(error)
            errorMsg = ValidationError.getErrorMessage(errorType)
        except ValueError:
            # Otherwise the error text is used verbatim
            errorMsg = error
        reportRow = [fieldName, errorMsg, str(rowNumber), failedValue, originalRuleLabel]
        if failure[4] == "fatal":
            fatalErrorFound = True
            writer.write(reportRow)
        elif failure[4] == "warning":
            # Warnings go to their own report file
            warningWriter.write(reportRow)
        # Every failure is recorded in the error DB, whatever its severity
        errorInterface.recordRowError(jobId, self.filename, fieldName, error, rowNumber,
                                      originalRuleLabel, severity_id=severityId)
    return fatalErrorFound
def writeAllRowErrors(self, job_id):
    """ Writes all recorded errors to database

        Args:
            job_id: ID to write errors for
    """
    sess = GlobalDB.db().session
    # Iterate the recorded error dicts directly instead of keys + lookup;
    # locals renamed to snake_case for consistency with this refactored function
    for error_dict in self.rowErrors.values():
        this_job = error_dict["jobId"]
        if int(job_id) != int(this_job):
            # This row is for a different job, skip it
            continue
        field_name = error_dict["fieldName"]
        # Fields shared by both ErrorMetadata branches below
        common = dict(job_id=this_job,
                      filename=error_dict["filename"],
                      field_name=field_name,
                      occurrences=error_dict["numErrors"],
                      first_row=error_dict["firstRow"],
                      original_rule_label=error_dict["originalRuleLabel"],
                      file_type_id=error_dict["fileTypeId"],
                      target_file_type_id=error_dict["targetFileId"],
                      severity_id=error_dict["severity"])
        try:
            # If last part of key is an int, it's one of our prestored messages
            error_type = int(error_dict["errorType"])
        except ValueError:
            # For rule failures, it will hold the error message
            error_msg = error_dict["errorType"]
            if "Field must be no longer than specified limit" in error_msg:
                rule_failed_id = ERROR_TYPE_DICT['length_error']
            else:
                rule_failed_id = ERROR_TYPE_DICT['rule_failed']
            error_row = ErrorMetadata(error_type_id=rule_failed_id,
                                      rule_failed=error_msg,
                                      **common)
        else:
            # This happens if cast to int was successful
            error_string = ValidationError.getErrorTypeString(error_type)
            error_id = ERROR_TYPE_DICT[error_string]
            # Create error metadata
            error_row = ErrorMetadata(error_type_id=error_id,
                                      rule_failed=ValidationError.getErrorMessage(error_type),
                                      **common)
        sess.add(error_row)
    # Commit the session to write all rows
    sess.commit()
    # Clear the dictionary
    self.rowErrors = {}
def writeAllRowErrors(self, jobId):
    """ Flush every recorded row error for the given job to the error database.

        Args:
            jobId: ID to write errors for
    """
    for errorDict in self.rowErrors.values():
        thisJob = errorDict["jobId"]
        # Only flush errors belonging to the requested job
        if int(jobId) != int(thisJob):
            continue
        fieldName = errorDict["fieldName"]
        # Fields common to both ErrorData branches below
        commonArgs = dict(job_id=thisJob,
                          filename=errorDict["filename"],
                          field_name=fieldName,
                          occurrences=errorDict["numErrors"],
                          first_row=errorDict["firstRow"])
        try:
            # A numeric error type refers to one of our prestored messages
            errorType = int(errorDict["errorType"])
        except ValueError:
            # Rule failures store the message text itself instead of a code
            errorMsg = errorDict["errorType"]
            ruleFailedId = self.getTypeId("rule_failed")
            errorRow = ErrorData(error_type_id=ruleFailedId,
                                 rule_failed=errorMsg,
                                 **commonArgs)
        else:
            # Cast to int succeeded: look up the stored message for this code
            errorString = ValidationError.getErrorTypeString(errorType)
            errorId = self.getTypeId(errorString)
            errorRow = ErrorData(error_type_id=errorId,
                                 rule_failed=ValidationError.getErrorMessage(errorType),
                                 **commonArgs)
        self.session.add(errorRow)
    # Persist everything, then reset the in-memory error store
    self.session.commit()
    self.rowErrors = {}
def writeAllRowErrors(self, jobId):
    """ Write all row errors recorded so far for this job into the error database.

        Args:
            jobId: ID to write errors for
    """
    for key, errorDict in self.rowErrors.items():
        thisJob = errorDict["jobId"]
        # Skip entries recorded for some other job
        if int(jobId) != int(thisJob):
            continue
        fieldName = errorDict["fieldName"]
        try:
            # A numeric error type means one of our prestored messages
            errorType = int(errorDict["errorType"])
        except ValueError:
            # Non-numeric: a rule failure carrying its message text directly
            errorMsg = errorDict["errorType"]
            ruleFailedId = self.getTypeId("rule_failed")
            errorRow = ErrorMetadata(job_id=thisJob,
                                     filename=errorDict["filename"],
                                     field_name=fieldName,
                                     error_type_id=ruleFailedId,
                                     rule_failed=errorMsg,
                                     occurrences=errorDict["numErrors"],
                                     first_row=errorDict["firstRow"])
        else:
            # Numeric code: resolve its type id and canned message
            errorString = ValidationError.getErrorTypeString(errorType)
            errorId = self.getTypeId(errorString)
            errorRow = ErrorMetadata(job_id=thisJob,
                                     filename=errorDict["filename"],
                                     field_name=fieldName,
                                     error_type_id=errorId,
                                     occurrences=errorDict["numErrors"],
                                     first_row=errorDict["firstRow"],
                                     rule_failed=ValidationError.getErrorMessage(errorType))
        self.session.add(errorRow)
    # Commit all accumulated rows, then clear the in-memory record
    self.session.commit()
    self.rowErrors = {}
def runValidation(self, jobId, interfaces):
    """ Run all validations for the specified job: stream the submitted file row
        by row, validate each row, stage valid rows, and report errors.

        Args:
            jobId: Job to be validated
            interfaces: InterfaceHolder giving access to the job, validation,
                staging and error databases

        Returns:
            True if successful
    """
    jobTracker = interfaces.jobDb
    # Starts at 1 and is incremented before the first record is read, so data
    # rows are numbered from 2 (row 1 presumably the header — TODO confirm)
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if(self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId,fileName)
    # Look up schema, field list and rules for this file type
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    if(CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize("errors/"+jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        # NOTE(review): bucketName is passed twice here — confirm openFile's signature
        reader.openFile(regionName, bucketName, fileName,fieldList,bucketName,errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType,fileName,jobId,tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer:
            while(not reader.isFinished):
                rowNumber += 1
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB)
                    record["row"] = rowNumber
                    if(reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1 # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        # Last line may be blank, don't record an error;
                        # reader.extraLine indicates a case where the last valid
                        # line has extra line breaks. Don't count last row if empty
                        rowNumber -= 1
                    else:
                        # Reading/cleaning the row failed: report a formatting error
                        writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""])
                        errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.readError,rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(record,rules,csvSchema,fileType,interfaces)
                if(valid):
                    try:
                        tableObject.insert(record,fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write(["Formatting Error", ValidationError.writeErrorMsg, str(rowNumber),""])
                        errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.writeError,rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([fieldName,errorMsg,str(rowNumber),failedValue])
                        errorInterface.recordRowError(jobId,self.filename,fieldName,error,rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId,rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId,"finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        # ensure the file always closes
        reader.close()
    return True
def runValidation(self, jobId, interfaces):
    """ Run all validations for the specified job: stream the submitted file
        row by row, validate each row, stage valid rows, and report errors.

        Args:
            jobId: Job to be validated
            interfaces: InterfaceHolder giving access to the job, validation,
                staging and error databases

        Returns:
            True if successful
    """
    jobTracker = interfaces.jobDb
    # Starts at 1 and is incremented before the first record is read, so data
    # rows are numbered from 2 (row 1 presumably the header — TODO confirm)
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if (self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    # Look up schema, field list and rules for this file type
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    if (CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize(
            "errors/" + jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        # NOTE(review): bucketName is passed twice here — confirm openFile's signature
        reader.openFile(regionName, bucketName, fileName, fieldList,
                        bucketName, errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType, fileName, jobId, tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName,
                            self.reportHeaders) as writer:
            while (not reader.isFinished):
                rowNumber += 1
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(),
                                                   fileType, validationDB)
                    record["row"] = rowNumber
                    if (reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1  # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        # Last line may be blank, don't record an error;
                        # reader.extraLine indicates a case where the last valid
                        # line has extra line breaks. Don't count last row if empty
                        rowNumber -= 1
                    else:
                        # Reading/cleaning the row failed: report a formatting error
                        writer.write([
                            "Formatting Error", ValidationError.readErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.readError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(
                    record, rules, csvSchema, fileType, interfaces)
                if (valid):
                    try:
                        tableObject.insert(record, fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write([
                            "Formatting Error", ValidationError.writeErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.writeError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(
                                errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([
                            fieldName, errorMsg,
                            str(rowNumber), failedValue
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, fieldName, error, rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId, rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        # ensure the file always closes
        reader.close()
    return True
def runSqlValidations(self, interfaces, jobId, fileType, shortColnames, writer, warningWriter, rowNumber):
    """ Run all SQL rules for this file type

        Args:
            interfaces: InterfaceHolder object
            jobId: ID of current job
            fileType: Type of file for current job
            shortColnames: Dict mapping short field names to long
            writer: CsvWriter object for fatal errors
            warningWriter: CsvWriter for warnings
            rowNumber: Current row number

        Returns:
            a list of the row numbers that failed one of the sql-based validations
    """
    errorInterface = interfaces.errorDb
    errorRows = []
    sqlFailures = Validator.validateFileBySql(
        interfaces.jobDb.getSubmissionId(jobId), fileType, interfaces)
    # Severity ids are constant for the whole run — look them up once instead
    # of twice per failure inside the loop
    fatalSeverityId = interfaces.validationDb.getRuleSeverityId("fatal")
    warningSeverityId = interfaces.validationDb.getRuleSeverityId("warning")
    for failure in sqlFailures:
        # convert shorter, machine friendly column names used in the
        # SQL validation queries back to their long names
        fieldName = shortColnames.get(failure[0], failure[0])
        error = failure[1]
        failedValue = failure[2]
        row = failure[3]
        originalLabel = failure[4]
        fileTypeId = failure[5]
        targetFileId = failure[6]
        severityId = failure[7]
        if severityId == fatalSeverityId:
            errorRows.append(row)
        try:
            # If error is an int, it's one of our prestored messages
            errorType = int(error)
            errorMsg = ValidationError.getErrorMessage(errorType)
        except ValueError:
            # If not, treat it literally
            errorMsg = error
        if severityId == fatalSeverityId:
            writer.write([fieldName, errorMsg, str(row), failedValue, originalLabel])
        elif severityId == warningSeverityId:
            # write to warnings file
            warningWriter.write([fieldName, errorMsg, str(row), failedValue, originalLabel])
        # NOTE(review): the caller's rowNumber — not the failing `row` — is
        # recorded here; confirm that is intentional
        errorInterface.recordRowError(jobId, self.filename, fieldName, error,
                                      rowNumber, originalLabel,
                                      file_type_id=fileTypeId,
                                      target_file_id=targetFileId,
                                      severity_id=severityId)
    return errorRows
def runSqlValidations(self, job, file_type, short_colnames, writer, warning_writer, row_number, error_list):
    """ Run all SQL rules for this file type

        Args:
            job: Current job
            file_type: Type of file for current job
            short_colnames: Dict mapping short field names to long
            writer: CsvWriter object for fatal errors
            warning_writer: CsvWriter for warnings
            row_number: Current row number
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            a list of the row numbers that failed one of the sql-based validations
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    error_rows = []
    sql_failures = Validator.validateFileBySql(job.submission_id, file_type,
                                               self.short_to_long_dict)
    for failure in sql_failures:
        # convert short, machine-friendly column names used in the SQL
        # validation queries back to their long names
        field_name = short_colnames.get(failure[0], failure[0])
        error = failure[1]
        failed_value = failure[2]
        row = failure[3]
        original_label = failure[4]
        file_type_id = failure[5]
        target_file_id = failure[6]
        severity_id = failure[7]
        is_fatal = severity_id == RULE_SEVERITY_DICT['fatal']
        if is_fatal:
            error_rows.append(row)
        try:
            # An integer error code refers to one of our prestored messages
            error_msg = ValidationError.getErrorMessage(int(error))
        except ValueError:
            # Otherwise the error text is used verbatim
            error_msg = error
        report_row = [field_name, error_msg, str(row), failed_value, original_label]
        if is_fatal:
            writer.write(report_row)
        elif severity_id == RULE_SEVERITY_DICT['warning']:
            # write to warnings file
            warning_writer.write(report_row)
        # Every SQL failure is recorded in the error list, whatever its severity
        error_list.recordRowError(job_id, job.filename, field_name, error,
                                  row_number, original_label,
                                  file_type_id=file_type_id,
                                  target_file_id=target_file_id,
                                  severity_id=severity_id)
    return error_rows