def test_cross_file(self):
    """Run the C and D2 jobs plus the cross-file job and check results.

    Asserts each job returns HTTP 200, that exactly 2 cross-file errors
    are recorded, and that the cross-file report is ~405 bytes (±5).
    """
    crossId = self.jobIdDict["crossFile"]
    # Run jobs for C and D2, then cross file validation job
    awardFinResponse = self.validateJob(self.jobIdDict["crossAwardFin"], self.useThreads)
    self.assertEqual(awardFinResponse.status_code, 200, msg=str(awardFinResponse.json))
    awardResponse = self.validateJob(self.jobIdDict["crossAward"], self.useThreads)
    self.assertEqual(awardResponse.status_code, 200, msg=str(awardResponse.json))
    crossFileResponse = self.validateJob(crossId, self.useThreads)
    self.assertEqual(crossFileResponse.status_code, 200, msg=str(crossFileResponse.json))
    # Check number of cross file validation errors in DB for this job
    self.assertEqual(self.interfaces.errorDb.checkNumberOfErrorsByJobId(crossId), 2)
    # Check that cross file validation report exists and is the right size
    jobTracker = self.interfaces.jobDb
    fileSize = 405
    reportPath = jobTracker.getCrossFileReportPath(jobTracker.getSubmissionId(crossId))
    # Fetch the report size once instead of once per assertion: avoids a
    # second stat/S3 round trip and a race between the two reads
    if self.local:
        path = "".join([self.local_file_directory, reportPath])
        actualSize = os.path.getsize(path)
    else:
        actualSize = s3UrlHandler.getFileSize("errors/" + reportPath)
    self.assertGreater(actualSize, fileSize - 5)
    self.assertLess(actualSize, fileSize + 5)
def test_cross_file(self): crossId = self.jobIdDict["crossFile"] # Run jobs for C and D2, then cross file validation job awardFinResponse = self.validateJob(self.jobIdDict["crossAwardFin"], self.useThreads) self.assertEqual(awardFinResponse.status_code, 200, msg=str(awardFinResponse.json)) awardResponse = self.validateJob(self.jobIdDict["crossAward"], self.useThreads) self.assertEqual(awardResponse.status_code, 200, msg=str(awardResponse.json)) crossFileResponse = self.validateJob(crossId, self.useThreads) self.assertEqual(crossFileResponse.status_code, 200, msg=str(crossFileResponse.json)) # Check number of cross file validation errors in DB for this job self.assertEqual( self.interfaces.errorDb.checkNumberOfErrorsByJobId(crossId), 2) # Check that cross file validation report exists and is the right size jobTracker = self.interfaces.jobDb fileSize = 405 reportPath = jobTracker.getCrossFileReportPath( jobTracker.getSubmissionId(crossId)) if self.local: path = "".join([self.local_file_directory, reportPath]) self.assertGreater(os.path.getsize(path), fileSize - 5) self.assertLess(os.path.getsize(path), fileSize + 5) else: self.assertGreater( s3UrlHandler.getFileSize("errors/" + reportPath), fileSize - 5) self.assertLess(s3UrlHandler.getFileSize("errors/" + reportPath), fileSize + 5)
def test_cross_file(self):
    """Run jobs A, B, C, D2 and the cross-file job, then check results.

    Asserts each job returns HTTP 200, that the cross-file job finishes
    with 0 fatal and 3 warning errors, and that each cross-file report
    is within ±5 bytes of its expected size.
    """
    crossId = self.jobDict["crossFile"]
    # Run jobs for A, B, C, and D2, then cross file validation job
    # Note: test files used for cross validation use the short column names
    # as a way to ensure those are handled correctly by the validator
    awardFinResponse = self.validateJob(self.jobDict["crossAwardFin"])
    self.assertEqual(awardFinResponse.status_code, 200, msg=str(awardFinResponse.json))
    awardResponse = self.validateJob(self.jobDict["crossAward"])
    self.assertEqual(awardResponse.status_code, 200, msg=str(awardResponse.json))
    appropResponse = self.validateJob(self.jobDict["crossApprop"])
    self.assertEqual(appropResponse.status_code, 200, msg=str(appropResponse.json))
    pgmActResponse = self.validateJob(self.jobDict["crossPgmAct"])
    self.assertEqual(pgmActResponse.status_code, 200, msg=str(pgmActResponse.json))
    crossFileResponse = self.validateJob(crossId)
    self.assertEqual(crossFileResponse.status_code, 200, msg=str(crossFileResponse.json))

    with createApp().app_context():
        sess = GlobalDB.db().session
        job = sess.query(Job).filter(Job.job_id == crossId).one()

        # Check number of cross file validation errors in DB for this job
        self.assertEqual(checkNumberOfErrorsByJobId(crossId, "fatal"), 0)
        self.assertEqual(checkNumberOfErrorsByJobId(crossId, "warning"), 3)
        self.assertEqual(job.job_status_id, JOB_STATUS_DICT['finished'])

        # Check that cross file validation report exists and is the right size
        submissionId = job.submission_id
        sizePathPairs = [
            (89, getCrossReportName(submissionId, "appropriations", "program_activity")),
            (89, getCrossReportName(submissionId, "award_financial", "award")),
            (2348, getCrossWarningReportName(submissionId, "appropriations", "program_activity")),
            (89, getCrossWarningReportName(submissionId, "award_financial", "award")),
        ]

        for size, path in sizePathPairs:
            if self.local:
                self.assertFileSizeAppxy(size, path)
            else:
                # Fetch each report's size once instead of once per
                # assertion: avoids a second S3 round trip per report
                actualSize = s3UrlHandler.getFileSize("errors/" + path)
                self.assertGreater(actualSize, size - 5)
                self.assertLess(actualSize, size + 5)
def test_cross_file(self): crossId = self.jobIdDict["crossFile"] # Run jobs for A, B, C, and D2, then cross file validation job awardFinResponse = self.validateJob(self.jobIdDict["crossAwardFin"], self.useThreads) self.assertEqual(awardFinResponse.status_code, 200, msg=str(awardFinResponse.json)) awardResponse = self.validateJob(self.jobIdDict["crossAward"], self.useThreads) self.assertEqual(awardResponse.status_code, 200, msg=str(awardResponse.json)) # appropResponse = self.validateJob(self.jobIdDict["crossApprop"], self.useThreads) # self.assertEqual(appropResponse.status_code, 200, msg=str(appropResponse.json)) # pgmActResponse = self.validateJob(self.jobIdDict["crossPgmAct"], self.useThreads) # self.assertEqual(pgmActResponse.status_code, 200, msg=str(pgmActResponse.json)) crossFileResponse = self.validateJob(crossId, self.useThreads) self.assertEqual(crossFileResponse.status_code, 200, msg=str(crossFileResponse.json)) # TODO: once SUM_BY_TAS rule is implemented, check for specific types # of cross-file validation error. Do we need to split into more discrete tests? # Check number of cross file validation errors in DB for this job self.assertEqual( self.interfaces.errorDb.checkNumberOfErrorsByJobId(crossId), 2) # Check cross file job complete self.waitOnJob(self.interfaces.jobDb, crossId, "finished", self.useThreads) # Check that cross file validation report exists and is the right size jobTracker = self.interfaces.jobDb fileSize = 402 reportPath = jobTracker.getCrossFileReportPath( jobTracker.getSubmissionId(crossId)) if self.local: path = "".join([self.local_file_directory, reportPath]) self.assertGreater(os.path.getsize(path), fileSize - 5) self.assertLess(os.path.getsize(path), fileSize + 5) else: self.assertGreater( s3UrlHandler.getFileSize("errors/" + reportPath), fileSize - 5) self.assertLess(s3UrlHandler.getFileSize("errors/" + reportPath), fileSize + 5)
def run_test(self, jobId, statusId, statusName, fileSize, stagingRows,
             errorStatus, numErrors, rowErrorsPresent=None):
    """Run one validation job and assert the expected outcome.

    Args:
        jobId: ID of job for this validation
        statusId: Expected HTTP status code for this test
        statusName: Expected status in job tracker, False to skip the check
        fileSize: Expected size of error report, False if no report expected
        stagingRows: Expected number of rows in the staging table,
            False if the table should not exist
        errorStatus: Expected status in file table of error DB,
            False to skip the check
        numErrors: Expected number of errors
        rowErrorsPresent: Expected errors_present flag, None to skip the check

    Returns:
        The validation response, for further inspection by the caller.
    """
    response = self.validateJob(jobId, self.useThreads)
    jobTracker = self.jobTracker
    stagingDb = self.stagingDb
    self.assertEqual(response.status_code, statusId,
                     msg="{}".format(self.getResponseInfo(response)))
    # Identity comparison: only the literal False sentinel skips the check
    # (the old "!= False" form would also skip for a statusName of 0)
    if statusName is not False:
        self.waitOnJob(jobTracker, jobId, statusName, self.useThreads)
        self.assertEqual(jobTracker.getStatus(jobId),
                         jobTracker.getStatusId(statusName))
    self.assertEqual(response.headers.get("Content-Type"), "application/json")
    tableName = response.json["table"]
    # stagingRows == 0 means "table exists with zero rows"; only the
    # literal False sentinel means "table should not exist"
    if stagingRows is False:
        self.assertFalse(stagingDb.tableExists(tableName))
    else:
        self.assertTrue(stagingDb.tableExists(tableName))
        self.assertEqual(stagingDb.countRows(tableName), stagingRows)
    errorInterface = self.errorInterface
    if errorStatus is not False:
        self.assertEqual(errorInterface.checkStatusByJobId(jobId),
                         errorInterface.getStatusId(errorStatus))
        self.assertEqual(errorInterface.checkNumberOfErrorsByJobId(jobId),
                         numErrors)
    if fileSize is not False:
        # Look up the report path and its size once instead of twice
        reportPath = jobTracker.getReportPath(jobId)
        if self.local:
            actualSize = os.path.getsize(
                "".join([self.local_file_directory, reportPath]))
        else:
            actualSize = s3UrlHandler.getFileSize("errors/" + reportPath)
        self.assertGreater(actualSize, fileSize - 5)
        self.assertLess(actualSize, fileSize + 5)
    # Check if errors_present is set correctly
    if rowErrorsPresent is not None:  # If no value provided, skip this check
        self.assertEqual(self.interfaces.errorDb.getRowErrorsPresent(jobId),
                         rowErrorsPresent)
    return response
def run_test(self, jobId, statusId, statusName, fileSize, stagingRows,
             errorStatus, numErrors, rowErrorsPresent=None):
    """Run one validation job and assert the expected outcome.

    Args:
        jobId: ID of job for this validation
        statusId: Expected HTTP status code for this test
        statusName: Expected status in job tracker, False to skip the check
        fileSize: Expected size of error report, False if no report expected
        stagingRows: Expected number of rows in the staging table,
            False if the table should not exist
        errorStatus: Expected status in file table of error DB,
            False to skip the check
        numErrors: Expected number of errors
        rowErrorsPresent: Expected errors_present flag, None to skip the check

    Returns:
        The validation response, for further inspection by the caller.
    """
    response = self.validateJob(jobId, self.useThreads)
    jobTracker = self.jobTracker
    stagingDb = self.stagingDb
    self.assertEqual(response.status_code, statusId,
                     msg="{}".format(self.getResponseInfo(response)))
    # False is the sentinel for "skip the job-status check"
    if statusName != False:
        self.waitOnJob(jobTracker, jobId, statusName, self.useThreads)
        self.assertEqual(jobTracker.getStatus(jobId),
                         jobTracker.getStatusId(statusName))
    self.assertEqual(response.headers.get("Content-Type"), "application/json")
    tableName = response.json["table"]
    # A literal False means the staging table should not exist; any other
    # value (including 0) is the expected row count
    if type(stagingRows) == type(False) and not stagingRows:
        self.assertFalse(stagingDb.tableExists(tableName))
    else:
        self.assertTrue(stagingDb.tableExists(tableName))
        self.assertEqual(stagingDb.countRows(tableName), stagingRows)
    errorInterface = self.errorInterface
    if errorStatus is not False:
        self.assertEqual(errorInterface.checkStatusByJobId(jobId),
                         errorInterface.getStatusId(errorStatus))
        self.assertEqual(errorInterface.checkNumberOfErrorsByJobId(jobId),
                         numErrors)
    # Report size is checked with a ±5-byte tolerance
    if (fileSize != False):
        if self.local:
            path = "".join([
                self.local_file_directory,
                jobTracker.getReportPath(jobId)
            ])
            self.assertGreater(os.path.getsize(path), fileSize - 5)
            self.assertLess(os.path.getsize(path), fileSize + 5)
        else:
            self.assertGreater(
                s3UrlHandler.getFileSize("errors/" +
                                         jobTracker.getReportPath(jobId)),
                fileSize - 5)
            self.assertLess(
                s3UrlHandler.getFileSize("errors/" +
                                         jobTracker.getReportPath(jobId)),
                fileSize + 5)
    # Check if errors_present is set correctly
    if rowErrorsPresent is not None:  # If no value provided, skip this check
        self.assertEqual(
            self.interfaces.errorDb.getRowErrorsPresent(jobId),
            rowErrorsPresent)
    return response
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job

    Reads the job's input file row by row, cleans each record, validates
    it, inserts passing rows into a staging table, and writes failures to
    an error report (local or S3). Marks the job finished when done.

    Args:
        jobId: Job to be validated
        interfaces: Bundle of DB interfaces (jobDb, errorDb, validationDb,
            stagingDb)

    Returns:
        True if successful
    """
    jobTracker = interfaces.jobDb
    # Row 1 is the header; data rows start at 2 via the increment below
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if(self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId,fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    # NOTE(review): the AWS branch measures the error report path while the
    # local branch measures the input file — confirm this asymmetry is
    # intentional
    if(CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize("errors/"+jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        reader.openFile(regionName, bucketName, fileName,fieldList,bucketName,errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        # Create staging table
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType,fileName,jobId,tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer:
            while(not reader.isFinished):
                rowNumber += 1
                #if (rowNumber % 1000) == 0:
                #    print("Validating row " + str(rowNumber))
                try :
                    record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB)
                    record["row"] = rowNumber
                    if(reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1 # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        #Last line may be blank don't record an error, reader.extraLine indicates a case where the last valid line has extra line breaks
                        # Don't count last row if empty
                        rowNumber -= 1
                    else:
                        # Unreadable row: log it to the report and DB metadata
                        writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""])
                        errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.readError,rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(record,rules,csvSchema,fileType,interfaces)
                if(valid) :
                    try:
                        tableObject.insert(record,fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write(["Formatting Error", ValidationError.writeErrorMsg, str(rowNumber),""])
                        errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.writeError,rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([fieldName,errorMsg,str(rowNumber),failedValue])
                        errorInterface.recordRowError(jobId,self.filename,fieldName,error,rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId,rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId,"finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        #ensure the file always closes
        reader.close()
    return True
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job

    Streams the job's input file row by row, cleans and validates each
    record, inserts passing rows into a staging table, and writes failing
    rows to an error report (local disk or S3). Marks the job finished.

    Args:
        jobId: Job to be validated
        interfaces: Bundle of DB interfaces (jobDb, errorDb, validationDb,
            stagingDb)

    Returns:
        True if successful
    """
    jobTracker = interfaces.jobDb
    # Row 1 is the header; data rows start at 2 via the increment below
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # If local, make the error report directory
    if (self.isLocal and not os.path.exists(self.directory)):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(jobTracker.getReportPath(jobId))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType)
    rules = validationDB.getRulesByFile(fileType)
    reader = self.getReader()
    # Get file size and write to jobs table
    # NOTE(review): the AWS branch measures the error report path while the
    # local branch measures the input file — confirm this asymmetry is
    # intentional
    if (CONFIG_BROKER["use_aws"]):
        fileSize = s3UrlHandler.getFileSize(
            "errors/" + jobTracker.getReportPath(jobId))
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    try:
        # Pull file
        reader.openFile(regionName, bucketName, fileName, fieldList,
                        bucketName, errorFileName)
        # Create staging table
        tableName = interfaces.stagingDb.getTableName(jobId)
        # Create staging table
        tableObject = StagingTable(interfaces)
        tableObject.createTable(fileType, fileName, jobId, tableName)
        errorInterface = interfaces.errorDb
        # While not done, pull one row and put it into staging if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName,
                            self.reportHeaders) as writer:
            while (not reader.isFinished):
                rowNumber += 1
                #if (rowNumber % 1000) == 0:
                #    print("Validating row " + str(rowNumber))
                try:
                    record = FieldCleaner.cleanRow(reader.getNextRecord(),
                                                   fileType, validationDB)
                    record["row"] = rowNumber
                    if (reader.isFinished and len(record) < 2):
                        # This is the last line and is empty, don't record an error
                        rowNumber -= 1  # Don't count this row
                        break
                except ResponseException as e:
                    if reader.isFinished and reader.extraLine:
                        #Last line may be blank don't record an error, reader.extraLine indicates a case where the last valid line has extra line breaks
                        # Don't count last row if empty
                        rowNumber -= 1
                    else:
                        # Unreadable row: log to the report and DB metadata
                        writer.write([
                            "Formatting Error", ValidationError.readErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.readError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                    continue
                valid, failures = Validator.validate(
                    record, rules, csvSchema, fileType, interfaces)
                if (valid):
                    try:
                        tableObject.insert(record, fileType)
                    except ResponseException as e:
                        # Write failed, move to next record
                        writer.write([
                            "Formatting Error", ValidationError.writeErrorMsg,
                            str(rowNumber), ""
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, "Formatting Error",
                            ValidationError.writeError, rowNumber)
                        errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                else:
                    # For each failure, record it in error report and metadata
                    if failures:
                        errorInterface.setRowErrorsPresent(jobId, True)
                    for failure in failures:
                        fieldName = failure[0]
                        error = failure[1]
                        failedValue = failure[2]
                        try:
                            # If error is an int, it's one of our prestored messages
                            errorType = int(error)
                            errorMsg = ValidationError.getErrorMessage(
                                errorType)
                        except ValueError:
                            # If not, treat it literally
                            errorMsg = error
                        writer.write([
                            fieldName, errorMsg,
                            str(rowNumber), failedValue
                        ])
                        errorInterface.recordRowError(
                            jobId, self.filename, fieldName, error, rowNumber)
            # Write unfinished batch
            writer.finishBatch()
        # Write number of rows to job table
        jobTracker.setNumberOfRowsById(jobId, rowNumber)
        # Write leftover records
        tableObject.endBatch()
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        errorInterface.writeAllRowErrors(jobId)
    finally:
        #ensure the file always closes
        reader.close()
    return True
def runValidation(self, jobId, interfaces):
    """ Run validations for specified job

    Three-phase pipeline: (1) read each record and report formatting
    errors, (2) run basic schema checks and write passing rows to
    staging, (3) run SQL-based validation rules. Errors go to an error
    report and warnings to a separate warning report.

    Args:
        jobId: Job to be validated
        interfaces: Bundle of DB interfaces (jobDb, errorDb, validationDb,
            stagingDb)

    Returns:
        True if successful
    """
    sess = GlobalDB.db().session
    # get the job object here so we can call the refactored getReportPath
    # todo: replace other db access functions with job object attributes
    job = sess.query(Job).filter(Job.job_id == jobId).one()
    CloudLogger.logError("VALIDATOR_INFO: ",
                         "Beginning runValidation on jobID: " + str(jobId),
                         "")
    jobTracker = interfaces.jobDb
    submissionId = jobTracker.getSubmissionId(jobId)
    # Row 1 is the header; data rows start at 2 via the increment below
    rowNumber = 1
    fileType = jobTracker.getFileType(jobId)
    # Clear existing records for this submission
    interfaces.stagingDb.clearFileBySubmission(submissionId, fileType)
    # Get short to long colname dictionary
    shortColnames = interfaces.validationDb.getShortToLongColname()
    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = jobTracker.getFileName(jobId)
    self.filename = fileName
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(getReportPath(job, 'error'))
    warningFileName = self.getFileName(getReportPath(job, 'warning'))
    # Create File Status object
    interfaces.errorDb.createFileIfNeeded(jobId, fileName)
    validationDB = interfaces.validationDb
    fieldList = validationDB.getFieldsByFileList(fileType)
    csvSchema = validationDB.getFieldsByFile(fileType, shortCols=True)
    reader = self.getReader()
    # Get file size and write to jobs table
    # NOTE(review): the AWS branch measures the error report file while the
    # local branch measures the input file — confirm this asymmetry is
    # intentional
    if CONFIG_BROKER["use_aws"]:
        fileSize = s3UrlHandler.getFileSize(errorFileName)
    else:
        fileSize = os.path.getsize(jobTracker.getFileName(jobId))
    jobTracker.setFileSizeById(jobId, fileSize)
    fields = interfaces.validationDb.getFileColumnsByFile(fileType)
    try:
        # Pull file and return info on whether it's using short or long col headers
        reader.openFile(regionName, bucketName, fileName, fieldList,
                        bucketName, errorFileName)
        errorInterface = interfaces.errorDb
        self.longToShortDict = interfaces.validationDb.getLongToShortColname(
        )
        # rowErrorPresent becomes true if any row error occurs, used for determining file status
        # NOTE(review): rowErrorPresent is set below but not read again in
        # this function — confirm it is still needed
        rowErrorPresent = False
        # list to keep track of rows that fail validations
        errorRows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName,
                            self.reportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningFileName,
                               self.reportHeaders) as warningWriter:
            while not reader.isFinished:
                rowNumber += 1
                if (rowNumber % 100) == 0:
                    CloudLogger.logError(
                        "VALIDATOR_INFO: ",
                        "JobId: " + str(jobId) + " loading row " +
                        str(rowNumber), "")
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skipRow, doneReading,
                 rowErrorHere) = self.readRecord(reader, writer, fileType,
                                                 interfaces, rowNumber,
                                                 jobId, fields)
                if reduceRow:
                    rowNumber -= 1
                if rowErrorHere:
                    rowErrorPresent = True
                    errorRows.append(rowNumber)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skipRow:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic validations,
                # so these validations are not repeated here
                if fileType in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passedValidations = True
                    valid = True
                else:
                    passedValidations, failures, valid = Validator.validate(
                        record, csvSchema)
                if valid:
                    skipRow = self.writeToStaging(record, jobId,
                                                  submissionId,
                                                  passedValidations,
                                                  interfaces, writer,
                                                  rowNumber, fileType)
                    if skipRow:
                        errorRows.append(rowNumber)
                        continue
                if not passedValidations:
                    if self.writeErrors(failures, interfaces, jobId,
                                        shortColnames, writer,
                                        warningWriter, rowNumber):
                        errorRows.append(rowNumber)
            CloudLogger.logError(
                "VALIDATOR_INFO: ", "Loading complete on jobID: " +
                str(jobId) + ". Total rows added to staging: " +
                str(rowNumber), "")
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sqlErrorRows = self.runSqlValidations(interfaces, jobId,
                                                  fileType, shortColnames,
                                                  writer, warningWriter,
                                                  rowNumber)
            errorRows.extend(sqlErrorRows)
            # Write unfinished batch
            writer.finishBatch()
            warningWriter.finishBatch()
        # Calculate total number of rows in file
        # that passed validations
        errorRowsUnique = set(errorRows)
        totalRowsExcludingHeader = rowNumber - 1
        validRows = totalRowsExcludingHeader - len(errorRowsUnique)
        # Update job metadata
        jobTracker.setJobRowcounts(jobId, rowNumber, validRows)
        errorInterface.writeAllRowErrors(jobId)
        # Update error info for submission
        jobTracker.populateSubmissionErrorInfo(submissionId)
        # Mark validation as finished in job tracker
        jobTracker.markJobStatus(jobId, "finished")
        interfaces.errorDb.markFileComplete(jobId, self.filename)
    finally:
        # Ensure the file always closes
        reader.close()
        CloudLogger.logError(
            "VALIDATOR_INFO: ",
            "Completed L1 and SQL rule validations on jobID: " + str(jobId),
            "")
    return True
def run_test(self, jobId, statusId, statusName, fileSize, stagingRows,
             errorStatus, numErrors, numWarnings=0, warningFileSize=None):
    """ Runs a validation test

    Args:
        jobId: ID of job for this validation
        statusId: Expected HTTP status code for this test
        statusName: Expected status in job tracker, False if job should not exist
        fileSize: Expected file size of error report, False if error report
            should not exist
        stagingRows: Expected number of rows in validation db staging tables.
            False if no rows are expected
        errorStatus: Expected status in file table of error DB, False if
            file object should not exist
        numErrors: Expected number of fatal errors
        numWarnings: Expected number of warnings (default 0)
        warningFileSize: Expected file size of warning report, None or
            False to skip the check

    Returns:
        The validation response, for further inspection by the caller.
    """
    with createApp().app_context():
        sess = GlobalDB.db().session

        response = self.validateJob(jobId)
        self.assertEqual(response.status_code, statusId,
                         str(self.getResponseInfo(response)))
        # get the job from db
        job = sess.query(Job).filter(Job.job_id == jobId).one()
        if statusName is not False:
            self.assertEqual(job.job_status_id, JOB_STATUS_DICT[statusName])
        self.assertEqual(
            response.headers.get("Content-Type"), "application/json")

        # Check valid row count for this job
        if stagingRows is not False:
            self.assertEqual(job.number_of_rows_valid, stagingRows)

        if errorStatus is not False:
            self.assertEqual(
                sess.query(File).filter(File.job_id == jobId).one().file_status_id,
                FILE_STATUS_DICT[errorStatus]
            )
            self.assertEqual(checkNumberOfErrorsByJobId(jobId, 'fatal'), numErrors)
            self.assertEqual(checkNumberOfErrorsByJobId(jobId, 'warning'), numWarnings)

        if fileSize is not False:
            reportPath = get_report_path(job, 'error')
            if self.local:
                self.assertFileSizeAppxy(fileSize, reportPath)
            else:
                # Fetch the report size once instead of once per assertion
                actualSize = s3UrlHandler.getFileSize(
                    'errors/{}'.format(reportPath))
                self.assertGreater(actualSize, fileSize - 5)
                self.assertLess(actualSize, fileSize + 5)

        if warningFileSize is not None and warningFileSize is not False:
            reportPath = get_report_path(job, 'warning')
            if self.local:
                self.assertFileSizeAppxy(warningFileSize, reportPath)
            else:
                actualSize = s3UrlHandler.getFileSize(
                    'errors/{}'.format(reportPath))
                self.assertGreater(actualSize, warningFileSize - 5)
                self.assertLess(actualSize, warningFileSize + 5)

    return response
def run_test(self, jobId, statusId, statusName, fileSize, stagingRows,
             errorStatus, numErrors, rowErrorsPresent = None):
    """ Runs a validation test

    Args:
        jobId: ID of job for this validation
        statusId: Expected HTTP status code for this test
        statusName: Expected status in job tracker, False if job should not exist
        fileSize: Expected file size of error report, False if error report should not exist
        stagingRows: Expected number of rows in staging table, False if table should not exist
        errorStatus: Expected status in file table of error DB, False if file object should not exist
        numErrors: Expected number of errors
        rowErrorsPresent: Checks flag for whether row errors occurred, None to skip the check

    Returns:
        The validation response, for further inspection by the caller.
    """
    response = self.validateJob(jobId, self.useThreads)
    jobTracker = self.jobTracker
    stagingDb = self.stagingDb
    self.assertEqual(response.status_code, statusId,
                     msg="{}".format(self.getResponseInfo(response)))
    # False is the sentinel for "skip the job-status check"
    if statusName != False:
        self.waitOnJob(jobTracker, jobId, statusName, self.useThreads)
        self.assertEqual(jobTracker.getJobStatus(jobId),
                         jobTracker.getJobStatusId(statusName))
    self.assertEqual(
        response.headers.get("Content-Type"), "application/json")
    tableName = response.json["table"]
    # A literal False means the staging table should not exist; any other
    # value (including 0) is the expected row count
    if type(stagingRows) == type(False) and not stagingRows:
        self.assertFalse(stagingDb.tableExists(tableName))
    else:
        self.assertTrue(stagingDb.tableExists(tableName))
        self.assertEqual(stagingDb.countRows(tableName), stagingRows)
        # Check that field name map table is populated
        fieldMap = self.interfaces.stagingDb.getFieldNameMap(tableName)
        self.assertIsNotNone(fieldMap)
    errorInterface = self.errorInterface
    if errorStatus is not False:
        self.assertEqual(errorInterface.checkFileStatusByJobId(jobId),
                         errorInterface.getFileStatusId(errorStatus))
        self.assertEqual(errorInterface.checkNumberOfErrorsByJobId(jobId),
                         numErrors)
    # Report size is checked with a ±5-byte tolerance
    if(fileSize != False):
        if self.local:
            path = "".join(
                [self.local_file_directory,jobTracker.getReportPath(jobId)])
            self.assertGreater(os.path.getsize(path), fileSize - 5)
            self.assertLess(os.path.getsize(path), fileSize + 5)
        else:
            self.assertGreater(s3UrlHandler.getFileSize(
                "errors/"+jobTracker.getReportPath(jobId)), fileSize - 5)
            self.assertLess(s3UrlHandler.getFileSize(
                "errors/"+jobTracker.getReportPath(jobId)), fileSize + 5)
    # Check if errors_present is set correctly
    if rowErrorsPresent is not None:  # If no value provided, skip this check
        self.assertEqual(self.interfaces.errorDb.getRowErrorsPresent(jobId),
                         rowErrorsPresent)
    return response
def runValidation(self, job):
    """ Run validations for specified job

    Three-phase pipeline: (1) read each record and report formatting
    errors, (2) run basic schema checks and write passing rows to the
    file-type's ORM staging table, (3) run SQL-based validation rules.
    Errors go to an error report and warnings to a warning report.

    Args:
        job: Job to be validated

    Returns:
        True if successful
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    error_list = ErrorInterface()
    _exception_logger.info(
        'VALIDATOR_INFO: Beginning runValidation on job_id: %s', job_id)
    submission_id = job.submission_id
    # Row 1 is the header; data rows start at 2 via the increment below
    rowNumber = 1
    fileType = job.file_type.name
    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == fileType][0]
    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()
    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)
    # Get bucket name and file name
    fileName = job.filename
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    errorFileName = self.getFileName(get_report_path(job, 'error'))
    warningFileName = self.getFileName(get_report_path(job, 'warning'))
    # Create File Status object
    createFileIfNeeded(job_id, fileName)
    reader = self.getReader()
    # Get file size and write to jobs table
    # NOTE(review): the AWS branch measures the error report file while the
    # local branch measures the input file — confirm this asymmetry is
    # intentional
    if CONFIG_BROKER["use_aws"]:
        fileSize = s3UrlHandler.getFileSize(errorFileName)
    else:
        fileSize = os.path.getsize(fileName)
    job.file_size = fileSize
    sess.commit()
    # Get fields for this file
    fields = sess.query(FileColumn). \
        filter(FileColumn.file_id == FILE_TYPE_DICT[fileType]). \
        all()
    # Detach field rows from the session so they can be used after commits
    for field in fields:
        sess.expunge(field)
    csvSchema = {row.name_short: row for row in fields}
    try:
        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(regionName, bucketName, fileName, fields,
                         bucketName, errorFileName,
                         self.long_to_short_dict)
        # list to keep track of rows that fail validations
        errorRows = []
        # While not done, pull one row and put it into staging table if it passes
        # the Validator
        with self.getWriter(regionName, bucketName, errorFileName,
                            self.reportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningFileName,
                               self.reportHeaders) as warningWriter:
            while not reader.is_finished:
                rowNumber += 1
                if rowNumber % 10 == 0:
                    logger.info('loading row %s', rowNumber)
                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skipRow, doneReading, rowErrorHere,
                 flex_cols) = self.readRecord(reader, writer, fileType,
                                              rowNumber, job, fields,
                                              error_list)
                if reduceRow:
                    rowNumber -= 1
                if rowErrorHere:
                    errorRows.append(rowNumber)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skipRow:
                    # Do not write this row to staging, but continue processing future rows
                    continue
                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic validations,
                # so these validations are not repeated here
                if fileType in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passedValidations = True
                    valid = True
                else:
                    passedValidations, failures, valid = Validator.validate(
                        record, csvSchema)
                if valid:
                    skipRow = self.writeToStaging(record, job,
                                                  submission_id,
                                                  passedValidations,
                                                  writer, rowNumber, model,
                                                  error_list)
                    if flex_cols:
                        self.write_to_flex(flex_cols, job_id, submission_id,
                                           fileType)
                    if skipRow:
                        errorRows.append(rowNumber)
                        continue
                if not passedValidations:
                    if self.writeErrors(failures, job,
                                        self.short_to_long_dict, writer,
                                        warningWriter, rowNumber,
                                        error_list):
                        errorRows.append(rowNumber)
            _exception_logger.info(
                'VALIDATOR_INFO: Loading complete on job_id: %s. '
                'Total rows added to staging: %s', job_id, rowNumber)
            if fileType in ('appropriations', 'program_activity',
                            'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sqlErrorRows = self.runSqlValidations(job, fileType,
                                                  self.short_to_long_dict,
                                                  writer, warningWriter,
                                                  rowNumber, error_list)
            errorRows.extend(sqlErrorRows)
            # Write unfinished batch
            writer.finishBatch()
            warningWriter.finishBatch()
        # Calculate total number of rows in file
        # that passed validations
        errorRowsUnique = set(errorRows)
        totalRowsExcludingHeader = rowNumber - 1
        validRows = totalRowsExcludingHeader - len(errorRowsUnique)
        # Update job metadata
        job.number_of_rows = rowNumber
        job.number_of_rows_valid = validRows
        sess.commit()
        error_list.writeAllRowErrors(job_id)
        # Update error info for submission
        populateSubmissionErrorInfo(submission_id)
        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        markFileComplete(job_id, fileName)
    finally:
        # Ensure the file always closes
        reader.close()
        _exception_logger.info(
            'VALIDATOR_INFO: Completed L1 and SQL rule validations on '
            'job_id: %s', job_id)
    return True
def run_test(self, jobId, statusId, statusName, fileSize, stagingRows,
             errorStatus, numErrors, rowErrorsPresent=None):
    """ Runs a validation test

    Args:
        jobId: ID of job for this validation
        statusId: Expected HTTP status code for this test
        statusName: Expected status in job tracker, False if job should not exist
        fileSize: Expected file size of error report, False if error report
            should not exist
        stagingRows: Expected number of rows in staging table, False if
            table should not exist
        errorStatus: Expected status in file table of error DB, False if
            file object should not exist
        numErrors: Expected number of errors
        rowErrorsPresent: Checks flag for whether row errors occurred,
            None to skip the check

    Returns:
        The validation response, for further inspection by the caller.
    """
    response = self.validateJob(jobId, self.useThreads)
    jobTracker = self.jobTracker
    stagingDb = self.stagingDb
    self.assertEqual(response.status_code, statusId,
                     msg="{}".format(self.getResponseInfo(response)))
    # Identity comparison: only the literal False sentinel skips the check
    # (the old "!= False" form would also skip for a statusName of 0)
    if statusName is not False:
        self.waitOnJob(jobTracker, jobId, statusName, self.useThreads)
        self.assertEqual(jobTracker.getJobStatus(jobId),
                         jobTracker.getJobStatusId(statusName))
    self.assertEqual(response.headers.get("Content-Type"), "application/json")
    tableName = response.json["table"]
    # stagingRows == 0 means "table exists with zero rows"; only the
    # literal False sentinel means "table should not exist"
    if stagingRows is False:
        self.assertFalse(stagingDb.tableExists(tableName))
    else:
        self.assertTrue(stagingDb.tableExists(tableName))
        self.assertEqual(stagingDb.countRows(tableName), stagingRows)
        # Check that field name map table is populated
        fieldMap = self.interfaces.stagingDb.getFieldNameMap(tableName)
        self.assertIsNotNone(fieldMap)
    errorInterface = self.errorInterface
    if errorStatus is not False:
        self.assertEqual(errorInterface.checkFileStatusByJobId(jobId),
                         errorInterface.getFileStatusId(errorStatus))
        self.assertEqual(errorInterface.checkNumberOfErrorsByJobId(jobId),
                         numErrors)
    if fileSize is not False:
        # Look up the report path once (was fetched three times) and the
        # size once (was fetched twice per branch)
        reportPath = jobTracker.getReportPath(jobId)
        if self.local:
            actualSize = os.path.getsize(
                "".join([self.local_file_directory, reportPath]))
        else:
            actualSize = s3UrlHandler.getFileSize("errors/" + reportPath)
        self.assertGreater(actualSize, fileSize - 5)
        self.assertLess(actualSize, fileSize + 5)
    # Check if errors_present is set correctly
    if rowErrorsPresent is not None:  # If no value provided, skip this check
        self.assertEqual(self.interfaces.errorDb.getRowErrorsPresent(jobId),
                         rowErrorsPresent)
    return response