def validate_job(self, request):
    """ Gets file for job, validates each row, and sends valid rows to a staging table

        Args:
            request: HTTP request containing the jobId

        Returns:
            Http response object
    """
    # Create connection to job tracker database
    sess = GlobalDB.db().session

    requestDict = RequestDictionary(request)
    if requestDict.exists('job_id'):
        job_id = requestDict.getValue('job_id')
    else:
        # Request does not have a job ID, can't validate
        validation_error_type = ValidationError.jobError
        raise ResponseException('No job ID specified in request', StatusCode.CLIENT_ERROR, None,
                                validation_error_type)

    # Get the job
    job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
    if job is None:
        validation_error_type = ValidationError.jobError
        writeFileError(job_id, None, validation_error_type)
        raise ResponseException('Job ID {} not found in database'.format(job_id), StatusCode.CLIENT_ERROR, None,
                                validation_error_type)

    # Make sure job's prerequisites are complete
    if not run_job_checks(job_id):
        validation_error_type = ValidationError.jobError
        writeFileError(job_id, None, validation_error_type)
        raise ResponseException('Prerequisites for Job ID {} are not complete'.format(job_id),
                                StatusCode.CLIENT_ERROR, None, validation_error_type)

    # Make sure this is a validation job
    if job.job_type.name in ('csv_record_validation', 'validation'):
        job_type_name = job.job_type.name
    else:
        validation_error_type = ValidationError.jobError
        writeFileError(job_id, None, validation_error_type)
        raise ResponseException(
            'Job ID {} is not a validation job (job type is {})'.format(job_id, job.job_type.name),
            StatusCode.CLIENT_ERROR, None, validation_error_type)

    # set job status to running and do validations
    mark_job_status(job_id, "running")
    if job_type_name == 'csv_record_validation':
        self.runValidation(job)
    elif job_type_name == 'validation':
        self.runCrossValidation(job)
    else:
        raise ResponseException("Bad job type for validator", StatusCode.INTERNAL_ERROR)

    return JsonResponse.create(StatusCode.OK, {"message": "Validation complete"})
def start_a_generation(job, start_date, end_date, agency_code):
    """ Validates the start and end dates of the generation and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_code: Agency code for A file generations
    """
    if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
        raise ResponseException("Start or end date cannot be parsed into a date of format MM/DD/YYYY",
                                StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    mark_job_status(job.job_id, "waiting")

    file_type = job.file_type.letter_name
    log_data = {'message': 'Sending {} file generation job {} to Validator in SQS'.format(file_type, job.job_id),
                'message_type': 'BrokerInfo', 'job_id': job.job_id, 'file_type': file_type}
    logger.info(log_data)

    # Set SQS message attributes
    message_attr = {'agency_code': {'DataType': 'String', 'StringValue': agency_code}}

    # Add job_id to the SQS job queue
    queue = sqs_queue()
    msg_response = queue.send_message(MessageBody=str(job.job_id), MessageAttributes=message_attr)

    log_data['message'] = 'SQS message response: {}'.format(msg_response)
    logger.debug(log_data)
def start_e_f_generation(job):
    """ Passes the Job ID for an E or F generation Job to SQS

        Args:
            job: File generation job to start
    """
    mark_job_status(job.job_id, "waiting")

    file_type = job.file_type.letter_name
    log_data = {'message': 'Sending {} file generation job {} to Validator in SQS'.format(file_type, job.job_id),
                'message_type': 'BrokerInfo', 'submission_id': job.submission_id, 'job_id': job.job_id,
                'file_type': file_type}
    logger.info(log_data)

    # Add job_id to the SQS job queue
    queue = sqs_queue()
    msg_response = queue.send_message(MessageBody=str(job.job_id), MessageAttributes={})

    log_data['message'] = 'SQS message response: {}'.format(msg_response)
    logger.debug(log_data)
def generate_file(self, agency_code=None):
    """ Generates a file based on the FileGeneration object and updates any Jobs referencing it """
    raw_filename = (GEN_FILENAMES[self.file_type] if not self.file_generation else
                    GEN_FILENAMES[self.file_type].format(self.file_generation.agency_type))
    file_name = S3Handler.get_timestamped_filename(raw_filename)
    if self.is_local:
        file_path = "".join([CONFIG_BROKER['broker_files'], file_name])
    else:
        file_path = "".join(["None/", file_name])

    # Generate the file and upload to S3
    log_data = {'message': 'Finished file {} generation'.format(self.file_type), 'message_type': 'ValidatorInfo',
                'file_type': self.file_type, 'file_path': file_path}
    if self.file_generation:
        self.generate_d_file(file_path)

        log_data.update({
            'agency_code': self.file_generation.agency_code, 'agency_type': self.file_generation.agency_type,
            'start_date': self.file_generation.start_date, 'end_date': self.file_generation.end_date,
            'file_generation_id': self.file_generation.file_generation_id
        })
    elif self.job.file_type.letter_name in ['A', 'E', 'F']:
        log_data['job_id'] = self.job.job_id
        mark_job_status(self.job.job_id, 'running')

        if self.job.file_type.letter_name == 'A':
            if not agency_code:
                raise ResponseException('Agency code not provided for an A file generation')

            self.generate_a_file(agency_code, file_path)
        else:
            # Call self.generate_%s_file() where %s is e or f based on the Job's file_type
            file_type_lower = self.job.file_type.letter_name.lower()
            getattr(self, 'generate_%s_file' % file_type_lower)()

        mark_job_status(self.job.job_id, 'finished')
    else:
        e = 'No FileGeneration object for D file generation.' if self.file_type in ['D1', 'D2'] else \
            'Cannot generate file for {} file type.'.format(self.file_type if self.file_type else 'empty')
        raise ResponseException(e)

    logger.info(log_data)
def generate_from_job(self):
    """ Generates a file for a specified job """
    # Mark Job as running
    mark_job_status(self.job.job_id, 'running')

    # Ensure this is a file generation job
    job_type = self.job.job_type.name
    if job_type != 'file_upload':
        raise ResponseException(
            'Job ID {} is not a file generation job (job type is {})'.format(self.job.job_id, job_type),
            StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

    # Ensure there is an available agency_code
    if not self.agency_code:
        raise ResponseException('An agency_code must be provided to generate a file',
                                StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

    # Retrieve any FileRequest that may have started since the Broker sent the request to SQS
    skip_generation = None
    if self.job.file_type.letter_name in ['D1', 'D2']:
        skip_generation = retrieve_cached_file_request(self.job, self.agency_type, self.agency_code, self.is_local)

    if not skip_generation:
        # Generate timestamped file names
        raw_filename = CONFIG_BROKER["".join([str(self.job.file_type.name), "_file_name"])]
        self.job.original_filename = S3Handler.get_timestamped_filename(raw_filename)
        if self.is_local:
            self.job.filename = "".join([CONFIG_BROKER['broker_files'], self.job.original_filename])
        else:
            self.job.filename = "".join([str(self.job.submission_id), "/", self.job.original_filename])
        self.sess.commit()

        # Generate the file, and upload to S3
        if self.job.file_type.letter_name in ['D1', 'D2']:
            # Update the validation Job if necessary
            update_validation_job_info(self.sess, self.job)

            self.generate_d_file()
        elif self.job.file_type.letter_name == 'A':
            self.generate_a_file()
        elif self.job.file_type.letter_name == 'E':
            self.generate_e_file()
        else:
            self.generate_f_file()

        mark_job_status(self.job.job_id, 'finished')

    logger.info({
        'message': 'Finished file {} generation'.format(self.job.file_type.letter_name),
        'message_type': 'ValidatorInfo', 'job_id': self.job.job_id, 'agency_code': self.agency_code,
        'file_type': self.job.file_type.letter_name, 'start_date': self.job.start_date,
        'end_date': self.job.end_date, 'filename': self.job.original_filename
    })
def generate_from_job(self, job_id, agency_code):
    """ Generates a file for a specified job

        Args:
            job_id: ID of the upload Job
            agency_code: FREC or CGAC code to generate data from
    """
    mark_job_status(job_id, 'running')

    with job_context(job_id, self.is_local) as context:
        sess, job = context

        # Ensure this is a file generation job
        if job.job_type.name != 'file_upload':
            raise ResponseException(
                'Job ID {} is not a file generation job (job type is {})'.format(job.job_id, job.job_type.name),
                StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

        # Ensure there is an available agency_code
        if not agency_code:
            if job.submission_id:
                agency_code = job.submission.frec_code if job.submission.frec_code else job.submission.cgac_code
            else:
                raise ResponseException('An agency_code must be provided to generate a file',
                                        StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

        # Generate timestamped file names
        old_filename = job.original_filename
        job.original_filename = S3Handler.get_timestamped_filename(
            CONFIG_BROKER["".join([str(job.file_type.name), "_file_name"])])
        if self.is_local:
            job.filename = "".join([CONFIG_BROKER['broker_files'], job.original_filename])
        else:
            job.filename = "".join([str(job.submission_id), "/", job.original_filename])

        # Generate the file and upload to S3
        if job.file_type.letter_name in ['D1', 'D2']:
            # Update the validation Job if necessary
            if job.submission_id:
                self.update_validation_job_info(job)

            generate_d_file(sess, job, agency_code, self.is_local, old_filename)
        elif job.file_type.letter_name == 'E':
            generate_e_file(sess, job, self.is_local)
        else:
            generate_f_file(sess, job, self.is_local)
def validate_job(self, job_id):
    """ Gets file for job, validates each row, and sends valid rows to a staging table

        Args:
            job_id: ID of the validation job to run

        Returns:
            Http response object
    """
    # Create connection to job tracker database
    sess = GlobalDB.db().session

    # Get the job
    job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
    if job is None:
        validation_error_type = ValidationError.jobError
        write_file_error(job_id, None, validation_error_type)
        raise ResponseException('Job ID {} not found in database'.format(job_id), StatusCode.CLIENT_ERROR, None,
                                validation_error_type)

    # Make sure job's prerequisites are complete
    if not run_job_checks(job_id):
        validation_error_type = ValidationError.jobError
        write_file_error(job_id, None, validation_error_type)
        raise ResponseException('Prerequisites for Job ID {} are not complete'.format(job_id),
                                StatusCode.CLIENT_ERROR, None, validation_error_type)

    # Make sure this is a validation job
    if job.job_type.name in ('csv_record_validation', 'validation'):
        job_type_name = job.job_type.name
    else:
        validation_error_type = ValidationError.jobError
        write_file_error(job_id, None, validation_error_type)
        raise ResponseException(
            'Job ID {} is not a validation job (job type is {})'.format(job_id, job.job_type.name),
            StatusCode.CLIENT_ERROR, None, validation_error_type)

    # set job status to running and do validations
    mark_job_status(job_id, "running")
    if job_type_name == 'csv_record_validation':
        self.run_validation(job)
    elif job_type_name == 'validation':
        self.run_cross_validation(job)
    else:
        raise ResponseException("Bad job type for validator", StatusCode.INTERNAL_ERROR)

    # Update last validated date
    job.last_validated = datetime.utcnow()
    sess.commit()
    return JsonResponse.create(StatusCode.OK, {"message": "Validation complete"})
def job_context(job_id, is_local=True):
    """ Common context for files D1, D2, E, and F generation. Handles marking the job finished and/or failed """
    # Flask context ensures we have access to global.g
    with Flask(__name__).app_context():
        sess = GlobalDB.db().session
        try:
            yield sess
            logger.info({
                'message': 'Marking job {} as finished'.format(job_id), 'message_type': 'BrokerInfo',
                'job_id': job_id
            })
            mark_job_status(job_id, "finished")
        except Exception as e:
            # logger.exception() automatically adds traceback info
            logger.exception({
                'message': 'Marking job {} as failed'.format(job_id), 'message_type': 'BrokerException',
                'job_id': job_id, 'exception': str(e)
            })

            job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
            if job:
                # mark job as failed
                job.error_message = str(e)
                sess.commit()
                mark_job_status(job_id, "failed")

                # ensure FileRequest from failed job is not cached
                file_request = sess.query(FileRequest).filter_by(job_id=job_id).one_or_none()
                if file_request and file_request.is_cached_file:
                    file_request.is_cached_file = False
                    sess.commit()
        finally:
            file_request = sess.query(FileRequest).filter_by(job_id=job_id).one_or_none()
            if file_request and file_request.is_cached_file:
                # copy job data to all child FileRequests
                child_requests = sess.query(FileRequest).filter_by(parent_job_id=job_id).all()
                file_type = FILE_TYPE_DICT_LETTER[file_request.job.file_type_id]
                for child in child_requests:
                    copy_parent_file_request_data(sess, child.job, file_request.job, file_type, is_local)

            GlobalDB.close()
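# Note: job_context is a generator that yields the database session, so it is presumably wrapped with
# contextlib.contextmanager somewhere in the surrounding module (the decorator is not shown in this snippet).
# A minimal usage sketch under that assumption; the wrapper name, example function, and generate_e_file call
# are illustrative only, not part of the original code.
from contextlib import contextmanager

managed_job_context = contextmanager(job_context)  # hypothetical wrapper; the real module may already decorate it


def example_e_file_generation(job_id, is_local=True):
    # Work done inside the block ends with the job marked "finished"; an exception marks it "failed"
    # and un-caches its FileRequest, per the context manager above.
    with managed_job_context(job_id, is_local) as sess:
        job = sess.query(Job).filter_by(job_id=job_id).one()
        generate_e_file(sess, job, is_local)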
def start_d_generation(job, start_date, end_date, agency_type, agency_code=None):
    """ Validates the start and end dates of the generation, updates the submission's publish status and progress
        (if it's not a detached generation), and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_type: Type of Agency to generate files by: "awarding" or "funding"
            agency_code: Agency code for detached D file generations
    """
    if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
        raise ResponseException("Start or end date cannot be parsed into a date of format MM/DD/YYYY",
                                StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    # Update submission
    if job.submission_id:
        agency_code = update_generation_submission(sess, job)

    mark_job_status(job.job_id, "waiting")

    log_data = {'message': 'Sending {} file generation job {} to SQS'.format(job.file_type.letter_name, job.job_id),
                'message_type': 'BrokerInfo', 'submission_id': job.submission_id, 'job_id': job.job_id,
                'file_type': job.file_type.letter_name}
    logger.info(log_data)

    file_request = retrieve_cached_file_request(job, agency_type, agency_code, g.is_local)
    if file_request:
        log_data['message'] = 'No new file generated, used FileRequest with ID {}'.format(
            file_request.file_request_id)
        logger.info(log_data)
    else:
        # Set SQS message attributes
        message_attr = {'agency_type': {'DataType': 'String', 'StringValue': agency_type}}
        if not job.submission_id:
            message_attr['agency_code'] = {'DataType': 'String', 'StringValue': agency_code}

        # Add job_id to the SQS job queue
        queue = sqs_queue()
        msg_response = queue.send_message(MessageBody=str(job.job_id), MessageAttributes=message_attr)

        log_data['message'] = 'SQS message response: {}'.format(msg_response)
        logger.debug(log_data)
def handle_response_exception(error):
    """ Handle exceptions explicitly raised during validation. """
    logger.error(str(error))

    job = get_current_job()
    if job:
        if job.filename is not None:
            # insert file-level error info to the database
            writeFileError(job.job_id, job.filename, error.errorType, error.extraInfo)
        if error.errorType != ValidationError.jobError:
            # job passed prerequisites for validation, but an error
            # happened somewhere. mark job as 'invalid'
            mark_job_status(job.job_id, 'invalid')
    return JsonResponse.error(error, error.status)
def handle_validation_exception(error):
    """ Handle uncaught exceptions in validation process. """
    logger.error(str(error))

    # csv-specific errors get a different job status and response code
    if isinstance(error, ValueError) or isinstance(error, csv.Error):
        job_status, response_code = 'invalid', 400
    else:
        job_status, response_code = 'failed', 500

    job = get_current_job()
    if job:
        if job.filename is not None:
            writeFileError(job.job_id, job.filename, ValidationError.unknownError)
        mark_job_status(job.job_id, job_status)
    return JsonResponse.error(error, response_code)
def job_context(job_id):
    """ Common context for file E and F generation. Handles marking the job finished and/or failed """
    # Flask context ensures we have access to global.g
    with Flask(__name__).app_context():
        sess = GlobalDB.db().session
        try:
            yield sess
            logger.debug('Marking job as finished')
            mark_job_status(job_id, "finished")
        except Exception as e:
            # logger.exception() automatically adds traceback info
            logger.exception('Job %s failed', job_id)
            job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
            if job:
                job.error_message = str(e)
                sess.commit()
                mark_job_status(job_id, "failed")
        finally:
            GlobalDB.close()
def job_context(task, job_id):
    """ Common context for file E and F generation. Handles marking the job finished and/or failed """
    # Flask context ensures we have access to global.g
    with Flask(__name__).app_context():
        sess = GlobalDB.db().session
        try:
            yield sess
            mark_job_status(job_id, "finished")
        except Exception as e:
            # logger.exception() automatically adds traceback info
            logger.exception('Job %s failed, retrying', job_id)
            try:
                raise task.retry()
            except MaxRetriesExceededError:
                logger.warning('Job %s completely failed', job_id)
                # Log the error
                job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
                if job:
                    job.error_message = str(e)
                    sess.commit()
                    mark_job_status(job_id, "failed")
def copy_parent_file_request_data(child_job, parent_job, is_local=None):
    """ Copy parent FileRequest job data to the child FileRequest job data.

        Args:
            child_job: Job object for the child FileRequest
            parent_job: Job object for the parent FileRequest
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    sess = GlobalDB.db().session

    # Do not edit submissions that have successfully completed
    if child_job.job_status_id == lookups.JOB_STATUS_DICT['finished']:
        return

    # Generate file path for child Job's filename
    filepath = CONFIG_BROKER['broker_files'] if is_local else "{}/".format(str(child_job.submission_id))
    filename = '{}{}'.format(filepath, parent_job.original_filename)

    # Copy parent job's data
    child_job.from_cached = True
    child_job.filename = filename
    child_job.original_filename = parent_job.original_filename
    child_job.number_of_errors = parent_job.number_of_errors
    child_job.number_of_warnings = parent_job.number_of_warnings
    child_job.error_message = parent_job.error_message

    # Change the validation job's file data when within a submission
    if child_job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == child_job.submission_id,
                                         Job.file_type_id == parent_job.file_type_id,
                                         Job.job_type_id == lookups.JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = parent_job.original_filename
    sess.commit()

    copy_file_from_parent_to_child(child_job, parent_job, is_local)

    # Mark job status last so the validation job doesn't start until everything is done
    mark_job_status(child_job.job_id, lookups.JOB_STATUS_DICT_ID[parent_job.job_status_id])
def run_cross_validation(self, job):
    """ Cross file validation job. Test all rules with matching rule_timing. Run each cross-file rule and create
        error report.

        Args:
            job: Current job
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Create File Status object
    create_file_if_needed(job_id)
    # Create list of errors
    error_list = ErrorInterface()

    submission_id = job.submission_id
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']
    job_start = datetime.now()
    logger.info({
        'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job.job_id,
        'action': 'run_cross_validations', 'start': job_start, 'status': 'start'})

    # Delete existing cross file errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # get all cross file rules from db
    cross_file_rules = sess.query(RuleSql).filter_by(rule_cross_file_flag=True)

    # for each cross-file combo, run associated rules and create error report
    for c in get_cross_file_pairs():
        first_file = c[0]
        second_file = c[1]
        combo_rules = cross_file_rules.filter(or_(and_(RuleSql.file_id == first_file.id,
                                                       RuleSql.target_file_id == second_file.id),
                                                  and_(RuleSql.file_id == second_file.id,
                                                       RuleSql.target_file_id == first_file.id)))

        # send combo_rules to validator.crossValidate sql
        failures = cross_validate_sql(combo_rules.all(), submission_id, self.short_to_long_dict, first_file.id,
                                      second_file.id, job)
        # get error file name
        report_filename = self.get_file_name(report_file_name(submission_id, False, first_file.name,
                                                              second_file.name))
        warning_report_filename = self.get_file_name(report_file_name(submission_id, True, first_file.name,
                                                                      second_file.name))

        # loop through failures to create the error report
        with self.get_writer(region_name, bucket_name, report_filename, self.crossFileReportHeaders) as writer, \
                self.get_writer(region_name, bucket_name, warning_report_filename,
                                self.crossFileReportHeaders) as warning_writer:
            for failure in failures:
                if failure[9] == RULE_SEVERITY_DICT['fatal']:
                    writer.write(failure[0:7])
                if failure[9] == RULE_SEVERITY_DICT['warning']:
                    warning_writer.write(failure[0:7])
                error_list.record_row_error(job_id, "cross_file", failure[0], failure[3], failure[5], failure[6],
                                            failure[7], failure[8], severity_id=failure[9])
            # write the last unfinished batch
            writer.finish_batch()
            warning_writer.finish_batch()

    # write all recorded errors to database
    error_list.write_all_row_errors(job_id)
    # Update error info for submission
    populate_job_error_info(job)

    # mark job status as "finished"
    mark_job_status(job_id, "finished")
    job_duration = (datetime.now() - job_start).total_seconds()
    logger.info({
        'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job.job_id,
        'action': 'run_cross_validations', 'status': 'finish', 'start': job_start, 'duration': job_duration})

    # set number of errors and warnings for submission.
    submission = populate_submission_error_info(submission_id)

    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
    # submissions as publishable
    # Publish only if no errors are present
    if submission.number_of_errors == 0:
        submission.publishable = True
    sess.commit()

    # Mark validation complete
    mark_file_complete(job_id)
def start_d_generation(job, start_date, end_date, agency_type, agency_code=None):
    """ Validates the start and end dates of the generation, updates the submission's publish status and progress
        (if it's not a detached generation), and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_type: Type of Agency to generate files by: "awarding" or "funding"
            agency_code: Agency code for detached D file generations
    """
    if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
        raise ResponseException("Start or end date cannot be parsed into a date of format MM/DD/YYYY",
                                StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    # Update submission
    if job.submission_id:
        agency_code = update_generation_submission(sess, job)

    mark_job_status(job.job_id, 'waiting')

    file_generation = retrieve_cached_file_generation(job, agency_type, agency_code)
    if file_generation:
        try:
            copy_file_generation_to_job(job, file_generation, g.is_local)
        except Exception as e:
            logger.error(traceback.format_exc())

            mark_job_status(job.job_id, 'failed')
            job.error_message = str(e)
            sess.commit()
    else:
        # Create new FileGeneration and reset Jobs
        file_generation = FileGeneration(
            request_date=datetime.now().date(), start_date=job.start_date, end_date=job.end_date,
            file_type=job.file_type.letter_name, agency_code=agency_code, agency_type=agency_type,
            is_cached_file=True)
        sess.add(file_generation)
        sess.commit()

        try:
            job.file_generation_id = file_generation.file_generation_id
            sess.commit()
            reset_generation_jobs(sess, job)
            logger.info({'message': 'Sending new FileGeneration {} to SQS'.format(
                             file_generation.file_generation_id),
                         'message_type': 'BrokerInfo', 'file_type': job.file_type.letter_name,
                         'job_id': job.job_id, 'submission_id': job.submission_id,
                         'file_generation_id': file_generation.file_generation_id})

            # Add file_generation_id to the SQS job queue
            queue = sqs_queue()
            message_attr = {"validation_type": {"DataType": "String", "StringValue": "generation"}}
            queue.send_message(MessageBody=str(file_generation.file_generation_id),
                               MessageAttributes=message_attr)
        except Exception as e:
            logger.error(traceback.format_exc())

            mark_job_status(job.job_id, 'failed')
            job.error_message = str(e)
            file_generation.is_cached_file = False
            sess.commit()
def validator_process_job(job_id, agency_code, is_retry=False):
    """ Retrieves a Job based on its ID, and kicks off a validation. Handles errors by ensuring the Job (if exists)
        is no longer running.

        Args:
            job_id: ID of a Job
            agency_code: CGAC or FREC code for agency, only required for file generations by Job
            is_retry: If this is not the very first time handling execution of this job. If True, cleanup is
                performed before proceeding to retry the job

        Raises:
            Any Exceptions raised by the GenerationManager or ValidationManager, excluding those explicitly handled
    """
    if is_retry:
        if cleanup_validation(job_id):
            log_job_message(
                logger=logger,
                message="Attempting a retry of {} after successful retry-cleanup.".format(inspect.stack()[0][3]),
                job_id=job_id,
                is_debug=True)
        else:
            log_job_message(
                logger=logger,
                message="Retry of {} found to be not necessary after cleanup. "
                        "Returning from job with success.".format(inspect.stack()[0][3]),
                job_id=job_id,
                is_debug=True)
            return

    sess = GlobalDB.db().session
    job = None
    try:
        # Get the job
        job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
        if job is None:
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException('Job ID {} not found in database'.format(job_id), StatusCode.CLIENT_ERROR,
                                    None, validation_error_type)

        mark_job_status(job_id, 'ready')

        # We can either validate or generate a file based on Job ID
        if job.job_type.name == 'file_upload':
            # Generate A, E, or F file
            file_generation_manager = FileGenerationManager(sess, g.is_local, job=job)
            file_generation_manager.generate_file(agency_code)
        else:
            # Run validations
            validation_manager = ValidationManager(g.is_local, CONFIG_SERVICES['error_report_path'])
            validation_manager.validate_job(job.job_id)
    except (ResponseException, csv.Error, UnicodeDecodeError, ValueError) as e:
        # Handle exceptions explicitly raised during validation
        error_data = {
            'message': 'An exception occurred in the Validator',
            'message_type': 'ValidatorInfo',
            'job_id': job_id,
            'traceback': traceback.format_exc()
        }

        if job:
            error_data.update({'submission_id': job.submission_id, 'file_type': job.file_type.name})
            logger.error(error_data)

            sess.refresh(job)
            job.error_message = str(e)
            if job.filename is not None:
                error_type = ValidationError.unknownError
                if isinstance(e, UnicodeDecodeError):
                    error_type = ValidationError.encodingError
                elif isinstance(e, ResponseException):
                    error_type = e.errorType

                write_file_error(job.job_id, job.filename, error_type)

            mark_job_status(job.job_id, 'invalid')
        else:
            logger.error(error_data)
            raise e
    except Exception as e:
        # Log uncaught exceptions and fail the Job
        error_data = {
            'message': 'An unhandled exception occurred in the Validator',
            'message_type': 'ValidatorInfo',
            'job_id': job_id,
            'traceback': traceback.format_exc()
        }
        if job:
            error_data.update({'submission_id': job.submission_id, 'file_type': job.file_type.name})
        logger.error(error_data)

        # Try to mark the Job as failed, but continue raising the original Exception if not possible
        try:
            mark_job_status(job_id, 'failed')

            sess.refresh(job)
            job.error_message = str(e)
            sess.commit()
        except:
            pass

        raise e
def run_cross_validation(self, job):
    """ Cross file validation job. Test all rules with matching rule_timing. Run each cross-file rule and create
        error report.

        Args:
            job: Current job
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Create File Status object
    create_file_if_needed(job_id)
    # Create list of errors
    error_list = ErrorInterface()

    submission_id = job.submission_id
    job_start = datetime.now()
    logger.info({
        'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job.job_id,
        'action': 'run_cross_validations', 'start': job_start, 'status': 'start'
    })

    # Delete existing cross file errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # get all cross file rules from db
    cross_file_rules = sess.query(RuleSql).filter_by(rule_cross_file_flag=True)

    # for each cross-file combo, run associated rules and create error report
    for c in get_cross_file_pairs():
        first_file = c[0]
        second_file = c[1]
        combo_rules = cross_file_rules.filter(or_(and_(RuleSql.file_id == first_file.id,
                                                       RuleSql.target_file_id == second_file.id),
                                                  and_(RuleSql.file_id == second_file.id,
                                                       RuleSql.target_file_id == first_file.id)))

        # get error file name/path
        error_file_name = report_file_name(submission_id, False, first_file.name, second_file.name)
        error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
        warning_file_name = report_file_name(submission_id, True, first_file.name, second_file.name)
        warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])

        # open error report and gather failed rules within it
        with open(error_file_path, 'w', newline='') as error_file, \
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

            # write headers to file
            error_csv.writerow(self.crossFileReportHeaders)
            warning_csv.writerow(self.crossFileReportHeaders)

            # send combo_rules to validator.crossValidate sql
            current_cols_short_to_long = self.short_to_long_dict[first_file.id].copy()
            current_cols_short_to_long.update(self.short_to_long_dict[second_file.id].copy())
            cross_validate_sql(combo_rules.all(), submission_id, current_cols_short_to_long, first_file.id,
                               second_file.id, job, error_csv, warning_csv, error_list, job_id)
        # close files
        error_file.close()
        warning_file.close()

        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)),
                                           'w') as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if chunk:
                            writer.write(chunk)
                        else:
                            break
            csv_file.close()
            os.remove(error_file_path)

            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)),
                                           'w') as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if chunk:
                            warning_writer.write(chunk)
                        else:
                            break
            warning_csv_file.close()
            os.remove(warning_file_path)

    # write all recorded errors to database
    error_list.write_all_row_errors(job_id)
    # Update error info for submission
    populate_job_error_info(job)

    # mark job status as "finished"
    mark_job_status(job_id, "finished")
    job_duration = (datetime.now() - job_start).total_seconds()
    logger.info({
        'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job.job_id,
        'action': 'run_cross_validations', 'status': 'finish', 'start': job_start, 'duration': job_duration
    })

    # set number of errors and warnings for submission.
    submission = populate_submission_error_info(submission_id)

    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
    # submissions as publishable
    # Publish only if no errors are present
    if submission.number_of_errors == 0:
        submission.publishable = True
    sess.commit()

    # Mark validation complete
    mark_file_complete(job_id)
def copy_file_generation_to_job(job, file_generation, is_local):
    """ Copy cached FileGeneration data to a Job requesting a file.

        Args:
            job: Job object to copy the data to
            file_generation: Cached FileGeneration object to copy the data from
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    sess = GlobalDB.db().session
    log_data = {'message': 'Copying FileGeneration {} data to Job {}'.format(file_generation.file_generation_id,
                                                                             job.job_id),
                'message_type': 'BrokerInfo', 'job_id': job.job_id, 'file_type': job.file_type.name,
                'file_generation_id': file_generation.file_generation_id}
    logger.info(log_data)

    # Do not edit submissions that have already successfully completed
    sess.refresh(job)
    if job.job_status_id == lookups.JOB_STATUS_DICT['finished']:
        return

    job.file_generation_id = file_generation.file_generation_id

    # File is still being generated, just mark the FileGeneration ID in the Job and wait
    # FileGeneration will update all child Jobs when it finishes
    if not file_generation.file_path:
        sess.commit()
        return

    # Generate file path for child Job's filename
    filepath = CONFIG_BROKER['broker_files'] if g.is_local else "{}/".format(str(job.submission_id))
    original_filename = file_generation.file_path.split('/')[-1]
    filename = '{}{}'.format(filepath, original_filename)

    # Copy parent job's data
    job.filename = filename
    job.original_filename = original_filename
    job.number_of_errors = 0
    job.number_of_warnings = 0

    # Change the validation job's file data when within a submission
    if job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == job.submission_id,
                                         Job.file_type_id == job.file_type_id,
                                         Job.job_type_id == lookups.JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = original_filename

    # Copy the data to the Submission's bucket
    if not g.is_local and file_generation.file_path != job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        bucket = CONFIG_BROKER['aws_bucket']
        response = s3.list_objects_v2(Bucket=bucket, Prefix=job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == job.filename:
                # The file already exists in this location
                log_data['message'] = '{} file already exists in this location: {}; not overwriting.'.format(
                    job.file_type.name, job.filename)
                logger.info(log_data)
                mark_job_status(job.job_id, 'finished')
                return

        S3Handler.copy_file(bucket, bucket, file_generation.file_path, job.filename)
    sess.commit()

    # Mark Job status last so the validation job doesn't start until everything is done
    mark_job_status(job.job_id, 'finished')
def runCrossValidation(self, job):
    """ Cross file validation job, test all rules with matching rule_timing """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Create File Status object
    createFileIfNeeded(job_id)
    error_list = ErrorInterface()

    submission_id = job.submission_id
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']
    _exception_logger.info(
        'VALIDATOR_INFO: Beginning runCrossValidation on submission_id: %s', submission_id)

    # Delete existing cross file errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # get all cross file rules from db
    crossFileRules = sess.query(RuleSql).filter(RuleSql.rule_cross_file_flag == True)

    # for each cross-file combo, run associated rules and create error report
    for c in get_cross_file_pairs():
        first_file = c[0]
        second_file = c[1]
        comboRules = crossFileRules.filter(or_(and_(RuleSql.file_id == first_file.id,
                                                    RuleSql.target_file_id == second_file.id),
                                               and_(RuleSql.file_id == second_file.id,
                                                    RuleSql.target_file_id == first_file.id)))
        # send comboRules to validator.crossValidate sql
        failures = Validator.crossValidateSql(comboRules.all(), submission_id, self.short_to_long_dict)
        # get error file name
        reportFilename = self.getFileName(get_cross_report_name(submission_id, first_file.name, second_file.name))
        warningReportFilename = self.getFileName(get_cross_warning_report_name(submission_id, first_file.name,
                                                                               second_file.name))

        # loop through failures to create the error report
        with self.getWriter(regionName, bucketName, reportFilename, self.crossFileReportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningReportFilename,
                               self.crossFileReportHeaders) as warningWriter:
            for failure in failures:
                if failure[9] == RULE_SEVERITY_DICT['fatal']:
                    writer.write(failure[0:7])
                if failure[9] == RULE_SEVERITY_DICT['warning']:
                    warningWriter.write(failure[0:7])
                error_list.recordRowError(job_id, "cross_file", failure[0], failure[3], failure[5], failure[6],
                                          failure[7], failure[8], severity_id=failure[9])
            writer.finishBatch()
            warningWriter.finishBatch()

    error_list.writeAllRowErrors(job_id)
    mark_job_status(job_id, "finished")
    _exception_logger.info(
        'VALIDATOR_INFO: Completed runCrossValidation on submission_id: %s', submission_id)
    submission = sess.query(Submission).filter_by(submission_id=submission_id).one()
    # Update error info for submission
    submission.number_of_errors = sumNumberOfErrorsForJobList(submission_id)
    submission.number_of_warnings = sumNumberOfErrorsForJobList(submission_id, errorType="warning")
    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
    # submissions as publishable
    # Publish only if no errors are present
    if submission.number_of_errors == 0:
        submission.publishable = True
    sess.commit()

    # Mark validation complete
    markFileComplete(job_id)
def run_app():
    """ Run the application. """
    app = create_app()

    # This is for DataDog (Do Not Delete)
    if USE_DATADOG:
        TraceMiddleware(app, tracer, service="broker-dd", distributed_tracing=False)

    with app.app_context():
        current_app.debug = CONFIG_SERVICES['debug']
        local = CONFIG_BROKER['local']
        g.is_local = local
        error_report_path = CONFIG_SERVICES['error_report_path']
        current_app.config.from_object(__name__)

        # Create connection to job tracker database
        sess = GlobalDB.db().session

        # Future: Override config w/ environment variable, if set
        current_app.config.from_envvar('VALIDATOR_SETTINGS', silent=True)

        queue = sqs_queue()
        messages = []

        logger.info("Starting SQS polling")
        while True:
            # Set current_message to None before every loop to ensure it's never set to the previous message
            current_message = None
            try:
                # Grabs one (or more) messages from the queue
                messages = queue.receive_messages(WaitTimeSeconds=10, MessageAttributeNames=['All'])
                for message in messages:
                    logger.info("Message received: %s", message.body)

                    # Retrieve the job_id from the message body
                    current_message = message
                    g.job_id = message.body
                    mark_job_status(g.job_id, "ready")

                    # Get the job
                    job = sess.query(Job).filter_by(job_id=g.job_id).one_or_none()
                    if job is None:
                        validation_error_type = ValidationError.jobError
                        write_file_error(g.job_id, None, validation_error_type)
                        raise ResponseException('Job ID {} not found in database'.format(g.job_id),
                                                StatusCode.CLIENT_ERROR, None, validation_error_type)

                    # We have two major functionalities in the Validator: validation and file generation
                    if (not job.file_type or job.file_type.letter_name in ['A', 'B', 'C', 'FABS'] or
                            job.job_type.name != 'file_upload') and job.submission_id:
                        # Run validations
                        validation_manager = ValidationManager(local, error_report_path)
                        validation_manager.validate_job(job.job_id)
                    else:
                        # Retrieve the agency code data from the message attributes
                        msg_attr = current_message.message_attributes
                        agency_code = msg_attr['agency_code']['StringValue'] if msg_attr and \
                            msg_attr.get('agency_code') else None
                        agency_type = msg_attr['agency_type']['StringValue'] if msg_attr and \
                            msg_attr.get('agency_type') else None

                        file_generation_manager = FileGenerationManager(job, agency_code, agency_type, local)
                        file_generation_manager.generate_from_job()
                        sess.commit()
                        sess.refresh(job)

                    # Delete from SQS once processed
                    message.delete()
            except ResponseException as e:
                # Handle exceptions explicitly raised during validation.
                logger.error(traceback.format_exc())

                job = get_current_job()
                if job:
                    if job.filename is not None:
                        # Insert file-level error info to the database
                        write_file_error(job.job_id, job.filename, e.errorType, e.extraInfo)
                    if e.errorType != ValidationError.jobError:
                        # Job passed prerequisites for validation but an error happened somewhere:
                        # mark job as 'invalid'
                        mark_job_status(job.job_id, 'invalid')

                if current_message:
                    if e.errorType in [ValidationError.rowCountError, ValidationError.headerError,
                                       ValidationError.fileTypeError]:
                        current_message.delete()
            except Exception as e:
                # Handle uncaught exceptions in validation process.
                logger.error(traceback.format_exc())

                # csv-specific errors get a different job status and response code
                if isinstance(e, ValueError) or isinstance(e, csv.Error) or isinstance(e, UnicodeDecodeError):
                    job_status = 'invalid'
                else:
                    job_status = 'failed'

                job = get_current_job()
                if job:
                    if job.filename is not None:
                        error_type = ValidationError.unknownError
                        if isinstance(e, UnicodeDecodeError):
                            error_type = ValidationError.encodingError
                            # TODO Is this really the only case where the message should be deleted?
                            if current_message:
                                current_message.delete()
                        write_file_error(job.job_id, job.filename, error_type)
                    mark_job_status(job.job_id, job_status)
            finally:
                GlobalDB.close()

                # Set visibility to 0 so that another attempt can be made to process in SQS immediately,
                # instead of waiting for the timeout window to expire
                for message in messages:
                    try:
                        message.change_visibility(VisibilityTimeout=0)
                    except ClientError:
                        # Deleted messages will throw errors, which is fine because they are handled
                        pass
def generate_file(submission, file_type, start, end, agency_type):
    """ Start a file generation job for the specified file type within a submission

        Args:
            submission: submission for which we're generating the file
            file_type: type of file to generate the job for
            start: the start date for the file to generate
            end: the end date for the file to generate
            agency_type: The type of agency (awarding or funding) to generate the file for (only used for D file
                generation)

        Returns:
            Results of check_generation or JsonResponse object containing an error if the prerequisite job isn't
            complete.
    """
    error_message = None
    # submission is a FABS submission
    if submission.d2_submission:
        error_message = "Cannot generate files for FABS submissions."
    elif file_type in ['D1', 'D2']:
        # D file generation requires start and end date
        if not start or not end:
            error_message = "Must have a start and end date for D file generation."
        # D files can only be generated by awarding or funding agency
        elif agency_type not in ['awarding', 'funding']:
            error_message = "agency_type must be either awarding or funding for D file generation."
    # Only D1, D2, E, and F files can be generated
    elif file_type not in ['E', 'F']:
        error_message = "File type must be either D1, D2, E, or F"

    # Return any client errors
    if error_message:
        return JsonResponse.error(ValueError(error_message), StatusCode.CLIENT_ERROR)

    sess = GlobalDB.db().session
    job = sess.query(Job).filter(Job.submission_id == submission.submission_id,
                                 Job.file_type_id == lookups.FILE_TYPE_DICT_LETTER_ID[file_type],
                                 Job.job_type_id == lookups.JOB_TYPE_DICT['file_upload']).one()

    log_data = {'message': 'Starting {} file generation within submission {}'.format(file_type,
                                                                                     submission.submission_id),
                'message_type': 'BrokerInfo', 'submission_id': submission.submission_id, 'job_id': job.job_id,
                'file_type': file_type}
    logger.info(log_data)

    # Check prerequisites on upload job
    if not generation_helper.check_generation_prereqs(submission.submission_id, file_type):
        return JsonResponse.error(ResponseException("Must wait for completion of prerequisite validation job",
                                                    StatusCode.CLIENT_ERROR), StatusCode.CLIENT_ERROR)
    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(job, start, end, agency_type)
        else:
            generation_helper.start_e_f_generation(job)
    except Exception as e:
        mark_job_status(job.job_id, 'failed')
        job.error_message = str(e)
        sess.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as check generation route
    return check_generation(submission, file_type)
def generate_detached_file(file_type, cgac_code, frec_code, start, end, quarter, agency_type):
    """ Start a file generation job for the specified file type not connected to a submission

        Args:
            file_type: type of file to be generated
            cgac_code: the code of a CGAC agency if generating for a CGAC agency
            frec_code: the code of a FREC agency if generating for a FREC agency
            start: start date in a string, formatted MM/DD/YYYY
            end: end date in a string, formatted MM/DD/YYYY
            quarter: quarter to generate for, formatted Q#/YYYY
            agency_type: The type of agency (awarding or funding) to generate the file for

        Returns:
            JSONResponse object with keys job_id, status, file_type, url, message, start, and end.

        Raises:
            ResponseException: if the start and end Strings cannot be parsed into dates
    """
    # Make sure it's a valid request
    if not cgac_code and not frec_code:
        return JsonResponse.error(ValueError("Detached file generation requires CGAC or FR Entity Code"),
                                  StatusCode.CLIENT_ERROR)

    if file_type in ['D1', 'D2']:
        # Make sure we have a start and end date for D1/D2 generation
        if not start or not end:
            return JsonResponse.error(ValueError("Must have a start and end date for D file generation."),
                                      StatusCode.CLIENT_ERROR)

        # Check if date format is MM/DD/YYYY
        if not (StringCleaner.is_date(start) and StringCleaner.is_date(end)):
            raise ResponseException('Start or end date cannot be parsed into a date', StatusCode.CLIENT_ERROR)

        if agency_type not in ('awarding', 'funding'):
            return JsonResponse.error(ValueError("agency_type must be either awarding or funding."),
                                      StatusCode.CLIENT_ERROR)
    else:
        # Make sure a quarter (formatted Q#/YYYY) was provided for A file generation
        if not quarter:
            return JsonResponse.error(ValueError("Must have a quarter for A file generation."),
                                      StatusCode.CLIENT_ERROR)

        try:
            start, end = generic_helper.quarter_to_dates(quarter)
        except ResponseException as e:
            return JsonResponse.error(e, StatusCode.CLIENT_ERROR)

    # Add job info
    file_type_name = lookups.FILE_TYPE_DICT_LETTER_NAME[file_type]
    new_job = generation_helper.add_generation_job_info(file_type_name=file_type_name, start_date=start,
                                                        end_date=end)

    agency_code = frec_code if frec_code else cgac_code
    log_data = {'message': 'Starting detached {} file generation'.format(file_type), 'message_type': 'BrokerInfo',
                'job_id': new_job.job_id, 'file_type': file_type, 'agency_code': agency_code, 'start_date': start,
                'end_date': end}
    logger.info(log_data)

    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(new_job, start, end, agency_type, agency_code=agency_code)
        else:
            generation_helper.start_a_generation(new_job, start, end, agency_code)
    except Exception as e:
        mark_job_status(new_job.job_id, 'failed')
        new_job.error_message = str(e)
        GlobalDB.db().session.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as check generation route
    return check_detached_generation(new_job.job_id)
def generate_detached_file(file_type, cgac_code, frec_code, start_date, end_date, year, period, agency_type):
    """ Start a file generation job for the specified file type not connected to a submission

        Args:
            file_type: type of file to be generated
            cgac_code: the code of a CGAC agency if generating for a CGAC agency
            frec_code: the code of a FREC agency if generating for a FREC agency
            start_date: start date in a string, formatted MM/DD/YYYY
            end_date: end date in a string, formatted MM/DD/YYYY
            year: year to generate for, integer 4 digits
            period: period to generate for, integer (2-12)
            agency_type: The type of agency (awarding or funding) to generate the file for

        Returns:
            JsonResponse object with keys job_id, status, file_type, url, message, start_date, and end_date.

        Raises:
            ResponseException: if the start_date and end_date Strings cannot be parsed into dates
    """
    # Make sure it's a valid request
    if not cgac_code and not frec_code:
        return JsonResponse.error(ValueError("Detached file generation requires CGAC or FR Entity Code"),
                                  StatusCode.CLIENT_ERROR)

    if file_type in ['D1', 'D2']:
        # Make sure we have a start and end date for D1/D2 generation
        if not start_date or not end_date:
            return JsonResponse.error(ValueError("Must have a start and end date for D file generation."),
                                      StatusCode.CLIENT_ERROR)

        # Check if date format is MM/DD/YYYY
        if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
            raise ResponseException('Start or end date cannot be parsed into a date', StatusCode.CLIENT_ERROR)

        if agency_type not in ('awarding', 'funding'):
            return JsonResponse.error(ValueError("agency_type must be either awarding or funding."),
                                      StatusCode.CLIENT_ERROR)
    else:
        # Make sure both year and period are provided
        if not (year and period):
            return JsonResponse.error(ValueError("Must have a year and period for A file generation."),
                                      StatusCode.CLIENT_ERROR)
        try:
            # Convert to real start and end dates
            start_date, end_date = generic_helper.year_period_to_dates(year, period)
        except ResponseException as e:
            return JsonResponse.error(e, StatusCode.CLIENT_ERROR)

    # Add job info
    file_type_name = lookups.FILE_TYPE_DICT_LETTER_NAME[file_type]
    new_job = generation_helper.create_generation_job(file_type_name, start_date, end_date)

    agency_code = frec_code if frec_code else cgac_code
    logger.info({'message': 'Starting detached {} file generation'.format(file_type), 'message_type': 'BrokerInfo',
                 'job_id': new_job.job_id, 'file_type': file_type, 'agency_code': agency_code,
                 'start_date': start_date, 'end_date': end_date})

    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(new_job, start_date, end_date, agency_type,
                                                 agency_code=agency_code)
        else:
            generation_helper.start_a_generation(new_job, start_date, end_date, agency_code)
    except Exception as e:
        mark_job_status(new_job.job_id, 'failed')
        new_job.error_message = str(e)
        GlobalDB.db().session.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as the check generation route
    return check_detached_generation(new_job.job_id)
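# For A file generation the year/period pair is converted into a concrete date range by
# generic_helper.year_period_to_dates. The sketch below only illustrates that kind of conversion; it assumes
# period 1 maps to October of the prior calendar year (the federal fiscal year starts in October) and returns the
# calendar bounds of that one month. It is not the helper's actual contract.
import calendar
from datetime import date


def period_to_month_range(year, period):
    """ Return (start, end) dates of the fiscal period's month, where period 1 = October of year - 1. """
    month = (period + 8) % 12 + 1          # period 1 -> 10 (Oct), period 4 -> 1 (Jan), period 12 -> 9 (Sep)
    calendar_year = year - 1 if month >= 10 else year
    last_day = calendar.monthrange(calendar_year, month)[1]
    return date(calendar_year, month, 1), date(calendar_year, month, last_day)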
def copy_parent_file_request_data(sess, child_job, parent_job, is_local):
    """ Copy parent FileRequest job data to the child FileRequest job data.

        Args:
            sess: current DB session
            child_job: Job object for the child FileRequest
            parent_job: Job object for the parent FileRequest
            is_local: True if in local development, False otherwise
    """
    file_type = parent_job.file_type.letter_name
    log_data = {'message': 'Copying data from parent job with job_id:{}'.format(parent_job.job_id),
                'message_type': 'ValidatorInfo', 'job_id': child_job.job_id, 'file_type': parent_job.file_type.name}

    # Keep path but update file name
    filename = '{}/{}'.format(child_job.filename.rsplit('/', 1)[0], parent_job.original_filename)

    # Copy parent job's data
    child_job.from_cached = True
    child_job.filename = filename
    child_job.original_filename = parent_job.original_filename
    child_job.number_of_errors = parent_job.number_of_errors
    child_job.number_of_warnings = parent_job.number_of_warnings
    child_job.error_message = parent_job.error_message

    # Change the validation job's file data when within a submission
    if child_job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == child_job.submission_id,
                                         Job.file_type_id == parent_job.file_type_id,
                                         Job.job_type_id == JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = parent_job.original_filename
    sess.commit()

    if not is_local and parent_job.filename != child_job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        response = s3.list_objects_v2(Bucket=CONFIG_BROKER['aws_bucket'], Prefix=child_job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == child_job.filename:
                # The file already exists in this location
                log_data['message'] = 'Cached {} file CSV already exists in this location'.format(file_type)
                logger.info(log_data)
                return

        # Copy the parent file into the child's S3 location
        log_data['message'] = 'Copying the cached {} file from job {}'.format(file_type, parent_job.job_id)
        logger.info(log_data)

        with smart_open.smart_open(S3Handler.create_file_path(parent_job.filename), 'r') as reader:
            stream_file_to_s3(child_job.filename, reader)

    # Mark job status last so the validation job doesn't start until everything is done
    mark_job_status(child_job.job_id, JOB_STATUS_DICT_ID[parent_job.job_status_id])
def start_d_generation(job, start_date, end_date, agency_type, agency_code=None, file_format='csv'):
    """ Validates the start and end dates of the generation, updates the submission's publish status and progress
        (if it's not a detached generation), and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_type: Type of Agency to generate files by: "awarding" or "funding"
            agency_code: Agency code for detached D file generations
            file_format: determines if the file generated is a txt or a csv
    """
    if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
        raise ResponseException("Start or end date cannot be parsed into a date of format MM/DD/YYYY",
                                StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    # Update submission
    if job.submission_id:
        agency_code = update_generation_submission(sess, job)

    mark_job_status(job.job_id, 'waiting')

    file_generation = retrieve_cached_file_generation(job, agency_type, agency_code, file_format)
    if file_generation:
        try:
            copy_file_generation_to_job(job, file_generation, g.is_local)
        except Exception as e:
            logger.error(traceback.format_exc())

            mark_job_status(job.job_id, 'failed')
            job.error_message = str(e)
            sess.commit()
    else:
        # Create new FileGeneration and reset Jobs
        file_generation = FileGeneration(request_date=datetime.now().date(), start_date=job.start_date,
                                         end_date=job.end_date, file_type=job.file_type.letter_name,
                                         agency_code=agency_code, agency_type=agency_type,
                                         file_format=file_format, is_cached_file=True)
        sess.add(file_generation)
        sess.commit()

        try:
            job.file_generation_id = file_generation.file_generation_id
            sess.commit()
            reset_generation_jobs(sess, job)
            logger.info({'message': 'Sending new FileGeneration {} to SQS'.format(
                             file_generation.file_generation_id),
                         'message_type': 'BrokerInfo', 'file_type': job.file_type.letter_name,
                         'job_id': job.job_id, 'submission_id': job.submission_id,
                         'file_generation_id': file_generation.file_generation_id})

            # Add file_generation_id to the SQS job queue
            queue = sqs_queue()
            message_attr = {"validation_type": {"DataType": "String", "StringValue": "generation"}}
            queue.send_message(MessageBody=str(file_generation.file_generation_id),
                               MessageAttributes=message_attr)
        except Exception as e:
            logger.error(traceback.format_exc())

            mark_job_status(job.job_id, 'failed')
            job.error_message = str(e)
            file_generation.is_cached_file = False
            sess.commit()
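# start_d_generation enqueues only the file_generation_id; the consuming side reads it back along with the
# "validation_type" attribute to decide how to route the message. A minimal sketch of that consumer using plain
# boto3 (the queue URL and region are hypothetical placeholders supplied by the caller, and the hand-off to the
# generation handler is left as a comment):
import boto3


def poll_generation_queue(queue_url, region_name):
    sqs = boto3.client('sqs', region_name=region_name)
    response = sqs.receive_message(QueueUrl=queue_url, MaxNumberOfMessages=1,
                                   MessageAttributeNames=['All'], WaitTimeSeconds=10)
    for message in response.get('Messages', []):
        attrs = message.get('MessageAttributes', {})
        if attrs.get('validation_type', {}).get('StringValue') == 'generation':
            file_generation_id = int(message['Body'])
            # ... hand file_generation_id to the file generation handler here ...
        sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=message['ReceiptHandle'])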
def start_generation_job(job, start_date, end_date, agency_code=None):
    """ Validates the dates for a D file generation job and passes the Job ID to SQS

        Args:
            job: File generation job to start
            start_date: Start date of the file generation
            end_date: End date of the file generation
            agency_code: Agency code for detached D file generations

        Returns:
            Tuple of boolean indicating successful start, and error response if False
    """
    sess = GlobalDB.db().session
    file_type = job.file_type.letter_name
    try:
        if file_type in ['D1', 'D2']:
            # Validate and set Job's start and end dates
            if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
                raise ResponseException("Start or end date cannot be parsed into a date",
                                        StatusCode.CLIENT_ERROR)
            job.start_date = start_date
            job.end_date = end_date
            sess.commit()
        elif file_type not in ["E", "F"]:
            raise ResponseException("File type must be either D1, D2, E or F", StatusCode.CLIENT_ERROR)
    except ResponseException as e:
        return False, JsonResponse.error(e, e.status, file_type=file_type, status='failed')

    mark_job_status(job.job_id, "waiting")

    # Add job_id to the SQS job queue
    logger.info({'message_type': 'ValidatorInfo', 'job_id': job.job_id,
                 'message': 'Sending file generation job {} to Validator in SQS'.format(job.job_id)})
    queue = sqs_queue()

    message_attr = {'agency_code': {'DataType': 'String', 'StringValue': agency_code}} if agency_code else {}
    response = queue.send_message(MessageBody=str(job.job_id), MessageAttributes=message_attr)
    logger.debug({'message_type': 'ValidatorInfo', 'job_id': job.job_id,
                  'message': 'Send message response: {}'.format(response)})

    return True, None
def run_validation(self, job):
    """ Run validations for specified job

        Args:
            job: Job to be validated

        Returns:
            True if successful
    """
    sess = GlobalDB.db().session
    error_list = ErrorInterface()
    job_id = job.job_id
    submission_id = job.submission_id

    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()

    log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(str(submission_id), str(job_id), file_type)
    logger.info({'message': 'Beginning run_validation {}'.format(log_str), 'message_type': 'ValidatorInfo',
                 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type,
                 'action': 'run_validations', 'status': 'start', 'start_time': validation_start})

    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]

    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()

    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()

    # If local, make the error report directory
    if self.is_local and not os.path.exists(self.directory):
        os.makedirs(self.directory)

    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']

    error_file_name = report_file_name(job.submission_id, False, job.file_type.name)
    error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
    warning_file_name = report_file_name(job.submission_id, True, job.file_type.name)
    warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])

    # Create File Status object
    create_file_if_needed(job_id, file_name)

    reader = CsvReader()

    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()

    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()
    for field in fields:
        sess.expunge(field)

    csv_schema = {row.name_short: row for row in fields}

    try:
        extension = os.path.splitext(file_name)[1]
        if not extension or extension.lower() not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)

        # Count file rows: throws a File Level Error for non-UTF8 characters
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass

        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields, bucket_name,
                         self.get_file_name(error_file_name), self.long_to_short_dict[job.file_type_id],
                         is_local=self.is_local)

        # list to keep track of rows that fail validations
        error_rows = []

        # While not done, pull one row and put it into staging table if it passes the Validator
        loading_start = datetime.now()
        logger.info({'message': 'Beginning data loading {}'.format(log_str), 'message_type': 'ValidatorInfo',
                     'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type,
                     'action': 'data_loading', 'status': 'start', 'start_time': loading_start})

        with open(error_file_path, 'w', newline='') as error_file, \
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

            required_list = None
            type_list = None
            if file_type == "fabs":
                # create a list of all required/type labels for FABS
                labels = sess.query(ValidationLabel).all()
                required_list = {}
                type_list = {}
                for label in labels:
                    if label.label_type == "requirement":
                        required_list[label.column_name] = label.label
                    else:
                        type_list[label.column_name] = label.label

            # write headers to file
            error_csv.writerow(self.reportHeaders)
            warning_csv.writerow(self.reportHeaders)
            while not reader.is_finished:
                row_number += 1

                if row_number % 100 == 0:
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({'message': 'Loading row: {} {}'.format(str(row_number), log_str),
                                 'message_type': 'ValidatorInfo', 'submission_id': submission_id,
                                 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading',
                                 'status': 'loading', 'rows_loaded': row_number, 'start_time': loading_start,
                                 'elapsed_time': elapsed_time})

                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, error_csv, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue

                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type == "fabs":
                        record['afa_generated_unique'] = \
                            (record['award_modification_amendme'] or '-none-') + "_" + \
                            (record['awarding_sub_tier_agency_c'] or '-none-') + "_" + \
                            (record['fain'] or '-none-') + "_" + (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(record, csv_schema,
                                                                             file_type == "fabs",
                                                                             required_list, type_list)
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type == "fabs":
                        record["is_valid"] = True

                    model_instance = model(job_id=job_id, submission_id=submission_id,
                                           valid_record=passed_validations, **record)
                    skip_row = not insert_staging_model(model_instance, job, error_csv, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()

                    if skip_row:
                        error_rows.append(row_number)
                        continue

                if not passed_validations:
                    fatal = write_errors(failures, job, self.short_to_long_dict[job.file_type_id], error_csv,
                                         warning_csv, row_number, error_list, flex_cols)
                    if fatal:
                        error_rows.append(row_number)

            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({'message': 'Completed data loading {}'.format(log_str), 'message_type': 'ValidatorInfo',
                         'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type,
                         'action': 'data_loading', 'status': 'finish', 'start_time': loading_start,
                         'end_time': datetime.now(), 'duration': loading_duration, 'total_rows': row_number})

            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict[job.file_type_id],
                                                      error_csv, warning_csv, row_number, error_list)
            error_rows.extend(sql_error_rows)
            error_file.close()
            warning_file.close()

        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)),
                                           'w') as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if chunk:
                            writer.write(chunk)
                        else:
                            break
            csv_file.close()
            os.remove(error_file_path)

            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)),
                                           'w') as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if chunk:
                            warning_writer.write(chunk)
                        else:
                            break
            warning_csv_file.close()
            os.remove(warning_file_path)

        # Calculate total number of rows in file that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)

        # Update fabs is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type == "fabs":
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                       synchronize_session=False)

        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)

        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()

        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)

        if file_type == "fabs":
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)

        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the files always close
        reader.close()

        validation_duration = (datetime.now() - validation_start).total_seconds()
        logger.info({'message': 'Completed run_validation {}'.format(log_str), 'message_type': 'ValidatorInfo',
                     'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type,
                     'action': 'run_validation', 'status': 'finish', 'start_time': validation_start,
                     'end_time': datetime.now(), 'duration': validation_duration})

    return True
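# run_validation pushes the finished error/warning reports to S3 by re-reading them in fixed-size chunks rather
# than loading the whole file into memory. The same pattern in isolation; CHUNK_SIZE and the destination URI are
# assumed values here, not the Broker's configured ones:
import smart_open

CHUNK_SIZE = 8 * 1024 * 1024  # 8 MB per write, an assumed value


def stream_local_file(local_path, destination_uri):
    """ Copy local_path to destination_uri (e.g. an s3:// URI) without reading it all at once. """
    with open(local_path, 'rb') as source, smart_open.smart_open(destination_uri, 'wb') as sink:
        while True:
            chunk = source.read(CHUNK_SIZE)
            if not chunk:
                break
            sink.write(chunk)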
def job_context(job_id, is_local=True):
    """ Common context for files D1, D2, E, and F generation. Handles marking the job finished and/or failed

        Args:
            job_id: the ID of the submission job
            is_local: a boolean indicating whether this is being run in a local environment or not

        Yields:
            The current DB session and the Job being generated
    """
    # Flask context ensures we have access to global.g
    with Flask(__name__).app_context():
        sess, job = retrieve_job_context_data(job_id)
        try:
            yield sess, job
            if not job.from_cached:
                # only mark completed jobs as done
                logger.info({'message': 'Marking job {} as finished'.format(job.job_id), 'job_id': job.job_id,
                             'message_type': 'ValidatorInfo'})
                mark_job_status(job.job_id, "finished")
        except Exception as e:
            # logger.exception() automatically adds traceback info
            logger.exception({'message': 'Marking job {} as failed'.format(job.job_id), 'job_id': job.job_id,
                              'message_type': 'ValidatorException', 'exception': str(e)})

            # mark job as failed
            job.error_message = str(e)
            mark_job_status(job.job_id, "failed")

            # ensure FileRequest from failed job is not cached
            file_request = sess.query(FileRequest).filter_by(job_id=job.job_id).one_or_none()
            if file_request and file_request.is_cached_file:
                file_request.is_cached_file = False
            sess.commit()
        finally:
            file_request = sess.query(FileRequest).filter_by(job_id=job.job_id).one_or_none()
            if file_request and file_request.is_cached_file:
                # copy job data to all child FileRequests
                child_requests = sess.query(FileRequest).filter_by(parent_job_id=job.job_id).all()
                if len(child_requests) > 0:
                    logger.info({'message': 'Copying file data from job {} to its children'.format(job.job_id),
                                 'message_type': 'ValidatorInfo', 'job_id': job.job_id})
                    for child in child_requests:
                        copy_parent_file_request_data(sess, child.job, job, is_local)
            GlobalDB.close()
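# job_context is a generator (it yields sess and job), so callers are expected to consume it through
# contextlib.contextmanager in a with-statement. A hedged usage sketch: generate_the_file is a hypothetical
# stand-in for the real generation call, and the decorator may already be applied where job_context is defined.
from contextlib import contextmanager


def generate_with_tracking(job_id, is_local=True):
    with contextmanager(job_context)(job_id, is_local=is_local) as (sess, job):
        generate_the_file(sess, job)  # any exception raised here marks the job as failed on exit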
def runValidation(self, job):
    """ Run validations for specified job

        Args:
            job: Job to be validated

        Returns:
            True if successful
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    error_list = ErrorInterface()

    _exception_logger.info('VALIDATOR_INFO: Beginning runValidation on job_id: %s', job_id)

    submission_id = job.submission_id

    rowNumber = 1
    fileType = job.file_type.name

    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == fileType][0]

    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()

    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)

    # Get bucket name and file name
    fileName = job.filename
    bucketName = CONFIG_BROKER['aws_bucket']
    regionName = CONFIG_BROKER['aws_region']

    errorFileName = self.getFileName(get_report_path(job, 'error'))
    warningFileName = self.getFileName(get_report_path(job, 'warning'))

    # Create File Status object
    createFileIfNeeded(job_id, fileName)

    reader = self.getReader()

    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        fileSize = s3UrlHandler.getFileSize(errorFileName)
    else:
        fileSize = os.path.getsize(fileName)
    job.file_size = fileSize
    sess.commit()

    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[fileType]).all()
    for field in fields:
        sess.expunge(field)

    csvSchema = {row.name_short: row for row in fields}

    try:
        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(regionName, bucketName, fileName, fields, bucketName, errorFileName,
                         self.long_to_short_dict)

        # list to keep track of rows that fail validations
        errorRows = []

        # While not done, pull one row and put it into staging table if it passes the Validator
        with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer, \
                self.getWriter(regionName, bucketName, warningFileName, self.reportHeaders) as warningWriter:
            while not reader.is_finished:
                rowNumber += 1

                if rowNumber % 10 == 0:
                    logger.info('loading row %s', rowNumber)

                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skipRow, doneReading, rowErrorHere, flex_cols) = \
                    self.readRecord(reader, writer, fileType, rowNumber, job, fields, error_list)
                if reduceRow:
                    rowNumber -= 1
                if rowErrorHere:
                    errorRows.append(rowNumber)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skipRow:
                    # Do not write this row to staging, but continue processing future rows
                    continue

                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if fileType in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passedValidations = True
                    valid = True
                else:
                    passedValidations, failures, valid = Validator.validate(record, csvSchema)
                if valid:
                    skipRow = self.writeToStaging(record, job, submission_id, passedValidations, writer,
                                                  rowNumber, model, error_list)
                    if flex_cols:
                        self.write_to_flex(flex_cols, job_id, submission_id, fileType)

                    if skipRow:
                        errorRows.append(rowNumber)
                        continue

                if not passedValidations:
                    if self.writeErrors(failures, job, self.short_to_long_dict, writer, warningWriter, rowNumber,
                                        error_list):
                        errorRows.append(rowNumber)

            _exception_logger.info('VALIDATOR_INFO: Loading complete on job_id: %s. '
                                   'Total rows added to staging: %s', job_id, rowNumber)

            if fileType in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sqlErrorRows = self.runSqlValidations(job, fileType, self.short_to_long_dict, writer, warningWriter,
                                                  rowNumber, error_list)
            errorRows.extend(sqlErrorRows)

            # Write unfinished batch
            writer.finishBatch()
            warningWriter.finishBatch()

        # Calculate total number of rows in file that passed validations
        errorRowsUnique = set(errorRows)
        totalRowsExcludingHeader = rowNumber - 1
        validRows = totalRowsExcludingHeader - len(errorRowsUnique)

        # Update job metadata
        job.number_of_rows = rowNumber
        job.number_of_rows_valid = validRows
        sess.commit()

        error_list.writeAllRowErrors(job_id)
        # Update error info for submission
        populateSubmissionErrorInfo(submission_id)

        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        markFileComplete(job_id, fileName)
    finally:
        # Ensure the file always closes
        reader.close()
        _exception_logger.info('VALIDATOR_INFO: Completed L1 and SQL rule validations on job_id: %s', job_id)
    return True
def generate_detached_file(file_type, cgac_code, frec_code, start_date, end_date, year, period, agency_type,
                           file_format):
    """ Start a file generation job for the specified file type not connected to a submission

        Args:
            file_type: type of file to be generated
            cgac_code: the code of a CGAC agency if generating for a CGAC agency
            frec_code: the code of a FREC agency if generating for a FREC agency
            start_date: start date in a string, formatted MM/DD/YYYY
            end_date: end date in a string, formatted MM/DD/YYYY
            year: year to generate for, integer 4 digits
            period: period to generate for, integer (2-12)
            agency_type: The type of agency (awarding or funding) to generate the file for
            file_format: determines if the file generated is a txt or a csv (only used for D file generation)

        Returns:
            JsonResponse object with keys job_id, status, file_type, url, message, start_date, and end_date.

        Raises:
            ResponseException: if the start_date and end_date Strings cannot be parsed into dates
    """
    # Make sure it's a valid request
    if not cgac_code and not frec_code:
        return JsonResponse.error(ValueError("Detached file generation requires CGAC or FR Entity Code"),
                                  StatusCode.CLIENT_ERROR)

    if file_type in ['D1', 'D2']:
        # Make sure we have a start and end date for D1/D2 generation
        if not start_date or not end_date:
            return JsonResponse.error(ValueError('Must have a start and end date for D file generation.'),
                                      StatusCode.CLIENT_ERROR)

        # Check if date format is MM/DD/YYYY
        if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
            raise ResponseException('Start or end date cannot be parsed into a date', StatusCode.CLIENT_ERROR)

        if agency_type not in ['awarding', 'funding']:
            return JsonResponse.error(ValueError('agency_type must be either awarding or funding.'),
                                      StatusCode.CLIENT_ERROR)

        if file_format not in ['csv', 'txt']:
            return JsonResponse.error(ValueError('file_format must be either csv or txt.'),
                                      StatusCode.CLIENT_ERROR)
    else:
        # Make sure both year and period are provided
        if not (year and period):
            return JsonResponse.error(ValueError("Must have a year and period for A file generation."),
                                      StatusCode.CLIENT_ERROR)
        try:
            # Convert to real start and end dates
            start_date, end_date = generic_helper.year_period_to_dates(year, period)
        except ResponseException as e:
            return JsonResponse.error(e, StatusCode.CLIENT_ERROR)

    # Add job info
    file_type_name = lookups.FILE_TYPE_DICT_LETTER_NAME[file_type]
    new_job = generation_helper.create_generation_job(file_type_name, start_date, end_date)

    agency_code = frec_code if frec_code else cgac_code
    logger.info({'message': 'Starting detached {} file generation'.format(file_type), 'message_type': 'BrokerInfo',
                 'job_id': new_job.job_id, 'file_type': file_type, 'agency_code': agency_code,
                 'start_date': start_date, 'end_date': end_date})

    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(new_job, start_date, end_date, agency_type,
                                                 agency_code=agency_code, file_format=file_format)
        else:
            generation_helper.start_a_generation(new_job, start_date, end_date, agency_code)
    except Exception as e:
        mark_job_status(new_job.job_id, 'failed')
        new_job.error_message = str(e)
        GlobalDB.db().session.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as the check generation route
    return check_detached_generation(new_job.job_id)
def validator_process_file_generation(file_gen_id, is_retry=False):
    """ Retrieves a FileGeneration object based on its ID, and kicks off a file generation. Handles errors by
        ensuring the FileGeneration (if it exists) is no longer cached.

        Args:
            file_gen_id: ID of a FileGeneration object
            is_retry: If this is not the very first time handling execution of this job. If True, cleanup is
                performed before proceeding to retry the job

        Raises:
            Any Exceptions raised by the FileGenerationManager
    """
    if is_retry:
        if cleanup_generation(file_gen_id):
            log_job_message(logger=logger,
                            message="Attempting a retry of {} after successful retry-cleanup.".format(
                                inspect.stack()[0][3]),
                            job_id=file_gen_id, is_debug=True)
        else:
            log_job_message(logger=logger,
                            message="Retry of {} found to be not necessary after cleanup. "
                                    "Returning from job with success.".format(inspect.stack()[0][3]),
                            job_id=file_gen_id, is_debug=True)
            return

    sess = GlobalDB.db().session
    file_generation = None

    try:
        file_generation = sess.query(FileGeneration).filter_by(file_generation_id=file_gen_id).one_or_none()
        if file_generation is None:
            raise ResponseException('FileGeneration ID {} not found in database'.format(file_gen_id),
                                    StatusCode.CLIENT_ERROR, None)

        file_generation_manager = FileGenerationManager(sess, g.is_local, file_generation=file_generation)
        file_generation_manager.generate_file()

    except Exception as e:
        # Log uncaught exceptions and fail all Jobs referencing this FileGeneration
        error_data = {'message': 'An unhandled exception occurred in the Validator during file generation',
                      'message_type': 'ValidatorInfo', 'file_generation_id': file_gen_id,
                      'traceback': traceback.format_exc()}
        if file_generation:
            error_data.update({'agency_code': file_generation.agency_code,
                               'agency_type': file_generation.agency_type,
                               'start_date': file_generation.start_date, 'end_date': file_generation.end_date,
                               'file_type': file_generation.file_type, 'file_path': file_generation.file_path})
        logger.error(error_data)

        # Try to mark the Jobs as failed, but continue raising the original Exception if not possible
        try:
            if file_generation:
                # Uncache the FileGeneration
                sess.refresh(file_generation)
                file_generation.is_cached_file = False

                # Mark all Jobs waiting on this FileGeneration as failed
                generation_jobs = sess.query(Job).filter_by(file_generation_id=file_gen_id).all()
                for job in generation_jobs:
                    if job.job_status in [JOB_STATUS_DICT['waiting'], JOB_STATUS_DICT['ready'],
                                          JOB_STATUS_DICT['running']]:
                        mark_job_status(job.job_id, 'failed')
                        sess.refresh(job)
                        job.file_generation_id = None
                        job.error_message = str(e)
                sess.commit()
        except Exception:
            pass

        # ResponseExceptions only occur at very specific times, and should not affect the Validator's future
        # attempts at handling messages from SQS
        if not isinstance(e, ResponseException):
            raise e
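# validator_process_file_generation is designed to be re-entrant: a queue consumer can call it again with
# is_retry=True so that partial state from the failed attempt is cleaned up first. A minimal dispatch sketch
# under that assumption; the surrounding SQS plumbing and the retry policy below are illustrative only:
def handle_generation_message(file_gen_id, max_attempts=2):
    for attempt in range(max_attempts):
        try:
            validator_process_file_generation(file_gen_id, is_retry=(attempt > 0))
            return
        except Exception:
            if attempt == max_attempts - 1:
                raise  # give up and let the message return to the queue or dead-letter queue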
def run_validation(self, job):
    """ Run validations for specified job

        Args:
            job: Job to be validated

        Returns:
            True if successful
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    error_list = ErrorInterface()
    submission_id = job.submission_id

    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()

    logger.info({'message': 'Beginning run_validation on submission_id: ' + str(submission_id) + ', job_id: ' +
                            str(job_id) + ', file_type: ' + file_type,
                 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id,
                 'file_type': file_type, 'action': 'run_validations', 'status': 'start',
                 'start_time': validation_start})

    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]

    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()

    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()

    # If local, make the error report directory
    if self.isLocal and not os.path.exists(self.directory):
        os.makedirs(self.directory)

    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']

    error_file_name = self.get_file_name(report_file_name(job.submission_id, False, job.file_type.name))
    warning_file_name = self.get_file_name(report_file_name(job.submission_id, True, job.file_type.name))

    # Create File Status object
    create_file_if_needed(job_id, file_name)

    reader = self.get_reader()

    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()

    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()
    for field in fields:
        sess.expunge(field)

    csv_schema = {row.name_short: row for row in fields}

    try:
        extension = os.path.splitext(file_name)[1]
        if not extension or extension not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)

        # Count file rows: throws a File Level Error for non-UTF8 characters
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass

        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields, bucket_name, error_file_name,
                         self.long_to_short_dict, is_local=self.isLocal)

        # list to keep track of rows that fail validations
        error_rows = []

        # While not done, pull one row and put it into staging table if it passes the Validator
        loading_start = datetime.now()
        logger.info({'message': 'Beginning data loading on submission_id: ' + str(submission_id) + ', job_id: ' +
                                str(job_id) + ', file_type: ' + file_type,
                     'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id,
                     'file_type': file_type, 'action': 'data_loading', 'status': 'start',
                     'start_time': loading_start})

        with self.get_writer(region_name, bucket_name, error_file_name, self.reportHeaders) as writer, \
                self.get_writer(region_name, bucket_name, warning_file_name, self.reportHeaders) \
                as warning_writer:
            while not reader.is_finished:
                row_number += 1

                if row_number % 100 == 0:
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({'message': 'Loading row: ' + str(row_number) + ' on submission_id: ' +
                                            str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' +
                                            file_type,
                                 'message_type': 'ValidatorInfo', 'submission_id': submission_id,
                                 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading',
                                 'status': 'loading', 'rows_loaded': row_number, 'start_time': loading_start,
                                 'elapsed_time': elapsed_time})

                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, writer, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue

                #
                # second phase of validations: do basic schema checks
                # (e.g., require fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type in ["detached_award"]:
                        record['afa_generated_unique'] = \
                            (record['award_modification_amendme'] or '-none-') + \
                            (record['awarding_sub_tier_agency_c'] or '-none-') + \
                            (record['fain'] or '-none-') + (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(record, csv_schema,
                                                                             file_type in ["detached_award"])
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type in ["detached_award"]:
                        record["is_valid"] = True

                    model_instance = model(job_id=job_id, submission_id=submission_id,
                                           valid_record=passed_validations, **record)
                    skip_row = not insert_staging_model(model_instance, job, writer, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()

                    if skip_row:
                        error_rows.append(row_number)
                        continue

                if not passed_validations:
                    fatal = write_errors(failures, job, self.short_to_long_dict, writer, warning_writer,
                                         row_number, error_list)
                    if fatal:
                        error_rows.append(row_number)

            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({'message': 'Completed data loading on submission_id: ' + str(submission_id) +
                                    ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                         'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id,
                         'file_type': file_type, 'action': 'data_loading', 'status': 'finish',
                         'start_time': loading_start, 'end_time': datetime.now(),
                         'duration': loading_duration, 'total_rows': row_number})

            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict, writer,
                                                      warning_writer, row_number, error_list)
            error_rows.extend(sql_error_rows)

            # Write unfinished batch
            writer.finish_batch()
            warning_writer.finish_batch()

        # Calculate total number of rows in file that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)

        # Update detached_award is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type in ["detached_award"]:
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                       synchronize_session=False)

        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)

        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()

        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)

        if file_type in ["detached_award"]:
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)

        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the file always closes
        reader.close()

        validation_duration = (datetime.now() - validation_start).total_seconds()
        logger.info({'message': 'Completed run_validation on submission_id: ' + str(submission_id) +
                                ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                     'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id,
                     'file_type': file_type, 'action': 'run_validation', 'status': 'finish',
                     'start_time': validation_start, 'end_time': datetime.now(),
                     'duration': validation_duration})

    return True
def generate_file(self, agency_code=None):
    """ Generates a file based on the FileGeneration object and updates any Jobs referencing it """
    fillin_vals = {'timestamp': get_timestamp()}
    if self.file_generation:
        fillin_vals.update({
            'start': self.file_generation.start_date.strftime('%Y%m%d'),
            'end': self.file_generation.end_date.strftime('%Y%m%d'),
            'agency_type': self.file_generation.agency_type,
            'ext': '.{}'.format(self.file_generation.file_format),
        })
    if self.job and self.job.submission:
        # Submission Files
        fillin_vals.update({
            'submission_id': self.job.submission_id,
            'FYP': filename_fyp_sub_format(self.job.submission),
        })
        file_name = SUBMISSION_FILENAMES[self.file_type].format(**fillin_vals)
    else:
        # Detached Files
        if self.job and self.job.file_type.letter_name == 'A':
            period_date = self.job.end_date + relativedelta(months=3)
            fillin_vals['FYP'] = filename_fyp_format(period_date.year, period_date.month, False)
        file_name = DETACHED_FILENAMES[self.file_type].format(**fillin_vals)
    if self.is_local:
        file_path = "".join([CONFIG_BROKER['broker_files'], file_name])
    else:
        file_path = "".join(["None/", file_name])

    # Generate the file and upload to S3
    log_data = {'message': 'Finished file {} generation'.format(self.file_type), 'message_type': 'ValidatorInfo',
                'file_type': self.file_type, 'file_path': file_path}
    if self.file_generation:
        self.generate_d_file(file_path)
        log_data.update({'agency_code': self.file_generation.agency_code,
                         'agency_type': self.file_generation.agency_type,
                         'start_date': self.file_generation.start_date,
                         'end_date': self.file_generation.end_date,
                         'file_generation_id': self.file_generation.file_generation_id})
    elif self.job.file_type.letter_name in ['A', 'E', 'F']:
        log_data['job_id'] = self.job.job_id
        mark_job_status(self.job.job_id, 'running')

        if self.job.file_type.letter_name == 'A':
            if not agency_code:
                raise ResponseException('Agency code not provided for an A file generation')
            self.generate_a_file(agency_code, file_path)
        else:
            # Call self.generate_%s_file() where %s is e or f based on the Job's file_type
            file_type_lower = self.job.file_type.letter_name.lower()
            getattr(self, 'generate_%s_file' % file_type_lower)()

        mark_job_status(self.job.job_id, 'finished')
    else:
        e = 'No FileGeneration object for D file generation.' if self.file_type in ['D1', 'D2'] else \
            'Cannot generate file for {} file type.'.format(self.file_type if self.file_type else 'empty')
        raise ResponseException(e)

    logger.info(log_data)
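# The output file name above comes from formatting a per-file-type template with the fill-in values assembled in
# fillin_vals. A small standalone illustration of that pattern; the template string and the sample values below
# are made up for demonstration, not the real SUBMISSION_FILENAMES / DETACHED_FILENAMES entries:
from datetime import datetime

example_template = 'SubID-{submission_id}_File-D1_{FYP}_{start}_{end}_{agency_type}_{timestamp}{ext}'
example_fillins = {'timestamp': datetime.now().strftime('%Y%m%d%H%M%S'), 'start': '20230101', 'end': '20230331',
                   'agency_type': 'awarding', 'ext': '.csv', 'submission_id': 1234, 'FYP': 'FY23P06'}
print(example_template.format(**example_fillins))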
def run_validation(self, job):
    """ Run validations for specified job

        Args:
            job: Job to be validated

        Returns:
            True if successful
    """
    sess = GlobalDB.db().session

    self.job = job
    self.submission_id = job.submission_id
    self.file_type = job.file_type
    self.file_name = job.filename
    self.is_fabs = (self.file_type.name == 'fabs')

    # initializing processing metadata vars for a new validation
    self.reader = CsvReader()
    self.error_list = ErrorInterface()
    self.error_rows = []
    self.max_row_number = 1
    self.total_rows = 0
    self.short_rows = []
    self.long_rows = []

    validation_start = datetime.now()
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']

    self.log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(
        str(self.submission_id), str(self.job.job_id), self.file_type.name)
    logger.info({'message': 'Beginning run_validation {}'.format(self.log_str), 'message_type': 'ValidatorInfo',
                 'submission_id': self.submission_id, 'job_id': self.job.job_id,
                 'file_type': self.file_type.name, 'action': 'run_validations', 'status': 'start',
                 'start_time': validation_start})

    # Get orm model for this file
    self.model = [ft.model for ft in FILE_TYPE if ft.name == self.file_type.name][0]

    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == self.job.job_id).delete()
    sess.commit()

    # Clear existing records for this submission
    sess.query(self.model).filter_by(submission_id=self.submission_id).delete()
    sess.commit()

    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=self.job.job_id).delete()
    sess.commit()

    # If local, make the error report directory
    if self.is_local and not os.path.exists(self.directory):
        os.makedirs(self.directory)

    create_file_if_needed(self.job.job_id, self.file_name)

    # Get file size and write to jobs table
    if CONFIG_BROKER['use_aws']:
        file_size = S3Handler.get_file_size(self.file_name)
    else:
        file_size = os.path.getsize(self.file_name)
    self.job.file_size = file_size
    sess.commit()

    # Get fields for this file
    self.fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[self.file_type.name])\
        .order_by(FileColumn.daims_name.asc()).all()
    self.expected_headers, self.parsed_fields = parse_fields(sess, self.fields)
    self.csv_schema = {row.name_short: row for row in self.fields}

    try:
        # Loading data and initial validations
        self.load_file_data(sess, bucket_name, region_name)

        if self.file_type.name in ('appropriations', 'program_activity', 'award_financial'):
            update_tas_ids(self.model, self.submission_id)

        # SQL Validations
        with open(self.error_file_path, 'a', newline='') as error_file, \
                open(self.warning_file_path, 'a', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

            # third phase of validations: run validation rules as specified in the schema guidance. These
            # validations are sql-based.
            sql_error_rows = self.run_sql_validations(self.short_to_long_dict[self.file_type.file_type_id],
                                                      error_csv, warning_csv)
            self.error_rows.extend(sql_error_rows)
            error_file.close()
            warning_file.close()

        # stream file to S3 when not local
        if not self.is_local:
            s3_resource = boto3.resource('s3', region_name=region_name)

            # stream error file
            with open(self.error_file_path, 'rb') as csv_file:
                s3_resource.Object(bucket_name, self.get_file_name(self.error_file_name)).put(Body=csv_file)
            csv_file.close()
            os.remove(self.error_file_path)

            # stream warning file
            with open(self.warning_file_path, 'rb') as warning_csv_file:
                s3_resource.Object(bucket_name,
                                   self.get_file_name(self.warning_file_name)).put(Body=warning_csv_file)
            warning_csv_file.close()
            os.remove(self.warning_file_path)

        # Calculate total number of rows in file that passed validations
        error_rows_unique = set(self.error_rows)
        total_rows_excluding_header = self.total_rows - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)

        # Update fabs is_valid rows where applicable
        # Update submission to include action dates where applicable
        if self.is_fabs:
            sess.query(DetachedAwardFinancialAssistance). \
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == self.submission_id). \
                update({'is_valid': False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(self.submission_id)
            sess.query(Submission).filter(Submission.submission_id == self.submission_id). \
                update({'reporting_start_date': min_action_date, 'reporting_end_date': max_action_date},
                       synchronize_session=False)

        # Update job metadata
        self.job.number_of_rows = self.total_rows
        self.job.number_of_rows_valid = valid_rows
        sess.commit()

        self.error_list.write_all_row_errors(self.job.job_id)
        # Update error info for submission
        populate_job_error_info(self.job)

        if self.is_fabs:
            # set number of errors and warnings for detached submission
            populate_submission_error_info(self.submission_id)

        # Mark validation as finished in job tracker
        mark_job_status(self.job.job_id, 'finished')
        mark_file_complete(self.job.job_id, self.file_name)
    except Exception:
        logger.error({'message': 'An exception occurred during validation', 'message_type': 'ValidatorInfo',
                      'submission_id': self.submission_id, 'job_id': self.job.job_id,
                      'file_type': self.file_type.name, 'traceback': traceback.format_exc()})
        raise
    finally:
        # Ensure the files always close
        self.reader.close()

        validation_duration = (datetime.now() - validation_start).total_seconds()
        logger.info({'message': 'Completed run_validation {}'.format(self.log_str),
                     'message_type': 'ValidatorInfo', 'submission_id': self.submission_id,
                     'job_id': self.job.job_id, 'file_type': self.file_type.name, 'action': 'run_validation',
                     'status': 'finish', 'start_time': validation_start, 'end_time': datetime.now(),
                     'duration': validation_duration})

    return True