def test_simple_file_scan():
    # Note: only testing locally
    assert validation_helper.simple_file_scan(CsvReader(), None, None, READ_ERROR) == \
        (11, [5], [2, 3, 7], [], [])
    assert validation_helper.simple_file_scan(CsvReader(), None, None, BLANK_C) == \
        (5, [], [], [3], [4])
def get_delete_file():
    """ Retrieve the IDV deletes file: an open file object when pulling from S3, or a local
        path otherwise. Either form can be handed to pandas for reading.
    """
    file_name = 'IDV_Deletes.csv'
    if CONFIG_BROKER["use_aws"]:
        reader = CsvReader()
        pa_file = open(reader.get_filename(CONFIG_BROKER['aws_region'], CONFIG_BROKER['sf_133_bucket'],
                                           file_name), encoding='utf-8')
    else:
        base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config")
        pa_file = os.path.join(base_path, file_name)
    return pa_file
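# A minimal usage sketch, not part of the module: pd.read_csv accepts both an open file
# object (the S3 branch above) and a plain path string (the local branch), so either
# return value of get_delete_file() can be consumed the same way. dtype=str is
# illustrative, keeping codes like agency IDs from being coerced to numbers.
import pandas as pd

delete_df = pd.read_csv(get_delete_file(), dtype=str)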
def run_validation(self, job):
    """ Run validations for specified job

        Args:
            job: Job to be validated

        Returns:
            True if successful
    """
    sess = GlobalDB.db().session
    error_list = ErrorInterface()
    job_id = job.job_id
    submission_id = job.submission_id
    row_number = 1
    file_type = job.file_type.name
    validation_start = datetime.now()

    log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(
        str(submission_id), str(job_id), file_type)
    logger.info({
        'message': 'Beginning run_validation {}'.format(log_str),
        'message_type': 'ValidatorInfo',
        'submission_id': submission_id,
        'job_id': job_id,
        'file_type': file_type,
        'action': 'run_validations',
        'status': 'start',
        'start_time': validation_start
    })

    # Get orm model for this file
    model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]

    # Delete existing file level errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # Clear existing records for this submission
    sess.query(model).filter_by(submission_id=submission_id).delete()
    sess.commit()

    # Clear existing flex fields for this job
    sess.query(FlexField).filter_by(job_id=job_id).delete()
    sess.commit()

    # If local, make the error report directory
    if self.is_local and not os.path.exists(self.directory):
        os.makedirs(self.directory)

    # Get bucket name and file name
    file_name = job.filename
    bucket_name = CONFIG_BROKER['aws_bucket']
    region_name = CONFIG_BROKER['aws_region']

    error_file_name = report_file_name(job.submission_id, False, job.file_type.name)
    error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
    warning_file_name = report_file_name(job.submission_id, True, job.file_type.name)
    warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])

    # Create File Status object
    create_file_if_needed(job_id, file_name)

    reader = CsvReader()

    # Get file size and write to jobs table
    if CONFIG_BROKER["use_aws"]:
        file_size = S3Handler.get_file_size(file_name)
    else:
        file_size = os.path.getsize(file_name)
    job.file_size = file_size
    sess.commit()

    # Get fields for this file
    fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()
    for field in fields:
        sess.expunge(field)

    csv_schema = {row.name_short: row for row in fields}

    try:
        extension = os.path.splitext(file_name)[1]
        if not extension or extension.lower() not in ['.csv', '.txt']:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)

        # Count file rows: throws a File Level Error for non-UTF8 characters
        temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8')
        file_row_count = len(list(csv.reader(temp_file)))
        try:
            temp_file.close()
        except AttributeError:
            # File does not exist, and so does not need to be closed
            pass

        # Pull file and return info on whether it's using short or long col headers
        reader.open_file(region_name, bucket_name, file_name, fields, bucket_name,
                         self.get_file_name(error_file_name),
                         self.long_to_short_dict[job.file_type_id], is_local=self.is_local)

        # list to keep track of rows that fail validations
        error_rows = []

        # While not done, pull one row and put it into staging table if it passes the Validator
        loading_start = datetime.now()
        logger.info({
            'message': 'Beginning data loading {}'.format(log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'data_loading',
            'status': 'start',
            'start_time': loading_start
        })

        with open(error_file_path, 'w', newline='') as error_file, \
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

            required_list = None
            type_list = None
            if file_type == "fabs":
                # create a list of all required/type labels for FABS
                labels = sess.query(ValidationLabel).all()
                required_list = {}
                type_list = {}
                for label in labels:
                    if label.label_type == "requirement":
                        required_list[label.column_name] = label.label
                    else:
                        type_list[label.column_name] = label.label

            # write headers to file
            error_csv.writerow(self.reportHeaders)
            warning_csv.writerow(self.reportHeaders)
            while not reader.is_finished:
                row_number += 1

                if row_number % 100 == 0:
                    elapsed_time = (datetime.now() - loading_start).total_seconds()
                    logger.info({
                        'message': 'Loading row: {} {}'.format(str(row_number), log_str),
                        'message_type': 'ValidatorInfo',
                        'submission_id': submission_id,
                        'job_id': job_id,
                        'file_type': file_type,
                        'action': 'data_loading',
                        'status': 'loading',
                        'rows_loaded': row_number,
                        'start_time': loading_start,
                        'elapsed_time': elapsed_time
                    })

                #
                # first phase of validations: read record and record a
                # formatting error if there's a problem
                #
                (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \
                    self.read_record(reader, error_csv, row_number, job, fields, error_list)
                if reduceRow:
                    row_number -= 1
                if rowErrorHere:
                    error_rows.append(row_number)
                if doneReading:
                    # Stop reading from input file
                    break
                elif skip_row:
                    # Do not write this row to staging, but continue processing future rows
                    continue

                #
                # second phase of validations: do basic schema checks
                # (e.g., required fields, field length, data type)
                #
                # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                # validations, so these validations are not repeated here
                if file_type in ["award", "award_procurement"]:
                    # Skip basic validations for D files, set as valid to trigger write to staging
                    passed_validations = True
                    valid = True
                else:
                    if file_type == "fabs":
                        record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + \
                            "_" + (record['awarding_sub_tier_agency_c'] or '-none-') + \
                            "_" + (record['fain'] or '-none-') + "_" + (record['uri'] or '-none-')
                    passed_validations, failures, valid = Validator.validate(record, csv_schema,
                                                                             file_type == "fabs",
                                                                             required_list, type_list)
                if valid:
                    # todo: update this logic later when we have actual validations
                    if file_type == "fabs":
                        record["is_valid"] = True

                    model_instance = model(job_id=job_id, submission_id=submission_id,
                                           valid_record=passed_validations, **record)
                    skip_row = not insert_staging_model(model_instance, job, error_csv, error_list)
                    if flex_cols:
                        sess.add_all(flex_cols)
                        sess.commit()

                    if skip_row:
                        error_rows.append(row_number)
                        continue

                if not passed_validations:
                    fatal = write_errors(failures, job, self.short_to_long_dict[job.file_type_id], error_csv,
                                         warning_csv, row_number, error_list, flex_cols)
                    if fatal:
                        error_rows.append(row_number)

            loading_duration = (datetime.now() - loading_start).total_seconds()
            logger.info({
                'message': 'Completed data loading {}'.format(log_str),
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'data_loading',
                'status': 'finish',
                'start_time': loading_start,
                'end_time': datetime.now(),
                'duration': loading_duration,
                'total_rows': row_number
            })

            if file_type in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(model, submission_id)
            #
            # third phase of validations: run validation rules as specified
            # in the schema guidance. these validations are sql-based.
            #
            sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict[job.file_type_id],
                                                      error_csv, warning_csv, row_number, error_list)
            error_rows.extend(sql_error_rows)
            error_file.close()
            warning_file.close()

        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)), 'w') \
                        as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if chunk:
                            writer.write(chunk)
                        else:
                            break
            csv_file.close()
            os.remove(error_file_path)

            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)), 'w') \
                        as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if chunk:
                            warning_writer.write(chunk)
                        else:
                            break
            warning_csv_file.close()
            os.remove(warning_file_path)

        # Calculate total number of rows in file that passed validations
        error_rows_unique = set(error_rows)
        total_rows_excluding_header = row_number - 1
        valid_rows = total_rows_excluding_header - len(error_rows_unique)

        # Update fabs is_valid rows where applicable
        # Update submission to include action dates where applicable
        if file_type == "fabs":
            sess.query(DetachedAwardFinancialAssistance).\
                filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                       DetachedAwardFinancialAssistance.submission_id == submission_id).\
                update({"is_valid": False}, synchronize_session=False)
            sess.commit()
            min_action_date, max_action_date = get_action_dates(submission_id)
            sess.query(Submission).filter(Submission.submission_id == submission_id).\
                update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                       synchronize_session=False)

        # Ensure validated rows match initial row count
        if file_row_count != row_number:
            raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)

        # Update job metadata
        job.number_of_rows = row_number
        job.number_of_rows_valid = valid_rows
        sess.commit()

        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)

        if file_type == "fabs":
            # set number of errors and warnings for detached submission
            populate_submission_error_info(submission_id)

        # Mark validation as finished in job tracker
        mark_job_status(job_id, "finished")
        mark_file_complete(job_id, file_name)
    finally:
        # Ensure the files always close
        reader.close()

        validation_duration = (datetime.now() - validation_start).total_seconds()
        logger.info({
            'message': 'Completed run_validation {}'.format(log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'run_validation',
            'status': 'finish',
            'start_time': validation_start,
            'end_time': datetime.now(),
            'duration': validation_duration
        })

    return True
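# A sketch (an assumption, not broker code) of the chunked S3 streaming pattern used
# twice above, factored into one helper. smart_open, S3Handler, CHUNK_SIZE, and os are
# assumed to be available exactly as in run_validation; the helper name is hypothetical.
def stream_report_to_s3(local_path, s3_key):
    """ Copy a local report file to S3 in CHUNK_SIZE pieces, then remove the local copy. """
    with open(local_path, 'rb') as source, \
            smart_open.smart_open(S3Handler.create_file_path(s3_key), 'w') as writer:
        while True:
            chunk = source.read(CHUNK_SIZE)
            if not chunk:
                # An empty read means end of file
                break
            writer.write(chunk)
    os.remove(local_path)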
def get_reader(self):
    """ Get the file reader. Currently always returns a CsvReader, whether or not this is a
        local install.
    """
    return CsvReader()
def test_simple_file_scan():
    # Note: only testing locally
    assert validation_helper.simple_file_scan(CsvReader(), None, None, READ_ERROR) == (11, [3, 6], [])
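# Illustrative unpacking (an assumption based on the assertion above, not a documented
# contract): the newer simple_file_scan appears to return a 3-tuple of the total row
# count plus the row numbers that are too short or too long for pandas to load cleanly:
#
#     file_row_count, short_rows, long_rows = validation_helper.simple_file_scan(
#         CsvReader(), bucket_name, region_name, file_name)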
class ValidationManager:
    """ Outer level class, called by the Flask route """
    report_headers = ['Unique ID', 'Field Name', 'Error Message', 'Value Provided', 'Expected Value', 'Difference',
                      'Flex Field', 'Row Number', 'Rule Label']
    cross_file_report_headers = ['Unique ID', 'Source File', 'Source Field Name', 'Target File',
                                 'Target Field Name', 'Error Message', 'Source Value Provided',
                                 'Target Value Provided', 'Difference', 'Source Flex Field', 'Source Row Number',
                                 'Rule Label']

    def __init__(self, is_local=True, directory=''):
        # Initialize instance variables
        self.is_local = is_local
        self.directory = directory
        self.log_str = ''

        # Validation Info
        self.job = None
        self.submission_id = None
        self.file_type = None
        self.file_name = None
        self.is_fabs = False
        self.model = None
        self.error_file_name = None
        self.error_file_path = None
        self.warning_file_name = None
        self.warning_file_path = None

        # Schema info
        self.csv_schema = {}
        self.fields = {}
        self.parsed_fields = {}
        self.expected_headers = []
        self.long_to_short_dict = {}
        self.short_to_long_dict = {}
        self.daims_to_short_dict = {}
        self.short_to_daims_dict = {}

        # create long-to-short (and vice-versa) column name mappings
        sess = GlobalDB.db().session
        colnames = sess.query(FileColumn.daims_name, FileColumn.name, FileColumn.name_short,
                              FileColumn.file_id).all()

        # fill in long_to_short and short_to_long dicts
        for col in colnames:
            # Get long_to_short_dict filled in
            if not self.long_to_short_dict.get(col.file_id):
                self.long_to_short_dict[col.file_id] = {}
            self.long_to_short_dict[col.file_id][col.name] = col.name_short

            # Get short_to_long_dict filled in
            if not self.short_to_long_dict.get(col.file_id):
                self.short_to_long_dict[col.file_id] = {}
            self.short_to_long_dict[col.file_id][col.name_short] = col.name

            # Get daims_to_short_dict filled in
            if not self.daims_to_short_dict.get(col.file_id):
                self.daims_to_short_dict[col.file_id] = {}
            clean_daims = StringCleaner.clean_string(col.daims_name, remove_extras=False)
            self.daims_to_short_dict[col.file_id][clean_daims] = col.name_short

            # Get short_to_daims_dict filled in
            if not self.short_to_daims_dict.get(col.file_id):
                self.short_to_daims_dict[col.file_id] = {}
            self.short_to_daims_dict[col.file_id][col.name_short] = col.daims_name

    def get_file_name(self, path):
        """ Return full path of error report based on provided name

            Args:
                path: the name of the file

            Returns:
                the full path of the file
        """
        if self.is_local:
            return os.path.join(self.directory, path)
        # Forcing forward slash here instead of using os.path to write a valid path for S3
        return ''.join(['errors/', path])

    def run_validation(self, job):
        """ Run validations for specified job

            Args:
                job: Job to be validated

            Returns:
                True if successful
        """
        sess = GlobalDB.db().session
        self.job = job
        self.submission_id = job.submission_id
        self.file_type = job.file_type
        self.file_name = job.filename
        self.is_fabs = (self.file_type.name == 'fabs')

        # initializing processing metadata vars for a new validation
        self.reader = CsvReader()
        self.error_list = ErrorInterface()
        self.error_rows = []
        self.max_row_number = 1
        self.total_rows = 0
        self.short_rows = []
        self.long_rows = []

        validation_start = datetime.now()
        bucket_name = CONFIG_BROKER['aws_bucket']
        region_name = CONFIG_BROKER['aws_region']

        self.log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(
            str(self.submission_id), str(self.job.job_id), self.file_type.name)

        logger.info({
            'message': 'Beginning run_validation {}'.format(self.log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': self.submission_id,
            'job_id': self.job.job_id,
            'file_type': self.file_type.name,
            'action': 'run_validations',
            'status': 'start',
            'start_time': validation_start
        })

        # Get orm model for this file
        self.model = [ft.model for ft in FILE_TYPE if ft.name == self.file_type.name][0]

        # Delete existing file level errors for this submission
        sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == self.job.job_id).delete()
        sess.commit()

        # Clear existing records for this submission
        sess.query(self.model).filter_by(submission_id=self.submission_id).delete()
        sess.commit()

        # Clear existing flex fields for this job
        sess.query(FlexField).filter_by(job_id=self.job.job_id).delete()
        sess.commit()

        # If local, make the error report directory
        if self.is_local and not os.path.exists(self.directory):
            os.makedirs(self.directory)

        create_file_if_needed(self.job.job_id, self.file_name)

        # Get file size and write to jobs table
        if CONFIG_BROKER['use_aws']:
            file_size = S3Handler.get_file_size(self.file_name)
        else:
            file_size = os.path.getsize(self.file_name)
        self.job.file_size = file_size
        sess.commit()

        # Get fields for this file
        self.fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[self.file_type.name])\
            .order_by(FileColumn.daims_name.asc()).all()
        self.expected_headers, self.parsed_fields = parse_fields(sess, self.fields)
        self.csv_schema = {row.name_short: row for row in self.fields}

        try:
            # Loading data and initial validations
            self.load_file_data(sess, bucket_name, region_name)

            if self.file_type.name in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(self.model, self.submission_id)

            # SQL Validations
            with open(self.error_file_path, 'a', newline='') as error_file, \
                    open(self.warning_file_path, 'a', newline='') as warning_file:
                error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
                warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL,
                                         lineterminator='\n')

                # third phase of validations: run validation rules as specified in the schema guidance. These
                # validations are sql-based.
                sql_error_rows = self.run_sql_validations(self.short_to_long_dict[self.file_type.file_type_id],
                                                          error_csv, warning_csv)
                self.error_rows.extend(sql_error_rows)
                error_file.close()
                warning_file.close()

            # stream file to S3 when not local
            if not self.is_local:
                s3_resource = boto3.resource('s3', region_name=region_name)
                # stream error file
                with open(self.error_file_path, 'rb') as csv_file:
                    s3_resource.Object(bucket_name, self.get_file_name(self.error_file_name)).put(Body=csv_file)
                csv_file.close()
                os.remove(self.error_file_path)

                # stream warning file
                with open(self.warning_file_path, 'rb') as warning_csv_file:
                    s3_resource.Object(bucket_name,
                                       self.get_file_name(self.warning_file_name)).put(Body=warning_csv_file)
                warning_csv_file.close()
                os.remove(self.warning_file_path)

            # Calculate total number of rows in file that passed validations
            error_rows_unique = set(self.error_rows)
            total_rows_excluding_header = self.total_rows - 1
            valid_rows = total_rows_excluding_header - len(error_rows_unique)

            # Update fabs is_valid rows where applicable
            # Update submission to include action dates where applicable
            if self.is_fabs:
                sess.query(DetachedAwardFinancialAssistance).\
                    filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                           DetachedAwardFinancialAssistance.submission_id == self.submission_id).\
                    update({'is_valid': False}, synchronize_session=False)
                sess.commit()
                min_action_date, max_action_date = get_action_dates(self.submission_id)
                sess.query(Submission).filter(Submission.submission_id == self.submission_id).\
                    update({'reporting_start_date': min_action_date, 'reporting_end_date': max_action_date},
                           synchronize_session=False)

            # Update job metadata
            self.job.number_of_rows = self.total_rows
            self.job.number_of_rows_valid = valid_rows
            sess.commit()

            self.error_list.write_all_row_errors(self.job.job_id)
            # Update error info for submission
            populate_job_error_info(self.job)

            if self.is_fabs:
                # set number of errors and warnings for detached submission
                populate_submission_error_info(self.submission_id)

            # Mark validation as finished in job tracker
            mark_job_status(self.job.job_id, 'finished')
            mark_file_complete(self.job.job_id, self.file_name)
        except Exception:
            logger.error({
                'message': 'An exception occurred during validation',
                'message_type': 'ValidatorInfo',
                'submission_id': self.submission_id,
                'job_id': self.job.job_id,
                'file_type': self.file_type.name,
                'traceback': traceback.format_exc()
            })
            raise
        finally:
            # Ensure the files always close
            self.reader.close()

            validation_duration = (datetime.now() - validation_start).total_seconds()
            logger.info({
                'message': 'Completed run_validation {}'.format(self.log_str),
                'message_type': 'ValidatorInfo',
                'submission_id': self.submission_id,
                'job_id': self.job.job_id,
                'file_type': self.file_type.name,
                'action': 'run_validation',
                'status': 'finish',
                'start_time': validation_start,
                'end_time': datetime.now(),
                'duration': validation_duration
            })

        return True

    def load_file_data(self, sess, bucket_name, region_name):
        """ Loads in the file data and performs initial validations

            Args:
                sess: the database connection
                bucket_name: the bucket to pull the file
                region_name: the region to pull the file
        """
        loading_start = datetime.now()
        logger.info({
            'message': 'Beginning data loading {}'.format(self.log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': self.submission_id,
            'job_id': self.job.job_id,
            'file_type': self.file_type.name,
            'action': 'data_loading',
            'status': 'start',
            'start_time': loading_start
        })

        # Extension Check
        extension = os.path.splitext(self.file_name)[1]
        if not extension or extension.lower() not in ['.csv', '.txt']:
            raise ResponseException('', StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)

        # Base file check
        file_row_count, self.short_rows, self.long_rows = simple_file_scan(self.reader, bucket_name, region_name,
                                                                           self.file_name)
        # total_rows = header + long_rows (and will be added on per chunk)
        # Note: we're adding long_rows here because pandas will exclude long_rows when we're loading the data
        self.total_rows = 1 + len(self.long_rows)

        # Making base error/warning files
        self.error_file_name = report_file_name(self.submission_id, False, self.file_type.name)
        self.error_file_path = ''.join([CONFIG_SERVICES['error_report_path'], self.error_file_name])
        self.warning_file_name = report_file_name(self.submission_id, True, self.file_type.name)
        self.warning_file_path = ''.join([CONFIG_SERVICES['error_report_path'], self.warning_file_name])

        with open(self.error_file_path, 'w', newline='') as error_file, \
                open(self.warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            error_csv.writerow(self.report_headers)
            warning_csv.writerow(self.report_headers)

        # Adding formatting errors to error file
        format_error_df = process_formatting_errors(self.short_rows, self.long_rows, self.report_headers)
        format_error_df.to_csv(self.error_file_path, columns=self.report_headers, index=False,
                               quoting=csv.QUOTE_ALL, mode='a', header=False)

        # Finally open the file for loading into the database with baseline validations
        self.reader.open_file(region_name, bucket_name, self.file_name, self.fields, bucket_name,
                              self.get_file_name(self.error_file_name),
                              self.daims_to_short_dict[self.file_type.file_type_id],
                              self.short_to_daims_dict[self.file_type.file_type_id],
                              is_local=self.is_local)

        # Going back to reprocess the header row
        self.reader.file.seek(0)
        reader_obj = pd.read_csv(self.reader.file, dtype=str, delimiter=self.reader.delimiter,
                                 error_bad_lines=False, keep_default_na=False, chunksize=CHUNK_SIZE,
                                 warn_bad_lines=False)
        for chunk_df in reader_obj:
            self.process_data_chunk(sess, chunk_df)

        # Ensure validated rows match initial row count
        if file_row_count != self.total_rows:
            raise ResponseException('', StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)

        loading_duration = (datetime.now() - loading_start).total_seconds()
        logger.info({
            'message': 'Completed data loading {}'.format(self.log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': self.submission_id,
            'job_id': self.job.job_id,
            'file_type': self.file_type.name,
            'action': 'data_loading',
            'status': 'finish',
            'start_time': loading_start,
            'end_time': datetime.now(),
            'duration': loading_duration,
            'total_rows': self.total_rows
        })

        return file_row_count

    def process_data_chunk(self, sess, chunk_df):
        """ Loads in a chunk of the file and performs initial validations

            Args:
                sess: the database connection
                chunk_df: the dataframe chunk of the file to process
        """
        logger.info({
            'message': 'Loading rows starting from {}'.format(self.max_row_number + 1),
            'message_type': 'ValidatorInfo',
            'submission_id': self.submission_id,
            'job_id': self.job.job_id,
            'file_type': self.file_type.name,
            'action': 'data_loading',
            'status': 'start'
        })

        # initializing warning/error files and dataframes
        total_errors = pd.DataFrame(columns=self.report_headers)
        total_warnings = pd.DataFrame(columns=self.report_headers)
        flex_data = None
        required_list = {}
        type_list = {}
        office_list = {}

        # Replace whatever the user included so we're using the database headers
        chunk_df.rename(columns=self.reader.header_dict, inplace=True)

        empty_file = chunk_df.empty

        if not empty_file:
            chunk_df = chunk_df.applymap(clean_col)

            # Adding row number
            chunk_df = chunk_df.reset_index()
            # index gets reset for each chunk, adding the header, and adding previous rows
            chunk_df['row_number'] = chunk_df.index + 1 + self.max_row_number
            self.total_rows += len(chunk_df.index)

            # Increment row numbers if any were ignored being too long
            # This syncs the row numbers back to their original values
            for row in sorted(self.long_rows):
                chunk_df.loc[chunk_df['row_number'] >= row, 'row_number'] = chunk_df['row_number'] + 1

            # Setting max row number for chunking purposes
            self.max_row_number = chunk_df['row_number'].max()

            # Filtering out already processed long rows
            self.long_rows = [row for row in self.long_rows if row > self.max_row_number]

            # Drop rows that were too short and pandas filled in with Nones
            chunk_df = chunk_df[~chunk_df['row_number'].isin(self.short_rows)]

            # Drop the index column
            chunk_df = chunk_df.drop(['index'], axis=1)

            # Drop all rows that have 1 or less filled in values (row_number is always filled in so this is how
            # we have to drop all rows that are just empty)
            chunk_df.dropna(thresh=2, inplace=True)
            empty_file = chunk_df.empty

        if not empty_file:
            if self.is_fabs:
                # create a list of all required/type labels for FABS
                labels = sess.query(ValidationLabel).all()
                for label in labels:
                    if label.label_type == 'requirement':
                        required_list[label.column_name] = label.label
                    else:
                        type_list[label.column_name] = label.label

                # Create a list of all offices
                offices = sess.query(Office.office_code, Office.sub_tier_code).all()
                for office in offices:
                    office_list[office.office_code] = office.sub_tier_code

                # Clear out the office query results to save space
                del offices

            # Gathering flex data (must be done before chunk limiting)
            if self.reader.flex_fields:
                flex_data = chunk_df.loc[:, list(self.reader.flex_fields + ['row_number'])]
            if flex_data is not None and not flex_data.empty:
                flex_data['concatted'] = flex_data.apply(lambda x: concat_flex(x), axis=1)

            # Dropping any extraneous fields included + flex data (must be done before file type checking)
            chunk_df = chunk_df[list(self.expected_headers + ['row_number'])]

            # Only do validations if it's not a D file
            if self.file_type.name not in ['award', 'award_procurement']:

                # Padding specific fields
                for field in self.parsed_fields['padded']:
                    chunk_df[field] = chunk_df.apply(
                        lambda x: FieldCleaner.pad_field(self.csv_schema[field], x[field]), axis=1)
                # Cleaning up numbers so they can be inserted properly
                for field in self.parsed_fields['number']:
                    chunk_df[field] = chunk_df.apply(lambda x: clean_numbers(x[field]), axis=1)

                if self.is_fabs:
                    chunk_df['is_valid'] = True
                    chunk_df['awarding_sub_tier_agency_c'] = chunk_df.apply(
                        lambda x: derive_fabs_awarding_sub_tier(x, office_list), axis=1)
                    chunk_df['afa_generated_unique'] = chunk_df.apply(
                        lambda x: derive_fabs_afa_generated_unique(x), axis=1)
                    chunk_df['unique_award_key'] = chunk_df.apply(
                        lambda x: derive_fabs_unique_award_key(x), axis=1)
                else:
                    chunk_df['tas'] = chunk_df.apply(lambda x: concat_tas_dict(x), axis=1)
                    chunk_df['display_tas'] = chunk_df.apply(lambda x: concat_display_tas_dict(x), axis=1)
                chunk_df['unique_id'] = chunk_df.apply(lambda x: derive_unique_id(x, self.is_fabs), axis=1)

                # Separate each of the checks to their own dataframes, then concat them together
                req_errors = check_required(chunk_df, self.parsed_fields['required'], required_list,
                                            self.report_headers,
                                            self.short_to_long_dict[self.file_type.file_type_id], flex_data,
                                            is_fabs=self.is_fabs)
                type_errors = check_type(chunk_df, self.parsed_fields['number'] + self.parsed_fields['boolean'],
                                         type_list, self.report_headers, self.csv_schema,
                                         self.short_to_long_dict[self.file_type.file_type_id], flex_data,
                                         is_fabs=self.is_fabs)
                type_error_rows = type_errors['Row Number'].tolist()
                length_errors = check_length(chunk_df, self.parsed_fields['length'], self.report_headers,
                                             self.csv_schema,
                                             self.short_to_long_dict[self.file_type.file_type_id], flex_data,
                                             type_error_rows)

                if self.is_fabs:
                    error_dfs = [req_errors, type_errors, length_errors]
                    warning_dfs = [pd.DataFrame(columns=list(self.report_headers + ['error_type']))]
                else:
                    error_dfs = [req_errors, type_errors]
                    warning_dfs = [length_errors]

                total_errors = pd.concat(error_dfs, ignore_index=True)
                total_warnings = pd.concat(warning_dfs, ignore_index=True)

                # Converting these to ints because pandas likes to change them to floats randomly
                total_errors[['Row Number', 'error_type']] = \
                    total_errors[['Row Number', 'error_type']].astype(int)
                total_warnings[['Row Number', 'error_type']] = \
                    total_warnings[['Row Number', 'error_type']].astype(int)

                self.error_rows.extend([int(x) for x in total_errors['Row Number'].tolist()])

                for index, row in total_errors.iterrows():
                    self.error_list.record_row_error(self.job.job_id, self.file_name, row['Field Name'],
                                                     row['error_type'], row['Row Number'], row['Rule Label'],
                                                     self.file_type.file_type_id, None,
                                                     RULE_SEVERITY_DICT['fatal'])

                for index, row in total_warnings.iterrows():
                    self.error_list.record_row_error(self.job.job_id, self.file_name, row['Field Name'],
                                                     row['error_type'], row['Row Number'], row['Rule Label'],
                                                     self.file_type.file_type_id, None,
                                                     RULE_SEVERITY_DICT['warning'])

                total_errors.drop(['error_type'], axis=1, inplace=True, errors='ignore')
                total_warnings.drop(['error_type'], axis=1, inplace=True, errors='ignore')

                # Remove type error rows from original dataframe
                chunk_df = chunk_df[~chunk_df['row_number'].isin(type_error_rows)]
                chunk_df.drop(['unique_id'], axis=1, inplace=True)

        # Write all the errors/warnings to their files
        total_errors.to_csv(self.error_file_path, columns=self.report_headers, index=False,
                            quoting=csv.QUOTE_ALL, mode='a', header=False)
        total_warnings.to_csv(self.warning_file_path, columns=self.report_headers, index=False,
                              quoting=csv.QUOTE_ALL, mode='a', header=False)

        # Finally load the data into the database
        if not empty_file:
            # The model data
            now = datetime.now()
            chunk_df['created_at'] = now
            chunk_df['updated_at'] = now
            chunk_df['job_id'] = self.job.job_id
            chunk_df['submission_id'] = self.submission_id
            insert_dataframe(chunk_df, self.model.__table__.name, sess.connection())

            # Flex Fields
            if flex_data is not None:
                flex_data.drop(['concatted'], axis=1, inplace=True)
                flex_data = flex_data[flex_data['row_number'].isin(chunk_df['row_number'])]

                flex_rows = pd.melt(flex_data, id_vars=['row_number'], value_vars=self.reader.flex_fields,
                                    var_name='header', value_name='cell')

                # Filling in all the shared data for these flex fields
                now = datetime.now()
                flex_rows['created_at'] = now
                flex_rows['updated_at'] = now
                flex_rows['job_id'] = self.job.job_id
                flex_rows['submission_id'] = self.submission_id
                flex_rows['file_type_id'] = self.file_type.file_type_id

                # Adding the entire set of flex fields
                insert_dataframe(flex_rows, FlexField.__table__.name, sess.connection())
        sess.commit()

        logger.info({
            'message': 'Loaded rows up to {}'.format(self.max_row_number),
            'message_type': 'ValidatorInfo',
            'submission_id': self.submission_id,
            'job_id': self.job.job_id,
            'file_type': self.file_type.name,
            'action': 'data_loading',
            'status': 'end'
        })

    def run_sql_validations(self, short_colnames, writer, warning_writer):
        """ Run all SQL rules for this file type

            Args:
                short_colnames: Dict mapping short field names to long
                writer: CsvWriter object for error file
                warning_writer: CsvWriter object for warning file

            Returns:
                a list of the row numbers that failed one of the sql-based validations
        """
        error_rows = []
        sql_failures = validate_file_by_sql(self.job, self.file_type.name,
                                            self.short_to_long_dict[self.file_type.file_type_id])
        for failure in sql_failures:
            # convert shorter, machine friendly column names used in the
            # SQL validation queries back to their long names
            if failure.field_name in short_colnames:
                field_name = short_colnames[failure.field_name]
            else:
                field_name = failure.field_name

            if failure.severity_id == RULE_SEVERITY_DICT['fatal']:
                error_rows.append(failure.row)

            try:
                # If error is an int, it's one of our prestored messages
                error_type = int(failure.error)
                error_msg = ValidationError.get_error_message(error_type)
            except ValueError:
                # If not, treat it literally
                error_msg = failure.error

            if failure.severity_id == RULE_SEVERITY_DICT['fatal']:
                writer.writerow([failure.unique_id, field_name, error_msg, failure.failed_value,
                                 failure.expected_value, failure.difference, failure.flex_fields,
                                 str(failure.row), failure.original_label])
            elif failure.severity_id == RULE_SEVERITY_DICT['warning']:
                # write to warnings file
                warning_writer.writerow([failure.unique_id, field_name, error_msg, failure.failed_value,
                                         failure.expected_value, failure.difference, failure.flex_fields,
                                         str(failure.row), failure.original_label])
            # labeled errors
            self.error_list.record_row_error(self.job.job_id, self.file_name, field_name, failure.error,
                                             self.total_rows, failure.original_label, failure.file_type_id,
                                             failure.target_file_id, failure.severity_id)
        return error_rows

    def run_cross_validation(self, job):
        """ Cross file validation job. Test all rules with matching rule_timing. Run each cross-file rule and
            create error report.

            Args:
                job: Current job
        """
        sess = GlobalDB.db().session
        job_id = job.job_id
        # Create File Status object
        create_file_if_needed(job_id)
        # Create list of errors
        error_list = ErrorInterface()

        submission_id = job.submission_id
        job_start = datetime.now()
        logger.info({
            'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job.job_id,
            'action': 'run_cross_validations',
            'start': job_start,
            'status': 'start'
        })

        # Delete existing cross file errors for this submission
        sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
        sess.commit()

        # get all cross file rules from db
        cross_file_rules = sess.query(RuleSql).filter_by(rule_cross_file_flag=True)

        # for each cross-file combo, run associated rules and create error report
        for c in get_cross_file_pairs():
            first_file = c[0]
            second_file = c[1]
            combo_rules = cross_file_rules.filter(or_(and_(RuleSql.file_id == first_file.id,
                                                           RuleSql.target_file_id == second_file.id),
                                                      and_(RuleSql.file_id == second_file.id,
                                                           RuleSql.target_file_id == first_file.id)))

            # get error file name/path
            error_file_name = report_file_name(submission_id, False, first_file.name, second_file.name)
            error_file_path = ''.join([CONFIG_SERVICES['error_report_path'], error_file_name])
            warning_file_name = report_file_name(submission_id, True, first_file.name, second_file.name)
            warning_file_path = ''.join([CONFIG_SERVICES['error_report_path'], warning_file_name])

            # open error report and gather failed rules within it
            with open(error_file_path, 'w', newline='') as error_file, \
                    open(warning_file_path, 'w', newline='') as warning_file:
                error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
                warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL,
                                         lineterminator='\n')

                # write headers to file
                error_csv.writerow(self.cross_file_report_headers)
                warning_csv.writerow(self.cross_file_report_headers)

                # send the combo rules to the cross-file SQL validator
                current_cols_short_to_long = self.short_to_long_dict[first_file.id].copy()
                current_cols_short_to_long.update(self.short_to_long_dict[second_file.id].copy())
                cross_validate_sql(combo_rules.all(), submission_id, current_cols_short_to_long, job_id,
                                   error_csv, warning_csv, error_list)
            # close files
            error_file.close()
            warning_file.close()

            # stream file to S3 when not local
            if not self.is_local:
                s3_resource = boto3.resource('s3', region_name=CONFIG_BROKER['aws_region'])
                # stream error file
                with open(error_file_path, 'rb') as csv_file:
                    s3_resource.Object(CONFIG_BROKER['aws_bucket'], self.get_file_name(error_file_name)).\
                        put(Body=csv_file)
                csv_file.close()
                os.remove(error_file_path)

                # stream warning file
                with open(warning_file_path, 'rb') as warning_csv_file:
                    s3_resource.Object(CONFIG_BROKER['aws_bucket'], self.get_file_name(warning_file_name)).\
                        put(Body=warning_csv_file)
                warning_csv_file.close()
                os.remove(warning_file_path)

        # write all recorded errors to database
        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)

        # mark job status as 'finished'
        mark_job_status(job_id, 'finished')
        job_duration = (datetime.now() - job_start).total_seconds()
        logger.info({
            'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job.job_id,
            'action': 'run_cross_validations',
            'status': 'finish',
            'start': job_start,
            'duration': job_duration
        })

        # set number of errors and warnings for submission.
        submission = populate_submission_error_info(submission_id)

        # TODO: Remove temporary step below
        # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
        # submissions as publishable
        # Publish only if no errors are present
        if submission.number_of_errors == 0:
            submission.publishable = True
        sess.commit()

        # Mark validation complete
        mark_file_complete(job_id)

    def validate_job(self, job_id):
        """ Gets file for job, validates each row, and sends valid rows to a staging table

            Args:
                job_id: Database ID for the validation Job

            Returns:
                Http response object
        """
        # Create connection to job tracker database
        sess = GlobalDB.db().session

        # Get the job
        job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
        if job is None:
            raise ResponseException('Job ID {} not found in database'.format(job_id), StatusCode.CLIENT_ERROR,
                                    None, ValidationError.jobError)

        # Make sure job's prerequisites are complete
        if not run_job_checks(job_id):
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException('Prerequisites for Job ID {} are not complete'.format(job_id),
                                    StatusCode.CLIENT_ERROR, None, validation_error_type)

        # Make sure this is a validation job
        if job.job_type.name in ('csv_record_validation', 'validation'):
            job_type_name = job.job_type.name
        else:
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException(
                'Job ID {} is not a validation job (job type is {})'.format(job_id, job.job_type.name),
                StatusCode.CLIENT_ERROR, None, validation_error_type)

        # set job status to running and do validations
        mark_job_status(job_id, 'running')
        if job_type_name == 'csv_record_validation':
            self.run_validation(job)
        elif job_type_name == 'validation':
            self.run_cross_validation(job)
        else:
            raise ResponseException('Bad job type for validator', StatusCode.INTERNAL_ERROR)

        # Update last validated date
        job.last_validated = datetime.utcnow()
        sess.commit()
        return JsonResponse.create(StatusCode.OK, {'message': 'Validation complete'})
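# A hypothetical driver, shown only to illustrate the entry point; the real broker wires
# this up through its Flask routes and config, and the directory and job_id below are
# placeholders, not actual broker values.
if __name__ == '__main__':
    manager = ValidationManager(is_local=True, directory='/tmp/error_reports')
    manager.validate_job(job_id=1)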