def normalize_headers(header_row, long_headers, long_to_short_dict):
    """ Clean the headers (remove extra spaces and lowercase) and convert them to short headers if we're given
        long headers

        Args:
            header_row: an array of the file headers given
            long_headers: boolean indicating if we're using the long versions of the headers (True for long)
            long_to_short_dict: a dictionary containing a mapping from long headers to short ones for this file type

        Yields:
            A string containing the cleaned header name (converted to short version if long versions were provided
            and there is a mapping for that header).
    """
    for header in header_row:
        header = FieldCleaner.clean_string(header)

        # Replace headers that don't match DB but are allowed by the broker with their DB matches
        if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe':
            header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe'
        elif header == 'facevalueloanguarantee':
            header = 'facevalueofdirectloanorloanguarantee'
        elif header == 'budgetauthorityavailableamounttotal_cpe':
            header = 'totalbudgetaryresources_cpe'
        elif header == 'correctionlatedeleteindicator':
            header = 'correctiondeleteindicator'
        elif header == 'place_of_performance_zip4':
            header = 'place_of_performance_zip4a'

        # yield the short header when applicable, otherwise yield the cleaned header, whatever it is
        if long_headers and header in long_to_short_dict:
            yield FieldCleaner.clean_string(long_to_short_dict[header])
        else:
            yield header
def normalize_headers(header_row, long_headers, long_to_short_dict): """ Clean the headers (remove extra spaces and lowercase) and convert them to short headers if we're given long headers Args: header_row: an array of the file headers given long_headers: boolean indicating if we're using the long versions of the headers (True for long) long_to_short_dict: a dictionary containing a mapping from long headers to short ones for this file type Yields: A string containing the cleaned header name (converted to short version if long versions were provided and there is a mapping for that header). """ for header in header_row: header = FieldCleaner.clean_name(header) # Replace headers that don't match DB but are allowed by the broker with their DB matches if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe': header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe' elif header == 'facevalueloanguarantee': header = 'facevalueofdirectloanorloanguarantee' elif header == 'budgetauthorityavailableamounttotal_cpe': header = 'totalbudgetaryresources_cpe' elif header == 'correctionlatedeleteindicator': header = 'correctiondeleteindicator' elif header == 'place_of_performance_zip4': header = 'place_of_performance_zip4a' # yield the short header when applicable, otherwise yield the cleaned header, whatever it is if long_headers and header in long_to_short_dict: yield FieldCleaner.clean_name(long_to_short_dict[header]) else: yield header
def normalize_headers(header_row, long_headers, long_to_short_dict): for header in header_row: header = FieldCleaner.clean_string(header) # Replace correctly spelled header (which does NOT match the db) with the misspelling that DOES match the db if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe': header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe' if long_headers and header in long_to_short_dict: yield FieldCleaner.clean_string(long_to_short_dict[header]) else: yield header
def normalize_headers(header_row, long_headers, long_to_short_dict): for header in header_row: header = FieldCleaner.clean_string(header) # Replace correctly spelled header (which does NOT match the db) with the misspelling that DOES match the db if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe': header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe' if header == 'facevalueloanguarantee': header = 'facevalueofdirectloanorloanguarantee' if long_headers and header in long_to_short_dict: yield FieldCleaner.clean_string(long_to_short_dict[header]) else: yield header
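# A minimal usage sketch for the normalize_headers variants above. The tiny FieldCleaner stand-in
# (lowercase + strip for both method names used across the variants) and the sample long-to-short
# mapping are illustrative assumptions, not the broker's real cleaner or schema.
class FieldCleaner:
    # minimal stand-in: both methods just lowercase and strip, which is all this example needs
    clean_string = clean_name = staticmethod(lambda value: str(value).strip().lower())

long_to_short_dict = {'facevalueofdirectloanorloanguarantee': 'face_value_loan_guarantee'}
raw_headers = [' FaceValueLoanGuarantee ', 'Flex_MyAgencyColumn']

# 'facevalueloanguarantee' is first remapped to its DB spelling, then shortened via the dict;
# the unrecognized flex header is simply cleaned and passed through.
print(list(normalize_headers(raw_headers, long_headers=True, long_to_short_dict=long_to_short_dict)))
# ['face_value_loan_guarantee', 'flex_myagencycolumn']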
def count_and_set_headers(self, csv_schema, header_row):
    """ Track how many times we've seen a field we were expecting and set self.expected_headers and
        self.flex_headers

        Args:
            csv_schema: list of FileColumn objects for this file type
            header_row: an array of the file headers given

        Returns:
            expected field dict {expected field name: header count}
    """
    self.expected_headers = []
    self.flex_headers = []

    # Track how many times we've seen a field we were expecting. Keyed by the shorter, machine-readable column names
    expected_fields = OrderedDict()

    for schema in csv_schema:
        expected_fields[FieldCleaner.clean_name(schema.name_short)] = 0

    for header_value in header_row:
        if header_value not in expected_fields:
            # Add flex headers to flex list
            if str(header_value).startswith("flex_"):
                self.flex_headers.append(header_value)
            else:
                self.flex_headers.append(None)
            # Allow unexpected headers, just mark the header as None so we skip it when reading
            self.expected_headers.append(None)
        else:
            self.flex_headers.append(None)
            self.expected_headers.append(header_value)
            expected_fields[header_value] += 1
    return expected_fields
def validateSum(value, fields_to_sum, record): """ Check that the value of one field is the sum of others :param value: The field which holds the sum we will validate against :param fields_to_sum: A comma separated list of fields which we should sum. These should be valid Decimals :param record: Record containing the data for the current record :return: True if the sum of fields is equal to the designated sum field """ decimalValues = [] # Validate that our sum is a decimal if Validator.checkType(str(value), 'DECIMAL'): decimalSum = Validator.getType(value, 'DECIMAL') else: return False # Validate each field we are summing is a decimal and store their values in an array for field in Validator.cleanSplit(fields_to_sum, True): entry = record[FieldCleaner.cleanName(field)] if entry is None or entry == "": decimalValues.append(0) elif Validator.checkType(entry, 'DECIMAL'): decimalValues.append(Validator.getType(entry, 'DECIMAL')) else: return False return decimalSum == sum(decimalValues)
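# A simplified, self-contained sketch of the sum check that validateSum performs above: treat blank
# component fields as zero, reject non-numeric values, and compare the total against the target field.
# It mirrors the logic only; the real method goes through Validator.checkType/getType and FieldCleaner.
from decimal import Decimal, InvalidOperation

def sum_matches(record, total_field, fields_to_sum):
    try:
        expected = Decimal(str(record[total_field]))
        parts = [Decimal(str(record[f])) if record.get(f) not in (None, "") else Decimal(0)
                 for f in fields_to_sum]
    except InvalidOperation:
        return False
    return expected == sum(parts)

# Example: 7 + 0 (blank treated as zero) + 3.5 == 10.5
print(sum_matches({'total': '10.5', 'a': '7', 'b': '', 'c': '3.5'}, 'total', ['a', 'b', 'c']))  # True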
def conditionalRequired(cls, data, rule, datatype, interfaces, record, isList=False):
    """ If the conditional rule passes, data must not be empty

    Args:
        data: Data to be checked
        rule: Rule object to test against
        datatype: Type to convert data into
        interfaces: InterfaceHolder object to the databases
        record: Some rule types require the entire record as a dict
    """
    # Get rule object for conditional rule
    conditionalRule = interfaces.validationDb.getRuleByLabel(rule.rule_text_1)
    if conditionalRule.file_column is not None:
        # This is a single field rule
        conditionalTypeId = conditionalRule.file_column.field_types_id
        conditionalDataType = interfaces.validationDb.getFieldTypeById(conditionalTypeId)
        conditionalData = record[conditionalRule.file_column.name]
    else:
        conditionalDataType = None
        conditionalData = record
    # If conditional rule passes, check that data is not empty
    if Validator.evaluateRule(conditionalData, conditionalRule, conditionalDataType, interfaces, record):
        if isList:
            # rule_text_2 is a list of fields
            fieldList = rule.rule_text_2.split(",")
            for field in fieldList:
                if not cls.isFieldPopulated(record[FieldCleaner.cleanName(field)]):
                    # If any are empty, rule fails
                    return False
            # All listed fields are populated, so the requirement is satisfied
            return True
        else:
            # data is value from a single field
            return cls.isFieldPopulated(data)
    else:
        # If the conditional rule fails, this field is not required, so the conditional requirement passes
        return True
def count_and_set_headers(self, csv_schema, header_row): """Track how many times we've seen a field we were expecting and set self.expected_headers and self.flex_headers""" self.expected_headers = [] self.flex_headers = [] # Track how many times we've seen a field we were expecting. Keyed by the shorter, machine-readable column names expected_fields = {} for schema in csv_schema: expected_fields[FieldCleaner.clean_string(schema.name_short)] = 0 for header_value in header_row: if header_value not in expected_fields: # Add flex headers to flex list if str(header_value).startswith("flex_"): self.flex_headers.append(header_value) else: self.flex_headers.append(None) # Allow unexpected headers, just mark the header as None so we skip it when reading self.expected_headers.append(None) else: self.flex_headers.append(None) self.expected_headers.append(header_value) expected_fields[header_value] += 1 return expected_fields
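# A standalone sketch of the header-alignment behavior implemented by count_and_set_headers above:
# expected columns keep their name, "flex_" columns are captured separately, and anything else is
# marked None so it is skipped when reading. The schema and header values here are made up for
# illustration and assume pre-cleaned names.
def align_headers(expected_names, header_row):
    expected_headers, flex_headers = [], []
    counts = {name: 0 for name in expected_names}
    for header in header_row:
        if header in counts:
            expected_headers.append(header)
            flex_headers.append(None)
            counts[header] += 1
        else:
            expected_headers.append(None)
            flex_headers.append(header if str(header).startswith('flex_') else None)
    return expected_headers, flex_headers, counts

print(align_headers(['tas', 'fain'], ['tas', 'flex_notes', 'bogus', 'fain']))
# (['tas', None, None, 'fain'], [None, 'flex_notes', None, None], {'tas': 1, 'fain': 1})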
def evaluateRule(cls,data,rule,datatype,interfaces,record): """ Checks data against specified rule Args: data: Data to be checked rule: Rule object to test against datatype: Type to convert data into interfaces: InterfaceHolder object to the databases record: Some rule types require the entire record as a dict Returns: True if rule passed, False otherwise """ if data is None: # Treat blank as an empty string data = "" value = rule.rule_text_1 currentRuleType = rule.rule_type.name # Call specific rule function ruleFunction = "_".join(["rule",str(currentRuleType).lower()]) ruleFunction = FieldCleaner.cleanString(ruleFunction) try: ruleMethod = getattr(cls, str(ruleFunction)) return ruleMethod(data, value, rule, datatype, interfaces, record) except AttributeError as e: # Unrecognized rule type raise ResponseException(str(e), StatusCode.INTERNAL_ERROR, ValueError)
def read_record(self, reader, writer, row_number, job, fields, error_list): """ Read and process the next record Args: reader: CsvReader object writer: CsvWriter object row_number: Next row number to be read job: current job fields: List of FileColumn objects for this file type error_list: instance of ErrorInterface to keep track of errors Returns: Tuple with six elements: 1. Dict of record after preprocessing 2. Boolean indicating whether to reduce row count 3. Boolean indicating whether to skip row 4. Boolean indicating whether to stop reading 5. Row error has been found 6. Dict of flex columns """ reduce_row = False row_error_found = False job_id = job.job_id try: (next_record, flex_fields) = reader.get_next_record() record = FieldCleaner.clean_row( next_record, self.long_to_short_dict[job.file_type_id], fields) record["row_number"] = row_number for flex_field in flex_fields: flex_field.submission_id = job.submission_id flex_field.job_id = job.job_id flex_field.row_number = row_number flex_field.file_type_id = job.file_type_id if reader.is_finished and len(record) < 2: # This is the last line and is empty, don't record an error return {}, True, True, True, False, [] # Don't count this row except ResponseException: if reader.is_finished and reader.extra_line: # Last line may be blank don't record an error, # reader.extra_line indicates a case where the last valid line has extra line breaks # Don't count last row if empty reduce_row = True else: writer.writerow([ "Formatting Error", ValidationError.readErrorMsg, str(row_number), "" ]) error_list.record_row_error( job_id, job.filename, "Formatting Error", ValidationError.readError, row_number, severity_id=RULE_SEVERITY_DICT['fatal']) row_error_found = True return {}, reduce_row, True, False, row_error_found, [] return record, reduce_row, False, False, row_error_found, flex_fields
def use_long_headers(header_row, long_to_short_dict):
    """Check to see if header contains long or short column names"""
    col_matches = 0
    for value in header_row:
        if FieldCleaner.clean_string(value) in long_to_short_dict:
            col_matches += 1
    # if most of column headers are in the long format, we'll treat the file as having long headers
    return col_matches > .5 * len(header_row)
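# A self-contained illustration of the majority vote performed by use_long_headers above: count how many
# cleaned headers appear in the long-to-short mapping and require more than half. The mapping entries and
# short names are invented for the example.
long_to_short = {'totalbudgetaryresources_cpe': 'total_budgetary_resources_cpe',
                 'obligationsincurredtotalbyappropriation_cpe': 'obligations_incurred_total_cpe'}

def looks_like_long_headers(header_row, mapping):
    matches = sum(1 for header in header_row if header.strip().lower() in mapping)
    return matches > .5 * len(header_row)

print(looks_like_long_headers(['TotalBudgetaryResources_CPE', 'random_column'], long_to_short))  # False (1 of 2)
print(looks_like_long_headers(['TotalBudgetaryResources_CPE',
                               'ObligationsIncurredTotalByAppropriation_CPE'], long_to_short))   # True (2 of 2)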
def load_fields(file_type_name, schema_file_name): """Load specified schema from a .csv.""" with create_app().app_context(): sess = GlobalDB.db().session # get file type object for specified fileTypeName file_type = sess.query(FileType).filter(FileType.name == file_type_name).one() # delete existing schema from database SchemaLoader.remove_columns_by_file_type(sess, file_type) # get allowable datatypes type_query = sess.query(FieldType.name, FieldType.field_type_id).all() types = {data_type.name: data_type.field_type_id for data_type in type_query} # add schema to database with open(schema_file_name, 'rU') as csvfile: reader = csv.DictReader(csvfile) file_column_count = 0 for record in reader: record = FieldCleaner.clean_record(record) fields = ["fieldname", "required", "data_type"] if all(field in record for field in fields): SchemaLoader.add_column_by_file_type( sess, types, file_type, FieldCleaner.clean_string(record["fieldname"]), FieldCleaner.clean_string(record["fieldname_short"]), record["required"], record["data_type"], record["padded_flag"], record["field_length"]) file_column_count += 1 else: raise ValueError('CSV File does not follow schema') sess.commit() logger.info({ 'message': '{} {} schema records added to {}'.format(file_column_count, file_type_name, FileColumn.__tablename__), 'message_type': 'ValidatorInfo', 'file_type': file_type.letter_name })
def getFieldsByFileList(self, fileType): """ Returns a list of valid field names that can appear in this type of file Args: fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial) Returns: list of names """ fileId = self.getFileTypeIdByName(fileType) if (fileId is None): raise ValueError("Filetype does not exist") queryResult = self.session.query(FileColumn).filter( FileColumn.file_id == fileId).all() for result in queryResult: result.name = FieldCleaner.cleanString( result.name) # Standardize field names result.name_short = FieldCleaner.cleanString(result.name_short) return queryResult
def readRecord(self, reader, writer, fileType, interfaces, rowNumber, jobId, fields):
    """ Read and process the next record

    Args:
        reader: CsvReader object
        writer: CsvWriter object
        fileType: Type of file for current job
        interfaces: InterfaceHolder object
        rowNumber: Next row number to be read
        jobId: ID of current job
        fields: List of FileColumn objects for this file type

    Returns:
        Tuple with five elements:
        1. Dict of record after preprocessing
        2. Boolean indicating whether to reduce row count
        3. Boolean indicating whether to skip row
        4. Boolean indicating whether to stop reading
        5. Row error has been found
    """
    errorInterface = interfaces.errorDb
    reduceRow = False
    rowErrorFound = False
    try:
        record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, interfaces.validationDb,
                                       self.longToShortDict, fields)
        record["row_number"] = rowNumber
        if reader.isFinished and len(record) < 2:
            # This is the last line and is empty, don't record an error
            return {}, True, True, True, False  # Don't count this row
    except ResponseException as e:
        if reader.isFinished and reader.extraLine:
            # Last line may be blank, don't record an error;
            # reader.extraLine indicates a case where the last valid line has extra line breaks
            # Don't count last row if empty
            reduceRow = True
        else:
            writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""])
            errorInterface.recordRowError(jobId, self.filename, "Formatting Error", ValidationError.readError,
                                          rowNumber,
                                          severity_id=interfaces.validationDb.getRuleSeverityId("fatal"))
            rowErrorFound = True
        return {}, reduceRow, True, False, rowErrorFound
    return record, reduceRow, False, False, rowErrorFound
def insert(self, record, fileType): """ Write single record to this table Args: record: dict with column names as keys fileType: Type of file record is in Returns: True if successful """ # Need to translate the provided record to use column IDs instead of field names for keys idRecord = {} for key in record: idRecord[str( self.interfaces.validationDb.getColumnId( key, fileType))] = record[key] if (self.BATCH_INSERT): if (self.INSERT_BY_ORM): raise NotImplementedError( "Have not implemented ORM method for batch insert") else: self.batch.append(idRecord) if (len(self.batch) > self.BATCH_SIZE): # Time to write the batch self.interface.connection.execute( self.orm.__table__.insert(), self.batch) # Reset batch self.batch = [] return True else: if (self.INSERT_BY_ORM): try: recordOrm = self.orm() except: # createTable was not called raise Exception("Must call createTable before writing") attributes = self.getPublicMembers(recordOrm) # For each field, add value to ORM object for key in idRecord: attr = FieldCleaner.cleanString(key) #key.replace(" ","_") setattr(recordOrm, attr, idRecord[key]) self.interface.session.add(recordOrm) self.interface.session.commit() return True else: raise ValueError( "Must do either batch or use ORM, cannot set both to False" )
def rule_check_prefix(cls, data, value, rule, datatype, interfaces, record): """ Check that 1-digit prefix is consistent with reimbursable flag """ dataString = FieldCleaner.cleanString(data) # Load target field and dict to compare with targetField = FieldCleaner.cleanName(rule.rule_text_1) prefixMap = json.loads(str(rule.rule_text_2)) # Check that character and value are consistent with dict in rule_text_2 if dataString[0] not in prefixMap: # Unknown prefix, this is a failure return False source = prefixMap[dataString[0]] target = record[targetField] source = source.lower() if source is not None else source target = target.lower() if target is not None else target if source == target: # Matches the value in target field, rule passes return True else: return False
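# A self-contained sketch of the prefix/flag consistency idea behind rule_check_prefix above: the first
# character of one field is looked up in a small map (rule_text_2 stores it as JSON), and the mapped value
# must match a companion field, case-insensitively. The map and sample values are invented for illustration.
import json

def prefix_consistent(data, target_value, prefix_map_json):
    prefix_map = json.loads(prefix_map_json)
    first_char = str(data).lower()[0]
    if first_char not in prefix_map:
        return False  # unknown prefix is a failure
    expected = prefix_map[first_char]
    expected = expected.lower() if expected is not None else expected
    actual = target_value.lower() if target_value is not None else target_value
    return expected == actual

print(prefix_consistent('D1234', 'd', '{"d": "D", "r": "R"}'))  # True
print(prefix_consistent('R1234', 'd', '{"d": "D", "r": "R"}'))  # False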
def insert(self, record, fileType): """ Write single record to this table Args: record: dict with column names as keys fileType: Type of file record is in Returns: True if successful """ # Need to translate the provided record to use column IDs instead of field names for keys idRecord = {} # Mark if header for key in record: if key == "row": idRecord[key] = record[key] else: idRecord[str(self.interfaces.validationDb.getColumnId(key,fileType))] = record[key] if(self.BATCH_INSERT): if(self.INSERT_BY_ORM): raise NotImplementedError("Have not implemented ORM method for batch insert") else: self.batch.append(idRecord) if(len(self.batch)>self.BATCH_SIZE): # Time to write the batch self.interface.connection.execute(self.orm.__table__.insert(),self.batch) # Reset batch self.batch = [] return True else: if(self.INSERT_BY_ORM): try: recordOrm = self.orm() except: # createTable was not called raise Exception("Must call createTable before writing") attributes = self.getPublicMembers(recordOrm) # For each field, add value to ORM object for key in idRecord: attr = FieldCleaner.cleanString(key) #key.replace(" ","_") setattr(recordOrm,attr,idRecord[key]) self.interface.session.add(recordOrm) self.interface.session.commit() return True else: raise ValueError("Must do either batch or use ORM, cannot set both to False")
def clean_numbers(value):
    """ Removes commas from strings representing numbers

        Args:
            value: the value to remove commas from

        Returns:
            The original value with commas removed if there were any
    """
    if value is not None:
        temp_value = str(value).replace(',', '')
        if FieldCleaner.is_numeric(temp_value):
            return temp_value
    return value
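# The comma-stripping idiom used by clean_numbers above, shown stand-alone: strip commas, then keep the
# stripped form only if what remains parses as a number. float() is used here as a simple numeric check;
# the real code defers to FieldCleaner.is_numeric.
def strip_commas_if_numeric(value):
    if value is None:
        return value
    stripped = str(value).replace(',', '')
    try:
        float(stripped)
        return stripped
    except ValueError:
        return value

print(strip_commas_if_numeric('1,234,567.89'))      # '1234567.89'
print(strip_commas_if_numeric('Dept, of Example'))  # 'Dept, of Example' (free text keeps its commas)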
def read_record(self, reader, writer, row_number, job, fields, error_list): """ Read and process the next record Args: reader: CsvReader object writer: CsvWriter object row_number: Next row number to be read job: current job fields: List of FileColumn objects for this file type error_list: instance of ErrorInterface to keep track of errors Returns: Tuple with six elements: 1. Dict of record after preprocessing 2. Boolean indicating whether to reduce row count 3. Boolean indicating whether to skip row 4. Boolean indicating whether to stop reading 5. Row error has been found 6. Dict of flex columns """ reduce_row = False row_error_found = False job_id = job.job_id try: (next_record, flex_fields) = reader.get_next_record() record = FieldCleaner.clean_row(next_record, self.long_to_short_dict, fields) record["row_number"] = row_number for flex_field in flex_fields: flex_field.submission_id = job.submission_id flex_field.job_id = job.job_id flex_field.row_number = row_number flex_field.file_type_id = job.file_type_id if reader.is_finished and len(record) < 2: # This is the last line and is empty, don't record an error return {}, True, True, True, False, [] # Don't count this row except ResponseException: if reader.is_finished and reader.extra_line: # Last line may be blank don't record an error, # reader.extra_line indicates a case where the last valid line has extra line breaks # Don't count last row if empty reduce_row = True else: writer.write(["Formatting Error", ValidationError.readErrorMsg, str(row_number), ""]) error_list.record_row_error(job_id, job.filename, "Formatting Error", ValidationError.readError, row_number, severity_id=RULE_SEVERITY_DICT['fatal']) row_error_found = True return {}, reduce_row, True, False, row_error_found, [] return record, reduce_row, False, False, row_error_found, flex_fields
def loadFields(fileTypeName, schemaFileName): """Load specified schema from a .csv.""" with createApp().app_context(): sess = GlobalDB.db().session # get file type object for specified fileTypeName fileType = sess.query(FileTypeValidation).filter( FileTypeValidation.name == fileTypeName).one() # delete existing schema from database SchemaLoader.removeColumnsByFileType(sess, fileType) # get allowable datatypes typeQuery = sess.query(FieldType.name, FieldType.field_type_id).all() types = {type.name: type.field_type_id for type in typeQuery} # add schema to database with open(schemaFileName, 'rU') as csvfile: reader = csv.DictReader(csvfile) for record in reader: record = FieldCleaner.cleanRecord(record) fields = ["fieldname", "required", "data_type"] if all(field in record for field in fields): SchemaLoader.addColumnByFileType( sess, types, fileType, FieldCleaner.cleanString(record["fieldname"]), FieldCleaner.cleanString( record["fieldname_short"]), record["required"], record["data_type"], record["padded_flag"], record["field_length"]) else: raise ValueError('CSV File does not follow schema') sess.commit()
def getFieldsByFile(self, fileType, shortCols=False): """ Returns a dict of valid field names that can appear in this type of file Args: fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial) shortCols -- If true, return the short column names instead of the long names Returns: dict with field names as keys and values are ORM object FileColumn """ returnDict = {} fileId = self.getFileTypeIdByName(fileType) if (fileId is None): raise ValueError("File type does not exist") queryResult = self.session.query(FileColumn).options( subqueryload("field_type")).filter( FileColumn.file_id == fileId).all() for column in queryResult: if shortCols: returnDict[FieldCleaner.cleanString( column.name_short)] = column else: returnDict[FieldCleaner.cleanString(column.name)] = column return returnDict
def rule_exists_in_table(cls, data, value, rule, datatype, interfaces, record): """ Check that field value exists in specified table, rule_text_1 has table and column to check against, rule_text_2 is length to pad to """ ruleTextOne = str(rule.rule_text_1).split(",") if len(ruleTextOne) != 2: # Bad rule definition raise ResponseException("exists_in_table rule incorrectly defined, must have both table and field in rule_text_one",StatusCode.INTERNAL_ERROR,ValueError) # Not putting model name through FieldCleaner because model names will have uppercase model = getattr(domainModels,str(ruleTextOne[0]).strip()) field = FieldCleaner.cleanString(ruleTextOne[1]) ruleTextTwo = FieldCleaner.cleanString(rule.rule_text_2) if len(ruleTextTwo) == 0: # Skip padding paddedData = FieldCleaner.cleanString(data) else: # Pad data to correct length try: padLength = int(ruleTextTwo) except ValueError as e: # Need an integer in rule_text_two raise ResponseException("Need an integer width in rule_text_two for exists_in_table rules",StatusCode.INTERNAL_ERROR,ValueError) paddedData = FieldCleaner.cleanString(data).zfill(padLength) # Build query for model and field specified query = interfaces.validationDb.session.query(model).filter(getattr(model,field) == paddedData) try: # Check that value exists in table, should be unique interfaces.validationDb.runUniqueQuery(query,"Data not found in table", "Conflicting entries found for this data") # If unique result found, rule passed return True except ResponseException as e: # If exception is no result found, rule failed if type(e.wrappedException) == type(NoResultFound()): return False else: # This is an unexpected exception, so re-raise it raise
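# The padding convention used by rule_exists_in_table above, in isolation: when rule_text_2 carries a
# width, the submitted value is left-padded with zeros before the lookup, so '123' can match a code stored
# as '000123'. The width and stored codes below are arbitrary examples, not real domain values.
def pad_for_lookup(value, width_text):
    width_text = str(width_text or '').strip().lower()
    return value if not width_text else value.zfill(int(width_text))

stored_codes = {'000123', '004567'}
print(pad_for_lookup('123', '6') in stored_codes)   # True
print(pad_for_lookup('123', '') in stored_codes)    # False (no padding requested)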
def getFieldsByFile(self, fileType): """ Returns a dict of valid field names that can appear in this type of file Args: fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial) Returns: dict with field names as keys and values are ORM object FileColumn """ returnDict = {} fileId = self.getFileId(fileType) if(fileId is None) : raise ValueError("File type does not exist") queryResult = self.session.query(FileColumn).options(subqueryload("field_type")).filter(FileColumn.file_id == fileId).all() for column in queryResult : returnDict[FieldCleaner.cleanString(column.name)] = column return returnDict
def getFieldsByFileList(self, fileType): """ Returns a list of valid field names that can appear in this type of file Args: fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial) Returns: list of names """ fileId = self.getFileId(fileType) returnList = [] if(fileId is None) : raise ValueError("Filetype does not exist") queryResult = self.session.query(FileColumn).filter(FileColumn.file_id == fileId).all() for result in queryResult: result.name = FieldCleaner.cleanString(result.name) # Standardize field names return queryResult
def load_labels(cls, filename):
    """Load non-SQL-based validation rules to db."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # Delete all records currently in table
        sess.query(ValidationLabel).delete()

        filename = os.path.join(cls.validation_labels_path, filename)

        # open csv
        with open(filename, 'rU') as csvfile:
            # read header
            header = csvfile.readline()
            # split header into field names
            raw_field_names = header.split(',')
            field_names = []
            # clean field names
            for field in raw_field_names:
                field_names.append(FieldCleaner.clean_string(field))

            unknown_fields = set(field_names) - set(cls.headers)
            if len(unknown_fields) != 0:
                raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

            missing_fields = set(cls.headers) - set(field_names)
            if len(missing_fields) != 0:
                raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

            reader = csv.DictReader(csvfile, fieldnames=field_names)
            for row in reader:
                validation_label = ValidationLabel(label=row['label'], error_message=row['error_message'],
                                                   column_name=row['column_name'], label_type=row['label_type'])

                # look up file type id
                try:
                    file_id = FILE_TYPE_DICT[row["file_type"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["file_type"], row["rule_label"]))

                validation_label.file_id = file_id

                sess.merge(validation_label)
        sess.commit()
def requireOne(record, fields, interfaces):
    """ Require at least one of the specified fields to be present

    Args:
        record: Dict for current record
        fields: List of fields to check
        interfaces: interface holder for DBs

    Returns:
        True if at least one of the fields is present
    """
    for field in fields:
        fieldName = FieldCleaner.cleanName(field)
        if fieldName in record and record[fieldName] is not None and str(record[fieldName]).strip() != "":
            # If data is present in this field, rule is satisfied
            return True

    # If all were empty, return false
    return False
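# A quick illustration of the "at least one populated" check in requireOne above, using a plain dict and
# pre-cleaned field names so no FieldCleaner or interface objects are needed; the field names are examples.
sample = {'fain': '', 'uri': '   ', 'piid': 'ABC-123'}
print(any(str(sample.get(f, '') or '').strip() != '' for f in ['fain', 'uri', 'piid']))  # True
print(any(str(sample.get(f, '') or '').strip() != '' for f in ['fain', 'uri']))          # False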
def open_file(self, region, bucket, filename, csv_schema, bucket_name, error_filename, long_to_short_dict): """ Opens file and prepares to read each record, mapping entries to specified column names Args: region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader) bucket: the S3 Bucket (not used if instantiated as CsvLocalReader) filename: The file path for the CSV file in S3 csv_schema: list of FileColumn objects for this file type bucket_name: bucket to send errors to error_filename: filename for error report long_to_short_dict: mapping of long to short schema column names """ self.filename = filename self.unprocessed = '' self.extra_line = False self.lines = [] self.flex_dictionary = {} self.header_dictionary = {} self.packet_counter = 0 current = 0 self.is_finished = False self.column_count = 0 line = self._get_line() # make sure we have not finished reading the file if self.is_finished: # Write header error for no header row with self.get_writer(bucket_name, error_filename, ["Error Type"], self.is_local) as writer: writer.write(["No header row"]) writer.finishBatch() raise ResponseException("CSV file must have a header", StatusCode.CLIENT_ERROR, ValueError, ValidationError.singleRow) duplicated_headers = [] #create the header # check delimiters in header row pipe_count = line.count("|") comma_count = line.count(",") if pipe_count != 0 and comma_count != 0: # Write header error for mixed delimiter use with self.get_writer(bucket_name, error_filename, ["Error Type"], self.is_local) as writer: writer.write([ "Cannot use both ',' and '|' as delimiters. Please choose one." ]) writer.finishBatch() raise ResponseException( "Error in header row: CSV file must use only '|' or ',' as the delimiter", StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError) self.delimiter = "|" if line.count("|") != 0 else "," # Set the list of possible_fields, using the shorter, # machine-readable column names possible_fields = {} for schema in csv_schema: possible_fields[FieldCleaner.cleanString(schema.name_short)] = 0 for row in csv.reader([line], dialect='excel', delimiter=self.delimiter): # check to see if header contains long or short column names col_matches = 0 for value in row: if FieldCleaner.cleanString(value) in long_to_short_dict: col_matches += 1 # if most of column headers are in the long format, # we'll treat the file as having long headers if col_matches > .5 * len(row): long_headers = True else: long_headers = False for cell in row: submitted_header_value = FieldCleaner.cleanString(cell) if long_headers and submitted_header_value in long_to_short_dict: header_value = FieldCleaner.cleanString( long_to_short_dict[submitted_header_value]) elif long_headers: header_value = None else: header_value = submitted_header_value if not header_value in possible_fields: # Add flex headers to flex list if str(submitted_header_value).startswith("flex_"): self.flex_dictionary[current] = submitted_header_value else: self.flex_dictionary[current] = None # Allow unexpected headers, just mark the header as None so we skip it when reading self.header_dictionary[current] = None current += 1 elif possible_fields[header_value] == 1: # Add header value (as submitted) to duplicated header list duplicated_headers.append(submitted_header_value) else: self.header_dictionary[current] = header_value possible_fields[header_value] = 1 current += 1 self.column_count = current #Check that all required fields exists missing_headers = [] for schema in csv_schema: if 
possible_fields[FieldCleaner.cleanString( schema.name_short)] == 0: # return long colname for error reporting missing_headers.append(schema.name) if len(missing_headers) > 0 or len(duplicated_headers) > 0: # Write header errors if any occurred and raise a header_error exception error_string = "" with self.get_writer(bucket_name, error_filename, self.header_report_headers, self.is_local) as writer: extra_info = {} if len(duplicated_headers) > 0: error_string = "".join([ error_string, "Duplicated: ", ", ".join(duplicated_headers) ]) extra_info["duplicated_headers"] = ", ".join( duplicated_headers) for header in duplicated_headers: writer.write(["Duplicated header", header]) if len(missing_headers) > 0: if len(duplicated_headers): # Separate missing and duplicated headers if both are present error_string += "| " error_string = "".join([ error_string, "Missing: ", ", ".join(missing_headers) ]) extra_info["missing_headers"] = ", ".join(missing_headers) for header in missing_headers: writer.write(["Missing header", header]) writer.finishBatch() raise ResponseException( "Errors in header row: " + str(error_string), StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError, **extra_info) return long_headers
def openFile(self, region, bucket, filename, csvSchema, bucketName, errorFilename): """ Opens file and prepares to read each record, mapping entries to specified column names Args: bucket : the S3 Bucket filename: The file path for the CSV file in S3 writer: An implementation of csvAbstractWriter to send header errors to Returns: """ possibleFields = {} currentFields = {} for schema in csvSchema: possibleFields[FieldCleaner.cleanString(schema.name)] = 0 self.filename = filename self.unprocessed = '' self.extraLine = False self.lines = [] self.headerDictionary = {} self.packetCounter = 0 current = 0 self.isFinished = False self.columnCount = 0 line = self._getLine() # make sure we have not finished reading the file if (self.isFinished): raise ResponseException("CSV file must have a header", StatusCode.CLIENT_ERROR, ValueError, ValidationError.singleRow) duplicatedHeaders = [] #create the header for row in csv.reader([line], dialect='excel'): for cell in row: headerValue = FieldCleaner.cleanString(cell) if (not headerValue in possibleFields): # Allow unexpected headers, just mark the header as None so we skip it when reading self.headerDictionary[(current)] = None current += 1 elif (possibleFields[headerValue] == 1): # Add to duplicated header list duplicatedHeaders.append(headerValue) else: self.headerDictionary[(current)] = headerValue possibleFields[headerValue] = 1 current += 1 self.columnCount = current #Check that all required fields exists missingHeaders = [] for schema in csvSchema: if (schema.required and possibleFields[FieldCleaner.cleanString( schema.name)] == 0): missingHeaders.append(schema.name) if (len(missingHeaders) > 0 or len(duplicatedHeaders) > 0): # Write header errors if any occurred and raise a header_error exception with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer: extraInfo = {} if (len(duplicatedHeaders) > 0): extraInfo["duplicated_headers"] = ", ".join( duplicatedHeaders) for header in duplicatedHeaders: writer.write(["Duplicated header", header]) if (len(missingHeaders) > 0): extraInfo["missing_headers"] = ", ".join(missingHeaders) for header in missingHeaders: writer.write(["Missing header", header]) writer.finishBatch() raise ResponseException("Errors in header row", StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError, **extraInfo)
def createTable(self, fileType, filename, jobId, tableName=None): """ Create staging table for new file Args: fileType -- type of file to create a table for (e.g. Award, AwardFinancial) Returns: tableName if created, exception otherwise """ if (tableName == None): tableName = self.interface.getTableName(jobId) self.name = tableName if (self.interface.tableExists(tableName)): # Old table still present, drop table and replace self.interface.dropTable(tableName) # Alternate way of naming tables #tableName = "data" + tableName.replace("/","").replace("\\","").replace(".","") # Write tableName to related job in job tracker self.interfaces.jobDb.addStagingTable(jobId, tableName) fields = self.interfaces.validationDb.getFieldsByFile(fileType) """ Might not need sequence for ORM # Create sequence to be used for primary key sequenceName = tableName + "Serial" sequenceStatement = "CREATE SEQUENCE " + sequenceName + " START 1" try: self.runStatement(sequenceStatement) except ProgrammingError: # Sequence already exists pass """ primaryAssigned = False # Create empty dict for field names and values classFieldDict = {"__tablename__": tableName} # Add each column for key in fields: # Build column statement for this key # Create cleaned version of key newKey = str(fields[key].file_column_id) # Get correct type name fieldTypeName = FieldCleaner.cleanString( fields[key].field_type.name) if (fieldTypeName == "string"): fieldTypeName = Text elif (fieldTypeName == "int"): fieldTypeName = Integer elif (fieldTypeName == "decimal"): fieldTypeName = Numeric elif (fieldTypeName == "boolean"): fieldTypeName = Boolean elif (fieldTypeName == "long"): fieldTypeName = BigInteger else: raise ValueError("Bad field type") # Get extra parameters (primary key or not null) extraParam = "" if (FieldCleaner.cleanString( fields[key].field_type.description) == "primary_key"): classFieldDict[newKey] = Column(fieldTypeName, primary_key=True) primaryAssigned = True elif (fields[key].required): classFieldDict[newKey] = Column(fieldTypeName, nullable=False) else: classFieldDict[newKey] = Column(fieldTypeName) if (not primaryAssigned): # If no primary key assigned, add one based on table name classFieldDict["".join([tableName, "id"])] = Column(Integer, primary_key=True) # Create ORM class based on dict self.orm = type(tableName, (declarative_base(), ), classFieldDict) self.jobId = jobId # Create table self.orm.__table__.create(self.interface.engine)
def load_sql(cls, filename):
    """Load SQL-based validation rules to db."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # Delete all records currently in table
        sess.query(RuleSql).delete()

        filename = os.path.join(cls.sql_rules_path, filename)

        # open csv
        with open(filename, 'rU') as csvfile:
            # read header
            header = csvfile.readline()
            # split header into field names
            raw_field_names = header.split(',')
            field_names = []
            # clean field names
            for field in raw_field_names:
                field_names.append(FieldCleaner.clean_string(field))

            unknown_fields = set(field_names) - set(cls.headers)
            if len(unknown_fields) != 0:
                raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

            missing_fields = set(cls.headers) - set(field_names)
            if len(missing_fields) != 0:
                raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

            reader = csv.DictReader(csvfile, fieldnames=field_names)
            for row in reader:
                sql = cls.read_sql_str(row['query_name'])

                rule_sql = RuleSql(rule_sql=sql, rule_label=row['rule_label'],
                                   rule_error_message=row['rule_error_message'], query_name=row['query_name'])

                # look up file type id
                try:
                    file_id = FILE_TYPE_DICT[row["file_type"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["file_type"], row["rule_label"]))
                try:
                    if row["target_file"].strip() == "":
                        # No target file provided
                        target_file_id = None
                    else:
                        target_file_id = FILE_TYPE_DICT[row["target_file"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["target_file"], row["rule_label"]))

                # set cross file flag
                flag = FieldCleaner.clean_string(row["rule_cross_file_flag"])
                if flag in ('true', 't', 'y', 'yes'):
                    cross_file_flag = True
                else:
                    cross_file_flag = False

                rule_sql.rule_severity_id = RULE_SEVERITY_DICT[row['severity_name']]
                rule_sql.file_id = file_id
                rule_sql.target_file_id = target_file_id
                rule_sql.rule_cross_file_flag = cross_file_flag

                sess.merge(rule_sql)
        sess.commit()
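# The truthy-flag parsing used for rule_cross_file_flag above, shown on its own: the cleaned value is
# compared against a small set of accepted spellings, and everything else is treated as False. Assumes a
# cleaner that lowercases and strips, as the broker's clean_string appears to do.
def parse_cross_file_flag(raw):
    return str(raw).strip().lower() in ('true', 't', 'y', 'yes')

print([parse_cross_file_flag(v) for v in ('True', 'yes', 'N', '', 'false')])
# [True, True, False, False, False]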
def loadCsv(cls,filename,model,interface,fieldMap,fieldOptions): """ Loads a table based on a csv Args: filename: CSV to load model: ORM object for table to be loaded interface: interface to DB table is in fieldMap: dict that maps columns of the csv to attributes of the ORM object fieldOptions: dict with keys of attribute names, value contains a dict with options for that attribute. Current options are "pad_to_length" which if present will pad the field with leading zeros up to specified length, and "skip_duplicate" which ignores subsequent lines that repeat values. """ # Delete all records currently in table interface.session.query(model).delete() interface.session.commit() valuePresent = {} # Open csv with open(filename,'rU') as csvfile: # Read header header = csvfile.readline() # Split header into fieldnames rawFieldNames = header.split(",") fieldNames = [] # Clean field names for field in rawFieldNames: fieldNames.append(FieldCleaner.cleanString(field)) # Map fieldnames to attribute names attributeNames = [] for field in fieldNames: if field in fieldMap: attributeNames.append(fieldMap[field]) if fieldMap[field] in fieldOptions and "skip_duplicates" in fieldOptions[fieldMap[field]]: # Create empty dict for this field valuePresent[fieldMap[field]] = {} else: raise KeyError("".join(["Found unexpected field ", str(field)])) # Check that all fields are present for field in fieldMap: if not field in fieldNames: raise ValueError("".join([str(field)," is required for loading table ", str(type(model))])) # Open DictReader with attribute names reader = csv.DictReader(csvfile,fieldnames = attributeNames) # For each row, create instance of model and add it for row in reader: skipInsert = False for field in fieldOptions: # For each field with options present, modify according to those options options = fieldOptions[field] if "pad_to_length" in options: padLength = options["pad_to_length"] row[field] = Validator.padToLength(row[field],padLength) if "skip_duplicates" in options: if len(row[field].strip()) == 0 or row[field] in valuePresent[field]: # Value not provided or already exists, skip it skipInsert = True else: # Insert new value valuePresent[field][row[field]] = True record = model(**row) if not skipInsert: try: interface.session.merge(record) except IntegrityError as e: # Hit a duplicate value that violates index, skip this one print("".join(["Warning: Skipping this row: ",str(row)])) print("".join(["Due to error: ",str(e)])) interface.session.rollback() continue interface.session.commit()
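# An illustrative fieldOptions dict for loadCsv above: pad one attribute with leading zeros and skip rows
# that repeat values of another. The attribute names are invented; real option maps depend on the ORM
# model being loaded.
example_field_options = {
    'allocation_transfer_agency': {'pad_to_length': 3},  # e.g. '91' -> '091' before insert
    'tas': {'skip_duplicates': True}                      # later rows repeating this value are ignored
}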
def runValidation(self, jobId, interfaces): """ Run validations for specified job Args: jobId: Job to be validated jobTracker: Interface for job tracker Returns: True if successful """ jobTracker = interfaces.jobDb rowNumber = 1 fileType = jobTracker.getFileType(jobId) # If local, make the error report directory if(self.isLocal and not os.path.exists(self.directory)): os.makedirs(self.directory) # Get bucket name and file name fileName = jobTracker.getFileName(jobId) self.filename = fileName bucketName = CONFIG_BROKER['aws_bucket'] regionName = CONFIG_BROKER['aws_region'] errorFileName = self.getFileName(jobTracker.getReportPath(jobId)) # Create File Status object interfaces.errorDb.createFileIfNeeded(jobId,fileName) validationDB = interfaces.validationDb fieldList = validationDB.getFieldsByFileList(fileType) csvSchema = validationDB.getFieldsByFile(fileType) rules = validationDB.getRulesByFile(fileType) reader = self.getReader() # Get file size and write to jobs table if(CONFIG_BROKER["use_aws"]): fileSize = s3UrlHandler.getFileSize("errors/"+jobTracker.getReportPath(jobId)) else: fileSize = os.path.getsize(jobTracker.getFileName(jobId)) jobTracker.setFileSizeById(jobId, fileSize) try: # Pull file reader.openFile(regionName, bucketName, fileName,fieldList,bucketName,errorFileName) # Create staging table tableName = interfaces.stagingDb.getTableName(jobId) # Create staging table tableObject = StagingTable(interfaces) tableObject.createTable(fileType,fileName,jobId,tableName) errorInterface = interfaces.errorDb # While not done, pull one row and put it into staging if it passes # the Validator with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer: while(not reader.isFinished): rowNumber += 1 #if (rowNumber % 1000) == 0: # print("Validating row " + str(rowNumber)) try : record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB) record["row"] = rowNumber if(reader.isFinished and len(record) < 2): # This is the last line and is empty, don't record an error rowNumber -= 1 # Don't count this row break except ResponseException as e: if reader.isFinished and reader.extraLine: #Last line may be blank don't record an error, reader.extraLine indicates a case where the last valid line has extra line breaks # Don't count last row if empty rowNumber -= 1 else: writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""]) errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.readError,rowNumber) errorInterface.setRowErrorsPresent(jobId, True) continue valid, failures = Validator.validate(record,rules,csvSchema,fileType,interfaces) if(valid) : try: tableObject.insert(record,fileType) except ResponseException as e: # Write failed, move to next record writer.write(["Formatting Error", ValidationError.writeErrorMsg, str(rowNumber),""]) errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.writeError,rowNumber) errorInterface.setRowErrorsPresent(jobId, True) continue else: # For each failure, record it in error report and metadata if failures: errorInterface.setRowErrorsPresent(jobId, True) for failure in failures: fieldName = failure[0] error = failure[1] failedValue = failure[2] try: # If error is an int, it's one of our prestored messages errorType = int(error) errorMsg = ValidationError.getErrorMessage(errorType) except ValueError: # If not, treat it literally errorMsg = error writer.write([fieldName,errorMsg,str(rowNumber),failedValue]) 
errorInterface.recordRowError(jobId,self.filename,fieldName,error,rowNumber) # Write unfinished batch writer.finishBatch() # Write number of rows to job table jobTracker.setNumberOfRowsById(jobId,rowNumber) # Write leftover records tableObject.endBatch() # Mark validation as finished in job tracker jobTracker.markJobStatus(jobId,"finished") errorInterface.writeAllRowErrors(jobId) finally: #ensure the file always closes reader.close() return True
def rule_sum_fields(cls, data, value, rule, datatype, interfaces, record):
    """Checks that set of fields sums to value in other field"""
    valueToMatch = record[FieldCleaner.cleanName(rule.rule_text_1)]
    if valueToMatch is None or valueToMatch == "":
        valueToMatch = 0
    return cls.validateSum(valueToMatch, rule.rule_text_2, record)
def process_data_chunk(self, sess, chunk_df):
    """ Loads in a chunk of the file and performs initial validations

        Args:
            sess: the database connection
            chunk_df: the chunk of the file to process, as a pandas DataFrame
    """
    logger.info({
        'message': 'Loading rows starting from {}'.format(self.max_row_number + 1),
        'message_type': 'ValidatorInfo',
        'submission_id': self.submission_id,
        'job_id': self.job.job_id,
        'file_type': self.file_type.name,
        'action': 'data_loading',
        'status': 'start'
    })

    # initializing warning/error files and dataframes
    total_errors = pd.DataFrame(columns=self.report_headers)
    total_warnings = pd.DataFrame(columns=self.report_headers)
    flex_data = None
    required_list = {}
    type_list = {}
    office_list = {}

    # Replace whatever the user included so we're using the database headers
    chunk_df.rename(columns=self.reader.header_dict, inplace=True)

    empty_file = chunk_df.empty

    if not empty_file:
        chunk_df = chunk_df.applymap(clean_col)

        # Adding row number
        chunk_df = chunk_df.reset_index()
        # The index resets for each chunk; add 1 for the header row plus the rows from previous chunks
        chunk_df['row_number'] = chunk_df.index + 1 + self.max_row_number
        self.total_rows += len(chunk_df.index)

        # Increment row numbers if any rows were ignored for being too long
        # This syncs the row numbers back to their original values
        for row in sorted(self.long_rows):
            chunk_df.loc[chunk_df['row_number'] >= row, 'row_number'] = chunk_df['row_number'] + 1

        # Setting max row number for chunking purposes
        self.max_row_number = chunk_df['row_number'].max()

        # Filtering out already processed long rows
        self.long_rows = [row for row in self.long_rows if row > self.max_row_number]

        # Drop rows that were too short and pandas filled in with Nones
        chunk_df = chunk_df[~chunk_df['row_number'].isin(self.short_rows)]

        # Drop the index column
        chunk_df = chunk_df.drop(['index'], axis=1)

        # Drop all rows that have 1 or fewer filled-in values (row_number is always filled in,
        # so this is how we drop rows that are entirely empty)
        chunk_df.dropna(thresh=2, inplace=True)
        empty_file = chunk_df.empty

    if not empty_file:
        if self.is_fabs:
            # create a list of all required/type labels for FABS
            labels = sess.query(ValidationLabel).all()
            for label in labels:
                if label.label_type == 'requirement':
                    required_list[label.column_name] = label.label
                else:
                    type_list[label.column_name] = label.label

            # Create a mapping of all offices
            offices = sess.query(Office.office_code, Office.sub_tier_code).all()
            for office in offices:
                office_list[office.office_code] = office.sub_tier_code

            # Clear out the office query results to save space
            del offices

        # Gathering flex data (must be done before chunk limiting)
        if self.reader.flex_fields:
            flex_data = chunk_df.loc[:, list(self.reader.flex_fields + ['row_number'])]
        if flex_data is not None and not flex_data.empty:
            flex_data['concatted'] = flex_data.apply(lambda x: concat_flex(x), axis=1)

        # Dropping any extraneous fields included + flex data (must be done before file type checking)
        chunk_df = chunk_df[list(self.expected_headers + ['row_number'])]

        # Only do validations if it's not a D file
        if self.file_type.name not in ['award', 'award_procurement']:
            # Padding specific fields
            for field in self.parsed_fields['padded']:
                chunk_df[field] = chunk_df.apply(
                    lambda x: FieldCleaner.pad_field(self.csv_schema[field], x[field]), axis=1)
            # Cleaning up numbers so they can be inserted properly
            for field in self.parsed_fields['number']:
                chunk_df[field] = chunk_df.apply(lambda x: clean_numbers(x[field]), axis=1)

            if self.is_fabs:
                chunk_df['is_valid'] = True
                chunk_df['awarding_sub_tier_agency_c'] = chunk_df.apply(
                    lambda x: derive_fabs_awarding_sub_tier(x, office_list), axis=1)
                chunk_df['afa_generated_unique'] = chunk_df.apply(
                    lambda x: derive_fabs_afa_generated_unique(x), axis=1)
                chunk_df['unique_award_key'] = chunk_df.apply(
                    lambda x: derive_fabs_unique_award_key(x), axis=1)
            else:
                chunk_df['tas'] = chunk_df.apply(lambda x: concat_tas_dict(x), axis=1)
                chunk_df['display_tas'] = chunk_df.apply(lambda x: concat_display_tas_dict(x), axis=1)
            chunk_df['unique_id'] = chunk_df.apply(lambda x: derive_unique_id(x, self.is_fabs), axis=1)

            # Separate each of the checks into their own dataframes, then concat them together
            req_errors = check_required(chunk_df, self.parsed_fields['required'], required_list,
                                        self.report_headers,
                                        self.short_to_long_dict[self.file_type.file_type_id],
                                        flex_data, is_fabs=self.is_fabs)
            type_errors = check_type(chunk_df, self.parsed_fields['number'] + self.parsed_fields['boolean'],
                                     type_list, self.report_headers, self.csv_schema,
                                     self.short_to_long_dict[self.file_type.file_type_id], flex_data,
                                     is_fabs=self.is_fabs)
            type_error_rows = type_errors['Row Number'].tolist()
            length_errors = check_length(chunk_df, self.parsed_fields['length'], self.report_headers,
                                         self.csv_schema, self.short_to_long_dict[self.file_type.file_type_id],
                                         flex_data, type_error_rows)

            if self.is_fabs:
                error_dfs = [req_errors, type_errors, length_errors]
                warning_dfs = [pd.DataFrame(columns=list(self.report_headers + ['error_type']))]
            else:
                error_dfs = [req_errors, type_errors]
                warning_dfs = [length_errors]

            total_errors = pd.concat(error_dfs, ignore_index=True)
            total_warnings = pd.concat(warning_dfs, ignore_index=True)

            # Converting these to ints because pandas likes to change them to floats randomly
            total_errors[['Row Number', 'error_type']] = total_errors[['Row Number', 'error_type']].astype(int)
            total_warnings[['Row Number', 'error_type']] = \
                total_warnings[['Row Number', 'error_type']].astype(int)

            self.error_rows.extend([int(x) for x in total_errors['Row Number'].tolist()])

            for index, row in total_errors.iterrows():
                self.error_list.record_row_error(self.job.job_id, self.file_name, row['Field Name'],
                                                 row['error_type'], row['Row Number'], row['Rule Label'],
                                                 self.file_type.file_type_id, None, RULE_SEVERITY_DICT['fatal'])

            for index, row in total_warnings.iterrows():
                self.error_list.record_row_error(self.job.job_id, self.file_name, row['Field Name'],
                                                 row['error_type'], row['Row Number'], row['Rule Label'],
                                                 self.file_type.file_type_id, None, RULE_SEVERITY_DICT['warning'])

            total_errors.drop(['error_type'], axis=1, inplace=True, errors='ignore')
            total_warnings.drop(['error_type'], axis=1, inplace=True, errors='ignore')

            # Remove type error rows from the original dataframe
            chunk_df = chunk_df[~chunk_df['row_number'].isin(type_error_rows)]
            chunk_df.drop(['unique_id'], axis=1, inplace=True)

    # Write all the errors/warnings to their files
    total_errors.to_csv(self.error_file_path, columns=self.report_headers, index=False, quoting=csv.QUOTE_ALL,
                        mode='a', header=False)
    total_warnings.to_csv(self.warning_file_path, columns=self.report_headers, index=False,
                          quoting=csv.QUOTE_ALL, mode='a', header=False)

    # Finally load the data into the database
    if not empty_file:
        # The model data
        now = datetime.now()
        chunk_df['created_at'] = now
        chunk_df['updated_at'] = now
        chunk_df['job_id'] = self.job.job_id
        chunk_df['submission_id'] = self.submission_id
        insert_dataframe(chunk_df, self.model.__table__.name, sess.connection())

        # Flex Fields
        if flex_data is not None:
            flex_data.drop(['concatted'], axis=1, inplace=True)
            flex_data = flex_data[flex_data['row_number'].isin(chunk_df['row_number'])]

            flex_rows = pd.melt(flex_data, id_vars=['row_number'], value_vars=self.reader.flex_fields,
                                var_name='header', value_name='cell')

            # Filling in all the shared data for these flex fields
            now = datetime.now()
            flex_rows['created_at'] = now
            flex_rows['updated_at'] = now
            flex_rows['job_id'] = self.job.job_id
            flex_rows['submission_id'] = self.submission_id
            flex_rows['file_type_id'] = self.file_type.file_type_id

            # Adding the entire set of flex fields
            insert_dataframe(flex_rows, FlexField.__table__.name, sess.connection())
    sess.commit()

    logger.info({
        'message': 'Loaded rows up to {}'.format(self.max_row_number),
        'message_type': 'ValidatorInfo',
        'submission_id': self.submission_id,
        'job_id': self.job.job_id,
        'file_type': self.file_type.name,
        'action': 'data_loading',
        'status': 'end'
    })
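# Illustrative sketch (independent of the validator class): how pd.melt reshapes wide
# flex-field columns into one (row_number, header, cell) record per value, which is the
# shape inserted into the flex-field table above. Column names are hypothetical.
import pandas as pd

flex_data = pd.DataFrame({
    'row_number': [2, 3],
    'flexa': ['x1', 'x2'],
    'flexb': ['y1', 'y2'],
})

flex_rows = pd.melt(flex_data, id_vars=['row_number'], value_vars=['flexa', 'flexb'],
                    var_name='header', value_name='cell')
print(flex_rows)
# -> 4 rows: (2, flexa, x1), (3, flexa, x2), (2, flexb, y1), (3, flexb, y2)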
def openFile(self, region, bucket, filename, csvSchema, bucketName, errorFilename):
    """ Opens the file and prepares to read each record, mapping entries to the specified column names

    Args:
        region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
        bucket: the S3 bucket (not used if instantiated as CsvLocalReader)
        filename: the file path for the CSV file in S3
        csvSchema: list of FileColumn objects for this file type
        bucketName: bucket to send errors to
        errorFilename: filename for the error report
    """
    possibleFields = {}
    currentFields = {}
    for schema in csvSchema:
        possibleFields[FieldCleaner.cleanString(schema.name)] = 0

    self.filename = filename
    self.unprocessed = ''
    self.extraLine = False
    self.lines = []
    self.headerDictionary = {}
    self.packetCounter = 0
    current = 0
    self.isFinished = False
    self.columnCount = 0
    line = self._getLine()
    # make sure we have not finished reading the file
    if self.isFinished:
        # Write header error for no header row
        with self.getWriter(bucketName, errorFilename, ["Error Type"], self.isLocal) as writer:
            writer.write(["No header row"])
            writer.finishBatch()
        raise ResponseException("CSV file must have a header", StatusCode.CLIENT_ERROR, ValueError,
                                ValidationError.singleRow)

    duplicatedHeaders = []

    # check delimiters in header row
    pipeCount = line.count("|")
    commaCount = line.count(",")

    if pipeCount != 0 and commaCount != 0:
        # Write header error for mixed delimiter use
        with self.getWriter(bucketName, errorFilename, ["Error Type"], self.isLocal) as writer:
            writer.write(["Cannot use both ',' and '|' as delimiters. Please choose one."])
            writer.finishBatch()
        raise ResponseException("Error in header row: CSV file must use only '|' or ',' as the delimiter",
                                StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError)

    self.delimiter = "|" if pipeCount != 0 else ","

    # create the header mapping
    for row in csv.reader([line], dialect='excel', delimiter=self.delimiter):
        for cell in row:
            headerValue = FieldCleaner.cleanString(cell)
            if headerValue not in possibleFields:
                # Allow unexpected headers, just mark the header as None so we skip it when reading
                self.headerDictionary[current] = None
                current += 1
            elif possibleFields[headerValue] == 1:
                # Add to duplicated header list
                duplicatedHeaders.append(headerValue)
            else:
                self.headerDictionary[current] = headerValue
                possibleFields[headerValue] = 1
                current += 1
    self.columnCount = current

    # Check that all required fields exist
    missingHeaders = []
    for schema in csvSchema:
        if possibleFields[FieldCleaner.cleanString(schema.name)] == 0:
            missingHeaders.append(schema.name)

    if len(missingHeaders) > 0 or len(duplicatedHeaders) > 0:
        # Write header errors if any occurred and raise a header_error exception
        errorString = ""
        with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer:
            extraInfo = {}
            if len(duplicatedHeaders) > 0:
                errorString = "".join([errorString, "Duplicated: ", ", ".join(duplicatedHeaders)])
                extraInfo["duplicated_headers"] = ", ".join(duplicatedHeaders)
                for header in duplicatedHeaders:
                    writer.write(["Duplicated header", header])
            if len(missingHeaders) > 0:
                if len(duplicatedHeaders):
                    # Separate missing and duplicated headers if both are present
                    errorString += "| "
                errorString = "".join([errorString, "Missing: ", ", ".join(missingHeaders)])
                extraInfo["missing_headers"] = ", ".join(missingHeaders)
                for header in missingHeaders:
                    writer.write(["Missing header", header])
            writer.finishBatch()
        raise ResponseException("Errors in header row: " + str(errorString), StatusCode.CLIENT_ERROR,
                                ValueError, ValidationError.headerError, **extraInfo)
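# Illustrative sketch (standalone): the delimiter check above rejects header rows that mix
# '|' and ',', then picks '|' when present and ',' otherwise. A minimal version with a
# hypothetical header line:
import csv

def detect_delimiter(header_line):
    pipe_count = header_line.count('|')
    comma_count = header_line.count(',')
    if pipe_count and comma_count:
        raise ValueError("Cannot use both ',' and '|' as delimiters. Please choose one.")
    return '|' if pipe_count else ','

header_line = 'allocationtransferagencyidentifier|agencyidentifier|availabilitytypecode'
delimiter = detect_delimiter(header_line)
print(next(csv.reader([header_line], dialect='excel', delimiter=delimiter)))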
def createTable(self, fileType, filename, jobId, tableName=None):
    """ Create staging table for new file

    Args:
        fileType: type of file to create a table for (e.g. Award, AwardFinancial)
        filename: name of the file being staged
        jobId: ID of the job this staging table belongs to
        tableName: optional table name; looked up from the staging interface when not provided
    """
    if tableName is None:
        tableName = self.interface.getTableName(jobId)
    self.name = tableName

    if self.interface.tableExists(tableName):
        # Old table still present, drop table and replace
        self.interface.dropTable(tableName)

    # Write tableName to related job in job tracker
    self.interfaces.jobDb.addStagingTable(jobId, tableName)
    fields = self.interfaces.validationDb.getFieldsByFile(fileType)

    primaryAssigned = False
    # Create empty dict for field names and values
    classFieldDict = {"__tablename__": tableName}
    # Create dict to hold record for field names
    fieldNameMap = {}

    # Add each column
    for key in fields:
        # Build column for this key, using the file column ID as the cleaned attribute name
        newKey = str(fields[key].file_column_id)
        # Get correct type name
        fieldTypeName = FieldCleaner.cleanString(fields[key].field_type.name)
        if fieldTypeName == "string":
            fieldTypeName = Text
        elif fieldTypeName == "int":
            fieldTypeName = Integer
        elif fieldTypeName == "decimal":
            fieldTypeName = Numeric
        elif fieldTypeName == "boolean":
            fieldTypeName = Boolean
        elif fieldTypeName == "long":
            fieldTypeName = BigInteger
        else:
            raise ValueError("Bad field type")
        # Apply primary key or not-null constraints as needed
        if FieldCleaner.cleanString(fields[key].field_type.description) == "primary_key":
            classFieldDict[newKey] = Column(fieldTypeName, primary_key=True)
            primaryAssigned = True
        elif fields[key].required:
            classFieldDict[newKey] = Column(fieldTypeName, nullable=False)
        else:
            classFieldDict[newKey] = Column(fieldTypeName)
        # First record will hold field names
        fieldNameMap[str(newKey)] = str(key)

    # Add column for row number
    classFieldDict["row"] = Column(Integer, nullable=False)

    if not primaryAssigned:
        # If no primary key assigned, add one based on table name
        classFieldDict["".join([tableName, "id"])] = Column(Integer, primary_key=True)

    # Create ORM class based on dict
    self.orm = type(tableName, (declarative_base(),), classFieldDict)
    self.jobId = jobId

    # Create table
    self.orm.__table__.create(self.interface.engine)

    # Add field name map to table
    self.interface.addFieldNameMap(tableName, fieldNameMap)
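# Illustrative sketch (standalone, assumes SQLAlchemy 1.4+): the staging table above is built at
# runtime by passing a dict of Column objects to type() with a declarative base. A minimal
# version against an in-memory SQLite engine; the table and column names are hypothetical.
from sqlalchemy import create_engine, Column, Integer, Text, Numeric
from sqlalchemy.orm import declarative_base

def build_staging_model(table_name, columns):
    """columns: dict mapping attribute name -> sqlalchemy Column object"""
    class_dict = {"__tablename__": table_name}
    class_dict.update(columns)
    # Ensure there is always a primary key
    if not any(col.primary_key for col in columns.values()):
        class_dict[table_name + "id"] = Column(Integer, primary_key=True)
    return type(table_name, (declarative_base(),), class_dict)

engine = create_engine("sqlite://")
StagingModel = build_staging_model("job1staging", {
    "agencyidentifier": Column(Text, nullable=False),
    "totalbudgetaryresources_cpe": Column(Numeric),
    "row": Column(Integer, nullable=False),
})
StagingModel.__table__.create(engine)
print(StagingModel.__table__.columns.keys())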
def openFile(self, region, bucket, filename, csvSchema, bucketName, errorFilename):
    """ Opens the file and prepares to read each record, mapping entries to the specified column names

    Args:
        region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
        bucket: the S3 bucket (not used if instantiated as CsvLocalReader)
        filename: the file path for the CSV file in S3
        csvSchema: list of FileColumn objects for this file type
        bucketName: bucket to send errors to
        errorFilename: filename for the error report
    """
    possibleFields = {}
    currentFields = {}
    for schema in csvSchema:
        possibleFields[FieldCleaner.cleanString(schema.name)] = 0

    self.filename = filename
    self.unprocessed = ''
    self.extraLine = False
    self.lines = []
    self.headerDictionary = {}
    self.packetCounter = 0
    current = 0
    self.isFinished = False
    self.columnCount = 0
    line = self._getLine()
    # make sure we have not finished reading the file
    if self.isFinished:
        raise ResponseException("CSV file must have a header", StatusCode.CLIENT_ERROR, ValueError,
                                ValidationError.singleRow)

    duplicatedHeaders = []
    # create the header mapping
    for row in csv.reader([line], dialect='excel'):
        for cell in row:
            headerValue = FieldCleaner.cleanString(cell)
            if headerValue not in possibleFields:
                # Allow unexpected headers, just mark the header as None so we skip it when reading
                self.headerDictionary[current] = None
                current += 1
            elif possibleFields[headerValue] == 1:
                # Add to duplicated header list
                duplicatedHeaders.append(headerValue)
            else:
                self.headerDictionary[current] = headerValue
                possibleFields[headerValue] = 1
                current += 1
    self.columnCount = current

    # Check that all required fields exist
    missingHeaders = []
    for schema in csvSchema:
        if schema.required and possibleFields[FieldCleaner.cleanString(schema.name)] == 0:
            missingHeaders.append(schema.name)

    if len(missingHeaders) > 0 or len(duplicatedHeaders) > 0:
        # Write header errors if any occurred and raise a header_error exception
        with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer:
            extraInfo = {}
            if len(duplicatedHeaders) > 0:
                extraInfo["duplicated_headers"] = ", ".join(duplicatedHeaders)
                for header in duplicatedHeaders:
                    writer.write(["Duplicated header", header])
            if len(missingHeaders) > 0:
                extraInfo["missing_headers"] = ", ".join(missingHeaders)
                for header in missingHeaders:
                    writer.write(["Missing header", header])
            writer.finishBatch()
        raise ResponseException("Errors in header row", StatusCode.CLIENT_ERROR, ValueError,
                                ValidationError.headerError, **extraInfo)
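# Illustrative sketch (standalone): building the position -> field-name map used above, where
# unexpected headers map to None, duplicates are collected, and required headers that never
# appear are reported as missing. The header names here are hypothetical.
import csv

def map_header(header_line, expected_fields):
    seen = {field: 0 for field in expected_fields}
    header_dict, duplicated = {}, []
    for position, cell in enumerate(next(csv.reader([header_line], dialect='excel'))):
        header_value = cell.strip().lower()
        if header_value not in seen:
            header_dict[position] = None       # unexpected header: skip this column when reading
        elif seen[header_value]:
            duplicated.append(header_value)    # already mapped once
        else:
            header_dict[position] = header_value
            seen[header_value] = 1
    missing = [field for field in expected_fields if not seen[field]]
    return header_dict, duplicated, missing

print(map_header('AgencyIdentifier,Unknown,agencyidentifier',
                 ['agencyidentifier', 'availabilitytypecode']))
# -> ({0: 'agencyidentifier', 1: None}, ['agencyidentifier'], ['availabilitytypecode'])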