def evaluateRule(cls, data, rule, datatype, interfaces, record):
    """ Checks data against specified rule

    Args:
        data: Data to be checked
        rule: Rule object to test against
        datatype: Type to convert data into
        interfaces: InterfaceHolder object to the databases
        record: Some rule types require the entire record as a dict

    Returns:
        True if rule passed, False otherwise
    """
    if data is None:
        # Treat blank as an empty string
        data = ""
    value = rule.rule_text_1
    currentRuleType = rule.rule_type.name
    # Call specific rule function
    ruleFunction = "_".join(["rule", str(currentRuleType).lower()])
    ruleFunction = FieldCleaner.cleanString(ruleFunction)
    try:
        ruleMethod = getattr(cls, str(ruleFunction))
        return ruleMethod(data, value, rule, datatype, interfaces, record)
    except AttributeError as e:
        # Unrecognized rule type
        raise ResponseException(str(e), StatusCode.INTERNAL_ERROR, ValueError)
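
# The dispatch in evaluateRule is purely name-based: a rule whose rule_type.name
# is, say, "EXISTS_IN_TABLE" resolves to a classmethod named rule_exists_in_table
# on the validator class. A minimal, self-contained sketch of that getattr lookup
# follows; the class and rule method here are hypothetical and only illustrate
# the pattern, not an actual rule from the codebase.
class _RuleDispatchSketch:
    @classmethod
    def rule_length(cls, data, value, rule, datatype, interfaces, record):
        # Hypothetical rule: data must be no longer than the width in rule_text_1
        return len(data) <= int(value)

    @classmethod
    def evaluate(cls, data, rule_type_name, value):
        # Same lookup scheme as evaluateRule: "rule_" + lowercased type name
        ruleFunction = "_".join(["rule", str(rule_type_name).lower()])
        return getattr(cls, ruleFunction)(data, value, None, None, None, None)

# _RuleDispatchSketch.evaluate("1234", "LENGTH", "5") returns True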
def insert(self, record, fileType):
    """ Write single record to this table

    Args:
        record: dict with column names as keys
        fileType: Type of file record is in

    Returns:
        True if successful
    """
    # Need to translate the provided record to use column IDs instead of field names for keys
    idRecord = {}
    for key in record:
        idRecord[str(self.interfaces.validationDb.getColumnId(key, fileType))] = record[key]

    if self.BATCH_INSERT:
        if self.INSERT_BY_ORM:
            raise NotImplementedError("Have not implemented ORM method for batch insert")
        else:
            self.batch.append(idRecord)
            if len(self.batch) > self.BATCH_SIZE:
                # Time to write the batch
                self.interface.connection.execute(self.orm.__table__.insert(), self.batch)
                # Reset batch
                self.batch = []
            return True
    else:
        if self.INSERT_BY_ORM:
            try:
                recordOrm = self.orm()
            except:
                # createTable was not called
                raise Exception("Must call createTable before writing")

            attributes = self.getPublicMembers(recordOrm)

            # For each field, add value to ORM object
            for key in idRecord:
                attr = FieldCleaner.cleanString(key)  # key.replace(" ","_")
                setattr(recordOrm, attr, idRecord[key])

            self.interface.session.add(recordOrm)
            self.interface.session.commit()
            return True
        else:
            raise ValueError("Must do either batch or use ORM, cannot set both to False")
def insert(self, record, fileType):
    """ Write single record to this table

    Args:
        record: dict with column names as keys
        fileType: Type of file record is in

    Returns:
        True if successful
    """
    # Need to translate the provided record to use column IDs instead of field names for keys
    idRecord = {}
    for key in record:
        if key == "row":
            # Row number is passed through unchanged rather than mapped to a column ID
            idRecord[key] = record[key]
        else:
            idRecord[str(self.interfaces.validationDb.getColumnId(key, fileType))] = record[key]

    if self.BATCH_INSERT:
        if self.INSERT_BY_ORM:
            raise NotImplementedError("Have not implemented ORM method for batch insert")
        else:
            self.batch.append(idRecord)
            if len(self.batch) > self.BATCH_SIZE:
                # Time to write the batch
                self.interface.connection.execute(self.orm.__table__.insert(), self.batch)
                # Reset batch
                self.batch = []
            return True
    else:
        if self.INSERT_BY_ORM:
            try:
                recordOrm = self.orm()
            except:
                # createTable was not called
                raise Exception("Must call createTable before writing")

            attributes = self.getPublicMembers(recordOrm)

            # For each field, add value to ORM object
            for key in idRecord:
                attr = FieldCleaner.cleanString(key)  # key.replace(" ","_")
                setattr(recordOrm, attr, idRecord[key])

            self.interface.session.add(recordOrm)
            self.interface.session.commit()
            return True
        else:
            raise ValueError("Must do either batch or use ORM, cannot set both to False")
def rule_exists_in_table(cls, data, value, rule, datatype, interfaces, record):
    """ Check that field value exists in specified table; rule_text_1 has table and column to check against,
    rule_text_2 is length to pad to """
    ruleTextOne = str(rule.rule_text_1).split(",")
    if len(ruleTextOne) != 2:
        # Bad rule definition
        raise ResponseException("exists_in_table rule incorrectly defined, must have both table and field in rule_text_one",
                                StatusCode.INTERNAL_ERROR, ValueError)
    # Not putting model name through FieldCleaner because model names will have uppercase
    model = getattr(domainModels, str(ruleTextOne[0]).strip())
    field = FieldCleaner.cleanString(ruleTextOne[1])
    ruleTextTwo = FieldCleaner.cleanString(rule.rule_text_2)
    if len(ruleTextTwo) == 0:
        # Skip padding
        paddedData = FieldCleaner.cleanString(data)
    else:
        # Pad data to correct length
        try:
            padLength = int(ruleTextTwo)
        except ValueError as e:
            # Need an integer in rule_text_two
            raise ResponseException("Need an integer width in rule_text_two for exists_in_table rules",
                                    StatusCode.INTERNAL_ERROR, ValueError)
        paddedData = FieldCleaner.cleanString(data).zfill(padLength)

    # Build query for model and field specified
    query = interfaces.validationDb.session.query(model).filter(getattr(model, field) == paddedData)
    try:
        # Check that value exists in table, should be unique
        interfaces.validationDb.runUniqueQuery(query, "Data not found in table",
                                               "Conflicting entries found for this data")
        # If unique result found, rule passed
        return True
    except ResponseException as e:
        # If exception is no result found, rule failed
        if type(e.wrappedException) == type(NoResultFound()):
            return False
        else:
            # This is an unexpected exception, so re-raise it
            raise
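
# For reference, an exists_in_table rule is configured entirely through its two
# rule_text fields: rule_text_1 names the domain model and column (comma-separated),
# and rule_text_2 optionally gives a zero-pad width. The values below are
# hypothetical, shown only to illustrate the expected format and the padding it
# implies, not an actual rule definition.
_rule_text_1 = "CGAC, cgac_code"          # model name, field name
_rule_text_2 = "3"                        # pad data with leading zeros to width 3
_padded = "9".zfill(int(_rule_text_2))    # "009" is what gets looked up in CGAC.cgac_code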
def getFieldsByFile(self, fileType):
    """ Returns a dict of valid field names that can appear in this type of file

    Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

    Returns:
        dict with field names as keys and FileColumn ORM objects as values
    """
    returnDict = {}
    fileId = self.getFileId(fileType)
    if fileId is None:
        raise ValueError("File type does not exist")
    queryResult = self.session.query(FileColumn).options(subqueryload("field_type")).filter(FileColumn.file_id == fileId).all()
    for column in queryResult:
        returnDict[FieldCleaner.cleanString(column.name)] = column
    return returnDict
def getFieldsByFileList(self, fileType):
    """ Returns a list of valid fields that can appear in this type of file

    Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

    Returns:
        list of FileColumn objects with standardized (cleaned) names
    """
    fileId = self.getFileId(fileType)
    if fileId is None:
        raise ValueError("File type does not exist")
    queryResult = self.session.query(FileColumn).filter(FileColumn.file_id == fileId).all()
    for result in queryResult:
        result.name = FieldCleaner.cleanString(result.name)  # Standardize field names
    return queryResult
def rule_check_prefix(cls, data, value, rule, datatype, interfaces, record):
    """ Check that 1-digit prefix is consistent with reimbursable flag """
    dataString = FieldCleaner.cleanString(data)

    # Load target field and dict to compare with
    targetField = FieldCleaner.cleanName(rule.rule_text_1)
    prefixMap = json.loads(str(rule.rule_text_2))

    # Check that character and value are consistent with dict in rule_text_2
    if dataString[0] not in prefixMap:
        # Unknown prefix, this is a failure
        return False
    source = prefixMap[dataString[0]]
    target = record[targetField]
    source = source.lower() if source is not None else source
    target = target.lower() if target is not None else target
    if source == target:
        # Matches the value in target field, rule passes
        return True
    else:
        return False
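
# rule_check_prefix reads its prefix-to-flag mapping from rule_text_2 as JSON and
# the name of the field to compare against from rule_text_1. A minimal sketch of
# the comparison, using hypothetical values rather than a real rule definition:
import json

_prefix_map = json.loads('{"1": "d", "2": "r"}')   # hypothetical rule_text_2
_data, _target_value = "1234", "D"                 # field value and target-field value
_expected = _prefix_map.get(_data[0])
_passes = _expected is not None and _expected.lower() == _target_value.lower()   # True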
def createTable(self, fileType, filename, jobId, tableName=None):
    """ Create staging table for new file

    Args:
        fileType -- type of file to create a table for (e.g. Award, AwardFinancial)

    Returns:
        tableName if created, exception otherwise
    """
    if tableName is None:
        tableName = self.interface.getTableName(jobId)

    self.name = tableName

    if self.interface.tableExists(tableName):
        # Old table still present, drop table and replace
        self.interface.dropTable(tableName)

    # Alternate way of naming tables
    #tableName = "data" + tableName.replace("/","").replace("\\","").replace(".","")

    # Write tableName to related job in job tracker
    self.interfaces.jobDb.addStagingTable(jobId, tableName)
    fields = self.interfaces.validationDb.getFieldsByFile(fileType)

    """ Might not need sequence for ORM
    # Create sequence to be used for primary key
    sequenceName = tableName + "Serial"
    sequenceStatement = "CREATE SEQUENCE " + sequenceName + " START 1"
    try:
        self.runStatement(sequenceStatement)
    except ProgrammingError:
        # Sequence already exists
        pass
    """

    primaryAssigned = False
    # Create empty dict for field names and values
    classFieldDict = {"__tablename__": tableName}
    # Create dict to hold record for field names
    fieldNameMap = {}

    # Add each column
    for key in fields:
        # Build column statement for this key
        # Create cleaned version of key
        newKey = str(fields[key].file_column_id)
        # Get correct type name
        fieldTypeName = FieldCleaner.cleanString(fields[key].field_type.name)
        if fieldTypeName == "string":
            fieldTypeName = Text
        elif fieldTypeName == "int":
            fieldTypeName = Integer
        elif fieldTypeName == "decimal":
            fieldTypeName = Numeric
        elif fieldTypeName == "boolean":
            fieldTypeName = Boolean
        elif fieldTypeName == "long":
            fieldTypeName = BigInteger
        else:
            raise ValueError("Bad field type")

        # Get extra parameters (primary key or not null)
        if FieldCleaner.cleanString(fields[key].field_type.description) == "primary_key":
            classFieldDict[newKey] = Column(fieldTypeName, primary_key=True)
            primaryAssigned = True
        elif fields[key].required:
            classFieldDict[newKey] = Column(fieldTypeName, nullable=False)
        else:
            classFieldDict[newKey] = Column(fieldTypeName)

        # First record will hold field names
        fieldNameMap[str(newKey)] = str(key)

    # Add column for row number
    classFieldDict["row"] = Column(Integer, nullable=False)

    if not primaryAssigned:
        # If no primary key assigned, add one based on table name
        classFieldDict["".join([tableName, "id"])] = Column(Integer, primary_key=True)

    # Create ORM class based on dict
    self.orm = type(tableName, (declarative_base(),), classFieldDict)
    self.jobId = jobId

    # Create table
    self.orm.__table__.create(self.interface.engine)

    # Add field name map to table
    self.interface.addFieldNameMap(tableName, fieldNameMap)
def loadCsv(cls, filename, model, interface, fieldMap, fieldOptions):
    """ Loads a table based on a csv

    Args:
        filename: CSV to load
        model: ORM object for table to be loaded
        interface: interface to DB table is in
        fieldMap: dict that maps columns of the csv to attributes of the ORM object
        fieldOptions: dict with keys of attribute names; each value is a dict with options for that attribute.
            Current options are "pad_to_length", which pads the field with leading zeros up to the specified
            length, and "skip_duplicates", which ignores subsequent rows that repeat values.
    """
    # Delete all records currently in table
    interface.session.query(model).delete()
    interface.session.commit()

    valuePresent = {}

    # Open csv
    with open(filename, 'rU') as csvfile:
        # Read header
        header = csvfile.readline()
        # Split header into fieldnames
        rawFieldNames = header.split(",")
        fieldNames = []
        # Clean field names
        for field in rawFieldNames:
            fieldNames.append(FieldCleaner.cleanString(field))

        # Map fieldnames to attribute names
        attributeNames = []
        for field in fieldNames:
            if field in fieldMap:
                attributeNames.append(fieldMap[field])
                if fieldMap[field] in fieldOptions and "skip_duplicates" in fieldOptions[fieldMap[field]]:
                    # Create empty dict for this field
                    valuePresent[fieldMap[field]] = {}
            else:
                raise KeyError("".join(["Found unexpected field ", str(field)]))

        # Check that all fields are present
        for field in fieldMap:
            if field not in fieldNames:
                raise ValueError("".join([str(field), " is required for loading table ", str(type(model))]))

        # Open DictReader with attribute names
        reader = csv.DictReader(csvfile, fieldnames=attributeNames)

        # For each row, create instance of model and add it
        for row in reader:
            skipInsert = False
            for field in fieldOptions:
                # For each field with options present, modify according to those options
                options = fieldOptions[field]
                if "pad_to_length" in options:
                    padLength = options["pad_to_length"]
                    row[field] = Validator.padToLength(row[field], padLength)
                if "skip_duplicates" in options:
                    if len(row[field].strip()) == 0 or row[field] in valuePresent[field]:
                        # Value not provided or already exists, skip it
                        skipInsert = True
                    else:
                        # Insert new value
                        valuePresent[field][row[field]] = True
            record = model(**row)
            if not skipInsert:
                try:
                    interface.session.merge(record)
                except IntegrityError as e:
                    # Hit a duplicate value that violates index, skip this one
                    print("".join(["Warning: Skipping this row: ", str(row)]))
                    print("".join(["Due to error: ", str(e)]))
                    interface.session.rollback()
                    continue
        interface.session.commit()
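
# A minimal sketch of the arguments loadCsv expects, using a hypothetical agency
# lookup table (the real models, CSV paths, and loader class name live elsewhere
# in the codebase). fieldMap maps cleaned CSV header names to ORM attribute names;
# fieldOptions attaches per-attribute behavior such as zero-padding and duplicate
# skipping.
exampleFieldMap = {"cgac": "cgac_code", "agencyname": "agency_name"}
exampleFieldOptions = {"cgac_code": {"pad_to_length": 3, "skip_duplicates": True}}
# LoaderClass.loadCsv("cgac.csv", CGAC, interfaces.validationDb, exampleFieldMap, exampleFieldOptions)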
def loadSql(cls, filename):
    """Load SQL-based validation rules to db."""
    with createApp().app_context():
        sess = GlobalDB.db().session

        # Delete all records currently in table
        sess.query(RuleSql).delete()

        # Create rule severity and file type lookups
        severity = sess.query(RuleSeverity)
        severityDict = {s.name: s.rule_severity_id for s in severity.all()}
        ft = sess.query(FileTypeValidation)
        fileTypeDict = {f.name: f.file_id for f in ft.all()}

        filename = os.path.join(cls.sql_rules_path, filename)

        # open csv
        with open(filename, 'rU') as csvfile:
            # read header
            header = csvfile.readline()
            # split header into field names
            rawFieldNames = header.split(',')
            fieldNames = []
            # clean field names
            for field in rawFieldNames:
                fieldNames.append(FieldCleaner.cleanString(field))

            unknownFields = set(fieldNames) - set(cls.headers)
            if len(unknownFields) != 0:
                raise KeyError("".join(["Found unexpected fields: ", str(list(unknownFields))]))

            missingFields = set(cls.headers) - set(fieldNames)
            if len(missingFields) != 0:
                raise ValueError("".join(["Missing required fields: ", str(list(missingFields))]))

            reader = csv.DictReader(csvfile, fieldnames=fieldNames)
            for row in reader:
                sql = cls.readSqlStr(row['query_name'])

                rule_sql = RuleSql(rule_sql=sql, rule_label=row['rule_label'],
                                   rule_description=row['rule_description'],
                                   rule_error_message=row['rule_error_message'],
                                   query_name=row['query_name'])

                # look up file type id
                try:
                    fileId = fileTypeDict[row["file_type"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["file_type"], row["rule_label"]))
                try:
                    if row["target_file"].strip() == "":
                        # No target file provided
                        targetFileId = None
                    else:
                        targetFileId = fileTypeDict[row["target_file"]]
                except Exception as e:
                    raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                        e, row["target_file"], row["rule_label"]))

                # set cross file flag
                if FieldCleaner.cleanString(row["rule_cross_file_flag"]) in ['true', 't', 'y', 'yes']:
                    cross_file_flag = True
                else:
                    cross_file_flag = False

                rule_sql.rule_severity_id = severityDict[row['severity_name']]
                rule_sql.file_id = fileId
                rule_sql.target_file_id = targetFileId
                rule_sql.rule_cross_file_flag = cross_file_flag

                sess.merge(rule_sql)
        sess.commit()
def openFile(self, region, bucket, filename, csvSchema, bucketName, errorFilename):
    """ Opens file and prepares to read each record, mapping entries to specified column names

    Args:
        region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
        bucket: the S3 Bucket (not used if instantiated as CsvLocalReader)
        filename: The file path for the CSV file in S3
        csvSchema: list of FileColumn objects for this file type
        bucketName: bucket to send errors to
        errorFilename: filename for error report
    """
    possibleFields = {}
    currentFields = {}
    for schema in csvSchema:
        possibleFields[FieldCleaner.cleanString(schema.name)] = 0

    self.filename = filename
    self.unprocessed = ''
    self.extraLine = False
    self.lines = []
    self.headerDictionary = {}
    self.packetCounter = 0
    current = 0
    self.isFinished = False
    self.columnCount = 0
    line = self._getLine()

    # make sure we have not finished reading the file
    if self.isFinished:
        # Write header error for no header row
        with self.getWriter(bucketName, errorFilename, ["Error Type"], self.isLocal) as writer:
            writer.write(["No header row"])
            writer.finishBatch()
        raise ResponseException("CSV file must have a header", StatusCode.CLIENT_ERROR, ValueError,
                                ValidationError.singleRow)

    duplicatedHeaders = []
    # create the header

    # check delimiters in header row
    pipeCount = line.count("|")
    commaCount = line.count(",")
    if pipeCount != 0 and commaCount != 0:
        # Write header error for mixed delimiter use
        with self.getWriter(bucketName, errorFilename, ["Error Type"], self.isLocal) as writer:
            writer.write(["Cannot use both ',' and '|' as delimiters. Please choose one."])
            writer.finishBatch()
        raise ResponseException("Error in header row: CSV file must use only '|' or ',' as the delimiter",
                                StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError)

    self.delimiter = "|" if line.count("|") != 0 else ","

    for row in csv.reader([line], dialect='excel', delimiter=self.delimiter):
        for cell in row:
            headerValue = FieldCleaner.cleanString(cell)
            if headerValue not in possibleFields:
                # Allow unexpected headers, just mark the header as None so we skip it when reading
                self.headerDictionary[current] = None
                current += 1
            elif possibleFields[headerValue] == 1:
                # Add to duplicated header list
                duplicatedHeaders.append(headerValue)
            else:
                self.headerDictionary[current] = headerValue
                possibleFields[headerValue] = 1
                current += 1

    self.columnCount = current
    # Check that all required fields exist
    missingHeaders = []
    for schema in csvSchema:
        if possibleFields[FieldCleaner.cleanString(schema.name)] == 0:
            missingHeaders.append(schema.name)

    if len(missingHeaders) > 0 or len(duplicatedHeaders) > 0:
        # Write header errors if any occurred and raise a header_error exception
        errorString = ""
        with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer:
            extraInfo = {}
            if len(duplicatedHeaders) > 0:
                errorString = "".join([errorString, "Duplicated: ", ", ".join(duplicatedHeaders)])
                extraInfo["duplicated_headers"] = ", ".join(duplicatedHeaders)
                for header in duplicatedHeaders:
                    writer.write(["Duplicated header", header])
            if len(missingHeaders) > 0:
                if len(duplicatedHeaders):
                    # Separate missing and duplicated headers if both are present
                    errorString += "| "
                errorString = "".join([errorString, "Missing: ", ", ".join(missingHeaders)])
                extraInfo["missing_headers"] = ", ".join(missingHeaders)
                for header in missingHeaders:
                    writer.write(["Missing header", header])
            writer.finishBatch()
        raise ResponseException("Errors in header row: " + str(errorString),
                                StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError, **extraInfo)
def createTable(self, fileType, filename, jobId, tableName=None):
    """ Create staging table for new file

    Args:
        fileType -- type of file to create a table for (e.g. Award, AwardFinancial)

    Returns:
        tableName if created, exception otherwise
    """
    if tableName is None:
        tableName = self.interface.getTableName(jobId)

    self.name = tableName

    if self.interface.tableExists(tableName):
        # Old table still present, drop table and replace
        self.interface.dropTable(tableName)

    # Alternate way of naming tables
    #tableName = "data" + tableName.replace("/","").replace("\\","").replace(".","")

    # Write tableName to related job in job tracker
    self.interfaces.jobDb.addStagingTable(jobId, tableName)
    fields = self.interfaces.validationDb.getFieldsByFile(fileType)

    """ Might not need sequence for ORM
    # Create sequence to be used for primary key
    sequenceName = tableName + "Serial"
    sequenceStatement = "CREATE SEQUENCE " + sequenceName + " START 1"
    try:
        self.runStatement(sequenceStatement)
    except ProgrammingError:
        # Sequence already exists
        pass
    """

    primaryAssigned = False
    # Create empty dict for field names and values
    classFieldDict = {"__tablename__": tableName}

    # Add each column
    for key in fields:
        # Build column statement for this key
        # Create cleaned version of key
        newKey = str(fields[key].file_column_id)
        # Get correct type name
        fieldTypeName = FieldCleaner.cleanString(fields[key].field_type.name)
        if fieldTypeName == "string":
            fieldTypeName = Text
        elif fieldTypeName == "int":
            fieldTypeName = Integer
        elif fieldTypeName == "decimal":
            fieldTypeName = Numeric
        elif fieldTypeName == "boolean":
            fieldTypeName = Boolean
        elif fieldTypeName == "long":
            fieldTypeName = BigInteger
        else:
            raise ValueError("Bad field type")

        # Get extra parameters (primary key or not null)
        if FieldCleaner.cleanString(fields[key].field_type.description) == "primary_key":
            classFieldDict[newKey] = Column(fieldTypeName, primary_key=True)
            primaryAssigned = True
        elif fields[key].required:
            classFieldDict[newKey] = Column(fieldTypeName, nullable=False)
        else:
            classFieldDict[newKey] = Column(fieldTypeName)

    if not primaryAssigned:
        # If no primary key assigned, add one based on table name
        classFieldDict["".join([tableName, "id"])] = Column(Integer, primary_key=True)

    # Create ORM class based on dict
    self.orm = type(tableName, (declarative_base(),), classFieldDict)
    self.jobId = jobId

    # Create table
    self.orm.__table__.create(self.interface.engine)
def open_file(self, region, bucket, filename, csv_schema, bucket_name, error_filename, long_to_short_dict):
    """ Opens file and prepares to read each record, mapping entries to specified column names

    Args:
        region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
        bucket: the S3 Bucket (not used if instantiated as CsvLocalReader)
        filename: The file path for the CSV file in S3
        csv_schema: list of FileColumn objects for this file type
        bucket_name: bucket to send errors to
        error_filename: filename for error report
        long_to_short_dict: mapping of long to short schema column names
    """
    self.filename = filename
    self.unprocessed = ''
    self.extra_line = False
    self.lines = []
    self.flex_dictionary = {}
    self.header_dictionary = {}
    self.packet_counter = 0
    current = 0
    self.is_finished = False
    self.column_count = 0
    line = self._get_line()

    # make sure we have not finished reading the file
    if self.is_finished:
        # Write header error for no header row
        with self.get_writer(bucket_name, error_filename, ["Error Type"], self.is_local) as writer:
            writer.write(["No header row"])
            writer.finishBatch()
        raise ResponseException("CSV file must have a header", StatusCode.CLIENT_ERROR, ValueError,
                                ValidationError.singleRow)

    duplicated_headers = []
    # create the header

    # check delimiters in header row
    pipe_count = line.count("|")
    comma_count = line.count(",")
    if pipe_count != 0 and comma_count != 0:
        # Write header error for mixed delimiter use
        with self.get_writer(bucket_name, error_filename, ["Error Type"], self.is_local) as writer:
            writer.write(["Cannot use both ',' and '|' as delimiters. Please choose one."])
            writer.finishBatch()
        raise ResponseException("Error in header row: CSV file must use only '|' or ',' as the delimiter",
                                StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError)

    self.delimiter = "|" if line.count("|") != 0 else ","

    # Set the list of possible_fields, using the shorter,
    # machine-readable column names
    possible_fields = {}
    for schema in csv_schema:
        possible_fields[FieldCleaner.cleanString(schema.name_short)] = 0

    for row in csv.reader([line], dialect='excel', delimiter=self.delimiter):
        # check to see if header contains long or short column names
        col_matches = 0
        for value in row:
            if FieldCleaner.cleanString(value) in long_to_short_dict:
                col_matches += 1
        # if most of the column headers are in the long format,
        # we'll treat the file as having long headers
        if col_matches > .5 * len(row):
            long_headers = True
        else:
            long_headers = False

        for cell in row:
            submitted_header_value = FieldCleaner.cleanString(cell)
            if long_headers and submitted_header_value in long_to_short_dict:
                header_value = FieldCleaner.cleanString(long_to_short_dict[submitted_header_value])
            elif long_headers:
                header_value = None
            else:
                header_value = submitted_header_value

            if header_value not in possible_fields:
                # Add flex headers to flex list
                if str(submitted_header_value).startswith("flex_"):
                    self.flex_dictionary[current] = submitted_header_value
                else:
                    self.flex_dictionary[current] = None
                # Allow unexpected headers, just mark the header as None so we skip it when reading
                self.header_dictionary[current] = None
                current += 1
            elif possible_fields[header_value] == 1:
                # Add header value (as submitted) to duplicated header list
                duplicated_headers.append(submitted_header_value)
            else:
                self.header_dictionary[current] = header_value
                possible_fields[header_value] = 1
                current += 1

    self.column_count = current
    # Check that all required fields exist
    missing_headers = []
    for schema in csv_schema:
        if possible_fields[FieldCleaner.cleanString(schema.name_short)] == 0:
            # return long colname for error reporting
            missing_headers.append(schema.name)

    if len(missing_headers) > 0 or len(duplicated_headers) > 0:
        # Write header errors if any occurred and raise a header_error exception
        error_string = ""
        with self.get_writer(bucket_name, error_filename, self.header_report_headers, self.is_local) as writer:
            extra_info = {}
            if len(duplicated_headers) > 0:
                error_string = "".join([error_string, "Duplicated: ", ", ".join(duplicated_headers)])
                extra_info["duplicated_headers"] = ", ".join(duplicated_headers)
                for header in duplicated_headers:
                    writer.write(["Duplicated header", header])
            if len(missing_headers) > 0:
                if len(duplicated_headers):
                    # Separate missing and duplicated headers if both are present
                    error_string += "| "
                error_string = "".join([error_string, "Missing: ", ", ".join(missing_headers)])
                extra_info["missing_headers"] = ", ".join(missing_headers)
                for header in missing_headers:
                    writer.write(["Missing header", header])
            writer.finishBatch()
        raise ResponseException("Errors in header row: " + str(error_string),
                                StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError, **extra_info)

    return long_headers
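
# A minimal sketch of the long-vs-short header heuristic used above, with a
# hypothetical long_to_short_dict entry (the real mapping is built from the
# schema). A file is treated as using long headers only when more than half of
# its header cells appear in the mapping.
long_to_short = {"legalentityaddressline1": "legal_entity_address_line1"}
header_row = ["legalentityaddressline1", "flex_mycolumn"]
col_matches = sum(1 for value in header_row if value in long_to_short)
long_headers = col_matches > .5 * len(header_row)   # False: only 1 of 2 cells match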
def openFile(self, region, bucket, filename, csvSchema, bucketName, errorFilename):
    """ Opens file and prepares to read each record, mapping entries to specified column names

    Args:
        region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
        bucket: the S3 Bucket (not used if instantiated as CsvLocalReader)
        filename: The file path for the CSV file in S3
        csvSchema: list of FileColumn objects for this file type
        bucketName: bucket to send errors to
        errorFilename: filename for error report
    """
    possibleFields = {}
    currentFields = {}
    for schema in csvSchema:
        possibleFields[FieldCleaner.cleanString(schema.name)] = 0

    self.filename = filename
    self.unprocessed = ''
    self.extraLine = False
    self.lines = []
    self.headerDictionary = {}
    self.packetCounter = 0
    current = 0
    self.isFinished = False
    self.columnCount = 0
    line = self._getLine()

    # make sure we have not finished reading the file
    if self.isFinished:
        raise ResponseException("CSV file must have a header", StatusCode.CLIENT_ERROR, ValueError,
                                ValidationError.singleRow)

    duplicatedHeaders = []
    # create the header
    for row in csv.reader([line], dialect='excel'):
        for cell in row:
            headerValue = FieldCleaner.cleanString(cell)
            if headerValue not in possibleFields:
                # Allow unexpected headers, just mark the header as None so we skip it when reading
                self.headerDictionary[current] = None
                current += 1
            elif possibleFields[headerValue] == 1:
                # Add to duplicated header list
                duplicatedHeaders.append(headerValue)
            else:
                self.headerDictionary[current] = headerValue
                possibleFields[headerValue] = 1
                current += 1

    self.columnCount = current
    # Check that all required fields exist
    missingHeaders = []
    for schema in csvSchema:
        if schema.required and possibleFields[FieldCleaner.cleanString(schema.name)] == 0:
            missingHeaders.append(schema.name)

    if len(missingHeaders) > 0 or len(duplicatedHeaders) > 0:
        # Write header errors if any occurred and raise a header_error exception
        with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer:
            extraInfo = {}
            if len(duplicatedHeaders) > 0:
                extraInfo["duplicated_headers"] = ", ".join(duplicatedHeaders)
                for header in duplicatedHeaders:
                    writer.write(["Duplicated header", header])
            if len(missingHeaders) > 0:
                extraInfo["missing_headers"] = ", ".join(missingHeaders)
                for header in missingHeaders:
                    writer.write(["Missing header", header])
            writer.finishBatch()
        raise ResponseException("Errors in header row", StatusCode.CLIENT_ERROR, ValueError,
                                ValidationError.headerError, **extraInfo)