def customValidator(self: object, class_path: str):
    """
    Dynamically load and run a custom validator class.

    :param class_path: dotted path in the format
        '<root folder>.<module filename>.<ClassName>'. The class must inherit
        AbstractDUQValidator and implement the validate() method.
    :raises ValidationError: if class_path is empty, the module/class cannot
        be loaded, or the class does not inherit AbstractDUQValidator.
    """
    if class_path is None or len(class_path) == 0:
        raise ValidationError(
            "LANG Exception: class_path has not been set", None)

    class_data = class_path.split(".")
    module_path = ".".join(class_data[:-1])
    class_str = class_data[-1]

    try:
        module = importlib.import_module(module_path)
        # Finally, we retrieve the Class.
        # AttributeError is caught too: a missing class name inside a valid
        # module previously escaped as a raw AttributeError.
        custom_validator = getattr(module, class_str)
    except (ImportError, AttributeError) as e:
        # Chain the original exception so the root cause is not lost.
        raise ValidationError(
            "Unable to load: " + class_str + " from module: " + module_path,
            None) from e

    # Bug fix: the original concatenated the bound method self.customValidator
    # into the message (TypeError); report the supplied class_path instead.
    if not issubclass(custom_validator, AbstractDUQValidator):
        raise ValidationError(
            "The custom validator '" + class_path +
            "' must inherit AbstractDUQValidator.", None)

    obj = custom_validator(self.dataset, self.metadata)
    obj.validate()
    self.validation_errors.extend(obj.validation_errors)
    del obj
def __init__(self: object, dataset: dict, meta: dict):
    """
    Capture the resultset and its metadata, starting with empty
    validation-error and data-profile lists.

    :param dataset: the resultset to validate (column name -> values).
    :param meta: the metadata definitions driving validation.
    :raises ValidationError: if either argument is None.
    """
    # Validate both required inputs up front, dataset first.
    for supplied, label in ((dataset, "DataSet"), (meta, "metadata")):
        if supplied is None:
            raise ValidationError(
                "LANG Exception: " + label + " has not been set", None)

    self.metadata = meta
    self.dataset = dataset
    self.validation_errors = []
    self.data_profile = []
def profileData(self: object, meta_attribute_definition: dict, colData: dict, key: str):
    """
    Profile one column of data against its metadata definition and append
    the resulting profile (as a dict) to self.data_profile.

    :raises ValidationError: if colData or meta_attribute_definition is None.
    """
    if colData is None:
        raise ValidationError("LANG Exception: Coldata has not been set", None)
    if meta_attribute_definition is None:
        raise ValidationError(
            "LANG Exception: meta_attribute_definition has not been set", None)

    profiler = DataProfile()
    profiler.profileData(meta_attribute_definition, colData, key)
    # Positions are 1-based and assigned in append order.
    profiler.setPosition(len(self.data_profile) + 1)
    self.data_profile.append(profiler.to_dict())
def rowCount(dataset: dict):
    """
    Return the count of rows in the resultset.

    :raises ValidationError: if the dataset is None.
    """
    if dataset is not None:
        return len(dataset)
    raise ValidationError("LANG Exception: DataSet has not been set", None)
def getColValuesAsDict(dataset: dict, *argv) -> dict:
    """
    Accept an arbitrary set of column names as args and return all of their
    values in a single dictionary keyed by column name.

    :raises ValidationError: if dataset or argv is None.
    """
    if dataset is None:
        raise ValidationError("LANG Exception: DataSet has not been set", None)
    if argv is None:
        raise ValidationError("LANG Exception: argv has not been set", None)

    # One lookup per requested column, preserving argument order.
    return {column: SQLTools.getColValues(dataset, column) for column in argv}
def getColValues(dataset: dict, col: str) -> list:
    """
    Given a resultset and a column name, return all values for that column
    as a list.

    :raises ValidationError: if the dataset is None.
    """
    if dataset is not None:
        return dataset[col]
    raise ValidationError("LANG Exception: DataSet has not been set", None)
def addDataQualityError(self: object, data_quality_error: DataQualityError):
    """
    Record a data-quality error by appending its dict form to the
    running validation_errors list.

    :raises ValidationError: if data_quality_error is None.
    """
    if data_quality_error is not None:
        self.validation_errors.append(data_quality_error.to_dict())
        return
    raise ValidationError(
        "LANG Exception: DataQualityError has not been set", None)
def evaluateExpression(self, meta_attribute_definition: dict, meta_attribute_key: str):
    """
    Evaluate an optional custom "Expression" rule from the metadata against
    every row of the dataset, recording a BUSINESSRULECOMPLIANCE error for
    each row where the expression errors or evaluates to False.

    :param meta_attribute_definition: metadata for one attribute; only acted
        on if it contains an "Expression" entry.
    :param meta_attribute_key: the attribute (column) name owning the rule.
    :raises ValidationError: if a column referenced by the expression is not
        present in the resultset.
    """
    # evaluate any custom expressions
    if (MetaUtils.exists(meta_attribute_definition, "Expression")):
        expr = meta_attribute_definition["Expression"]
        # %1 is a placeholder for whatever the column name is owning the expression (it's just a shortcut)
        expr = expr.replace("%1", "[" + meta_attribute_key + "]")
        exp = ExpressionBuilder()
        fields = exp.parseExpr(expr)
        colData = dict()
        # grab all of the columns that we need and store in a local dict
        for field in fields:
            # grab the column data out of the resultset
            values = SQLTools.getCol(self.dataset, field)
            # if the column couldn't be found then we have a configuration issue so raise an exception
            if (values is None):
                raise ValidationError(
                    "Error evaluating expression: '" + expr +
                    "'. Unable to find column '" + field +
                    "' in the resultset", None)
            colData.update(values)
        # convert the seperate columns into an array of name,value pairs
        # (one dict per row; relies on dict insertion order lining up with
        # zip(*colData.values()) — presumably each column list has equal
        # length; TODO confirm against callers)
        pairs = [dict(zip(colData, col)) for col in zip(*colData.values())]
        for pair in pairs:
            result = None
            # substitute the row's values into the expression text
            ev = exp.merge(expr, pair)
            try:
                # SECURITY NOTE(review): eval() on a metadata-supplied string
                # executes arbitrary Python — safe only if metadata files are
                # fully trusted; consider a restricted evaluator.
                result = eval(ev)
            except Exception as e:
                # an expression that blows up is itself a rule-compliance error
                self.addDataQualityError(
                    DataQualityError(meta_attribute_key,
                                     error_dimension=DataQualityDimension.
                                     BUSINESSRULECOMPLIANCE.value,
                                     description="Error: Expression '" + ev +
                                     "' returned an error '" + str(e) + "'"))
                result = None
            # a successful evaluation that yields False means the rule failed
            if ((not result is None) and (result == False)):
                self.addDataQualityError(
                    DataQualityError(meta_attribute_key,
                                     error_dimension=DataQualityDimension.
                                     BUSINESSRULECOMPLIANCE.value,
                                     description="Error: Expression '" + ev +
                                     "' returned FALSE"))
def __repr__(self):
    """
    Render the dataset as a PrettyTable string, one table row per
    resultset row.

    :raises ValidationError: if the dataset has not been set.
    """
    if self.dataset is None:
        raise ValidationError("DataSet is NULL.", None)

    pt = PrettyTable()
    # grab an arbritary row so we can lift the keys for the header
    row = next(iter(self.dataset.values()))
    pt.field_names = row.keys()

    for key in self.dataset:
        # Bug fix: the original indexed self.dataset[field], but 'field' is
        # never defined in this scope (NameError on every call). Index by
        # the loop key and add the row's values as discrete elements.
        pt.add_row(list(self.dataset[key].values()))

    return str(pt)
def xlsFileToDict(fileName: str, sheet_name: str = None) -> dict:
    """
    Convert an Excel spreadsheet into a dictionary of column-name -> list of
    formatted cell values. Headers are assumed to be in row 1; data starts at
    row 2. None cells are recorded as "(Null)".

    :param fileName: path of the workbook to load.
    :param sheet_name: optional sheet to read; defaults to the active sheet.
    :raises ValidationError: if the named sheet does not exist.
    """
    workbook = load_workbook(filename=fileName, data_only=True, read_only=True)

    # Resolve the sheet: explicit name if given, otherwise the active one.
    if sheet_name is not None:
        if sheet_name not in workbook.sheetnames:
            raise ValidationError("Sheet '" + sheet_name + "' not found", None)
        sheet = workbook[sheet_name]
    else:
        sheet = workbook.active

    # Column headers come from row 1 only.
    headers = next(sheet.iter_rows(min_row=1, max_row=1, values_only=True))

    # Pivot row-oriented cells into column-oriented lists, cleaning each
    # cell value on the way through.
    data = {}
    for row in sheet.iter_rows(min_row=2, min_col=1, values_only=True):
        for index, cell in enumerate(row):
            name = (headers[index] if headers[index] is not None
                    else "<Undefined_" + str(index) + ">")
            column = data.setdefault(name, [])
            column.append("(Null)" if cell is None
                          else FileTools.FormatString(str(cell).strip()))

    workbook.close()
    return data
def validate(self: object, customValidator: str = None):
    """
    Validate a resultset against predefined metadata based on the LANG
    rules of data quality.

    :param customValidator: optional dotted class path of an additional
        validator to run after the built-in checks.
    :raises ValidationError: if metadata or dataset has not been set.
    """
    if self.metadata is None:
        raise ValidationError("LANG Exception: meta-data has not been set", None)
    elif self.dataset is None:
        raise ValidationError("LANG Exception: resultset has not been set", None)

    # Change request: find and output the primary key in the error report
    # file if specified.
    primary_key = ""
    primary_key_values = None
    for key, item in self.metadata.items():
        if MetaUtils.isTrue(item, "PrimaryKey"):
            primary_key = key
            primary_key_values = self.dataset[primary_key]
            break

    def _pk_value(row_count):
        # If a primarykey tag has been found then output the value so that
        # the user has a reference to search for the record in the source
        # system. If there is no primary key attribute set, output the row
        # count instead.
        # Bug fix: the Format and Unique loops below previously indexed
        # primary_key_values unguarded, crashing with a TypeError whenever
        # no PrimaryKey attribute was defined.
        if primary_key_values is not None:
            return primary_key_values[row_count]
        return "Row: " + str(row_count + 1)

    # Execute a series of validations against each column of data and its
    # metadata; which validation runs is determined by the metadata entries.
    for meta_attribute_key, meta_attribute_definition in self.metadata.items():
        if meta_attribute_key in self.dataset:
            print("Validating attribute \t'" + meta_attribute_key + "'...", end='\r')
            attribute = self.dataset[meta_attribute_key]

            # Per-value checks driven directly by the metadata definition.
            for row_count in range(len(attribute)):
                value = attribute[row_count]
                primary_key_value = _pk_value(row_count)
                self.checkMandatory(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkSize(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkType(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkEnum(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkStartsWith(meta_attribute_definition, meta_attribute_key, value, primary_key_value)

            # format check (must provide a regex)
            if MetaUtils.exists(meta_attribute_definition, "Format"):
                re.purge()
                # compile once per attribute, not per row
                regex = re.compile(meta_attribute_definition["Format"])
                for row_count in range(len(attribute)):
                    primary_key_value = _pk_value(row_count)
                    value = attribute[row_count]
                    isMatch = (not regex.match(value) is None)
                    if (not isMatch) and (not MetaUtils.isAllowBlank(meta_attribute_definition)):
                        self.addDataQualityError(DataQualityError(
                            meta_attribute_key,
                            error_dimension=DataQualityDimension.FORMATCONSISTENCY.value,
                            description="Error: Value '" + value +
                            "' does not match regex #'" +
                            meta_attribute_definition["Format"] + "'"))

            # unique field check
            if MetaUtils.isTrue(meta_attribute_definition, "Unique"):
                # Count occurrences via a set; a value already seen is a
                # uniqueness violation.
                seen = set()
                for row_count in range(len(attribute)):
                    primary_key_value = _pk_value(row_count)
                    value = attribute[row_count]
                    if value not in seen:
                        seen.add(value)  # only process a value once
                    else:
                        self.addDataQualityError(DataQualityError(
                            meta_attribute_key,
                            error_dimension=DataQualityDimension.UNIQUENESS.value,
                            description="Error: Value '" + value +
                            "' is not UNIQUE. A unique value was expected."))

            self.checkComposite(meta_attribute_definition, meta_attribute_key)

            # expression evaluation is different to processing field-specific
            # validations as it could link in other columns from the resultset
            self.evaluateExpression(meta_attribute_definition, meta_attribute_key)
            print("Validating attribute \t'" + meta_attribute_key + "'...\t\t..Complete.")
        else:
            self.addDataQualityError(DataQualityError(
                meta_attribute_key,
                error_dimension=DataQualityDimension.METADATACOMPLIANCE.value,
                description="Error: Attribute '" + meta_attribute_key +
                "' was not found in the dataset."))

    # only invoke the custom validator if one has been provoded
    if customValidator is not None and len(customValidator) > 0:
        self.customValidator(customValidator)