示例#1
0
    def customValidator(self: object, class_path: str):
        """
        Dynamically load a validator class and run it against this object's data.

        The class is located from a dotted string in the format
        '<root folder>.<module filename>.<ClassName>'. It must inherit
        AbstractDUQValidator; it is instantiated with (self.dataset, self.metadata),
        its validate() method is called, and its validation_errors are appended
        to self.validation_errors.

        Raises:
            ValidationError: if class_path is empty, the module or class cannot
                be loaded, or the loaded object is not an AbstractDUQValidator
                subclass.
        """
        if (class_path is None or len(class_path) == 0):
            raise ValidationError(
                "LANG Exception: class_path has not been set", None)

        # everything before the last dot is the module, the remainder is the class name
        module_path, _, class_str = class_path.rpartition(".")

        try:
            module = importlib.import_module(module_path)

            # Finally, we retrieve the Class
            custom_validator = getattr(module, class_str)
        except (ImportError, AttributeError, ValueError) as e:
            # ImportError: module not found; AttributeError: class missing from
            # the module; ValueError: class_path had no module part at all.
            raise ValidationError(
                "Unable to load: " + class_str + " from module: " +
                module_path, None) from e

        # issubclass() itself raises TypeError for non-classes, so check that first
        if (not isinstance(custom_validator, type)
                or not issubclass(custom_validator, AbstractDUQValidator)):
            raise ValidationError(
                "The custom validator '" + class_path +
                "' must inherit AbstractDUQValidator.", None)

        obj = custom_validator(self.dataset, self.metadata)

        obj.validate()
        self.validation_errors.extend(obj.validation_errors)
示例#2
0
    def __init__(self: object, dataset: dict, meta: dict):
        """
        Store the dataset and its metadata and start with empty result lists.

        Raises:
            ValidationError: if dataset is None, or (checked second) if meta is None.
        """
        # Both inputs are mandatory; the dataset is checked before the metadata.
        required = (
            (dataset, "LANG Exception: DataSet has not been set"),
            (meta, "LANG Exception: metadata has not been set"),
        )
        for supplied, message in required:
            if supplied is None:
                raise ValidationError(message, None)

        self.metadata, self.dataset = meta, dataset

        # accumulators filled in by later validation / profiling calls
        self.validation_errors = []
        self.data_profile = []
示例#3
0
    def profileData(self: object, meta_attribute_definition: dict,
                    colData: dict, key: str):
        """
        Profile one column and append the result to self.data_profile.

        A DataProfile is built from the arguments, given a 1-based position
        (current number of stored profiles plus one) and stored in its
        to_dict() form.

        Raises:
            ValidationError: if colData is None, or (checked second) if
                meta_attribute_definition is None.
        """
        if colData is None:
            raise ValidationError("LANG Exception: Coldata has not been set",
                                  None)
        if meta_attribute_definition is None:
            raise ValidationError(
                "LANG Exception: meta_attribute_definition has not been set",
                None)

        column_profile = DataProfile()
        column_profile.profileData(meta_attribute_definition, colData, key)

        # positions are 1-based and follow insertion order
        next_position = 1 + len(self.data_profile)
        column_profile.setPosition(next_position)

        self.data_profile.append(column_profile.to_dict())
示例#4
0
    def rowCount(dataset: dict):
        """
        Return the number of top-level entries in the resultset (len(dataset)).

        NOTE(review): this equals the row count only if the resultset is keyed
        by row - confirm against the dataset layout used by callers.

        Raises:
            ValidationError: if dataset is None.
        """
        if dataset is not None:
            return len(dataset)

        raise ValidationError("LANG Exception: DataSet has not been set",
                              None)
示例#5
0
    def getColValuesAsDict(dataset: dict, *argv) -> dict:
        """
        Accept an arbitrary set of column names and return their values in a single dictionary.

        Each name in argv becomes a key whose value is whatever
        SQLTools.getColValues(dataset, name) returns for that column. With no
        column names the result is an empty dictionary.

        Raises:
            ValidationError: if dataset is None.
        """
        if (dataset is None):
            raise ValidationError("LANG Exception: DataSet has not been set",
                                  None)

        # *argv is always bound to a tuple (possibly empty) and can never be None,
        # so no separate None check is needed.
        return {arg: SQLTools.getColValues(dataset, arg) for arg in argv}
示例#6
0
    def getColValues(dataset: dict, col: str) -> list:
        """
        Look up a single column in the resultset and return what is stored under it.

        The lookup is a plain dataset[col], so an unknown column raises KeyError.

        Raises:
            ValidationError: if dataset is None.
        """
        if dataset is not None:
            return dataset[col]

        raise ValidationError("LANG Exception: DataSet has not been set",
                              None)
示例#7
0
    def addDataQualityError(self: object,
                            data_quality_error: DataQualityError):
        """
        Record a data quality error.

        The error is stored in self.validation_errors in its to_dict() form.

        Raises:
            ValidationError: if data_quality_error is None.
        """
        if data_quality_error is None:
            raise ValidationError(
                "LANG Exception: DataQualityError has not been set", None)

        error_record = data_quality_error.to_dict()
        self.validation_errors.append(error_record)
示例#8
0
    def evaluateExpression(self, meta_attribute_definition: dict,
                           meta_attribute_key: str):
        """
        Evaluate the attribute's custom "Expression" (if any) for each row of the columns it references.

        Does nothing when the metadata definition has no "Expression" entry.
        Otherwise the expression is parsed for the columns it references, those
        columns are read from self.dataset, and the expression is evaluated
        once per row. A data quality error (BUSINESSRULECOMPLIANCE) is recorded
        for every row where evaluation raises an exception or the result
        compares equal to False.

        Raises:
            ValidationError: if a column referenced by the expression cannot be
                found in the resultset.
        """
        # evaluate any custom expressions
        if (MetaUtils.exists(meta_attribute_definition, "Expression")):
            expr = meta_attribute_definition["Expression"]

            # %1 is a placeholder for whatever the column name is owning the expression (it's just a shortcut)
            expr = expr.replace("%1", "[" + meta_attribute_key + "]")
            exp = ExpressionBuilder()

            fields = exp.parseExpr(expr)
            colData = dict()

            # grab all of the columns that we need and store in a local dict
            for field in fields:

                # grab the column data out of the resultset
                # (presumably returned as a {column name: values} mapping, since it is
                # merged with dict.update below - TODO confirm against SQLTools.getCol)
                values = SQLTools.getCol(self.dataset, field)

                # if the column couldn't be found then we have a configuration issue so raise an exception
                if (values is None):
                    raise ValidationError(
                        "Error evaluating expression: '" + expr +
                        "'. Unable to find column '" + field +
                        "' in the resultset", None)

                colData.update(values)

            # convert the separate columns into one {column name: value} dict per row
            # (zip stops at the shortest column)
            pairs = [dict(zip(colData, col)) for col in zip(*colData.values())]

            for pair in pairs:
                result = None
                ev = exp.merge(expr, pair)

                try:
                    # NOTE(review): eval() executes arbitrary Python and sees this
                    # method's local variables. Only safe if the metadata and the
                    # data merged into the expression are trusted - confirm.
                    result = eval(ev)
                except Exception as e:
                    # an expression that fails to evaluate is reported as a data quality error, not raised
                    self.addDataQualityError(
                        DataQualityError(meta_attribute_key,
                                         error_dimension=DataQualityDimension.
                                         BUSINESSRULECOMPLIANCE.value,
                                         description="Error: Expression '" +
                                         ev + "' returned an error '" +
                                         str(e) + "'"))
                    result = None

                # a None result is ignored; anything that compares equal to False
                # (including 0) is reported as a failed rule
                if ((not result is None) and (result == False)):
                    self.addDataQualityError(
                        DataQualityError(meta_attribute_key,
                                         error_dimension=DataQualityDimension.
                                         BUSINESSRULECOMPLIANCE.value,
                                         description="Error: Expression '" +
                                         ev + "' returned FALSE"))
示例#9
0
    def __repr__(self):
        """
        Render the dataset as a PrettyTable string, one table row per dataset entry.

        The column headings are taken from the keys of the first entry. An
        empty dataset renders as an empty table.

        Raises:
            ValidationError: if self.dataset is None.
        """
        if (self.dataset is None):
            raise ValidationError("DataSet is NULL.", None)

        pt = PrettyTable()

        # grab an arbitrary row so we can lift the keys; None means there are no rows
        first_row = next(iter(self.dataset.values()), None)
        if (first_row is None):
            return str(pt)

        pt.field_names = first_row.keys()
        for row in self.dataset.values():
            # pull the fields out of the resultset row and add as discrete elements to print
            pt.add_row(list(row.values()))

        return str(pt)
示例#10
0
    def xlsFileToDict(fileName: str, sheet_name: str = None) -> dict:
        """ xlsFileToDict:
        Converts an Excel spreadsheet into a dictionary of columns.
        Each key is a column header taken from row 1 and each value is the list
        of that column's cell values from row 2 onwards, in row order.

        Columns without a header are named '<Undefined_N>' (N is the zero-based
        column index). Empty cells are stored as '(Null)'; every other value is
        converted to a stripped string and passed through FileTools.FormatString.
        If sheet_name is not given the workbook's active sheet is used.
        An empty sheet yields an empty dictionary.

        Assumptions: The spreadsheet is well-formatted columns and rows.

        Raises:
            ValidationError: if sheet_name is given but not present in the workbook.
        """
        data = {}

        workbook = load_workbook(filename=fileName,
                                 data_only=True,
                                 read_only=True)

        # always release the workbook, including when the sheet lookup or a
        # cell conversion raises
        try:
            if (sheet_name is None):
                sheet = workbook.active
            elif (sheet_name in workbook.sheetnames):
                sheet = workbook[sheet_name]
            else:
                raise ValidationError("Sheet '" + sheet_name + "' not found",
                                      None)

            # extract the column headers - assumes headers in row 1 only - perhaps
            # this could be configurable :-)
            columns = next(
                sheet.iter_rows(min_row=1, max_row=1, values_only=True), None)

            # no header row means the sheet is empty
            if (columns is None):
                return data

            # convert the data from rows into columns, cleaning up individual
            # cell values along the way
            for row in sheet.iter_rows(min_row=2, min_col=1, values_only=True):
                for col, value in enumerate(row):
                    # a data row may be wider than the header row
                    header = columns[col] if col < len(columns) else None
                    col_name = ("<Undefined_" + str(col) +
                                ">" if header is None else header)

                    data.setdefault(col_name, []).append(
                        "(Null)" if value is None else FileTools.
                        FormatString(str(value).strip()))
        finally:
            workbook.close()

        return data
示例#11
0
    def validate(self:object, customValidator:str=None):
        """
        Validate a resultset against predefined metadata based on the LANG rules of data quality.

        For every attribute defined in self.metadata that is present in
        self.dataset, the per-value checks (checkMandatory, checkSize, checkType,
        checkEnum, checkStartsWith) are run, followed by the optional "Format"
        (regex) and "Unique" column checks, checkComposite and
        evaluateExpression. Attributes missing from the dataset are recorded as
        METADATACOMPLIANCE errors. Finally, if customValidator (a dotted class
        path) is supplied, it is loaded and run via self.customValidator.

        Raises:
            ValidationError: if self.metadata or self.dataset is None.
        """
        if (self.metadata is None):
            raise ValidationError("LANG Exception: meta-data has not been set", None)
        elif (self.dataset is None):
            raise ValidationError("LANG Exception: resultset has not been set", None)

        # Change request: find and output the primary key in the error report file if specified
        primary_key_values = None

        for key, item in self.metadata.items():
            if (MetaUtils.isTrue(item, "PrimaryKey")):
                # A primary key column that is missing from the dataset is reported
                # by the main loop below; here we just fall back to row numbers.
                if (key in self.dataset):
                    primary_key_values = self.dataset[key]
                break

        # Execute a series of validations against the supplied column of data and the metadata for the column.
        # Which validation is run is determined by entries in the metadata.
        for meta_attribute_key, meta_attribute_definition in self.metadata.items():
            if (meta_attribute_key not in self.dataset):
                self.addDataQualityError(DataQualityError(meta_attribute_key, error_dimension=DataQualityDimension.METADATACOMPLIANCE.value, description="Error: Attribute '" + meta_attribute_key + "' was not found in the dataset."))
                continue

            print("Validating attribute \t'" + meta_attribute_key + "'...", end='\r')

            attribute = self.dataset[meta_attribute_key]

            for row_count, value in enumerate(attribute):
                # If a primary key has been found then pass its value so that the user
                # has a reference to search for the record in the source system.
                # If there is no primary key attribute set then pass the row count.
                if (primary_key_values is not None):
                    primary_key_value = primary_key_values[row_count]
                else:
                    primary_key_value = "Row: " + str(row_count+1)

                self.checkMandatory(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkSize(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkType(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkEnum(meta_attribute_definition, meta_attribute_key, value, primary_key_value)
                self.checkStartsWith(meta_attribute_definition, meta_attribute_key, value, primary_key_value)

            # format check (must provide a regex)
            if (MetaUtils.exists(meta_attribute_definition, "Format")):
                re.purge()
                regex = re.compile(meta_attribute_definition["Format"])

                # The primary key is not part of the reported error, so it is not
                # looked up here (doing so crashed when no primary key was defined).
                for value in attribute:
                    isMatch = (regex.match(value) is not None)

                    if ( (not isMatch) and (not MetaUtils.isAllowBlank(meta_attribute_definition)) ):
                        self.addDataQualityError(DataQualityError(meta_attribute_key,error_dimension=DataQualityDimension.FORMATCONSISTENCY.value, description="Error: Value '" + value + "' does not match regex #'" + meta_attribute_definition["Format"] + "'"))

            # unique field check
            if (MetaUtils.isTrue(meta_attribute_definition, "Unique") ):
                # remember every value seen so far; each repeat occurrence is reported as an error
                seen = set()

                for value in attribute:
                    if (value in seen):
                        self.addDataQualityError(DataQualityError(meta_attribute_key,error_dimension=DataQualityDimension.UNIQUENESS.value, description="Error: Value '" + value + "' is not UNIQUE. A unique value was expected."))
                    else:
                        seen.add(value)

            self.checkComposite(meta_attribute_definition, meta_attribute_key)

            # expression evaluation is different to processing field specific validations as it could link in other columns from the resultset
            self.evaluateExpression(meta_attribute_definition, meta_attribute_key)

            print("Validating attribute \t'" + meta_attribute_key + "'...\t\t..Complete.")

        # only invoke the custom validator if one has been provided
        if (customValidator is not None and len(customValidator) > 0):
            self.customValidator(customValidator)