예제 #1
0
def normalize_data(df, target_fields):
    """Replace aliases in reference-backed columns before the cleansing.

    For every entry of ``target_fields`` that carries a truthy
    ``"ref_type_id"``, the reference values are fetched through
    ``CheckerDocument.get_ref_value`` (field ``"code"``, ``alias=True``)
    and ``replace_alias`` is applied to each cell of the matching column.

    ``df`` is modified in place; the function returns nothing.
    """
    document = CheckerDocument()

    for column, settings in target_fields.items():
        ref_type = settings.get("ref_type_id")
        if not ref_type:
            # No reference type attached to this field: column is untouched.
            continue

        aliases = document.get_ref_value(
            {"ref_type_id": ref_type}, "code", alias=True)
        df[column] = df[column].apply(replace_alias, args=(aliases, ))
예제 #2
0
    def get_ref_list(self, check, ref_type_id):
        """Return the reference values to use for ``check``.

        When ``check`` provides a truthy ``"values"`` entry, that object is
        returned as is and no lookup is performed. Otherwise the values are
        fetched through ``CheckerDocument.get_ref_value`` for the field named
        by ``check["field_name"]``, filtered on ``ref_type_id`` plus any extra
        ``check["conditions"]``.
        """
        explicit_values = check.get("values")
        if explicit_values:
            return explicit_values

        target_field = check.get("field_name")
        # Extra conditions from the check are merged last, so they may
        # override the "ref_type_id" entry.
        filters = {"ref_type_id": ref_type_id}
        filters.update(check.get("conditions", {}))

        return CheckerDocument().get_ref_value(filters, target_field)
예제 #3
0
    def check_column(self, df, column, empty_column, *args, **kwargs):
        """Flag the rows of ``column`` whose value is an accepted reference.

        The check definition is read from ``kwargs["check"]``:

        * with a truthy ``"values"`` entry, the column is compared to that
          list as is (case-sensitive);
        * otherwise the accepted values are fetched through
          ``CheckerDocument.get_ref_value`` using ``kwargs["ref_type_id"]``
          plus any ``check["conditions"]``, and compared case-insensitively.

        Returns ``empty_column`` OR-ed with the membership mask, or ``None``
        when every entry of ``empty_column`` is already true.
        """
        if empty_column.all():
            # Nothing left to check; the caller gets None in this case.
            return None

        rule = kwargs.get("check")
        explicit_values = rule.get("values")
        if explicit_values:
            matches = df[column].isin(explicit_values)
            return empty_column | matches

        target_field = rule.get("field_name")
        filters = {"ref_type_id": kwargs.get("ref_type_id")}
        filters.update(rule.get("conditions", {}))
        allowed = CheckerDocument().get_ref_value(filters, target_field)

        # Lower-case both sides so the comparison ignores case.
        lowered_column = df[column].str.lower()
        allowed_lowered = {value.lower() for value in allowed}
        return empty_column | lowered_column.isin(allowed_lowered)