Example #1
def read_result(file_id, worksheet_id, domain_id, index):
    """Reads the data check result file"""

    result = {}
    try:
        result_df = get_check_results_df(file_id, worksheet_id)
        # Restrict the results to the requested rows
        result_df = result_df.iloc[index]
        keys = ["label", "type", "rules", "ref_type_id"]
        checker_document = CheckerDocument()
        target_fields = checker_document.get_all_target_fields(domain_id, keys)

        for column in result_df.columns.values:
            s_check_res = result_df[column]
            # Rows flagged True in this check column failed the check
            indexes = s_check_res[s_check_res].index
            # The column name encodes a (check_type, field_code, error_type, check_index) tuple
            check_type, field_code, error_type, check_index = eval(column)
            for row_index in result_df.index:
                if row_index in indexes:
                    target = result.setdefault(row_index, {})
                    target = target.setdefault(field_code, {})
                    field_data = target_fields.get(field_code)
                    checker = CheckerFactory.get_checker(check_type)
                    error_message = checker.get_message(field_data=field_data, check_type=check_type)
                    result[row_index][field_code][error_type] = {"check_type": check_type, "error_message": error_message}
                else:
                    result.setdefault(row_index, {})

        return result

    except pd.errors.EmptyDataError:
        return result
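# A self-contained sketch of the selection pattern above, on assumed toy data:
# indexing a boolean check column by itself yields the rows that failed, and the
# column name is a stringified (check_type, field_code, error_type, check_index) tuple.
import pandas as pd

column = "('reference', 'country_code', 'invalid_value', 0)"
check_results = pd.DataFrame({column: [True, False, True]})

s_check_res = check_results[column]
failed_rows = s_check_res[s_check_res].index
check_type, field_code, error_type, check_index = eval(column)
print(list(failed_rows), check_type, field_code)  # [0, 2] reference country_code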
Example #2
    def get(self, domain_id):
        """Returns the headers for the given domain"""

        try:
            checker_document = CheckerDocument()
            headers = checker_document.get_headers(domain_id)

            return jsonify(headers)
        except Exception as exp:
            print(exp)
            traceback.print_exc()
            return resp.response_with(resp.SERVER_ERROR_500)
Example #3
def start_check_job(job_id, file_id, worksheet_id, mapping_id, domain_id, is_transformed, user_id, modifications=None):
    """Starts the data check service"""

    if is_transformed:
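        # A transformed worksheet id is a path whose last two segments hold the real file id and worksheet id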
        transformed_path = worksheet_id.split('/')
        file_id = transformed_path[-2]
        worksheet_id = transformed_path[-1]

    keys = ["label", "type", "rules", "ref_type_id"]
    checker_document = CheckerDocument()
    modifier_document = ModifierDocument()
    target_fields = checker_document.get_all_target_fields(domain_id, keys)

    start = time.time()
    if modifications:
        rows_indices = {int(key) for key in modifications}
        nrows = len(rows_indices)
        # Skip every stored row up to the last modified one, except the modified rows
        # themselves (shifted by one relative to the stored file)
        skiprows = set(range(1, max(rows_indices) + 1)) - {index + 1 for index in rows_indices}

        result_df = get_check_results_df(file_id, worksheet_id)
        mapped_df = get_mapped_df(file_id, worksheet_id, nrows=nrows, skiprows=skiprows)
        # Re-index the partial frame with the original row positions in ascending order
        mapped_df.index = sorted(rows_indices)
        modifier_document.save_modifications(worksheet_id, modifications, user_id)
        modifier_document.apply_modifications(mapped_df, worksheet_id, list(rows_indices))
        data_check_result = create_check_metadata(result_df.reset_index(), job_id, file_id, worksheet_id, mapping_id,
                                                  domain_id)
        result_df = check_modifications(mapped_df, result_df, target_fields, data_check_result, modifications,
                                        rows_indices)

        save_check_results_df(result_df, file_id, worksheet_id)
        print("end checks")
        print(time.time() - start)

    else:
        start = time.time()
        df = get_imported_data_df(file_id, worksheet_id, nrows=None, skiprows=None)
        final_df = apply_mapping_transformation(df, mapping_id, target_fields)
        modifier_document.apply_modifications(final_df, worksheet_id, is_all=True)
        normalize_data(final_df, target_fields)
        save_mapped_df(final_df, file_id, worksheet_id)
        print("end mapping")
        print(time.time() - start)
        data_check_result = create_check_metadata(final_df.reset_index(), job_id, file_id, worksheet_id, mapping_id,
                                                  domain_id)
        result_df = run_checks(final_df, target_fields, data_check_result)
        save_check_results_df(result_df, file_id, worksheet_id)
        print("end checks")
        print(time.time() - start)

    job_result_document = JobResultDocument()

    return job_result_document.save_check_job(data_check_result)
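# A self-contained sketch of the skiprows computation above, assuming the stored file
# has a header row at position 0 (which the +1 offset appears to account for): for
# modified data rows {2, 5}, every earlier stored row is skipped except those two.
rows_indices = {2, 5}
skiprows = set(range(1, max(rows_indices) + 1)) - {index + 1 for index in rows_indices}
print(sorted(skiprows))  # [1, 2, 4, 5] -> stored rows 3 and 6 (data rows 2 and 5) are kept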
Example #4
def normalize_data(df, target_fields):
    """Normalizes data before starting the cleansing"""

    checker_document = CheckerDocument()
    for field_code, field_data in target_fields.items():
        ref_type_id = field_data.get("ref_type_id")
        if ref_type_id:
            conditions = {"ref_type_id": ref_type_id}
            ref_values = checker_document.get_ref_value(conditions,
                                                        "code",
                                                        alias=True)
            df[field_code] = df[field_code].apply(replace_alias,
                                                  args=(ref_values, ))
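# A self-contained sketch of the normalization pattern above, with a stand-in
# replace_alias (hypothetical; the real helper is not shown, its signature is
# assumed from the apply call) that maps alias spellings back to canonical codes.
import pandas as pd

def replace_alias(value, ref_values):
    # ref_values is assumed to map each canonical code to its known aliases
    for code, aliases in ref_values.items():
        if value in aliases:
            return code
    return value

df = pd.DataFrame({"country": ["USA", "U.S.", "France"]})
ref_values = {"US": ["USA", "U.S."], "FR": ["France"]}
df["country"] = df["country"].apply(replace_alias, args=(ref_values,))
print(df["country"].tolist())  # ['US', 'US', 'FR']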
Example #5
    def get_ref_list(self, check, ref_type_id):
        """Fetches the references values list from the database"""

        list_values = check.get("values")
        if list_values:
            return list_values

        field_name = check.get("field_name")
        conditions = {"ref_type_id": ref_type_id}
        conditions.update(check.get("conditions", {}))
        checker_document = CheckerDocument()
        ref_values = checker_document.get_ref_value(conditions, field_name)

        return ref_values
Example #6
    def get_message(self, **kwargs):
        """Builds the validation error message for the given field"""

        field_data = kwargs.get("field_data")
        field_name = field_data.get("label")
        ref_type_id = field_data.get("ref_type_id")
        checker_document = CheckerDocument()
        reference_type = checker_document.get_reference_type(ref_type_id)

        return f"{field_name} must be {reference_type['label']} ({reference_type['version']})"
Example #7
def apply_mapping_transformation(df, mapping_id, target_fields):
    """Apply the mapping transformation in case of multiple source mapping"""

    checker_document = CheckerDocument()
    transformed_df = pd.DataFrame(columns=list(target_fields.keys()))
    mappings = checker_document.get_mappings(mapping_id)
    for target, source in mappings.items():
        data_type = target_fields[target]["type"]
        if len(source) > 1:
            if data_type == "string":
                # Concatenate all source columns into one space-separated string
                transformed_df[target] = df[source[0]].str.cat(df[source[1:]].astype(str), sep=" ")
            else:
                # Sum the source columns row-wise, coercing non-numeric values to NaN
                transformed_df[target] = df[source].apply(pd.to_numeric, errors='coerce').sum(axis=1)
        else:
            # One-to-one mapping: copy the single source column
            transformed_df[target] = df[source[0]]

    return transformed_df
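# A self-contained sketch of the two multi-source branches above, on toy data:
# string targets are concatenated with spaces, other targets are summed row-wise.
import pandas as pd

df = pd.DataFrame({"first": ["Ada", "Grace"], "last": ["Lovelace", "Hopper"],
                   "q1": [10, 20], "q2": ["5", "bad"]})

source = ["first", "last"]
print(df[source[0]].str.cat(df[source[1:]].astype(str), sep=" ").tolist())
# ['Ada Lovelace', 'Grace Hopper']

source = ["q1", "q2"]
print(df[source].apply(pd.to_numeric, errors="coerce").sum(axis=1).tolist())
# [15.0, 20.0]  (non-numeric values become NaN and are ignored by the sum)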
Example #8
    def check_column(self, df, column, empty_column, *args, **kwargs):
        """Checks if a given column matches the business requirements"""

        if not empty_column.all():
            check = kwargs.get("check")
            list_values = check.get("values")
            if list_values:
                # Empty cells always pass; other values must belong to the allowed list
                return empty_column | df[column].isin(list_values)

            field_name = check.get("field_name")
            conditions = {"ref_type_id": kwargs.get("ref_type_id")}
            conditions.update(check.get("conditions", {}))
            checker_document = CheckerDocument()
            ref_values = checker_document.get_ref_value(conditions, field_name)

            # Case-insensitive match against the reference values; empty cells pass
            return empty_column | df[column].str.lower().isin(
                {ref_value.lower() for ref_value in ref_values})
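# A self-contained sketch of the reference check above, on toy data: empty cells
# always pass, and other values are matched against the references case-insensitively.
import pandas as pd

df = pd.DataFrame({"currency": ["usd", "EUR", None, "XYZ"]})
empty_column = df["currency"].isna()
ref_values = ["USD", "EUR", "GBP"]

valid = empty_column | df["currency"].str.lower().isin(
    {ref_value.lower() for ref_value in ref_values})
print(valid.tolist())  # [True, True, True, False]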
Example #9
def get_check_modifications(worksheet_id, domain_id):
    """Fetches the check modification data"""

    audit_trail = {}
    user_document = UserDocument()
    checker_document = CheckerDocument()
    modifier_document = ModifierDocument()

    target_fields = checker_document.get_all_target_fields(domain_id, ["label"])
    modified_data = modifier_document.get_modifications(worksheet_id, is_all=True)
    for modification in modified_data:
        for column, col_modif in modification["columns"].items():
            label = target_fields.get(column)["label"]
            line_modif = {"previous": col_modif["previous"][-1], "new": col_modif["new"], 
                          "updated_at": col_modif["updatedAt"],
                          "user": user_document.get_user_fullname(col_modif["userId"])}
            # Group the modifications by field label, then by line number
            audit_trail.setdefault(label, {})[modification["line"]] = line_modif

    return audit_trail
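# A self-contained sketch of the grouping used above: setdefault nests the
# modifications first by field label, then by line number.
audit_trail = {}
audit_trail.setdefault("Country", {})[3] = {"new": "US"}
audit_trail.setdefault("Country", {})[7] = {"new": "FR"}
print(audit_trail)  # {'Country': {3: {'new': 'US'}, 7: {'new': 'FR'}}}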
Example #10
def read_exposures(base_url, file_id, worksheet_id, url_params, is_transformed, domain_id, sort, filters, errors_filter):
    """Gets paginated data, filters and sorts data"""

    if is_transformed:
        transformed_path = worksheet_id.split('/')
        file_id = transformed_path[-2]
        worksheet_id = transformed_path[-1]

    indices = []
    filtered = False
    sort_indices = []
    filter_indices = set()
    checker_document = CheckerDocument()

    if errors_filter:
        filter_indices = apply_errors_filter(file_id, worksheet_id, errors_filter)
        filtered = True
    if filters:
        result_indices = apply_filter(file_id, worksheet_id, filters)
        # Intersect with any error-filter matches; otherwise take the filter result as is
        filter_indices = filter_indices.intersection(set(result_indices)) if filter_indices else result_indices
        filtered = True
    if sort:
        sort_indices = apply_sort(file_id, worksheet_id, sort)
        if filter_indices:
            # Keep the sort order, restricted to the filtered rows
            indices = [elem for elem in sort_indices if elem in filter_indices]
        filtered = True
    indices = indices if indices else sort_indices or list(filter_indices)
    total_lines = len(indices) if indices else checker_document.get_worksheet_length(worksheet_id)
    preview = get_dataframe_page(file_id, worksheet_id, base_url, url_params, total_lines, filtered, indices, sort)
    
    if preview.get("absolute_index"):
        preview["results"] = read_result(file_id, worksheet_id, domain_id, preview["absolute_index"])
    else:
        preview["results"] = read_result(file_id, worksheet_id, domain_id, preview["index"])
    
    return preview
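# A self-contained sketch of how the sorted and filtered indices are combined above:
# the sort order is preserved, restricted to the rows that passed the filters.
sort_indices = [4, 0, 2, 3, 1]
filter_indices = {1, 2, 4}
print([elem for elem in sort_indices if elem in filter_indices])  # [4, 2, 1]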