def test_round_column(
    test_config,
    round_columns,
    source_query,
    transformed_query,
    merge_columns,
    module_name,
):
    print(f"module_name: {module_name}")

    config = test_config
    add_to_tested_list(
        module_name=module_name,
        tested_fields=[y for x in round_columns.values() for y in x],
    )

    source_sample_df = get_data_from_query(query=source_query,
                                           config=config,
                                           sort_col=merge_columns["source"],
                                           sample=True)

    assert source_sample_df.shape[0] > 0

    sample_caserefs = get_merge_col_data_as_list(
        df=source_sample_df, column_name=merge_columns["source"])

    transformed_df = get_data_from_query(
        query=transformed_query,
        config=config,
        sort_col=merge_columns["transformed"],
        sample=False,
    )

    assert transformed_df.shape[0] > 0

    transformed_sample_df = transformed_df[transformed_df[
        merge_columns["transformed"]].isin(sample_caserefs)]

    result_df = merge_source_and_transformed_df(
        source_df=source_sample_df,
        transformed_df=transformed_sample_df,
        merge_columns=merge_columns,
    )

    log.debug(
        f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%)  from table: {module_name}"
    )

    assert result_df.shape[0] > 0
    for k, v in round_columns.items():

        for i in v:
            result_df["round2dp"] = result_df[i].apply(lambda x: round(x, 2))

            assert (result_df["round2dp"].astype(bool) == True).all()
def test_complex_joins(
    test_config,
    module_name,
    source_query,
    transformed_query,
    merge_columns,
    match_columns,
):
    log.debug(f"module_name: {module_name}")

    config = test_config

    source_sample_df = get_data_from_query(query=source_query,
                                           config=config,
                                           sort_col=merge_columns["source"],
                                           sample=True)

    assert source_sample_df.shape[0] > 0

    sample_caserefs = get_merge_col_data_as_list(
        df=source_sample_df, column_name=merge_columns["source"])

    transformed_df = get_data_from_query(
        query=transformed_query,
        config=config,
        sort_col=merge_columns["transformed"],
        sample=False,
    )

    assert transformed_df.shape[0] > 0

    transformed_sample_df = transformed_df[transformed_df[
        merge_columns["transformed"]].isin(sample_caserefs)]

    result_df = merge_source_and_transformed_df(
        source_df=source_sample_df,
        transformed_df=transformed_sample_df,
        merge_columns=merge_columns,
    )

    log.debug(
        f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name}"
    )
    assert result_df.shape[0] > 0
    for k, v in match_columns.items():
        match = result_df[k].equals(result_df[v])
        log.log(
            config.VERBOSE,
            f"checking {k} == {v}...."
            " {'OK' if match is True else 'oh no'} ",
        )

        assert match is True
def test_one_to_one_joins(
    test_config,
    join_columns,
    merge_columns,
    fk_child_query,
    fk_parent_query,
    module_name,
):
    print(f"module_name: {module_name}")

    add_to_tested_list(module_name=module_name,
                       tested_fields=[x for x in join_columns.keys()])

    config = test_config

    fk_child_df = get_data_from_query(
        query=fk_child_query,
        config=config,
        sample=True,
    )

    sample_caserefs = get_merge_col_data_as_list(
        df=fk_child_df, column_name=merge_columns["fk_child"])

    fk_parent_df = get_data_from_query(
        query=fk_parent_query,
        config=config,
        sort_col=merge_columns["fk_parent"],
        sample=False,
    )
    print(fk_parent_query)
    fk_parent_sample_df = fk_parent_df[fk_parent_df[
        merge_columns["fk_parent"]].isin(sample_caserefs)]

    fk_child_id_list = fk_child_df[[k for k in join_columns][0]].tolist()
    fk_child_id_list = [int(x) for x in fk_child_id_list]

    fk_parent_id_list = fk_parent_sample_df[[
        y for x in join_columns.values() for y in x.values()
    ][0]].tolist()
    fk_parent_id_list = [int(x) for x in fk_parent_id_list]

    print(
        f"Checking {fk_parent_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} "
    )
    success = set(fk_child_id_list) == set(fk_parent_id_list)
    print(
        config.VERBOSE,
        f"checking {[k for k in join_columns][0]} == "
        f"{[y for x in join_columns.values() for y in x.values()][0]}.... "
        f"{'OK' if success is True else 'oh no'} ",
    )
    assert success is True
def test_default_values(test_config, defaults, source_query, module_name):
    log.debug(f"module_name: {module_name}")

    config = test_config

    add_to_tested_list(module_name=module_name,
                       tested_fields=[x for x in defaults.keys()])

    source_sample_df = get_data_from_query(query=source_query,
                                           config=config,
                                           sample=True)

    assert source_sample_df.shape[0] > 0

    log.debug(
        f"Checking {source_sample_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} "
    )
    assert source_sample_df.shape[0] > 0
    for k, v in defaults.items():
        source_sample_df["compare_col"] = v
        matches = source_sample_df[k] == source_sample_df["compare_col"]
        total_matches = matches.sum()
        success = total_matches == source_sample_df.shape[0]
        print(
            config.VERBOSE,
            f"checking {k} == {v}...."
            f" {'OK' if success else 'oh no'} ",
        )
        assert success
예제 #5
0
def test_deputy_address_joins(test_config, ):

    config = test_config
    source_query = f"""
        SELECT distinct surname from transform.persons where casrec_mapping_file_name = 'deputy_persons_mapping';
    """

    source_sample_df = get_data_from_query(query=source_query,
                                           config=config,
                                           sample=True)

    assert source_sample_df.shape[0] > 0

    sample_caserefs = get_merge_col_data_as_list(df=source_sample_df,
                                                 column_name="surname")

    result = {"total": 0, "passed": 0, "failed": []}
    for single_case in sample_caserefs:
        result["total"] += 1
        transformed_query = f"""
            select
                persons.firstname, persons.surname,
                   addresses.postcode
            from transform.persons
            left outer join transform.addresses on persons.id = addresses.person_id
            where persons.casrec_mapping_file_name = 'deputy_persons_mapping'
            and persons.surname = '{single_case}'
            order by firstname, surname, postcode;
        """

        transformed_df = pd.read_sql_query(
            transformed_query, config.get_db_connection_string(db="migration"))

        casrec_query = f"""
            select distinct
                "Dep Forename" as firstname,
                   "Dep Surname" as surname,
                   "Dep Postcode" as postcode
            from casrec_csv.deputy
            left outer join casrec_csv.deputyship on deputy."Deputy No" = deputyship."Deputy No"
            left outer join casrec_csv.deputy_address on deputyship."Dep Addr No" = deputy_address."Dep Addr No"
            where deputy."Dep Surname" = '{single_case}'
            order by "Dep Forename", "Dep Surname", "Dep Postcode";
        """

        casrec_df = pd.read_sql_query(
            casrec_query, config.get_db_connection_string(db="migration"))

        try:
            pd.testing.assert_frame_equal(transformed_df, casrec_df)
            result["passed"] += 1
        except AssertionError:
            result["failed"].append(single_case)

    print(f"result: {result['passed']}/{result['total']} passed")
    if len(result["failed"]) > 0:
        print(f"Failed caserecnumbers: {', '.join(result['failed'])}")

    assert len(result["failed"]) == 0
예제 #6
0
def test_order_deputy_joins(test_config, ):

    config = test_config
    source_query = f"""
        SELECT distinct caserecnumber from transform.cases;
    """

    source_sample_df = get_data_from_query(query=source_query,
                                           config=config,
                                           sample=True)

    assert source_sample_df.shape[0] > 0

    sample_caserefs = get_merge_col_data_as_list(df=source_sample_df,
                                                 column_name="caserecnumber")

    result = {"total": 0, "passed": 0, "failed": []}
    for single_case in sample_caserefs:
        result["total"] += 1
        transformed_query = f"""
            select
                cases.caserecnumber,
                persons.firstname, persons.surname
            from transform.cases
            left outer join transform.order_deputy on cases.id = order_deputy.order_id
            left outer join transform.persons on persons.id = order_deputy.deputy_id
            where cases.caserecnumber = '{single_case}'
            order by cases.caserecnumber, firstname, surname;
        """

        transformed_df = pd.read_sql_query(
            transformed_query, config.get_db_connection_string(db="migration"))

        casrec_query = f"""
            select
                "order"."Case" as caserecnumber,
                   deputy."Dep Forename" as firstname,
                   deputy."Dep Surname" as surname
            from casrec_csv.order
            left outer join casrec_csv.deputyship on "order"."CoP Case" = deputyship."CoP Case"
            left outer join casrec_csv.deputy on deputyship."Deputy No" = deputy."Deputy No"
            where "order"."Case" = '{single_case}'
            order by "order"."Case", "Dep Forename", "Dep Surname";
        """

        casrec_df = pd.read_sql_query(
            casrec_query, config.get_db_connection_string(db="migration"))

        try:
            pd.testing.assert_frame_equal(transformed_df, casrec_df)
            result["passed"] += 1
        except AssertionError:
            result["failed"].append(single_case)

    print(f"result: {round(result['passed']/result['total']*100, 2)}% passed")
    if len(result["failed"]) > 0:
        print(f"Failed caserecnumbers: {', '.join(result['failed'])}")

    assert len(result["failed"]) == 0
예제 #7
0
def test_calculated_values(test_config, calculated_fields, source_query,
                           module_name):
    log.debug(f"module_name: {module_name}")
    print(f"module_name: {module_name}")

    # log.debug(f"source_query: {source_query}")

    config = test_config

    add_to_tested_list(module_name=module_name,
                       tested_fields=[x for x in calculated_fields.keys()])

    source_sample_df = get_data_from_query(query=source_query,
                                           config=config,
                                           sample=True)

    # print(source_sample_df.to_markdown())
    source_sample_df.info()
    assert source_sample_df.shape[0] > 0

    log.debug(f"Checking {source_sample_df.shape[0]} rows of data ("
              f"{config.SAMPLE_PERCENTAGE}%) from table: {module_name}")
    assert source_sample_df.shape[0] > 0
    for k, v in calculated_fields.items():
        print(f"k: {k}")
        print(f"v: {v}")
        source_sample_df["compare_col"] = v
        matches = source_sample_df[k] == source_sample_df["compare_col"]
        print(f"source_sample_df[k]: {source_sample_df[k]}")
        print(
            f'source_sample_df["compare_col"]: {source_sample_df["compare_col"]}'
        )

        total_matches = matches.sum()
        success = total_matches == source_sample_df.shape[0]
        log.log(
            config.VERBOSE,
            f"checking {k} == {v}.... "
            f"{'OK' if success else 'oh no'} ",
        )
        assert success
def test_simple_transformations(
    test_config,
    simple_matches,
    merge_columns,
    source_query,
    transformed_query,
    module_name,
):
    log.debug(f"module_name: {module_name}")

    add_to_tested_list(
        module_name=module_name,
        tested_fields=[y for x in simple_matches.values() for y in x]
        + [merge_columns["transformed"]],
    )

    config = test_config

    source_sample_df = get_data_from_query(
        query=source_query, config=config, sort_col=merge_columns["source"], sample=True
    )

    assert source_sample_df.shape[0] > 0

    sample_caserefs = get_merge_col_data_as_list(
        df=source_sample_df, column_name=merge_columns["source"]
    )

    transformed_df = get_data_from_query(
        query=transformed_query,
        config=config,
        sort_col=merge_columns["transformed"],
        sample=False,
    )

    assert transformed_df.shape[0] > 0

    transformed_sample_df = transformed_df[
        transformed_df[merge_columns["transformed"]].isin(sample_caserefs)
    ]

    result_df = merge_source_and_transformed_df(
        source_df=source_sample_df,
        transformed_df=transformed_sample_df,
        merge_columns=merge_columns,
    )

    log.debug(
        f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} "
    )

    assert result_df.shape[0] > 0
    for k, v in simple_matches.items():
        for i in v:

            try:
                result_df[k] = pd.to_datetime(result_df[k], format="%Y-%m-%d")
                result_df[i] = pd.to_datetime(result_df[i], format="%Y-%m-%d")
            except Exception:
                result_df[k] = result_df[k].apply(lambda x: f"{x}")
                result_df[i] = result_df[i].apply(lambda x: f"{x}")

            match = result_df[k].equals(result_df[i])

            print(
                f"checking {k} == {i}.... " f"{'OK' if match is True else 'oh no'} ",
            )

            assert match is True
def test_map_conditional_lookup_tables(
    test_config,
    lookup_fields,
    merge_columns,
    source_query,
    transformed_query,
    module_name,
):
    print(f"module_name: {module_name}")

    add_to_tested_list(
        module_name=module_name,
        tested_fields=[x for x in lookup_fields.keys()] +
        [merge_columns["transformed"]],
    )

    config = test_config

    source_sample_df = get_data_from_query(query=source_query,
                                           config=config,
                                           sort_col=merge_columns["source"],
                                           sample=True)

    assert source_sample_df.shape[0] > 0

    sample_caserefs = get_merge_col_data_as_list(
        df=source_sample_df, column_name=merge_columns["source"])

    transformed_df = get_data_from_query(
        query=transformed_query,
        config=config,
        sort_col=merge_columns["transformed"],
        sample=False,
    )

    assert transformed_df.shape[0] > 0

    transformed_sample_df = transformed_df[transformed_df[
        merge_columns["transformed"]].isin(sample_caserefs)]

    result_df = merge_source_and_transformed_df(
        source_df=source_sample_df,
        transformed_df=transformed_sample_df,
        merge_columns=merge_columns,
    )

    log.debug(
        f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} "
    )

    assert result_df.shape[0] > 0
    for k, v in lookup_fields.items():
        lookup_filename = lookup_fields[k]["lookup_def"]

        for i, j in v.items():
            result_col = lookup_fields[k]["cols"]["result"]

            reference_col = lookup_fields[k]["cols"]["reference"]

            lookup_dict = get_lookup_dict(file_name=lookup_filename.lower())

            result_df["mapped"] = result_df[reference_col].map(lookup_dict)

            result_df = result_df.loc[result_df["mapped"].notnull()]

            result_df[result_col] = result_df[result_col].apply(
                lambda x: f"{x}")
            result_df[k] = result_df[k].apply(lambda x: f"{x}")

            assert result_df[result_col].fillna("").equals(
                result_df[k].fillna(""))

            match = result_df[result_col].fillna("").equals(
                result_df[k].fillna(""))

            print(
                f"checking {k} == {result_col}...."
                f""
                f" {'OK' if match is True else 'oh no'} ", )

            assert match is True
def test_squash_columns(
    test_config,
    squash_columns_fields,
    source_query,
    transformed_query,
    merge_columns,
    module_name,
):
    log.debug(f"module_name: {module_name}")

    config = test_config
    add_to_tested_list(module_name=module_name,
                       tested_fields=[x for x in squash_columns_fields.keys()])

    source_sample_df = get_data_from_query(query=source_query,
                                           config=config,
                                           sort_col=merge_columns["source"],
                                           sample=True)

    assert source_sample_df.shape[0] > 0

    sample_caserefs = get_merge_col_data_as_list(
        df=source_sample_df, column_name=merge_columns["source"])

    transformed_df = get_data_from_query(
        query=transformed_query,
        config=config,
        sort_col=merge_columns["transformed"],
        sample=False,
    )

    assert transformed_df.shape[0] > 0

    transformed_sample_df = transformed_df[transformed_df[
        merge_columns["transformed"]].isin(sample_caserefs)]

    result_df = merge_source_and_transformed_df(
        source_df=source_sample_df,
        transformed_df=transformed_sample_df,
        merge_columns=merge_columns,
    )

    log.debug(
        f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%) from table: {module_name} "
    )

    assert result_df.shape[0] > 0

    for k, v in squash_columns_fields.items():
        for i, j in enumerate(v):

            unsquashed = result_df[k].apply(pd.Series)
            unsquashed = unsquashed.rename(columns=lambda x: "unsq_" + str(x))

            match = result_df[j].equals(unsquashed[f"unsq_{i}"])

            log.log(
                config.VERBOSE,
                f"checking {j} == {k}:{i}...."
                f" {'OK' if match is True else 'oh no'} ",
            )

            assert match is True
예제 #11
0
def test_convert_to_bool(
    test_config,
    convert_to_bool_fields,
    source_query,
    transformed_query,
    merge_columns,
    module_name,
):
    log.debug(f"module_name: {module_name}")

    config = test_config
    add_to_tested_list(
        module_name=module_name,
        tested_fields=[y for x in convert_to_bool_fields.values() for y in x],
    )

    source_sample_df = get_data_from_query(
        query=source_query, config=config, sort_col=merge_columns["source"], sample=True
    )

    assert source_sample_df.shape[0] > 0

    sample_caserefs = get_merge_col_data_as_list(
        df=source_sample_df, column_name=merge_columns["source"]
    )

    transformed_df = get_data_from_query(
        query=transformed_query,
        config=config,
        sort_col=merge_columns["transformed"],
        sample=False,
    )

    assert transformed_df.shape[0] > 0

    transformed_sample_df = transformed_df[
        transformed_df[merge_columns["transformed"]].isin(sample_caserefs)
    ]

    result_df = merge_source_and_transformed_df(
        source_df=source_sample_df,
        transformed_df=transformed_sample_df,
        merge_columns=merge_columns,
    )

    log.debug(
        f"Checking {result_df.shape[0]} rows of data ({config.SAMPLE_PERCENTAGE}%)  from table: {module_name}"
    )
    assert result_df.shape[0] > 0
    for k, v in convert_to_bool_fields.items():
        for i in v:
            true_values = ["true", "True", "t", "T", "1", "1.0"]

            true_rows = result_df[result_df[k].isin(true_values)]
            total_true_rows = true_rows.shape[0]
            true_matches = true_rows[i]
            total_true_matches = true_matches.shape[0]

            t_match = total_true_rows == total_true_matches
            log.log(
                config.VERBOSE,
                f"checking True: {total_true_rows} {k} =="
                f" {total_true_matches} "
                f"{i}.... {'OK' if t_match is True else 'oh no'} ",
            )
            assert t_match

            false_rows = result_df[~result_df[k].isin(true_values)]
            total_false_rows = false_rows.shape[0]
            false_matches = false_rows[i]
            total_false_matches = false_matches.shape[0]

            f_match = total_false_rows == total_false_matches

            log.log(
                config.VERBOSE,
                f"checking False: {total_false_rows} {k} =="
                f" {total_false_matches} "
                f"{i}.... {'OK' if f_match is True else 'oh no'} ",
            )
            assert f_match