def test_keepone(cli_keepone_big):
    """Keep-one run through ExecutableGraph matches the CLI keepone fixture output."""
    dset = LavaDataset.from_file(data_dir / "instr1_primaryall.csv")

    graph = ExecutableGraph()
    graph.add_node(
        dset,
        operation=partial(
            group_by_keep_one,
            group_by_col=dset.id2_col_name,
            date_col_name=dset.date_col_name,
            keep="earliest",
            drop_duplicates=False,
        ),
    )
    graph.execute()

    node_data = graph.get_all_node_data("operation")
    result = node_data[0]["operation_result"]
    expected_result = file_to_dataframe(cli_keepone_big)

    assert_dfs_equal(result, expected_result, cols_ignore_pat=cols_ignore_pat, output_dir=output_dir)
def test_small_with_merge(cli_link_small_with_merge):
    """Merged sheet of the small with-merge fixture matches the stored expectation."""
    sheet = MergeableAnchoredList.merged_dsetname

    # copy file to current dir if you want to debug more
    if output_dir is not None:
        copy(cli_link_small_with_merge, current_dir)

    results = pd.read_excel(
        cli_link_small_with_merge,
        sheet_name=sheet,
        index_col=None,
        header=[0, 1],
    )
    expected_result = pd.read_excel(
        current_dir / "small_with_merge_expected_result.xlsx",
        sheet_name=sheet,
        index_col=None,
        header=[0, 1],
    )

    assert_dfs_equal(
        results,
        expected_result,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
def test_small_link_suffixes(tmp_path):
    """`macpie link` honors a custom column-suffix option.

    Equivalent CLI:
    macpie link -g closest tests/cli/macpie/link/small.xlsx tests/data/instr2_all.csv tests/data/instr3_all.csv  # noqa: E501
    """
    runner = CliRunner()
    cli_args = [
        "--id2-col", "pidn",
        "--date-col", "dcdate",
        "link",
        "--primary-keep", "all",
        "--secondary-get", "closest",
        # pass as a string: click expects every element of the args list to be a string
        "--secondary-days", "90",
        "--secondary-when", "earlier_or_later",
        str((current_dir / "small.xlsx").resolve()),
        str((data_dir / "instr2_all.csv").resolve()),
        str((data_dir / "instr3_all.csv").resolve()),
    ]

    set_option("operators.binary.column_suffixes", ("_link", "_y"))
    try:
        with runner.isolated_filesystem(temp_dir=tmp_path):
            results = runner.invoke(main, cli_args)
            assert results.exit_code == 0

            # get the results file
            results_path = next(Path(".").glob("**/result*xlsx")).resolve()

            expected_result = pd.read_excel(
                current_dir / "small_link_suffixes_expected_result.xlsx",
                sheet_name=MergeableAnchoredList.merged_dsetname,
                index_col=None,
                header=[0, 1],
            )

            # copy file to current dir if you want to debug more
            if output_dir is not None:
                copy(results_path, current_dir)

            results = pd.read_excel(
                results_path,
                sheet_name=MergeableAnchoredList.merged_dsetname,
                index_col=None,
                header=[0, 1],
            )

            assert_dfs_equal(
                results,
                expected_result,
                cols_ignore=cols_ignore,
                cols_ignore_pat=cols_ignore_pat,
                output_dir=output_dir,
            )
    finally:
        # always restore the global option; without the try/finally a failing
        # assertion would leak the custom suffixes into every later test
        reset_option("operators.binary.column_suffixes")
def test_full_no_link_id(tmp_path):
    """`macpie link` on an anchor workbook lacking a link-id column.

    Equivalent CLI:
    macpie link -g closest tests/cli/macpie/link/full_no_link_id.xlsx tests/data/instr2_all.csv tests/data/instr3_all.csv
    """
    runner = CliRunner()
    cli_args = [
        "--id2-col", "pidn",
        "--date-col", "dcdate",
        "link",
        "--primary-keep", "all",
        "--secondary-get", "closest",
        # pass as a string: click expects every element of the args list to be a string
        "--secondary-days", "90",
        "--secondary-when", "earlier_or_later",
        str((current_dir / "full_no_link_id.xlsx").resolve()),
        str((data_dir / "instr2_all.csv").resolve()),
        str((data_dir / "instr3_all.csv").resolve()),
    ]

    with runner.isolated_filesystem(temp_dir=tmp_path):
        results = runner.invoke(main, cli_args)
        assert results.exit_code == 0

        # get the results file
        results_path = next(Path(".").glob("**/result*xlsx"))

        # copy file to current dir if you want to debug more
        if output_dir is not None:
            copy(results_path, current_dir)

        results_dict = pd.read_excel(
            results_path, sheet_name=["instr2_all_DUPS", "instr3_all_DUPS"]
        )
        result_secondary_instr2 = results_dict["instr2_all_DUPS"]
        result_secondary_instr3 = results_dict["instr3_all_DUPS"]

        # expected_secondary_instr2/3 are presumably module-level fixtures
        # shared by this file's tests -- defined outside this block
        assert_dfs_equal(
            result_secondary_instr2,
            expected_secondary_instr2,
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
        assert_dfs_equal(
            result_secondary_instr3,
            expected_secondary_instr3,
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
def test_cli_keepone(cli_keepone_big):
    """CLI keepone output matches the stored expected workbook."""
    # copy file to current dir if you want to debug more
    if output_dir is not None:
        copy(cli_keepone_big, current_dir)

    actual = file_to_dataframe(cli_keepone_big)
    expected = file_to_dataframe(current_dir / "expected_result.xlsx")
    assert_dfs_equal(actual, expected, output_dir=output_dir)
def test_merge_partial():
    """date_proximity with merge='partial'; also verifies the split id/date
    parameter spellings produce identical results."""
    shared_kwargs = dict(
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='partial',
    )

    # partial merge
    merge_partial_result = primary.mac.date_proximity(
        secondary, id_on='pidn', date_on='dcdate', **shared_kwargs
    )
    # merge_partial_result.to_excel(current_dir / "merge_partial_result.xlsx", index=False)

    merge_partial_expected_result = file_to_dataframe(
        current_dir / "merge_partial_expected_result.xlsx"
    )
    assert_dfs_equal(
        merge_partial_result,
        merge_partial_expected_result,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )

    # test using id_left_on and id_right_on params
    test_id_on_params = primary.mac.date_proximity(
        secondary, id_left_on='pidn', id_right_on='pidn', date_on='dcdate', **shared_kwargs
    )
    assert merge_partial_result.equals(test_id_on_params)

    # test using date_left_on and date_right_on params
    test_date_on_params = primary.mac.date_proximity(
        secondary, id_on='pidn', date_left_on='dcdate', date_right_on='dcdate', **shared_kwargs
    )
    assert merge_partial_result.equals(test_date_on_params)
def test_full_no_merge(cli_link_full_no_merge, tmp_path):
    """`macpie merge` over a not-yet-merged full link workbook: checks the
    expected sheet set and the merged sheet contents."""
    workbook_copy = Path(copy(cli_link_full_no_merge, tmp_path))

    expected_result = pd.read_excel(
        current_dir / "full_expected_results.xlsx",
        sheet_name=MergeableAnchoredList.merged_dsetname,
        header=[0, 1],
        index_col=None,
    )

    create_available_fields(workbook_copy)

    runner = CliRunner()
    cli_args = ["merge", str(workbook_copy.resolve())]

    with runner.isolated_filesystem(temp_dir=tmp_path):
        results = runner.invoke(main, cli_args)
        assert results.exit_code == 0

        # get the results file
        results_path = next(Path(".").glob("**/result*xlsx"))

        # copy file to current dir if you want to debug more
        if output_dir is not None:
            copy(results_path, current_dir)

        results_wb = pyxl.load_workbook(results_path, read_only=True, data_only=True)

        expected_sheetnames = [
            "instr2_all_DUPS",
            "instr3_all_DUPS",
            COLLECTION_SHEET_NAME,
            DATASETS_SHEET_NAME,
            MergeableAnchoredList.available_fields_sheetname,
            MergeableAnchoredList.merged_dsetname,
        ]
        for sheetname in expected_sheetnames:
            assert sheetname in results_wb.sheetnames

        results = pd.read_excel(
            results_path,
            sheet_name=MergeableAnchoredList.merged_dsetname,
            header=[0, 1],
            index_col=None,
        )

        assert_dfs_equal(results, expected_result, output_dir=output_dir)
def test_keep_latest_csv():
    """group_by_keep_one keep='latest' against the CSV fixtures."""
    # test latest
    source_df = file_to_dataframe(data_dir / "instr1_primaryall.csv")
    result = source_df.mac.group_by_keep_one(
        group_by_col="pidn", date_col_name="dcdate", keep="latest"
    )
    expected_result = file_to_dataframe(data_dir / "instr1_primarylatest.csv")
    assert_dfs_equal(result, expected_result, cols_ignore=cols_ignore, output_dir=output_dir)
def test_keep_earliest_csv():
    """group_by_keep_one keep='earliest'; also checks the duplicates system
    column is present in the output."""
    # test earliest
    source_df = file_to_dataframe(data_dir / "instr1_primaryall.csv")
    result = source_df.mac.group_by_keep_one(
        group_by_col="pidn", date_col_name="dcdate", keep="earliest"
    )

    # the configured duplicates marker column should appear in the result
    assert get_option("column.system.duplicates") in result.columns

    expected_result = file_to_dataframe(data_dir / "instr1_primaryearliest.csv")
    assert_dfs_equal(result, expected_result, cols_ignore=cols_ignore, output_dir=output_dir)
def test_merge_full():
    """date_proximity with merge='full'."""
    # full merge
    result = primary.mac.date_proximity(
        secondary,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='full',
    )
    # result.to_excel(current_dir / "merge_full_result.xlsx", index=False)

    expected = file_to_dataframe(current_dir / "merge_full_expected_result.xlsx")
    assert_dfs_equal(result, expected, cols_ignore=cols_ignore, output_dir=output_dir)
def test_merge_again(tmp_path):
    """Re-running `macpie merge` on an already-merged-once workbook.

    Equivalent CLI: macpie merge tests/cli/macpie/merge/merge_again/full_merged_once.xlsx

    The full_merged_once.xlsx file was created from the result of the
    tests.cli.merge.test_full.test_full_no_merge test, and then removing the
    first duplicate in each set of duplicates for the instr2_all dataset.
    """
    sheet = MergeableAnchoredList.merged_dsetname

    expected_result = pd.read_excel(
        current_dir / "expected_results.xlsx",
        sheet_name=sheet,
        header=[0, 1],
        index_col=None,
    )

    runner = CliRunner()
    cli_args = ["merge", str((current_dir / "full_merged_once.xlsx").resolve())]

    with runner.isolated_filesystem(temp_dir=tmp_path):
        results = runner.invoke(main, cli_args)
        assert results.exit_code == 0

        # get the results file
        results_path = next(Path(".").glob("**/result*xlsx"))

        # copy file to current dir if you want to debug more
        if output_dir is not None:
            copy(results_path, current_dir)

        results = pd.read_excel(
            results_path,
            sheet_name=sheet,
            header=[0, 1],
            index_col=None,
        )

        assert_dfs_equal(
            results,
            expected_result,
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
def test_add_suffixes_true():
    """mac.merge chained twice with add_suffixes=True."""
    step_one = small_anchor.mac.merge(
        instr2_all_linked,
        left_on=['pidn', 'dcdate', 'instrid'],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=('_a', '_b'),
        add_suffixes=True,
    )
    result = step_one.mac.merge(
        instr3_all_linked,
        left_on=['pidn_a', 'dcdate_a', 'instrid_a'],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=(None, '_c'),
        add_suffixes=True,
    )
    # result.to_excel(current_dir / "add_suffixes_true_result.xlsx", index=False)

    expected_result = pd.read_excel(current_dir / "add_suffixes_true_expected_result.xlsx")
    assert_dfs_equal(result, expected_result, output_dir=output_dir)
def test_left_link_id_blank_merge_partial():
    """date_proximity merge='partial' without specifying left_link_id."""
    # partial merge
    result = primary.mac.date_proximity(
        secondary,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        merge='partial',
    )
    # result.to_excel(current_dir / "left_link_id_blank_merge_partial_result.xlsx", index=False)

    expected = file_to_dataframe(
        current_dir / "left_link_id_blank_merge_partial_expected_result.xlsx"
    )
    assert_dfs_equal(result, expected, cols_ignore=cols_ignore, output_dir=output_dir)
def test_dupes():
    """date_proximity with duplicates_indicator=True on fixtures containing dupes."""
    left = file_to_dataframe(current_dir / "primary.xlsx")
    right = file_to_dataframe(current_dir / "secondary.xlsx")

    dupes_result = left.mac.date_proximity(
        right,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='full',
        duplicates_indicator=True,
    )
    # dupes_result.to_excel(current_dir / "dupes_result.xlsx", index=False)

    dupes_expected_result = file_to_dataframe(current_dir / "dupes_expected_result.xlsx")
    assert_dfs_equal(
        dupes_result, dupes_expected_result, cols_ignore=cols_ignore, output_dir=output_dir
    )
def test_full_no_merge(cli_link_full_no_merge):
    """Per-sheet comparison of an un-merged full link workbook."""
    # copy file to current dir if you want to debug more
    if output_dir is not None:
        copy(cli_link_full_no_merge, current_dir)

    sheets = pd.read_excel(
        cli_link_full_no_merge,
        sheet_name=["full_anchor", "instr2_all_DUPS", "instr3_all_DUPS"],
    )

    # expected_primary / expected_secondary_instr2 / expected_secondary_instr3
    # are presumably module-level fixtures shared by this file's tests
    assert_dfs_equal(sheets["full_anchor"], expected_primary, output_dir=output_dir)
    assert_dfs_equal(
        sheets["instr2_all_DUPS"],
        expected_secondary_instr2,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
    assert_dfs_equal(
        sheets["instr3_all_DUPS"],
        expected_secondary_instr3,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
def test_instr2_small():
    """closest/earlier_or_later/90 against the expected sheet in instr2_small.xlsx."""
    workbook = pd.read_excel(
        current_dir / "instr2_small.xlsx", sheet_name=["primary", "expected_results"]
    )
    anchor = workbook["primary"]
    secondary_instr1 = pd.read_csv(data_dir / "instr1_all.csv", parse_dates=[1])

    # test closest; earlier_or_later; 90 days
    small_result = anchor.mac.date_proximity(
        secondary_instr1,
        id_on="pidn",
        date_on="dcdate",
        get="closest",
        when="earlier_or_later",
        days=90,
        left_link_id="instrid",
    )

    assert_dfs_equal(small_result, workbook["expected_results"], cols_ignore=cols_ignore)
def run(filepath, tmp_path):
    """Drive `macpie merge` on *filepath* and compare the merged sheet to the
    stored small expectation workbook (shared helper for this file's tests)."""
    sheet = MergeableAnchoredList.merged_dsetname

    expected_result = pd.read_excel(
        current_dir / "small_expected_results.xlsx",
        sheet_name=sheet,
        header=[0, 1],
        index_col=None,
    )

    create_available_fields(filepath)

    runner = CliRunner()
    cli_args = ["merge", str(filepath.resolve())]

    with runner.isolated_filesystem(temp_dir=tmp_path):
        results = runner.invoke(main, cli_args)
        assert results.exit_code == 0

        # get the results file
        results_path = next(Path(".").glob("**/result*xlsx"))

        # copy file to current dir if you want to debug more
        if output_dir is not None:
            copy(results_path, current_dir)

        results = pd.read_excel(
            results_path,
            sheet_name=sheet,
            header=[0, 1],
            index_col=None,
        )

        assert_dfs_equal(
            results,
            expected_result,
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
def test_secondary_instr1():
    """closest/earlier_or_later/90 with duplicates indicator, compared against
    the INSTR1_linked sheet of the module-level dfs_dict."""
    secondary_instr1 = file_to_dataframe(data_dir / "instr1_all.csv")

    # test closest; earlier_or_later; 90 days
    instr1_result = primary.mac.date_proximity(
        secondary_instr1,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        duplicates_indicator=True,
    )
    # instr1_result.to_excel(current_dir / "instr1_result.xlsx", index=False)

    instr1_expected_result = dfs_dict['INSTR1_linked']
    assert_dfs_equal(
        instr1_result,
        instr1_expected_result,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
    )
def test_with_index():
    """mac.merge chained twice with add_indexes, producing MultiIndex columns."""
    step_one = small_anchor.mac.merge(
        instr2_all_linked,
        left_on=['pidn', 'dcdate', 'instrid'],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=('_a', '_b'),
        add_suffixes=False,
        add_indexes=('small_anchor', 'instr2_all_linked'),
    )
    result = step_one.mac.merge(
        instr3_all_linked,
        left_on=[
            ('small_anchor', 'pidn'),
            ('small_anchor', 'dcdate'),
            ('small_anchor', 'instrid'),
        ],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=('_a', '_b'),
        add_suffixes=False,
        add_indexes=(None, 'instr3_all_linked'),
    )

    expected_result = pd.read_excel(
        current_dir / "with_index_expected_result.xlsx", index_col=0, header=[0, 1]
    )

    # column structure must match exactly before comparing contents
    assert result.columns.equals(expected_result.columns)
    assert_dfs_equal(result, expected_result, output_dir=output_dir)
def test_small_no_merge(cli_link_small_no_merge):
    """Per-sheet comparison of an un-merged small link workbook."""
    expected_dict = pd.read_excel(
        current_dir / "small.xlsx",
        sheet_name=["LINK_INSTR1", "INSTR2_linked", "INSTR3_linked"],
    )

    # copy file to current dir if you want to debug more
    if output_dir is not None:
        copy(cli_link_small_no_merge, current_dir)

    results_dict = pd.read_excel(
        cli_link_small_no_merge,
        sheet_name=["small_anchor", "instr2_all_linked", "instr3_all_linked"],
    )

    assert_dfs_equal(
        results_dict["small_anchor"],
        expected_dict["LINK_INSTR1"],
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )

    # the secondary comparisons ignore additional columns beyond the
    # module-level cols_ignore list
    cols_ignore2 = [
        get_option("column.system.abs_diff_days"),
        get_option("column.system.diff_days"),
        "PIDN",
        "VType",
        "_merge",
    ]
    assert_dfs_equal(
        results_dict["instr2_all_linked"],
        expected_dict["INSTR2_linked"],
        cols_ignore=cols_ignore2,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
    assert_dfs_equal(
        results_dict["instr3_all_linked"],
        expected_dict["INSTR3_linked"],
        cols_ignore=cols_ignore2,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
def test_instr1():
    """date_proximity over all (get, when) combinations at 90 days.

    The original body repeated the same call/assert pair six times, once per
    expected-result sheet; the loop below covers the same six cases in the
    same order (closest then all; earlier_or_later, later, earlier) against
    the sheet named f"{get}_{when}_90".
    """
    dfs_dict = pd.read_excel(
        data_dir / "instr1.xlsx",
        sheet_name=[
            'primary',
            'closest_earlier_or_later_90',
            'closest_later_90',
            'closest_earlier_90',
            'all_earlier_or_later_90',
            'all_later_90',
            'all_earlier_90',
        ],
    )
    primary = dfs_dict['primary']
    secondary = file_to_dataframe(data_dir / "instr1_all.csv")

    for get in ('closest', 'all'):
        for when in ('earlier_or_later', 'later', 'earlier'):
            # e.g. test closest; earlier_or_later; 90 days
            result = primary.mac.date_proximity(
                secondary,
                id_on='pidn',
                date_on='dcdate',
                get=get,
                when=when,
                days=90,
            )
            # result.to_excel(current_dir / f"{get}_{when}_90_result.xlsx", index=False)
            expected_result = dfs_dict[f"{get}_{when}_90"]
            assert_dfs_equal(
                result,
                expected_result,
                cols_ignore=cols_ignore,
                cols_ignore_pat=cols_ignore_pat,
                output_dir=output_dir,
            )
def test_link():
    """End-to-end link via ExecutableGraph, compared against Dataset.date_proximity.

    Mirrors the CLI invocation:
    macpie link -g closest tests/cli/macpie/link/small.xlsx tests/data/instr2_all.csv tests/data/instr3_all.csv
    """
    prim = LavaDataset.from_file(Path("tests/cli/macpie/link/small.xlsx"))
    sec_1 = LavaDataset.from_file(Path(data_dir / "instr2_all.csv"))
    sec_2 = LavaDataset.from_file(Path(data_dir / "instr3_all.csv"))

    # copies taken before graph execution; used below to compute the
    # expected results independently of the graph run
    prim_copy = deepcopy(prim)
    sec_1_copy = deepcopy(sec_1)
    sec_2_copy = deepcopy(sec_2)

    G = ExecutableGraph()
    # node operation on the primary: group by id2/date, keep="all"
    G.add_node(
        prim,
        operation=partial(
            group_by_keep_one,
            group_by_col=prim.id2_col_name,
            date_col_name=prim.date_col_name,
            keep="all",
            id_col_name=prim.id_col_name,
            drop_duplicates=False,
        ),
    )
    G.add_node(sec_1)
    G.add_node(sec_2)
    # edge operations: link each secondary to the primary by closest date
    # within 90 days, earlier or later
    G.add_edge(
        prim,
        sec_1,
        operation=partial(
            date_proximity,
            id_left_on=prim.id2_col_name,
            id_right_on=sec_1.id2_col_name,
            date_left_on=prim.date_col_name,
            date_right_on=sec_1.date_col_name,
            get="closest",
            when="earlier_or_later",
            days=90,
            left_link_id=prim.id_col_name,
        ),
    )
    G.add_edge(
        prim,
        sec_2,
        operation=partial(
            date_proximity,
            id_left_on=prim.id2_col_name,
            id_right_on=sec_2.id2_col_name,
            date_left_on=prim.date_col_name,
            date_right_on=sec_2.date_col_name,
            get="closest",
            when="earlier_or_later",
            days=90,
            left_link_id=prim.id_col_name,
        ),
    )
    G.execute()

    # indices [0] and [1] below assume edge results come back in the
    # add_edge order above (prim->sec_1, then prim->sec_2) -- TODO confirm
    edges_with_operation_results = G.get_all_edge_data("operation_result")

    sec_1_copy = prim_copy.date_proximity(
        right_dset=sec_1_copy,
        get="closest",
        when="earlier_or_later",
        days=90,
        prepend_level_name=False,
    )
    assert_dfs_equal(
        sec_1_copy,
        edges_with_operation_results[0]["operation_result"],
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )

    sec_2_copy = prim_copy.date_proximity(
        right_dset=sec_2_copy,
        get="closest",
        when="earlier_or_later",
        days=90,
        prepend_level_name=False,
    )
    assert_dfs_equal(
        sec_2_copy,
        edges_with_operation_results[1]["operation_result"],
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )