def test_keepone(cli_keepone_big):
    """Keep-one run through ExecutableGraph matches the CLI keepone fixture output."""
    dset = LavaDataset.from_file(data_dir / "instr1_primaryall.csv")

    graph = ExecutableGraph()
    graph.add_node(
        dset,
        operation=partial(
            group_by_keep_one,
            group_by_col=dset.id2_col_name,
            date_col_name=dset.date_col_name,
            keep="earliest",
            drop_duplicates=False,
        ),
    )
    graph.execute()

    node_data = graph.get_all_node_data("operation")
    result = node_data[0]["operation_result"]
    expected_result = file_to_dataframe(cli_keepone_big)

    assert_dfs_equal(result, expected_result, cols_ignore_pat=cols_ignore_pat, output_dir=output_dir)
def test_small_with_merge(cli_link_small_with_merge):
    """Merged sheet of the small with-merge fixture matches the stored expectation."""
    sheet = MergeableAnchoredList.merged_dsetname

    # copy file to current dir if you want to debug more
    if output_dir is not None:
        copy(cli_link_small_with_merge, current_dir)

    results = pd.read_excel(
        cli_link_small_with_merge,
        sheet_name=sheet,
        index_col=None,
        header=[0, 1],
    )
    expected_result = pd.read_excel(
        current_dir / "small_with_merge_expected_result.xlsx",
        sheet_name=sheet,
        index_col=None,
        header=[0, 1],
    )

    assert_dfs_equal(
        results,
        expected_result,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
def test_small_link_suffixes(tmp_path):
    """`macpie link` honors a custom column-suffix option.

    Equivalent CLI:
    macpie link -g closest tests/cli/macpie/link/small.xlsx tests/data/instr2_all.csv tests/data/instr3_all.csv  # noqa: E501
    """
    runner = CliRunner()
    cli_args = [
        "--id2-col", "pidn",
        "--date-col", "dcdate",
        "link",
        "--primary-keep", "all",
        "--secondary-get", "closest",
        # pass as a string: click expects every element of the args list to be a string
        "--secondary-days", "90",
        "--secondary-when", "earlier_or_later",
        str((current_dir / "small.xlsx").resolve()),
        str((data_dir / "instr2_all.csv").resolve()),
        str((data_dir / "instr3_all.csv").resolve()),
    ]

    set_option("operators.binary.column_suffixes", ("_link", "_y"))
    try:
        with runner.isolated_filesystem(temp_dir=tmp_path):
            results = runner.invoke(main, cli_args)
            assert results.exit_code == 0

            # get the results file
            results_path = next(Path(".").glob("**/result*xlsx")).resolve()

            expected_result = pd.read_excel(
                current_dir / "small_link_suffixes_expected_result.xlsx",
                sheet_name=MergeableAnchoredList.merged_dsetname,
                index_col=None,
                header=[0, 1],
            )

            # copy file to current dir if you want to debug more
            if output_dir is not None:
                copy(results_path, current_dir)

            results = pd.read_excel(
                results_path,
                sheet_name=MergeableAnchoredList.merged_dsetname,
                index_col=None,
                header=[0, 1],
            )

            assert_dfs_equal(
                results,
                expected_result,
                cols_ignore=cols_ignore,
                cols_ignore_pat=cols_ignore_pat,
                output_dir=output_dir,
            )
    finally:
        # always restore the global option; without the try/finally a failing
        # assertion would leak the custom suffixes into every later test
        reset_option("operators.binary.column_suffixes")
def test_full_no_link_id(tmp_path):
    """`macpie link` on an anchor workbook lacking a link-id column.

    Equivalent CLI:
    macpie link -g closest tests/cli/macpie/link/full_no_link_id.xlsx tests/data/instr2_all.csv tests/data/instr3_all.csv
    """
    runner = CliRunner()
    cli_args = [
        "--id2-col", "pidn",
        "--date-col", "dcdate",
        "link",
        "--primary-keep", "all",
        "--secondary-get", "closest",
        # pass as a string: click expects every element of the args list to be a string
        "--secondary-days", "90",
        "--secondary-when", "earlier_or_later",
        str((current_dir / "full_no_link_id.xlsx").resolve()),
        str((data_dir / "instr2_all.csv").resolve()),
        str((data_dir / "instr3_all.csv").resolve()),
    ]

    with runner.isolated_filesystem(temp_dir=tmp_path):
        results = runner.invoke(main, cli_args)
        assert results.exit_code == 0

        # get the results file
        results_path = next(Path(".").glob("**/result*xlsx"))

        # copy file to current dir if you want to debug more
        if output_dir is not None:
            copy(results_path, current_dir)

        results_dict = pd.read_excel(
            results_path, sheet_name=["instr2_all_DUPS", "instr3_all_DUPS"]
        )
        result_secondary_instr2 = results_dict["instr2_all_DUPS"]
        result_secondary_instr3 = results_dict["instr3_all_DUPS"]

        # expected_secondary_instr2/3 are presumably module-level fixtures
        # shared by this file's tests -- defined outside this block
        assert_dfs_equal(
            result_secondary_instr2,
            expected_secondary_instr2,
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
        assert_dfs_equal(
            result_secondary_instr3,
            expected_secondary_instr3,
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
def test_cli_keepone(cli_keepone_big):
    """CLI keepone output matches the stored expected workbook."""
    # copy file to current dir if you want to debug more
    if output_dir is not None:
        copy(cli_keepone_big, current_dir)

    actual = file_to_dataframe(cli_keepone_big)
    expected = file_to_dataframe(current_dir / "expected_result.xlsx")
    assert_dfs_equal(actual, expected, output_dir=output_dir)
def test_merge_partial():
    """date_proximity with merge='partial'; also verifies the split id/date
    parameter spellings produce identical results."""
    shared_kwargs = dict(
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='partial',
    )

    # partial merge
    merge_partial_result = primary.mac.date_proximity(
        secondary, id_on='pidn', date_on='dcdate', **shared_kwargs
    )
    # merge_partial_result.to_excel(current_dir / "merge_partial_result.xlsx", index=False)

    merge_partial_expected_result = file_to_dataframe(
        current_dir / "merge_partial_expected_result.xlsx"
    )
    assert_dfs_equal(
        merge_partial_result,
        merge_partial_expected_result,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )

    # test using id_left_on and id_right_on params
    test_id_on_params = primary.mac.date_proximity(
        secondary, id_left_on='pidn', id_right_on='pidn', date_on='dcdate', **shared_kwargs
    )
    assert merge_partial_result.equals(test_id_on_params)

    # test using date_left_on and date_right_on params
    test_date_on_params = primary.mac.date_proximity(
        secondary, id_on='pidn', date_left_on='dcdate', date_right_on='dcdate', **shared_kwargs
    )
    assert merge_partial_result.equals(test_date_on_params)
def test_full_no_merge(cli_link_full_no_merge, tmp_path):
    """`macpie merge` over a not-yet-merged full link workbook: checks the
    expected sheet set and the merged sheet contents."""
    workbook_copy = Path(copy(cli_link_full_no_merge, tmp_path))

    expected_result = pd.read_excel(
        current_dir / "full_expected_results.xlsx",
        sheet_name=MergeableAnchoredList.merged_dsetname,
        header=[0, 1],
        index_col=None,
    )

    create_available_fields(workbook_copy)

    runner = CliRunner()
    cli_args = ["merge", str(workbook_copy.resolve())]

    with runner.isolated_filesystem(temp_dir=tmp_path):
        results = runner.invoke(main, cli_args)
        assert results.exit_code == 0

        # get the results file
        results_path = next(Path(".").glob("**/result*xlsx"))

        # copy file to current dir if you want to debug more
        if output_dir is not None:
            copy(results_path, current_dir)

        results_wb = pyxl.load_workbook(results_path, read_only=True, data_only=True)

        expected_sheetnames = [
            "instr2_all_DUPS",
            "instr3_all_DUPS",
            COLLECTION_SHEET_NAME,
            DATASETS_SHEET_NAME,
            MergeableAnchoredList.available_fields_sheetname,
            MergeableAnchoredList.merged_dsetname,
        ]
        for sheetname in expected_sheetnames:
            assert sheetname in results_wb.sheetnames

        results = pd.read_excel(
            results_path,
            sheet_name=MergeableAnchoredList.merged_dsetname,
            header=[0, 1],
            index_col=None,
        )

        assert_dfs_equal(results, expected_result, output_dir=output_dir)
def test_keep_latest_csv():
    """group_by_keep_one keep='latest' against the CSV fixtures."""
    # test latest
    source_df = file_to_dataframe(data_dir / "instr1_primaryall.csv")
    result = source_df.mac.group_by_keep_one(
        group_by_col="pidn", date_col_name="dcdate", keep="latest"
    )
    expected_result = file_to_dataframe(data_dir / "instr1_primarylatest.csv")
    assert_dfs_equal(result, expected_result, cols_ignore=cols_ignore, output_dir=output_dir)
def test_keep_earliest_csv():
    """group_by_keep_one keep='earliest'; also checks the duplicates system
    column is present in the output."""
    # test earliest
    source_df = file_to_dataframe(data_dir / "instr1_primaryall.csv")
    result = source_df.mac.group_by_keep_one(
        group_by_col="pidn", date_col_name="dcdate", keep="earliest"
    )

    # the configured duplicates marker column should appear in the result
    assert get_option("column.system.duplicates") in result.columns

    expected_result = file_to_dataframe(data_dir / "instr1_primaryearliest.csv")
    assert_dfs_equal(result, expected_result, cols_ignore=cols_ignore, output_dir=output_dir)
def test_merge_full():
    """date_proximity with merge='full'."""
    # full merge
    result = primary.mac.date_proximity(
        secondary,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='full',
    )
    # result.to_excel(current_dir / "merge_full_result.xlsx", index=False)

    expected = file_to_dataframe(current_dir / "merge_full_expected_result.xlsx")
    assert_dfs_equal(result, expected, cols_ignore=cols_ignore, output_dir=output_dir)
def test_merge_again(tmp_path):
    """Re-running `macpie merge` on an already-merged-once workbook.

    Equivalent CLI: macpie merge tests/cli/macpie/merge/merge_again/full_merged_once.xlsx

    The full_merged_once.xlsx file was created from the result of the
    tests.cli.merge.test_full.test_full_no_merge test, and then removing the
    first duplicate in each set of duplicates for the instr2_all dataset.
    """
    sheet = MergeableAnchoredList.merged_dsetname

    expected_result = pd.read_excel(
        current_dir / "expected_results.xlsx",
        sheet_name=sheet,
        header=[0, 1],
        index_col=None,
    )

    runner = CliRunner()
    cli_args = ["merge", str((current_dir / "full_merged_once.xlsx").resolve())]

    with runner.isolated_filesystem(temp_dir=tmp_path):
        results = runner.invoke(main, cli_args)
        assert results.exit_code == 0

        # get the results file
        results_path = next(Path(".").glob("**/result*xlsx"))

        # copy file to current dir if you want to debug more
        if output_dir is not None:
            copy(results_path, current_dir)

        results = pd.read_excel(
            results_path,
            sheet_name=sheet,
            header=[0, 1],
            index_col=None,
        )

        assert_dfs_equal(
            results,
            expected_result,
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
def test_add_suffixes_true():
    """mac.merge chained twice with add_suffixes=True."""
    step_one = small_anchor.mac.merge(
        instr2_all_linked,
        left_on=['pidn', 'dcdate', 'instrid'],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=('_a', '_b'),
        add_suffixes=True,
    )
    result = step_one.mac.merge(
        instr3_all_linked,
        left_on=['pidn_a', 'dcdate_a', 'instrid_a'],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=(None, '_c'),
        add_suffixes=True,
    )
    # result.to_excel(current_dir / "add_suffixes_true_result.xlsx", index=False)

    expected_result = pd.read_excel(current_dir / "add_suffixes_true_expected_result.xlsx")
    assert_dfs_equal(result, expected_result, output_dir=output_dir)
def test_left_link_id_blank_merge_partial():
    """date_proximity merge='partial' without specifying left_link_id."""
    # partial merge
    result = primary.mac.date_proximity(
        secondary,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        merge='partial',
    )
    # result.to_excel(current_dir / "left_link_id_blank_merge_partial_result.xlsx", index=False)

    expected = file_to_dataframe(
        current_dir / "left_link_id_blank_merge_partial_expected_result.xlsx"
    )
    assert_dfs_equal(result, expected, cols_ignore=cols_ignore, output_dir=output_dir)
def test_dupes():
    """date_proximity with duplicates_indicator=True on fixtures containing dupes."""
    left = file_to_dataframe(current_dir / "primary.xlsx")
    right = file_to_dataframe(current_dir / "secondary.xlsx")

    dupes_result = left.mac.date_proximity(
        right,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='full',
        duplicates_indicator=True,
    )
    # dupes_result.to_excel(current_dir / "dupes_result.xlsx", index=False)

    dupes_expected_result = file_to_dataframe(current_dir / "dupes_expected_result.xlsx")
    assert_dfs_equal(
        dupes_result, dupes_expected_result, cols_ignore=cols_ignore, output_dir=output_dir
    )
def test_full_no_merge(cli_link_full_no_merge):
    """Per-sheet comparison of an un-merged full link workbook."""
    # copy file to current dir if you want to debug more
    if output_dir is not None:
        copy(cli_link_full_no_merge, current_dir)

    sheets = pd.read_excel(
        cli_link_full_no_merge,
        sheet_name=["full_anchor", "instr2_all_DUPS", "instr3_all_DUPS"],
    )

    # expected_primary / expected_secondary_instr2 / expected_secondary_instr3
    # are presumably module-level fixtures shared by this file's tests
    assert_dfs_equal(sheets["full_anchor"], expected_primary, output_dir=output_dir)
    assert_dfs_equal(
        sheets["instr2_all_DUPS"],
        expected_secondary_instr2,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
    assert_dfs_equal(
        sheets["instr3_all_DUPS"],
        expected_secondary_instr3,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
def test_instr2_small():
    """closest/earlier_or_later/90 against the expected sheet in instr2_small.xlsx."""
    workbook = pd.read_excel(
        current_dir / "instr2_small.xlsx", sheet_name=["primary", "expected_results"]
    )
    anchor = workbook["primary"]
    secondary_instr1 = pd.read_csv(data_dir / "instr1_all.csv", parse_dates=[1])

    # test closest; earlier_or_later; 90 days
    small_result = anchor.mac.date_proximity(
        secondary_instr1,
        id_on="pidn",
        date_on="dcdate",
        get="closest",
        when="earlier_or_later",
        days=90,
        left_link_id="instrid",
    )

    assert_dfs_equal(small_result, workbook["expected_results"], cols_ignore=cols_ignore)
def run(filepath, tmp_path):
    """Drive `macpie merge` on *filepath* and compare the merged sheet to the
    stored small expectation workbook (shared helper for this file's tests)."""
    sheet = MergeableAnchoredList.merged_dsetname

    expected_result = pd.read_excel(
        current_dir / "small_expected_results.xlsx",
        sheet_name=sheet,
        header=[0, 1],
        index_col=None,
    )

    create_available_fields(filepath)

    runner = CliRunner()
    cli_args = ["merge", str(filepath.resolve())]

    with runner.isolated_filesystem(temp_dir=tmp_path):
        results = runner.invoke(main, cli_args)
        assert results.exit_code == 0

        # get the results file
        results_path = next(Path(".").glob("**/result*xlsx"))

        # copy file to current dir if you want to debug more
        if output_dir is not None:
            copy(results_path, current_dir)

        results = pd.read_excel(
            results_path,
            sheet_name=sheet,
            header=[0, 1],
            index_col=None,
        )

        assert_dfs_equal(
            results,
            expected_result,
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
def test_secondary_instr1():
    """closest/earlier_or_later/90 with duplicates indicator, compared against
    the INSTR1_linked sheet of the module-level dfs_dict."""
    secondary_instr1 = file_to_dataframe(data_dir / "instr1_all.csv")

    # test closest; earlier_or_later; 90 days
    instr1_result = primary.mac.date_proximity(
        secondary_instr1,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        duplicates_indicator=True,
    )
    # instr1_result.to_excel(current_dir / "instr1_result.xlsx", index=False)

    instr1_expected_result = dfs_dict['INSTR1_linked']
    assert_dfs_equal(
        instr1_result,
        instr1_expected_result,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
    )
def test_with_index():
    """mac.merge chained twice with add_indexes, producing MultiIndex columns."""
    step_one = small_anchor.mac.merge(
        instr2_all_linked,
        left_on=['pidn', 'dcdate', 'instrid'],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=('_a', '_b'),
        add_suffixes=False,
        add_indexes=('small_anchor', 'instr2_all_linked'),
    )
    result = step_one.mac.merge(
        instr3_all_linked,
        left_on=[
            ('small_anchor', 'pidn'),
            ('small_anchor', 'dcdate'),
            ('small_anchor', 'instrid'),
        ],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=('_a', '_b'),
        add_suffixes=False,
        add_indexes=(None, 'instr3_all_linked'),
    )

    expected_result = pd.read_excel(
        current_dir / "with_index_expected_result.xlsx", index_col=0, header=[0, 1]
    )

    # column structure must match exactly before comparing contents
    assert result.columns.equals(expected_result.columns)
    assert_dfs_equal(result, expected_result, output_dir=output_dir)
def test_small_no_merge(cli_link_small_no_merge):
    """Per-sheet comparison of an un-merged small link workbook."""
    expected_dict = pd.read_excel(
        current_dir / "small.xlsx",
        sheet_name=["LINK_INSTR1", "INSTR2_linked", "INSTR3_linked"],
    )

    # copy file to current dir if you want to debug more
    if output_dir is not None:
        copy(cli_link_small_no_merge, current_dir)

    results_dict = pd.read_excel(
        cli_link_small_no_merge,
        sheet_name=["small_anchor", "instr2_all_linked", "instr3_all_linked"],
    )

    assert_dfs_equal(
        results_dict["small_anchor"],
        expected_dict["LINK_INSTR1"],
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )

    # the secondary comparisons ignore additional columns beyond the
    # module-level cols_ignore list
    cols_ignore2 = [
        get_option("column.system.abs_diff_days"),
        get_option("column.system.diff_days"),
        "PIDN",
        "VType",
        "_merge",
    ]
    assert_dfs_equal(
        results_dict["instr2_all_linked"],
        expected_dict["INSTR2_linked"],
        cols_ignore=cols_ignore2,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
    assert_dfs_equal(
        results_dict["instr3_all_linked"],
        expected_dict["INSTR3_linked"],
        cols_ignore=cols_ignore2,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
def test_instr1():
    """date_proximity over all (get, when) combinations at 90 days.

    The original body repeated the same call/assert pair six times, once per
    expected-result sheet; the loop below covers the same six cases in the
    same order (closest then all; earlier_or_later, later, earlier) against
    the sheet named f"{get}_{when}_90".
    """
    dfs_dict = pd.read_excel(
        data_dir / "instr1.xlsx",
        sheet_name=[
            'primary',
            'closest_earlier_or_later_90',
            'closest_later_90',
            'closest_earlier_90',
            'all_earlier_or_later_90',
            'all_later_90',
            'all_earlier_90',
        ],
    )
    primary = dfs_dict['primary']
    secondary = file_to_dataframe(data_dir / "instr1_all.csv")

    for get in ('closest', 'all'):
        for when in ('earlier_or_later', 'later', 'earlier'):
            # e.g. test closest; earlier_or_later; 90 days
            result = primary.mac.date_proximity(
                secondary,
                id_on='pidn',
                date_on='dcdate',
                get=get,
                when=when,
                days=90,
            )
            # result.to_excel(current_dir / f"{get}_{when}_90_result.xlsx", index=False)
            expected_result = dfs_dict[f"{get}_{when}_90"]
            assert_dfs_equal(
                result,
                expected_result,
                cols_ignore=cols_ignore,
                cols_ignore_pat=cols_ignore_pat,
                output_dir=output_dir,
            )
def test_link():
    """End-to-end link via ExecutableGraph, compared against Dataset.date_proximity.

    Mirrors the CLI invocation:
    macpie link -g closest tests/cli/macpie/link/small.xlsx tests/data/instr2_all.csv tests/data/instr3_all.csv
    """
    prim = LavaDataset.from_file(Path("tests/cli/macpie/link/small.xlsx"))
    sec_1 = LavaDataset.from_file(Path(data_dir / "instr2_all.csv"))
    sec_2 = LavaDataset.from_file(Path(data_dir / "instr3_all.csv"))

    # copies taken before graph execution; used below to compute the
    # expected results independently of the graph run
    prim_copy = deepcopy(prim)
    sec_1_copy = deepcopy(sec_1)
    sec_2_copy = deepcopy(sec_2)

    G = ExecutableGraph()
    # node operation on the primary: group by id2/date, keep="all"
    G.add_node(
        prim,
        operation=partial(
            group_by_keep_one,
            group_by_col=prim.id2_col_name,
            date_col_name=prim.date_col_name,
            keep="all",
            id_col_name=prim.id_col_name,
            drop_duplicates=False,
        ),
    )
    G.add_node(sec_1)
    G.add_node(sec_2)
    # edge operations: link each secondary to the primary by closest date
    # within 90 days, earlier or later
    G.add_edge(
        prim,
        sec_1,
        operation=partial(
            date_proximity,
            id_left_on=prim.id2_col_name,
            id_right_on=sec_1.id2_col_name,
            date_left_on=prim.date_col_name,
            date_right_on=sec_1.date_col_name,
            get="closest",
            when="earlier_or_later",
            days=90,
            left_link_id=prim.id_col_name,
        ),
    )
    G.add_edge(
        prim,
        sec_2,
        operation=partial(
            date_proximity,
            id_left_on=prim.id2_col_name,
            id_right_on=sec_2.id2_col_name,
            date_left_on=prim.date_col_name,
            date_right_on=sec_2.date_col_name,
            get="closest",
            when="earlier_or_later",
            days=90,
            left_link_id=prim.id_col_name,
        ),
    )
    G.execute()

    # indices [0] and [1] below assume edge results come back in the
    # add_edge order above (prim->sec_1, then prim->sec_2) -- TODO confirm
    edges_with_operation_results = G.get_all_edge_data("operation_result")

    sec_1_copy = prim_copy.date_proximity(
        right_dset=sec_1_copy,
        get="closest",
        when="earlier_or_later",
        days=90,
        prepend_level_name=False,
    )
    assert_dfs_equal(
        sec_1_copy,
        edges_with_operation_results[0]["operation_result"],
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )

    sec_2_copy = prim_copy.date_proximity(
        right_dset=sec_2_copy,
        get="closest",
        when="earlier_or_later",
        days=90,
        prepend_level_name=False,
    )
    assert_dfs_equal(
        sec_2_copy,
        edges_with_operation_results[1]["operation_result"],
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )