def test_keep_latest_csv():
    """Run ``group_by_keep_one`` with ``keep="latest"`` and compare to the fixture.

    Rows from ``instr1_primaryall.csv`` are grouped by ``pidn`` and dated by
    ``dcdate``; the outcome must equal ``instr1_primarylatest.csv``.
    """
    source_path = data_dir / "instr1_primaryall.csv"
    all_rows = file_to_dataframe(source_path)

    latest_rows = all_rows.mac.group_by_keep_one(
        group_by_col="pidn",
        date_col_name="dcdate",
        keep="latest",
    )

    expected_path = data_dir / "instr1_primarylatest.csv"
    expected_rows = file_to_dataframe(expected_path)
    assert_dfs_equal(
        latest_rows,
        expected_rows,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )
def test_keepone(cli_keepone_big):
    """Execute ``group_by_keep_one`` as a graph-node operation and check its output.

    ``cli_keepone_big`` is handed to ``file_to_dataframe`` to load the expected
    frame (presumably a pytest fixture providing a file path -- confirm in
    conftest).
    """
    graph = ExecutableGraph()

    dataset = LavaDataset.from_file(data_dir / "instr1_primaryall.csv")

    # Bind the keep-one arguments up front so the node stores a ready-to-run callable.
    keep_earliest = partial(
        group_by_keep_one,
        group_by_col=dataset.id2_col_name,
        date_col_name=dataset.date_col_name,
        keep="earliest",
        drop_duplicates=False,
    )
    graph.add_node(dataset, operation=keep_earliest)
    graph.execute()

    # Only one node was added, so the first entry holds the result under test.
    operation_nodes = graph.get_all_node_data("operation")
    first_node = operation_nodes[0]
    actual = first_node["operation_result"]

    expected = file_to_dataframe(cli_keepone_big)
    assert_dfs_equal(
        actual,
        expected,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
def test_keep_earliest_csv():
    """Run ``group_by_keep_one`` with ``keep="earliest"`` and compare to the fixture.

    Also checks that the column named by the ``column.system.duplicates``
    option is present in the result.
    """
    source_path = data_dir / "instr1_primaryall.csv"
    all_rows = file_to_dataframe(source_path)

    earliest_rows = all_rows.mac.group_by_keep_one(
        group_by_col="pidn",
        date_col_name="dcdate",
        keep="earliest",
    )

    # The duplicates column must be part of the output.
    duplicates_col = get_option("column.system.duplicates")
    assert duplicates_col in earliest_rows.columns

    expected_path = data_dir / "instr1_primaryearliest.csv"
    expected_rows = file_to_dataframe(expected_path)
    assert_dfs_equal(
        earliest_rows,
        expected_rows,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )
def test_dupes():
    """Full-merge ``date_proximity`` with ``duplicates_indicator=True`` vs. the fixture.

    Inputs are ``primary.xlsx`` and ``secondary.xlsx`` from ``current_dir``; the
    expected frame is ``dupes_expected_result.xlsx``.
    """
    left = file_to_dataframe(current_dir / "primary.xlsx")
    right = file_to_dataframe(current_dir / "secondary.xlsx")

    proximity_options = {
        'id_on': 'pidn',
        'date_on': 'dcdate',
        'get': 'closest',
        'when': 'earlier_or_later',
        'days': 90,
        'left_link_id': 'instrid',
        'merge': 'full',
        'duplicates_indicator': True,
    }
    actual = left.mac.date_proximity(right, **proximity_options)

    # Debug aid -- dump the actual frame next to the fixtures:
    # actual.to_excel(current_dir / "dupes_result.xlsx", index=False)

    expected = file_to_dataframe(current_dir / "dupes_expected_result.xlsx")
    assert_dfs_equal(actual, expected, cols_ignore=cols_ignore, output_dir=output_dir)
def test_merge_partial():
    """Partial-merge ``date_proximity`` vs. the fixture, plus parameter equivalence.

    After comparing against ``merge_partial_expected_result.xlsx``, the same
    link is repeated using the split ``id_left_on``/``id_right_on`` and
    ``date_left_on``/``date_right_on`` spellings; both must produce a frame
    equal to the first result.
    """
    # Arguments common to all three calls below.
    shared = {
        'get': 'closest',
        'when': 'earlier_or_later',
        'days': 90,
        'left_link_id': 'instrid',
        'merge': 'partial',
    }

    baseline = primary.mac.date_proximity(
        secondary, id_on='pidn', date_on='dcdate', **shared
    )

    # Debug aid -- dump the actual frame next to the fixtures:
    # baseline.to_excel(current_dir / "merge_partial_result.xlsx", index=False)

    expected = file_to_dataframe(current_dir / "merge_partial_expected_result.xlsx")
    assert_dfs_equal(baseline, expected, cols_ignore=cols_ignore, output_dir=output_dir)

    # Same link expressed with separate left/right id columns.
    via_split_ids = primary.mac.date_proximity(
        secondary, id_left_on='pidn', id_right_on='pidn', date_on='dcdate', **shared
    )
    # Same link expressed with separate left/right date columns.
    via_split_dates = primary.mac.date_proximity(
        secondary, id_on='pidn', date_left_on='dcdate', date_right_on='dcdate', **shared
    )

    assert baseline.equals(via_split_ids)
    assert baseline.equals(via_split_dates)
def test_filter_by_id():
    """Exercise ``filter_by_id`` on ``basic.xlsx`` with a bad and a good id list."""
    frame = file_to_dataframe(current_dir / "basic.xlsx")

    # An entry that is not a valid integer ("hello") must raise ValueError.
    bad_ids = [1, 2, "hello"]
    with pytest.raises(ValueError):
        frame.mac.filter_by_id("pidn", bad_ids)

    # A numeric string ("4") is accepted alongside ints; these three ids are
    # expected to select 4 rows from the workbook.
    good_ids = [2, 3, "4"]
    filtered = frame.mac.filter_by_id("pidn", good_ids)

    # Debug aid -- dump the filtered frame:
    # filtered.to_excel(Path("tests/pandas/operators/filter_by_id/result.xlsx"), index=False)

    assert filtered.mac.row_count() == 4
def test_merge_full():
    """Full-merge ``date_proximity`` of ``secondary`` onto ``primary`` vs. the fixture."""
    proximity_options = {
        'id_on': 'pidn',
        'date_on': 'dcdate',
        'get': 'closest',
        'when': 'earlier_or_later',
        'days': 90,
        'left_link_id': 'instrid',
        'merge': 'full',
    }
    actual = primary.mac.date_proximity(secondary, **proximity_options)

    # Debug aid -- dump the actual frame next to the fixtures:
    # actual.to_excel(current_dir / "merge_full_result.xlsx", index=False)

    expected = file_to_dataframe(current_dir / "merge_full_expected_result.xlsx")
    assert_dfs_equal(actual, expected, cols_ignore=cols_ignore, output_dir=output_dir)
def test_left_link_id_blank_merge_partial():
    """Partial-merge ``date_proximity`` called without ``left_link_id`` vs. the fixture."""
    proximity_options = {
        'id_on': 'pidn',
        'date_on': 'dcdate',
        'get': 'closest',
        'when': 'earlier_or_later',
        'days': 90,
        'merge': 'partial',
    }
    actual = primary.mac.date_proximity(secondary, **proximity_options)

    # Debug aid -- dump the actual frame next to the fixtures:
    # actual.to_excel(current_dir / "left_link_id_blank_merge_partial_result.xlsx", index=False)

    expected_path = current_dir / "left_link_id_blank_merge_partial_expected_result.xlsx"
    expected = file_to_dataframe(expected_path)
    assert_dfs_equal(actual, expected, cols_ignore=cols_ignore, output_dir=output_dir)
def test_secondary_instr1():
    """Link ``instr1_all.csv`` to ``primary`` and compare with the ``INSTR1_linked`` sheet.

    Uses closest match, earlier or later, within 90 days, with the duplicates
    indicator enabled. The expected frame comes from the module-level
    ``dfs_dict``.
    """
    instr1_rows = file_to_dataframe(data_dir / "instr1_all.csv")

    proximity_options = {
        'id_on': 'pidn',
        'date_on': 'dcdate',
        'get': 'closest',
        'when': 'earlier_or_later',
        'days': 90,
        'left_link_id': 'instrid',
        'duplicates_indicator': True,
    }
    actual = primary.mac.date_proximity(instr1_rows, **proximity_options)

    # Debug aid -- dump the actual frame next to the fixtures:
    # actual.to_excel(current_dir / "instr1_result.xlsx", index=False)

    expected = dfs_dict['INSTR1_linked']
    assert_dfs_equal(
        actual,
        expected,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
    )
from pathlib import Path

from macpie.pandas import file_to_dataframe
from macpie.testing import assert_dfs_equal


# Directory containing this test module; the fixture workbooks are read from it.
current_dir = Path(__file__).parent.absolute()

# output_dir = current_dir
output_dir = None

# Input frames shared by the tests in this module, loaded once at import time.
primary = file_to_dataframe(current_dir / "primary.xlsx")
secondary = file_to_dataframe(current_dir / "secondary.xlsx")

# Presumably forwarded to assert_dfs_equal as ``cols_ignore`` -- empty here.
cols_ignore = []


def test_merge_partial():
    """Call ``date_proximity`` on the shared frames with ``merge='partial'``.

    NOTE(review): no assertion follows the call in this view of the file; the
    test body may be truncated here -- confirm against the full source.
    """
    # partial merge
    merge_partial_result = primary.mac.date_proximity(
        secondary,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='partial'
    )
def test_instr1():
    """Check ``date_proximity`` against each expected sheet of ``instr1.xlsx``.

    The ``primary`` sheet is linked to ``instr1_all.csv`` on ``pidn`` /
    ``dcdate`` with a 90-day window, once per combination of ``get``
    (``closest``/``all``) and ``when`` (``earlier_or_later``/``later``/
    ``earlier``). Each result is compared with the sheet of the same name.
    """
    # (expected sheet name, ``get`` argument, ``when`` argument), in run order.
    cases = [
        ('closest_earlier_or_later_90', 'closest', 'earlier_or_later'),
        ('closest_later_90', 'closest', 'later'),
        ('closest_earlier_90', 'closest', 'earlier'),
        ('all_earlier_or_later_90', 'all', 'earlier_or_later'),
        ('all_later_90', 'all', 'later'),
        ('all_earlier_90', 'all', 'earlier'),
    ]

    # Load the anchor sheet and every expected sheet in a single read.
    sheet_names = ['primary'] + [sheet for sheet, _, _ in cases]
    sheets = pd.read_excel(data_dir / "instr1.xlsx", sheet_name=sheet_names)

    anchor = sheets['primary']
    linkable = file_to_dataframe(data_dir / "instr1_all.csv")

    for sheet, get_mode, when_mode in cases:
        actual = anchor.mac.date_proximity(
            linkable,
            id_on='pidn',
            date_on='dcdate',
            get=get_mode,
            when=when_mode,
            days=90,
        )

        # Debug aid -- dump the actual frame next to the fixtures:
        # actual.to_excel(current_dir / f"{sheet}_result.xlsx", index=False)

        assert_dfs_equal(
            actual,
            sheets[sheet],
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
from pathlib import Path import pandas as pd from macpie.pandas import file_to_dataframe from macpie.testing import assert_dfs_equal current_dir = Path("tests/pandas/operators/merge/basic/").resolve() # output_dir = current_dir output_dir = None primary = file_to_dataframe(current_dir / "small.xlsx") dfs_dict = pd.read_excel( current_dir / "small.xlsx", sheet_name=['small_anchor', 'instr2_all_linked', 'instr3_all_linked']) small_anchor = dfs_dict['small_anchor'] instr2_all_linked = dfs_dict['instr2_all_linked'] instr3_all_linked = dfs_dict['instr3_all_linked'] def test_add_suffixes_false(): result = small_anchor.mac.merge( instr2_all_linked, left_on=['pidn', 'dcdate', 'instrid'], right_on=['pidn_x', 'dcdate_x', 'instrid_x'], merge_suffixes=('_a', '_b'), add_suffixes=False).mac.merge( instr3_all_linked,