def test_join_match_case_param_success():
    """
    Inner join on 'name' with match_case=False and right keys dropped.

    Note kept from the original author: match case lower-cases a column,
    appends '_lower' to its name and finally drops that column (which
    seems redundant).
    """
    left = util.titanic(['name', 'embarked'], size=10)
    right = util.titanic(['homedest', 'name'], size=10)
    expected = util.titanic(['name', 'embarked', 'homedest'], size=10)

    parameters = {
        'keep_right_keys': False,
        'match_case': False,
        'join_type': 'inner',
        'left_attributes': ['name'],
        'right_attributes': ['name'],
    }
    operation = JoinOperation(
        parameters=parameters,
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    namespace = {'df1': left, 'df2': right}
    result = util.execute(operation.generate_code(), namespace)

    # Expected output names carry '_l' (left input) / '_r' (right input).
    expected.columns = ['name_l', 'embarked_l', 'homedest_r']
    assert result['out'].equals(expected)
def test_join_krk_param_success():
    """
    Inner join on 'name' with keep_right_keys enabled.

    The expected frame contains the key twice: once from the left input
    ('name_l') and once from the right input ('name_r').
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    expected = util.titanic(
        ['name', 'homedest', 'embarked', 'name'], size=10)

    # keep_right_keys / match_case are deliberately passed as 1 and '1'
    # (not booleans), exactly as the test has always supplied them.
    parameters = {
        'keep_right_keys': 1,
        'match_case': '1',
        'join_type': 'inner',
        'left_attributes': ['name'],
        'right_attributes': ['name'],
    }
    operation = JoinOperation(
        parameters=parameters,
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    namespace = {'df1': left, 'df2': right}
    result = util.execute(operation.generate_code(), namespace)

    expected.columns = ['name_l', 'homedest_l', 'embarked_r', 'name_r']
    assert result['out'].equals(expected)
def test_join_outer_replace_success():
    """
    Executing a join whose join_type is '_outer' must raise KeyError.

    Note kept from the original author: this only happens when you pass
    '_outer'.
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)

    parameters = {
        'keep_right_keys': False,
        'match_case': True,
        'join_type': '_outer',
        'left_attributes': ['homedest'],
        'right_attributes': ['embarked'],
    }
    operation = JoinOperation(
        parameters=parameters,
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    namespace = {'df1': left, 'df2': right}

    with pytest.raises(KeyError) as key_err:
        util.execute(operation.generate_code(), namespace)
    # NOTE(review): '' is a substring of every string, so this check adds
    # nothing beyond the pytest.raises above; consider asserting on the
    # real message once it has been confirmed.
    assert '' in str(key_err.value)
def test_join_custom_suffixes_success():
    """
    Inner join on 'name' using the 'aliases' parameter.

    With aliases '_esquerdo,_direito' the expected columns end in
    '_esquerdo' (left input) and '_direito' (right input).
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    expected = util.titanic(['name', 'homedest', 'embarked'], size=10)

    parameters = {
        'keep_right_keys': False,
        'match_case': True,
        'join_type': 'inner',
        'left_attributes': ['name'],
        'right_attributes': ['name'],
        'aliases': '_esquerdo,_direito',
    }
    operation = JoinOperation(
        parameters=parameters,
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    namespace = {'df1': left, 'df2': right}
    result = util.execute(operation.generate_code(), namespace)

    expected.columns = [
        'name_esquerdo',
        'homedest_esquerdo',
        'embarked_direito',
    ]
    assert result['out'].equals(expected)
def test_aggregation_non_numeric_attributes_success():
    """
    Group by the non-numeric 'homedest' column using every function from
    return_funcs except 'avg', and compare against a hand-built pandas
    named aggregation.
    """
    df = util.titanic(['homedest'], size=150)
    # Snapshot taken before the operation runs; the expectation is built
    # from this copy.
    baseline = df.copy()

    operation = AggregationOperation(
        parameters={
            'attributes': ['homedest'],
            'function': return_funcs('homedest', drop='avg'),
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    result = util.execute(operation.generate_code(), {'df': df})

    # Output column name -> (source column, aggregation), in output order.
    named_aggregations = {
        'home_collect_list': ('homedest', _collect_list),
        'home_collect_set': ('homedest', _collect_set),
        'home_count': ('homedest', 'count'),
        'home_first': ('homedest', 'first'),
        'home_last': ('homedest', 'last'),
        'home_max': ('homedest', 'max'),
        'home_min': ('homedest', 'min'),
        'home_sum': ('homedest', 'sum'),
        'home_size': ('homedest', 'size'),
    }
    grouped = baseline.groupby(['homedest'])
    expected = grouped.agg(**named_aggregations).reset_index()

    assert result['out'].equals(expected)
def test_join_merge_outer_parameter_success():
    """
    Outer join on differently named keys ('homedest' vs. 'embarked') must
    equal a hand-built pd.merge with the right-side key dropped.

    Note kept from the original author: there is a line of code that
    replaces '_outer' with ''.
    """
    df1 = util.titanic(['name', 'homedest'], size=10)
    df2 = util.titanic(['embarked', 'name'], size=10)
    # Copies taken before the operation runs; the expectation is built
    # from them.
    left = df1.copy()
    right = df2.copy()

    parameters = {
        'keep_right_keys': False,
        'match_case': True,
        'join_type': 'outer',
        'left_attributes': ['homedest'],
        'right_attributes': ['embarked'],
    }
    operation = JoinOperation(
        parameters=parameters,
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    result = util.execute(operation.generate_code(),
                          {'df1': df1, 'df2': df2})

    # Suffix every column up front so the merge keys are already the
    # suffixed names.
    left.columns = [col + '_l' for col in left.columns]
    right.columns = [col + '_r' for col in right.columns]
    left_keys = ['homedest_l']
    right_keys = ['embarked_r']

    merged = pd.merge(left, right, how='outer', suffixes=['_l', '_r'],
                      left_on=left_keys, right_on=right_keys)
    # keep_right_keys is False, so the right-side key column is removed.
    expected = merged.drop(columns=right_keys)

    assert result['out'].equals(expected)
def test_join_invalid_right_attributes_param_fail():
    """
    Passing the plain string 'invalid' as right_attributes (instead of a
    list) must make the generated code raise a NameError mentioning
    "invalid".
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)

    parameters = {
        'keep_right_keys': False,
        'match_case': False,
        'join_type': 'inner',
        'left_attributes': ['homedest'],
        'right_attributes': 'invalid',
    }
    operation = JoinOperation(
        parameters=parameters,
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    namespace = {'df1': left, 'df2': right}

    with pytest.raises(NameError) as nam_err:
        util.execute(operation.generate_code(), namespace)
    message = str(nam_err.value)
    assert "invalid" in message
def test_aggregation_non_numeric_attributes_fail():
    """
    Aggregating the non-numeric 'homedest' column with everything
    return_funcs('homedest') yields (nothing dropped) must raise pandas'
    DataError with "No numeric types to aggregate".
    """
    df = util.titanic(['homedest'], size=150)

    operation = AggregationOperation(
        parameters={
            'attributes': ['homedest'],
            'function': return_funcs('homedest'),
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(pd.core.base.DataError) as data_err:
        util.execute(operation.generate_code(), {'df': df})
    message = str(data_err.value)
    assert "No numeric types to aggregate" in message