def test_skip_on_other_set_workflow(raw_data):
    """
    Sequential execution of a recipe in which one step (FilterStep) is
    bound to the 'train' role and must be skipped when baking new data
    under a different role.
    """
    pipeline = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year in [2017, 2020]', role='train'),
    ]
    recipe = Recipe(pipeline).prepare(raw_data)

    # Baking without an explicit role: only the 2 rows matching the filter
    # remain, so the FilterStep ran (presumably the default role is
    # 'train' -- confirm against Recipe.bake).
    train = recipe.bake(raw_data)
    assert train.shape == (2, 2)

    # Baking with role='test': all 6 rows survive, i.e. the train-only
    # FilterStep was skipped.
    test = recipe.bake(raw_data, role='test')
    assert test.shape == (6, 2)
def test_custom_roles_workflow(raw_data):
    """
    Steps tagged with different roles must be executed only when the
    recipe is baked with the matching role.
    """
    shared_columns = ['creation_year', 'total_seasons']
    pipeline = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(shared_columns),
        steps.FilterStep('creation_year == 2020', role='train'),
        steps.SelectColumnsStep(['creation_year'], role='train'),
        steps.SelectColumnsStep(['total_seasons'], role='test'),
    ]
    recipe = Recipe(pipeline).prepare(raw_data)

    # 'train' role: the filter keeps a single row and the train-only
    # selection keeps a single column.
    train = recipe.bake(raw_data, role='train')
    assert train.shape == (1, 1)
    assert 'creation_year' in train.columns

    # 'test' role: the train-only filter is skipped (all 6 rows remain)
    # and the test-only selection keeps a single column.
    test = recipe.bake(raw_data, role='test')
    assert test.shape == (6, 1)
    assert 'total_seasons' in test.columns
def test_rownumber_mutation_using_a_lambda_after_groupby(startrek_data):
    """
    Use Mutate after a GroupBy with a lambda to create a new column holding
    the row number inside each group (rank of 'rating' within 'seasons'),
    then sort by 'seasons' and 'row_number':

    ```
    title              year        row_number
    Picard             2020        1.0
    Discovery          2017        1.0
    Enterprise         2001        1.0
    Voyager            1995        1.0
    Deep Space Nine    1993        2.0
    TNG                1987        3.0
    ```
    """
    recipe = Recipe([
        GroupByStep('seasons'),
        MutateStep({
            'row_number': lambda df: df['rating'].rank(method="first"),
        }),
        SortStep(['seasons', 'row_number']),
    ])
    baked = recipe.bake(startrek_data)

    # (seasons, row_number) expected at index labels 0..5, in that order.
    expected = [(1, 1), (2, 1), (4, 1), (7, 1), (7, 2), (7, 3)]
    for label, (seasons, row_number) in enumerate(expected):
        # .loc is label-based: rows are looked up by index label, exactly
        # as the individual assertions did before.
        assert baked[['seasons', 'row_number']].loc[label]['seasons'] == seasons
        assert baked[['seasons', 'row_number']].loc[label]['row_number'] == row_number
def test_rownumber_mutation_using_a_transformer_on_dataframe(startrek_data):
    """
    Apply the RowNumber transformer on 'year' without any group by, then
    sort by 'year':

    ```
    title              year        row_number
    TNG                1987        1.0
    Deep Space Nine    1993        2.0
    Voyager            1995        3.0
    Enterprise         2001        4.0
    Discovery          2017        5.0
    Picard             2020        6.0
    ```
    """
    recipe = Recipe([
        MutateStep({'row_number': RowNumber('year')}),
        SortStep('year'),
    ])
    baked = recipe.bake(startrek_data)

    # Years expected at index labels 0..5; the row number expected at
    # label N is N + 1.
    expected_years = [1987, 1993, 1995, 2001, 2017, 2020]
    for label, year in enumerate(expected_years):
        # .loc is label-based: rows are looked up by index label, exactly
        # as the individual assertions did before.
        assert baked[['year', 'row_number']].loc[label]['year'] == year
        assert baked[['year', 'row_number']].loc[label]['row_number'] == label + 1