def test_invalid_case_must_raise_error(data):
    """
    An unsupported case name ('camel') must raise YeastValidationError.

    Construction, prepare and bake all run inside the ``pytest.raises``
    block, so the error may surface at any of those stages.
    """
    with pytest.raises(errors.YeastValidationError):
        steps.CleanColumnNamesStep('camel').prepare(data).bake(data)
def test_snake_case_with_whitespaces_before_and_after(data):
    """
    Snake-case cleaning when names carry leading/trailing whitespace.

    The fixture's columns are overwritten with padded names before the
    step runs; the baked frame is expected to expose the trimmed,
    snake-cased names.
    """
    padded_names = [' series_Name ', ' CreationYear', 'Total Seasons ']
    data.columns = padded_names

    cleaner = steps.CleanColumnNamesStep('snake')
    columns = cleaner.prepare(data).bake(data).columns

    for expected in ('series_name', 'creation_year', 'total_seasons'):
        assert expected in columns
def test_upper_camel_case_cleaning(data):
    """
    Upper camel case cleaning renames the columns to UpperCamelCase.

    Checks that the converted names are present and that the
    differently-styled names are absent from the baked frame.
    """
    cleaner = steps.CleanColumnNamesStep('upper_camel')
    prepared = cleaner.prepare(data)
    columns = prepared.bake(data).columns

    for expected in ('SeriesName', 'CreationYear', 'TotalSeasons'):
        assert expected in columns

    for stale in ('series_Name', 'Total Seasons'):
        assert stale not in columns
def test_snake_case_cleaning(data):
    """
    Snake case cleaning renames the columns to snake_case.

    Checks that the converted names are present and that the
    differently-styled names are absent from the baked frame.
    """
    cleaner = steps.CleanColumnNamesStep('snake')
    prepared = cleaner.prepare(data)
    columns = prepared.bake(data).columns

    for expected in ('series_name', 'creation_year', 'total_seasons'):
        assert expected in columns

    for stale in ('series_Name', 'CreationYear', 'Total Seasons'):
        assert stale not in columns
def test_recipe_workflow(raw_data):
    """
    Sequential execution of a two-step recipe.

    Column names are cleaned to snake_case first, then only
    'creation_year' and 'total_seasons' are selected, so neither the
    cleaned 'series_name' nor any differently-styled name may remain.
    """
    pipeline = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
    ]
    recipe = Recipe(pipeline)

    prepared = recipe.prepare(raw_data)
    columns = prepared.bake(raw_data).columns

    for kept in ('creation_year', 'total_seasons'):
        assert kept in columns

    for dropped in ('series_name', 'series_Name', 'CreationYear', 'Total Seasons'):
        assert dropped not in columns
def test_skip_on_other_set_workflow(raw_data):
    """
    Sequential execution where the FilterStep (role='train') must be
    skipped when baking with a different role.
    """
    pipeline = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year in [2017, 2020]', role='train'),
    ]
    prepared = Recipe(pipeline).prepare(raw_data)

    # No explicit role: the filter is expected to run, leaving 2 rows.
    # NOTE(review): presumably the default bake role is 'train' - confirm.
    train = prepared.bake(raw_data)
    assert train.shape == (2, 2)

    # role='test': the train-only filter is expected to be skipped, so
    # all 6 rows are kept.
    test = prepared.bake(raw_data, role='test')
    assert test.shape == (6, 2)
def test_custom_roles_workflow(raw_data):
    """
    Steps tagged with different roles should be executed differently
    depending on the role passed to ``bake``.
    """
    pipeline = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year == 2020', role='train'),
        steps.SelectColumnsStep(['creation_year'], role='train'),
        steps.SelectColumnsStep(['total_seasons'], role='test'),
    ]
    prepared = Recipe(pipeline).prepare(raw_data)

    # role='train': filter applied (1 row) and only creation_year selected.
    train = prepared.bake(raw_data, role='train')
    assert train.shape == (1, 1)
    assert 'creation_year' in train.columns

    # role='test': train-only filter skipped (6 rows) and only
    # total_seasons selected.
    test = prepared.bake(raw_data, role='test')
    assert test.shape == (6, 1)
    assert 'total_seasons' in test.columns