示例#1
0
def test_group_by_and_summarize_should_return_a_ready_to_use_dataframe(data):
    """
    Group by `seasons`, summarize with several aggregations and check that
    the grouping key and every summarized column show up in the baked frame.
    """
    group_step = GroupByStep(['seasons'])
    summarize_step = SummarizeStep({
        'rating_mean': AggMean('rating'),
        'rating_median': AggMedian('rating'),
        'year_max': AggMax('year'),
        'year_min': AggMin('year'),
        'watched_count': AggSum('watched'),
        'years_count': AggCount('year'),
        'years_unique_count': AggCountDistinct('year')
    })
    baked_df = Recipe([group_step, summarize_step]).prepare(data).bake(data)

    # The grouping key first, then one column per aggregation defined above
    expected_columns = (
        'seasons',
        'rating_mean',
        'rating_median',
        'year_max',
        'year_min',
        'watched_count',
        'years_count',
        'years_unique_count',
    )
    for column in expected_columns:
        assert column in baked_df.columns
示例#2
0
def test_right_join_with_using_a_recipe(startrek_starships,
                                        startrek_starships_specs):
    """
    Right join where the right-hand side is another Recipe and the data it
    works on is handed over through the `df` argument.
    """
    specs_recipe = Recipe([SortStep('uid')])
    join_step = RightJoinStep(specs_recipe, by='uid', df=startrek_starships_specs)
    ships_recipe = Recipe([join_step, SortStep('uid')])

    bdf = ships_recipe.prepare(startrek_starships).bake(startrek_starships)

    assert bdf.shape == (4, 3)

    # Rows labelled 0 and 3 of the sorted result, keyed by their index label
    expected_rows = {
        0: {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9},
        3: {'uid': 'NCC-74656', 'name': 'USS Voyager', 'warp': 9.975},
    }
    for label, values in expected_rows.items():
        row = pd.Series(values, name=label)
        assert_series_equal(bdf.loc[label], row)
示例#3
0
def test_mean_inputation_on_numerical_column(data):
    """
    Mean imputation on numerical columns.

    Before imputing, some values of `seasons` (7) and `rating` (9.3, 9.9, 9.0)
    are mapped to NaN so there are missing values to fill. MeanImputeStep is
    then expected to replace them with the mean of the remaining values.
    """
    # `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed
    recipe = Recipe([
        MutateStep({
            'seasons': MapValues({
                7: np.nan
            }),
            'rating': MapValues({
                9.3: np.nan,
                9.9: np.nan,
                9.0: np.nan
            })
        }),
        MeanImputeStep(['seasons', 'rating'])
    ])
    bdf = recipe.prepare(data).bake(data)

    # Round to one decimal so the imputed means can be compared exactly
    seasons = bdf['seasons'].round(1).tolist()
    assert seasons[0] == 1.0
    assert seasons[1] == 2.3  # mean
    assert seasons[2] == 2.3  # mean
    assert seasons[3] == 4.0
    assert seasons[4] == 2.3  # mean
    assert seasons[5] == 2.0

    ratings = bdf['rating'].round(1).tolist()
    assert ratings[0] == 7.7  # mean
    assert ratings[1] == 7.7  # mean
    assert ratings[2] == 7.4
    assert ratings[3] == 6.8
    assert ratings[4] == 8.9
    assert ratings[5] == 7.7  # mean
示例#4
0
def test_join_on_invalid_how_step(startrek_starships, startrek_starships_specs):
    """
    The `how` parameter is invalid, so the step should fail the validation.
    """
    recipe = Recipe([
        JoinStep(startrek_starships_specs, by="uid", how="not_exist"),
    ])
    # The raised exception is never inspected, so it is not bound to a name
    with pytest.raises(YeastValidationError):
        recipe.prepare(startrek_starships).bake(startrek_starships)
示例#5
0
def test_left_join_using_df_but_not_a_recipe(startrek_starships, startrek_starships_specs):
    """
    `df` is only meant to be used when the right side is a Recipe.

    Here the right side is passed directly together with `df`, and the
    workflow is expected to fail with a YeastValidationError.
    """
    recipe = Recipe([
        JoinStep(startrek_starships_specs, by="uid", df=startrek_starships, how="left")
    ])

    # The raised exception is never inspected, so it is not bound to a name
    with pytest.raises(YeastValidationError):
        recipe.prepare(startrek_starships).bake(startrek_starships)
示例#6
0
def test_median_inputation_on_a_non_numerical_column_must_fail(data):
    """
    The median cannot be calculated on a non-numerical column, so preparing
    the recipe is expected to raise a YeastPreparationError.
    """
    # `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed
    recipe = Recipe([
        MutateStep({'title': MapValues({'Voyager': np.nan})}),
        MedianImputeStep(['title'])
    ])

    # Only `prepare` is called: the failure is expected before any baking
    with pytest.raises(YeastPreparationError):
        recipe.prepare(data)
示例#7
0
def test_left_join_column_not_found_left_thus_fail(startrek_starships, startrek_starships_specs):
    """
    A left join on `uid` must fail validation once `uid` has been renamed
    away on the left side.
    """
    rename_uid = RenameColumnsStep({'uid': 'not_found'})
    join_specs = JoinStep(startrek_starships_specs, by='uid', how="left")
    recipe = Recipe([rename_uid, join_specs])

    with pytest.raises(YeastValidationError):
        prepared = recipe.prepare(startrek_starships)
        prepared.bake(startrek_starships)
示例#8
0
def test_left_join_without_by_workflow(startrek_starships, startrek_starships_specs):
    """
    Left join without passing the column names through `by`.
    """
    join_step = JoinStep(startrek_starships_specs, how="left")
    recipe = Recipe([join_step, SortStep('uid')])

    prepared = recipe.prepare(startrek_starships)
    baked_data = prepared.bake(startrek_starships)

    # Row labelled 0 of the sorted result
    expected = pd.Series(
        {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9},
        name=0,
    )
    assert_series_equal(baked_data.loc[0], expected)
示例#9
0
文件: test_recipe.py 项目: iuga/Yeast
def test_recipe_workflow_validations_should_be_on_the_last_transformed_data(
        raw_data):
    """
    `title` is no longer selected after the first step, so validating the
    second step, which selects `title`, must fail.
    """
    keep_year_and_seasons = steps.SelectColumnsStep(['year', 'seasons'])
    keep_title = steps.SelectColumnsStep(['title'])
    recipe = Recipe([keep_year_and_seasons, keep_title])

    with pytest.raises(errors.YeastValidationError):
        prepared = recipe.prepare(raw_data)
        prepared.bake(raw_data)
示例#10
0
def test_join_on_right_step(startrek_starships, startrek_starships_specs):
    """
    Right join with NA mismatches.
    """
    join_step = JoinStep(startrek_starships_specs, by="uid", how="right")
    recipe = Recipe([join_step, SortStep('uid')])
    baked_data = recipe.prepare(startrek_starships).bake(startrek_starships)

    assert baked_data.shape == (4, 3)

    # Rows labelled 0 and 3 of the sorted result, keyed by their index label
    expected_rows = {
        0: {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9},
        3: {'uid': 'NCC-74656', 'name': 'USS Voyager', 'warp': 9.975},
    }
    for label, values in expected_rows.items():
        row = pd.Series(values, name=label)
        assert_series_equal(baked_data.loc[label], row)
示例#11
0
文件: test_recipe.py 项目: iuga/Yeast
def test_recipe_workflow(raw_data):
    """
    Sequential execution of the recipe: clean the column names to snake
    case, then keep only two of the cleaned columns.
    """
    clean_names = steps.CleanColumnNamesStep('snake')
    keep_columns = steps.SelectColumnsStep(['creation_year', 'total_seasons'])
    baked_data = Recipe([clean_names, keep_columns]).prepare(raw_data).bake(raw_data)

    # The two selected columns must be present
    for present in ('creation_year', 'total_seasons'):
        assert present in baked_data.columns

    # Neither the other names nor the non-snake-case spellings may remain
    for absent in ('series_name', 'series_Name', 'CreationYear', 'Total Seasons'):
        assert absent not in baked_data.columns
示例#12
0
def test_join_on_fullouter_step(startrek_starships, startrek_starships_specs):
    """
    Full outer join with NA mismatches.
    """
    join_step = JoinStep(startrek_starships_specs, by="uid", how="full")
    recipe = Recipe([join_step, SortStep('uid')])
    baked_data = recipe.prepare(startrek_starships).bake(startrek_starships)

    assert baked_data.shape == (5, 3)

    # Row 4 is expected to carry no `warp` value
    expected_rows = {
        0: {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9},
        4: {'uid': 'NX-01', 'name': 'Enterprise', 'warp': None},
    }
    for label, values in expected_rows.items():
        row = pd.Series(values, name=label)
        assert_series_equal(baked_data.loc[label], row)
示例#13
0
文件: test_recipe.py 项目: iuga/Yeast
def test_skip_on_other_set_workflow(raw_data):
    """
    Sequential execution of the recipe where one step (the FilterStep with
    role='train') must be skipped when baking with another role.
    """
    workflow = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year in [2017, 2020]', role='train')
    ]
    prepared = Recipe(workflow).prepare(raw_data)

    # No explicit role: the filter is expected to apply, leaving 2 rows
    train = prepared.bake(raw_data)
    assert train.shape == (2, 2)

    # role='test': the train-only filter is expected to be skipped, so all
    # 6 rows remain
    test = prepared.bake(raw_data, role='test')
    assert test.shape == (6, 2)
示例#14
0
def test_left_join_with_using_a_recipe(startrek_starships, startrek_starships_specs):
    """
    Left join where the right-hand side is another Recipe and the data it
    works on is handed over through the `df` argument.
    """
    specs_recipe = Recipe([SortStep('uid')])
    join_step = JoinStep(specs_recipe, by='uid', how='left', df=startrek_starships_specs)
    ships_recipe = Recipe([join_step, SortStep('uid')])

    baked_df = ships_recipe.prepare(startrek_starships).bake(startrek_starships)

    # Row labelled 0 of the sorted result
    expected = pd.Series(
        {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9},
        name=0,
    )
    assert_series_equal(baked_df.loc[0], expected)
示例#15
0
文件: test_recipe.py 项目: iuga/Yeast
def test_recipe_workflow_only_accepts_yeast_steps(raw_data):
    """
    Only Step objects are supported on a Recipe: building one with a plain
    dict among its steps must raise a YeastRecipeError.
    """
    with pytest.raises(errors.YeastRecipeError):
        not_a_step = {}
        Recipe([steps.SelectColumnsStep(['year', 'seasons']), not_a_step])
示例#16
0
文件: test_recipe.py 项目: iuga/Yeast
def test_custom_roles_workflow(raw_data):
    """
    Steps tagged with different roles should be executed differently
    depending on the role used when baking.
    """
    workflow = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year == 2020', role='train'),
        steps.SelectColumnsStep(['creation_year'], role='train'),
        steps.SelectColumnsStep(['total_seasons'], role='test'),
    ]
    prepared = Recipe(workflow).prepare(raw_data)

    # Train role: the filter leaves a single row and only creation_year is kept
    train = prepared.bake(raw_data, role='train')
    assert train.shape == (1, 1)
    assert 'creation_year' in train.columns

    # Test role: the train-only filter is expected to be skipped (6 rows) and
    # only total_seasons is kept
    test = prepared.bake(raw_data, role='test')
    assert test.shape == (6, 1)
    assert 'total_seasons' in test.columns
示例#17
0
def test_rownumber_mutation_using_a_lambda_after_groupby(startrek_data):
    """
    Use Mutate after a GroupBy with a lambda to create a `row_number` column
    holding the rank of `rating` (method "first") inside each `seasons` group.

    After sorting by `seasons` and `row_number`, the rows labelled 0..5 are
    expected to hold these (seasons, row_number) pairs:
    ```
        (1, 1), (2, 1), (4, 1), (7, 1), (7, 2), (7, 3)
    ```
    """
    recipe = Recipe([
        GroupByStep('seasons'),
        MutateStep({
            'row_number': lambda df: df['rating'].rank(method="first")
        }),
        SortStep(['seasons', 'row_number'])
    ])
    bdf = recipe.bake(startrek_data)

    # One (seasons, row_number) pair per index label, in label order
    expected_pairs = [(1, 1), (2, 1), (4, 1), (7, 1), (7, 2), (7, 3)]
    for label, (seasons, row_number) in enumerate(expected_pairs):
        assert bdf[['seasons', 'row_number']].loc[label]['seasons'] == seasons
        assert bdf[['seasons', 'row_number']].loc[label]['row_number'] == row_number
示例#18
0
def test_rownumber_mutation_using_a_transformer_on_dataframe(startrek_data):
    """
    Rank operation without a group by: `row_number` is built with
    RowNumber('year') and the result is sorted by `year`.

    The rows labelled 0..5 are expected to hold these (year, row_number) pairs:
    ```
        (1987, 1), (1993, 2), (1995, 3), (2001, 4), (2017, 5), (2020, 6)
    ```
    """
    recipe = Recipe([
        MutateStep({
            'row_number': RowNumber('year')
        }),
        SortStep('year')
    ])
    bdf = recipe.bake(startrek_data)

    # One (year, row_number) pair per index label, in label order
    expected_pairs = [
        (1987, 1),
        (1993, 2),
        (1995, 3),
        (2001, 4),
        (2017, 5),
        (2020, 6),
    ]
    for label, (year, row_number) in enumerate(expected_pairs):
        assert bdf[['year', 'row_number']].loc[label]['year'] == year
        assert bdf[['year', 'row_number']].loc[label]['row_number'] == row_number
示例#19
0
def test_inner_join_step(startrek_starships, startrek_starships_specs):
    """
    Inner join with NA mismatches.
    """
    join_step = InnerJoinStep(startrek_starships_specs, by="uid")
    recipe = Recipe([join_step, SortStep('uid')])
    bdf = recipe.prepare(startrek_starships).bake(startrek_starships)

    assert bdf.shape == (4, 3)

    # Rows labelled 0 and 3 of the sorted result, keyed by their index label
    expected_rows = {
        0: {'uid': 'NCC-1031', 'name': 'USS Discovery', 'warp': 9.9},
        3: {'uid': 'NCC-74656', 'name': 'USS Voyager', 'warp': 9.975},
    }
    for label, values in expected_rows.items():
        row = pd.Series(values, name=label)
        assert_series_equal(bdf.loc[label], row)
示例#20
0
def test_add_recipe_by_name(data):
    """
    Add a new recipe into the cookbook under a name and read it back.
    """
    recipe_name = 'test.recipe'

    # A fresh cookbook lists no recipes
    book = Cookbook()
    assert len(book.list()) == 0

    # Register an empty recipe under the chosen name
    empty_recipe = Recipe([])
    book.add(recipe_name, empty_recipe)

    assert len(book.list()) == 1
    assert recipe_name in book.list()

    # Looking the name up returns the recipe that was registered
    assert book.get(recipe_name) == empty_recipe
示例#21
0
def test_adding_two_recipes_with_same_name_raises_an_error(data):
    """
    A cookbook cannot hold two recipes under the same name.
    """
    recipe_name = 'test.recipe'

    # A fresh cookbook lists no recipes
    book = Cookbook()
    assert len(book.list()) == 0

    # The first registration under the name succeeds
    empty_recipe = Recipe([])
    book.add(recipe_name, empty_recipe)
    assert len(book.list()) == 1

    # The second registration under the same name must be rejected
    with pytest.raises(YeastCookbookError):
        book.add(recipe_name, empty_recipe)

    # The rejected registration leaves the listing at one recipe
    assert len(book.list()) == 1