Exemplo n.º 1
0
def test_skip_on_other_set_workflow(raw_data):
    """
    Sequential execution of a recipe in which one step (the FilterStep, declared
    with role='train') must be skipped when the recipe is baked with another role.
    """
    # Only the last step carries an explicit role; the first two are declared without one.
    pipeline = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year in [2017, 2020]', role='train'),
    ]
    prepared = Recipe(pipeline).prepare(raw_data)

    # No role is passed here: the expected row count shows the filter is applied.
    baked_default = prepared.bake(raw_data)
    assert baked_default.shape == (2, 2)  # Filter Step was applied

    # Baking with role='test' must leave every row in place.
    baked_test = prepared.bake(raw_data, role='test')
    assert baked_test.shape == (6, 2)  # Filter Step was skipped
Exemplo n.º 2
0
def test_custom_roles_workflow(raw_data):
    """
    Steps declared with different roles should only run when the recipe is baked
    with the matching role.
    """
    # The first two steps have no explicit role; the remaining three are tied
    # to either 'train' or 'test'.
    pipeline = [
        steps.CleanColumnNamesStep('snake'),
        steps.SelectColumnsStep(['creation_year', 'total_seasons']),
        steps.FilterStep('creation_year == 2020', role='train'),
        steps.SelectColumnsStep(['creation_year'], role='train'),
        steps.SelectColumnsStep(['total_seasons'], role='test'),
    ]
    prepared = Recipe(pipeline).prepare(raw_data)

    baked_train = prepared.bake(raw_data, role='train')
    assert baked_train.shape == (1, 1)  # Filter Step was applied
    assert 'creation_year' in baked_train.columns  # Only creation_year in train role

    baked_test = prepared.bake(raw_data, role='test')
    assert baked_test.shape == (6, 1)  # Filter Step was skipped: all rows kept
    assert 'total_seasons' in baked_test.columns  # Only total_seasons in test role
Exemplo n.º 3
0
def test_rownumber_mutation_using_a_lambda_after_groupby(startrek_data):
    """
    Use a MutateStep after a GroupByStep, with a lambda, to create a new column
    holding the row number inside each group:
    ```
                 title  year  row_number
                Picard  2020         1.0
             Discovery  2017         1.0
            Enterprise  2001         1.0
               Voyager  1995         1.0
       Deep Space Nine  1993         2.0
                   TNG  1987         3.0
    ```
    The recipe groups by 'seasons', ranks 'rating' with method="first" and sorts
    by ['seasons', 'row_number']. The assertions check the (seasons, row_number)
    pairs found at index labels 0 to 5 of the baked frame.
    """
    pipeline = [
        GroupByStep('seasons'),
        MutateStep({
            'row_number': lambda df: df['rating'].rank(method="first")
        }),
        SortStep(['seasons', 'row_number']),
    ]
    bdf = Recipe(pipeline).bake(startrek_data)

    # Expected (seasons, row_number) per index label, in the order they are checked.
    expected_rows = [(1, 1), (2, 1), (4, 1), (7, 1), (7, 2), (7, 3)]
    ranked = bdf[['seasons', 'row_number']]
    for label, (seasons, row_number) in enumerate(expected_rows):
        # .loc is label-based, so this reads the row whose index label is `label`.
        row = ranked.loc[label]
        assert row['seasons'] == seasons
        assert row['row_number'] == row_number
Exemplo n.º 4
0
def test_rownumber_mutation_using_a_transformer_on_dataframe(startrek_data):
    """
    Rank operation without a group by, using the RowNumber transformer on 'year':
    ```
                 title  year  row_number
                   TNG  1987         1.0
       Deep Space Nine  1993         2.0
               Voyager  1995         3.0
            Enterprise  2001         4.0
             Discovery  2017         5.0
                Picard  2020         6.0
    ```
    The assertions check the (year, row_number) pairs found at index labels
    0 to 5 of the baked frame.
    """
    pipeline = [
        MutateStep({
            'row_number': RowNumber('year')
        }),
        SortStep('year'),
    ]
    bdf = Recipe(pipeline).bake(startrek_data)

    # Expected (year, row_number) per index label, in the order they are checked.
    expected_rows = [
        (1987, 1),
        (1993, 2),
        (1995, 3),
        (2001, 4),
        (2017, 5),
        (2020, 6),
    ]
    ranked = bdf[['year', 'row_number']]
    for label, (year, row_number) in enumerate(expected_rows):
        # .loc is label-based, so this reads the row whose index label is `label`.
        row = ranked.loc[label]
        assert row['year'] == year
        assert row['row_number'] == row_number