Python Aggregation 예제들, weaverbird.pipeline.steps.aggregate.Aggregation Python 예제들

예제 #1

0

파일 보기

def test_simple_aggregate(sample_df):
    step = AggregateStep(
        name='aggregate',
        on=['Group'],
        aggregations=[
            Aggregation(
                aggfunction='sum',
                columns=['Value1', 'Value2'],
                newcolumns=['Sum-Value1', 'Sum-Value2'],
            ),
            Aggregation(aggfunction='avg',
                        columns=['Value1'],
                        newcolumns=['Avg-Value1']),
        ],
    )
    df_result = execute_aggregate(step, sample_df)

    assert_dataframes_equals(
        df_result.sort_values(by=['Group']),
        DataFrame({
            'Group': ['Group 1', 'Group 2'],
            'Sum-Value1': [40, 16],
            'Sum-Value2': [35, 31],
            'Avg-Value1': [np.average([13, 7, 20]),
                           np.average([1, 10, 5])],
        }).sort_values(by=['Group']),
    )

예제 #2

0

파일 보기

def test_with_group_by_with_aggregation_with_last_with_first(
        query, sql_query_describer):
    step = AggregateStep(
        name='aggregate',
        on=['category'],
        aggregations=[
            Aggregation(aggfunction='last',
                        columns=['Label'],
                        newcolumns=['last_label']),
            Aggregation(aggfunction='first',
                        columns=['title'],
                        newcolumns=['first_title']),
            Aggregation(aggfunction='sum',
                        columns=['title'],
                        newcolumns=['sum_title']),
        ],
    )
    sql_query = translate_aggregate(step,
                                    query,
                                    index=1,
                                    sql_query_describer=sql_query_describer)
    assert (
        sql_query.transformed_query ==
        'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT A.*, F.first_title, F.last_label FROM (SELECT SUM(title) AS sum_title, category FROM SELECT_STEP_0 GROUP BY category) A INNER JOIN (SELECT first_title, last_label, category FROM (SELECT title AS first_title, Label AS last_label, category, ROW_NUMBER() OVER (PARTITION BY category ORDER BY title, Label DESC) AS R FROM SELECT_STEP_0 QUALIFY R = 1)) F ON A.category=F.category)'
    )

예제 #3

0

파일 보기

def test_with_original_granularity_multiple_aggregations(sample_df):
    step = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=True,
        on=['Group'],
        aggregations=[
            Aggregation(aggfunction='min',
                        columns=['Value1'],
                        newcolumns=['min_Value1']),
            Aggregation(aggfunction='max',
                        columns=['Value2'],
                        newcolumns=['max_Value2']),
        ],
    )
    df_result = execute_aggregate(step, sample_df)

    assert_dataframes_equals(
        df_result,
        DataFrame({
            'Label':
            ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'],
            'Group': ['Group 1'] * 3 + ['Group 2'] * 3,
            'Value1': [13, 7, 20, 1, 10, 5],
            'min_Value1': [7] * 3 + [1] * 3,
            'Value2': [10, 21, 4, 17, 12, 2],
            'max_Value2': [21] * 3 + [17] * 3,
        }),
    )

예제 #4

0

파일 보기

def test_totals_2():
    sample_df = pd.DataFrame({
        'COUNTRY': ['France', 'USA'] * 4,
        'PRODUCT': (['product A'] * 2 + ['product B'] * 2) * 2,
        'YEAR': ['2019'] * 4 + ['2020'] * 4,
        'VALUE_1': [5, 10, 10, 15, 20, 20, 30, 25],
        'VALUE_2': [50, 100, 100, 150, 200, 200, 300, 250],
    })
    step = TotalsStep(
        name='totals',
        totalDimensions=[
            TotalDimension(total_column='COUNTRY',
                           total_rows_label='All countries'),
            TotalDimension(total_column='PRODUCT',
                           total_rows_label='All products'),
        ],
        aggregations=[
            Aggregation(
                columns=['VALUE_1', 'VALUE_2'],
                aggfunction='sum',
                newcolumns=['VALUE_1-sum', 'VALUE_2'],
            ),
            Aggregation(columns=['VALUE_1'],
                        aggfunction='avg',
                        newcolumns=['VALUE_1-avg']),
        ],
        groups=['YEAR'],
    )

    expected_result = sample_df.copy()
    expected_result['VALUE_1-sum'] = expected_result[
        'VALUE_1-avg'] = expected_result['VALUE_1']
    del expected_result['VALUE_1']
    expected_result = pd.concat([
        expected_result,
        pd.DataFrame({
            'COUNTRY': ['USA', 'France'] * 2 + ['All countries'] * 6,
            'PRODUCT': ['All products'] * 4 + ['product B', 'product A'] * 2 +
            ['All products'] * 2,
            'YEAR': (['2020'] * 2 + ['2019'] * 2) * 2 + ['2020', '2019'],
            'VALUE_2': [450, 500, 250, 150, 550, 400, 250, 150, 950, 400],
            'VALUE_1-sum': [45, 50, 25, 15, 55, 40, 25, 15, 95, 40],
            'VALUE_1-avg':
            [22.5, 25, 12.5, 7.5, 27.5, 20, 12.5, 7.5, 23.75, 10],
        }),
    ])

    real_result = execute_totals(step, sample_df)
    real_sorted = real_result.sort_values(by=real_result.columns.tolist())
    expected_sorted = expected_result.sort_values(
        by=expected_result.columns.tolist())
    assert_dataframes_equals(real_sorted, expected_sorted)

예제 #5

0

파일 보기

def test_to_dict():
    pipeline = Pipeline(
        steps=[
            DomainStep(name='domain', domain='foobar'),
            RollupStep(
                name='rollup',
                hierarchy=['a', 'b'],
                aggregations=[Aggregation(newcolumns=['a'], aggfunction='sum', columns=['a'])],
            ),
        ]
    )

    actual_dict = pipeline.dict()

    expected_dict = {
        'steps': [
            {'domain': 'foobar', 'name': 'domain'},
            {
                'name': 'rollup',
                'hierarchy': ['a', 'b'],
                'aggregations': [{'new_columns': ['a'], 'agg_function': 'sum', 'columns': ['a']}],
            },
        ]
    }
    assert actual_dict == expected_dict
    assert pipeline == Pipeline(**pipeline.dict())

예제 #6

0

파일 보기

def test_count_with_null():
    df = DataFrame({
        'Label':
        ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'],
        'Group': ['Group 1'] * 3 + [None] * 3,  # type: ignore
        'Value1': [13, 7, 20, 1, 10, 5],
    })
    step = AggregateStep(
        name='aggregate',
        on=['Group'],
        aggregations=[
            Aggregation(
                aggfunction='count distinct including empty',
                columns=['Group'],
                newcolumns=['__VQB_COUNT'],
            ),
        ],
    )
    df_result = execute_aggregate(step, df)

    assert_dataframes_equals(
        df_result.sort_values(by=['Group']),
        DataFrame({
            'Group': ['Group 1', np.nan],
            '__VQB_COUNT': [3, 3],
        }).sort_values(by=['Group']),
    )

예제 #7

0

파일 보기

def test_simple_aggregate_with_null():
    df = DataFrame({
        'Label':
        ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'],
        'Group': ['Group 1'] * 3 + [None] * 3,  # type: ignore
        'Value1': [13, 7, 20, 1, 10, 5],
    })
    step = AggregateStep(
        name='aggregate',
        on=['Group'],
        aggregations=[
            Aggregation(
                aggfunction='sum',
                columns=['Value1'],
                newcolumns=['Sum-Value1'],
            ),
        ],
    )
    df_result = execute_aggregate(step, df)

    assert_dataframes_equals(
        df_result.sort_values(by=['Group']),
        DataFrame({
            'Group': ['Group 1', None],
            'Sum-Value1': [40, 16],
        }).sort_values(by=['Group']),
    )

예제 #8

0

파일 보기

def test_single_totals_without_groups():
    sample_df = pd.DataFrame({
        'COUNTRY': ['France', 'USA'] * 4,
        'PRODUCT': (['PRODUCT A'] * 2 + ['PRODUCT B'] * 2) * 2,
        'YEAR': ['2019'] * 4 + ['2020'] * 4,
        'VALUE': [5, 10, 10, 15, 20, 20, 30, 25],
    })
    step = TotalsStep(
        name='totals',
        totalDimensions=[
            TotalDimension(total_column='COUNTRY',
                           total_rows_label='All countries')
        ],
        aggregations=[
            Aggregation(columns=['VALUE'],
                        aggfunction='sum',
                        newcolumns=['VALUE'])
        ],
        groups=[],
    )

    expected_result = pd.concat([
        sample_df,
        pd.DataFrame({
            'COUNTRY': 'All countries',
            'PRODUCT': [None],
            'YEAR': [None],
            'VALUE': [135]
        }),
    ])

    real_result = execute_totals(step, sample_df)
    assert_dataframes_equals(real_result, expected_result)

예제 #9

0

파일 보기

def test_last_no_group_by_with_aggregation_with_last(query,
                                                     sql_query_describer):
    step = AggregateStep(
        name='aggregate',
        aggregations=[
            Aggregation(aggfunction='last',
                        columns=['Label'],
                        newcolumns=['last_label']),
            Aggregation(aggfunction='sum',
                        columns=['title'],
                        newcolumns=['sum_title']),
        ],
    )
    sql_query = translate_aggregate(step,
                                    query,
                                    index=1,
                                    sql_query_describer=sql_query_describer)
    assert (
        sql_query.transformed_query ==
        'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT A.*, F.last_label FROM (SELECT SUM(title) AS sum_title FROM SELECT_STEP_0) A INNER JOIN (SELECT last_label FROM (SELECT Label AS last_label, ROW_NUMBER() OVER (ORDER BY Label DESC) AS R FROM SELECT_STEP_0 QUALIFY R = 1)) F)'
    )

예제 #10

0

파일 보기

def test_last_with_group_by_no_aggregation_with_first(query,
                                                      sql_query_describer):
    step = AggregateStep(
        name='aggregate',
        on=['category'],
        aggregations=[
            Aggregation(aggfunction='first',
                        columns=['Label'],
                        newcolumns=['first_label']),
            Aggregation(aggfunction='last',
                        columns=['title'],
                        newcolumns=['last_title']),
        ],
    )
    sql_query = translate_aggregate(step,
                                    query,
                                    index=1,
                                    sql_query_describer=sql_query_describer)
    assert (
        sql_query.transformed_query ==
        'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT first_label, last_title, category FROM (SELECT Label AS first_label, title AS last_title, category, ROW_NUMBER() OVER (PARTITION BY category ORDER BY Label, title DESC) AS R FROM SELECT_STEP_0 QUALIFY R = 1))'
    )

예제 #11

0

파일 보기

def test_translate_aggregate_with_group_by(query, sql_query_describer):
    step = AggregateStep(
        name='aggregate',
        on=['category'],
        aggregations=[
            Aggregation(
                aggfunction='sum',
                columns=['Value1', 'Value2'],
                newcolumns=['Sum_Value1', 'Sum_Value2'],
            ),
            Aggregation(aggfunction='avg',
                        columns=['Value1'],
                        newcolumns=['Avg_Value1']),
        ],
    )
    sql_query = translate_aggregate(step,
                                    query,
                                    index=1,
                                    sql_query_describer=sql_query_describer)
    assert (
        sql_query.transformed_query ==
        'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT SUM(Value1) AS Sum_Value1, SUM(Value2) AS Sum_Value2, AVG(Value1) AS Avg_Value1, category FROM SELECT_STEP_0 GROUP BY category)'
    )

예제 #12

0

파일 보기

def test_keep_original_granularity_empty_on(sample_df):
    step = AggregateStep(
        name='aggregate',
        on=[],
        keepOriginalGranularity=True,
        aggregations=[
            Aggregation(aggfunction='count',
                        columns=['Group'],
                        newcolumns=['__vqb_count__']),
        ],
    )
    df_result = execute_aggregate(step, sample_df)

    assert_dataframes_equals(df_result, sample_df.assign(__vqb_count__=6))

예제 #13

0

파일 보기

def test_totals_nogroup_3_dimensions():
    sample_df = pd.DataFrame({
        'COUNTRY': ['France', 'USA'] * 4,
        'PRODUCT': (['product A'] * 2 + ['product B'] * 2) * 2,
        'YEAR': ['2019'] * 4 + ['2020'] * 4,
        'VALUE_1': [5, 10, 10, 15, 20, 20, 30, 25],
    })
    step = TotalsStep(
        name='totals',
        totalDimensions=[
            TotalDimension(total_column='COUNTRY',
                           total_rows_label='All countries'),
            TotalDimension(total_column='PRODUCT',
                           total_rows_label='All products'),
            TotalDimension(total_column='YEAR', total_rows_label='All years'),
        ],
        aggregations=[
            Aggregation(
                columns=['VALUE_1'],
                aggfunction='sum',
                newcolumns=['VALUE_1-sum'],
            ),
            Aggregation(columns=['VALUE_1'],
                        aggfunction='avg',
                        newcolumns=['VALUE_1-avg']),
        ],
    )

    real_result = execute_totals(step, sample_df)
    assert real_result[real_result['YEAR'] == 'All years'].count()['YEAR'] == 9
    assert real_result[real_result['COUNTRY'] ==
                       'All countries'].count()['COUNTRY'] == 9
    assert real_result[real_result['PRODUCT'] ==
                       'All products'].count()['PRODUCT'] == 9
    # could be any column
    assert real_result.count()['YEAR'] == 27

예제 #14

0

파일 보기

def test_duplicate_aggregation_columns(query, sql_query_describer):

    with pytest.raises(DuplicateColumnError):
        step = AggregateStep(
            name='aggregate',
            on=['Group'],
            aggregations=[
                Aggregation(
                    aggfunction='count distinct including empty',
                    columns=['Group', 'Group'],
                    newcolumns=['coucou'],
                ),
            ],
        )
        translate_aggregate(step, query, index=1)

예제 #15

0

파일 보기

def test_count_distinct(sample_df):
    step = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=False,
        on=[],
        aggregations=[
            Aggregation(
                aggfunction='count distinct',
                columns=['Group'],
                newcolumns=['Group_CD'],
            )
        ],
    )
    df_result = execute_aggregate(step, sample_df)

    assert_dataframes_equals(df_result, DataFrame({'Group_CD': [2]}))

예제 #16

0

파일 보기

def test_without_on(sample_df):
    step = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=False,
        on=[],
        aggregations=[
            Aggregation(aggfunction='sum',
                        columns=['Value1'],
                        newcolumns=['sum_value']),
        ],
    )
    df_result = execute_aggregate(step, sample_df)

    assert_dataframes_equals(
        df_result,
        DataFrame({'sum_value': [56]}),
    )

예제 #17

0

파일 보기

def test_first_no_group_by_no_aggregation(query, sql_query_describer):
    step = AggregateStep(
        name='aggregate',
        aggregations=[
            Aggregation(aggfunction='first',
                        columns=['Label'],
                        newcolumns=['first_Label']),
        ],
    )
    sql_query = translate_aggregate(step,
                                    query,
                                    index=1,
                                    sql_query_describer=sql_query_describer)
    assert (
        sql_query.transformed_query ==
        'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT first_Label FROM (SELECT Label AS first_Label, ROW_NUMBER() OVER (ORDER BY Label) AS R FROM SELECT_STEP_0 QUALIFY R = 1))'
    )

예제 #18

0

파일 보기

def test_count(query, sql_query_describer):
    step = AggregateStep(
        name='aggregate',
        on=['category'],
        aggregations=[
            Aggregation(aggfunction='count',
                        columns=['Label'],
                        newcolumns=['count']),
        ],
    )
    sql_query = translate_aggregate(step,
                                    query,
                                    index=1,
                                    sql_query_describer=sql_query_describer)
    assert (
        sql_query.transformed_query ==
        'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT COUNT(Label) AS count, category FROM SELECT_STEP_0 GROUP BY category)'
    )

예제 #19

0

파일 보기

def test_count(sample_df):
    step = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=False,
        on=['Group'],
        aggregations=[
            Aggregation(aggfunction='count',
                        columns=['Label'],
                        newcolumns=['count']),
        ],
    )
    df_result = execute_aggregate(step, sample_df)

    assert_dataframes_equals(
        df_result, DataFrame({
            'Group': ['Group 1', 'Group 2'],
            'count': [3, 3]
        }))

예제 #20

0

파일 보기

def test_benchmark_aggregate(benchmark):
    sample_df = DataFrame({
        'Group': ['Group 1'] * 500 + ['Group 2'] * 500,
        'Value1': np.random.random(1000),
        'Value2': np.random.random(1000),
    })
    step = AggregateStep(
        name='aggregate',
        on=['Group'],
        aggregations=[
            Aggregation(
                aggfunction='avg',
                columns=['Value1'],
                newcolumns=['RESULT'],
            ),
        ],
    )
    benchmark(execute_aggregate, step, sample_df)

예제 #21

0

파일 보기

def test_duplicate_aggregation_columns():
    df = DataFrame({
        'Label': ['Label 1', 'Label 2', 'Label 3'],
        'Group': ['Group 1'] * 3,  # type: ignore
        'Value1': [13, 7, 20],
    })
    with pytest.raises(DuplicateColumnError):
        step = AggregateStep(
            name='aggregate',
            on=['Group'],
            aggregations=[
                Aggregation(
                    aggfunction='count distinct including empty',
                    columns=['Group', 'Group'],
                    newcolumns=['__VQB_COUNT'],
                ),
            ],
        )
        execute_aggregate(step, df)

예제 #22

0

파일 보기

def test_legacy_syntax(sample_df):
    step = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=False,
        on=[],
        aggregations=[
            Aggregation(**{
                'aggfunction': 'sum',
                'column': 'Value1',
                'newcolumn': 'sum_value'
            }),
        ],
    )
    df_result = execute_aggregate(step, sample_df)

    assert_dataframes_equals(
        df_result,
        DataFrame({'sum_value': [56]}),
    )

예제 #23

0

파일 보기

def test_count_distinct(query, sql_query_describer):
    step = AggregateStep(
        name='aggregate',
        on=[],
        aggregations=[
            Aggregation(
                aggfunction='count distinct',
                columns=['Group'],
                newcolumns=['Group_CD'],
            )
        ],
    )
    sql_query = translate_aggregate(step,
                                    query,
                                    index=1,
                                    sql_query_describer=sql_query_describer)
    assert (
        sql_query.transformed_query ==
        'WITH SELECT_STEP_0 AS (SELECT * FROM products), AGGREGATE_STEP_1 AS (SELECT COUNT(DISTINCT Group) AS Group_CD FROM SELECT_STEP_0)'
    )

예제 #24

0

파일 보기

def test_benchmark_totals(benchmark):
    groups = ['group_1', 'group_2']
    df = pd.DataFrame({
        'value': np.random.random(1000),
        'id': list(range(1000)),
        'group': [random.choice(groups) for _ in range(1000)],
    })

    step = TotalsStep(
        name='totals',
        totalDimensions=[
            TotalDimension(total_column='group', total_rows_label='All groups')
        ],
        aggregations=[
            Aggregation(columns=['value'],
                        aggfunction='sum',
                        newcolumns=['value'])
        ],
        groups=[],
    )
    benchmark(execute_totals, step, df)