예제 #1
0
def test_totals_nogroup_3_dimensions():
    sample_df = pd.DataFrame(
        {
            'COUNTRY': ['France', 'USA'] * 4,
            'PRODUCT': (['product A'] * 2 + ['product B'] * 2) * 2,
            'YEAR': ['2019'] * 4 + ['2020'] * 4,
            'VALUE_1': [5, 10, 10, 15, 20, 20, 30, 25],
        }
    )
    step = TotalsStep(
        name='totals',
        totalDimensions=[
            TotalDimension(total_column='COUNTRY', total_rows_label='All countries'),
            TotalDimension(total_column='PRODUCT', total_rows_label='All products'),
            TotalDimension(total_column='YEAR', total_rows_label='All years'),
        ],
        aggregations=[
            Aggregation(
                columns=['VALUE_1'],
                aggfunction='sum',
                newcolumns=['VALUE_1-sum'],
            ),
            Aggregation(columns=['VALUE_1'], aggfunction='avg', newcolumns=['VALUE_1-avg']),
        ],
    )

    real_result = step.execute(sample_df)
    assert real_result[real_result['YEAR'] == 'All years'].count()['YEAR'] == 9
    assert real_result[real_result['COUNTRY'] == 'All countries'].count()['COUNTRY'] == 9
    assert real_result[real_result['PRODUCT'] == 'All products'].count()['PRODUCT'] == 9
    # could be any column
    assert real_result.count()['YEAR'] == 27
예제 #2
0
def test_simple_aggregate(sample_df):
    df_result = AggregateStep(
        name='aggregate',
        on=['Group'],
        aggregations=[
            Aggregation(
                aggfunction='sum',
                columns=['Value1', 'Value2'],
                newcolumns=['Sum-Value1', 'Sum-Value2'],
            ),
            Aggregation(aggfunction='avg', columns=['Value1'], newcolumns=['Avg-Value1']),
        ],
    ).execute(sample_df)

    assert_dataframes_equals(
        df_result.sort_values(by=['Group']),
        DataFrame(
            {
                'Group': ['Group 1', 'Group 2'],
                'Sum-Value1': [40, 16],
                'Sum-Value2': [35, 31],
                'Avg-Value1': [np.average([13, 7, 20]), np.average([1, 10, 5])],
            }
        ).sort_values(by=['Group']),
    )
예제 #3
0
def test_with_original_granularity_multiple_aggregations_multiple_columns(sample_df):
    df_result = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=True,
        on=['Group'],
        aggregations=[
            Aggregation(
                aggfunction='min',
                columns=['Value1', 'Value2'],
                newcolumns=['min_Value1', 'min_Value2'],
            ),
            Aggregation(
                aggfunction='max',
                columns=['Value1', 'Value2'],
                newcolumns=['max_Value1', 'max_Value2'],
            ),
        ],
    ).execute(sample_df)

    assert_dataframes_equals(
        df_result,
        DataFrame(
            {
                'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'],
                'Group': ['Group 1'] * 3 + ['Group 2'] * 3,
                'Value1': [13, 7, 20, 1, 10, 5],
                'min_Value1': [7] * 3 + [1] * 3,
                'max_Value1': [20] * 3 + [10] * 3,
                'Value2': [10, 21, 4, 17, 12, 2],
                'min_Value2': [4] * 3 + [2] * 3,
                'max_Value2': [21] * 3 + [17] * 3,
            }
        ),
    )
예제 #4
0
def test_totals_2():
    sample_df = pd.DataFrame(
        {
            'COUNTRY': ['France', 'USA'] * 4,
            'PRODUCT': (['product A'] * 2 + ['product B'] * 2) * 2,
            'YEAR': ['2019'] * 4 + ['2020'] * 4,
            'VALUE_1': [5, 10, 10, 15, 20, 20, 30, 25],
            'VALUE_2': [50, 100, 100, 150, 200, 200, 300, 250],
        }
    )
    step = TotalsStep(
        name='totals',
        totalDimensions=[
            TotalDimension(total_column='COUNTRY', total_rows_label='All countries'),
            TotalDimension(total_column='PRODUCT', total_rows_label='All products'),
        ],
        aggregations=[
            Aggregation(
                columns=['VALUE_1', 'VALUE_2'],
                aggfunction='sum',
                newcolumns=['VALUE_1-sum', 'VALUE_2'],
            ),
            Aggregation(columns=['VALUE_1'], aggfunction='avg', newcolumns=['VALUE_1-avg']),
        ],
        groups=['YEAR'],
    )

    expected_result = sample_df.copy()
    expected_result['VALUE_1-sum'] = expected_result['VALUE_1-avg'] = expected_result['VALUE_1']
    del expected_result['VALUE_1']
    expected_result = pd.concat(
        [
            expected_result,
            pd.DataFrame(
                {
                    'COUNTRY': ['USA', 'France'] * 2 + ['All countries'] * 6,
                    'PRODUCT': ['All products'] * 4
                    + ['product B', 'product A'] * 2
                    + ['All products'] * 2,
                    'YEAR': (['2020'] * 2 + ['2019'] * 2) * 2 + ['2020', '2019'],
                    'VALUE_2': [450, 500, 250, 150, 550, 400, 250, 150, 950, 400],
                    'VALUE_1-sum': [45, 50, 25, 15, 55, 40, 25, 15, 95, 40],
                    'VALUE_1-avg': [22.5, 25, 12.5, 7.5, 27.5, 20, 12.5, 7.5, 23.75, 10],
                }
            ),
        ]
    )

    real_result = step.execute(sample_df)
    real_sorted = real_result.sort_values(by=real_result.columns.tolist())
    expected_sorted = expected_result.sort_values(by=expected_result.columns.tolist())
    assert_dataframes_equals(real_sorted, expected_sorted)
예제 #5
0
def test_single_totals_without_groups():
    sample_df = pd.DataFrame(
        {
            'COUNTRY': ['France', 'USA'] * 4,
            'PRODUCT': (['PRODUCT A'] * 2 + ['PRODUCT B'] * 2) * 2,
            'YEAR': ['2019'] * 4 + ['2020'] * 4,
            'VALUE': [5, 10, 10, 15, 20, 20, 30, 25],
        }
    )
    step = TotalsStep(
        name='totals',
        totalDimensions=[TotalDimension(total_column='COUNTRY', total_rows_label='All countries')],
        aggregations=[Aggregation(columns=['VALUE'], aggfunction='sum', newcolumns=['VALUE'])],
        groups=[],
    )

    expected_result = pd.concat(
        [
            sample_df,
            pd.DataFrame(
                {'COUNTRY': 'All countries', 'PRODUCT': [None], 'YEAR': [None], 'VALUE': [135]}
            ),
        ]
    )

    real_result = step.execute(sample_df)
    assert_dataframes_equals(real_result, expected_result)
예제 #6
0
def test_simple_aggregate_with_null():
    df = DataFrame(
        {
            'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'],
            'Group': ['Group 1'] * 3 + [None] * 3,  # type: ignore
            'Value1': [13, 7, 20, 1, 10, 5],
        }
    )
    df_result = AggregateStep(
        name='aggregate',
        on=['Group'],
        aggregations=[
            Aggregation(
                aggfunction='sum',
                columns=['Value1'],
                newcolumns=['Sum-Value1'],
            ),
        ],
    ).execute(df)

    assert_dataframes_equals(
        df_result.sort_values(by=['Group']),
        DataFrame(
            {
                'Group': ['Group 1', None],
                'Sum-Value1': [40, 16],
            }
        ).sort_values(by=['Group']),
    )
예제 #7
0
def test_count_with_null():
    df = DataFrame({
        'Label':
        ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'],
        'Group': ['Group 1'] * 3 + [None] * 3,  # type: ignore
        'Value1': [13, 7, 20, 1, 10, 5],
    })
    df_result = AggregateStep(
        name='aggregate',
        on=['Group'],
        aggregations=[
            Aggregation(
                aggfunction='count distinct including empty',
                columns=['Group'],
                newcolumns=['__VQB_COUNT'],
            ),
        ],
    ).execute(df)

    assert_dataframes_equals(
        df_result.sort_values(by=['Group']),
        DataFrame({
            'Group': ['Group 1', np.nan],
            '__VQB_COUNT': [3, 3],
        }).sort_values(by=['Group']),
    )
예제 #8
0
def test_to_dict():
    pipeline = Pipeline(steps=[
        DomainStep(name='domain', domain='foobar'),
        RollupStep(
            name='rollup',
            hierarchy=['a', 'b'],
            aggregations=[
                Aggregation(newcolumns=['a'], aggfunction='sum', columns=['a'])
            ],
        ),
    ])

    actual_dict = pipeline.dict()

    expected_dict = {
        'steps': [
            {
                'domain': 'foobar',
                'name': 'domain'
            },
            {
                'name':
                'rollup',
                'hierarchy': ['a', 'b'],
                'aggregations': [{
                    'new_columns': ['a'],
                    'agg_function': 'sum',
                    'columns': ['a']
                }],
            },
        ]
    }
    assert actual_dict == expected_dict
    assert pipeline == Pipeline(**pipeline.dict())
예제 #9
0
def test_keep_original_granularity_empty_on(sample_df):
    df_result = AggregateStep(
        name='aggregate',
        on=[],
        keepOriginalGranularity=True,
        aggregations=[
            Aggregation(aggfunction='count', columns=['Group'], newcolumns=['__vqb_count__']),
        ],
    ).execute(sample_df)

    assert_dataframes_equals(df_result, sample_df.assign(__vqb_count__=6))
예제 #10
0
def test_count(sample_df):
    df_result = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=False,
        on=['Group'],
        aggregations=[
            Aggregation(aggfunction='count', columns=['Label'], newcolumns=['count']),
        ],
    ).execute(sample_df)

    assert_dataframes_equals(
        df_result, DataFrame({'Group': ['Group 1', 'Group 2'], 'count': [3, 3]})
    )
예제 #11
0
def test_legacy_syntax(sample_df):
    df_result = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=False,
        on=[],
        aggregations=[
            Aggregation(**{'aggfunction': 'sum', 'column': 'Value1', 'newcolumn': 'sum_value'}),
        ],
    ).execute(sample_df)

    assert_dataframes_equals(
        df_result,
        DataFrame({'sum_value': [56]}),
    )
예제 #12
0
def test_without_on(sample_df):
    df_result = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=False,
        on=[],
        aggregations=[
            Aggregation(aggfunction='sum', columns=['Value1'], newcolumns=['sum_value']),
        ],
    ).execute(sample_df)

    assert_dataframes_equals(
        df_result,
        DataFrame({'sum_value': [56]}),
    )
예제 #13
0
def test_count_distinct(sample_df):
    df_result = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=False,
        on=[],
        aggregations=[
            Aggregation(
                aggfunction='count distinct',
                columns=['Group'],
                newcolumns=['Group_CD'],
            )
        ],
    ).execute(sample_df)

    assert_dataframes_equals(df_result, DataFrame({'Group_CD': [2]}))
예제 #14
0
def test_with_original_granularity(sample_df):
    df_result = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=True,
        on=['Group'],
        aggregations=[
            Aggregation(aggfunction='sum', columns=['Value1'], newcolumns=['Total']),
        ],
    ).execute(sample_df)

    assert_dataframes_equals(
        df_result,
        DataFrame(
            {
                'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'],
                'Group': ['Group 1'] * 3 + ['Group 2'] * 3,
                'Value1': [13, 7, 20, 1, 10, 5],
                'Total': [40] * 3 + [16] * 3,
                'Value2': [10, 21, 4, 17, 12, 2],
            }
        ),
    )