def test_totals_nogroup_3_dimensions(): sample_df = pd.DataFrame( { 'COUNTRY': ['France', 'USA'] * 4, 'PRODUCT': (['product A'] * 2 + ['product B'] * 2) * 2, 'YEAR': ['2019'] * 4 + ['2020'] * 4, 'VALUE_1': [5, 10, 10, 15, 20, 20, 30, 25], } ) step = TotalsStep( name='totals', totalDimensions=[ TotalDimension(total_column='COUNTRY', total_rows_label='All countries'), TotalDimension(total_column='PRODUCT', total_rows_label='All products'), TotalDimension(total_column='YEAR', total_rows_label='All years'), ], aggregations=[ Aggregation( columns=['VALUE_1'], aggfunction='sum', newcolumns=['VALUE_1-sum'], ), Aggregation(columns=['VALUE_1'], aggfunction='avg', newcolumns=['VALUE_1-avg']), ], ) real_result = step.execute(sample_df) assert real_result[real_result['YEAR'] == 'All years'].count()['YEAR'] == 9 assert real_result[real_result['COUNTRY'] == 'All countries'].count()['COUNTRY'] == 9 assert real_result[real_result['PRODUCT'] == 'All products'].count()['PRODUCT'] == 9 # could be any column assert real_result.count()['YEAR'] == 27
def test_simple_aggregate(sample_df): df_result = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='sum', columns=['Value1', 'Value2'], newcolumns=['Sum-Value1', 'Sum-Value2'], ), Aggregation(aggfunction='avg', columns=['Value1'], newcolumns=['Avg-Value1']), ], ).execute(sample_df) assert_dataframes_equals( df_result.sort_values(by=['Group']), DataFrame( { 'Group': ['Group 1', 'Group 2'], 'Sum-Value1': [40, 16], 'Sum-Value2': [35, 31], 'Avg-Value1': [np.average([13, 7, 20]), np.average([1, 10, 5])], } ).sort_values(by=['Group']), )
def test_with_original_granularity_multiple_aggregations_multiple_columns(sample_df): df_result = AggregateStep( name='aggregate', keepOriginalGranularity=True, on=['Group'], aggregations=[ Aggregation( aggfunction='min', columns=['Value1', 'Value2'], newcolumns=['min_Value1', 'min_Value2'], ), Aggregation( aggfunction='max', columns=['Value1', 'Value2'], newcolumns=['max_Value1', 'max_Value2'], ), ], ).execute(sample_df) assert_dataframes_equals( df_result, DataFrame( { 'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'], 'Group': ['Group 1'] * 3 + ['Group 2'] * 3, 'Value1': [13, 7, 20, 1, 10, 5], 'min_Value1': [7] * 3 + [1] * 3, 'max_Value1': [20] * 3 + [10] * 3, 'Value2': [10, 21, 4, 17, 12, 2], 'min_Value2': [4] * 3 + [2] * 3, 'max_Value2': [21] * 3 + [17] * 3, } ), )
def test_totals_2(): sample_df = pd.DataFrame( { 'COUNTRY': ['France', 'USA'] * 4, 'PRODUCT': (['product A'] * 2 + ['product B'] * 2) * 2, 'YEAR': ['2019'] * 4 + ['2020'] * 4, 'VALUE_1': [5, 10, 10, 15, 20, 20, 30, 25], 'VALUE_2': [50, 100, 100, 150, 200, 200, 300, 250], } ) step = TotalsStep( name='totals', totalDimensions=[ TotalDimension(total_column='COUNTRY', total_rows_label='All countries'), TotalDimension(total_column='PRODUCT', total_rows_label='All products'), ], aggregations=[ Aggregation( columns=['VALUE_1', 'VALUE_2'], aggfunction='sum', newcolumns=['VALUE_1-sum', 'VALUE_2'], ), Aggregation(columns=['VALUE_1'], aggfunction='avg', newcolumns=['VALUE_1-avg']), ], groups=['YEAR'], ) expected_result = sample_df.copy() expected_result['VALUE_1-sum'] = expected_result['VALUE_1-avg'] = expected_result['VALUE_1'] del expected_result['VALUE_1'] expected_result = pd.concat( [ expected_result, pd.DataFrame( { 'COUNTRY': ['USA', 'France'] * 2 + ['All countries'] * 6, 'PRODUCT': ['All products'] * 4 + ['product B', 'product A'] * 2 + ['All products'] * 2, 'YEAR': (['2020'] * 2 + ['2019'] * 2) * 2 + ['2020', '2019'], 'VALUE_2': [450, 500, 250, 150, 550, 400, 250, 150, 950, 400], 'VALUE_1-sum': [45, 50, 25, 15, 55, 40, 25, 15, 95, 40], 'VALUE_1-avg': [22.5, 25, 12.5, 7.5, 27.5, 20, 12.5, 7.5, 23.75, 10], } ), ] ) real_result = step.execute(sample_df) real_sorted = real_result.sort_values(by=real_result.columns.tolist()) expected_sorted = expected_result.sort_values(by=expected_result.columns.tolist()) assert_dataframes_equals(real_sorted, expected_sorted)
def test_single_totals_without_groups(): sample_df = pd.DataFrame( { 'COUNTRY': ['France', 'USA'] * 4, 'PRODUCT': (['PRODUCT A'] * 2 + ['PRODUCT B'] * 2) * 2, 'YEAR': ['2019'] * 4 + ['2020'] * 4, 'VALUE': [5, 10, 10, 15, 20, 20, 30, 25], } ) step = TotalsStep( name='totals', totalDimensions=[TotalDimension(total_column='COUNTRY', total_rows_label='All countries')], aggregations=[Aggregation(columns=['VALUE'], aggfunction='sum', newcolumns=['VALUE'])], groups=[], ) expected_result = pd.concat( [ sample_df, pd.DataFrame( {'COUNTRY': 'All countries', 'PRODUCT': [None], 'YEAR': [None], 'VALUE': [135]} ), ] ) real_result = step.execute(sample_df) assert_dataframes_equals(real_result, expected_result)
def test_simple_aggregate_with_null(): df = DataFrame( { 'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'], 'Group': ['Group 1'] * 3 + [None] * 3, # type: ignore 'Value1': [13, 7, 20, 1, 10, 5], } ) df_result = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='sum', columns=['Value1'], newcolumns=['Sum-Value1'], ), ], ).execute(df) assert_dataframes_equals( df_result.sort_values(by=['Group']), DataFrame( { 'Group': ['Group 1', None], 'Sum-Value1': [40, 16], } ).sort_values(by=['Group']), )
def test_count_with_null(): df = DataFrame({ 'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'], 'Group': ['Group 1'] * 3 + [None] * 3, # type: ignore 'Value1': [13, 7, 20, 1, 10, 5], }) df_result = AggregateStep( name='aggregate', on=['Group'], aggregations=[ Aggregation( aggfunction='count distinct including empty', columns=['Group'], newcolumns=['__VQB_COUNT'], ), ], ).execute(df) assert_dataframes_equals( df_result.sort_values(by=['Group']), DataFrame({ 'Group': ['Group 1', np.nan], '__VQB_COUNT': [3, 3], }).sort_values(by=['Group']), )
def test_to_dict(): pipeline = Pipeline(steps=[ DomainStep(name='domain', domain='foobar'), RollupStep( name='rollup', hierarchy=['a', 'b'], aggregations=[ Aggregation(newcolumns=['a'], aggfunction='sum', columns=['a']) ], ), ]) actual_dict = pipeline.dict() expected_dict = { 'steps': [ { 'domain': 'foobar', 'name': 'domain' }, { 'name': 'rollup', 'hierarchy': ['a', 'b'], 'aggregations': [{ 'new_columns': ['a'], 'agg_function': 'sum', 'columns': ['a'] }], }, ] } assert actual_dict == expected_dict assert pipeline == Pipeline(**pipeline.dict())
def test_keep_original_granularity_empty_on(sample_df): df_result = AggregateStep( name='aggregate', on=[], keepOriginalGranularity=True, aggregations=[ Aggregation(aggfunction='count', columns=['Group'], newcolumns=['__vqb_count__']), ], ).execute(sample_df) assert_dataframes_equals(df_result, sample_df.assign(__vqb_count__=6))
def test_count(sample_df): df_result = AggregateStep( name='aggregate', keepOriginalGranularity=False, on=['Group'], aggregations=[ Aggregation(aggfunction='count', columns=['Label'], newcolumns=['count']), ], ).execute(sample_df) assert_dataframes_equals( df_result, DataFrame({'Group': ['Group 1', 'Group 2'], 'count': [3, 3]}) )
def test_legacy_syntax(sample_df): df_result = AggregateStep( name='aggregate', keepOriginalGranularity=False, on=[], aggregations=[ Aggregation(**{'aggfunction': 'sum', 'column': 'Value1', 'newcolumn': 'sum_value'}), ], ).execute(sample_df) assert_dataframes_equals( df_result, DataFrame({'sum_value': [56]}), )
def test_without_on(sample_df): df_result = AggregateStep( name='aggregate', keepOriginalGranularity=False, on=[], aggregations=[ Aggregation(aggfunction='sum', columns=['Value1'], newcolumns=['sum_value']), ], ).execute(sample_df) assert_dataframes_equals( df_result, DataFrame({'sum_value': [56]}), )
def test_count_distinct(sample_df): df_result = AggregateStep( name='aggregate', keepOriginalGranularity=False, on=[], aggregations=[ Aggregation( aggfunction='count distinct', columns=['Group'], newcolumns=['Group_CD'], ) ], ).execute(sample_df) assert_dataframes_equals(df_result, DataFrame({'Group_CD': [2]}))
def test_with_original_granularity(sample_df): df_result = AggregateStep( name='aggregate', keepOriginalGranularity=True, on=['Group'], aggregations=[ Aggregation(aggfunction='sum', columns=['Value1'], newcolumns=['Total']), ], ).execute(sample_df) assert_dataframes_equals( df_result, DataFrame( { 'Label': ['Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5', 'Label 6'], 'Group': ['Group 1'] * 3 + ['Group 2'] * 3, 'Value1': [13, 7, 20, 1, 10, 5], 'Total': [40] * 3 + [16] * 3, 'Value2': [10, 21, 4, 17, 12, 2], } ), )