Пример #1
0
def test_aggregation_multiple_functions_success():
    """Aggregate 'sepalwidth' per 'class' with the list from ``return_funcs``.

    The frame produced by the generated code must equal a hand-written
    pandas named aggregation over the same iris sample.
    """
    df = util.iris(['class', 'sepalwidth'], size=150)
    # Snapshot taken before the generated code gets to touch ``df``.
    expected = df.copy()

    parameters = {
        'attributes': ['class'],
        'function': return_funcs('sepalwidth'),
    }
    instance = AggregationOperation(
        parameters=parameters,
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    # alias -> (source column, aggregation); order defines column order.
    named_aggs = {
        'sepal_avg': ('sepalwidth', 'mean'),
        'sepal_collect_list': ('sepalwidth', _collect_list),
        'sepal_collect_set': ('sepalwidth', _collect_set),
        'sepal_count': ('sepalwidth', 'count'),
        'sepal_first': ('sepalwidth', 'first'),
        'sepal_last': ('sepalwidth', 'last'),
        'sepal_max': ('sepalwidth', 'max'),
        'sepal_min': ('sepalwidth', 'min'),
        'sepal_sum': ('sepalwidth', 'sum'),
        'sepal_size': ('sepalwidth', 'size'),
    }
    expected = expected.groupby('class').agg(**named_aggs).reset_index()
    assert result['out'].equals(expected)
Пример #2
0
def test_aggregation_asterisk_success():
    """Count per 'class' when the aggregated attribute is given as '*'.

    The expected frame is a plain pandas count of 'class' grouped by
    'class', exposed under the alias 'class_count'.
    """
    df = util.iris(['class'], size=150)
    # Snapshot taken before the generated code gets to touch ``df``.
    expected = df.copy()

    count_all = {'attribute': '*', 'f': 'count', 'alias': 'class_count'}
    instance = AggregationOperation(
        parameters={'attributes': ['class'], 'function': [count_all]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    grouped = expected.groupby(['class'])
    expected = grouped.agg(class_count=('class', 'count')).reset_index()
    assert result['out'].equals(expected)
Пример #3
0
def test_aggregation_pivot_table_success():
    """Pivot on 'class' while counting 'petalwidth' grouped by 'petalwidth'.

    The expected frame is built with ``pd.pivot_table`` and its multi-level
    column labels are flattened to 'attribute_function_pivotvalue' names.
    """
    df = util.iris(['class', 'sepalwidth', 'petalwidth'], size=150)
    # Snapshot taken before the generated code gets to touch ``df``.
    expected = df.copy()

    parameters = {
        'attributes': ['petalwidth'],
        'function': [{'attribute': 'petalwidth', 'f': 'count'}],
        'pivot': ['class'],
    }
    instance = AggregationOperation(
        parameters=parameters,
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    expected = pd.pivot_table(expected,
                              index=['petalwidth'],
                              columns=['class'],
                              aggfunc={"petalwidth": ['count']})
    expected = expected.reset_index()

    flat_names = []
    for label in expected.columns:
        if label[1] == '':
            # Only the first level is set: keep that name as it is.
            flat_names.append(label[0])
        else:
            flat_names.append("%s_%s_%s" % (label[0], label[1], label[2]))
    expected.columns = flat_names
    assert result['out'].equals(expected)
Пример #4
0
def test_aggregation_non_numeric_attributes_success():
    """Aggregate the 'homedest' column using ``return_funcs`` minus 'avg'.

    The result must equal a hand-written pandas named aggregation that
    contains every listed function except the mean.
    """
    df = util.titanic(['homedest'], size=150)
    # Snapshot taken before the generated code gets to touch ``df``.
    expected = df.copy()

    parameters = {
        'attributes': ['homedest'],
        'function': return_funcs('homedest', drop='avg'),
    }
    instance = AggregationOperation(
        parameters=parameters,
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    # alias -> (source column, aggregation); order defines column order.
    named_aggs = {
        'home_collect_list': ('homedest', _collect_list),
        'home_collect_set': ('homedest', _collect_set),
        'home_count': ('homedest', 'count'),
        'home_first': ('homedest', 'first'),
        'home_last': ('homedest', 'last'),
        'home_max': ('homedest', 'max'),
        'home_min': ('homedest', 'min'),
        'home_sum': ('homedest', 'sum'),
        'home_size': ('homedest', 'size'),
    }
    expected = expected.groupby(['homedest']).agg(**named_aggs).reset_index()
    assert result['out'].equals(expected)
Пример #5
0
def test_aggregation_rows_minimal_params_success():
    """Compare generated code for one AVG over 'income' grouped by 'country'.

    The generated source and the expected source are both parsed and
    compared as ASTs, so formatting differences are irrelevant.
    """
    avg_income = {'attribute': 'income', 'f': 'AVG', 'alias': 'avg_income'}
    parameters = {
        AggregationOperation.FUNCTION_PARAM: [avg_income],
        AggregationOperation.ATTRIBUTES_PARAM: ['country'],
    }
    inputs = {'input data': 'input_1'}
    outputs = {'output data': 'output_1'}

    instance = AggregationOperation(parameters,
                                    named_inputs=inputs,
                                    named_outputs=outputs)
    code = instance.generate_code()

    expected_code = dedent("""
          def _collect_list(x):
              return x.tolist()
          
          def _merge_set(x):
              return set(x.tolist())
          
          
          columns = ['country']
          target = {'income': ['avg_income']}
          operations = {'income': ['AVG']}
          
          output_1 = input_1.groupby(columns).agg(operations)
          new_idx = []
          i = 0
          old = None
          for (n1, n2) in output_1.columns.ravel():
              if old != n1:
                  old = n1
                  i = 0
              new_idx.append(target[n1][i])
              i += 1
          
          output_1.columns = new_idx
          output_1 = output_1.reset_index()
          output_1.reset_index(drop=True, inplace=True)
        """)

    same, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    # The side-by-side listing is only built when the comparison fails.
    assert same, msg + format_code_comparison(code, expected_code)
Пример #6
0
def test_aggregation_missing_input_implies_no_code_success():
    """With no named inputs, ``generate_code`` is expected to return None."""
    class_count = {'attribute': 'class', 'f': 'count', 'alias': 'class_count'}
    instance = AggregationOperation(
        parameters={'attributes': ['class'], 'function': [class_count]},
        named_inputs={},
        named_outputs={'output data': 'out'})
    assert instance.generate_code() is None
Пример #7
0
def test_aggregation_multiple_attributes_and_functions_success():
    """Several function entries may be given, each with its own settings.

    Every entry carries 'attribute', 'f' and 'alias'. Here 'sepalwidth'
    gets 'sum' and 'size' while 'petalwidth' gets 'min' and 'max', each
    under its own alias, all grouped by 'class'.
    """
    df = util.iris(['sepalwidth', 'petalwidth', 'class'], size=150)
    # Snapshot taken before the generated code gets to touch ``df``.
    expected = df.copy()

    # (attribute, function, alias) - drives both the request and the
    # expected pandas aggregation so the two cannot drift apart.
    specs = [
        ('sepalwidth', 'sum', 'sepal_sum'),
        ('sepalwidth', 'size', 'sepal_size'),
        ('petalwidth', 'min', 'petal_min'),
        ('petalwidth', 'max', 'petal_max'),
    ]
    functions = [{'attribute': attribute, 'f': func, 'alias': alias}
                 for attribute, func, alias in specs]

    instance = AggregationOperation(
        parameters={'attributes': ['class'], 'function': functions},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    named_aggs = {alias: (attribute, func)
                  for attribute, func, alias in specs}
    expected = expected.groupby(['class']).agg(**named_aggs).reset_index()
    assert result['out'].equals(expected)
Пример #8
0
def test_aggregation_non_numeric_attributes_fail():
    """Running the full ``return_funcs`` list on 'homedest' must fail.

    Executing the generated code is expected to raise pandas' DataError
    with the message "No numeric types to aggregate".
    """
    df = util.titanic(['homedest'], size=150)
    parameters = {
        'attributes': ['homedest'],
        'function': return_funcs('homedest'),
    }
    instance = AggregationOperation(
        parameters=parameters,
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(pd.core.base.DataError) as excinfo:
        util.execute(instance.generate_code(), {'df': df})
    message = str(excinfo.value)
    assert "No numeric types to aggregate" in message
Пример #9
0
def test_aggregation_with_pivot_values_success():
    """Compare generated code for a pivot on 'class' limited to values 1, 2.

    The generated source and the expected source are both parsed and
    compared as ASTs, so formatting differences are irrelevant.
    """
    max_fare = {"attribute": "fare", "f": "max", "alias": "sex"}
    parameters = {
        AggregationOperation.ATTRIBUTES_PARAM: ["sex"],
        AggregationOperation.FUNCTION_PARAM: [max_fare],
        "pivot": ["class"],
        "pivot_values": [1, 2],
    }
    inputs = {'input data': 'input_1'}
    outputs = {'output data': 'output_1'}

    instance = AggregationOperation(parameters,
                                    named_inputs=inputs,
                                    named_outputs=outputs)
    code = instance.generate_code()

    expected_code = dedent("""
    def _collect_list(x):
              return x.tolist()
    def _merge_set(x):
        return set(x.tolist())

    values = [1, 2]
    input_1 = input_1[input_1['class'].isin(values)]
    aggfunc = {'fare': ['max']}
    output_1 = pd.pivot_table(input_1, index=['sex'], values=['fare'],
                              columns=['class'], aggfunc=aggfunc)
    # rename columns and convert to DataFrame
    output_1.reset_index(inplace=True)
    new_idx = [n[0] if n[1] is ''
               else "%s_%s_%s" % (n[0],n[1], n[2])
               for n in output_1.columns.ravel()]    
    output_1 = pd.DataFrame(output_1.to_records())
    output_1.reset_index(drop=True, inplace=True)
    output_1 = output_1.drop(columns='index')
    output_1.columns = new_idx
    """)
    same, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert same, msg
Пример #10
0
def test_aggregation_success():
    """Run the operation with empty parameters on a 10-row iris sample.

    The output is compared against ``util.iris(size=slice_size)``.
    NOTE(review): no grouping attribute or function is configured and the
    expected frame is an unaggregated sample - confirm this is the
    intended behaviour for this operation.
    """
    slice_size = 10
    input_name = 'df'
    frame = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
        slice_size)

    instance = AggregationOperation(
        parameters={},
        named_inputs={'input data': input_name},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {input_name: frame})
    assert result['out'].equals(util.iris(size=slice_size))
Пример #11
0
def test_aggregation_missing_attribute_param_fail():
    """Leaving out 'attributes' must make the generated code fail.

    Execution is expected to raise a TypeError whose message asks for one
    of 'by' and 'level'.
    """
    df = util.iris(['class'], size=150)
    class_count = {'attribute': 'class', 'f': 'count', 'alias': 'class_count'}
    instance = AggregationOperation(
        parameters={'function': [class_count]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(TypeError) as excinfo:
        util.execute(instance.generate_code(), {'df': df})
    message = str(excinfo.value)
    assert "You have to supply one of 'by' and 'level'" in message
Пример #12
0
def test_aggregation_invalid_function_param_alias_fail():
    """An empty alias must make the generated code fail when executed.

    Execution is expected to raise a SyntaxError reporting "invalid syntax".
    """
    df = util.iris(['class'], size=150)
    no_alias = {'attribute': 'class', 'f': 'count', 'alias': ''}
    instance = AggregationOperation(
        parameters={'attributes': ['class'], 'function': [no_alias]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(SyntaxError) as excinfo:
        util.execute(instance.generate_code(), {'df': df})
    message = str(excinfo.value)
    assert "invalid syntax" in message
Пример #13
0
def test_aggregation_invalid_pivot_table_fail():
    """Passing 'pivot' as the bare string 'invalid' must fail at run time.

    Execution is expected to raise a NameError saying that the name
    'invalid' is not defined.
    """
    df = util.iris(['class', 'sepalwidth', 'petalwidth'], size=150)
    parameters = {
        'attributes': ['petalwidth'],
        'function': [{'attribute': 'petalwidth', 'f': 'count'}],
        'pivot': 'invalid',
    }
    instance = AggregationOperation(
        parameters=parameters,
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(NameError) as excinfo:
        util.execute(instance.generate_code(), {'df': df})
    message = str(excinfo.value)
    assert "name 'invalid' is not defined" in message
Пример #14
0
def test_aggregation_missing_function_param_fail():
    """Constructing the operation without 'function' must raise ValueError."""
    kwargs = dict(
        parameters={'attributes': ['class']},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(ValueError) as excinfo:
        AggregationOperation(**kwargs)
    message = str(excinfo.value)
    assert "Parameter 'function' must be informed for task" in message
Пример #15
0
def test_aggregation_invalid_function_param_fail():
    """A plain string as 'function' must raise TypeError at construction."""
    kwargs = dict(
        parameters={'attributes': ['class'], 'function': 'invalid'},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(TypeError) as excinfo:
        AggregationOperation(**kwargs)
    message = str(excinfo.value)
    assert "string indices must be integers" in message
Пример #16
0
def test_aggregation_missing_function_param_function_fail():
    """A function entry without the 'f' key must raise KeyError."""
    entry_without_f = {'attribute': 'class', 'alias': 'class_count'}
    kwargs = dict(
        parameters={'attributes': ['class'], 'function': [entry_without_f]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(KeyError) as excinfo:
        AggregationOperation(**kwargs)
    message = str(excinfo.value)
    # NOTE(review): a bare "f" substring is a weak check - any message that
    # contains the letter would pass; consider pinning the exact key.
    assert "f" in message
Пример #17
0
def test_aggregation_missing_function_param_failure():
    """Only grouping attributes and no function list must raise ValueError."""
    parameters = {AggregationOperation.ATTRIBUTES_PARAM: ['country']}
    inputs = {'input data': 'input_1'}
    outputs = {'output data': 'output_1'}

    with pytest.raises(ValueError):
        AggregationOperation(parameters,
                             named_inputs=inputs,
                             named_outputs=outputs)