def test_gradient_boosting_regressor_success():
    """Execute the code generated by GradientBoostingRegressor on an iris slice.

    The operation is created with empty parameters and the value bound to
    the 'output data' port is compared with the first rows of the iris
    fixture.
    """
    n_rows = 10
    input_name = 'df'
    features = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'], n_rows)

    instance = GradientBoostingRegressor(
        parameters={},
        named_inputs={'input data': input_name},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {input_name: features})

    assert result['out'].equals(util.iris(size=n_rows))
Пример #2
0
def test_string_indexer_success():
    """Execute the code generated by StringIndexerOperation on an iris slice.

    The operation is created with empty parameters and the value bound to
    the 'output data' port is compared with the first rows of the iris
    fixture.
    """
    n_rows = 10
    input_name = 'df'
    features = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'], n_rows)

    instance = StringIndexerOperation(
        parameters={},
        named_inputs={'input data': input_name},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {input_name: features})

    assert result['out'].equals(util.iris(size=n_rows))
Пример #3
0
def test_drop_success():
    """Expect the iris slice without its 'class' column after DropOperation."""
    n_rows = 10
    input_name = 'df'
    iris_df = util.iris(size=n_rows)

    instance = DropOperation(
        parameters={DropOperation.ATTRIBUTES_PARAM: ['class']},
        named_inputs={'input data': input_name},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {input_name: iris_df})

    assert result['out'].equals(
        util.iris(size=n_rows).drop(['class'], axis=1))
Пример #4
0
def test_add_columns_aliases_param_success():
    """Expect the 'aliases' suffixes on the columns of both joined inputs.

    Both inputs carry the same two columns; the expected frame has them
    renamed with '_value0' (first input) and '_value1' (second input).
    """
    columns = ['sepallength', 'sepalwidth']
    left_df = util.iris(columns, size=10)
    right_df = util.iris(columns, size=10)
    expected = util.iris(
        ['sepallength', 'sepalwidth', 'sepallength', 'sepalwidth'], size=10)

    instance = AddColumnsOperation(
        parameters={'aliases': '_value0,_value1'},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {'df1': left_df, 'df2': right_df})

    expected.columns = [
        'sepallength_value0', 'sepalwidth_value0',
        'sepallength_value1', 'sepalwidth_value1',
    ]
    assert result['out'].equals(expected)
Пример #5
0
def test_difference_col_intersection_success():
    """Difference of two frames that share only the 'sepallength' column.

    Rows 7-9 of the first input are overwritten with 1; the expected
    result is rows 7-14 of that input (8 rows).
    """
    first = util.iris(['sepallength'], size=15)
    second = util.iris(['petalwidth', 'sepallength'], size=10)
    first.loc[7:9, 'sepallength'] = 1
    # Snapshot taken before execution so the expectation is independent of it.
    snapshot = first.copy()

    instance = DifferenceOperation(
        parameters={},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {'df1': first, 'df2': second})

    output = result['out']
    assert output.equals(snapshot.loc[7:14, ['sepallength']])
    assert len(result['out']) == 8
Пример #6
0
def test_add_columns_success():
    """Joining the two column subsets is expected to rebuild the iris slice."""
    n_rows = 10
    left_name, right_name = 'df1', 'df2'
    left_df = util.iris(['sepallength', 'sepalwidth'], n_rows)
    right_df = util.iris(['petallength', 'petalwidth', 'class'], n_rows)

    instance = AddColumnsOperation(
        parameters={},
        named_inputs={
            'input data 1': left_name,
            'input data 2': right_name,
        },
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {left_name: left_df, right_name: right_df})

    assert result['out'].equals(util.iris(size=n_rows))
Пример #7
0
def test_clean_missing_remove_column_success():
    """REMOVE_COLUMN cleaning mode: a column holding a missing value is dropped.

    A NaN is planted in 'sepalwidth'; the expected result is the same iris
    slice without that column.
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df.loc[4, 'sepalwidth'] = np.nan
    arguments = {
        'parameters': {
            'attributes': ['sepalwidth'],
            'cleaning_mode': 'REMOVE_COLUMN'
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out'
        }
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    # NOTE(review): the result is read from 'output_data_1' rather than the
    # 'out' name given in named_outputs -- confirm this matches the variable
    # the generated code actually binds.
    assert result['output_data_1'].equals(
        util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                  10).drop(columns=['sepalwidth']))
Пример #8
0
def test_difference_input2_is_bigger_success():
    """Difference when the second input has more rows than the first.

    The expected output keeps the 'petalwidth' column but has no rows.
    Open question from the original author: should it return nothing at all?
    """
    smaller = util.iris(['petalwidth'], size=10)
    bigger = util.iris(['petalwidth'], size=15)
    # Dropping every row leaves an empty frame that still has the column.
    expected = smaller.copy().drop(range(10))

    instance = DifferenceOperation(
        parameters={},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {'df1': smaller, 'df2': bigger})

    assert result['out'].equals(expected)
    assert len(result['out']) == 0
Пример #9
0
def test_balanced_split_k_fold_shuffle_success():
    """Shuffled 3-fold split over 30 rows relabelled into three classes.

    Checks the per-group class percentages reported by ``percent`` and the
    exact 'groups' column expected for random_state=18.
    """
    df = util.iris(['class'], size=30)
    df.loc[10:20, 'class'] = 'Iris-versicolor'
    # Label-based slice: this also overwrites row 20 assigned just above.
    df.loc[20:30, 'class'] = 'Iris-virginica'
    snapshot = df.copy()

    instance = SplitKFoldOperation(
        parameters={
            'n_splits': 3,
            'shuffle': 1,
            'attribute': 'groups',
            'stratified': 0,
            'random_state': 18,
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    expected_groups = pd.DataFrame({
        "groups": [
            0, 0, 2, 0, 1, 2, 0, 1, 2, 0, 2, 1, 0, 0, 2, 1, 1, 2, 2, 2, 0, 0,
            2, 1, 2, 1, 1, 1, 0, 1
        ]
    })
    test_out = pd.concat([snapshot, expected_groups], axis=1)
    count_percent = percent(df=result['out'], n_groups=3)

    group_names = ['group0', 'group1', 'group2']
    # Expected share of each class in group0, group1 and group2.
    expected_by_class = {
        'Iris-setosa': [50, 20, 30],
        'Iris-versicolor': [20, 30, 50],
        'Iris-virginica': [30, 50, 20],
    }
    for label, shares in expected_by_class.items():
        assert [count_percent[name][label] for name in group_names] == shares

    assert result['out'].equals(test_out)
Пример #10
0
def test_uneven_split_k_fold_success():
    """Unshuffled 4-fold split of 30 rows, which cannot be divided evenly.

    Some groups therefore report non-integer class percentages; those are
    compared with a relative tolerance.
    """
    df = util.iris(['class'], size=30)
    df.loc[10:20, 'class'] = 'Iris-versicolor'
    # Label-based slice: this also overwrites row 20 assigned just above.
    df.loc[20:29, 'class'] = 'Iris-virginica'
    snapshot = df.copy()

    instance = SplitKFoldOperation(
        parameters={
            'n_splits': 4,
            'shuffle': 0,
            'attribute': 'groups',
            'stratified': 0,
            'random_state': 0,
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    expected_groups = pd.DataFrame({
        "groups": [
            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
            2, 3, 3, 3, 3, 3, 3, 3
        ]
    })
    test_out = pd.concat([snapshot, expected_groups], axis=1)
    count_percent = percent(result['out'], 4)

    assert count_percent['group0']['Iris-setosa'] == 100

    second = count_percent['group1']
    assert [second['Iris-setosa'], second['Iris-versicolor']] == [25, 75]

    # The expected shares are truncated decimals, so compare with a relative
    # tolerance instead of exact equality.
    third = count_percent['group2']
    assert [
        third['Iris-versicolor'],
        third['Iris-virginica']
    ] == pytest.approx([57.14, 42.85], rel=0.1)

    assert count_percent['group3']['Iris-virginica'] == 100

    assert result['out'].equals(test_out)
Пример #11
0
def test_unbalanced_split_k_fold_shuffle_success():
    """Shuffled 3-fold split where 'Iris-versicolor' dominates the data.

    Checks the per-group class percentages reported by ``percent`` and the
    exact 'groups' column expected for random_state=0.
    """
    df = util.iris(['class'], size=30)
    df.loc[8:27, 'class'] = 'Iris-versicolor'
    # Label-based slice: this also overwrites row 27 assigned just above.
    df.loc[27:29, 'class'] = 'Iris-virginica'
    snapshot = df.copy()

    instance = SplitKFoldOperation(
        parameters={
            'n_splits': 3,
            'shuffle': 1,
            'attribute': 'groups',
            'stratified': 0,
            'random_state': 0,
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    expected_groups = pd.DataFrame({
        "groups": [
            2, 1, 0, 2, 1, 1, 1, 2, 1, 2, 0, 0, 2, 0, 1, 2, 1, 0, 2, 2, 1, 2,
            0, 1, 0, 2, 0, 0, 0, 1
        ]
    })
    test_out = pd.concat([snapshot, expected_groups], axis=1)
    count_percent = percent(result['out'], 3)

    classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    assert [count_percent['group0'][c] for c in classes] == [10, 70, 20]
    assert [count_percent['group1'][c] for c in classes] == [40, 50, 10]
    # Only the first two classes are checked for group2.
    assert [count_percent['group2'][c] for c in classes[:2]] == [30, 70]

    assert result['out'].equals(test_out)
Пример #12
0
def test_union_success():
    """Union of two 10-row frames with disjoint columns.

    The expected result is a row-wise concat of both inputs with a fresh
    index, 20 rows in total.
    """
    sepal_df = util.iris(['sepallength', 'sepalwidth'], size=10)
    petal_df = util.iris(['petalwidth', 'petallength'], size=10)
    # Copies taken before execution so the expectation is independent of it.
    sepal_copy, petal_copy = sepal_df.copy(), petal_df.copy()

    instance = UnionOperation(
        parameters={},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {'df1': sepal_df, 'df2': petal_df})

    assert result['out'].equals(
        pd.concat([sepal_copy, petal_copy],
                  sort=False, axis=0, ignore_index=True))
    assert len(result['out']) == 20
Пример #13
0
def test_add_columns_different_size_dataframes_success():
    """Inputs of 10 and 5 rows: the expected output has the smaller size (5)."""
    left_df = util.iris(['sepallength', 'sepalwidth'], size=10)
    right_df = util.iris(['petallength', 'petalwidth', 'class'], size=5)

    instance = AddColumnsOperation(
        parameters={},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {'df1': left_df, 'df2': right_df})

    all_columns = [
        'sepallength', 'sepalwidth',
        'petallength', 'petalwidth', 'class',
    ]
    assert result['out'].equals(util.iris(all_columns, size=5))
Пример #14
0
def test_difference_different_cols_success():
    """Inputs with no column in common: expect a frame with no rows or columns."""
    first = util.iris(['petalwidth'], size=20)
    second = util.iris(['class'], size=10)
    # Removing the only column and every row yields a completely empty frame.
    expected = first.copy().drop(columns='petalwidth', index=range(20))

    instance = DifferenceOperation(
        parameters={'attributes': 2},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {'df1': first, 'df2': second})

    assert result['out'].equals(expected)
    assert len(result['out']) == 0
Пример #15
0
def test_drop_success_missing_input_implies_no_code():
    """DropOperation built without any named input must report no code.

    No DataFrame is needed: the operation is only instantiated and its
    ``has_code`` flag inspected, nothing is executed.
    """
    arguments = {
        'parameters': {
            DropOperation.ATTRIBUTES_PARAM: ['class']
        },
        'named_inputs': {},
        'named_outputs': {
            'output data': 'out'
        }
    }
    instance = DropOperation(**arguments)
    assert not instance.has_code
Пример #16
0
def test_add_columns_repetead_column_names_success():
    """Repeated column names with no 'aliases' parameter given.

    The expected output carries the '_ds0' / '_ds1' suffixes on the columns
    of the first and second input respectively.
    """
    columns = ['sepallength', 'class']
    left_df = util.iris(columns, size=10)
    right_df = util.iris(columns, size=10)
    expected = util.iris(
        ['sepallength', 'class', 'sepallength', 'class'], size=10)

    instance = AddColumnsOperation(
        parameters={},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {'df1': left_df, 'df2': right_df})

    expected.columns = [
        'sepallength_ds0', 'class_ds0',
        'sepallength_ds1', 'class_ds1',
    ]
    assert result['out'].equals(expected)
Пример #17
0
def test_execute_sql_multiple_dataframes_success():
    """SELECT DISTINCT over two inputs (ds1, ds2) with explicit output names.

    The expectation is built in pandas: join both frames, sort by
    'sepalwidth' and drop duplicate rows with a fresh index.
    """
    class_df = util.iris(['class'], size=10)
    width_df = util.iris(['sepalwidth'], size=10)
    expected = (
        class_df.copy()
        .join(width_df.copy())
        .sort_values(by='sepalwidth')
        .drop_duplicates(ignore_index=True)
    )

    query = ('SELECT DISTINCT class, sepalwidth FROM ds1,'
             ' ds2 ORDER BY class, sepalwidth')
    instance = ExecuteSQLOperation(
        parameters={'query': query, 'names': 'class,sepalwidth'},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(),
                          {'df1': class_df, 'df2': width_df})

    assert result['out'].equals(expected)
Пример #18
0
def test_drop_missing_parameters_fail():
    """Constructing DropOperation without 'attributes' must raise ValueError."""
    n_rows = 10
    input_name, _frame = 'df', util.iris(size=n_rows)

    kwargs = dict(
        parameters={},
        named_inputs={'input data': input_name},
        named_outputs={'output data': 'out'})
    with pytest.raises(ValueError) as val_err:
        DropOperation(**kwargs)

    message = str(val_err.value)
    assert "'attributes' must be informed for task" in message
Пример #19
0
def test_execute_sql_names_param_not_informed_success():
    """A plain SELECT without 'names' is expected to return the input as is."""
    source = util.iris(['class', 'sepalwidth'], size=10)
    expected = source.copy()

    instance = ExecuteSQLOperation(
        parameters={'query': 'SELECT class, sepalwidth FROM ds1'},
        named_inputs={'input data 1': 'df1'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df1': source})

    assert result['out'].equals(expected)
Пример #20
0
def test_aggregation_multiple_attributes_and_functions_success():
    """Group by 'class' and apply several aggregations, each with an alias.

    The 'function' parameter takes one dict per aggregation with the keys
    'attribute', 'f' and 'alias'. Here 'sepalwidth' gets 'sum' and 'size',
    'petalwidth' gets 'min' and 'max'; the expectation is the equivalent
    pandas named aggregation.
    """
    df = util.iris(['sepalwidth', 'petalwidth', 'class'], size=150)
    snapshot = df.copy()

    # (attribute, function, alias) triples, expanded into the dict form below.
    specs = (
        ('sepalwidth', 'sum', 'sepal_sum'),
        ('sepalwidth', 'size', 'sepal_size'),
        ('petalwidth', 'min', 'petal_min'),
        ('petalwidth', 'max', 'petal_max'),
    )
    functions = [
        {'attribute': attribute, 'f': func, 'alias': alias}
        for attribute, func, alias in specs
    ]

    instance = AggregationOperation(
        parameters={'attributes': ['class'], 'function': functions},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': df})

    grouped = snapshot.groupby(['class'])
    expected = grouped.agg(
        sepal_sum=("sepalwidth", "sum"),
        sepal_size=("sepalwidth", "size"),
        petal_min=("petalwidth", "min"),
        petal_max=("petalwidth", "max"),
    ).reset_index()
    assert result['out'].equals(expected)
Пример #21
0
def test_feature_assembler_missing_multiplicity_param_fail():
    """Without 'multiplicity', running the generated code raises KeyError."""
    features = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'], size=10)

    instance = FeatureAssemblerOperation(
        parameters={'attributes': ['sepalwidth', 'petalwidth']},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(KeyError) as key_err:
        util.execute(instance.generate_code(), {'df': features})

    message = str(key_err.value)
    assert "'multiplicity'" in message
Пример #22
0
def test_distinct_missing_attributes_param_success():
    """DistinctOperation with no 'attributes' parameter.

    Rows 0-3 and 6-9 are overwritten with repeated values; the expected
    output keeps only the first row of each repeated run.
    """
    source = util.iris(['sepallength'], size=10)
    source.loc[0:3, 'sepallength'] = 'test'
    source.loc[6:9, 'sepallength'] = 'distinct'
    snapshot = source.copy()

    instance = DistinctOperation(
        parameters={},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})
    result = util.execute(instance.generate_code(), {'df': source})

    duplicated_rows = [1, 2, 3, 7, 8, 9]
    assert result['out'].equals(snapshot.drop(index=duplicated_rows))
Пример #23
0
def test_drop_invalid_attribute_param_fail():
    """Dropping a column that does not exist raises KeyError at execution."""
    iris_df = util.iris(size=10)

    instance = DropOperation(
        parameters={'attributes': ['invalid']},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'})

    with pytest.raises(KeyError) as key_err:
        util.execute(instance.generate_code(), {'df': iris_df})

    message = str(key_err.value)
    assert "['invalid'] not found in axis" in message
Пример #24
0
def test_execute_sql_column_not_found_fail():
    """Selecting an unknown column raises PandaSQLException at execution."""
    source = util.iris(['class', 'sepalwidth'], size=10)

    instance = ExecuteSQLOperation(
        parameters={'query': 'SELECT unknown FROM ds1'},
        named_inputs={'input data 1': 'df1'},
        named_outputs={'output data': 'out'})

    with pytest.raises(pandasql.PandaSQLException) as psql_err:
        util.execute(instance.generate_code(), {'df1': source})

    expected_fragment = "(sqlite3.OperationalError) no such column: unknown"
    assert expected_fragment in str(psql_err.value)
Пример #25
0
def test_select_invalid_and_valid_attributes_param_fail():
    """Selecting one existing and one missing column raises KeyError.

    The input frame has no 'class' column, so the error message is expected
    to name it.
    """
    features = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'], size=10)

    instance = SelectOperation(
        parameters={'attributes': ['sepallength', 'class']},
        named_inputs={'input data': 'df'},
        named_outputs={'output projected data': 'out'})

    with pytest.raises(KeyError) as key_err:
        util.execute(instance.generate_code(), {'df': features})

    message = str(key_err.value)
    assert "['class'] not in index" in message
Пример #26
0
def test_feature_assembler_invalid_multiplicity_param_fail():
    """A non-mapping 'multiplicity' value raises TypeError at execution."""
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    arguments = {
        'parameters': {
            'attributes': ['sepalwidth', 'petalwidth'],
            'multiplicity': 'invalid'
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out'
        }
    }
    instance = FeatureAssemblerOperation(**arguments)
    with pytest.raises(TypeError) as typ_err:
        util.execute(instance.generate_code(), {'df': df})
    # Inspect the raised exception itself (.value), as the other failure
    # tests do; str() of the ExceptionInfo wrapper is a pytest-version
    # dependent representation, not the exception message.
    assert "string indices must be integers" in str(typ_err.value)
Пример #27
0
def test_split_invalid_seed_param_fail():
    """A non-numeric 'seed' makes the generated code raise NameError."""
    features = util.iris(
        ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'], size=10)

    instance = SplitOperation(
        parameters={'seed': 'invalid'},
        named_inputs={'input data': 'df'},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        })

    with pytest.raises(NameError) as nam_err:
        util.execute(instance.generate_code(), {'df': features})

    message = str(nam_err.value)
    assert "name 'invalid' is not defined" in message
Пример #28
0
def test_execute_sql_wrong_number_of_attributes_informed_fail():
    """One name supplied for a two-column result raises ValueError."""
    source = util.iris(['class', 'sepalwidth'], size=10)

    instance = ExecuteSQLOperation(
        parameters={
            'query': 'SELECT class, sepalwidth FROM ds1',
            'names': 'class',
        },
        named_inputs={'input data 1': 'df1'},
        named_outputs={'output data': 'out'})

    with pytest.raises(ValueError) as val_err:
        util.execute(instance.generate_code(), {'df1': source})

    expected_message = ("Invalid names. Number of attributes in"
                        " result differs from names informed.")
    assert expected_message in str(val_err.value)
Пример #29
0
def test_sample_or_partition_head_success():
    """'head' sampling with value 2 is expected to return the first two rows."""
    features = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'], size=10)
    snapshot = features.copy()

    instance = SampleOrPartitionOperation(
        parameters={'type': 'head', 'value': 2},
        named_inputs={'input data': 'df'},
        named_outputs={'sampled data': 'out'})
    result = util.execute(instance.generate_code(), {'df': features})

    assert len(result['out']) == 2
    first_two = snapshot.iloc[:2, :]
    assert result['out'].equals(first_two)
Пример #30
0
def test_clean_missing_fill_mean_success():
    """MEAN cleaning mode: the missing value is replaced by the column mean.

    A NaN is planted in row 0 of 'sepalwidth'; after cleaning, that cell is
    expected to equal the mean of the column computed with the NaN skipped
    (pandas' default for ``Series.mean``).
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df.loc[0, 'sepalwidth'] = np.nan
    # Copy of the column (still holding the NaN) taken before execution.
    sepalwidth_before = df.copy().loc[:, 'sepalwidth']
    arguments = {
        'parameters': {
            'attributes': ['sepalwidth'],
            'cleaning_mode': 'MEAN'
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out'
        }
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    # NOTE(review): the result is read from 'output_data_1' rather than the
    # 'out' name given in named_outputs -- confirm this matches the variable
    # the generated code actually binds.
    assert result['output_data_1'].loc[0, 'sepalwidth'] == \
        sepalwidth_before.mean()