Пример #1
0
def test_clean_missing_ratio_control_success():
    """
    REMOVE_COLUMN constrained by a missing-ratio window.

    'sepallength' and 'sepalwidth' are set entirely to NaN and 'petalwidth'
    gets a single NaN (frame built with size=10). With the window
    [0.025, 0.1] the test expects only 'petalwidth' to be dropped.

    Needs a better assertion...
    Ratio method is confusing.
    """
    columns = ['sepallength', 'sepalwidth', 'petalwidth', 'petallength']

    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df = util.iris(columns, size=10)
    df.loc[:, ['sepallength', 'sepalwidth']] = np.nan
    df.loc[0, 'petalwidth'] = np.nan

    # Independent frame with the same gaps, used to build the expectation.
    test = util.iris(columns, size=10)
    test.loc[:, ['sepallength', 'sepalwidth']] = np.nan
    test.loc[0, 'petalwidth'] = np.nan

    arguments = {
        'parameters': {
            'attributes': columns,
            'min_missing_ratio': 0.025,
            'max_missing_ratio': 0.1,
            'cleaning_mode': 'REMOVE_COLUMN',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    assert result['output_data_1'].equals(test.drop(columns=['petalwidth']))
Пример #2
0
def test_clean_missing_multiple_attributes_success():
    """
    REMOVE_ROW over several attributes.

    Each of the four columns gets NaN in a distinct pair of rows (labels
    0-7), so the expected output is the original frame without rows 0..7.
    """
    columns = ['sepallength', 'sepalwidth', 'petalwidth', 'petallength']
    df = util.iris(columns, size=10)
    # np.nan instead of np.NaN: the alias was removed in NumPy 2.0.
    # Note: .loc slices are label-based and inclusive on both ends.
    df.loc[0:1, 'sepallength'] = np.nan
    df.loc[2:3, 'sepalwidth'] = np.nan
    df.loc[4:5, 'petalwidth'] = np.nan
    df.loc[6:7, 'petallength'] = np.nan
    arguments = {
        'parameters': {
            'attributes': columns,
            'cleaning_mode': 'REMOVE_ROW',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    expected = util.iris(columns, size=10).drop(index=list(range(8)))
    assert result['output_data_1'].equals(expected)
Пример #3
0
def test_clean_missing_missing_input_implies_no_code_success():
    """With no named inputs, generate_code() is expected to return None."""
    parameters = {
        'attributes': ['sepalwidth'],
        'cleaning_mode': 'VALUE',
        'value': 'replaced',
    }
    operation = CleanMissingOperation(
        parameters=parameters,
        named_inputs={},
        named_outputs={'output data': 'out'},
    )
    generated = operation.generate_code()
    assert generated is None
Пример #4
0
def test_clean_wrong_ratio_param_failure():
    """Constructing the operation with ratios 1.7 / -1.0 must raise ValueError."""
    # NOTE(review): no attributes are supplied here, so the ValueError may be
    # raised by a different validation than the ratio check -- confirm.
    bad_ratio_params = {
        CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 1.7,
        CleanMissingOperation.MAX_MISSING_RATIO_PARAM: -1.0,
    }
    inputs = {'input data': 'input_1'}
    outputs = {'output data': 'output_1'}
    with pytest.raises(ValueError):
        CleanMissingOperation(bad_ratio_params,
                              named_inputs=inputs,
                              named_outputs=outputs)
Пример #5
0
def test_clean_missing_success():
    """
    Run the operation with empty parameters on a 10-row iris slice and
    compare the 'out' result against util.iris(size=10).
    """
    slice_size = 10
    input_name = 'df'
    frame = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
        slice_size)

    operation = CleanMissingOperation(
        parameters={},
        named_inputs={'input data': input_name},
        named_outputs={'output data': 'out'},
    )
    # The execution namespace maps the input variable name to its frame.
    namespace = {input_name: frame}
    result = util.execute(operation.generate_code(), namespace)
    expected = util.iris(size=slice_size)
    assert result['out'].equals(expected)
Пример #6
0
def test_clean_missing_fill_mean_success():
    """
    MEAN mode: the single NaN placed in 'sepalwidth' (row 0) is expected to
    be replaced by the mean of that column's remaining values.
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    # np.nan instead of np.NaN: the alias was removed in NumPy 2.0.
    df.loc[0, 'sepalwidth'] = np.nan
    # Snapshot the column before execution so the expectation cannot be
    # affected by whatever the generated code does to df.
    sepal_mean = df.copy().loc[:, 'sepalwidth']
    arguments = {
        'parameters': {
            'attributes': ['sepalwidth'],
            'cleaning_mode': 'MEAN',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    # Series.mean() skips NaN by default, so this is the mean of the
    # non-missing values.
    assert result['output_data_1'].loc[0, 'sepalwidth'] == sepal_mean.mean()
Пример #7
0
def test_clean_missing_missing_cleaning_mode_param_success():
    """
    Omitting 'cleaning_mode' must still yield executable code.

    Defaults to REMOVE_ROW
    """
    columns = ['sepallength', 'sepalwidth', 'petalwidth', 'petallength']
    df = util.iris(columns, size=10)

    operation = CleanMissingOperation(
        parameters={'attributes': columns},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    # Smoke test only: passes as long as generation and execution do not
    # raise; the result itself is not inspected.
    generated = operation.generate_code()
    util.execute(generated, {'df': df})
Пример #8
0
def test_clean_missing_remove_column_success():
    """
    REMOVE_COLUMN: 'sepalwidth' contains one NaN, so the expected output is
    the original frame without that column.
    """
    columns = ['sepallength', 'sepalwidth', 'petalwidth', 'petallength']
    df = util.iris(columns, size=10)
    # np.nan instead of np.NaN: the alias was removed in NumPy 2.0.
    df.loc[4, 'sepalwidth'] = np.nan
    arguments = {
        'parameters': {
            'attributes': ['sepalwidth'],
            'cleaning_mode': 'REMOVE_COLUMN',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    expected = util.iris(columns, size=10).drop(columns=['sepalwidth'])
    assert result['output_data_1'].equals(expected)
Пример #9
0
def test_clean_missing_minimal_params_type_value_success():
    """
    VALUE mode: the generated code must fill missing entries with the
    configured value. Checked for two values (200, then 1200) by comparing
    the AST of the generated code with the AST of the expected code.
    """
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
        CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 0.0,
        CleanMissingOperation.MAX_MISSING_RATIO_PARAM: 1.0,
        CleanMissingOperation.VALUE_PARAMETER: 200,
        CleanMissingOperation.CLEANING_MODE_PARAM: 'VALUE'
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}

    def assert_generated_matches(expected):
        # Build a fresh operation from the current params and compare ASTs.
        operation = CleanMissingOperation(params,
                                          named_inputs=n_in,
                                          named_outputs=n_out)
        generated = operation.generate_code()
        same, message = compare_ast(ast.parse(generated), ast.parse(expected))
        assert same, message + format_code_comparison(generated, expected)

    expected_code = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        output_1 = input_1
        for col in ['name']:
            ratio = input_1[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                output_1[col].fillna(value=200, inplace=True)
    """)
    assert_generated_matches(expected_code)

    # Second round with a different numeric fill value; the only '200' in
    # the expected code is the fillna argument, so a plain replace suffices.
    params[CleanMissingOperation.VALUE_PARAMETER] = 1200
    assert_generated_matches(expected_code.replace('200', '1200'))
Пример #10
0
def test_clean_missing_minimal_params_success():
    """
    With only attributes and the ratio bounds given, the generated code is
    expected to drop rows via dropna; verified by AST comparison.
    """
    operation = CleanMissingOperation(
        {
            CleanMissingOperation.ATTRIBUTES_PARAM: ['col1', 'col2'],
            CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 0.0,
            CleanMissingOperation.MAX_MISSING_RATIO_PARAM: 1.0,
        },
        named_inputs={'input data': 'input_1'},
        named_outputs={'output result': 'output_1'},
    )
    generated = operation.generate_code()
    expected_code = dedent("""
     min_missing_ratio = 0.0
     max_missing_ratio = 1.0
     output_1 = input_1
     for col in ['col1', 'col2']:
        ratio = input_1[col].isnull().sum()
        if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
            output_1.dropna(subset=col, axis='index', inplace=True)
    """)
    generated_tree = ast.parse(generated)
    expected_tree = ast.parse(expected_code)
    same, message = compare_ast(generated_tree, expected_tree)
    assert same, message + format_code_comparison(generated, expected_code)
Пример #11
0
def test_clean_missing_without_missing_rating_params_success():
    """
    When no ratio parameters are given, the generated code is expected to
    use the bounds 0.0 and 1.0; verified by AST comparison.
    """
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}
    instance = CleanMissingOperation(params,
                                     named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    # Read the attributes back through the same constant used to store
    # them, instead of a hard-coded 'attributes' literal, so the lookup
    # cannot drift from the key definition.
    expected_code = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        {output_1} = {input_1}
        for col in {attribute}:
            ratio = {input_1}[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                {output_1}.dropna(subset=col, axis='index', inplace=True)
    """.format(input_1=n_in['input data'],
               attribute=params[CleanMissingOperation.ATTRIBUTES_PARAM],
               output_1=n_out['output result']))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
Пример #12
0
def test_clean_missing_missing_attributes_param_fail():
    """Omitting 'attributes' must raise ValueError with the expected message."""
    with pytest.raises(ValueError) as excinfo:
        CleanMissingOperation(
            parameters={'cleaning_mode': 'REMOVE_ROW'},
            named_inputs={'input data': 'df'},
            named_outputs={'output data': 'out'},
        )
    message = str(excinfo.value)
    assert "'attributes' must be informed for task" in message
Пример #13
0
def test_clean_missing_fill_value_missing_value_param_fail():
    """
    VALUE mode without a 'value' parameter must raise ValueError at
    construction time, with the expected message.
    """
    # The original built an iris frame (using the np.NaN alias removed in
    # NumPy 2.0) that was never used: the constructor is the only thing
    # exercised here, so that dead setup has been dropped.
    arguments = {
        'parameters': {
            'attributes': ['sepalwidth'],
            'cleaning_mode': 'VALUE',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }

    with pytest.raises(ValueError) as val_err:
        CleanMissingOperation(**arguments)
    assert "Parameter 'value' must be not None when mode is 'VALUE' for task" \
           in str(val_err.value)
Пример #14
0
def test_clean_missing_max_ratio_is_lower_than_min_ratio_fail():
    """
    A max ratio (0.025) below the min ratio (0.25) must raise ValueError
    with the expected message.
    """
    parameters = {
        'attributes':
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
        'min_missing_ratio': 0.25,
        'max_missing_ratio': 0.025,
        'cleaning_mode': 'REMOVE_COLUMN',
    }
    with pytest.raises(ValueError) as excinfo:
        CleanMissingOperation(
            parameters=parameters,
            named_inputs={'input data': 'df'},
            named_outputs={'output data': 'out'},
        )
    message = str(excinfo.value)
    assert "Parameter 'attributes' must be 0<=x<=1 for task" in message
Пример #15
0
def test_clean_missing_value_param_failure():
    """VALUE cleaning mode with no other parameters must raise ValueError."""
    # NOTE(review): attributes are omitted as well, so the ValueError may
    # come from that check rather than from the missing value -- confirm.
    value_mode_only = {CleanMissingOperation.CLEANING_MODE_PARAM: "VALUE"}
    inputs = {'input data': 'input_1'}
    outputs = {'output data': 'output_1'}
    with pytest.raises(ValueError):
        CleanMissingOperation(value_mode_only,
                              named_inputs=inputs,
                              named_outputs=outputs)