def test_clean_missing_minimal_params_type_value_success():
    """Check the code generated for the 'VALUE' cleaning mode.

    The operation is generated twice from the same parameter dict: once
    with a fill value of 200 and once with 1200. Each time the generated
    source must be AST-equivalent to the expected ``fillna`` snippet.
    """
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
        CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 0.0,
        CleanMissingOperation.MAX_MISSING_RATIO_PARAM: 1.0,
        CleanMissingOperation.VALUE_PARAMETER: 200,
        CleanMissingOperation.CLEANING_MODE_PARAM: 'VALUE',
    }
    inputs = {'input data': 'input_1'}
    outputs = {'output result': 'output_1'}

    def assert_generates(expected):
        # Build a fresh operation so it picks up the current `params`.
        operation = CleanMissingOperation(
            params, named_inputs=inputs, named_outputs=outputs)
        generated = operation.generate_code()
        same, details = compare_ast(ast.parse(generated),
                                    ast.parse(expected))
        assert same, details + format_code_comparison(generated, expected)

    expected_code = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        output_1 = input_1
        for col in ['name']:
            ratio = input_1[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                output_1[col].fillna(value=200, inplace=True)
        """)
    assert_generates(expected_code)

    # Same check with a different numeric fill value.
    params[CleanMissingOperation.VALUE_PARAMETER] = 1200
    assert_generates(expected_code.replace('200', '1200'))
def test_clean_missing_ratio_control_success():
    """Check REMOVE_COLUMN with explicit min/max missing-ratio bounds.

    Needs a better assertion... Ratio method is confusing.

    Two columns are entirely NaN and 'petalwidth' has a single NaN in
    row 0. With bounds 0.025..0.1 the test expects only 'petalwidth' to
    be dropped from the output.
    """
    columns = ['sepallength', 'sepalwidth', 'petalwidth', 'petallength']

    def make_frame():
        # `np.nan` is used instead of the `np.NaN` alias, which was
        # removed in NumPy 2.0.
        frame = util.iris(columns, size=10)
        frame.loc[:, ['sepallength', 'sepalwidth']] = np.nan
        frame.loc[0, 'petalwidth'] = np.nan
        return frame

    # Two independent frames: one is handed to the generated code, the
    # other is kept untouched to build the expected result.
    df = make_frame()
    test = make_frame()

    arguments = {
        'parameters': {
            'attributes': columns,
            'min_missing_ratio': 0.025,
            'max_missing_ratio': 0.1,
            'cleaning_mode': 'REMOVE_COLUMN',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    assert result['output_data_1'].equals(test.drop(columns=['petalwidth']))
def test_clean_missing_multiple_attributes_success():
    """Check REMOVE_ROW when several attributes contain missing values.

    Each of the four columns gets NaN in a distinct pair of rows
    (0-1, 2-3, 4-5, 6-7). The output is expected to equal the original
    frame with rows 0 through 7 dropped.
    """
    columns = ['sepallength', 'sepalwidth', 'petalwidth', 'petallength']
    df = util.iris(columns, size=10)
    # `np.nan` is used instead of the `np.NaN` alias, which was removed
    # in NumPy 2.0. Label slices with `.loc` include both endpoints.
    df.loc[0:1, 'sepallength'] = np.nan
    df.loc[2:3, 'sepalwidth'] = np.nan
    df.loc[4:5, 'petalwidth'] = np.nan
    df.loc[6:7, 'petallength'] = np.nan

    arguments = {
        'parameters': {
            'attributes': columns,
            'cleaning_mode': 'REMOVE_ROW',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})

    expected = util.iris(columns, size=10).drop(index=list(range(8)))
    assert result['output_data_1'].equals(expected)
def test_clean_missing_missing_input_implies_no_code_success():
    """Without any named input, ``generate_code()`` must return None."""
    operation = CleanMissingOperation(
        parameters={
            'attributes': ['sepalwidth'],
            'cleaning_mode': 'VALUE',
            'value': 'replaced',
        },
        named_inputs={},
        named_outputs={
            'output data': 'out',
        },
    )
    generated = operation.generate_code()
    assert generated is None
def test_clean_missing_success():
    """Run the operation with empty parameters on a 10-row iris slice.

    The frame bound to 'out' after execution is expected to equal
    ``util.iris(size=slice_size)``.
    """
    slice_size = 10
    frame_name = 'df'
    frame = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
        slice_size)

    operation = CleanMissingOperation(
        parameters={},
        named_inputs={
            'input data': frame_name,
        },
        named_outputs={
            'output data': 'out',
        },
    )
    # Expose the frame to the generated code under the input's name.
    namespace = {frame_name: frame}
    result = util.execute(operation.generate_code(), namespace)
    assert result['out'].equals(util.iris(size=slice_size))
def test_clean_missing_fill_mean_success():
    """Check that MEAN mode fills a missing value with the column mean.

    Row 0 of 'sepalwidth' is set to NaN. After execution that cell is
    expected to equal the mean of the column as it was before cleaning.
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    # `np.nan` is used instead of the `np.NaN` alias, which was removed
    # in NumPy 2.0.
    df.loc[0, 'sepalwidth'] = np.nan
    # Snapshot the column from a copy so the expected mean is unaffected
    # by anything the generated code does to `df`.
    sepal_mean = df.copy().loc[:, 'sepalwidth']

    arguments = {
        'parameters': {
            'attributes': ['sepalwidth'],
            'cleaning_mode': 'MEAN',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    assert result['output_data_1'].loc[0, 'sepalwidth'] == sepal_mean.mean()
def test_clean_missing_missing_cleaning_mode_param_success():
    """
    Defaults to REMOVE_ROW

    NOTE(review): the result is not asserted on; this only checks that
    code generation and execution complete without raising.
    """
    columns = ['sepallength', 'sepalwidth', 'petalwidth', 'petallength']
    frame = util.iris(columns, size=10)

    operation = CleanMissingOperation(
        parameters={
            'attributes': columns,
        },
        named_inputs={
            'input data': 'df',
        },
        named_outputs={
            'output data': 'out',
        },
    )
    generated = operation.generate_code()
    util.execute(generated, {'df': frame})
def test_clean_missing_remove_column_success():
    """Check that REMOVE_COLUMN drops a column containing a missing value.

    Row 4 of 'sepalwidth' is set to NaN. The output is expected to equal
    the original frame without the 'sepalwidth' column.
    """
    columns = ['sepallength', 'sepalwidth', 'petalwidth', 'petallength']
    df = util.iris(columns, size=10)
    # `np.nan` is used instead of the `np.NaN` alias, which was removed
    # in NumPy 2.0.
    df.loc[4, 'sepalwidth'] = np.nan

    arguments = {
        'parameters': {
            'attributes': ['sepalwidth'],
            'cleaning_mode': 'REMOVE_COLUMN',
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})

    expected = util.iris(columns, 10).drop(columns=['sepalwidth'])
    assert result['output_data_1'].equals(expected)
def test_clean_missing_minimal_params_success():
    """Check the generated code when no cleaning mode is supplied.

    With only the attributes and the min/max missing ratios given, the
    generated source must be AST-equivalent to a ``dropna`` over rows
    for each listed column.
    """
    inputs = {'input data': 'input_1'}
    outputs = {'output result': 'output_1'}
    operation = CleanMissingOperation(
        {
            CleanMissingOperation.ATTRIBUTES_PARAM: ['col1', 'col2'],
            CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 0.0,
            CleanMissingOperation.MAX_MISSING_RATIO_PARAM: 1.0,
        },
        named_inputs=inputs,
        named_outputs=outputs,
    )
    generated = operation.generate_code()

    expected = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        output_1 = input_1
        for col in ['col1', 'col2']:
            ratio = input_1[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                output_1.dropna(subset=col, axis='index', inplace=True)
        """)

    same, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert same, details + format_code_comparison(generated, expected)
def test_clean_missing_without_missing_rating_params_success():
    """Check the generated code when the missing-ratio bounds are omitted.

    Only the attribute list is supplied. The expected source uses 0.0
    and 1.0 as the min/max missing ratios and drops rows via ``dropna``.
    """
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
    }
    inputs = {'input data': 'input_1'}
    outputs = {'output result': 'output_1'}

    operation = CleanMissingOperation(
        params, named_inputs=inputs, named_outputs=outputs)
    generated = operation.generate_code()

    # The expected source is built from the same names given to the
    # operation, so the template holds placeholders instead of literals.
    template = """
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        {output_1} = {input_1}
        for col in {attribute}:
            ratio = {input_1}[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                {output_1}.dropna(subset=col, axis='index', inplace=True)
        """
    expected = dedent(template.format(
        input_1=inputs['input data'],
        attribute=params['attributes'],
        output_1=outputs['output result'],
    ))

    same, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert same, details + format_code_comparison(generated, expected)