def test_standardScale_1_attr_all_nan():
    """Standard-scale a single attribute whose values are all NaN.

    Only attribute 1 ('col2' once 'id1'/'id2' form the index) is selected;
    'col1' and 'ww' are expected to come back unchanged and the shape must
    be preserved.
    """
    raw = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1.0, -1.1, 3.0, 7.5, 10.0],
        'col2': [np.nan, np.nan, np.nan, np.nan, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    frame = data.Frame(raw).setIndex(['id1', 'id2'])

    scaler = StandardScaler()
    # The output shape is unknown until both options and input shape are set
    assert scaler.getOutputShape() is None
    scaler.setOptions(attributes={1: None})
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    assert scaler.getOutputShape() == expected_shape
    scaler.removeInputShape(0)
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)

    scaled = scaler.execute(frame)

    # Statistics of an all-NaN column are NaN, so every expected entry is NaN
    col2_mean = np.nanmean(raw['col2'])
    col2_std = np.nanstd(raw['col2'])
    expected = {
        'col1': raw['col1'],
        'col2': [((value - col2_mean) / col2_std) for value in raw['col2']],
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(scaled.to_dict(), 4)) == nan_to_None(roundValues(expected, 4))
    assert scaled.shape == expected_shape
def test_ordinal_to_nominal_cat():
    """Convert an ordered categorical column into an unordered one.

    Also checks that omitting the 'ordered' option yields the same values
    as passing ``ordered=False`` explicitly.
    """
    raw = {
        'col1': pd.Categorical(["5", "0", "5", "U", "0"], ordered=True),
        'col2': [3, 4, 5.1, 6, 0]
    }
    frame = Frame(raw)
    converter = ToCategorical()
    converter.addInputShape(frame.shape, pos=0)
    converter.setOptions(attributes={0: {'cat': '5 0 1 2', 'ordered': False}})

    # Predict output shape
    expected_columns = frame.shape.columnsDict
    expected_columns['col1'] = Types.Nominal
    assert converter.getOutputShape().columnsDict == expected_columns

    nominal = converter.execute(frame)
    expected_values = {
        'col1': ["5", "0", "5", None, "0"],
        'col2': [3.0, 4.0, 5.1, 6.0, 0.0]
    }
    assert nan_to_None(nominal.to_dict()) == expected_values
    assert nominal.shape.columnsDict == expected_columns

    col1_dtype = nominal.getRawFrame()['col1'].dtype
    assert list(col1_dtype.categories) == ['5', '0', '1', '2']
    assert col1_dtype.ordered is False

    # Same categories without an explicit 'ordered' entry
    converter.setOptions(attributes={0: {'cat': '5 0 1 2'}})
    implicit = converter.execute(frame)
    assert nan_to_None(nominal.to_dict()) == nan_to_None(implicit.to_dict())
def test_minMaxScale():
    """Min-max scale two attributes to distinct target ranges.

    Attribute 0 ('col1') is scaled to (-1, 1) and attribute 1 ('col2',
    which contains NaN) to (2, 4); 'ww' is left untouched. Also checks
    shape prediction before/after adding the input shape and that the
    options round-trip through ``getOptions``.
    """
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = MinMaxScaler()
    assert op.getOutputShape() is None
    op.setOptions(attributes={0: {'range': (-1, 1)}, 1: {'range': (2, 4)}})
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    g = op.execute(f)

    def scaled(values, low, high):
        # Builtin min()/max() give order-dependent results when NaN is
        # present (a leading NaN would "win"), so use the NaN-aware
        # reductions. float() keeps the expected values plain Python floats.
        v_min = float(np.nanmin(values))
        v_max = float(np.nanmax(values))
        return [((x - v_min) / (v_max - v_min)) * (high - low) + low for x in values]

    expected = {
        'col1': scaled(d['col1'], -1, 1),
        'col2': scaled(d['col2'], 2, 4),
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(g.to_dict(), 4)) == nan_to_None(roundValues(expected, 4))
    assert g.shape == s
    assert not numpy_equal(g.getRawFrame().values, f.getRawFrame().values)

    options = op.getOptions()
    assert options == {
        'attributes': {
            0: {
                'range': (-1, 1)
            },
            1: {
                'range': (2, 4)
            }
        }
    }
def test_merge_index_val():
    """Replace groups of categories in a nominal column with a value or NaN."""
    raw = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series([
            '05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'
        ], dtype='datetime64[ns]')
    }
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replacer.setOptions(
        table={1: {'values': '3 4; 6 0', 'replace': 'h; nan'}},
        inverted=False)

    input_shape = frame.shape.clone()
    predicted = replacer.getOutputShape()
    # The column stays nominal both before and after the replacement
    assert frame.shape.colTypes[1] == Types.Nominal
    assert Types.Nominal == predicted.colTypes[1]
    assert predicted == input_shape

    replaced = replacer.execute(frame)
    assert replaced.shape == frame.shape
    col2_only = data.Frame(replaced.getRawFrame()['col2'])
    assert nan_to_None(col2_only.to_dict()) == \
        {'col2': ["h", "h", "5", None, None]}
def test_merge_from_nan():
    """Replace NaN (spelled 'Nan') and numeric values in a numeric column."""
    raw = {
        'cowq': [1, 2, None, 4.0, None],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x']
    }
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replacer.setOptions(
        table={0: {'values': 'Nan 2.0;4.0', 'replace': '-1;-2'}},
        inverted=False)

    input_shape = frame.shape.clone()
    assert frame.shape.colTypes[1] == Types.Nominal
    assert replacer.getOutputShape() == input_shape

    replaced = replacer.execute(frame)
    assert replaced.shape == frame.shape
    expected = {
        'cowq': [1.0, -1.0, -1.0, -2.0, -1.0],
        'col2': ["3", "4", "5", "6", "0"],
        'col3': ['q', '2', 'c', '4', 'x']
    }
    assert nan_to_None(replaced.to_dict()) == expected
def test_merge_nan():
    """Replace values with NaN, with 'nan' written in several letter cases."""
    raw = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x']
    }
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replacement_table = {
        1: {
            'values': 'hello 2 6 0; 3',
            'replace': 'NAN; nan'
        },
        0: {
            'values': '2 4 10',
            'replace': 'naN'
        }
    }
    replacer.setOptions(table=replacement_table, inverted=False)

    input_shape = frame.shape.clone()
    assert frame.shape.colTypes[1] == Types.Nominal
    assert replacer.getOutputShape() == input_shape

    replaced = replacer.execute(frame)
    assert replaced.shape == frame.shape
    expected = {
        'cowq': [1, None, 3, None, None],
        'col2': [None, "4", "5", None, None],
        'col3': ['q', '2', 'c', '4', 'x']
    }
    assert nan_to_None(replaced.to_dict()) == expected
def test_merge_category_inverted():
    """Run ReplaceValues on a nominal column with ``inverted=True``."""
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    }
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replacer.setOptions(
        table={1: {'values': '4 0; 0', 'replace': 'val; NAN'}},
        inverted=True)

    input_shape = frame.shape.clone()
    assert replacer.getOutputShape() == input_shape
    assert input_shape.colTypes[1] == Types.Nominal

    replaced = replacer.execute(frame)
    expected = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [None, None, None, None, "0"],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    }
    assert nan_to_None(replaced.to_dict()) == expected
    # The values differ from the input while the shape is preserved
    assert replaced != frame
    assert replaced.shape == input_shape
def test_fillnan_ffill():
    """Forward-fill missing values in string, ordinal and datetime columns."""
    raw = {'col1': [np.nan, 2, np.nan, 4, 10],
           'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
           'col3': ['q', '2', 'c', np.nan, np.nan],
           'date': pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                             dtype='datetime64[ns]')}
    frame = data.Frame(raw).setIndex('col1')

    filler = FillNan()
    assert filler.getOutputShape() is None
    filler.addInputShape(frame.shape, 0)
    filler.setOptions(selected={0: None, 1: None, 2: None}, fillMode='ffill')
    assert filler.getOptions() == {
        'selected': {0: None, 1: None, 2: None},
        'fillMode': 'ffill'
    }

    expected_shape = Shape()
    expected_shape.colNames = ['col3', 'col2', 'date']
    expected_shape.colTypes = [Types.String, Types.Ordinal, Types.Datetime]
    expected_shape.index = ['col1']
    expected_shape.indexTypes = [IndexType(Types.Numeric)]
    assert filler.getOutputShape() == expected_shape

    filled = filler.execute(frame)
    assert filled.shape == expected_shape

    # The two missing dates follow the first row, so both take its date
    expected_dates = [
        '1988-05-09' if pd.isna(stamp) else stamp.strftime(format='%Y-%m-%d')
        for stamp in raw['date']
    ]
    assert mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3)) == {
        'col3': ['q', '2', 'c', 'c', 'c'],
        'col2': ['3', '4', '4', '4', '0'],
        'date': expected_dates
    }
def test_discretize_by_date_with_None():
    """Discretize two datetime columns containing None, appending a suffix.

    The input already has a 'cold_disc' column, which the suffixed output
    column of 'cold' is expected to replace.
    """
    raw = {'col2': [3, 4, 5.1, 6, 0],
           'col3': ['123', '2', '0.43', '4', '2021 January'],
           'cold': [pd.Timestamp('05-09-1988'), pd.Timestamp('22-12-1994'),
                    pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2012')],
           'cold2': [pd.Timestamp('01-01-1950'), pd.Timestamp('22-12-1980'),
                     pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2034')],
           'cold_disc': [None, None, None, None, None]  # test to see if it is removed
           }
    frame = data.Frame(raw).setIndex('col2')

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None

    column_types = frame.shape.columnsDict
    assert column_types['cold'] == Types.Datetime
    assert column_types['cold2'] == Types.Datetime

    intervals = [pd.Timestamp('01-01-1950'), pd.Timestamp('01-01-1970'),
                 pd.Timestamp('01-01-1990'), pd.Timestamp('01-01-2010'),
                 pd.Timestamp('01-01-2030')]
    discretizer.setOptions(
        selected={
            1: {'ranges': (intervals, True, False), 'labels': ['50', '70', '80', 'now']},
            2: {'ranges': (intervals, True, True), 'labels': ['50', '70', '80', 'now']}},
        suffix=(True, '_disc'))
    assert discretizer.getOptions() == {
        'selected': {
            1: {'ranges': (intervals, True, False), 'labels': ['50', '70', '80', 'now']},
            2: {'ranges': (intervals, True, True), 'labels': ['50', '70', '80', 'now']}},
        'suffix': (True, '_disc')
    }

    column_types['cold_disc'] = Types.Ordinal
    column_types['cold2_disc'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(column_types, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape

    output = nan_to_None(result.to_dict())
    assert output == {'col3': ['123', '2', '0.43', '4', '2021 January'],
                      'cold': [pd.Timestamp('05-09-1988'), pd.Timestamp('22-12-1994'),
                               pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2012')],
                      'cold2': [pd.Timestamp('01-01-1950'), pd.Timestamp('22-12-1980'),
                                pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2034')],
                      'cold_disc': ['70', '80', '80', None, 'now'],
                      'cold2_disc': [None, '70', '80', None, None]
                      }

    # Both discretized columns must be ordered categoricals over all labels
    assert result.getRawFrame()['cold_disc'].cat.categories.to_list() == ['50', '70', '80', 'now']
    assert result.getRawFrame()['cold_disc'].dtype.ordered is True
    assert result.getRawFrame()['cold2_disc'].cat.categories.to_list() == ['50', '70', '80', 'now']
    assert result.getRawFrame()['cold2_disc'].dtype.ordered is True
def test_remove_bijections():
    """RemoveBijections drops 'col11', which duplicates 'col3'.

    Also verifies that the operation stores its own copy of the options
    and that its output shape stays unknown.
    """
    remover = RemoveBijections()
    raw = {
        'col1': [1.0, 2.0, 3.0, np.nan, 10.0],
        'col2': [3.0, 4.0, np.nan, 6.0, np.nan],
        'col3': ['q', '2', 'c', '4', 'x'],
        'col11': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    }
    frame = data.Frame(raw)
    remover.addInputShape(frame.shape, 0)
    assert remover.getOutputShape() is None
    assert remover.getOptions() == {'attributes': dict()}

    selection = {1: None, 0: None, 2: None, 3: None}
    remover.setOptions(attributes=selection)
    assert remover.getOutputShape() is None
    assert remover.needsInputShapeKnown() is True
    assert remover.isOutputShapeKnown() is False

    stored_options = {
        'attributes': {
            1: None,
            0: None,
            3: None,
            2: None
        }
    }
    assert remover.getOptions() == stored_options
    # Mutating the caller's dict must not leak into the operation
    selection[1] = 'ss'
    assert remover.getOptions() == {
        'attributes': {
            1: None,
            0: None,
            3: None,
            2: None
        }
    }

    result = remover.execute(frame)
    expected = copy.deepcopy(raw)
    del expected['col11']
    assert nan_to_None(expected) == nan_to_None(result.to_dict())
def test_discretize_by_date_and_time():
    """Discretize timestamps carrying date and time, replacing the columns.

    With the suffix disabled the discretized columns overwrite 'cold' and
    'cold2', and the stored suffix option is reported as ``(False, None)``.
    """
    raw = {'col2': [3, 4, 5.1, 6, 0],
           'col3': ['123', '2', '0.43', '4', '2021 January'],
           'cold': [pd.Timestamp('05-09-1988 13:45'), pd.Timestamp('22-12-1994 14:21'),
                    pd.Timestamp('21-11-1995 11:50'), None, pd.Timestamp('12-12-2012 09:15')],
           'cold2': [pd.Timestamp('05-09-1988 13:45'), pd.Timestamp('22-12-1994 14:21'),
                     pd.Timestamp('21-11-1995 11:50'), None, pd.Timestamp('12-12-2012 09:15')]
           }
    frame = data.Frame(raw).setIndex(['col2', 'col3'])

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None

    column_types = frame.shape.columnsDict
    assert column_types['cold'] == Types.Datetime

    intervals = [pd.Timestamp('05-09-1988 07:00'), pd.Timestamp('20-12-1994 11:30'),
                 pd.Timestamp('05-09-2000 14:20'), pd.Timestamp('01-09-2010 14:30'),
                 pd.Timestamp('12-12-2012 09:14')]
    labels = ['early mo', 'middle', 'late', 'now']
    discretizer.setOptions(
        selected={
            0: {'ranges': (intervals, True, True), 'labels': labels},
            1: {'ranges': (intervals, True, True), 'labels': labels}},
        suffix=(False, '_disc'))
    assert discretizer.getOptions() == {
        'selected': {
            0: {'ranges': (intervals, True, True), 'labels': labels},
            1: {'ranges': (intervals, True, True), 'labels': labels}},
        'suffix': (False, None)
    }

    column_types['cold'] = Types.Ordinal
    column_types['cold2'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(column_types, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape
    assert result.shape != frame.shape

    output = nan_to_None(result.to_dict())
    assert output == {
        'cold': ['early mo', 'middle', 'middle', None, None],
        'cold2': ['early mo', 'middle', 'middle', None, None]}

    assert result.getRawFrame()['cold'].cat.categories.to_list() == ['early mo', 'middle', 'late', 'now']
    assert result.getRawFrame()['cold'].dtype.ordered is True
    assert result.getRawFrame()['cold2'].cat.categories.to_list() == ['early mo', 'middle', 'late', 'now']
    assert result.getRawFrame()['cold2'].dtype.ordered is True
def test_standardScale():
    """Standard-scale two numeric attributes, one of which contains NaN.

    Expected values are built with NaN-aware mean and standard deviation;
    the 'ww' column is not selected and must stay as it was.
    """
    raw = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    frame = data.Frame(raw).setIndex(['id1', 'id2'])

    scaler = StandardScaler()
    assert scaler.getOutputShape() is None
    scaler.setOptions(attributes={0: None, 1: None})
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    assert scaler.getOutputShape() == expected_shape
    scaler.removeInputShape(0)
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)

    scaled = scaler.execute(frame)

    col1_mean, col1_std = np.nanmean(raw['col1']), np.nanstd(raw['col1'])
    col2_mean, col2_std = np.nanmean(raw['col2']), np.nanstd(raw['col2'])
    expected = {
        'col1': [((value - col1_mean) / col1_std) for value in raw['col1']],
        'col2': [((value - col2_mean) / col2_std) for value in raw['col2']],
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(scaled.to_dict(), 4)) == nan_to_None(roundValues(expected, 4))
    assert scaled.shape == expected_shape
    # Scaling must actually have changed the underlying values
    assert not numpy_equal(scaled.getRawFrame().values, frame.getRawFrame().values)

    stored = scaler.getOptions()
    assert stored == {'attributes': {0: None, 1: None}}
def test_discretize_num_uniform():
    """Uniform-bin two numeric columns in place (no suffix).

    Also checks the default options and that later changes to the caller's
    option objects do not affect the stored options.
    """
    raw = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, 1, 'ww', '1', '1']
    }
    frame = data.Frame(raw)

    discretizer = BinsDiscretizer()
    assert discretizer.getOutputShape() is None
    assert discretizer.getOptions() == {
        'attributes': {},
        'strategy': BinStrategy.Uniform,
        'suffix': (True, '_discretized')
    }

    bin_options = {0: {'bins': '2'}, 1: {'bins': '3'}}
    strategy = BinStrategy.Uniform
    discretizer.setOptions(attributes=bin_options, strategy=strategy, suffix=(False, None))

    # Check for side effects
    bin_options[0]['bins'] = '23'
    strategy = '11'
    assert discretizer.getOptions() == {
        'attributes': {
            0: {
                'bins': '2'
            },
            1: {
                'bins': '3'
            }
        },
        'strategy': BinStrategy.Uniform,
        'suffix': (False, None)
    }

    discretizer.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    expected_shape.colTypes[0] = Types.Ordinal
    expected_shape.colTypes[1] = Types.Ordinal
    assert discretizer.getOutputShape() == expected_shape

    binned = discretizer.execute(frame)
    assert nan_to_None(binned.to_dict()) == {
        'col1': ['0.0', '0.0', '0.0', '1.0', '1.0'],
        'col2': ['0.0', '1.0', None, '2.0', None],
        'ww': [3, 1, 'ww', '1', '1']
    }
    assert binned.shape == expected_shape
def test_discretize_range():
    """Discretize two numeric columns by explicit bin edges and labels.

    The label string for column 0 contains a quoted label with spaces and
    an apostrophe; the columns are replaced in place (no suffix).
    """
    raw = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, 1, 'ww', '1', '1']
    }
    frame = data.Frame(raw)

    discretizer = RangeDiscretizer()
    assert discretizer.getOutputShape() is None
    assert discretizer.getOptions() == {'table': {}, 'suffix': (True, '_bins')}

    discretizer.setOptions(
        table={
            0: {
                'bins': [0, 2, 4, 6, 8, 10],
                'labels': '"a u with\'" b c d e'
            },
            1: {
                'bins': [0, 2, 4, 7],
                'labels': 'A B C'
            }
        },
        suffix=(False, None))
    assert discretizer.getOptions() == {
        'table': {
            0: {
                'bins': [0, 2, 4, 6, 8, 10],
                'labels': '"a u with\'" b c d e'
            },
            1: {
                'bins': [0, 2, 4, 7],
                'labels': 'A B C'
            }
        },
        'suffix': (False, None)
    }

    discretizer.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    expected_shape.colTypes[0] = Types.Ordinal
    expected_shape.colTypes[1] = Types.Ordinal
    assert discretizer.getOutputShape() == expected_shape

    binned = discretizer.execute(frame)
    assert nan_to_None(binned.to_dict()) == {
        'col1': ['a u with\'', None, 'b', 'd', 'e'],
        'col2': ['B', 'B', None, 'C', None],
        'ww': [3, 1, 'ww', '1', '1']
    }
    assert binned.shape == expected_shape
def test_discretize_range_suffix():
    """Range-discretize with a suffix that collides with an existing column.

    The generated 'col2_binss' column must replace the one already in the
    frame; the result is then compared against an in-place run.
    """
    raw = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1'],
        'col2_binss': [3, 1, 'ww', '1', '1']
    }
    frame = data.Frame(raw)

    discretizer = RangeDiscretizer()
    table_options = {1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}}
    discretizer.setOptions(table=table_options, suffix=(True, '_binss'))
    # Changing the caller's dict afterwards must not alter stored options
    table_options[1] = {}
    assert discretizer.getOptions() == {
        'table': {
            1: {
                'bins': [0, 2, 4, 7],
                'labels': 'A B C'
            }
        },
        'suffix': (True, '_binss')
    }

    discretizer.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    expected_shape.colTypes[1] = Types.Numeric
    expected_shape.colTypes[3] = Types.Ordinal
    assert discretizer.getOutputShape() == expected_shape

    suffixed = discretizer.execute(frame)
    expected_output = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1'],
        'col2_binss': ['B', 'B', None, 'C', None]  # This column must replace the original duplicate
    }
    assert nan_to_None(suffixed.to_dict()) == expected_output
    assert suffixed.shape == expected_shape

    # Check that output is the same as with drop
    discretizer.setOptions(
        table={1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}},
        suffix=(False, None))
    in_place = nan_to_None(discretizer.execute(frame).to_dict())
    assert expected_output['col2_binss'] == in_place['col2']
    assert expected_output['col2'] != in_place['col2']
    assert expected_output['ww'] == in_place['ww']
    assert expected_output['col1'] == in_place['col1']
    assert expected_output['col2_binss'] != in_place['col2_binss']
def test_fillnan_byVal_date_num():
    """Fill NaN with explicit values in a datetime and a numeric column.

    Options carrying fill values that the test marks as wrong are expected
    to raise OptionValidationError.
    """
    raw = {'col1': [np.nan, 2, np.nan, 4, 10],
           'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
           'col3': ['q', '2', 'c', np.nan, np.nan],
           'date': pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                             dtype='datetime64[ns]'),
           'col4': [np.nan, 2, np.nan, 4, 10]}
    frame = data.Frame(raw).setIndex('col1')

    filler = FillNan()
    assert filler.getOutputShape() is None
    filler.addInputShape(frame.shape, 0)

    with pytest.raises(OptionValidationError):
        filler.setOptions(
            selected={0: {'fill': 'pol'},
                      1: {'fill': '23'},  # wrong
                      2: {'fill': '1966-04-02 00:00:30'},
                      3: {'fill': 'march'}},  # wrong
            fillMode='value')

    filler.setOptions(
        selected={2: {'fill': '1966-04-02 00:00:30'}, 3: {'fill': '0.9'}},
        fillMode='value')
    assert filler.getOptions() == {
        'selected': {2: {'fill': '1966-04-02 00:00:30'}, 3: {'fill': '0.9'}},
        'fillMode': 'value'
    }

    expected_shape = Shape()
    expected_shape.colNames = ['col3', 'col2', 'date', 'col4']
    expected_shape.colTypes = [Types.String, Types.Ordinal, Types.Datetime, Types.Numeric]
    expected_shape.index = ['col1']
    expected_shape.indexTypes = [IndexType(Types.Numeric)]
    assert filler.getOutputShape() == expected_shape

    filled = filler.execute(frame)
    assert filled.shape == expected_shape

    expected_dates = [
        '1966-04-02' if pd.isna(stamp) else stamp.strftime(format='%Y-%m-%d')
        for stamp in raw['date']
    ]
    assert mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3)) == {
        'col3': ['q', '2', 'c', None, None],
        'col2': ['3', '4', None, None, '0'],
        'date': expected_dates,
        'col4': [0.9, 2.0, 0.9, 4.0, 10.0]
    }
def test_discretize_by_date():
    """Discretize a string column of dates in place, yielding an ordinal column."""
    raw = {'col2': [3, 4, 5.1, 6, 0],
           'col3': ['123', '2', '0.43', '4', '2021 January'],
           'cold': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
           }
    frame = data.Frame(raw)

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None

    column_types = frame.shape.columnsDict
    assert column_types['cold'] is Types.String

    intervals = [pd.Timestamp('01-01-1950'), pd.Timestamp('01-01-1970'),
                 pd.Timestamp('30-12-1994'), pd.Timestamp('01-01-2010')]
    discretizer.setOptions(
        selected={2: {'ranges': (intervals, True, False), 'labels': ['50', '70', 'now']}},
        suffix=(False, None))
    assert discretizer.getOptions() == {
        'selected': {2: {'ranges': (intervals, True, False), 'labels': ['50', '70', 'now']}},
        'suffix': (False, None)
    }

    column_types['cold'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(column_types, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape

    output = nan_to_None(result.to_dict())
    assert output == {'col2': [3, 4, 5.1, 6, 0],
                      'col3': ['123', '2', '0.43', '4', '2021 January'],
                      'cold': ['70', '70', 'now', '70', None]
                      }
    assert result.getRawFrame().iloc[:, 2].cat.categories.to_list() == ['50', '70', 'now']
    assert result.getRawFrame().iloc[:, 2].dtype.ordered is True
def test_drop_columns():
    """Drop two selected columns and keep the rest of the frame intact.

    Also checks that ``getOptions`` returns a copy (mutating it does not
    change the operation) and that an empty selection is rejected with
    OptionValidationError without discarding the previously set options.
    """
    e = {'col1': [np.nan, 2, np.nan, 4, 10],
         'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
         'col3': ['q', '2', 'c', np.nan, np.nan],
         'date': pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                           dtype='datetime64[ns]')}
    g = data.Frame(e)
    g = g.setIndex('col1')

    op = DropColumns()
    assert op.getOutputShape() is None
    op.addInputShape(g.shape, 0)
    assert op.getOutputShape() is None
    assert op.getOptions() == {
        'selected': dict()
    }

    selOpts = {0: None, 2: None}
    op.setOptions(selected={0: None, 2: None})
    opts = op.getOptions()
    assert opts['selected'] == selOpts
    # Mutating the returned dict must not affect the operation's state
    opts['selected'] = {}
    assert op.getOptions()['selected'] == selOpts
    assert op.getOptions() != opts

    # The exception info is not inspected, so it is deliberately not bound:
    # binding it to ``e`` would shadow the fixture dict defined above.
    with pytest.raises(exc.OptionValidationError):
        op.setOptions(selected={})

    s = data.Shape()
    s.colNames = ['col3']
    s.colTypes = [Types.String]
    s.index = ['col1']
    s.indexTypes = [IndexType(Types.Numeric)]
    assert op.getOutputShape() == s

    h = op.execute(g)
    assert h.shape == s
    assert nan_to_None(h.to_dict()) == {
        'col3': ['q', '2', 'c', None, None]
    }
def test_merge_string():
    """Replace values in a string column; the numeric column is untouched."""
    raw = {'cowq': [1, 2, None, 4.0, None], 'col3': ['q', '2', 'c', '4', 'q']}
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replacer.setOptions(
        table={1: {'values': 'q 2; nAn', 'replace': '-1;-2'}},
        inverted=False)

    input_shape = frame.shape.clone()
    assert replacer.getOutputShape() == input_shape

    replaced = replacer.execute(frame)
    assert replaced.shape == frame.shape
    expected = {
        'cowq': [1.0, 2.0, None, 4.0, None],
        'col3': ["-1", "-1", "c", "4", "-1"]
    }
    assert nan_to_None(replaced.to_dict()) == expected
def test_str_toNumeric_coerce():
    """Convert a categorical and a string column to numeric with errors='coerce'."""
    raw = {
        'col1': pd.Categorical(['3', np.nan, 5, 6, 0]),
        'col2': [3, 4, 5, 6, 0],
        'col3': [np.nan, '2', '0.43', '4', np.nan]
    }
    # 'cold': pd.Series(['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    #                   dtype='datetime64[ns]')}
    frame = Frame(raw)

    converter = ToNumeric()
    converter.addInputShape(frame.shape, pos=0)
    converter.setOptions(attributes={0: dict(), 2: dict()}, errors='coerce')

    # Predict output shape
    expected_columns = frame.shape.columnsDict
    expected_columns['col1'] = Types.Numeric
    expected_columns['col3'] = Types.Numeric
    assert converter.getOutputShape().columnsDict == expected_columns

    # Removing options/input_shape causes None to be returned
    converter.removeInputShape(0)
    assert converter.getOutputShape() is None
    converter.addInputShape(frame.shape, pos=0)
    converter.unsetOptions()
    assert converter.getOutputShape() is None

    # Re-adding everything
    converter.setOptions(attributes={0: dict(), 2: dict()}, errors='coerce')
    assert converter.getOutputShape().columnsDict == expected_columns

    numeric = converter.execute(frame)
    expected_values = {
        'col1': [3., None, 5., 6., 0.],
        'col2': [3., 4., 5., 6., 0.],
        'col3': [None, 2.0, 0.43, 4.0, None]
    }
    assert roundValues(nan_to_None(numeric.to_dict()), 2) == expected_values
    assert numeric.shape.columnsDict == expected_columns
    assert numeric.shape.indexDict == frame.shape.indexDict
def test_str_toCategory():
    """Convert string columns to ordinal (explicit categories) and nominal."""
    raw = {
        'col1': pd.Categorical(["3", "0", "5", "6", "0"]),
        'col2': ["3", "4", "5.1", "6", None],
        'col3': ['123', '2', '0.43', 'nan', '90']
    }
    # 'cold': pd.Series(['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    #                   dtype='datetime64[ns]')}
    frame = Frame(raw)

    converter = ToCategorical()
    converter.addInputShape(frame.shape, pos=0)
    converter.setOptions(attributes={1: {'cat': '4 3 0', 'ordered': True}, 2: dict()})

    # Predict output shape
    expected_columns = frame.shape.columnsDict
    expected_columns['col3'] = Types.Nominal
    expected_columns['col2'] = Types.Ordinal
    assert converter.getOutputShape().columnsDict == expected_columns

    # Removing options/input_shape causes None to be returned
    converter.removeInputShape(0)
    assert converter.getOutputShape() is None
    converter.addInputShape(frame.shape, pos=0)
    converter.unsetOptions()
    assert converter.getOutputShape() is None

    # Re-adding everything
    converter.setOptions(attributes={1: {'cat': '4 3 0', 'ordered': True}, 2: dict()})
    assert converter.getOutputShape().columnsDict == expected_columns

    categorical = converter.execute(frame)
    expected_values = {
        'col1': ["3", "0", "5", "6", "0"],
        'col2': ['3', '4', None, None, None],
        'col3': ['123', '2', '0.43', 'nan', '90']
    }
    assert nan_to_None(categorical.to_dict()) == expected_values
    assert categorical.shape.columnsDict == expected_columns
def test_cat_toCategory():
    """Restrict an existing categorical column to a subset of its categories.

    Values outside the requested categories are expected to become None
    and the column types must not change.
    """
    raw = {
        'col1': pd.Categorical(["5", "0", "5", "U", "0"]),
        'col2': [3, 4, 5.1, 6, 0],
        'col4': [1, 2, 3, 4, 5],  # this will become a float
        'col3': ['123', '2', '0.43', '4', '90']
    }
    frame = Frame(raw)

    converter = ToCategorical()
    converter.addInputShape(frame.shape, pos=0)
    converter.setOptions(attributes={0: {'cat': '5 0'}})

    # Predict output shape
    expected_columns = frame.shape.columnsDict
    assert converter.getOutputShape().columnsDict == expected_columns

    # Removing options/input_shape causes None to be returned
    converter.removeInputShape(0)
    assert converter.getOutputShape() is None
    converter.addInputShape(frame.shape, pos=0)
    converter.unsetOptions()
    assert converter.getOutputShape() is None

    # Re-adding everything
    converter.setOptions(attributes={0: {'cat': '5 0'}})
    assert converter.getOutputShape().columnsDict == expected_columns

    categorical = converter.execute(frame)
    expected_values = {
        'col1': ["5", "0", "5", None, "0"],
        'col2': [3.0, 4.0, 5.1, 6.0, 0.0],
        'col3': ['123', '2', '0.43', '4', '90'],
        'col4': [1.0, 2.0, 3.0, 4.0, 5.0]
    }
    assert nan_to_None(categorical.to_dict()) == expected_values
    assert categorical.shape.columnsDict == expected_columns
def test_execute():
    """Extract two time series ('diab' and 'other') from two indexed frames.

    Two frames sharing the 'id' index are registered in a mock workbench;
    ExtractTimeSeries is configured with four time labels and writes its
    result to 'frameR'. The test checks that the options are copied, that
    the source frames keep their shape and that the result holds one
    record per id and time label.
    """
    d = {
        'id': ['hiab', 'gine', 'hiac', 'hiaa', 'hiad'],
        'diab1': [2.3, 3.4, 10.2, 14.6, 66.3],
        'diab2': [3, 4, np.nan, 6, np.nan],
        'new1': ['cat1', 'cat2', 'ww', '1', '10']
    }
    e = {
        'id': ['hiab', 'hiae', 'hiac', 'hiaa', 'hiad'],
        'diab4': [12, np.nan, 21.2, 13.45, 1.02],
        'diab3': [1.0, -12, 2.3, 4.1, 5.6],
        'newt': [0, 0, 1, 0, 0]
    }
    f = data.Frame(d)
    g = data.Frame(e)
    f = f.setIndex('id')
    g = g.setIndex('id')
    # Snapshot the shapes to verify later that the inputs are not modified
    fShape = f.shape.clone()
    gShape = g.shape.clone()
    w = WorkbenchModelMock()
    w.setDataframeByName('frameF', f)
    w.setDataframeByName('frameG', g)
    assert g.getRawFrame().index.name == 'id'
    assert f.getRawFrame().index.name == 'id'
    op = ExtractTimeSeries(w)
    timeLabels = ['wave1', 'wave2', 'wave3', 'wave4']
    # Judging by the expected output below, each tuple appears to be
    # (frame name, column position, position in timeLabels)
    options = {
        'diab': [('frameG', 0, 3), ('frameF', 1, 1), ('frameF', 0, 0), ('frameG', 1, 2)],
        'other': [('frameF', 2, 0), ('frameF', 1, 1), ('frameG', 2, 2), ('frameF', 0, 3)]
    }
    op.setOptions(series=options, time=timeLabels, outName='frameR')
    # Check for side effects
    options['diab'] = []
    # Name-mangled private attributes are inspected to confirm copies were stored
    assert op._ExtractTimeSeries__timeLabels is not timeLabels
    assert op._ExtractTimeSeries__series is not options
    assert op._ExtractTimeSeries__series == {
        'diab': [('frameG', 0, 3), ('frameF', 1, 1), ('frameF', 0, 0), ('frameG', 1, 2)],
        'other': [('frameF', 2, 0), ('frameF', 1, 1), ('frameG', 2, 2), ('frameF', 0, 3)]
    }
    op.execute()
    # Source frames must be left unchanged
    assert w.getDataframeModelByName('frameF').frame.shape == fShape
    assert w.getDataframeModelByName('frameG').frame.shape == gShape
    r: data.Frame = w.getDataframeModelByName('frameR').frame
    rr = r.getRawFrame()
    # Group the result rows by index level 0 into lists of record dicts
    rr_dict = {
        k: gr.to_dict(orient='records')
        for k, gr in rr.groupby(level=0)
    }
    # Records are sorted by time label so the comparison ignores row order
    assert {
        k: sorted(v, key=lambda rec: rec['time'])
        for k, v in nan_to_None(rr_dict).items()
    } == {
        'hiaa': [{
            'diab': 14.60,
            'other': '1',
            'time': 'wave1'
        }, {
            'diab': 6.000,
            'other': 6.00,
            'time': 'wave2'
        }, {
            'diab': 4.100,
            'other': 0.00,
            'time': 'wave3'
        }, {
            'diab': 13.45,
            'other': 14.6,
            'time': 'wave4'
        }],
        'hiab': [{
            'diab': 2.30,
            'other': 'cat1',
            'time': 'wave1'
        }, {
            'diab': 3.00,
            'other': 3.0000,
            'time': 'wave2'
        }, {
            'diab': 1.00,
            'other': 0.0000,
            'time': 'wave3'
        }, {
            'diab': 12.0,
            'other': 2.3000,
            'time': 'wave4'
        }],
        'hiac': [{
            'diab': 10.2,
            'other': 'ww',
            'time': 'wave1'
        }, {
            'diab': None,
            'other': None,
            'time': 'wave2'
        }, {
            'diab': 2.30,
            'other': 1.00,
            'time': 'wave3'
        }, {
            'diab': 21.2,
            'other': 10.2,
            'time': 'wave4'
        }],
        'hiad': [{
            'diab': 66.3,
            'other': '10',
            'time': 'wave1'
        }, {
            'diab': None,
            'other': None,
            'time': 'wave2'
        }, {
            'diab': 5.60,
            'other': 0.00,
            'time': 'wave3'
        }, {
            'diab': 1.02,
            'other': 66.3,
            'time': 'wave4'
        }],
        'hiae': [{
            'diab': None,
            'other': None,
            'time': 'wave1'
        }, {
            'diab': None,
            'other': None,
            'time': 'wave2'
        }, {
            'diab': -12.,
            'other': 0.00,
            'time': 'wave3'
        }, {
            'diab': None,
            'other': None,
            'time': 'wave4'
        }],
        'gine': [{
            'diab': 3.40,
            'other': 'cat2',
            'time': 'wave1'
        }, {
            'diab': 4.00,
            'other': 4.0000,
            'time': 'wave2'
        }, {
            'diab': None,
            'other': None,
            'time': 'wave3'
        }, {
            'diab': None,
            'other': 3.4000,
            'time': 'wave4'
        }]
    }
    assert r.shape.columnsDict == {
        'diab': Types.Numeric,
        'other': Types.String,
        'time': Types.Ordinal
    }
def test_discretize_num_uniform_nondrop():
    """Uniform-bin two columns into suffixed columns, keeping the originals.

    An existing 'col2_discre' column is expected to be replaced; the
    suffixed result is then compared with an in-place run.
    """
    raw = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, 1, 'ww', '1', '1'],
        'col2_discre': [1, 1, 1, 1, 1]
    }  # col2_discre replaced
    frame = data.Frame(raw)

    discretizer = BinsDiscretizer()
    assert discretizer.getOutputShape() is None
    assert discretizer.getOptions() == {
        'attributes': {},
        'strategy': BinStrategy.Uniform,
        'suffix': (True, '_discretized')
    }

    discretizer.setOptions(
        attributes={
            0: {
                'bins': '2'
            },
            1: {
                'bins': '3'
            }
        },
        strategy=BinStrategy.Uniform,
        suffix=(True, '_discre'))
    assert discretizer.getOptions() == {
        'attributes': {
            0: {
                'bins': '2'
            },
            1: {
                'bins': '3'
            }
        },
        'strategy': BinStrategy.Uniform,
        'suffix': (True, '_discre')
    }

    discretizer.addInputShape(frame.shape, 0)
    input_shape = frame.shape.clone()
    column_types = input_shape.columnsDict
    column_types['col1_discre'] = Types.Ordinal
    column_types['col2_discre'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(column_types, input_shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    suffixed = discretizer.execute(frame)
    expected_output = {
        'col1_discre': ['0.0', '0.0', '0.0', '1.0', '1.0'],
        'col2_discre': ['0.0', '1.0', None, '2.0', None],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1']
    }
    assert nan_to_None(suffixed.to_dict()) == expected_output
    assert suffixed.shape == expected_shape

    # Check that output is the same as with drop
    discretizer.setOptions(
        attributes={
            0: {
                'bins': '2'
            },
            1: {
                'bins': '3'
            }
        },
        strategy=BinStrategy.Uniform,
        suffix=(False, None))
    in_place = nan_to_None(discretizer.execute(frame).to_dict())
    assert expected_output['col1_discre'] == in_place['col1']
    assert expected_output['col2_discre'] == in_place['col2']
    assert expected_output['col1'] != in_place['col1']
    assert expected_output['col2'] != in_place['col2']
def test_discretize_by_time():
    """Discretize time-of-day timestamps into labelled intervals with a suffix.

    Three columns are selected, one of them entirely None; the existing
    'cold_disc' column is expected to be replaced by the generated one.
    """
    raw = {'col2': [3, 4, 5.1, 6, 0],
           'col3': ['123', '2', '0.43', '4', '2021 January'],
           'cold': [pd.Timestamp('10:42'), pd.Timestamp('23:59:07'),
                    pd.Timestamp('07:12'), None, pd.Timestamp('18:13')],
           'cold2': [pd.Timestamp('22:59'), pd.Timestamp('12:00'),
                     pd.Timestamp('16:40:02'), pd.Timestamp('16:40:03'),
                     pd.Timestamp('22:00:02')],
           'nan': [None, None, None, None, None],
           'cold_disc': [None, None, None, None, None]  # test to see if it is removed
           }
    frame = data.Frame(raw).setIndex(['col2', 'col3'])

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None

    column_types = frame.shape.columnsDict
    assert column_types['cold'] == Types.Datetime
    assert column_types['cold2'] == Types.Datetime

    intervals = [pd.Timestamp('00:00'), pd.Timestamp('06:00'), pd.Timestamp('12:00'),
                 pd.Timestamp('16:40:02'), pd.Timestamp('22:00'), pd.Timestamp('23:59')]
    labels = ['night1', 'morning', 'afternoon', 'evening', 'night2']
    # It's necessary to set a default date object, which is normally done by the editor
    intervals = withDefaultDate(intervals)

    discretizer.setOptions(
        selected={
            0: {'ranges': (intervals, False, True), 'labels': labels},
            1: {'ranges': (intervals, False, True), 'labels': labels},
            2: {'ranges': (intervals, False, True), 'labels': labels}},
        suffix=(True, '_disc'))
    assert discretizer.getOptions() == {
        'selected': {
            0: {'ranges': (intervals, False, True), 'labels': labels},
            1: {'ranges': (intervals, False, True), 'labels': labels},
            2: {'ranges': (intervals, False, True), 'labels': labels}},
        'suffix': (True, '_disc')
    }

    column_types['cold_disc'] = Types.Ordinal
    column_types['cold2_disc'] = Types.Ordinal
    column_types['nan_disc'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(column_types, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape

    output = nan_to_None(result.to_dict())
    assert output == {'cold': [pd.Timestamp('10:42'), pd.Timestamp('23:59:07'),
                               pd.Timestamp('07:12'), None, pd.Timestamp('18:13')],
                      'cold2': [pd.Timestamp('22:59'), pd.Timestamp('12:00'),
                                pd.Timestamp('16:40:02'), pd.Timestamp('16:40:03'),
                                pd.Timestamp('22:00:02')],
                      'nan': [None, None, None, None, None],
                      'nan_disc': [None, None, None, None, None],
                      'cold_disc': ['morning', None, 'morning', None, 'evening'],
                      'cold2_disc': ['night2', 'morning', 'afternoon', 'evening', 'night2']
                      }

    # Every generated column is an ordered categorical over the full label set
    assert result.getRawFrame()['cold_disc'].cat.categories.to_list() == labels
    assert result.getRawFrame()['cold_disc'].dtype.ordered is True
    assert result.getRawFrame()['cold2_disc'].cat.categories.to_list() == labels
    assert result.getRawFrame()['cold2_disc'].dtype.ordered is True
    assert result.getRawFrame()['nan_disc'].cat.categories.to_list() == labels
    assert result.getRawFrame()['nan_disc'].dtype.ordered is True