def test_standardScale_1_attr_all_nan():
    """Check StandardScaler on a single selected attribute that is entirely NaN.

    Only option key 1 is selected; the expectation below treats that as
    'col2' (all NaN), leaves 'col1' untouched and expects the output shape
    to equal the input shape.
    """
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1.0, -1.1, 3.0, 7.5, 10.0],
        'col2': [np.nan, np.nan, np.nan, np.nan, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1'],
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = StandardScaler()
    # The output shape is unknown until both options and an input shape are set.
    assert op.getOutputShape() is None
    op.setOptions(attributes={1: None})
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s
    # Dropping the input shape makes the output shape unknown again.
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    g = op.execute(f)
    expected = {
        'col1': d['col1'],
        # (x - nanmean) / nanstd of an all-NaN column is NaN for every row.
        # Spell that out instead of calling np.nanmean / np.nanstd on an
        # all-NaN list per element, which emits RuntimeWarnings.
        'col2': [np.nan] * len(d['col2']),
        'ww': [3, None, 'ww', '1', '1'],
    }
    assert nan_to_None(roundValues(g.to_dict(), 4)) == nan_to_None(roundValues(expected, 4))
    assert g.shape == s
def test_minMaxScale():
    """Check MinMaxScaler with a distinct target range per attribute.

    Option keys 0 and 1 are given the ranges (-1, 1) and (2, 4); the
    expectation below applies them to 'col1' and 'col2' respectively.
    The test also checks shape inference, that the scaled raw values differ
    from the input's, and that the options round-trip through getOptions().
    """
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1'],
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = MinMaxScaler()
    # The output shape is unknown until both options and an input shape are set.
    assert op.getOutputShape() is None
    op.setOptions(attributes={0: {'range': (-1, 1)}, 1: {'range': (2, 4)}})
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s
    # Dropping the input shape makes the output shape unknown again.
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    g = op.execute(f)

    def rescale(values, low, high):
        """Min-max scale ``values`` into [low, high]; NaN entries stay NaN."""
        # Builtin min()/max() depend on where NaN sits in the list (a leading
        # NaN would be returned as the result), so use the NaN-aware reducers.
        smallest = float(np.nanmin(values))
        largest = float(np.nanmax(values))
        return [(x - smallest) / (largest - smallest) * (high - low) + low for x in values]

    expected = {
        'col1': rescale(d['col1'], -1, 1),
        'col2': rescale(d['col2'], 2, 4),
        'ww': [3, None, 'ww', '1', '1'],
    }
    assert nan_to_None(roundValues(g.to_dict(), 4)) == nan_to_None(roundValues(expected, 4))
    assert g.shape == s
    # The raw values of the result must differ from those of the input frame.
    assert not numpy_equal(g.getRawFrame().values, f.getRawFrame().values)

    options = op.getOptions()
    assert options == {
        'attributes': {
            0: {'range': (-1, 1)},
            1: {'range': (2, 4)},
        }
    }
def test_fillnan_ffill():
    """Check FillNan in 'ffill' mode on three selected columns.

    The expected output carries the last non-missing value of each selected
    column forward over the missing entries that follow it.
    """
    raw = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                          dtype='datetime64[ns]'),
    }
    frame = data.Frame(raw)
    frame = frame.setIndex('col1')

    op = FillNan()
    assert op.getOutputShape() is None
    op.addInputShape(frame.shape, 0)
    op.setOptions(selected={0: None, 1: None, 2: None}, fillMode='ffill')
    assert op.getOptions() == {
        'selected': {0: None, 1: None, 2: None},
        'fillMode': 'ffill',
    }

    # Shape the operation is expected to report and produce.
    expected_shape = Shape()
    expected_shape.colNames = ['col3', 'col2', 'date']
    expected_shape.colTypes = [Types.String, Types.Ordinal, Types.Datetime]
    expected_shape.index = ['col1']
    expected_shape.indexTypes = [IndexType(Types.Numeric)]
    assert op.getOutputShape() == expected_shape

    result = op.execute(frame)
    assert result.shape == expected_shape

    actual = mapDate(roundValues(nan_to_None(result.to_dict()), decimals=3))

    # Missing dates only occur right after the first row, so each of them is
    # expected to be filled with that first date.
    expected_dates = []
    for stamp in raw['date']:
        if pd.isna(stamp):
            expected_dates.append('1988-05-09')
        else:
            expected_dates.append(stamp.strftime(format='%Y-%m-%d'))

    assert actual == {
        'col3': ['q', '2', 'c', 'c', 'c'],
        'col2': ['3', '4', '4', '4', '0'],
        'date': expected_dates,
    }
def test_standardScale():
    """Check StandardScaler on two selected attributes.

    The expectation standardises 'col1' and 'col2' with the NaN-aware mean
    and standard deviation (np.nanmean / np.nanstd). The test also checks
    shape inference, that the scaled raw values differ from the input's,
    and that the options round-trip through getOptions().
    """
    source = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1'],
    }
    frame = data.Frame(source)
    frame = frame.setIndex(['id1', 'id2'])

    scaler = StandardScaler()
    # The output shape is unknown until both options and an input shape are set.
    assert scaler.getOutputShape() is None
    scaler.setOptions(attributes={0: None, 1: None})
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    assert scaler.getOutputShape() == expected_shape
    # Dropping the input shape makes the output shape unknown again.
    scaler.removeInputShape(0)
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)

    scaled = scaler.execute(frame)

    def standardised(values):
        """Return (x - nanmean) / nanstd for every entry of ``values``."""
        centre = np.nanmean(values)
        spread = np.nanstd(values)
        return [((x - centre) / spread) for x in values]

    expected = {
        'col1': standardised(source['col1']),
        'col2': standardised(source['col2']),
        'ww': [3, None, 'ww', '1', '1'],
    }
    actual_rounded = nan_to_None(roundValues(scaled.to_dict(), 4))
    expected_rounded = nan_to_None(roundValues(expected, 4))
    assert actual_rounded == expected_rounded
    assert scaled.shape == expected_shape
    # The raw values of the result must differ from those of the input frame.
    assert not numpy_equal(scaled.getRawFrame().values, frame.getRawFrame().values)

    assert scaler.getOptions() == {'attributes': {0: None, 1: None}}
def test_fillnan_byVal_date_num():
    """Check FillNan in 'value' mode on a datetime and a numeric column.

    An options set containing fill values marked as wrong must raise
    OptionValidationError; a valid set must then be accepted, reported back
    by getOptions(), and fill only the two selected columns.
    """
    raw = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                          dtype='datetime64[ns]'),
        'col4': [np.nan, 2, np.nan, 4, 10],
    }
    frame = data.Frame(raw)
    frame = frame.setIndex('col1')

    op = FillNan()
    assert op.getOutputShape() is None
    op.addInputShape(frame.shape, 0)

    rejected_selection = {
        0: {'fill': 'pol'},
        1: {'fill': '23'},  # wrong
        2: {'fill': '1966-04-02 00:00:30'},
        3: {'fill': 'march'},  # wrong
    }
    with pytest.raises(OptionValidationError):
        op.setOptions(selected=rejected_selection, fillMode='value')

    accepted_selection = {
        2: {'fill': '1966-04-02 00:00:30'},
        3: {'fill': '0.9'},
    }
    op.setOptions(selected=accepted_selection, fillMode='value')
    assert op.getOptions() == {
        'selected': {2: {'fill': '1966-04-02 00:00:30'}, 3: {'fill': '0.9'}},
        'fillMode': 'value',
    }

    # Shape the operation is expected to report and produce.
    expected_shape = Shape()
    expected_shape.colNames = ['col3', 'col2', 'date', 'col4']
    expected_shape.colTypes = [Types.String, Types.Ordinal, Types.Datetime, Types.Numeric]
    expected_shape.index = ['col1']
    expected_shape.indexTypes = [IndexType(Types.Numeric)]
    assert op.getOutputShape() == expected_shape

    result = op.execute(frame)
    assert result.shape == expected_shape

    actual = mapDate(roundValues(nan_to_None(result.to_dict()), decimals=3))

    # Missing dates are expected to be replaced by the date part of the fill value.
    expected_dates = []
    for stamp in raw['date']:
        if pd.isna(stamp):
            expected_dates.append('1966-04-02')
        else:
            expected_dates.append(stamp.strftime(format='%Y-%m-%d'))

    assert actual == {
        # Columns that were not selected keep their missing entries.
        'col3': ['q', '2', 'c', None, None],
        'col2': ['3', '4', None, None, '0'],
        'date': expected_dates,
        'col4': [0.9, 2.0, 0.9, 4.0, 10.0],
    }
def test_str_toNumeric():
    """Check ToNumeric on a categorical column and a string column.

    Covers output-shape prediction, the shape becoming None when the input
    shape or the options are removed, and the converted values.
    """
    source = {
        'col1': pd.Categorical([3, 0, 5, 6, 0]),
        'col2': [3, 4, 5, 6, 0],
        'col3': ['123', '2', '0.43', '4', '90'],
    }
    # NOTE: a datetime column ('cold') was sketched here in commented-out
    # form; datetime input is not exercised by this test.
    frame = Frame(source)

    op = ToNumeric()
    op.addInputShape(frame.shape, pos=0)
    op.setOptions(attributes={0: {}, 2: {}}, errors='raise')

    # Predicted output shape: the two selected columns become numeric.
    expected_types = frame.shape.columnsDict
    expected_types['col1'] = Types.Numeric
    expected_types['col3'] = Types.Numeric
    assert op.getOutputShape().columnsDict == expected_types

    # Removing the input shape causes None to be returned.
    op.removeInputShape(0)
    assert op.getOutputShape() is None

    # Same after restoring the input shape but clearing the options.
    op.addInputShape(frame.shape, pos=0)
    op.unsetOptions()
    assert op.getOutputShape() is None

    # With both input shape and options set again the prediction is back.
    op.setOptions(attributes={0: {}, 2: {}}, errors='coerce')
    assert op.getOutputShape().columnsDict == expected_types

    converted = op.execute(frame)
    expected_values = {
        'col1': [3.0, 0.0, 5.0, 6.00, 0.0],
        'col2': [3., 4., 5., 6., 0.0],
        'col3': [123.0, 2.0, 0.43, 4.0, 90.0],
    }
    assert roundValues(converted.to_dict(), 3) == expected_values
    assert converted.shape.columnsDict == expected_types
    assert converted.shape.indexDict == frame.shape.indexDict