示例#1
0
def test_standardScale_1_attr_all_nan():
    """StandardScaler applied to a single column that contains only NaN.

    Only column 1 (``col2``) is selected: ``col1`` and ``ww`` are expected
    unchanged and ``col2`` is expected to remain NaN everywhere.
    """
    raw = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1.0, -1.1, 3.0, 7.5, 10.0],
        'col2': [np.nan, np.nan, np.nan, np.nan, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    frame = data.Frame(raw).setIndex(['id1', 'id2'])

    scaler = StandardScaler()
    # Output shape stays unknown until both options and input shape are set
    assert scaler.getOutputShape() is None
    scaler.setOptions(attributes={1: None})
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    assert scaler.getOutputShape() == expected_shape

    # Removing the input shape makes the prediction unavailable again
    scaler.removeInputShape(0)
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)

    scaled = scaler.execute(frame)
    col2_mean = np.nanmean(raw['col2'])
    col2_std = np.nanstd(raw['col2'])
    expected = {
        'col1': raw['col1'],
        'col2': [(x - col2_mean) / col2_std for x in raw['col2']],
        'ww': [3, None, 'ww', '1', '1']
    }
    actual = nan_to_None(roundValues(scaled.to_dict(), 4))
    assert actual == nan_to_None(roundValues(expected, 4))
    assert scaled.shape == expected_shape
示例#2
0
def test_ordinal_to_nominal_cat():
    """ToCategorical converts an ordered categorical column to a nominal one."""
    raw = {
        'col1': pd.Categorical(["5", "0", "5", "U", "0"], ordered=True),
        'col2': [3, 4, 5.1, 6, 0]
    }
    frame = Frame(raw)

    converter = ToCategorical()
    converter.addInputShape(frame.shape, pos=0)
    converter.setOptions(
        attributes={0: {'cat': '5 0 1 2', 'ordered': False}})

    # Predict output shape
    expected_columns = frame.shape.columnsDict
    expected_columns['col1'] = Types.Nominal
    assert converter.getOutputShape().columnsDict == expected_columns

    result = converter.execute(frame)
    # 'U' is not in the requested category list and is expected as None
    assert nan_to_None(result.to_dict()) == {
        'col1': ["5", "0", "5", None, "0"],
        'col2': [3.0, 4.0, 5.1, 6.0, 0.0]
    }
    assert result.shape.columnsDict == expected_columns
    col1_dtype = result.getRawFrame()['col1'].dtype
    assert list(col1_dtype.categories) == ['5', '0', '1', '2']
    assert col1_dtype.ordered is False

    # Leaving 'ordered' out must produce the same values
    converter.setOptions(attributes={0: {'cat': '5 0 1 2'}})
    rerun = converter.execute(frame)
    assert nan_to_None(result.to_dict()) == nan_to_None(rerun.to_dict())
示例#3
0
def test_minMaxScale():
    """Check MinMaxScaler shape prediction, execution and option read-back.

    Columns 0 and 1 (``col1`` and ``col2``) are rescaled to ``(-1, 1)`` and
    ``(2, 4)`` respectively; ``ww`` is not selected and is expected unchanged.
    """
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = MinMaxScaler()
    assert op.getOutputShape() is None
    op.setOptions(attributes={0: {'range': (-1, 1)}, 1: {'range': (2, 4)}})
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s

    # Removing the input shape makes the prediction unavailable again
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    g = op.execute(f)

    def rescale(values, low, high):
        """Min-max scale ``values`` into ``[low, high]``, ignoring NaN."""
        # The builtin min()/max() give an order-dependent result as soon as
        # a NaN is in the list; the nan-aware reductions do not.
        vmin = np.nanmin(values)
        vmax = np.nanmax(values)
        return [(x - vmin) / (vmax - vmin) * (high - low) + low
                for x in values]

    expected = {
        'col1': rescale(d['col1'], -1, 1),
        'col2': rescale(d['col2'], 2, 4),
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(g.to_dict(),
                                   4)) == nan_to_None(roundValues(expected, 4))
    assert g.shape == s
    assert not numpy_equal(g.getRawFrame().values, f.getRawFrame().values)

    # Options must be returned exactly as they were set
    options = op.getOptions()
    assert options == {
        'attributes': {
            0: {
                'range': (-1, 1)
            },
            1: {
                'range': (2, 4)
            }
        }
    }
示例#4
0
def test_merge_index_val():
    """ReplaceValues on a categorical column: merge values and map some to NaN."""
    raw = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994',
             '12-12-2012'],
            dtype='datetime64[ns]')
    }
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replace_table = {1: {'values': '3 4;  6  0', 'replace': 'h; nan'}}
    replacer.setOptions(table=replace_table, inverted=False)

    input_shape = frame.shape.clone()
    predicted = replacer.getOutputShape()
    # The column type must be Nominal both before and after the operation
    assert frame.shape.colTypes[1] == Types.Nominal
    assert Types.Nominal == predicted.colTypes[1]
    assert predicted == input_shape

    result = replacer.execute(frame)
    assert result.shape == frame.shape
    col2_only = data.Frame(result.getRawFrame()['col2'])
    assert nan_to_None(col2_only.to_dict()) == {
        'col2': ["h", "h", "5", None, None]
    }
示例#5
0
def test_merge_from_nan():
    """ReplaceValues on a numeric column where NaN is one of the values to replace."""
    raw = {
        'cowq': [1, 2, None, 4.0, None],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x']
    }
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replace_table = {0: {'values': 'Nan 2.0;4.0', 'replace': '-1;-2'}}
    replacer.setOptions(table=replace_table, inverted=False)

    input_shape = frame.shape.clone()
    assert frame.shape.colTypes[1] == Types.Nominal
    assert replacer.getOutputShape() == input_shape

    result = replacer.execute(frame)
    assert result.shape == frame.shape
    # NaN and 2.0 are expected as -1, 4.0 as -2
    assert nan_to_None(result.to_dict()) == {
        'cowq': [1.0, -1.0, -1.0, -2.0, -1.0],
        'col2': ["3", "4", "5", "6", "0"],
        'col3': ['q', '2', 'c', '4', 'x']
    }
示例#6
0
def test_merge_nan():
    """ReplaceValues with NaN (in any letter case) as the replacement value."""
    raw = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x']
    }
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    # Column 1 is listed before column 0, as in the option table under test
    replace_table = {
        1: {'values': 'hello 2 6 0; 3', 'replace': 'NAN; nan'},
        0: {'values': '2 4 10', 'replace': 'naN'}
    }
    replacer.setOptions(table=replace_table, inverted=False)

    input_shape = frame.shape.clone()
    assert frame.shape.colTypes[1] == Types.Nominal
    assert replacer.getOutputShape() == input_shape

    result = replacer.execute(frame)
    assert result.shape == frame.shape
    assert nan_to_None(result.to_dict()) == {
        'cowq': [1, None, 3, None, None],
        'col2': [None, "4", "5", None, None],
        'col3': ['q', '2', 'c', '4', 'x']
    }
示例#7
0
def test_merge_category_inverted():
    """ReplaceValues on a categorical column with the ``inverted`` flag set."""
    dates = [
        '05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'
    ]
    raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': dates
    }
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replace_table = {1: {'values': '4 0; 0', 'replace': 'val;  NAN'}}
    replacer.setOptions(table=replace_table, inverted=True)

    input_shape = frame.shape.clone()
    assert replacer.getOutputShape() == input_shape
    assert input_shape.colTypes[1] == Types.Nominal

    result = replacer.execute(frame)

    assert nan_to_None(result.to_dict()) == {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [None, None, None, None, "0"],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': [
            '05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994',
            '12-12-2012'
        ]
    }
    # The result differs from the input frame but keeps the same shape
    assert result != frame
    assert result.shape == input_shape
示例#8
0
def test_fillnan_ffill():
    """FillNan in 'ffill' mode on string, ordinal and datetime columns."""
    raw = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]')
    }
    frame = data.Frame(raw).setIndex('col1')

    filler = FillNan()
    assert filler.getOutputShape() is None
    filler.addInputShape(frame.shape, 0)
    filler.setOptions(selected={0: None, 1: None, 2: None}, fillMode='ffill')
    assert filler.getOptions() == {
        'selected': {0: None, 1: None, 2: None},
        'fillMode': 'ffill'
    }

    expected_shape = Shape()
    expected_shape.colNames = ['col3', 'col2', 'date']
    expected_shape.colTypes = [Types.String, Types.Ordinal, Types.Datetime]
    expected_shape.index = ['col1']
    expected_shape.indexTypes = [IndexType(Types.Numeric)]
    assert filler.getOutputShape() == expected_shape

    filled = filler.execute(frame)
    assert filled.shape == expected_shape

    actual = mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3))
    # Missing dates are expected to take the first (only preceding) date
    expected_dates = [
        '1988-05-09' if pd.isna(t) else t.strftime(format='%Y-%m-%d')
        for t in raw['date']
    ]
    assert actual == {
        'col3': ['q', '2', 'c', 'c', 'c'],
        'col2': ['3', '4', '4', '4', '0'],
        'date': expected_dates
    }
示例#9
0
def test_discretize_by_date_with_None():
    """DateDiscretizer on datetime columns with missing values, keeping originals.

    New columns are added with the '_disc' suffix; an input column already
    named 'cold_disc' is expected to be replaced by the discretized one.
    """
    ts = pd.Timestamp
    raw = {
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [ts('05-09-1988'), ts('22-12-1994'), ts('21-11-1995'), None,
                 ts('12-12-2012')],
        'cold2': [ts('01-01-1950'), ts('22-12-1980'), ts('21-11-1995'), None,
                  ts('12-12-2034')],
        'cold_disc': [None, None, None, None, None]  # test to see if it is removed
    }

    frame = data.Frame(raw).setIndex('col2')

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    columns = frame.shape.columnsDict
    assert columns['cold'] == Types.Datetime
    assert columns['cold2'] == Types.Datetime

    intervals = [ts('01-01-1950'), ts('01-01-1970'), ts('01-01-1990'),
                 ts('01-01-2010'), ts('01-01-2030')]

    def selection():
        """Build a fresh copy of the per-column options used in this test."""
        return {
            1: {'ranges': (intervals, True, False),
                'labels': ['50', '70', '80', 'now']},
            2: {'ranges': (intervals, True, True),
                'labels': ['50', '70', '80', 'now']}
        }

    discretizer.setOptions(selected=selection(), suffix=(True, '_disc'))

    assert discretizer.getOptions() == {
        'selected': selection(),
        'suffix': (True, '_disc')
    }

    columns['cold_disc'] = Types.Ordinal
    columns['cold2_disc'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(columns, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape

    output = nan_to_None(result.to_dict())
    assert output == {
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [ts('05-09-1988'), ts('22-12-1994'), ts('21-11-1995'), None,
                 ts('12-12-2012')],
        'cold2': [ts('01-01-1950'), ts('22-12-1980'), ts('21-11-1995'), None,
                  ts('12-12-2034')],
        'cold_disc': ['70', '80', '80', None, 'now'],
        'cold2_disc': [None, '70', '80', None, None]
    }
    label_order = ['50', '70', '80', 'now']
    assert result.getRawFrame()['cold_disc'].cat.categories.to_list() == label_order
    assert result.getRawFrame()['cold_disc'].dtype.ordered is True
    assert result.getRawFrame()['cold2_disc'].cat.categories.to_list() == label_order
    assert result.getRawFrame()['cold2_disc'].dtype.ordered is True
示例#10
0
def test_remove_bijections():
    """RemoveBijections drops 'col11', which duplicates 'col3' in this data."""
    raw = {
        'col1': [1.0, 2.0, 3.0, np.nan, 10.0],
        'col2': [3.0, 4.0, np.nan, 6.0, np.nan],
        'col3': ['q', '2', 'c', '4', 'x'],
        'col11': ['q', '2', 'c', '4', 'x'],
        'date': [
            '05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994',
            '12-12-2012'
        ]
    }
    frame = data.Frame(raw)

    remover = RemoveBijections()
    remover.addInputShape(frame.shape, 0)
    assert remover.getOutputShape() is None
    assert remover.getOptions() == {'attributes': dict()}

    selection = {1: None, 0: None, 2: None, 3: None}
    remover.setOptions(attributes=selection)
    assert remover.getOutputShape() is None
    assert remover.needsInputShapeKnown() is True
    assert remover.isOutputShapeKnown() is False
    stored_options = {'attributes': {1: None, 0: None, 3: None, 2: None}}
    assert remover.getOptions() == stored_options
    # Mutating the dict passed in must not change the stored options
    selection[1] = 'ss'
    assert remover.getOptions() == stored_options

    result = remover.execute(frame)
    expected = copy.deepcopy(raw)
    expected.pop('col11')
    assert nan_to_None(expected) == nan_to_None(result.to_dict())
示例#11
0
def test_discretize_by_date_and_time():
    """DateDiscretizer with time-of-day boundaries, replacing the columns."""
    ts = pd.Timestamp
    raw = {
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [ts('05-09-1988 13:45'), ts('22-12-1994 14:21'),
                 ts('21-11-1995 11:50'), None, ts('12-12-2012 09:15')],
        'cold2': [ts('05-09-1988 13:45'), ts('22-12-1994 14:21'),
                  ts('21-11-1995 11:50'), None, ts('12-12-2012 09:15')]
    }

    frame = data.Frame(raw).setIndex(['col2', 'col3'])

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    columns = frame.shape.columnsDict
    assert columns['cold'] == Types.Datetime

    intervals = [ts('05-09-1988 07:00'), ts('20-12-1994 11:30'),
                 ts('05-09-2000 14:20'), ts('01-09-2010 14:30'),
                 ts('12-12-2012 09:14')]
    labels = ['early mo', 'middle', 'late', 'now']

    discretizer.setOptions(
        selected={
            0: {'ranges': (intervals, True, True), 'labels': labels},
            1: {'ranges': (intervals, True, True), 'labels': labels}
        },
        suffix=(False, '_disc'))

    # With the suffix flag off, the suffix string is expected back as None
    assert discretizer.getOptions() == {
        'selected': {
            0: {'ranges': (intervals, True, True), 'labels': labels},
            1: {'ranges': (intervals, True, True), 'labels': labels}
        },
        'suffix': (False, None)
    }

    columns['cold'] = Types.Ordinal
    columns['cold2'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(columns, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape
    assert result.shape != frame.shape

    output = nan_to_None(result.to_dict())
    assert output == {
        'cold': ['early mo', 'middle', 'middle', None, None],
        'cold2': ['early mo', 'middle', 'middle', None, None]
    }
    label_order = ['early mo', 'middle', 'late', 'now']
    assert result.getRawFrame()['cold'].cat.categories.to_list() == label_order
    assert result.getRawFrame()['cold'].dtype.ordered is True
    assert result.getRawFrame()['cold2'].cat.categories.to_list() == label_order
    assert result.getRawFrame()['cold2'].dtype.ordered is True
示例#12
0
def test_standardScale():
    """StandardScaler on two numeric columns, one of which contains NaN."""
    raw = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    frame = data.Frame(raw).setIndex(['id1', 'id2'])

    scaler = StandardScaler()
    assert scaler.getOutputShape() is None
    scaler.setOptions(attributes={0: None, 1: None})
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    assert scaler.getOutputShape() == expected_shape

    # Removing the input shape makes the prediction unavailable again
    scaler.removeInputShape(0)
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)

    scaled = scaler.execute(frame)

    def standardized(values):
        """Return (x - mean) / std for each value, ignoring NaN in the stats."""
        mean = np.nanmean(values)
        std = np.nanstd(values)
        return [(x - mean) / std for x in values]

    expected = {
        'col1': standardized(raw['col1']),
        'col2': standardized(raw['col2']),
        'ww': [3, None, 'ww', '1', '1']
    }
    actual = nan_to_None(roundValues(scaled.to_dict(), 4))
    assert actual == nan_to_None(roundValues(expected, 4))
    assert scaled.shape == expected_shape
    assert not numpy_equal(scaled.getRawFrame().values,
                           frame.getRawFrame().values)

    assert scaler.getOptions() == {'attributes': {0: None, 1: None}}
示例#13
0
def test_discretize_num_uniform():
    """BinsDiscretizer with the uniform strategy, replacing the columns."""
    raw = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, 1, 'ww', '1', '1']
    }
    frame = data.Frame(raw)

    discretizer = BinsDiscretizer()
    assert discretizer.getOutputShape() is None
    # Options of a freshly constructed operation
    assert discretizer.getOptions() == {
        'attributes': {},
        'strategy': BinStrategy.Uniform,
        'suffix': (True, '_discretized')
    }

    bin_options = {0: {'bins': '2'}, 1: {'bins': '3'}}
    strategy = BinStrategy.Uniform

    discretizer.setOptions(attributes=bin_options,
                           strategy=strategy,
                           suffix=(False, None))
    # Check for side effects
    bin_options[0]['bins'] = '23'
    strategy = '11'

    assert discretizer.getOptions() == {
        'attributes': {0: {'bins': '2'}, 1: {'bins': '3'}},
        'strategy': BinStrategy.Uniform,
        'suffix': (False, None)
    }

    discretizer.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    expected_shape.colTypes[0] = Types.Ordinal
    expected_shape.colTypes[1] = Types.Ordinal
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert nan_to_None(result.to_dict()) == {
        'col1': ['0.0', '0.0', '0.0', '1.0', '1.0'],
        'col2': ['0.0', '1.0', None, '2.0', None],
        'ww': [3, 1, 'ww', '1', '1']
    }
    assert result.shape == expected_shape
示例#14
0
def test_discretize_range():
    """RangeDiscretizer with explicit bin edges and labels, replacing the columns."""
    raw = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, 1, 'ww', '1', '1']
    }
    frame = data.Frame(raw)

    discretizer = RangeDiscretizer()
    assert discretizer.getOutputShape() is None
    assert discretizer.getOptions() == {'table': {}, 'suffix': (True, '_bins')}
    discretizer.setOptions(
        table={
            0: {'bins': [0, 2, 4, 6, 8, 10], 'labels': '"a u with\'" b c d e'},
            1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}
        },
        suffix=(False, None))
    assert discretizer.getOptions() == {
        'table': {
            0: {'bins': [0, 2, 4, 6, 8, 10], 'labels': '"a u with\'" b c d e'},
            1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}
        },
        'suffix': (False, None)
    }

    discretizer.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    expected_shape.colTypes[0] = Types.Ordinal
    expected_shape.colTypes[1] = Types.Ordinal
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    # -1.1 lies below the first bin edge and is expected as None
    assert nan_to_None(result.to_dict()) == {
        'col1': ['a u with\'', None, 'b', 'd', 'e'],
        'col2': ['B', 'B', None, 'C', None],
        'ww': [3, 1, 'ww', '1', '1']
    }
    assert result.shape == expected_shape
示例#15
0
def test_discretize_range_suffix():
    """RangeDiscretizer writing to a suffixed column whose name already exists."""
    raw = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1'],
        'col2_binss': [3, 1, 'ww', '1', '1']
    }
    frame = data.Frame(raw)

    discretizer = RangeDiscretizer()

    table_options = {1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}}
    discretizer.setOptions(table=table_options, suffix=(True, '_binss'))

    # Changing the dict passed in must not change the stored options
    table_options[1] = {}
    assert discretizer.getOptions() == {
        'table': {1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}},
        'suffix': (True, '_binss')
    }

    discretizer.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    expected_shape.colTypes[1] = Types.Numeric
    expected_shape.colTypes[3] = Types.Ordinal
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    expected_output = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1'],
        # This column must replace the original duplicate
        'col2_binss': ['B', 'B', None, 'C', None]
    }
    assert nan_to_None(result.to_dict()) == expected_output
    assert result.shape == expected_shape

    # Check that output is the same as with drop
    discretizer.setOptions(
        table={1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}},
        suffix=(False, None))
    dropped = nan_to_None(discretizer.execute(frame).to_dict())
    assert expected_output['col2_binss'] == dropped['col2']
    assert expected_output['col2'] != dropped['col2']
    assert expected_output['ww'] == dropped['ww']
    assert expected_output['col1'] == dropped['col1']
    assert expected_output['col2_binss'] != dropped['col2_binss']
示例#16
0
def test_fillnan_byVal_date_num():
    """FillNan in 'value' mode on a datetime and a numeric column."""
    raw = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
        'col4': [np.nan, 2, np.nan, 4, 10]
    }
    frame = data.Frame(raw).setIndex('col1')

    filler = FillNan()
    assert filler.getOutputShape() is None
    filler.addInputShape(frame.shape, 0)

    # This selection contains invalid fill values and must be rejected
    invalid_selection = {
        0: {'fill': 'pol'},
        1: {'fill': '23'},  # wrong
        2: {'fill': '1966-04-02 00:00:30'},
        3: {'fill': 'march'}  # wrong
    }
    with pytest.raises(OptionValidationError):
        filler.setOptions(selected=invalid_selection, fillMode='value')

    filler.setOptions(
        selected={2: {'fill': '1966-04-02 00:00:30'}, 3: {'fill': '0.9'}},
        fillMode='value')

    assert filler.getOptions() == {
        'selected': {2: {'fill': '1966-04-02 00:00:30'}, 3: {'fill': '0.9'}},
        'fillMode': 'value'
    }

    expected_shape = Shape()
    expected_shape.colNames = ['col3', 'col2', 'date', 'col4']
    expected_shape.colTypes = [
        Types.String, Types.Ordinal, Types.Datetime, Types.Numeric
    ]
    expected_shape.index = ['col1']
    expected_shape.indexTypes = [IndexType(Types.Numeric)]
    assert filler.getOutputShape() == expected_shape

    filled = filler.execute(frame)
    assert filled.shape == expected_shape

    actual = mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3))
    expected_dates = [
        '1966-04-02' if pd.isna(t) else t.strftime(format='%Y-%m-%d')
        for t in raw['date']
    ]
    # Columns that were not selected keep their missing values
    assert actual == {
        'col3': ['q', '2', 'c', None, None],
        'col2': ['3', '4', None, None, '0'],
        'date': expected_dates,
        'col4': [0.9, 2.0, 0.9, 4.0, 10.0]
    }
示例#17
0
def test_discretize_by_date():
    """DateDiscretizer applied to a string column holding dates."""
    raw = {
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994',
                 '12-12-2012']
    }

    frame = data.Frame(raw)

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    columns = frame.shape.columnsDict
    assert columns['cold'] is Types.String

    intervals = [
        pd.Timestamp(day)
        for day in ('01-01-1950', '01-01-1970', '30-12-1994', '01-01-2010')
    ]

    discretizer.setOptions(
        selected={2: {'ranges': (intervals, True, False),
                      'labels': ['50', '70', 'now']}},
        suffix=(False, None))
    assert discretizer.getOptions() == {
        'selected': {2: {'ranges': (intervals, True, False),
                         'labels': ['50', '70', 'now']}},
        'suffix': (False, None)
    }
    columns['cold'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(columns, frame.shape.indexDict)

    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape

    output = nan_to_None(result.to_dict())
    # The 2012 date lies after the last interval edge and is expected as None
    assert output == {
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': ['70', '70', 'now', '70', None]
    }
    assert result.getRawFrame().iloc[:, 2].cat.categories.to_list() == [
        '50', '70', 'now'
    ]
    assert result.getRawFrame().iloc[:, 2].dtype.ordered is True
示例#18
0
def test_drop_columns():
    """Check DropColumns option handling, shape prediction and execution.

    Columns 0 and 2 (``col2`` and ``date`` once ``col1`` is the index) are
    dropped, so only ``col3`` is expected in the output.
    """
    e = {'col1': [np.nan, 2, np.nan, 4, 10],
         'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
         'col3': ['q', '2', 'c', np.nan, np.nan],
         'date': pd.Series(['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
                           dtype='datetime64[ns]')}
    g = data.Frame(e)

    g = g.setIndex('col1')

    op = DropColumns()
    assert op.getOutputShape() is None
    op.addInputShape(g.shape, 0)
    assert op.getOutputShape() is None
    assert op.getOptions() == {
        'selected': dict()
    }

    selOpts = {0: None, 2: None}
    op.setOptions(selected={0: None, 2: None})
    opts = op.getOptions()
    assert opts['selected'] == selOpts
    # Mutating the returned options must not affect the operation's own state
    opts['selected'] = {}
    assert op.getOptions()['selected'] == selOpts
    assert op.getOptions() != opts

    # An empty selection is rejected. The exception info is not inspected, so
    # it is not bound to a name (binding it to `e` would also overwrite the
    # source data dict above).
    with pytest.raises(exc.OptionValidationError):
        op.setOptions(selected={})

    # The rejected call above must leave the earlier valid options in place
    s = data.Shape()
    s.colNames = ['col3']
    s.colTypes = [Types.String]
    s.index = ['col1']
    s.indexTypes = [IndexType(Types.Numeric)]
    assert op.getOutputShape() == s

    h = op.execute(g)
    assert h.shape == s

    assert nan_to_None(h.to_dict()) == {
        'col3': ['q', '2', 'c', None, None]
    }
示例#19
0
def test_merge_string():
    """ReplaceValues on a string column; the numeric column is left untouched."""
    raw = {'cowq': [1, 2, None, 4.0, None], 'col3': ['q', '2', 'c', '4', 'q']}
    frame = data.Frame(raw)

    replacer = ReplaceValues()
    replacer.addInputShape(frame.shape, 0)
    replace_table = {1: {'values': 'q 2; nAn', 'replace': '-1;-2'}}
    replacer.setOptions(table=replace_table, inverted=False)

    input_shape = frame.shape.clone()
    assert replacer.getOutputShape() == input_shape

    result = replacer.execute(frame)
    assert result.shape == frame.shape
    assert nan_to_None(result.to_dict()) == {
        'cowq': [1.0, 2.0, None, 4.0, None],
        'col3': ["-1", "-1", "c", "4", "-1"]
    }
示例#20
0
def test_str_toNumeric_coerce():
    """ToNumeric with errors='coerce' on a categorical and a string column."""
    raw = {
        'col1': pd.Categorical(['3', np.nan, 5, 6, 0]),
        'col2': [3, 4, 5, 6, 0],
        'col3': [np.nan, '2', '0.43', '4', np.nan]
    }
    frame = Frame(raw)

    converter = ToNumeric()
    converter.addInputShape(frame.shape, pos=0)
    converter.setOptions(attributes={0: dict(), 2: dict()}, errors='coerce')

    # Predict output shape
    expected_columns = frame.shape.columnsDict
    expected_columns['col1'] = Types.Numeric
    expected_columns['col3'] = Types.Numeric
    assert converter.getOutputShape().columnsDict == expected_columns

    # Removing options/input_shape causes None to be returned
    converter.removeInputShape(0)
    assert converter.getOutputShape() is None
    converter.addInputShape(frame.shape, pos=0)
    converter.unsetOptions()
    assert converter.getOutputShape() is None
    # Re-adding everything
    converter.setOptions(attributes={0: dict(), 2: dict()}, errors='coerce')
    assert converter.getOutputShape().columnsDict == expected_columns

    result = converter.execute(frame)
    assert roundValues(nan_to_None(result.to_dict()), 2) == {
        'col1': [3., None, 5., 6., 0.],
        'col2': [3., 4., 5., 6., 0.],
        'col3': [None, 2.0, 0.43, 4.0, None]
    }
    assert result.shape.columnsDict == expected_columns
    assert result.shape.indexDict == frame.shape.indexDict
示例#21
0
def test_str_toCategory():
    """ToCategorical on string columns, one ordered with explicit categories."""
    raw = {
        'col1': pd.Categorical(["3", "0", "5", "6", "0"]),
        'col2': ["3", "4", "5.1", "6", None],
        'col3': ['123', '2', '0.43', 'nan', '90']
    }
    frame = Frame(raw)

    converter = ToCategorical()
    converter.addInputShape(frame.shape, pos=0)
    converter.setOptions(
        attributes={1: {'cat': '4 3 0', 'ordered': True}, 2: dict()})

    # Predict output shape
    expected_columns = frame.shape.columnsDict
    expected_columns['col3'] = Types.Nominal
    expected_columns['col2'] = Types.Ordinal
    assert converter.getOutputShape().columnsDict == expected_columns

    # Removing options/input_shape causes None to be returned
    converter.removeInputShape(0)
    assert converter.getOutputShape() is None
    converter.addInputShape(frame.shape, pos=0)
    converter.unsetOptions()
    assert converter.getOutputShape() is None
    # Re-adding everything
    converter.setOptions(
        attributes={1: {'cat': '4 3 0', 'ordered': True}, 2: dict()})
    assert converter.getOutputShape().columnsDict == expected_columns

    result = converter.execute(frame)
    # Values of col2 outside the listed categories are expected as None
    assert nan_to_None(result.to_dict()) == {
        'col1': ["3", "0", "5", "6", "0"],
        'col2': ['3', '4', None, None, None],
        'col3': ['123', '2', '0.43', 'nan', '90']
    }
    assert result.shape.columnsDict == expected_columns
示例#22
0
def test_cat_toCategory():
    """ToCategorical on an already categorical column with fewer categories."""
    raw = {
        'col1': pd.Categorical(["5", "0", "5", "U", "0"]),
        'col2': [3, 4, 5.1, 6, 0],
        'col4': [1, 2, 3, 4, 5],  # this will become a float
        'col3': ['123', '2', '0.43', '4', '90']
    }
    frame = Frame(raw)

    converter = ToCategorical()
    converter.addInputShape(frame.shape, pos=0)
    converter.setOptions(attributes={0: {'cat': '5 0'}})

    # Predict output shape: column types are expected to be unchanged
    expected_columns = frame.shape.columnsDict
    assert converter.getOutputShape().columnsDict == expected_columns

    # Removing options/input_shape causes None to be returned
    converter.removeInputShape(0)
    assert converter.getOutputShape() is None
    converter.addInputShape(frame.shape, pos=0)
    converter.unsetOptions()
    assert converter.getOutputShape() is None
    # Re-adding everything
    converter.setOptions(attributes={0: {'cat': '5 0'}})
    assert converter.getOutputShape().columnsDict == expected_columns

    result = converter.execute(frame)
    assert nan_to_None(result.to_dict()) == {
        'col1': ["5", "0", "5", None, "0"],
        'col2': [3.0, 4.0, 5.1, 6.0, 0.0],
        'col3': ['123', '2', '0.43', '4', '90'],
        'col4': [1.0, 2.0, 3.0, 4.0, 5.0]
    }
    assert result.shape.columnsDict == expected_columns
示例#23
0
def test_execute():
    """ExtractTimeSeries builds a long-format frame from two workbench frames.

    Two series ('diab' and 'other') are assembled from columns of 'frameF'
    and 'frameG' over four time labels, and the result is stored in the
    workbench as 'frameR'.
    """
    frame_f = data.Frame({
        'id': ['hiab', 'gine', 'hiac', 'hiaa', 'hiad'],
        'diab1': [2.3, 3.4, 10.2, 14.6, 66.3],
        'diab2': [3, 4, np.nan, 6, np.nan],
        'new1': ['cat1', 'cat2', 'ww', '1', '10']
    }).setIndex('id')
    frame_g = data.Frame({
        'id': ['hiab', 'hiae', 'hiac', 'hiaa', 'hiad'],
        'diab4': [12, np.nan, 21.2, 13.45, 1.02],
        'diab3': [1.0, -12, 2.3, 4.1, 5.6],
        'newt': [0, 0, 1, 0, 0]
    }).setIndex('id')
    shape_f = frame_f.shape.clone()
    shape_g = frame_g.shape.clone()
    workbench = WorkbenchModelMock()
    workbench.setDataframeByName('frameF', frame_f)
    workbench.setDataframeByName('frameG', frame_g)

    assert frame_g.getRawFrame().index.name == 'id'
    assert frame_f.getRawFrame().index.name == 'id'

    op = ExtractTimeSeries(workbench)

    timeLabels = ['wave1', 'wave2', 'wave3', 'wave4']

    options = {
        'diab': [('frameG', 0, 3), ('frameF', 1, 1), ('frameF', 0, 0),
                 ('frameG', 1, 2)],
        'other': [('frameF', 2, 0), ('frameF', 1, 1), ('frameG', 2, 2),
                  ('frameF', 0, 3)]
    }

    op.setOptions(series=options, time=timeLabels, outName='frameR')

    # Check for side effects: the operation must hold its own copies
    options['diab'] = []
    assert op._ExtractTimeSeries__timeLabels is not timeLabels
    assert op._ExtractTimeSeries__series is not options
    assert op._ExtractTimeSeries__series == {
        'diab': [('frameG', 0, 3), ('frameF', 1, 1), ('frameF', 0, 0),
                 ('frameG', 1, 2)],
        'other': [('frameF', 2, 0), ('frameF', 1, 1), ('frameG', 2, 2),
                  ('frameF', 0, 3)]
    }

    op.execute()

    # The input frames must keep their shapes
    assert workbench.getDataframeModelByName('frameF').frame.shape == shape_f
    assert workbench.getDataframeModelByName('frameG').frame.shape == shape_g
    result: data.Frame = workbench.getDataframeModelByName('frameR').frame

    raw_result = result.getRawFrame()
    records_by_id = {
        key: group.to_dict(orient='records')
        for key, group in raw_result.groupby(level=0)
    }
    actual = {
        key: sorted(records, key=lambda rec: rec['time'])
        for key, records in nan_to_None(records_by_id).items()
    }

    waves = ('wave1', 'wave2', 'wave3', 'wave4')

    def records(*pairs):
        """Turn (diab, other) pairs into one record per wave, in wave order."""
        return [{'diab': diab, 'other': other, 'time': wave}
                for (diab, other), wave in zip(pairs, waves)]

    assert actual == {
        'hiaa': records((14.6, '1'), (6.0, 6.0), (4.1, 0.0), (13.45, 14.6)),
        'hiab': records((2.3, 'cat1'), (3.0, 3.0), (1.0, 0.0), (12.0, 2.3)),
        'hiac': records((10.2, 'ww'), (None, None), (2.3, 1.0), (21.2, 10.2)),
        'hiad': records((66.3, '10'), (None, None), (5.6, 0.0), (1.02, 66.3)),
        'hiae': records((None, None), (None, None), (-12.0, 0.0),
                        (None, None)),
        'gine': records((3.4, 'cat2'), (4.0, 4.0), (None, None), (None, 3.4))
    }
    assert result.shape.columnsDict == {
        'diab': Types.Numeric,
        'other': Types.String,
        'time': Types.Ordinal
    }
示例#24
0
def test_discretize_num_uniform_nondrop():
    """Uniform binning with a suffix keeps the sources and adds binned columns.

    Columns 0 and 1 ('col1', 'col2') are split into 2 and 3 uniform bins.
    With suffix '_discre' the results land in 'col1_discre' / 'col2_discre'
    (the latter already exists in the input and gets replaced), missing
    values stay missing, and the untouched columns are carried over. The
    same options without a suffix must yield identical bins written over
    the source columns.
    """

    def bin_spec():
        # Build a fresh mapping on every call so the operation and the
        # literals it is compared against never share the same dict objects.
        return {0: {'bins': '2'}, 1: {'bins': '3'}}

    # 'col2_discre' is present on purpose: the operation has to replace it.
    frame = data.Frame({
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, 1, 'ww', '1', '1'],
        'col2_discre': [1, 1, 1, 1, 1]
    })

    discretizer = BinsDiscretizer()

    # A freshly created operation has no shape and exposes default options.
    assert discretizer.getOutputShape() is None
    assert discretizer.getOptions() == {
        'attributes': {},
        'strategy': BinStrategy.Uniform,
        'suffix': (True, '_discretized')
    }

    discretizer.setOptions(attributes=bin_spec(),
                           strategy=BinStrategy.Uniform,
                           suffix=(True, '_discre'))
    assert discretizer.getOptions() == {
        'attributes': bin_spec(),
        'strategy': BinStrategy.Uniform,
        'suffix': (True, '_discre')
    }

    # Predicted shape: the input shape plus the two suffixed ordinal columns.
    discretizer.addInputShape(frame.shape, 0)
    input_shape = frame.shape.clone()
    columns = input_shape.columnsDict
    for added_column in ('col1_discre', 'col2_discre'):
        columns[added_column] = Types.Ordinal
    expected_shape = data.Shape.fromDict(columns, input_shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    suffixed_result = discretizer.execute(frame)
    expected_output = {
        'col1_discre': ['0.0', '0.0', '0.0', '1.0', '1.0'],
        'col2_discre': ['0.0', '1.0', None, '2.0', None],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1']
    }
    assert nan_to_None(suffixed_result.to_dict()) == expected_output
    assert suffixed_result.shape == expected_shape

    # Same options without a suffix: the bins overwrite the source columns.
    discretizer.setOptions(attributes=bin_spec(),
                           strategy=BinStrategy.Uniform,
                           suffix=(False, None))
    replaced = nan_to_None(discretizer.execute(frame).to_dict())
    for source in ('col1', 'col2'):
        assert expected_output[source + '_discre'] == replaced[source]
    for source in ('col1', 'col2'):
        assert expected_output[source] != replaced[source]
示例#25
0
def test_discretize_by_time():
    """Check DateDiscretizer on time-of-day values with a '_disc' suffix.

    Three columns are discretized with the same five labelled time ranges.
    The test verifies the predicted output shape, the produced labels, and
    that every generated column is an ordered categorical whose categories
    are exactly the configured labels.
    """
    d = {'col2': [3, 4, 5.1, 6, 0],
         'col3': ['123', '2', '0.43', '4', '2021 January'],
         'cold': [pd.Timestamp('10:42'), pd.Timestamp('23:59:07'),
                  pd.Timestamp('07:12'), None, pd.Timestamp('18:13')],
         'cold2': [pd.Timestamp('22:59'), pd.Timestamp('12:00'),
                   pd.Timestamp('16:40:02'), pd.Timestamp('16:40:03'), pd.Timestamp('22:00:02')],
         'nan': [None, None, None, None, None],
         # Pre-existing column named like an output column: the expected result
         # below holds real labels here, i.e. this all-None content is replaced
         'cold_disc': [None, None, None, None, None]  # test to see if it is removed
         }

    f = data.Frame(d)
    # 'col2' and 'col3' become the index, so they are absent from the column
    # dictionaries compared below
    f = f.setIndex(['col2', 'col3'])

    op = DateDiscretizer()
    # No options are set yet: the output shape stays unknown, also when the
    # same input shape is supplied a second time
    op.addInputShape(f.shape, 0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    assert op.getOutputShape() is None
    shapeDict = f.shape.columnsDict
    # 'cold' contains a None entry and is still typed as Datetime
    assert shapeDict['cold'] == Types.Datetime
    assert shapeDict['cold2'] == Types.Datetime

    # Six edges delimit five ranges, one per label
    intervals = [pd.Timestamp('00:00'), pd.Timestamp('06:00'), pd.Timestamp('12:00'),
                 pd.Timestamp('16:40:02'), pd.Timestamp('22:00'),
                 pd.Timestamp('23:59')]
    labels = ['night1', 'morning', 'afternoon', 'evening', 'night2']

    # It's necessary to set a default date object, which is normally done by the editor
    intervals = withDefaultDate(intervals)
    # Keys 0-2 select 'cold', 'cold2' and 'nan' (see the '_disc' columns added
    # to the expected shape below).
    # NOTE(review): the two booleans in 'ranges' presumably choose whether the
    # date and/or the time part is compared - confirm against DateDiscretizer
    op.setOptions(selected={
        0: {'ranges': (intervals, False, True), 'labels': labels},
        1: {'ranges': (intervals, False, True), 'labels': labels},
        2: {'ranges': (intervals, False, True), 'labels': labels}},
        suffix=(True, '_disc'))

    assert op.getOptions() == {
        'selected': {
            0: {'ranges': (intervals, False, True), 'labels': labels},
            1: {'ranges': (intervals, False, True), 'labels': labels},
            2: {'ranges': (intervals, False, True), 'labels': labels}},
        'suffix': (True, '_disc')
    }

    # Expected shape: the input columns plus one ordinal '<name>_disc' column
    # per selected attribute ('cold_disc' already exists and changes type)
    shapeDict['cold_disc'] = Types.Ordinal
    shapeDict['cold2_disc'] = Types.Ordinal
    shapeDict['nan_disc'] = Types.Ordinal
    s = data.Shape.fromDict(shapeDict, f.shape.indexDict)
    assert op.getOutputShape() == s

    g = op.execute(f)
    assert g.shape == s

    output = nan_to_None(g.to_dict())
    # The expected labels show that ranges are closed on the right
    # (12:00 -> 'morning', 16:40:02 -> 'afternoon'), that a value past the last
    # edge (23:59:07) gets no label, and that missing inputs stay missing
    assert output == {'cold': [pd.Timestamp('10:42'), pd.Timestamp('23:59:07'),
                               pd.Timestamp('07:12'), None, pd.Timestamp('18:13')],
                      'cold2': [pd.Timestamp('22:59'), pd.Timestamp('12:00'),
                                pd.Timestamp('16:40:02'), pd.Timestamp('16:40:03'),
                                pd.Timestamp('22:00:02')],
                      'nan': [None, None, None, None, None],
                      'nan_disc': [None, None, None, None, None],
                      'cold_disc': ['morning', None, 'morning', None, 'evening'],
                      'cold2_disc': ['night2', 'morning', 'afternoon', 'evening', 'night2']
                      }
    # Every generated column, including the all-None one, must be an ordered
    # categorical carrying the full label list in the configured order
    assert g.getRawFrame()['cold_disc'].cat.categories.to_list() == labels
    assert g.getRawFrame()['cold_disc'].dtype.ordered is True
    assert g.getRawFrame()['cold2_disc'].cat.categories.to_list() == labels
    assert g.getRawFrame()['cold2_disc'].dtype.ordered is True
    assert g.getRawFrame()['nan_disc'].cat.categories.to_list() == labels
    assert g.getRawFrame()['nan_disc'].dtype.ordered is True