Пример #1
0
def test_fromShape_categories():
    """Check that ``Frame.fromShape`` reproduces the shape of a frame whose
    index mixes categorical (ordered and unordered) and numeric columns."""
    dateColumn = pd.Series(
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
        dtype='datetime64[ns]')
    rawData = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 6, 0]),
        'col3': pd.Categorical(['q', '2', 'c', '4', 'x'], ordered=True),
        'cold': dateColumn,
    }
    source = Frame(rawData).setIndex(['col2', 'col3', 'col1'])

    rebuilt = Frame.fromShape(source.shape)

    # fromShape is expected to preserve the index
    expected = Shape()
    expected.colNames = ['cold']
    expected.colTypes = [Types.Datetime]
    expected.index = ['col3', 'col1', 'col2']
    expected.indexTypes = [
        IndexType(kind)
        for kind in (Types.Ordinal, Types.Numeric, Types.Nominal)
    ]
    assert rebuilt.shape == expected == source.shape
Пример #2
0
def test_set_index_num():
    """Set a numeric column as index with ``SetIndex``, then undo it with
    ``ResetIndex``, checking options and shapes along the way."""
    dateColumn = pd.Series(
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
        dtype='datetime64[ns]')
    frame = data.Frame({
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 6, 0]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': dateColumn,
    })

    setOp = SetIndex()
    assert setOp.getOptions() == {'selected': dict()}
    chosen = {'selected': {0: None}}
    setOp.setOptions(**chosen)

    assert setOp.getOptions() == chosen
    assert isDictDeepCopy(setOp.getOptions(), chosen)

    # No input shape has been added yet, so no output shape is available
    assert setOp.getOutputShape() is None

    setOp.addInputShape(frame.shape, 0)
    indexedShape = Shape()
    indexedShape.colNames = ['col3', 'col2', 'date']
    indexedShape.colTypes = [Types.String, Types.Nominal, Types.Datetime]
    indexedShape.index = ['cowq']
    indexedShape.indexTypes = [IndexType(Types.Numeric)]
    assert setOp.getOutputShape() == indexedShape

    indexedFrame = setOp.execute(frame)
    assert indexedFrame.shape == indexedShape

    # Reset index
    resetOp = ResetIndex()
    assert resetOp.getOutputShape() is None
    resetOp.addInputShape(indexedFrame.shape, 0)
    resetShape = Shape()
    resetShape.colNames = ['cowq', 'col2', 'date', 'col3']
    resetShape.colTypes = [
        Types.Numeric, Types.Nominal, Types.Datetime, Types.String
    ]
    resetShape.index = ['Unnamed']
    resetShape.indexTypes = [IndexType(Types.Numeric)]
    assert resetOp.getOutputShape() == resetShape
    resetFrame = resetOp.execute(indexedFrame)
    assert resetFrame.shape == resetShape
Пример #3
0
def test_join_on_multiindex():
    """Outer-join two frames that both carry a two-level index and check the
    predicted and the actual output shape."""
    dayStrings = ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    leftData = {
        'col1': ['1', '2', '3', '4', '10'],
        'col2': ['3', '4', '5', '6', '0'],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': dayStrings,
    }
    rightData = {
        'col2': pd.Categorical(['3', '4', '5', '6', '0'], ordered=True),
        'cowq': [1, 2, 3, 4.0, 10],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    left = data.Frame(leftData).setIndex(['col1', 'col2'])  # String, String
    right = data.Frame(rightData).setIndex(['col2', 'cowq'])  # Category, Numeric

    joinOp = Join()
    assert joinOp.getOptions() == ('_l', '_r', True, None, None, jt.Left)

    # The output shape stays undefined until both input shapes are present
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    # Now set options
    outerOpts = ('_ll', '_rr', True, None, None, jt.Outer)
    joinOp.setOptions('_ll', '_rr', True, None, None, jt.Outer)
    assert joinOp.getOptions() == outerOpts

    expectedColumns = {
        'col3_ll': Types.String,
        'col3_rr': Types.String,
        'date_ll': Types.String,
        'date_rr': Types.Datetime
    }
    # Join on multiindex is different
    expectedIndex = {
        'col1': IndexType(Types.String),
        'col2': IndexType(Types.String),
        'cowq': IndexType(Types.Numeric)
    }
    expected = data.Shape.fromDict(expectedColumns, expectedIndex)
    assert joinOp.getOutputShape() == expected

    joined = joinOp.execute(left, right)

    assert joined.shape == expected
Пример #4
0
def test_fillnan_ffill():
    """Run ``FillNan`` in 'ffill' mode on three columns and compare the
    resulting shape and values with the expected ones."""
    rawData = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    frame = data.Frame(rawData).setIndex('col1')

    fillOp = FillNan()
    assert fillOp.getOutputShape() is None
    fillOp.addInputShape(frame.shape, 0)
    fillOp.setOptions(selected={0: None, 1: None, 2: None}, fillMode='ffill')
    assert fillOp.getOptions() == {
        'selected': {0: None, 1: None, 2: None},
        'fillMode': 'ffill'
    }

    expected = Shape()
    expected.colNames = ['col3', 'col2', 'date']
    expected.colTypes = [Types.String, Types.Ordinal, Types.Datetime]
    expected.index = ['col1']
    expected.indexTypes = [IndexType(Types.Numeric)]
    assert fillOp.getOutputShape() == expected

    filled = fillOp.execute(frame)
    assert filled.shape == expected

    actualValues = mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3))
    # Both missing dates follow the first row, so they take its value
    expectedDates = [
        '1988-05-09' if pd.isna(stamp) else stamp.strftime(format='%Y-%m-%d')
        for stamp in rawData['date']
    ]
    assert actualValues == {
        'col3': ['q', '2', 'c', 'c', 'c'],
        'col2': ['3', '4', '4', '4', '0'],
        'date': expectedDates
    }
Пример #5
0
    def shape(self) -> Shape:
        """ Describe the wrapped dataframe as a Shape

        Every index level contributes its name ('Unnamed' when the level has no
        truthy name) and its wrapped dtype; every column contributes its name
        and its wrapped dtype.

        :return: Shape object
        """
        frame = self.__df
        levels = [
            frame.index.get_level_values(pos)
            for pos in range(frame.index.nlevels)
        ]
        columnDtypes = list(frame.dtypes.items())

        result = Shape()
        # Index levels: pandas dtypes are converted with wrapperType
        result.index = [level.name or 'Unnamed' for level in levels]
        result.indexTypes = [
            IndexType(wrapperType(level.dtype)) for level in levels
        ]
        # Regular columns
        result.colNames = [name for name, _ in columnDtypes]
        result.colTypes = [wrapperType(dtype) for _, dtype in columnDtypes]
        return result
Пример #6
0
def test_set_index_string():
    """Set a string column as index with ``SetIndex``, then undo it with
    ``ResetIndex``, checking the shapes along the way."""
    dateColumn = pd.Series(
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
        dtype='datetime64[ns]')
    frame = data.Frame({
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(['3', 4, 5, 6, 0]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': dateColumn,
    })

    setOp = SetIndex()
    setOp.setOptions(selected={2: None})

    # No input shape has been added yet, so no output shape is available
    assert setOp.getOutputShape() is None

    setOp.addInputShape(frame.shape, 0)
    indexedShape = Shape()
    indexedShape.colNames = ['cowq', 'col2', 'date']
    indexedShape.colTypes = [Types.Numeric, Types.Nominal, Types.Datetime]
    indexedShape.index = ['col3']
    indexedShape.indexTypes = [IndexType(Types.String)]
    predictedShape = setOp.getOutputShape()
    assert predictedShape == indexedShape

    indexedFrame = setOp.execute(frame)
    actualShape = indexedFrame.shape
    assert actualShape == indexedShape

    # Reset index
    resetOp = ResetIndex()
    assert resetOp.getOutputShape() is None
    resetOp.addInputShape(indexedFrame.shape, 0)
    resetShape = Shape()
    resetShape.colNames = ['cowq', 'col2', 'date', 'col3']
    resetShape.colTypes = [
        Types.Numeric, Types.Nominal, Types.Datetime, Types.String
    ]
    resetShape.index = ['Unnamed']
    resetShape.indexTypes = [IndexType(Types.Numeric)]
    assert resetOp.getOutputShape() == resetShape
    resetFrame = resetOp.execute(indexedFrame)
    assert resetFrame.shape == resetShape
Пример #7
0
 def deserialize(state: Dict) -> 'Shape':
     """ Create a new shape from a serialization

     The input dictionary is left untouched, so the same serialized state can
     be deserialized more than once.

     :param state: attribute dictionary of a serialized shape; its 'colTypes'
         and 'indexTypes' entries hold codes understood by ``Type.fromCode``
     :return: a new Shape whose attributes are taken from ``state``
     :raises KeyError: if 'colTypes' or 'indexTypes' is missing from ``state``
     """
     s = Shape()
     # Bind a shallow copy rather than ``state`` itself: with the caller's dict
     # as ``__dict__`` the two assignments below would replace the serialized
     # codes inside ``state`` with decoded objects.
     # NOTE(review): other values (e.g. name lists) are still shared with
     # ``state``; deep-copy here if callers mutate them.
     s.__dict__ = dict(state)
     s.colTypes = [Type.fromCode(c) for c in state['colTypes']]
     s.indexTypes = [
         IndexType(Type.fromCode(c)) for c in state['indexTypes']
     ]
     return s
Пример #8
0
def test_join_on_index():
    """Inner-join two single-index frames on their indexes and check the
    predicted and the actual output shape."""
    leftData = {
        'col1': ['1', '2', '3', '4', '10'],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    rightData = {
        'col2': pd.Categorical(['3', '4', '5', '6', '0'], ordered=True),
        'cowq': [1, 2, 3, 4.0, 10],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    left = data.Frame(leftData).setIndex('col1')
    right = data.Frame(rightData).setIndex('col2')

    joinOp = Join()
    assert joinOp.getOptions() == ('_l', '_r', True, None, None, jt.Left)

    assert joinOp.getOutputShape() is None
    # NOTE: per the original author's remark, options CAN be set before the
    # input shapes, so no OptionValidationError check is performed here

    # The output shape stays undefined until both input shapes are present
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    # Now set options
    innerOpts = ('_ll', '_rr', True, None, None, jt.Inner)
    joinOp.setOptions('_ll', '_rr', True, None, None, jt.Inner)
    assert joinOp.getOptions() == innerOpts

    expectedColumns = {
        'cowq': Types.Numeric,
        'col2': Types.Numeric,
        'col3_ll': Types.String,
        'col3_rr': Types.String,
        'date_ll': Types.String,
        'date_rr': Types.Datetime
    }
    # Note that join does not preserve index name
    expectedIndex = {'Unnamed': IndexType(Types.String)}
    expected = data.Shape.fromDict(expectedColumns, expectedIndex)
    assert joinOp.getOutputShape() == expected

    joined = joinOp.execute(left, right)

    assert joined.shape == expected
Пример #9
0
def test_cloneShape():
    """Check that changes made to a cloned Shape do not show up in the
    original."""
    original = Shape()
    original.colNames = ['cold']
    original.colTypes = [Types.Datetime]
    original.index = ['col3', 'col1', 'col2']
    original.indexTypes = [
        IndexType(kind)
        for kind in (Types.Ordinal, Types.Numeric, Types.Nominal)
    ]
    # Snapshots taken before cloning, compared against after the edits below
    columnsBefore = original.columnsDict
    indexBefore = original.indexDict

    duplicate = original.clone()
    duplicate.index.append('col4')
    duplicate.indexTypes.append(IndexType(Types.Numeric))
    duplicate.colTypes[0] = Types.Ordinal
    duplicate.colNames[0] = 'col_new'

    assert duplicate != original
    assert original.columnsDict == {'cold': Types.Datetime}
    assert duplicate.columnsDict == {'col_new': Types.Ordinal}
    assert original.columnsDict == columnsBefore
    assert original.indexDict == indexBefore
Пример #10
0
def test_fromShape_datetime():
    """Check that ``Frame.fromShape`` reproduces the shape of a frame indexed
    on a string column and a datetime column."""
    dateColumn = pd.Series(
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
        dtype='datetime64[ns]')
    rawData = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'cold': dateColumn,
    }
    # The index is set on the raw pandas frame, which is then wrapped again
    pandasIndexed = Frame(rawData).getRawFrame().set_index(['col3', 'cold'])
    source = Frame(pandasIndexed)

    rebuilt = Frame.fromShape(source.shape)

    # fromShape is expected to preserve the index
    expected = Shape()
    expected.colNames = ['col1', 'col2']
    expected.colTypes = [Types.Numeric, Types.Numeric]
    expected.index = ['col3', 'cold']
    expected.indexTypes = [
        IndexType(Types.String),
        IndexType(Types.Datetime),
    ]
    assert rebuilt.shape == expected == source.shape
Пример #11
0
def test_shape():
    """Check the shape and row count of a frame built from a plain dict
    without setting any index."""
    frame = Frame({
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })

    # Desired shape: the index is expected to be reported as 'Unnamed'
    expected = Shape()
    expected.index = ['Unnamed']
    expected.indexTypes = [IndexType(Types.Numeric)]
    expected.colNames = ['col1', 'col2', 'col3']
    expected.colTypes = [Types.Numeric, Types.Numeric, Types.String]

    assert frame.shape == expected
    assert frame.nRows == 5
Пример #12
0
def test_fillnan_byVal_date_num():
    """Run ``FillNan`` in 'value' mode on a datetime and a numeric column,
    after checking that a selection with bad fill values is rejected."""
    rawData = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
        'col4': [np.nan, 2, np.nan, 4, 10],
    }
    frame = data.Frame(rawData).setIndex('col1')

    fillOp = FillNan()
    assert fillOp.getOutputShape() is None
    fillOp.addInputShape(frame.shape, 0)

    # This selection is expected to be rejected as a whole
    rejectedSelection = {
        0: {'fill': 'pol'},
        1: {'fill': '23'},
        2: {'fill': '1966-04-02 00:00:30'},
        3: {'fill': 'march'},  # wrong
    }
    with pytest.raises(OptionValidationError):
        fillOp.setOptions(selected=rejectedSelection, fillMode='value')

    fillOp.setOptions(
        selected={
            2: {'fill': '1966-04-02 00:00:30'},
            3: {'fill': '0.9'},
        },
        fillMode='value')

    assert fillOp.getOptions() == {
        'selected': {
            2: {'fill': '1966-04-02 00:00:30'},
            3: {'fill': '0.9'},
        },
        'fillMode': 'value'
    }

    expected = Shape()
    expected.colNames = ['col3', 'col2', 'date', 'col4']
    expected.colTypes = [
        Types.String, Types.Ordinal, Types.Datetime, Types.Numeric
    ]
    expected.index = ['col1']
    expected.indexTypes = [IndexType(Types.Numeric)]
    assert fillOp.getOutputShape() == expected

    filled = fillOp.execute(frame)
    assert filled.shape == expected

    actualValues = mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3))
    # Missing dates are expected to take the day of the configured fill value
    expectedDates = [
        '1966-04-02' if pd.isna(stamp) else stamp.strftime(format='%Y-%m-%d')
        for stamp in rawData['date']
    ]
    assert actualValues == {
        'col3': ['q', '2', 'c', None, None],
        'col2': ['3', '4', None, None, '0'],
        'date': expectedDates,
        'col4': [0.9, 2.0, 0.9, 4.0, 10.0]
    }
Пример #13
0
def test_shape_index():
    """Check the shape and row count of a frame whose index was set on the
    underlying pandas frame."""
    rawData = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    pandasIndexed = Frame(rawData).getRawFrame().set_index('col3')
    frame = Frame(pandasIndexed)

    # Desired shape obj
    expected = Shape()
    expected.index = ['col3']
    expected.indexTypes = [IndexType(Types.String)]
    expected.colNames = ['col1', 'col2']
    expected.colTypes = [Types.Numeric, Types.Numeric]

    assert frame.shape == expected
    assert frame.nRows == 5
Пример #14
0
def test_drop_columns():
    """Drop two columns with ``DropColumns`` and check option handling, the
    predicted shape and the resulting values."""
    rawData = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    frame = data.Frame(rawData).setIndex('col1')

    dropOp = DropColumns()
    assert dropOp.getOutputShape() is None
    dropOp.addInputShape(frame.shape, 0)
    # An input shape alone is not enough: no selection has been made yet
    assert dropOp.getOutputShape() is None
    assert dropOp.getOptions() == {'selected': dict()}

    chosen = {0: None, 2: None}
    dropOp.setOptions(selected=dict(chosen))
    returnedOpts = dropOp.getOptions()
    assert returnedOpts['selected'] == chosen
    # Editing the returned options must not change the operation's own options
    returnedOpts['selected'] = {}
    assert dropOp.getOptions()['selected'] == chosen
    assert dropOp.getOptions() != returnedOpts

    # An empty selection is expected to be rejected
    with pytest.raises(exc.OptionValidationError):
        dropOp.setOptions(selected={})

    expected = data.Shape()
    expected.colNames = ['col3']
    expected.colTypes = [Types.String]
    expected.index = ['col1']
    expected.indexTypes = [IndexType(Types.Numeric)]
    assert dropOp.getOutputShape() == expected

    remaining = dropOp.execute(frame)
    assert remaining.shape == expected

    assert nan_to_None(remaining.to_dict()) == {
        'col3': ['q', '2', 'c', None, None]
    }
Пример #15
0
def test_join_on_cols():
    """Right-join two frames on columns (not on their indexes) and check the
    predicted and the actual output shape."""
    leftData = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    rightData = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 7, 0]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    left = data.Frame(leftData).setIndex('col1')
    right = data.Frame(rightData).setIndex('col2')

    joinOp = Join()

    # The output shape stays undefined until both input shapes are present
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    columnJoinOpts = ('_l', '_r', False, 2, 1, jt.Right)
    joinOp.setOptions('_l', '_r', False, 2, 1, jt.Right)
    assert joinOp.getOptions() == columnJoinOpts

    expectedColumns = {
        'cowq': Types.Numeric,
        'col2': Types.Numeric,
        'col3_l': Types.String,
        'col3_r': Types.String,
        'date_l': Types.String,
        'date_r': Types.Datetime
    }
    # Note that merge does not preserve index
    expectedIndex = {
        'Unnamed': IndexType(Types.Numeric)  # Default index
    }

    expected = data.Shape.fromDict(expectedColumns, expectedIndex)
    assert joinOp.getOutputShape() == expected
    joined = joinOp.execute(left, right)
    assert joined.shape == expected