def test_fromShape_categories():
    """
    Frame.fromShape applied to the shape of a frame indexed by categorical and
    numeric columns must yield a frame with an equal shape.
    """
    rawData = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 6, 0]),
        'col3': pd.Categorical(['q', '2', 'c', '4', 'x'], ordered=True),
        'cold': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    source = Frame(rawData).setIndex(['col2', 'col3', 'col1'])
    rebuilt = Frame.fromShape(source.shape)

    # fromShape does preserve index
    expected = Shape()
    expected.colNames = ['cold']
    expected.colTypes = [Types.Datetime]
    expected.index = ['col3', 'col1', 'col2']
    expected.indexTypes = [
        IndexType(Types.Ordinal),
        IndexType(Types.Numeric),
        IndexType(Types.Nominal),
    ]

    assert rebuilt.shape == expected
    assert expected == source.shape
def test_set_index_num():
    """
    SetIndex on a numeric column (position 0), followed by ResetIndex on the
    result: checks options handling, predicted output shapes and executed shapes.
    """
    rawData = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 6, 0]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    frame = data.Frame(rawData)

    # --- Set index ---
    setOp = SetIndex()
    assert setOp.getOptions() == {'selected': {}}

    chosen = {'selected': {0: None}}
    setOp.setOptions(**chosen)
    assert setOp.getOptions() == chosen
    assert isDictDeepCopy(setOp.getOptions(), chosen)

    # No output shape can be computed until an input shape is supplied
    assert setOp.getOutputShape() is None
    setOp.addInputShape(frame.shape, 0)

    indexedShape = Shape()
    indexedShape.colNames = ['col3', 'col2', 'date']
    indexedShape.colTypes = [Types.String, Types.Nominal, Types.Datetime]
    indexedShape.index = ['cowq']
    indexedShape.indexTypes = [IndexType(Types.Numeric)]
    assert setOp.getOutputShape() == indexedShape

    indexed = setOp.execute(frame)
    assert indexed.shape == indexedShape

    # --- Reset index ---
    resetOp = ResetIndex()
    assert resetOp.getOutputShape() is None
    resetOp.addInputShape(indexed.shape, 0)

    flatShape = Shape()
    flatShape.colNames = ['cowq', 'col2', 'date', 'col3']
    flatShape.colTypes = [Types.Numeric, Types.Nominal, Types.Datetime, Types.String]
    flatShape.index = ['Unnamed']
    flatShape.indexTypes = [IndexType(Types.Numeric)]
    assert resetOp.getOutputShape() == flatShape

    flattened = resetOp.execute(indexed)
    assert flattened.shape == flatShape
def test_join_on_multiindex():
    """
    Outer Join of two frames that both carry a two-level index: checks default
    options, the predicted output shape and the shape of the executed join.
    """
    leftData = {
        'col1': ['1', '2', '3', '4', '10'],
        'col2': ['3', '4', '5', '6', '0'],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    rightData = {
        'col2': pd.Categorical(['3', '4', '5', '6', '0'], ordered=True),
        'cowq': [1, 2, 3, 4.0, 10],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    left = data.Frame(leftData).setIndex(['col1', 'col2'])  # String, String
    right = data.Frame(rightData).setIndex(['col2', 'cowq'])  # Category, Numeric

    joinOp = Join()
    assert joinOp.getOptions() == ('_l', '_r', True, None, None, jt.Left)

    # The output shape stays unknown until both inputs are connected
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    # Now set options
    joinOp.setOptions('_ll', '_rr', True, None, None, jt.Outer)
    assert joinOp.getOptions() == ('_ll', '_rr', True, None, None, jt.Outer)

    expectedColumns = {
        'col3_ll': Types.String,
        'col3_rr': Types.String,
        'date_ll': Types.String,
        'date_rr': Types.Datetime,
    }
    # Join on multiindex is different
    expectedIndex = {
        'col1': IndexType(Types.String),
        'col2': IndexType(Types.String),
        'cowq': IndexType(Types.Numeric),
    }
    expected = data.Shape.fromDict(expectedColumns, expectedIndex)
    assert joinOp.getOutputShape() == expected

    joined = joinOp.execute(left, right)
    assert joined.shape == expected
def test_fillnan_ffill():
    """
    FillNan with fillMode='ffill' on columns 0, 1 and 2: the shape must be
    unchanged and the resulting values must match the expected forward-filled ones.
    """
    rawData = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    frame = data.Frame(rawData).setIndex('col1')

    fillOp = FillNan()
    assert fillOp.getOutputShape() is None
    fillOp.addInputShape(frame.shape, 0)
    fillOp.setOptions(selected={0: None, 1: None, 2: None}, fillMode='ffill')
    assert fillOp.getOptions() == {
        'selected': {0: None, 1: None, 2: None},
        'fillMode': 'ffill',
    }

    expectedShape = Shape()
    expectedShape.colNames = ['col3', 'col2', 'date']
    expectedShape.colTypes = [Types.String, Types.Ordinal, Types.Datetime]
    expectedShape.index = ['col1']
    expectedShape.indexTypes = [IndexType(Types.Numeric)]
    assert fillOp.getOutputShape() == expectedShape

    filled = fillOp.execute(frame)
    assert filled.shape == expectedShape

    # Missing dates are expected to take the first date of the column
    expectedDates = [
        '1988-05-09' if pd.isna(t) else t.strftime(format='%Y-%m-%d')
        for t in rawData['date']
    ]
    actual = mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3))
    assert actual == {
        'col3': ['q', '2', 'c', 'c', 'c'],
        'col2': ['3', '4', '4', '4', '0'],
        'date': expectedDates,
    }
def shape(self) -> Shape:
    """
    Describe the structure of the underlying frame.

    Each index level contributes its name ('Unnamed' when the level name is
    falsy) and its wrapped dtype; each column contributes its name and its
    wrapped dtype.

    :return: a newly built Shape object
    """
    frame = self.__df
    result = Shape()

    # Index levels, in level order
    levels = [frame.index.get_level_values(pos) for pos in range(frame.index.nlevels)]
    result.index = [level.name or 'Unnamed' for level in levels]
    result.indexTypes = [IndexType(wrapperType(level.dtype)) for level in levels]

    # Columns: names and dtypes converted to the readable wrapper types
    columns = list(frame.dtypes.items())
    result.colNames = [name for name, _ in columns]
    result.colTypes = [wrapperType(dtype) for _, dtype in columns]
    return result
def test_set_index_string():
    """
    SetIndex on a string column (position 2), followed by ResetIndex on the
    result: checks predicted output shapes and the shapes of the executed frames.
    """
    rawData = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(['3', 4, 5, 6, 0]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    frame = data.Frame(rawData)

    # --- Set index ---
    setOp = SetIndex()
    setOp.setOptions(selected={2: None})
    # Options alone are not enough: an input shape is required first
    assert setOp.getOutputShape() is None
    setOp.addInputShape(frame.shape, 0)

    indexedShape = Shape()
    indexedShape.colNames = ['cowq', 'col2', 'date']
    indexedShape.colTypes = [Types.Numeric, Types.Nominal, Types.Datetime]
    indexedShape.index = ['col3']
    indexedShape.indexTypes = [IndexType(Types.String)]

    predicted = setOp.getOutputShape()
    assert predicted == indexedShape

    indexed = setOp.execute(frame)
    actualShape = indexed.shape
    assert actualShape == indexedShape

    # --- Reset index ---
    resetOp = ResetIndex()
    assert resetOp.getOutputShape() is None
    resetOp.addInputShape(indexed.shape, 0)

    flatShape = Shape()
    flatShape.colNames = ['cowq', 'col2', 'date', 'col3']
    flatShape.colTypes = [Types.Numeric, Types.Nominal, Types.Datetime, Types.String]
    flatShape.index = ['Unnamed']
    flatShape.indexTypes = [IndexType(Types.Numeric)]
    assert resetOp.getOutputShape() == flatShape

    flattened = resetOp.execute(indexed)
    assert flattened.shape == flatShape
def deserialize(state: Dict) -> 'Shape':
    """
    Create a new shape from a serialization

    :param state: mapping of attribute names to values, as it will become the
        attribute dictionary of the new Shape. Its 'colTypes' and 'indexTypes'
        entries are iterables of codes which are decoded with Type.fromCode
    :return: the rebuilt Shape. The ``state`` mapping itself is not modified;
        the copy is shallow, so any other value in ``state`` is shared with the
        returned object
    :raise KeyError: if 'colTypes' or 'indexTypes' is missing from ``state``
    """
    s = Shape()
    # Bind a copy: using ``state`` itself as the instance dictionary would make
    # the two assignments below replace the caller's encoded entries with
    # decoded objects, so the same state could not be deserialized twice
    s.__dict__ = dict(state)
    s.colTypes = [Type.fromCode(c) for c in state['colTypes']]
    s.indexTypes = [IndexType(Type.fromCode(c)) for c in state['indexTypes']]
    return s
def test_join_on_index():
    """
    Inner Join of two frames on their single-level indexes: checks default
    options, the predicted output shape and the shape of the executed join.
    """
    leftData = {
        'col1': ['1', '2', '3', '4', '10'],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    rightData = {
        'col2': pd.Categorical(['3', '4', '5', '6', '0'], ordered=True),
        'cowq': [1, 2, 3, 4.0, 10],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    left = data.Frame(leftData).setIndex('col1')
    right = data.Frame(rightData).setIndex('col2')

    joinOp = Join()
    assert joinOp.getOptions() == ('_l', '_r', True, None, None, jt.Left)
    assert joinOp.getOutputShape() is None
    # NOTE: an earlier, disabled check expected setOptions to raise an
    # OptionValidationError before the shapes were added; per the original
    # note, options CAN be set before shapes
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    # Now set options
    joinOp.setOptions('_ll', '_rr', True, None, None, jt.Inner)
    assert joinOp.getOptions() == ('_ll', '_rr', True, None, None, jt.Inner)

    expectedColumns = {
        'cowq': Types.Numeric,
        'col2': Types.Numeric,
        'col3_ll': Types.String,
        'col3_rr': Types.String,
        'date_ll': Types.String,
        'date_rr': Types.Datetime,
    }
    # Note that join does not preserve index name
    expectedIndex = {'Unnamed': IndexType(Types.String)}
    expected = data.Shape.fromDict(expectedColumns, expectedIndex)
    assert joinOp.getOutputShape() == expected

    joined = joinOp.execute(left, right)
    assert joined.shape == expected
def test_cloneShape():
    """
    Mutating a clone of a Shape must make it differ from the original while
    leaving the original's column and index dictionaries untouched.
    """
    original = Shape()
    original.colNames = ['cold']
    original.colTypes = [Types.Datetime]
    original.index = ['col3', 'col1', 'col2']
    original.indexTypes = [
        IndexType(Types.Ordinal),
        IndexType(Types.Numeric),
        IndexType(Types.Nominal),
    ]
    # Snapshots taken before the clone is touched
    columnsBefore = original.columnsDict
    indexBefore = original.indexDict

    copy = original.clone()
    copy.index.append('col4')
    copy.indexTypes.append(IndexType(Types.Numeric))
    copy.colTypes[0] = Types.Ordinal
    copy.colNames[0] = 'col_new'

    assert copy != original
    assert original.columnsDict == {'cold': Types.Datetime}
    assert copy.columnsDict == {'col_new': Types.Ordinal}
    assert original.columnsDict == columnsBefore
    assert original.indexDict == indexBefore
def test_fromShape_datetime():
    """
    Frame.fromShape applied to the shape of a frame indexed by a string and a
    datetime column must yield a frame with an equal shape.
    """
    rawData = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'cold': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    plain = Frame(rawData)
    # The index is set on the raw frame, which is then wrapped again
    source = Frame(plain.getRawFrame().set_index(['col3', 'cold']))
    rebuilt = Frame.fromShape(source.shape)

    # fromShape does preserve index
    expected = Shape()
    expected.colNames = ['col1', 'col2']
    expected.colTypes = [Types.Numeric, Types.Numeric]
    expected.index = ['col3', 'cold']
    expected.indexTypes = [IndexType(Types.String), IndexType(Types.Datetime)]

    assert rebuilt.shape == expected
    assert expected == source.shape
def test_shape():
    """
    The shape of a frame built from a plain dict must list its columns with
    their types and a single numeric index named 'Unnamed'; nRows must be 5.
    """
    frame = Frame({
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })

    expected = Shape()
    expected.index = ['Unnamed']
    expected.indexTypes = [IndexType(Types.Numeric)]
    expected.colNames = ['col1', 'col2', 'col3']
    expected.colTypes = [Types.Numeric, Types.Numeric, Types.String]

    assert frame.shape == expected
    assert frame.nRows == 5
def test_fillnan_byVal_date_num():
    """
    FillNan with fillMode='value': a selection containing invalid fill values
    must raise OptionValidationError, while valid fills for the datetime and
    numeric columns must be applied and leave the shape unchanged.
    """
    rawData = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
        'col4': [np.nan, 2, np.nan, 4, 10],
    }
    frame = data.Frame(rawData).setIndex('col1')

    fillOp = FillNan()
    assert fillOp.getOutputShape() is None
    fillOp.addInputShape(frame.shape, 0)

    # Rejected selection
    with pytest.raises(OptionValidationError):
        fillOp.setOptions(
            selected={
                0: {'fill': 'pol'},
                1: {'fill': '23'},  # wrong
                2: {'fill': '1966-04-02 00:00:30'},
                3: {'fill': 'march'},  # wrong
            },
            fillMode='value'
        )

    # Accepted selection
    fillOp.setOptions(
        selected={2: {'fill': '1966-04-02 00:00:30'}, 3: {'fill': '0.9'}},
        fillMode='value'
    )
    assert fillOp.getOptions() == {
        'selected': {2: {'fill': '1966-04-02 00:00:30'}, 3: {'fill': '0.9'}},
        'fillMode': 'value',
    }

    expectedShape = Shape()
    expectedShape.colNames = ['col3', 'col2', 'date', 'col4']
    expectedShape.colTypes = [Types.String, Types.Ordinal, Types.Datetime, Types.Numeric]
    expectedShape.index = ['col1']
    expectedShape.indexTypes = [IndexType(Types.Numeric)]
    assert fillOp.getOutputShape() == expectedShape

    filled = fillOp.execute(frame)
    assert filled.shape == expectedShape

    # Missing dates are expected to become the date part of the fill value
    expectedDates = [
        '1966-04-02' if pd.isna(t) else t.strftime(format='%Y-%m-%d')
        for t in rawData['date']
    ]
    actual = mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3))
    assert actual == {
        'col3': ['q', '2', 'c', None, None],
        'col2': ['3', '4', None, None, '0'],
        'date': expectedDates,
        'col4': [0.9, 2.0, 0.9, 4.0, 10.0],
    }
def test_shape_index():
    """
    The shape of a frame whose raw frame was indexed by 'col3' must report that
    column as a string index and the remaining two as numeric columns.
    """
    plain = Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })
    indexed = Frame(plain.getRawFrame().set_index('col3'))

    # Desired shape obj
    expected = Shape()
    expected.index = ['col3']
    expected.indexTypes = [IndexType(Types.String)]
    expected.colNames = ['col1', 'col2']
    expected.colTypes = [Types.Numeric, Types.Numeric]

    assert indexed.shape == expected
    assert indexed.nRows == 5
def test_drop_columns():
    """
    DropColumns on columns 0 and 2: checks that getOptions returns a copy, that
    an empty selection is rejected, and that only 'col3' remains afterwards.
    """
    rawData = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    frame = data.Frame(rawData).setIndex('col1')

    dropOp = DropColumns()
    assert dropOp.getOutputShape() is None
    dropOp.addInputShape(frame.shape, 0)
    # With an input shape but no selection there is still no output shape
    assert dropOp.getOutputShape() is None
    assert dropOp.getOptions() == {'selected': {}}

    expectedSelection = {0: None, 2: None}
    dropOp.setOptions(selected={0: None, 2: None})
    returned = dropOp.getOptions()
    assert returned['selected'] == expectedSelection

    # Editing the returned options must not affect the operation
    returned['selected'] = {}
    assert dropOp.getOptions()['selected'] == expectedSelection
    assert dropOp.getOptions() != returned

    with pytest.raises(exc.OptionValidationError) as excInfo:
        dropOp.setOptions(selected={})

    expectedShape = data.Shape()
    expectedShape.colNames = ['col3']
    expectedShape.colTypes = [Types.String]
    expectedShape.index = ['col1']
    expectedShape.indexTypes = [IndexType(Types.Numeric)]
    assert dropOp.getOutputShape() == expectedShape

    dropped = dropOp.execute(frame)
    assert dropped.shape == expectedShape
    assert nan_to_None(dropped.to_dict()) == {
        'col3': ['q', '2', 'c', None, None]
    }
def test_join_on_cols():
    """
    Right Join of two indexed frames on columns (options False, 2, 1): checks
    the stored options, the predicted output shape and the executed shape.
    """
    leftData = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    rightData = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 7, 0]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'
        ),
    }
    left = data.Frame(leftData).setIndex('col1')
    right = data.Frame(rightData).setIndex('col2')

    joinOp = Join()
    # The output shape stays unknown until both inputs are connected
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(left.shape, 0)
    assert joinOp.getOutputShape() is None
    joinOp.addInputShape(right.shape, 1)

    joinOp.setOptions('_l', '_r', False, 2, 1, jt.Right)
    assert joinOp.getOptions() == ('_l', '_r', False, 2, 1, jt.Right)

    expectedColumns = {
        'cowq': Types.Numeric,
        'col2': Types.Numeric,
        'col3_l': Types.String,
        'col3_r': Types.String,
        'date_l': Types.String,
        'date_r': Types.Datetime,
    }
    # Note that merge does not preserve index
    expectedIndex = {
        'Unnamed': IndexType(Types.Numeric)  # Default index
    }
    expected = data.Shape.fromDict(expectedColumns, expectedIndex)
    assert joinOp.getOutputShape() == expected

    joined = joinOp.execute(left, right)
    assert joined.shape == expected