예제 #1
0
def test_merge_index_val():
    """Replace groups of values in a Nominal column and check shape and content.

    In 'col2' the values '3' and '4' are replaced by 'h', while '6' and '0'
    are replaced by NaN; the output shape must be unchanged.
    """
    dates = pd.Series(
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
        dtype='datetime64[ns]')
    frame = data.Frame({
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': dates,
    })

    replace_op = ReplaceValues()
    replace_op.addInputShape(frame.shape, 0)
    replacement_table = {1: {'values': '3 4;  6  0', 'replace': 'h; nan'}}
    replace_op.setOptions(table=replacement_table, inverted=False)

    # The operation is expected to leave the shape untouched
    expected_shape = frame.shape.clone()
    output_shape = replace_op.getOutputShape()
    assert frame.shape.colTypes[1] == Types.Nominal == output_shape.colTypes[1]
    assert output_shape == expected_shape

    result = replace_op.execute(frame)
    assert result.shape == frame.shape
    replaced_column = data.Frame(result.getRawFrame()['col2']).to_dict()
    assert nan_to_None(replaced_column) == {'col2': ["h", "h", "5", None, None]}
예제 #2
0
def test_join_on_index():
    """Inner-join two frames on their single-level indexes and check the output shape."""
    left_data = {
        'col1': ['1', '2', '3', '4', '10'],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    right_data = {
        'col2': pd.Categorical(['3', '4', '5', '6', '0'], ordered=True),
        'cowq': [1, 2, 3, 4.0, 10],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    left = data.Frame(left_data).setIndex('col1')
    right = data.Frame(right_data).setIndex('col2')

    join_op = Join()
    default_options = ('_l', '_r', True, None, None, jt.Left)
    assert join_op.getOptions() == default_options

    # No output shape is available until both input shapes have been added.
    # Options, on the other hand, CAN be set before the shapes.
    assert join_op.getOutputShape() is None
    assert join_op.getOutputShape() is None
    join_op.addInputShape(left.shape, 0)
    assert join_op.getOutputShape() is None
    join_op.addInputShape(right.shape, 1)

    # Both shapes are known: configure an inner join with custom suffixes
    inner_options = ('_ll', '_rr', True, None, None, jt.Inner)
    join_op.setOptions(*inner_options)
    assert join_op.getOptions() == inner_options

    expected_columns = {
        'cowq': Types.Numeric,
        'col2': Types.Numeric,
        'col3_ll': Types.String,
        'col3_rr': Types.String,
        'date_ll': Types.String,
        'date_rr': Types.Datetime,
    }
    # Note that join does not preserve the index name
    expected_index = {'Unnamed': IndexType(Types.String)}
    expected_shape = data.Shape.fromDict(expected_columns, expected_index)
    assert join_op.getOutputShape() == expected_shape

    joined = join_op.execute(left, right)
    assert joined.shape == expected_shape
예제 #3
0
def test_join_on_multiindex():
    """Outer-join two frames on two-level indexes and check the output shape."""
    left_data = {
        'col1': ['1', '2', '3', '4', '10'],
        'col2': ['3', '4', '5', '6', '0'],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    right_data = {
        'col2': pd.Categorical(['3', '4', '5', '6', '0'], ordered=True),
        'cowq': [1, 2, 3, 4.0, 10],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    # Left index levels: String, String -- right index levels: Category, Numeric
    left = data.Frame(left_data).setIndex(['col1', 'col2'])
    right = data.Frame(right_data).setIndex(['col2', 'cowq'])

    join_op = Join()
    default_options = ('_l', '_r', True, None, None, jt.Left)
    assert join_op.getOptions() == default_options

    # The output shape stays undefined until both inputs are provided
    assert join_op.getOutputShape() is None
    join_op.addInputShape(left.shape, 0)
    assert join_op.getOutputShape() is None
    join_op.addInputShape(right.shape, 1)

    # Now configure an outer join with custom suffixes
    outer_options = ('_ll', '_rr', True, None, None, jt.Outer)
    join_op.setOptions(*outer_options)
    assert join_op.getOptions() == outer_options

    expected_columns = {
        'col3_ll': Types.String,
        'col3_rr': Types.String,
        'date_ll': Types.String,
        'date_rr': Types.Datetime,
    }
    # Joining on a multiindex is different: the index levels of both sides are kept
    expected_index = {
        'col1': IndexType(Types.String),
        'col2': IndexType(Types.String),
        'cowq': IndexType(Types.Numeric),
    }
    expected_shape = data.Shape.fromDict(expected_columns, expected_index)
    assert join_op.getOutputShape() == expected_shape

    joined = join_op.execute(left, right)
    assert joined.shape == expected_shape
예제 #4
0
파일: join.py 프로젝트: alek9z/dataMole
 def execute(self, dfl: data.Frame, dfr: data.Frame) -> data.Frame:
     """Join the left and right frames according to the configured options.

     When joining on indexes the raw frames are combined with ``join`` and the
     configured left/right suffixes. Otherwise they are combined with ``merge``
     on the configured left and right columns (both must be set).

     :param dfl: left frame
     :param dfr: right frame
     :return: a new frame with the joined data
     """
     if not self.__onIndex:
         # Join (merge) on columns: resolve the column positions to names
         left_column = dfl.shape.colNames[self.__leftOn]
         right_column = dfr.shape.colNames[self.__rightOn]
         merged = dfl.getRawFrame().merge(
             dfr.getRawFrame(),
             how=self.__type.value,
             left_on=left_column,
             right_on=right_column,
             suffixes=(self.__lSuffix, self.__rSuffix))
         return data.Frame(merged)

     # Join on indexes
     joined = dfl.getRawFrame().join(
         dfr.getRawFrame(),
         how=self.__type.value,
         lsuffix=self.__lSuffix,
         rsuffix=self.__rSuffix)
     return data.Frame(joined)
예제 #5
0
 def appendEmptyRow(self) -> bool:
     """Append a placeholder entry to the workbench.

     The placeholder wraps an empty frame and is named with a single space.
     Its position, i.e. the row count before the insertion, is recorded in
     the name-to-index lookup.

     :return: always True
     """
     new_row_index = self.rowCount()
     placeholder = FrameModelMock(data.Frame(), ' ')
     self.__workbench.append(placeholder)
     self.__nameToIndex[placeholder.name] = new_row_index
     return True
예제 #6
0
def test_standardScale_1_attr_all_nan():
    """Standard-scale a column made only of NaN values; it must stay all NaN."""
    raw = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1.0, -1.1, 3.0, 7.5, 10.0],
        'col2': [np.nan, np.nan, np.nan, np.nan, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1'],
    }
    frame = data.Frame(raw).setIndex(['id1', 'id2'])

    scaler = StandardScaler()
    assert scaler.getOutputShape() is None
    scaler.setOptions(attributes={1: None})
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    assert scaler.getOutputShape() == expected_shape

    # Removing the input shape makes the output shape undefined again
    scaler.removeInputShape(0)
    assert scaler.getOutputShape() is None
    scaler.addInputShape(frame.shape, 0)

    scaled = scaler.execute(frame)
    all_nan = raw['col2']
    mean = np.nanmean(all_nan)
    std = np.nanstd(all_nan)
    expected = {
        'col1': raw['col1'],
        'col2': [(x - mean) / std for x in all_nan],
        'ww': [3, None, 'ww', '1', '1'],
    }
    actual = nan_to_None(roundValues(scaled.to_dict(), 4))
    assert actual == nan_to_None(roundValues(expected, 4))
    assert scaled.shape == expected_shape
예제 #7
0
 def execute(self, df: data.Frame) -> data.Frame:
     """Discretize the selected columns into ordered categorical bins.

     Each selected column is binned with ``KBinsDiscretizer`` (ordinal
     encoding) using the configured strategy and its own number of bins.
     NaN rows are left out of the fit and stay NaN in the output. When an
     attribute suffix is set the result is stored in ``<column><suffix>``
     (replacing any column already carrying that name); otherwise the source
     column is overwritten.

     :param df: input frame; the work is done on a deep copy of it
     :return: a new frame holding the discretized columns
     """
     frame = copy.deepcopy(df)
     f = frame.getRawFrame()
     # Operation ignores nan values
     nanRows = f.iloc[:, list(self.__attributes.keys())].isnull()
     # Snapshot of the column labels, taken before any new column is added
     columns = f.columns
     edges: Dict[int, List[float]] = dict()
     for col, k in self.__attributes.items():
         colName = columns[col]
         notNa = (~nanRows.loc[:, colName]).to_numpy()
         discretizer = skp.KBinsDiscretizer(n_bins=k, encode='ordinal',
                                            strategy=self.__strategy.value)
         # Discretize and convert to string (since categories are strings)
         result = discretizer.fit_transform(f.loc[notNa, colName].values.reshape(-1, 1)).astype(str)
         name: str = (colName + self.__attributeSuffix) if self.__attributeSuffix else colName
         # Build the whole output column first (NaN where the input was NaN) and
         # assign it with f[name] = ...: unlike f.loc[:, name] = ..., this always
         # replaces the column, so the categorical dtype is kept and no string is
         # ever written into a numeric column.
         binned = np.full(len(f), np.nan, dtype=object)
         binned[notNa] = result.ravel()
         f[name] = pd.Categorical(binned,
                                  categories=[str(float(i)) for i in range(k)],
                                  ordered=True)
         edges[col] = discretizer.bin_edges_[0].tolist()
     # Log what has been done
     self.__logExecution(columns, edges)
     return data.Frame(f)
예제 #8
0
def test_nan_removerows_byperc():
    """Remove rows according to a NaN percentage threshold."""
    frame = data.Frame({
        'col1': [1, 2, 3, np.nan, 10],
        'col2': [3, 4, np.nan, np.nan, np.nan],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    })

    remover = RemoveNanRows()
    assert remover.getOutputShape() is None
    remover.setOptions(number=12121, percentage=0.3)
    # When both thresholds are given only the percentage is retained
    assert remover.getOptions() == (0.3, None)

    remover.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    assert remover.getOutputShape() == expected_shape

    filtered = remover.execute(frame)

    assert filtered != frame and filtered.shape == expected_shape
    assert filtered.nRows == 4

    # With a threshold of 0.5 no row is removed
    remover.setOptions(percentage=0.5, number=1)
    unchanged = remover.execute(frame)
    assert unchanged == frame and unchanged.nRows == 5
예제 #9
0
def test_fillnan_ffill():
    """Forward-fill the NaN values of every column of a frame indexed by 'col1'."""
    raw = {
        'col1': [np.nan, 2, np.nan, 4, 10],
        'col2': pd.Categorical(['3', '4', np.nan, np.nan, '0'], ordered=True),
        'col3': ['q', '2', 'c', np.nan, np.nan],
        'date': pd.Series(
            ['05-09-1988', np.nan, np.nan, '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    frame = data.Frame(raw).setIndex('col1')

    fill_op = FillNan()
    assert fill_op.getOutputShape() is None
    fill_op.addInputShape(frame.shape, 0)
    fill_op.setOptions(selected={0: None, 1: None, 2: None}, fillMode='ffill')
    assert fill_op.getOptions() == {
        'selected': {0: None, 1: None, 2: None},
        'fillMode': 'ffill',
    }

    expected_shape = Shape()
    expected_shape.colNames = ['col3', 'col2', 'date']
    expected_shape.colTypes = [Types.String, Types.Ordinal, Types.Datetime]
    expected_shape.index = ['col1']
    expected_shape.indexTypes = [IndexType(Types.Numeric)]
    assert fill_op.getOutputShape() == expected_shape

    filled = fill_op.execute(frame)
    assert filled.shape == expected_shape

    # The only missing dates directly follow the first one, so they take its value
    expected_dates = [
        '1988-05-09' if pd.isna(t) else t.strftime(format='%Y-%m-%d')
        for t in raw['date']
    ]
    actual = mapDate(roundValues(nan_to_None(filled.to_dict()), decimals=3))
    assert actual == {
        'col3': ['q', '2', 'c', 'c', 'c'],
        'col2': ['3', '4', '4', '4', '0'],
        'date': expected_dates,
    }
예제 #10
0
def test_merge_from_nan():
    """Replace NaN (together with other values) of a numeric column with numbers."""
    frame = data.Frame({
        'cowq': [1, 2, None, 4.0, None],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
    })

    replace_op = ReplaceValues()
    replace_op.addInputShape(frame.shape, 0)
    # NaN and 2.0 become -1, 4.0 becomes -2
    replacement_table = {0: {'values': 'Nan 2.0;4.0', 'replace': '-1;-2'}}
    replace_op.setOptions(table=replacement_table, inverted=False)

    expected_shape = frame.shape.clone()
    assert frame.shape.colTypes[1] == Types.Nominal
    assert replace_op.getOutputShape() == expected_shape

    result = replace_op.execute(frame)
    assert result.shape == frame.shape
    expected_content = {
        'cowq': [1.0, -1.0, -1.0, -2.0, -1.0],
        'col2': ["3", "4", "5", "6", "0"],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    assert nan_to_None(result.to_dict()) == expected_content
예제 #11
0
def test_merge_nan():
    """Replace values with NaN in a Nominal and a numeric column at once."""
    frame = data.Frame({
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
    })

    replace_op = ReplaceValues()
    replace_op.addInputShape(frame.shape, 0)
    # The NaN token is written with different casings in the 'replace' strings
    nominal_rule = {'values': 'hello 2 6 0; 3', 'replace': 'NAN; nan'}
    numeric_rule = {'values': '2 4 10', 'replace': 'naN'}
    replace_op.setOptions(table={1: nominal_rule, 0: numeric_rule},
                          inverted=False)

    expected_shape = frame.shape.clone()
    assert frame.shape.colTypes[1] == Types.Nominal
    assert replace_op.getOutputShape() == expected_shape

    result = replace_op.execute(frame)
    assert result.shape == frame.shape
    expected_content = {
        'cowq': [1, None, 3, None, None],
        'col2': [None, "4", "5", None, None],
        'col3': ['q', '2', 'c', '4', 'x'],
    }
    assert nan_to_None(result.to_dict()) == expected_content
예제 #12
0
def test_merge_category_inverted():
    """Replace values of a Nominal column with the 'inverted' option enabled."""
    frame = data.Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical(["3", "4", "5", "6", "0"]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    })
    replace_op = ReplaceValues()

    replace_op.addInputShape(frame.shape, 0)
    replacement_table = {1: {'values': '4 0; 0', 'replace': 'val;  NAN'}}
    replace_op.setOptions(table=replacement_table, inverted=True)

    expected_shape = frame.shape.clone()
    assert replace_op.getOutputShape() == expected_shape
    assert expected_shape.colTypes[1] == Types.Nominal

    result = replace_op.execute(frame)

    expected_content = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [None, None, None, None, "0"],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    assert nan_to_None(result.to_dict()) == expected_content
    assert result != frame and result.shape == expected_shape
예제 #13
0
def test_duplicate_columns():
    """Duplicate two columns under new names; check option isolation and output shape."""
    frame = data.Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', 2, 'q', 'q', 2],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    })
    # NOTE(review): the value returned by setIndex is discarded here -- confirm
    # whether setIndex works in place or whether the result should be assigned.
    frame.setIndex('col1')

    dup_op = DuplicateColumn()
    dup_op.addInputShape(frame.shape, 0)
    assert dup_op.getOutputShape() is None
    options = {'table': {0: {'rename': 'a name'}, 2: {'rename': 'new'}}}
    dup_op.setOptions(**options)

    assert dup_op.getOptions() == options
    # Mutating the dict passed to setOptions must not alter the stored options
    snapshot = deepcopy(options)
    options['table'][0]['rename'] = 'newnn'
    assert dup_op.getOptions() == snapshot and dup_op.getOptions() != options

    # The duplicates are appended after the existing columns, keeping the source types
    expected_shape = frame.shape.clone()
    expected_shape.colNames.extend(['a name', 'new'])
    expected_shape.colTypes.extend(
        [expected_shape.colTypes[0], expected_shape.colTypes[2]])
    assert dup_op.getOutputShape() == expected_shape

    result = dup_op.execute(frame)

    assert result != frame and result.shape == expected_shape
예제 #14
0
 def execute(self, df: data.Frame) -> data.Frame:
     """Return a copy of the frame with the selected columns duplicated.

     The configured attributes map a column position to the name given to
     its copy; the copies are written to the frame under those names.

     :param df: input frame, left untouched
     :return: a new frame that also contains the duplicated columns
     """
     duplicated = df.getRawFrame().copy(True)
     source_positions: List[int] = list(self.__attributes.keys())
     new_names: List[str] = list(self.__attributes.values())
     duplicated[new_names] = duplicated.iloc[:, source_positions]
     return data.Frame(duplicated)
예제 #15
0
def test_exception():
    """Invalid replacement tables must be rejected with OptionValidationError."""
    frame = data.Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    })
    replace_op = ReplaceValues()

    replace_op.addInputShape(frame.shape, 0)

    # Both 'replace' strings are invalid for column 1 with these 'values'
    for invalid_replace in ('7; h', '7;    8;1'):
        with pytest.raises(exp.OptionValidationError):
            replace_op.setOptions(
                table={1: {'replace': invalid_replace, 'values': '3 4 5; 2'}},
                inverted=False)
예제 #16
0
 def execute(self, df: data.Frame) -> data.Frame:
     """Min-max scale the selected columns to their configured feature ranges.

     The configured attributes map a column position to its target range.
     Columns that are not selected are left as they are and the original
     column order is preserved.

     :param df: input frame, left untouched
     :return: a new frame with the scaled columns
     """
     original_order = df.getRawFrame().columns.to_list()
     work = df.getRawFrame().copy(True)
     distinct_ranges = set(self.__attributes.values())
     if len(distinct_ranges) != 1:
         # Ranges differ: scale the columns one by one
         per_column = dict()
         for position, target_range in self.__attributes.items():
             per_column[original_order[position]] = minmax_scale(
                 work.iloc[:, position],
                 feature_range=target_range,
                 axis=0,
                 copy=True)
         scaled_frame = pd.DataFrame(per_column).set_index(work.index)
     else:
         # Every column shares the same range: scale them all in a single call
         selection = work.iloc[:, list(self.__attributes.keys())]
         scaled_values = minmax_scale(selection,
                                      feature_range=distinct_ranges.pop(),
                                      axis=0,
                                      copy=True)
         scaled_frame = pd.DataFrame(scaled_values).set_index(work.index)
         scaled_frame.columns = selection.columns
     # Put the scaled columns back among the others, restoring the original order
     untouched = work.drop(columns=scaled_frame.columns)
     merged = pd.concat([untouched, scaled_frame], ignore_index=False, axis=1)
     return data.Frame(merged[original_order])
예제 #17
0
def test_discretize_by_date_with_None():
    """Discretize two datetime columns with missing values into suffixed ordinal columns."""
    raw = {
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [pd.Timestamp('05-09-1988'), pd.Timestamp('22-12-1994'),
                 pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2012')],
        'cold2': [pd.Timestamp('01-01-1950'), pd.Timestamp('22-12-1980'),
                  pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2034')],
        # Already present column named like a suffixed output: the discretized
        # column is expected to take its place
        'cold_disc': [None, None, None, None, None],
    }
    frame = data.Frame(raw).setIndex('col2')

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    columns_by_name = frame.shape.columnsDict
    assert columns_by_name['cold'] == Types.Datetime
    assert columns_by_name['cold2'] == Types.Datetime

    boundaries = [
        pd.Timestamp(day)
        for day in ('01-01-1950', '01-01-1970', '01-01-1990', '01-01-2010', '01-01-2030')
    ]
    bin_labels = ['50', '70', '80', 'now']

    def selection():
        # A fresh mapping on every call, so nothing is shared with the operation
        return {
            1: {'ranges': (boundaries, True, False), 'labels': list(bin_labels)},
            2: {'ranges': (boundaries, True, True), 'labels': list(bin_labels)},
        }

    discretizer.setOptions(selected=selection(), suffix=(True, '_disc'))
    assert discretizer.getOptions() == {
        'selected': selection(),
        'suffix': (True, '_disc'),
    }

    columns_by_name['cold_disc'] = Types.Ordinal
    columns_by_name['cold2_disc'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(columns_by_name, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape

    output = nan_to_None(result.to_dict())
    assert output == {
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [pd.Timestamp('05-09-1988'), pd.Timestamp('22-12-1994'),
                 pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2012')],
        'cold2': [pd.Timestamp('01-01-1950'), pd.Timestamp('22-12-1980'),
                  pd.Timestamp('21-11-1995'), None, pd.Timestamp('12-12-2034')],
        'cold_disc': ['70', '80', '80', None, 'now'],
        'cold2_disc': [None, '70', '80', None, None],
    }
    # Both outputs must be ordered categoricals carrying every label
    for column in ('cold_disc', 'cold2_disc'):
        assert result.getRawFrame()[column].cat.categories.to_list() == ['50', '70', '80', 'now']
        assert result.getRawFrame()[column].dtype.ordered is True
예제 #18
0
def test_discretize_set_options_exceptions():
    """Every invalid option set must raise OptionValidationError with the expected key."""
    frame = data.Frame({
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [pd.Timestamp('05-09-1988 13:45'), pd.Timestamp('22-12-1994 14:21'),
                 pd.Timestamp('21-11-1995 11:50'), None, pd.Timestamp('12-12-2012 09:15')],
    })

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    columns_by_name = frame.shape.columnsDict
    assert columns_by_name['cold'] == Types.Datetime

    boundaries = [pd.Timestamp('05-09-1988 07:00'), pd.Timestamp('20-12-1994 11:30'),
                  pd.Timestamp('05-09-2000 14:20'), pd.Timestamp('01-09-2010 14:30'),
                  pd.Timestamp('12-12-2012 09:14')]
    bin_labels = ['early', 'middle', 'late', 'now']
    valid_ranges = (boundaries, False, True)

    # (options for column 2, suffix option, key of the first reported error)
    invalid_cases = [
        ({'ranges': (list(), True, True), 'labels': bin_labels}, (False, '_disc'), 'bins'),
        ({'ranges': None, 'labels': bin_labels}, (False, '_disc'), 'bins'),
        ({'ranges': valid_ranges, 'labels': bin_labels[:1]}, (False, '_disc'), 'len'),
        ({'ranges': valid_ranges, 'labels': None}, (False, '_disc'), 'lab'),
        ({'ranges': valid_ranges, 'labels': ['a', 'b', 'c', 'a']}, (False, '_disc'), 'unique'),
        ({'ranges': valid_ranges, 'labels': bin_labels}, (True, ''), 'suff'),
    ]
    for column_options, suffix, error_key in invalid_cases:
        with pytest.raises(exp.OptionValidationError) as e:
            discretizer.setOptions(selected={2: column_options}, suffix=suffix)
        assert e.value.invalid[0][0] == error_key

    # No valid option was ever set, so the output shape is still undefined
    assert discretizer.getOutputShape() is None
예제 #19
0
파일: rename.py 프로젝트: alek9z/dataMole
 def execute(self, df: data.Frame) -> data.Frame:
     """Return a frame whose columns are renamed as configured.

     The configured names map a column position to its new name; columns
     that are not listed keep their current name.

     :param df: input frame
     :return: a new frame wrapping a shallow copy of the data with the new
         column names
     """
     # Work on a copy so the list handed out by the frame is never mutated
     names: List[str] = list(df.colnames)
     for position, new_name in self.__names.items():
         names[position] = new_name
     new_df = df.getRawFrame().copy(deep=False)
     new_df.columns = names
     return data.Frame(new_df)
예제 #20
0
    def execute(self) -> None:
        """Build one frame of time series from columns spread over workbench frames.

        For every configured series the referenced columns are stacked into a
        single value column paired with a 'time' label, reindexed over every
        (index, time label) combination and finally placed side by side with
        the other series. The resulting frame is stored in the workbench under
        the configured output name.
        """
        def manipulateDf(s: pd.Series, lab: str, sName: str) -> pd.DataFrame:
            # Makes a dataframe with a value column named as the series and a time label column
            return s.to_frame(sName).assign(time=lab)

        allSeriesColumn: List[pd.DataFrame] = list()
        for seriesName, values in self.__series.items():
            values: List[Tuple[
                str, int, int]]  # [ (frameName, attrIndex, timeLabelIndex) ]

            # One frame per (frame, column, time label) entry of this series.
            # NOTE(review): the frames are read through `self.workbench` while the
            # result is written through `self._workbench` -- confirm both exist
            # and refer to the same workbench.
            frames: List[pd.DataFrame] = list(
                map(
                    lambda tup: manipulateDf(
                        self.workbench.getDataframeModelByName(tup[0]).frame.
                        getRawFrame().iloc[:, tup[1]], self.__timeLabels[tup[
                            2]], seriesName), values))

            # Create a dataframe with two columns, one with the values of this series for every index
            # and one with the time label. A series column is indexed by time and index. In this way
            # the concatenation of all the series will be made correctly
            seriesColumn = pd.concat(frames, axis=0, join='outer')

            # Create a categorical ordinal index for time labels
            waves = pd.Index(seriesColumn['time'].unique(),
                             name='time',
                             dtype=pd.CategoricalDtype(
                                 ordered=True, categories=self.__timeLabels))
            ids = seriesColumn.index.unique()

            # Set index to [id, time]
            seriesColumn = seriesColumn.set_index(['time'],
                                                  drop=True,
                                                  append=True)
            # Reindex to provide values for every possible combination of [id, time]
            multiIndex: pd.MultiIndex = pd.MultiIndex.from_product(
                [ids, waves])
            # Additionally sort indexes, otherwise concatenation drops index type
            seriesColumn = seriesColumn.reindex(multiIndex).sort_index(
                axis=0, ignore_index=False)
            allSeriesColumn.append(seriesColumn)

        # Concat all series in the same dataframe. Remove the 'time' level from the index
        # (turning it back into a column), leaving only the original index (subject id)
        result = pd.concat(allSeriesColumn,
                           axis=1,
                           join='outer',
                           ignore_index=False).reset_index(level='time',
                                                           drop=False)

        # Result:
        # Index is set on the subject identifier
        # Column 'time' contains the names of the time axis (wave names or integers)
        # The other columns are named with the specified 'seriesName' and are the series values
        # which vary with time and index
        self._workbench.setDataframeByName(self.__outputName,
                                           data.Frame(result))
예제 #21
0
 def execute(self) -> None:
     """Read the configured CSV file and store its content in the workbench.

     Without a chunk size the whole file is stored under the configured name;
     otherwise every chunk is stored as a separate frame named
     ``<name>_<chunk number>``.

     :raises exp.InvalidOptions: if the options have not been set
     """
     if not self.hasOptions():
         raise exp.InvalidOptions('Options are not set')
     loaded = pd.read_csv(self.__file,
                          sep=self.__separator,
                          index_col=False,
                          usecols=self.__selectedColumns,
                          chunksize=self.__splitByRowN)
     if self.__splitByRowN is None:
         # The entire dataframe has been read
         self._workbench.setDataframeByName(self.__wName, data.Frame(loaded))
         return
     # `loaded` is a chunk iterator: store every chunk under its own name
     for chunk_number, chunk in enumerate(loaded):
         chunk_name: str = self.__wName + '_{:d}'.format(chunk_number)
         self._workbench.setDataframeByName(chunk_name, data.Frame(chunk))
         # TOCHECK: this does not set a parent for the FrameModel (since workbench lives in
         #  different thread)
예제 #22
0
 def execute(self, df: data.Frame) -> data.Frame:
     """Discretize the selected columns into the configured ranges.

     The configured attributes map a column position to a ``(bins, labels)``
     pair that is handed to ``pd.cut``. When an attribute suffix is set the
     result is stored in ``<column><suffix>`` (replacing any column already
     carrying that name); otherwise the source column is overwritten.

     :param df: input frame, left untouched
     :return: a new frame holding the discretized columns
     """
     f = df.getRawFrame().copy(True)
     # Snapshot of the column labels, taken before any new column is added
     columns = f.columns.to_list()
     for c, (bins, labels) in self.__attributes.items():
         result = pd.cut(f.iloc[:, c], bins=bins, labels=labels, duplicates='drop')
         colName: str = columns[c]
         newColName: str = colName if not self.__attributeSuffix else colName + self.__attributeSuffix
         # Assign with f[name] = ... rather than f.loc[:, name] = ...: the latter
         # writes into the existing column in place on recent pandas versions, so
         # the categorical dtype of the result could be lost when the target
         # column already exists.
         f[newColName] = result
     return data.Frame(f)
예제 #23
0
def test_remove_column():
    """Remove columns whose NaN count or NaN ratio exceeds the configured threshold."""
    remover = RemoveNanColumns()
    assert remover.hasOptions() is False

    # A None entry counts as NaN
    frame = data.Frame({
        'col1': [1, 2, 3, np.nan, 10],
        'col2': [3, 4, np.nan, np.nan, np.nan],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', None, '21-11-1995', '22-06-1994', '12-12-2012'],
    }).setIndex('col1')

    assert remover.getOutputShape() is None
    remover.setOptions(number=1, percentage=0.3)
    # When both thresholds are given only the percentage is retained
    assert remover.getOptions() == (0.3, None)

    remover.addInputShape(frame.shape, 0)

    # Ratio threshold 0.3: only 'col2' (3 NaN out of 5) is removed
    pruned = remover.execute(frame)
    assert pruned != frame and pruned.shape != frame.shape
    assert pruned.nRows == 5 == frame.nRows
    assert pruned.shape.colNames == ['col3', 'date']
    assert pruned.shape.colTypes == [Types.String, Types.String]

    # Count threshold 3: nothing is removed
    remover.setOptions(percentage=None, number=3)
    pruned = remover.execute(frame)
    assert pruned == frame and pruned.nRows == 5
    assert pruned.shape == frame.shape

    # Count threshold 0: every column with at least one NaN is removed,
    # including 'date' because of its None entry
    remover.setOptions(percentage=None, number=0)
    pruned = remover.execute(frame)
    assert pruned != frame and pruned.nRows == 5
    assert pruned.to_dict() == {'col3': ['q', '2', 'c', '4', 'x']}
    assert pruned.shape.colTypes == [Types.String]

    # Ratio threshold 0.6: nothing is removed
    remover.setOptions(percentage=0.6, number=0)
    pruned = remover.execute(frame)
    assert pruned == frame and pruned.nRows == 5
    assert pruned.shape == frame.shape

    # Ratio threshold 0.59: only 'col2' is removed
    remover.setOptions(percentage=0.59, number=0)
    pruned = remover.execute(frame)
    assert pruned != frame and pruned.nRows == 5
    expected_shape = frame.shape.clone()
    removed_position = expected_shape.colNames.index('col2')
    del expected_shape.colTypes[removed_position]
    del expected_shape.colNames[removed_position]
    expected_shape.index = ['col1']
    assert pruned.shape == expected_shape
예제 #24
0
def test_discretize_range_suffix():
    """Discretize a column by ranges, both into a suffixed column and in place."""
    frame = data.Frame({
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1'],
        'col2_binss': [3, 1, 'ww', '1', '1'],
    })

    discretizer = RangeDiscretizer()

    table = {1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}}
    discretizer.setOptions(table=table, suffix=(True, '_binss'))

    # Mutating the table afterwards must not leak into the stored options
    table[1] = {}
    assert discretizer.getOptions() == {
        'table': {1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}},
        'suffix': (True, '_binss'),
    }

    discretizer.addInputShape(frame.shape, 0)
    expected_shape = frame.shape.clone()
    expected_shape.colTypes[1] = Types.Numeric
    expected_shape.colTypes[3] = Types.Ordinal
    assert discretizer.getOutputShape() == expected_shape

    suffixed = discretizer.execute(frame)
    expected_output = {
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, None, 6, None],
        'ww': [3, 1, 'ww', '1', '1'],
        # This column must replace the original duplicate
        'col2_binss': ['B', 'B', None, 'C', None],
    }
    assert nan_to_None(suffixed.to_dict()) == expected_output
    assert suffixed.shape == expected_shape

    # Without suffix the same bins must end up in 'col2' itself,
    # while every other column (including 'col2_binss') stays as in the input
    discretizer.setOptions(table={1: {'bins': [0, 2, 4, 7], 'labels': 'A B C'}},
                           suffix=(False, None))
    in_place = nan_to_None(discretizer.execute(frame).to_dict())
    assert expected_output['col2_binss'] == in_place['col2']
    assert expected_output['col2'] != in_place['col2']
    assert expected_output['ww'] == in_place['ww']
    assert expected_output['col1'] == in_place['col1']
    assert expected_output['col2_binss'] != in_place['col2_binss']
예제 #25
0
def test_discretize_by_date_and_time():
    """Discretize two datetime columns in place using date-and-time boundaries."""
    timestamps = ('05-09-1988 13:45', '22-12-1994 14:21', '21-11-1995 11:50',
                  None, '12-12-2012 09:15')
    frame = data.Frame({
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': [None if t is None else pd.Timestamp(t) for t in timestamps],
        'cold2': [None if t is None else pd.Timestamp(t) for t in timestamps],
    }).setIndex(['col2', 'col3'])

    discretizer = DateDiscretizer()
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    discretizer.addInputShape(frame.shape, 0)
    assert discretizer.getOutputShape() is None
    columns_by_name = frame.shape.columnsDict
    assert columns_by_name['cold'] == Types.Datetime

    boundaries = [pd.Timestamp('05-09-1988 07:00'), pd.Timestamp('20-12-1994 11:30'),
                  pd.Timestamp('05-09-2000 14:20'), pd.Timestamp('01-09-2010 14:30'),
                  pd.Timestamp('12-12-2012 09:14')]
    bin_labels = ['early mo', 'middle', 'late', 'now']

    def selection():
        # A fresh mapping on every call; ranges and labels are shared as in the options
        return {
            0: {'ranges': (boundaries, True, True), 'labels': bin_labels},
            1: {'ranges': (boundaries, True, True), 'labels': bin_labels},
        }

    discretizer.setOptions(selected=selection(), suffix=(False, '_disc'))

    # With the suffix disabled the suffix string is reported back as None
    assert discretizer.getOptions() == {
        'selected': selection(),
        'suffix': (False, None),
    }

    columns_by_name['cold'] = Types.Ordinal
    columns_by_name['cold2'] = Types.Ordinal
    expected_shape = data.Shape.fromDict(columns_by_name, frame.shape.indexDict)
    assert discretizer.getOutputShape() == expected_shape

    result = discretizer.execute(frame)
    assert result.shape == expected_shape
    assert result.shape != frame.shape

    output = nan_to_None(result.to_dict())
    assert output == {
        'cold': ['early mo', 'middle', 'middle', None, None],
        'cold2': ['early mo', 'middle', 'middle', None, None],
    }
    # Both outputs must be ordered categoricals carrying every label
    for column in ('cold', 'cold2'):
        assert result.getRawFrame()[column].cat.categories.to_list() == \
            ['early mo', 'middle', 'late', 'now']
        assert result.getRawFrame()[column].dtype.ordered is True
예제 #26
0
 def execute(self, df: data.Frame) -> data.Frame:
     """Convert the selected columns to strings, keeping NaN values as NaN.

     The configured attributes are the positions of the columns to convert.

     :param df: input frame, left untouched
     :return: a new frame with the converted columns
     :raises: whatever ``astype(str, errors='raise')`` raises on failure
     """
     # Deep copy
     raw_df = df.getRawFrame().copy(deep=True)
     selected = raw_df.iloc[:, self.__attributes]
     isNan = selected.isnull()
     # To string, then set back to nan where values were nan
     processed = selected.astype(dtype=str, errors='raise').mask(isNan, np.nan)
     # Replace each column by name instead of writing through iloc: an iloc
     # assignment sets values in place on recent pandas versions, which can keep
     # the previous dtype (e.g. categorical or datetime) instead of strings.
     # NOTE(review): assumes column names are unique -- confirm for data.Frame.
     for offset, name in enumerate(selected.columns):
         raw_df[name] = processed.iloc[:, offset]
     return data.Frame(raw_df)
예제 #27
0
 def execute(self, df: data.Frame) -> data.Frame:
     """Drop the columns holding more NaN values than the configured threshold.

     Exactly one threshold is expected to be set: either the maximum ratio of
     NaN values per column or the maximum number of NaN values per column.

     :param df: input frame, left untouched
     :return: a new frame without the columns exceeding the threshold
     :raises exp.InvalidOptions: if both thresholds are set
     """
     # Assume everything to go is set
     if self.__thresholdPercentage is not None and self.__thresholdNumber is not None:
         raise exp.InvalidOptions('Can\'t have both threshold set')
     pf = df.getRawFrame().copy()
     # Test against None explicitly: a percentage of 0.0 is a valid threshold
     # and must not fall through to the (unset) number threshold
     if self.__thresholdPercentage is not None:
         # By percentage
         pf = pf.loc[:, pf.isnull().mean() <= self.__thresholdPercentage]
     else:
         # By nan number
         pf = pf.loc[:, pf.isnull().sum() <= self.__thresholdNumber]
     return data.Frame(pf)
예제 #28
0
def test_minMaxScale():
    """Check MinMaxScaler shape handling, execution result and options.

    The frame is indexed on ``id1``/``id2``. The expected output rescales
    ``col1`` into (-1, 1) and ``col2`` into (2, 4), keeps NaN cells as
    missing and leaves ``ww`` unchanged.
    """
    d = {
        'id1': ['ab', 'sa', '121', '121', 'a'],
        'id2': [1, np.nan, 0, 44, 0],
        'col1': [1, -1.1, 3, 7.5, 10],
        'col2': [3, 4, np.nan, 6, np.nan],
        'ww': [3, np.nan, 'ww', '1', '1']
    }
    f = data.Frame(d)
    f = f.setIndex(['id1', 'id2'])

    op = MinMaxScaler()
    # No output shape is available until an input shape is provided,
    # regardless of whether options are already set
    assert op.getOutputShape() is None
    op.setOptions(attributes={0: {'range': (-1, 1)}, 1: {'range': (2, 4)}})
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)
    s = f.shape.clone()
    assert op.getOutputShape() == s

    # Removing the input shape must reset the output shape
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(f.shape, 0)

    g = op.execute(f)

    def rescale(values, low, high):
        # Builtin min()/max() give order-dependent results when the list
        # contains NaN, so compute the bounds with the NaN-aware variants
        lo = float(np.nanmin(values))
        hi = float(np.nanmax(values))
        return [((x - lo) / (hi - lo)) * (high - low) + low for x in values]

    expected = {
        'col1': rescale(d['col1'], -1, 1),
        'col2': rescale(d['col2'], 2, 4),
        'ww': [3, None, 'ww', '1', '1']
    }
    assert nan_to_None(roundValues(g.to_dict(),
                                   4)) == nan_to_None(roundValues(expected, 4))
    assert g.shape == s
    # The scaled values must differ from the input values
    assert not numpy_equal(g.getRawFrame().values, f.getRawFrame().values)

    options = op.getOptions()
    assert options == {
        'attributes': {
            0: {
                'range': (-1, 1)
            },
            1: {
                'range': (2, 4)
            }
        }
    }
예제 #29
0
def test_join_on_cols():
    """Join two indexed frames with a right join configured on columns.

    Verifies that no output shape is produced until both input shapes are
    set, that the options round-trip through ``getOptions`` and that the
    predicted output shape equals the shape of the executed join.
    """
    left_raw = {
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    }
    right_raw = {
        'cowq': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 7, 0]),
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': pd.Series(
            ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
            dtype='datetime64[ns]'),
    }
    left = data.Frame(left_raw).setIndex('col1')
    right = data.Frame(right_raw).setIndex('col2')

    join_op = Join()

    # The output shape stays undefined until both inputs are known
    assert join_op.getOutputShape() is None
    join_op.addInputShape(left.shape, 0)
    assert join_op.getOutputShape() is None
    join_op.addInputShape(right.shape, 1)

    chosen = ('_l', '_r', False, 2, 1, jt.Right)
    join_op.setOptions(*chosen)
    assert join_op.getOptions() == chosen

    expected_columns = {
        'cowq': Types.Numeric,
        'col2': Types.Numeric,
        'col3_l': Types.String,
        'col3_r': Types.String,
        'date_l': Types.String,
        'date_r': Types.Datetime
    }
    # Note that merge does not preserve index, so the default one is expected
    expected_index = {'Unnamed': IndexType(Types.Numeric)}
    expected_shape = data.Shape.fromDict(expected_columns, expected_index)

    assert join_op.getOutputShape() == expected_shape
    joined = join_op.execute(left, right)
    assert joined.shape == expected_shape
예제 #30
0
def test_merge_numeric():
    """Replace groups of numeric values in two columns with ReplaceValues.

    Verifies the default options, the options returned after ``setOptions``
    (compared against the expected normalised strings), that the output
    shape equals the input shape and that the executed frame holds the
    replaced values.
    """
    frame = data.Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date':
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    })
    op = ReplaceValues()
    assert op.getOptions() == {'table': {}, 'inverted': False}
    op.addInputShape(frame.shape, 0)

    requested = {
        1: {'values': '1.0 3.0 4.0;  6  0.0', 'replace': '-1.0;-2.0'},
        0: {'values': '1.0 4.0', 'replace': '7.0'},
    }
    op.setOptions(table=requested, inverted=False)

    stored = op.getOptions()
    expected_table = {
        1: {'values': '1.0 3.0 4.0; 6.0 0.0', 'replace': '-1.0; -2.0'},
        0: {'values': '1.0 4.0', 'replace': '7.0'},
    }
    assert stored == {'table': expected_table, 'inverted': False}
    assert isDictDeepCopy(requested, stored['table'])

    expected_shape = frame.shape.clone()
    assert op.getOutputShape() == expected_shape

    result = op.execute(frame)

    assert result != frame
    assert result.shape == expected_shape
    expected_values = {
        'col1': [7.0, 2.0, 3.0, 7.0, 10.0],
        'col2': [-1.0, -1.0, 5.0, -2.0, -2.0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date':
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012']
    }
    assert result.to_dict() == expected_values