Пример #1
0
def test_fromShape_categories():
    """Frame.fromShape must reproduce the shape of a frame indexed by several columns."""
    dates = pd.Series(
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
        dtype='datetime64[ns]')
    source = Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': pd.Categorical([3, 4, 5, 6, 0]),
        'col3': pd.Categorical(['q', '2', 'c', '4', 'x'], ordered=True),
        'cold': dates,
    })
    source = source.setIndex(['col2', 'col3', 'col1'])

    rebuilt = Frame.fromShape(source.shape)

    # Expected shape: fromShape keeps the index, so only 'cold' is left as a column
    expected = Shape()
    expected.colNames = ['cold']
    expected.colTypes = [Types.Datetime]
    expected.index = ['col3', 'col1', 'col2']
    expected.indexTypes = [
        IndexType(Types.Ordinal),
        IndexType(Types.Numeric),
        IndexType(Types.Nominal),
    ]
    assert rebuilt.shape == expected
    assert expected == source.shape
Пример #2
0
def test_SetInput():
    """SetInput must report the shape of the named workbench frame and return an equal frame."""
    frame = Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    })

    # Register the frame in the mocked workbench under the name 'var'
    workbench = WorkbenchModelMock()
    workbench.setDataframeByName('var', frame)

    op = SetInput(workbench)
    # Nothing is selected yet, so there is no output shape
    assert op.getOutputShape() is None
    assert op.getOptions() == {'inputF': None}

    op.setOptions(inputF='var')
    # Adding an input shape is expected to be a no-op for this operation
    op.addInputShape(Shape(), pos=0)
    assert op.getOptions() == {'inputF': 'var'}
    assert op.getOutputShape() == frame.shape

    result = op.execute()
    assert result == frame

    # The result should be a copy: a renamed version of the input no longer equals it
    frame = frame.rename({'col1': 'ewew'})
    assert result != frame
Пример #3
0
 def execute(self, df: data.Frame) -> data.Frame:
     """
     Rescale the configured columns with min-max scaling.

     The private attribute map associates a column position with the feature range to use for
     that column. Columns that are not configured are left untouched and the original column
     order is preserved in the returned frame.

     :param df: the input frame
     :return: a new frame with the configured columns rescaled
     """
     originalOrder = df.getRawFrame().columns.to_list()
     frame = df.getRawFrame().copy(True)
     distinctRanges = set(self.__attributes.values())
     if len(distinctRanges) == 1:
         # Every column shares the same range: scale all of them with a single call
         (sharedRange,) = distinctRanges
         selected = frame.iloc[:, list(self.__attributes.keys())]
         scaledValues = minmax_scale(selected,
                                     feature_range=sharedRange,
                                     axis=0,
                                     copy=True)
         rescaled = pd.DataFrame(scaledValues).set_index(frame.index)
         rescaled.columns = selected.columns
     else:
         # Different ranges: scale column by column, keyed by the original column name
         rescaled = pd.DataFrame({
             originalOrder[position]: minmax_scale(frame.iloc[:, position],
                                                   feature_range=valueRange,
                                                   axis=0,
                                                   copy=True)
             for position, valueRange in self.__attributes.items()
         }).set_index(frame.index)
     # Swap the rescaled columns in and restore the original column order
     remaining = frame.drop(columns=rescaled.columns)
     merged = pd.concat([remaining, rescaled], ignore_index=False, axis=1)
     return data.Frame(merged[originalOrder])
Пример #4
0
def test_rename_bis():
    """Renaming with a full list of names returns a renamed frame and leaves the original as is."""
    original = Frame({
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })
    renamed = original.rename(['cola', '21eeds', 'ij_'])
    assert renamed.colnames == ['cola', '21eeds', 'ij_']
    assert original.colnames == ['col1', 'col2', 'col3']
Пример #5
0
def test_rename():
    """Renaming with a mapping changes only the mapped column and leaves the original as is."""
    original = Frame({
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })
    renamed = original.rename({'col2': 'new'})
    assert renamed.colnames == ['col1', 'new', 'col3']
    assert original.colnames == ['col1', 'col2', 'col3']
Пример #6
0
def test_rename_excep():
    """Renaming with a list longer than the number of columns must raise ValueError."""
    frame = Frame({
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })
    # One name more than the frame has columns
    tooManyNames = frame.colnames
    tooManyNames.append('1')
    with pytest.raises(ValueError):
        frame.rename(tooManyNames)
Пример #7
0
    def execute(self, df: data.Frame) -> data.Frame:
        """
        Drop the columns reported as duplicates among the selected ones.

        Duplicates are searched (through find_duplicates) only within the selected column
        positions. The remaining columns keep their original relative order.

        :param df: the input frame
        :return: a frame without the duplicated columns
        """
        rawFrame = df.getRawFrame()
        originalOrder: List[str] = rawFrame.columns.to_list()

        toDrop = find_duplicates(rawFrame.iloc[:, self.__selected])
        if not toDrop:
            # Nothing to remove: wrap the raw frame as it is
            return data.Frame(rawFrame)

        pruned = rawFrame.copy(True).drop(toDrop, axis=1)
        # Re-select the surviving columns in their original order
        survivors = [name for name in originalOrder if name not in toDrop]
        return data.Frame(pruned[survivors])
Пример #8
0
def test_typing():
    """The type of each sample object is itself a type, and a Frame instance has a Frame type."""
    plainInt = 4
    pandasFrame = pd.DataFrame()
    annotated: int64 = 12
    wrapped = Frame()
    assert isinstance(type(plainInt), type)
    assert isinstance(type(pandasFrame), type)
    assert issubclass(type(wrapped), Frame)
    assert isinstance(type(wrapped), type)
    assert isinstance(type(annotated), type)
Пример #9
0
def test_str_to_Timestamp_validation():
    """ToTimestamp must reject invalid options and predict the shape once valid ones are set."""
    frame = Frame({
        'col1': ['3', '0', '5', '6', '0'],
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': ['05091988', '22121994', '21111995', '22061994', '12122012'],
    })

    op = ToTimestamp()
    op.addInputShape(frame.shape, 0)
    assert op.getOutputShape() is None

    # No attribute selected: options are rejected and nothing is stored
    with pytest.raises(exp.OptionValidationError):
        op.setOptions(attributes={}, errors='raise')
    assert not op.hasOptions()
    assert op.getOutputShape() is None

    # Empty error mode: options are rejected and nothing is stored
    with pytest.raises(exp.OptionValidationError):
        op.setOptions(attributes={0: {'format': '%s'}}, errors='')
    assert not op.hasOptions()
    assert op.getOutputShape() is None

    # Valid options: only the first column changes type
    op.setOptions(attributes={0: {'format': '%d'}}, errors='coerce')
    assert op.hasOptions()
    expectedTypes = [Types.Datetime, Types.Numeric, Types.String, Types.String]
    assert op.getOutputShape().colTypes == expectedTypes
Пример #10
0
def test_toString():
    """ToString must predict an all-string shape and produce a frame matching that shape."""
    frame = Frame({
        'col1': ['3', '0', '5', '6', '0'],
        'col2': [3, 4, 5.1, 6, 0],
        'col3': ['123', '2', '0.43', '4', '2021 January'],
        'cold': ['05091988', '22121994', '21111995', '22061994', '12122012'],
    })

    op = ToString()
    # No output shape is available, with or without an input shape, until options are set
    assert op.getOutputShape() is None
    op.addInputShape(frame.shape, 0)
    assert op.getOutputShape() is None

    options = {'attributes': {1: None}}
    assert op.getOptions() == {'attributes': {}}

    op.setOptions(**options)
    expectedTypes = [Types.String, Types.String, Types.String, Types.String]
    assert op.getOutputShape().colTypes == expectedTypes

    # Options are returned equal to, and checked by isDictDeepCopy against, what was set
    assert op.getOptions() == options
    assert isDictDeepCopy(op.getOptions(), options)

    result = op.execute(frame)
    assert op.getOutputShape() == result.shape
Пример #11
0
def test_nominal_to_ordinal_cat():
    """ToCategorical must turn an unordered categorical column into an ordered one."""
    frame = Frame({
        'col1': pd.Categorical(["5", "0", "5", "U", "0 ww"], ordered=False),
        'col2': [3, 4, 5.1, 6, 0],
    })

    op = ToCategorical()
    op.addInputShape(frame.shape, pos=0)
    op.setOptions(attributes={0: {'cat': 'U 0 1 5', 'ordered': True}})

    # Expected output shape: same columns, with 'col1' becoming ordinal
    expectedColumns = frame.shape.columnsDict
    expectedColumns['col1'] = Types.Ordinal
    assert op.getOutputShape().columnsDict == expectedColumns

    result = op.execute(frame)
    # '0 ww' is not among the requested categories, hence it is expected to become missing
    expectedData = {
        'col1': ['5', '0', '5', 'U', None],
        'col2': [3.0, 4.0, 5.1, 6.0, 0.0],
    }
    assert nan_to_None(result.to_dict()) == expectedData
    assert result.shape.columnsDict == expectedColumns

    resultDtype = result.getRawFrame()['col1'].dtype
    assert list(resultDtype.categories) == ['U', '0', '1', '5']
    assert result.getRawFrame()['col1'].dtype.ordered is True
Пример #12
0
def test_rename():
    """RenameColumns must rename columns by position in both the predicted shape and the data."""
    frame = Frame({
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })

    op = RenameColumns()
    op.addInputShape(frame.shape, pos=0)
    assert op.getOutputShape() is None

    op.setOptions(names={0: 'col4', 2: 'col1'})
    assert op.getOptions() == [{0: 'col4', 2: 'col1'}]

    # Expected shape: same as the input, with the first and third columns renamed
    expectedShape = frame.shape.clone()
    expectedShape.colNames = ['col4', 'col2', 'col1']
    assert op.getOutputShape() == expectedShape

    result = op.execute(frame)
    expectedData = {
        'col4': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col1': ['q', '2', 'c', '4', 'x'],
    }
    assert result.to_dict() == expectedData
Пример #13
0
 def execute(self, df: data.Frame) -> data.Frame:
     """
     Copy the configured columns under new names.

     The private attribute map associates the position of a source column with the name that
     its copy is assigned to.

     :param df: the input frame
     :return: a new frame where each configured name holds the values of its source column
     """
     copied = df.getRawFrame().copy(True)
     mapping: List[Tuple[int, str]] = list(self.__attributes.items())
     targetNames = [name for _, name in mapping]
     sourcePositions = [position for position, _ in mapping]
     copied[targetNames] = copied.iloc[:, sourcePositions]
     return data.Frame(copied)
Пример #14
0
def test_unsetOptions_toNumeric():
    """Options and input shapes of ToNumeric must be settable and removable independently."""
    frame = Frame({
        'col1': pd.Categorical([1, 2, 3, 4, 10]),
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })
    defaults = {'attributes': {}, 'errors': 'raise'}

    op = ToNumeric()
    op.addInputShape(frame.shape, pos=0)
    assert op.getOptions() == defaults
    assert not op.hasOptions()

    op.setOptions(attributes={0: dict()}, errors='raise')
    assert op.getOptions() == {'attributes': {0: None}, 'errors': 'raise'}
    assert op._shapes[0] == frame.shape

    # Unsetting the options restores the defaults and keeps the input shape
    op.unsetOptions()
    assert op.getOptions() == defaults
    assert op._shapes[0] == frame.shape

    # Removing the input shape does not touch the options
    op.removeInputShape(0)
    assert op.getOptions() == defaults
    assert op._shapes == [None]

    # Options can be set while no input shape is available
    coerceSecond = {'attributes': {1: None}, 'errors': 'coerce'}
    op.setOptions(attributes={1: dict()}, errors='coerce')
    assert op.getOptions() == coerceSecond
    assert op._shapes == [None]

    # Adding the shape back does not touch the options
    op.addInputShape(frame.shape, pos=0)
    assert op.getOptions() == coerceSecond
    assert op._shapes[0] == frame.shape
Пример #15
0
 def execute(self, df: data.Frame) -> data.Frame:
     """
     Standardize the configured columns (centering on the mean and scaling by the std).

     Columns that are not configured are left untouched and the original column order is
     preserved in the returned frame.

     :param df: the input frame
     :return: a new frame with the configured columns standardized
     """
     originalOrder = df.getRawFrame().columns.to_list()
     frame = df.getRawFrame().copy(True)
     targetNames = frame.iloc[:, self.__attributes].columns
     scaledValues = scale(frame.iloc[:, self.__attributes],
                          with_mean=True,
                          with_std=True,
                          copy=True)
     standardized = pd.DataFrame(scaledValues).set_index(frame.index)
     standardized.columns = targetNames
     # Swap the standardized columns in and restore the original column order
     untouched = frame.drop(columns=targetNames)
     merged = pd.concat([untouched, standardized], ignore_index=False, axis=1)
     return data.Frame(merged[originalOrder])
Пример #16
0
 def execute(self, dfl: data.Frame, dfr: data.Frame) -> data.Frame:
     """
     Join two frames, either on their indexes or on one column per side.

     :param dfl: the left frame
     :param dfr: the right frame
     :return: the joined frame, using the configured join type and suffixes
     """
     if not self.__onIndex:
         # Merge on columns: the left and right column positions must be set
         bothSuffixes = (self.__lSuffix, self.__rSuffix)
         leftKey = dfl.shape.colNames[self.__leftOn]
         rightKey = dfr.shape.colNames[self.__rightOn]
         merged = dfl.getRawFrame().merge(dfr.getRawFrame(),
                                          how=self.__type.value,
                                          left_on=leftKey,
                                          right_on=rightKey,
                                          suffixes=bothSuffixes)
         return data.Frame(merged)
     # Join on indexes
     joined = dfl.getRawFrame().join(dfr.getRawFrame(),
                                     how=self.__type.value,
                                     lsuffix=self.__lSuffix,
                                     rsuffix=self.__rSuffix)
     return data.Frame(joined)
Пример #17
0
 def computeDiff(self) -> None:
     """
     Show in the table widget the part of the left frame that differs from the right frame.

     The two raw frames are compared element-wise; only the rows and the columns of the left
     frame that contain at least one differing cell are passed to the table model.

     NOTE(review): the comparison uses '!=', hence cells that are NaN in both frames are
     reported as different, and the two frames must be identically labelled.
     """
     frame1 = self.columnsL.model().frameModel().frame.getRawFrame()
     frame2 = self.columnsR.model().frameModel().frame.getRawFrame()
     changedMask = frame1 != frame2
     # 'axis' must be passed by keyword: it is keyword-only in recent pandas versions
     diffRows = changedMask.any(axis=1)
     diffColumns = changedMask.any(axis=0)
     frame = frame1.loc[diffRows, diffColumns]
     self.tableWidget.model().sourceModel().setFrame(Frame(frame))
Пример #18
0
 def execute(self, df: data.Frame) -> data.Frame:
     """
     Rename the configured columns.

     The private name map associates a column position with its new name.

     :param df: the input frame
     :return: a frame wrapping a shallow copy of the raw frame with the new column names
     """
     updatedNames: List[str] = df.colnames
     for position, newName in self.__names.items():
         updatedNames[position] = newName
     # A shallow copy is enough: only the column labels are replaced
     renamed = df.getRawFrame().copy(deep=False)
     renamed.columns = updatedNames
     return data.Frame(renamed)
Пример #19
0
def test_addInputShape_exc():
    """Adding an input shape at a negative position must raise ValueError."""
    frame = Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'date': ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
    })
    operation = DummyOp()
    with pytest.raises(ValueError):
        operation.addInputShape(frame.shape, pos=-1)
Пример #20
0
    def execute(self, df: data.Frame) -> data.Frame:
        """
        Discretize the configured columns into labelled intervals.

        For every configured column position the private attribute map holds a 4-item entry:
        the ordered bin edges, the labels to give to the resulting intervals, and two flags
        (byDate, byTime). Consecutive edges are paired into right-closed intervals which are
        used as bins for pd.cut, then the interval categories are renamed with the labels.

        If a suffix is configured the discretized columns are named after the original column
        plus the suffix and appended at the end of the frame; otherwise they replace the
        original columns and the original column order is restored.

        :param df: the input frame
        :return: a new frame with the discretized columns
        """
        columns = df.colnames
        df = df.getRawFrame().copy(True)

        # Note that these timestamps have already been set to a proper format (with a default
        # time/date) by the editor
        # One IntervalIndex per configured column, built from consecutive pairs of bin edges
        intervals: Dict[int, pd.IntervalIndex] = \
            {i: pd.IntervalIndex([pd.Interval(a, b, closed='right') for a, b in zip(opts[0],
                                                                                    opts[0][1:])])
             for i, opts in self.__attributes.items()}

        # Maps the name of every output column to its discretized values
        processedDict = dict()
        for i, opts in self.__attributes.items():
            # The first item (the bin edges) has already been used to build the intervals
            _, labels, byDate, byTime = opts
            applyCol = df.iloc[:, i]
            if byTime and not byDate:
                # Replace the date part with the default date in a way that every ts has the
                # same date, but retains its original time. Nan values are propagated
                applyCol = applyCol \
                    .map(lambda ts:
                         pd.Timestamp(QDateTime(_IntervalWidget.DEFAULT_DATE,
                                                toQtDateTime(ts.to_pydatetime()).time()).toPython()),
                         na_action='ignore')
            name = columns[i]
            if self.__attributesSuffix:
                name += self.__attributesSuffix
            # Pair every interval with the label in the same position
            categoriesMap = dict(zip(intervals[i], labels))
            processedDict[name] = pd.cut(
                applyCol,
                bins=intervals[i]).cat.rename_categories(categoriesMap)

        if self.__attributesSuffix:
            # With a suffix only the new names that clash with existing columns are replaced
            duplicateColumns: Set[str] = set(
                processedDict.keys()) & set(columns)
        else:
            # Without a suffix every processed column replaces the original one
            duplicateColumns: List[str] = list(processedDict.keys())
        if duplicateColumns:
            df = df.drop(columns=duplicateColumns)
        processed = pd.DataFrame(processedDict).set_index(df.index)

        df = pd.concat([df, processed], ignore_index=False, axis=1)
        if not self.__attributesSuffix:
            # Reorder columns
            df = df[columns]
        return data.Frame(df)
Пример #21
0
 def execute(self, df: data.Frame) -> data.Frame:
     """
     Discretize the configured columns with pd.cut.

     For every configured column position the private attribute map holds the bin edges
     (first item) and the labels (second item). When a suffix is configured the result is
     stored in a column named after the original one plus the suffix, otherwise it is
     stored under the original column name.

     :param df: the input frame
     :return: a new frame with the discretized columns
     """
     frame = df.getRawFrame().copy(True)
     originalNames = frame.columns.to_list()
     for position, options in self.__attributes.items():
         binned = pd.cut(frame.iloc[:, position],
                         bins=options[0],
                         labels=options[1],
                         duplicates='drop')
         targetName: str = originalNames[position]
         if self.__attributeSuffix:
             targetName = targetName + self.__attributeSuffix
         frame.loc[:, targetName] = binned
     return data.Frame(frame)
Пример #22
0
def test_shape_index():
    """The shape of a frame indexed by a string column must report that column as index."""
    plain = Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })
    indexed = Frame(plain.getRawFrame().set_index('col3'))

    # Expected shape: 'col3' moves to the index, the numeric columns stay
    expected = Shape()
    expected.colNames = ['col1', 'col2']
    expected.colTypes = [Types.Numeric, Types.Numeric]
    expected.index = ['col3']
    expected.indexTypes = [IndexType(Types.String)]

    assert indexed.shape == expected
    assert indexed.nRows == 5
Пример #23
0
def test_unsetOptions_toCategory():
    """Options and input shapes of ToCategorical must be settable and removable independently."""
    frame = Frame({
        'col1': pd.Categorical([1, 2, 3, 4, 10]),
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })
    noOptions = {'attributes': dict()}
    emptySecond = {'attributes': {1: {'cat': '', 'ordered': False}}}

    op = ToCategorical()
    op.addInputShape(frame.shape, pos=0)

    # The category string is normalized in the options and parsed into a list internally
    op.setOptions(attributes={0: {'cat': ' " 2 e + " 1 ', 'ordered': True}})
    assert op.getOptions() == {'attributes': {0: {'cat': '"2 e +" 1', 'ordered': True}}}
    assert op._ToCategorical__attributes == {0: (['2 e +', '1'], True)}
    assert op._shapes == [frame.shape]

    # Unsetting the options clears them and keeps the input shape
    op.unsetOptions()
    assert op.getOptions() == noOptions
    assert op._ToCategorical__attributes == dict()
    assert op._shapes == [frame.shape]

    # Removing the input shape does not touch the options
    op.removeInputShape(0)
    assert op.getOptions() == noOptions
    assert op._ToCategorical__attributes == dict()
    assert op._shapes == [None]

    # Options can be set while no input shape is available
    op.setOptions(attributes={1: dict()})
    assert op.getOptions() == emptySecond
    assert op._ToCategorical__attributes == {1: (None, None)}
    assert op._shapes == [None]
    assert op.getOutputShape() is None

    # Adding the shape back does not touch the options
    op.addInputShape(frame.shape, pos=0)
    assert op.getOptions() == emptySecond
    assert op._ToCategorical__attributes == {1: (None, None)}
    assert op._shapes == [frame.shape]
Пример #24
0
 def execute(self, df: data.Frame) -> data.Frame:
     """
     Convert the configured columns to strings, keeping missing values missing.

     :param df: the input frame
     :return: a new frame where the configured columns hold string values
     :raise: whatever 'astype' raises when a value cannot be converted (errors='raise')
     """
     # Work on a deep copy, so that the raw frame of the input is not modified
     raw_df = df.getRawFrame().copy(deep=True)
     selected = raw_df.iloc[:, self.__attributes]
     # Remember where the values were nan before the conversion
     isNan = selected.isnull()
     # Convert to string, then set to nan the cells which were nan
     processed = selected.astype(dtype=str, errors='raise').mask(isNan, np.nan)
     raw_df.iloc[:, self.__attributes] = processed
     return data.Frame(raw_df)
Пример #25
0
 def execute(self, df: data.Frame) -> data.Frame:
     """
     Drop the columns with too many missing values.

     Exactly one threshold is expected to be set: with the percentage threshold a column is
     kept when its fraction of missing values does not exceed the threshold, with the number
     threshold a column is kept when its count of missing values does not exceed it.

     :param df: the input frame
     :return: a new frame with only the columns within the threshold
     :raise exp.InvalidOptions: if both thresholds are set
     """
     # Assume everything to go is set
     if self.__thresholdPercentage is not None and self.__thresholdNumber is not None:
         raise exp.InvalidOptions('Can\'t have both threshold set')
     pf = df.getRawFrame().copy()
     nanMask = pf.isnull()
     # Compare with None explicitly: a percentage threshold of 0 is a valid setting and must
     # not be mistaken for an unset option
     if self.__thresholdPercentage is not None:
         # By percentage
         keep = nanMask.mean() <= self.__thresholdPercentage
     else:
         # By nan number
         keep = nanMask.sum() <= self.__thresholdNumber
     return data.Frame(pf.loc[:, keep])
Пример #26
0
 def __init__(self,
              parent: QWidget = None,
              frame: Union[Frame, Shape] = Frame()):
     """
     Initialize the model from a frame or from a shape only.

     :param parent: the parent object, passed to the superclass initializer
     :param frame: a Frame (its shape is read from it) or a Shape (an empty Frame is then
         used). Any other value results in an empty Frame and an empty Shape
     """
     super().__init__(parent)
     if isinstance(frame, Frame):
         heldFrame = frame
         heldShape = heldFrame.shape
     elif isinstance(frame, Shape):
         # Only the shape is known: no data is available
         heldFrame = Frame()
         heldShape = frame
     else:
         heldFrame = Frame()
         heldShape = Shape()
     self.__frame: Frame = heldFrame
     self.__shape: Shape = heldShape
     # Per-attribute caches, keyed by attribute index
     self._statistics: Dict[int, Dict[str, object]] = dict()
     self._histogram: Dict[int, Dict[Any, int]] = dict()
     # Name of the dataframe
     self.name: str = ''
     # Identifiers (attribute number, type, operation) of the workers which are still alive
     self._runningWorkers: Set[Tuple] = set()
     self._dataAccessMutex = QMutex()
Пример #27
0
 def execute(self, df: data.Frame) -> data.Frame:
     """
     One-hot encode the configured columns and append the indicator columns to the frame.

     The indicator columns are prefixed with the name of the column they come from and are
     stored as categories. The original columns are kept; existing columns with the same
     name of an indicator column are replaced.

     :param df: the input frame
     :return: a new frame with the indicator columns appended
     """
     source = df.getRawFrame().copy(deep=True)
     # Names of the selected columns, as found in the first input shape
     columnPrefixes = itemgetter(*self.__attributes)(self.shapes[0].colNames)
     selected = source.iloc[:, self.__attributes]
     indicators = pd.get_dummies(selected,
                                 prefix=columnPrefixes,
                                 dummy_na=self.__includeNan,
                                 dtype=int)
     indicators = indicators.astype('category', copy=False)
     # Remove the columns which would clash with the new ones. The encoded columns
     # themselves are deliberately not dropped: the indicators are just appended
     withoutClashes = source.drop(columns=indicators.columns, errors='ignore')
     return data.Frame(pd.concat([withoutClashes, indicators], axis=1))
Пример #28
0
def test_cat_toNumeric():
    """ToNumeric must convert a categorical column of numeric strings to a numeric column."""
    frame = Frame({
        'col1': pd.Categorical(['3', '0', '5', '6', '0']),
        'col2': [3, 4, 5, 6, 0],
        'col3': ['123', '2', '0.43', '4', '90'],
    })

    op = ToNumeric()
    op.addInputShape(frame.shape, pos=0)
    assert op.getOutputShape() is None
    assert op.getOptions() == {'attributes': {}, 'errors': 'raise'}
    op.setOptions(attributes={0: None}, errors='coerce')
    assert op.getOptions() == {'attributes': {0: None}, 'errors': 'coerce'}

    # Expected output shape: same as the input, with the first column numeric
    expectedShape = frame.shape.clone()
    expectedShape.colTypes[0] = Types.Numeric
    assert op.getOutputShape() == expectedShape

    # Without the input shape, or without options, no shape can be predicted
    op.removeInputShape(0)
    assert op.getOutputShape() is None
    op.addInputShape(frame.shape, pos=0)
    op.unsetOptions()
    assert op.getOutputShape() is None
    # Once both are back the prediction is available again
    op.setOptions(attributes={0: dict()}, errors='raise')
    assert op.getOutputShape() == expectedShape

    expectedData = {
        'col1': [3.0, 0.0, 5.0, 6.0, 0.0],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['123', '2', '0.43', '4', '90'],
    }
    raised = op.execute(frame)
    assert raised.to_dict() == expectedData
    assert raised.shape == expectedShape

    # With errors='coerce' the outcome is the same
    op.setOptions(attributes={0: dict()}, errors='coerce')
    assert op.getOutputShape() == expectedShape

    coerced = op.execute(frame)
    assert coerced.to_dict() == expectedData
    assert coerced.shape == expectedShape
Пример #29
0
def test_fromShape_single_index():
    """Frame.fromShape must reproduce the shape of a frame indexed by a single column."""
    dates = pd.Series(
        ['05-09-1988', '22-12-1994', '21-11-1995', '22-06-1994', '12-12-2012'],
        dtype='datetime64[ns]')
    source = Frame({
        'col1': [1, 2, 3, 4.0, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
        'cold': dates,
    })
    source = source.setIndex('col1')

    rebuilt = Frame.fromShape(source.shape)

    # Expected shape: fromShape keeps the index, so 'col1' is not among the columns
    expected = Shape()
    expected.colNames = ['cold', 'col2', 'col3']
    expected.colTypes = [Types.Datetime, Types.Numeric, Types.String]
    expected.index = ['col1']
    expected.indexTypes = [IndexType(Types.Numeric)]
    assert rebuilt.shape == expected
    assert expected == source.shape
Пример #30
0
def test_shape():
    """The shape of a frame built from a dict must list its columns and an 'Unnamed' index."""
    frame = Frame({
        'col1': [1, 2, 3, 4, 10],
        'col2': [3, 4, 5, 6, 0],
        'col3': ['q', '2', 'c', '4', 'x'],
    })

    # Expected shape: three columns plus an index named 'Unnamed' of numeric type
    expected = Shape()
    expected.colNames = ['col1', 'col2', 'col3']
    expected.colTypes = [Types.Numeric, Types.Numeric, Types.String]
    expected.index = ['Unnamed']
    expected.indexTypes = [IndexType(Types.Numeric)]

    assert frame.shape == expected
    assert frame.nRows == 5