Пример #1
0
 def _transform(self, df, verbose):
     """Apply the stored row function and add its output as new column(s).

     ``self._func`` is applied to every row of ``df`` (``axis=1``). A
     Series result is inserted as a single column named ``self._colname``.
     A DataFrame result contributes one column per result column, taken
     in sorted column-name order. When ``self._follow_column`` is truthy
     the new columns are placed right after that column; otherwise they
     are added at the end of the frame.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         Unused by this method.

     Raises
     ------
     TypeError
         If applying the function yields neither a Series nor a DataFrame.
     """
     applied = df.apply(self._func, axis=1)
     follow = self._follow_column

     if isinstance(applied, pd.Series):
         position = (
             df.columns.get_loc(follow) + 1 if follow else len(df.columns))
         return out_of_place_col_insert(df=df,
                                        series=applied,
                                        loc=position,
                                        column_name=self._colname)

     if isinstance(applied, pd.DataFrame):
         # Fix the order of the generated columns by sorting their names.
         applied = applied[sorted(applied.columns)]
         if not follow:
             # No anchor column: add every generated column at the end.
             return df.assign(
                 **{name: applied[name] for name in applied.columns})
         result = df
         start = df.columns.get_loc(follow) + 1
         # Insert the generated columns one after another, starting right
         # after the anchor column.
         for offset, name in enumerate(applied.columns):
             result = out_of_place_col_insert(df=result,
                                              series=applied[name],
                                              loc=start + offset,
                                              column_name=name)
         return result

     raise TypeError(  # pragma: no cover
         "Unexpected type generated by applying a function to a DataFrame."
         " Only Series and DataFrame are allowed.")
Пример #2
0
def test_out_of_place_col_insert_nameless_error():
    """Inserting a nameless series without a column name raises ValueError."""
    df = _test_df()
    nameless = pd.Series(data=[10, 20], index=[1, 2])

    # Neither the series nor the call provides a label for the new column.
    with pytest.raises(ValueError):
        out_of_place_col_insert(df, nameless, 1)
Пример #3
0
 def _transform(self, df, verbose):
     """Log-transform the columns recorded in ``self._cols_to_transform``.

     For every such column the natural log is computed with ``np.log``
     after the optional shifts: the stored value from
     ``self._col_to_minval`` (only when ``self._non_neg`` is set and the
     column has an entry) and ``self._const_shift`` (when not None).
     The result is inserted right after the source column as
     ``<col>_log``, or replaces it under the original name when
     ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         When true, iteration over the columns is wrapped in ``tqdm``.

     Returns
     -------
     The frame produced by the successive column insertions.
     """
     inter_df = df
     columns_to_transform = self._cols_to_transform
     if verbose:
         columns_to_transform = tqdm.tqdm(columns_to_transform)
     for colname in columns_to_transform:
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_log"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         new_col = source_col
         if self._non_neg and colname in self._col_to_minval:
             new_col = new_col + self._col_to_minval[colname]
         # must check not None as neg numbers eval to False
         if self._const_shift is not None:
             new_col = new_col + self._const_shift
         new_col = np.log(new_col)
         inter_df = out_of_place_col_insert(df=inter_df,
                                            series=new_col,
                                            loc=loc,
                                            column_name=new_name)
     return inter_df
Пример #4
0
 def _fit_transform(self, df, verbose):
     """Fit the log stage on ``df`` and return the transformed frame.

     The columns to transform are ``self._columns`` (or, when that is
     None, the result of ``get_numeric_column_names(df)``) minus
     ``self._exclude``; they are stored in ``self._cols_to_transform``.
     For each column, when ``self._non_neg`` is set and the column minimum
     is negative, the column is shifted by the absolute minimum and that
     value is recorded in ``self._col_to_minval``. ``self._const_shift``
     is then added when it is not None, and ``np.log`` is applied. The
     result is inserted right after the source column as ``<col>_log``,
     or replaces it under the original name when ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to fit on and process; a new frame is returned.
     verbose : bool
         When true, iteration over the columns is wrapped in ``tqdm``.

     Returns
     -------
     The frame produced by the successive column insertions. As a side
     effect ``self.is_fitted`` is set to True.
     """
     columns_to_transform = self._columns
     if self._columns is None:
         columns_to_transform = get_numeric_column_names(df)
     columns_to_transform = list(
         set(columns_to_transform).difference(self._exclude))
     self._cols_to_transform = columns_to_transform
     if verbose:
         columns_to_transform = tqdm.tqdm(columns_to_transform)
     inter_df = df
     for colname in columns_to_transform:
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions,
         # and the set-derived iteration order above is arbitrary.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_log"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         new_col = source_col
         if self._non_neg:
             # Series.min() skips missing values; the builtin min() gives
             # an order-dependent answer when NaN is present.
             minval = new_col.min()
             if minval < 0:
                 shift = abs(minval)
                 new_col = new_col + shift
                 self._col_to_minval[colname] = shift
         # must check not None as neg numbers eval to False
         if self._const_shift is not None:
             new_col = new_col + self._const_shift
         new_col = np.log(new_col)
         inter_df = out_of_place_col_insert(df=inter_df,
                                            series=new_col,
                                            loc=loc,
                                            column_name=new_name)
     self.is_fitted = True
     return inter_df
Пример #5
0
 def _fit_transform(self, df, verbose):
     """Fit one label encoder per column and return the encoded frame.

     The columns to encode are ``self._columns`` or, when that is None,
     all columns of dtype ``object`` or ``category`` minus
     ``self._exclude_columns``. A fresh
     ``sklearn.preprocessing.LabelEncoder`` is fitted on each column and
     stored in ``self.encoders`` under the column name. The encoded
     values are inserted right after the source column as ``<col>_enc``,
     or replace it under the original name when ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to fit on and process; a new frame is returned.
     verbose : bool
         When true, iteration over the columns is wrapped in ``tqdm``.

     Returns
     -------
     The frame produced by the successive column insertions. As a side
     effect ``self.encoders`` is rebuilt and ``self.is_fitted`` is set.
     """
     self.encoders = {}
     columns_to_encode = self._columns
     if self._columns is None:
         columns_to_encode = list(
             set(df.select_dtypes(
                 include=["object", "category"]).columns).difference(
                     self._exclude_columns))
     if verbose:
         columns_to_encode = tqdm.tqdm(columns_to_encode)
     inter_df = df
     for colname in columns_to_encode:
         lbl_enc = sklearn.preprocessing.LabelEncoder()
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions,
         # and the set-derived iteration order above is arbitrary.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_enc"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         inter_df = out_of_place_col_insert(
             df=inter_df,
             series=lbl_enc.fit_transform(source_col),
             loc=loc,
             column_name=new_name,
         )
         self.encoders[colname] = lbl_enc
     self.is_fitted = True
     return inter_df
Пример #6
0
def test_out_of_place_col_insert_no_col_name():
    """A named series inserted without a column name keeps its own name."""
    df = _test_df()
    named = pd.Series(data=[10, 20], index=[1, 2], name='tens')

    result_df = out_of_place_col_insert(df, named, 1)
    # The column is labelled after the series and sits at position 1.
    assert 'tens' in result_df.columns
    assert result_df.columns.get_loc('tens') == 1
    # Values stay attached to their index labels.
    assert result_df['tens'][1] == 10
    assert result_df['tens'][2] == 20
Пример #7
0
def test_out_of_place_col_last_position():
    """Inserting at ``len(df.columns)`` under an explicit column name.

    The explicit name must override the name carried by the series.
    """
    df = _test_df()
    named = pd.Series(data=[10, 20], index=[1, 2], name='tens')

    result_df = out_of_place_col_insert(df, named, len(df.columns), 'Tigers')
    # The explicit name wins over the series name.
    assert 'tens' not in result_df.columns
    assert 'Tigers' in result_df.columns
    assert result_df.columns.get_loc('Tigers') == 2
    # Values stay attached to their index labels.
    assert result_df['Tigers'][1] == 10
    assert result_df['Tigers'][2] == 20
Пример #8
0
 def _transform(self, df, verbose):
     """Aggregate each configured column with ``self._func``.

     For the i-th column in ``self._columns`` the result of
     ``Series.agg(self._func)`` is inserted under the name
     ``self._result_columns[i]``, right after the source column, or in
     its place when ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         Unused by this method.

     Returns
     -------
     The frame produced by the successive column insertions.
     """
     inter_df = df
     for i, colname in enumerate(self._columns):
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = self._result_columns[i]
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             loc -= 1
         inter_df = out_of_place_col_insert(df=inter_df,
                                            series=source_col.agg(
                                                self._func),
                                            loc=loc,
                                            column_name=new_name)
     return inter_df
Пример #9
0
 def _transform(self, df, verbose):
     """Generate one column by calling ``self._func`` on the whole frame.

     The generated column is inserted under the name ``self._column``,
     right after ``self._follow_column`` when that is truthy and at the
     end of the frame otherwise.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         Unused by this method.

     Raises
     ------
     PipelineApplicationError
         If ``self._func`` raises any exception.
     """
     try:
         generated = self._func(df)
     except Exception:
         raise PipelineApplicationError(
             "Exception raised applying function{} to dataframe.".format(
                 self._func_desc))
     anchor = self._follow_column
     # Right after the anchor column when one is configured, else last.
     position = (
         df.columns.get_loc(anchor) + 1 if anchor else len(df.columns))
     return out_of_place_col_insert(df=df,
                                    series=generated,
                                    loc=position,
                                    column_name=self._column)
Пример #10
0
 def _transform(self, df, verbose):
     """Map each configured column through its fitted rare-token remover.

     For every column in ``self._columns`` the callable stored in
     ``self._rare_removers`` is applied with ``Series.map``. The result
     is inserted right after the source column as ``<col>_norare``, or
     replaces it under the original name when ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         Unused by this method.

     Returns
     -------
     The frame produced by the successive column insertions.
     """
     inter_df = df
     for colname in self._columns:
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_norare"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         rare_remover = self._rare_removers[colname]
         inter_df = out_of_place_col_insert(
             df=inter_df,
             series=source_col.map(rare_remover),
             loc=loc,
             column_name=new_name)
     return inter_df
Пример #11
0
 def _transform(self, df, verbose):
     """Encode columns with the label encoders stored in ``self.encoders``.

     For every ``(column, encoder)`` pair the output of
     ``encoder.transform`` is inserted right after the source column as
     ``<col>_enc``, or replaces it under the original name when
     ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         Unused by this method.

     Returns
     -------
     The frame produced by the successive column insertions.
     """
     inter_df = df
     for colname, lbl_enc in self.encoders.items():
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_enc"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         inter_df = out_of_place_col_insert(
             df=inter_df,
             series=lbl_enc.transform(source_col),
             loc=loc,
             column_name=new_name)
     return inter_df
Пример #12
0
 def _fit_transform(self, df, verbose):
     """Build a rare-token remover per column and apply it.

     For every column in ``self._columns`` a remover is built from the
     column and ``self._threshold``, stored in ``self._rare_removers``
     under the column name, and applied with ``Series.map``. The result
     is inserted right after the source column as ``<col>_norare``, or
     replaces it under the original name when ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to fit on and process; a new frame is returned.
     verbose : bool
         Unused by this method.

     Returns
     -------
     The frame produced by the successive column insertions. As a side
     effect ``self.is_fitted`` is set to True.
     """
     inter_df = df
     for colname in self._columns:
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_norare"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         rare_remover = DropRareTokens.__get_rare_remover(
             source_col, self._threshold)
         self._rare_removers[colname] = rare_remover
         inter_df = out_of_place_col_insert(
             df=inter_df,
             series=source_col.map(rare_remover),
             loc=loc,
             column_name=new_name)
     self.is_fitted = True
     return inter_df
Пример #13
0
 def _transform(self, df, verbose):
     """Bin every column listed in ``self._bin_map``.

     For each column a binner is obtained from
     ``self._get_col_binner(self._bin_map[colname])`` and applied with
     ``Series.apply``. The result is inserted right after the source
     column as ``<col>_bin``, or replaces it under the original name when
     ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         When true, iteration is wrapped in ``tqdm`` and the progress
         description is set to the current column name.

     Returns
     -------
     The frame produced by the successive column insertions.
     """
     inter_df = df
     colnames = list(self._bin_map.keys())
     if verbose:
         colnames = tqdm.tqdm(colnames)
     for colname in colnames:
         if verbose:
             colnames.set_description(colname)
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_bin"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         inter_df = out_of_place_col_insert(
             df=inter_df,
             series=source_col.apply(
                 self._get_col_binner(self._bin_map[colname])),
             loc=loc,
             column_name=new_name)
     return inter_df
Пример #14
0
 def _transform(self, df, verbose):
     """Log-transform the fitted columns of ``df``, validating inputs.

     The columns come from ``self._get_columns(df, fit=False)``. For each
     one the natural log is computed with ``np.log`` after the optional
     shifts: the stored value from ``self._col_to_minval`` (when
     ``self._non_neg`` is set) and ``self._const_shift`` (when not None).
     The result is inserted right after the source column as
     ``<col>_log``, or replaces it under the original name when
     ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         When true, iteration over the columns is wrapped in ``tqdm``.

     Returns
     -------
     The frame produced by the successive column insertions.

     Raises
     ------
     PipelineApplicationError
         If a column is missing from ``df``, or if ``self._non_neg`` is
         set and the column has no entry in ``self._col_to_minval``.
     """
     inter_df = df
     columns_to_transform = self._get_columns(df, fit=False)
     if verbose:
         columns_to_transform = tqdm.tqdm(columns_to_transform)
     for colname in columns_to_transform:
         try:
             source_col = df[colname]
         except KeyError:  # pragma: no cover
             raise PipelineApplicationError(
                 ("Missing column {} when applying a fitted "
                  "Log pipeline stage by class {} !").format(
                      colname, self.__class__))
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_log"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         new_col = source_col
         if self._non_neg:
             if colname in self._col_to_minval:
                 absminval = self._col_to_minval[colname]
                 new_col = new_col + absminval
             else:  # pragma: no cover
                 raise PipelineApplicationError((
                     "Missing fitted parameter for column {} when applying a"
                     " fitted Log pipeline stage by class {}!").format(
                         colname, self.__class__))
         # must check not None as neg numbers eval to False
         if self._const_shift is not None:
             new_col = new_col + self._const_shift
         new_col = np.log(new_col)
         inter_df = out_of_place_col_insert(df=inter_df,
                                            series=new_col,
                                            loc=loc,
                                            column_name=new_name)
     return inter_df
Пример #15
0
 def _transformation(self, df, verbose, fit):
     """Transform each selected column with ``self._col_transform``.

     The columns come from ``self._get_columns(df, fit=fit)``. Result
     names are ``self._result_columns`` when given; otherwise they are
     the source names when ``self._drop`` is set, or the source names
     with ``self.suffix`` appended. Each transformed column is inserted
     right after its source column, or in its place when ``self._drop``
     is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to process; a new frame is returned.
     verbose : bool
         Unused by this method.
     fit : bool
         Forwarded to ``self._get_columns``.

     Returns
     -------
     The frame produced by the successive column insertions.
     """
     columns = self._get_columns(df, fit=fit)
     result_columns = self._result_columns
     if self._result_columns is None:
         if self._drop:
             result_columns = columns
         else:
             result_columns = [col + self.suffix for col in columns]
     inter_df = df
     for i, colname in enumerate(columns):
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = result_columns[i]
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             loc -= 1
         inter_df = out_of_place_col_insert(
             df=inter_df,
             series=self._col_transform(source_col, colname),
             loc=loc,
             column_name=new_name,
         )
     return inter_df
Пример #16
0
 def _fit_transform(self, df, verbose):
     """Fit one label encoder per column and return the encoded frame.

     The columns come from ``self._get_columns(df, fit=True)``. A fresh
     ``sklearn.preprocessing.LabelEncoder`` is fitted on each column and
     stored in ``self.encoders`` under the column name. The encoded
     values are inserted right after the source column as ``<col>_enc``,
     or replace it under the original name when ``self._drop`` is set.

     Parameters
     ----------
     df : pandas.DataFrame
         The frame to fit on and process; a new frame is returned.
     verbose : bool
         When true, iteration over the columns is wrapped in ``tqdm``.

     Returns
     -------
     The frame produced by the successive column insertions. As a side
     effect ``self.encoders`` is rebuilt and ``self.is_fitted`` is set.
     """
     self.encoders = {}
     columns_to_encode = self._get_columns(df, fit=True)
     if verbose:
         columns_to_encode = tqdm.tqdm(columns_to_encode)
     inter_df = df
     for colname in columns_to_encode:
         lbl_enc = sklearn.preprocessing.LabelEncoder()
         source_col = df[colname]
         # Look the column up in the frame being built, not the input:
         # columns inserted by earlier iterations shift later positions.
         loc = inter_df.columns.get_loc(colname) + 1
         new_name = colname + "_enc"
         if self._drop:
             inter_df = inter_df.drop(colname, axis=1)
             new_name = colname
             loc -= 1
         inter_df = out_of_place_col_insert(
             df=inter_df,
             series=lbl_enc.fit_transform(source_col),
             loc=loc,
             column_name=new_name,
         )
         self.encoders[colname] = lbl_enc
     self.is_fitted = True
     return inter_df