def _transform(self, df, verbose): new_cols = df.apply(self._func, axis=1) if isinstance(new_cols, pd.Series): loc = len(df.columns) if self._follow_column: loc = df.columns.get_loc(self._follow_column) + 1 return out_of_place_col_insert(df=df, series=new_cols, loc=loc, column_name=self._colname) elif isinstance(new_cols, pd.DataFrame): sorted_cols = sorted(list(new_cols.columns)) new_cols = new_cols[sorted_cols] if self._follow_column: inter_df = df loc = df.columns.get_loc(self._follow_column) + 1 for colname in new_cols.columns: inter_df = out_of_place_col_insert( df=inter_df, series=new_cols[colname], loc=loc, column_name=colname) loc += 1 return inter_df assign_map = { colname: new_cols[colname] for colname in new_cols.columns } return df.assign(**assign_map) raise TypeError( # pragma: no cover "Unexpected type generated by applying a function to a DataFrame." " Only Series and DataFrame are allowed.")
def test_out_of_place_col_insert_nameless_error():
    """out_of_place_col_insert raises ValueError for a nameless series."""
    df = _test_df()
    nameless = pd.Series(data=[10, 20], index=[1, 2])
    with pytest.raises(ValueError):
        out_of_place_col_insert(df, nameless, 1)
def _transform(self, df, verbose): inter_df = df columns_to_transform = self._cols_to_transform if verbose: columns_to_transform = tqdm.tqdm(columns_to_transform) for colname in columns_to_transform: source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = colname + "_log" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 new_col = source_col if self._non_neg: if colname in self._col_to_minval: absminval = self._col_to_minval[colname] new_col = new_col + absminval # must check not None as neg numbers eval to False if self._const_shift is not None: new_col = new_col + self._const_shift new_col = np.log(new_col) inter_df = out_of_place_col_insert(df=inter_df, series=new_col, loc=loc, column_name=new_name) return inter_df
def _fit_transform(self, df, verbose): columns_to_transform = self._columns if self._columns is None: columns_to_transform = get_numeric_column_names(df) columns_to_transform = list( set(columns_to_transform).difference(self._exclude)) self._cols_to_transform = columns_to_transform if verbose: columns_to_transform = tqdm.tqdm(columns_to_transform) inter_df = df for colname in columns_to_transform: source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = colname + "_log" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 new_col = source_col if self._non_neg: minval = min(new_col) if minval < 0: new_col = new_col + abs(minval) self._col_to_minval[colname] = abs(minval) # must check not None as neg numbers eval to False if self._const_shift is not None: new_col = new_col + self._const_shift new_col = np.log(new_col) inter_df = out_of_place_col_insert(df=inter_df, series=new_col, loc=loc, column_name=new_name) self.is_fitted = True return inter_df
def _fit_transform(self, df, verbose): self.encoders = {} columns_to_encode = self._columns if self._columns is None: columns_to_encode = list( set(df.select_dtypes( include=["object", "category"]).columns).difference( self._exclude_columns)) if verbose: columns_to_encode = tqdm.tqdm(columns_to_encode) inter_df = df for colname in columns_to_encode: lbl_enc = sklearn.preprocessing.LabelEncoder() source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = colname + "_enc" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 inter_df = out_of_place_col_insert( df=inter_df, series=lbl_enc.fit_transform(source_col), loc=loc, column_name=new_name, ) self.encoders[colname] = lbl_enc self.is_fitted = True return inter_df
def test_out_of_place_col_insert_no_col_name():
    """A named series is inserted under its own name when none is given."""
    df = _test_df()
    tens = pd.Series(data=[10, 20], index=[1, 2], name='tens')
    result_df = out_of_place_col_insert(df, tens, 1)
    assert 'tens' in result_df.columns
    assert result_df.columns.get_loc('tens') == 1
    assert result_df['tens'][1] == 10
    assert result_df['tens'][2] == 20
def test_out_of_place_col_last_position():
    """Inserting at len(df.columns) appends the column in last position."""
    df = _test_df()
    tens = pd.Series(data=[10, 20], index=[1, 2], name='tens')
    result_df = out_of_place_col_insert(df, tens, len(df.columns), 'Tigers')
    # The explicit column name overrides the series name.
    assert 'tens' not in result_df.columns
    assert 'Tigers' in result_df.columns
    assert result_df.columns.get_loc('Tigers') == 2
    assert result_df['Tigers'][1] == 10
    assert result_df['Tigers'][2] == 20
def _transform(self, df, verbose): inter_df = df for i, colname in enumerate(self._columns): source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = self._result_columns[i] if self._drop: inter_df = inter_df.drop(colname, axis=1) loc -= 1 inter_df = out_of_place_col_insert(df=inter_df, series=source_col.agg( self._func), loc=loc, column_name=new_name) return inter_df
def _transform(self, df, verbose):
    """Compute a new column by applying self._func to the dataframe.

    The resulting column is inserted after self._follow_column when one
    is configured, otherwise appended as the last column.

    Raises
    ------
    PipelineApplicationError
        If applying self._func to the dataframe raises any exception.
    """
    try:
        new_col = self._func(df)
    except Exception as exc:
        # BUGFIX: chain the original exception so the root cause and
        # its traceback are not lost when the wrapper error is raised.
        raise PipelineApplicationError(
            "Exception raised applying function{} to dataframe.".format(
                self._func_desc)) from exc
    if self._follow_column:
        loc = df.columns.get_loc(self._follow_column) + 1
    else:
        loc = len(df.columns)
    return out_of_place_col_insert(df=df,
                                   series=new_col,
                                   loc=loc,
                                   column_name=self._column)
def _transform(self, df, verbose): inter_df = df for colname in self._columns: source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = colname + "_norare" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 rare_remover = self._rare_removers[colname] inter_df = out_of_place_col_insert( df=inter_df, series=source_col.map(rare_remover), loc=loc, column_name=new_name) return inter_df
def _transform(self, df, verbose): inter_df = df for colname in self.encoders: lbl_enc = self.encoders[colname] source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = colname + "_enc" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 inter_df = out_of_place_col_insert( df=inter_df, series=lbl_enc.transform(source_col), loc=loc, column_name=new_name) return inter_df
def _fit_transform(self, df, verbose): inter_df = df for colname in self._columns: source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = colname + "_norare" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 rare_remover = DropRareTokens.__get_rare_remover( source_col, self._threshold) self._rare_removers[colname] = rare_remover inter_df = out_of_place_col_insert( df=inter_df, series=source_col.map(rare_remover), loc=loc, column_name=new_name) self.is_fitted = True return inter_df
def _transform(self, df, verbose): inter_df = df colnames = list(self._bin_map.keys()) if verbose: colnames = tqdm.tqdm(colnames) for colname in colnames: if verbose: colnames.set_description(colname) source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = colname + "_bin" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 inter_df = out_of_place_col_insert( df=inter_df, series=source_col.apply( self._get_col_binner(self._bin_map[colname])), loc=loc, column_name=new_name) return inter_df
def _transform(self, df, verbose): inter_df = df columns_to_transform = self._get_columns(df, fit=False) if verbose: columns_to_transform = tqdm.tqdm(columns_to_transform) for colname in columns_to_transform: try: source_col = df[colname] except KeyError: # pragma: no cover raise PipelineApplicationError( ("Missig column {} when applying a fitted " "Log pipeline stage by class {} !").format( colname, self.__class__)) loc = df.columns.get_loc(colname) + 1 new_name = colname + "_log" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 new_col = source_col if self._non_neg: if colname in self._col_to_minval: absminval = self._col_to_minval[colname] new_col = new_col + absminval else: # pragma: no cover raise PipelineApplicationError(( "Missig fitted parameter for column {} when applying a" " fitted Log pipeline stage by class {}!").format( colname, self.__class__)) # must check not None as neg numbers eval to False if self._const_shift is not None: new_col = new_col + self._const_shift new_col = np.log(new_col) inter_df = out_of_place_col_insert(df=inter_df, series=new_col, loc=loc, column_name=new_name) return inter_df
def _transformation(self, df, verbose, fit): columns = self._get_columns(df, fit=fit) result_columns = self._result_columns if self._result_columns is None: if self._drop: result_columns = columns else: result_columns = [col + self.suffix for col in columns] inter_df = df for i, colname in enumerate(columns): source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = result_columns[i] if self._drop: inter_df = inter_df.drop(colname, axis=1) loc -= 1 inter_df = out_of_place_col_insert( df=inter_df, series=self._col_transform(source_col, colname), loc=loc, column_name=new_name, ) return inter_df
def _fit_transform(self, df, verbose): self.encoders = {} columns_to_encode = self._get_columns(df, fit=True) if verbose: columns_to_encode = tqdm.tqdm(columns_to_encode) inter_df = df for colname in columns_to_encode: lbl_enc = sklearn.preprocessing.LabelEncoder() source_col = df[colname] loc = df.columns.get_loc(colname) + 1 new_name = colname + "_enc" if self._drop: inter_df = inter_df.drop(colname, axis=1) new_name = colname loc -= 1 inter_df = out_of_place_col_insert( df=inter_df, series=lbl_enc.fit_transform(source_col), loc=loc, column_name=new_name, ) self.encoders[colname] = lbl_enc self.is_fitted = True return inter_df