def _transform(self, data, on_cols):
    """Return a cleaned copy of ``data``; the input object is not modified.

    Rows whose labels appear in ``self._drop_indices_row`` are removed.
    Duplicated columns are removed as well, but only when both
    ``self.on_cols`` and the ``on_cols`` argument are truthy.

    Args:
        :param data: a Pandas Dataframe to clean
        :param on_cols: whether column de-duplication applies to this input
    Returns:
        :return: the cleaned copy of ``data``
    """
    result = data.copy()
    rows_to_drop = self._drop_indices_row
    if len(rows_to_drop) > 0:
        result.drop(rows_to_drop, inplace=True)
    if self.on_cols and on_cols:
        # drop_duplicates compares rows, so de-duplicate the transpose and
        # flip the frame back afterwards.
        result = result.T.drop_duplicates().T
    message = ' shape: {}'.format(_shape(result))
    self.logging(message, level=logging.DEBUG)
    return result
def transform_y(self, y):
    """Keep only the columns of y that are listed in ``self.y_cols``.

    When ``self.y_cols`` is empty, ``y`` itself is returned unchanged (no
    copy is made). Otherwise a copy is filtered and returned; the shapes
    before and after are logged at DEBUG level.

    Args:
        :param y: a Pandas Dataframe holding the target
    Returns:
        :return y_new: the new transformed y
    """
    if len(self.y_cols) == 0:
        return y
    self.logging('y shape: {}'.format(_shape(y)), level=logging.DEBUG)
    y_new = y.copy()
    # Every column that is not explicitly requested gets dropped.
    present = set(y_new.columns.values.tolist())
    unwanted = list(present - set(self.y_cols))
    y_new.drop(unwanted, axis=1, inplace=True)
    self.logging(' shape: {}'.format(_shape(y_new)), level=logging.DEBUG)
    return y_new
def transform_x(self, x):
    """Keep only the columns of x that are listed in ``self.x_cols``.

    When ``self.x_cols`` is empty, ``x`` itself is returned unchanged (no
    copy is made). Otherwise a copy is filtered and returned; the shapes
    before and after are logged at DEBUG level.

    Args:
        :param x: a Pandas Dataframe holding the dataset
    Returns:
        :return x_new: the new transformed x
    """
    if len(self.x_cols) == 0:
        return x
    self.logging('x shape: {}'.format(_shape(x)), level=logging.DEBUG)
    x_new = x.copy()
    # Every column that is not explicitly requested gets dropped.
    present = set(x_new.columns.values.tolist())
    unwanted = list(present - set(self.x_cols))
    x_new.drop(unwanted, axis=1, inplace=True)
    self.logging(' shape: {}'.format(_shape(x_new)), level=logging.DEBUG)
    return x_new
def _fit(self, data, cols, is_categorical, cat2num):
    """Fit one k-nearest-neighbours model per column listed in ``cols``.

    Rows containing any missing value are excluded from training. Each
    model is trained to predict its column from all the other columns of
    the frame returned by ``cat2num.fit_transform``.

    Args:
        :param data: a Pandas Dataframe with at least 2 columns
        :param cols: the columns to build a model for
        :param is_categorical: one flag per entry of ``cols``; a truthy
            flag selects a KNeighborsClassifier, a falsy one a
            KNeighborsRegressor
        :param cat2num: object whose ``fit_transform(data, None)`` returns
            a pair; the first element is used as training frame
            (presumably a categorical-to-numeric encoder -- confirm)
    Returns:
        :return: a dict mapping each column to its fitted model, or
            ``data`` itself, untouched, when ``cols`` is empty
    Raises:
        AssertionError: if ``data`` has fewer than 2 columns or fewer than
            ``self.k`` rows are left once incomplete rows are removed
    """
    if len(cols) == 0:
        return data
    assert _shape(data)[1] >= 2, 'at least 2 columns are needed'

    # Row mask as a plain boolean array. ``Series.nonzero()`` no longer
    # exists in pandas >= 1.0, and filtering by mask (instead of dropping
    # positional ids as if they were labels) stays correct when the index
    # of ``data`` is not the default 0..n-1 range.
    has_nan = pandas.isnull(data).any(axis=1).values
    n_removed = int(has_nan.sum())
    _data = data[~has_nan].copy()

    self.logging('current shape: {}'.format(_shape(_data)))
    assert _shape(
        _data)[0] >= self.k, 'not enough rows: {} removed {} rows'.format(
            _shape(_data)[0], n_removed)

    _data, _ = cat2num.fit_transform(_data, None)

    models = {}
    for col, cat in zip(cols, is_categorical):
        if cat:
            models[col] = KNeighborsClassifier(n_neighbors=self.k)
        else:
            models[col] = KNeighborsRegressor(n_neighbors=self.k)
        # NOTE(review): feature order comes from set iteration and is kept
        # as-is on purpose; whatever code later calls predict presumably
        # builds its features the same way -- confirm before changing.
        x_train = _data[list(set(_data.columns.tolist()) - {col})]
        y_train = _data[col]
        self.logging(
            'fitting imputing_model for column: {} x_train.shape={} y_train.shape={}'
            .format(col, _shape(x_train), _shape(y_train)),
            level=logging.DEBUG)
        models[col].fit(x_train, y_train)
    return models
def transform_y(self, y):
    """Log the shape of y at DEBUG level, then delegate to ``_transform``.

    Args:
        :param y: the target to transform
    Returns:
        :return: the result of ``self._transform(y, self.on_y)``
    """
    shape_message = 'y shape: {}'.format(_shape(y))
    self.logging(shape_message, level=logging.DEBUG)
    transformed = self._transform(y, self.on_y)
    return transformed
def transform_x(self, x):
    """Log the shape of x at DEBUG level, then delegate to ``_transform``.

    Args:
        :param x: the dataset to transform
    Returns:
        :return: the result of ``self._transform(x, self.on_x)``
    """
    shape_message = 'x shape: {}'.format(_shape(x))
    self.logging(shape_message, level=logging.DEBUG)
    transformed = self._transform(x, self.on_x)
    return transformed
def _transform(self, data):
    """Return a copy of ``data`` without the entries in ``self._drop_indices``.

    The input object is not modified; the shape of the result is logged
    at DEBUG level.

    Args:
        :param data: the Pandas object to filter
    Returns:
        :return: the filtered copy of ``data``
    """
    remaining = data.copy()
    labels = self._drop_indices
    remaining.drop(labels, inplace=True)
    message = ' shape: {}'.format(_shape(remaining))
    self.logging(message, level=logging.DEBUG)
    return remaining
def _transform(self, data):
    """Return a copy of ``data`` with columns containing missing values removed.

    ``self.how`` is passed straight to ``dropna`` as its ``how`` argument.
    The input object is not modified; the shape of the result is logged
    at DEBUG level.

    Args:
        :param data: a Pandas Dataframe to filter
    Returns:
        :return: the filtered copy of ``data``
    """
    cleaned = data.copy()
    policy = self.how
    cleaned.dropna(axis=1, how=policy, inplace=True)
    message = ' shape: {}'.format(_shape(cleaned))
    self.logging(message, level=logging.DEBUG)
    return cleaned