def transform(self, X: dd, y=None):
    """Drop the columns listed in ``self.feature_names`` from ``X``.

    Args:
        X (dd): Dataframe to be processed.
        y (dd, optional): Target. Not used by this method.
            Defaults to None.

    Returns:
        (dd): ``X`` without the columns named in ``self.feature_names``.
    """
    # feature_names is read from the instance; it is expected to have been
    # populated beforehand (by the fit step, per the original documentation).
    columns_to_remove = self.feature_names
    trimmed = X.drop(labels=columns_to_remove, axis=1)
    return trimmed
def remove_outliers(self, data: dataframe, threshold: float):
    """Drop rows whose z-score exceeds ``threshold`` in any continuous column.

    The input is materialised via ``data.compute(num_workers=self.workers)``,
    a z-score is computed for every column listed in
    ``self.cols["CONTINUOUS"]``, and only rows with ``|z| <= threshold`` in
    all of those columns are kept. Rows holding a missing value in a
    continuous column are dropped, because a NaN z-score never satisfies
    the comparison.

    Args:
        data (dataframe): Lazy dataframe exposing ``compute`` (presumably a
            ``dask.dataframe`` DataFrame -- confirm against the file imports).
        threshold (float): Maximum absolute z-score a row may have
            (inclusive) in each continuous column.

    Returns:
        The filtered rows, index reset, wrapped again with
        ``dataframe.from_pandas`` using ``self.workers`` partitions.
    """
    data = data.compute(num_workers=self.workers)
    continuous = data[self.cols["CONTINUOUS"]]

    mean = continuous.mean(axis=0)
    std_dev = continuous.std(axis=0)

    # A constant column has std == 0 (and a single-row frame has std == NaN);
    # dividing by that would turn every z-score into NaN and silently discard
    # all rows. Dividing by 1 instead leaves the (zero) deviation as the score.
    safe_std = std_dev.where(std_dev > 0, 1.0)
    z_scores = (continuous - mean) / safe_std

    # One mask over all continuous columns instead of temporary "z<col>"
    # columns, which could collide with real column names in the data.
    keep = (z_scores.abs() <= threshold).all(axis=1)

    # Positional (numpy) mask: no index alignment, so a non-unique index
    # coming out of the partitions is not a problem.
    filtered = data.loc[keep.to_numpy()].reset_index(drop=True)
    return dataframe.from_pandas(filtered, npartitions=self.workers)