예제 #1
0
    def transform(self, X: dd, y=None):
        """
        Drop the columns named in ``self.feature_names`` from ``X``.

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target; accepted but not used by this method.
                Defaults to None.

        Returns:
            (dd): Result of ``X.drop`` with those columns removed
        """
        columns_to_remove = self.feature_names
        reduced = X.drop(labels=columns_to_remove, axis=1)
        return reduced
예제 #2
0
    def remove_outliers(self, data: dataframe, threshold: float):
        """
        Drop rows whose z-score exceeds ``threshold`` in any continuous column.

        The input is materialised with ``data.compute`` and a z-score is
        computed for every column in ``self.cols["CONTINUOUS"]`` from that
        column's own mean and standard deviation. A row is kept only when
        all of its z-scores lie in ``[-threshold, threshold]`` (bounds
        inclusive). Rows with a missing value in a continuous column are
        dropped, because their z-score is NaN.

        Args:
            data (dataframe): Frame to filter; must provide
                ``compute(num_workers=...)``. NOTE(review): presumably a dask
                DataFrame whose ``compute`` returns a pandas DataFrame --
                confirm against the caller.
            threshold (float): Largest absolute z-score a row may have.

        Returns:
            The result of ``dataframe.from_pandas`` on the surviving rows,
            with the original columns, a fresh 0..n-1 index and
            ``npartitions=self.workers``.
        """
        continuous = self.cols["CONTINUOUS"]
        frame = data.compute(num_workers=self.workers)
        values = frame[continuous]

        # Frame-minus-Series arithmetic aligns on column labels, so each
        # column is standardised with its own mean and standard deviation.
        z_scores = (values - values.mean(axis=0)) / values.std(axis=0)

        # A column with zero or undefined spread (constant column, single
        # row) yields NaN z-scores for every present value, which would
        # otherwise discard the whole frame. Such values cannot be outliers,
        # so score them 0; genuinely missing values stay NaN and are dropped.
        z_scores = z_scores.where(z_scores.notna() | values.isna(), 0.0)

        # Build a row mask instead of appending temporary "z<col>" columns:
        # those could collide with real column names and then be dropped.
        keep = z_scores.abs().le(threshold).all(axis=1)

        return dataframe.from_pandas(
            # Positional mask avoids any index alignment on duplicate labels.
            frame.loc[keep.to_numpy()].reset_index(drop=True),
            npartitions=self.workers)