Example #1
    def fit(self, X: dd, y=None):
        """Calculate what columns should be removed, based on the defined thresholds

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            Filter_Entropy: The fitted transformer (self)
        """
        subset = X.select_dtypes(exclude=[np.number, "datetime64[ns]"])

        # Calculate the entropy column-wise
        entropies_df = subset.compute().apply(entropy,
                                              axis=0).to_frame(name="entropy")
        entropies_df.reset_index(inplace=True)
        entropies_df.rename(columns={"index": "column_name"}, inplace=True)
        entropies_df.sort_values(by="entropy", inplace=True, ascending=False)

        # Get thresholds and calculate what columns will be removed
        thresholds = [float(value) for value in self.entropy_thresholds]
        mask_entropy = entropies_df["entropy"].between(
            min(thresholds), max(thresholds), inclusive=self.inclusive)

        # Get list of columns to be removed
        self.feature_names = list(entropies_df.loc[~mask_entropy,
                                                   "column_name"].values)
        mask_removed = entropies_df["column_name"].isin(self.feature_names)
        entropies_df.loc[mask_removed, "filtered_entropy"] = 1

        return self
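
A minimal sketch of the interval logic used above, with invented column names, entropy values, and thresholds; note that recent pandas expects `inclusive` to be a string such as "both" or "neither" rather than the bool used in the factories below:

import pandas as pd

# Invented per-column entropies, standing in for the apply(entropy, axis=0) result
entropies_df = pd.DataFrame({
    "column_name": ["city", "country", "user_id"],
    "entropy": [0.4, 1.2, 5.3],
})

thresholds = [0.5, 2.0]  # assumed entropy_thresholds
# Columns whose entropy lies inside [min, max] are kept
mask_entropy = entropies_df["entropy"].between(
    min(thresholds), max(thresholds), inclusive="neither")

# Columns outside the interval are the ones fit() records in self.feature_names
removed = list(entropies_df.loc[~mask_entropy, "column_name"].values)
print(removed)  # ['city', 'user_id']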
Example #2
def make_filter_std_pipeline(data: dd,
                             numerical_columns: list[str] | bool = True,
                             thresholds: list[float] | None = None,
                             inclusive: bool = False):
    # TODO: write unit tests
    """
    Makes pipeline to filter columns according to standard deviation

    Args:
        data (dd): Data frame to be filtered
        numerical_columns (list or bool, optional): Columns to subset the filtering. Defaults to True.
        thresholds (list, optional): Interval of std values to filter. Defaults to None.
        inclusive (bool, optional):  Includes or not the interval boundaries. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter data frame
    """
    # If a bool is passed, fall back to every numerical column in the frame
    selected_columns = data.select_dtypes(
        include=[np.number]).columns.values if isinstance(
            numerical_columns, bool) else numerical_columns
    steps = [("extract", Extract(selected_columns)),
             ("std_filter",
              Filter_Std(std_thresholds=thresholds, inclusive=inclusive))]

    return EPipeline(steps)
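
A small sketch, with invented data, of how the default numerical_columns=True resolves to the frame's numeric columns (an explicit list would be passed through unchanged):

import dask.dataframe as dd
import numpy as np
import pandas as pd

pdf = pd.DataFrame({"age": [1, 2, 3], "name": ["a", "b", "c"], "score": [0.1, 0.2, 0.3]})
data = dd.from_pandas(pdf, npartitions=1)

# This mirrors the isinstance(numerical_columns, bool) branch above
print(data.select_dtypes(include=[np.number]).columns.values)  # ['age' 'score']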
Example #3
def make_filter_entropy_pipeline(data: dd,
                                 categorical_columns: list[str] | bool = True,
                                 thresholds: list[float] | None = None,
                                 inclusive: bool = False):
    # TODO: write unit tests
    """
    Makes a pipeline to filter columns according to their entropy

    Args:
        data (dd): Data frame to be filtered
        categorical_columns (list or bool, optional): Columns to subset the filtering; True selects all object columns. Defaults to True.
        thresholds (list, optional): Interval of entropy values to keep. Defaults to None.
        inclusive (bool, optional): Whether the interval boundaries are included. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter the data frame
    """
    # If a bool is passed, fall back to every categorical (object) column
    selected_columns = data.select_dtypes(
        exclude=[np.number], include=["object"]).columns.values if isinstance(
            categorical_columns, bool) else categorical_columns
    steps = [("extract", Extract(selected_columns)),
             ("entropy_filter",
              Filter_Entropy(entropy_thresholds=thresholds,
                             inclusive=inclusive))]

    return EPipeline(steps)
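
The categorical counterpart resolves its default the same way, keeping only object-typed columns (again with invented data); the .columns.values access matters here because Extract presumably expects column names rather than a frame:

import dask.dataframe as dd
import numpy as np
import pandas as pd

pdf = pd.DataFrame({"age": [1, 2, 3], "city": ["a", "b", "c"], "country": ["x", "y", "z"]})
data = dd.from_pandas(pdf, npartitions=1)

# This mirrors the isinstance(categorical_columns, bool) branch above
print(data.select_dtypes(exclude=[np.number], include=["object"]).columns.values)
# ['city' 'country']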
Example #4
    def fit(self, X: dd, y=None):
        """Calculate what columns should be removed, based on the defined thresholds

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            Filter_Std: The fitted transformer (self)
        """
        subset = X.select_dtypes(include=[np.number])

        # Calculate the standard deviation column-wise
        stds = np.nanstd(subset, axis=0)

        stds_df = pd.DataFrame.from_dict({
            "column_name": subset.columns.values,
            "std": stds
        })

        stds_df.sort_values(by="std", inplace=True, ascending=False)

        # Get thresholds and calculate what columns will be removed
        thresholds = [float(value) for value in self.std_thresholds]
        mask_variance = stds_df["std"].between(min(thresholds),
                                               max(thresholds),
                                               inclusive=self.inclusive)

        # Get list of columns to be removed
        self.feature_names = list(stds_df.loc[~mask_variance,
                                              "column_name"].values)
        mask_removed = stds_df["column_name"].isin(self.feature_names)

        stds_df.loc[mask_removed, "filtered_variance"] = 1
        stds_df.loc[~mask_removed, "filtered_variance"] = 0

        return self
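
To make the standard-deviation thresholding concrete, a self-contained sketch with invented data and thresholds (again using the string form of `inclusive` that recent pandas expects):

import numpy as np
import pandas as pd

subset = pd.DataFrame({
    "constant": [1.0, 1.0, 1.0],   # std = 0.0
    "noisy": [0.0, 10.0, 20.0],    # std ~ 8.16
    "mild": [1.0, 2.0, 3.0],       # std ~ 0.82
})

# Column-wise standard deviation, ignoring NaNs, as in fit()
stds = np.nanstd(subset, axis=0)
stds_df = pd.DataFrame.from_dict({
    "column_name": subset.columns.values,
    "std": stds,
})

thresholds = [0.5, 5.0]  # assumed std_thresholds
mask_variance = stds_df["std"].between(
    min(thresholds), max(thresholds), inclusive="both")

# Columns whose std falls outside the interval are the ones fit() marks for removal
feature_names = list(stds_df.loc[~mask_variance, "column_name"].values)
print(feature_names)  # ['constant', 'noisy']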