Exemplo n.º 1
0
    def fill_nan_by_mode(self,
                         df,
                         df_features,
                         feature_name,
                         z_score=None,
                         _add_to_que=True):
        """

            Fill the nan values of a feature with the mode of that feature.

        Args:
            df: pd.Dataframe
                Pandas Dataframe

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            z_score: float or int
                When truthy, the mode is taken from the series returned by
                zcore_remove_outliers(df, feature_name, z_score) rather
                than from the raw column.

            _add_to_que: bool
                Forwarded to fill_nan_with_specfic_value.

        Raises:
            KeyError:
                If the feature is not listed in df_features.all_features().
        """

        check_if_feature_exists(df,
                                feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        if z_score:
            series_obj = zcore_remove_outliers(df,
                                               feature_name,
                                               z_score).dropna()
        else:
            series_obj = df[feature_name].dropna()

        mode_series = series_obj.mode()

        # No values left after dropping nans means there is no mode.
        # Previously this path left replace_value unbound and crashed with
        # an UnboundLocalError further down; skip the fill explicitly instead.
        if len(mode_series) == 0:
            print(f"Fill nan by mode skipped; no mode could be computed for the feature {feature_name}")
            return

        replace_value = mode_series[0]

        # The fill is only performed when the private
        # __test_cleaning_methods flag is falsy.
        if not self.__test_cleaning_methods:
            print("Fill nan by mode")
            self.fill_nan_with_specfic_value(df,
                                             df_features,
                                             feature_name=feature_name,
                                             replace_value=replace_value,
                                             _add_to_que=_add_to_que)
Exemplo n.º 2
0
    def fill_nan_with_specfic_value(self,
                                    df,
                                    df_features,
                                    feature_name,
                                    replace_value,
                                    _add_to_que=True):
        """

            Fill the nan values of a feature with a given value.

        Args:
            df: pd.Dataframe
                Pandas Dataframe; the feature's column is replaced on this
                object.

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            replace_value:
                Value written in place of nan. If it exposes an item()
                method, the result of item() is used instead.

            _add_to_que: bool
                Pushes the function to pipeline segment parent if set to 'True'.

        Raises:
            KeyError:
                If the feature is not listed in df_features.all_features().
        """

        check_if_feature_exists(df,
                                feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        # Unwrap objects that provide item() (e.g. numpy scalars); anything
        # without that method is used as given.
        try:
            replace_value = replace_value.item()
        except AttributeError:
            pass

        if not self.__test_cleaning_methods:
            # Assign the filled column back rather than calling
            # fillna(inplace=True) on df[feature_name]: the in-place form
            # operates on an intermediate Series and does not update df
            # under pandas Copy-on-Write.
            df[feature_name] = df[feature_name].fillna(replace_value)

            if _add_to_que:
                # locals() is snapshotted before any further local is bound
                # so the dict only holds the call's arguments.
                params_dict = locals()
                parameters = get_parameters(self.fill_nan_with_specfic_value)

                print("Replace nan with {0} on feature: {1}".format(replace_value,
                                                                    feature_name))

                self._DataPipelineSegment__add_function_to_que("fill_nan_with_specfic_value",
                                                               parameters,
                                                               params_dict)
Exemplo n.º 3
0
    def make_nan_assertions(self, df, df_features, feature_name,
                            _add_to_que=True):
        """

            Fill the nans of a bool feature when only one value is observed.

            If the non-nan values of the feature are all 1 (or all 0), the
            nans are filled with that same value through
            fill_nan_with_specfic_value.

        Args:
            df: pd.Dataframe
                Pandas Dataframe

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            _add_to_que: bool
                Pushes the function to pipeline segment parent if set to 'True'.

        Raises:
            KeyError:
                If the feature is not listed in df_features.all_features().

            UnsatisfiedRequirments:
                If the feature is not a bool feature, or its non-nan values
                are not a single 0 or a single 1.
        """
        check_if_feature_exists(df, feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        if feature_name not in df_features.bool_features():
            raise UnsatisfiedRequirments(f"{feature_name} must be a bool feature.")

        observed_values = df[feature_name].dropna().unique().tolist()

        # The length test comes first so the index access never sees an
        # empty list.
        has_single_bool_value = len(observed_values) == 1 and (
            observed_values[0] == 1 or observed_values[0] == 0)

        if not has_single_bool_value:
            raise UnsatisfiedRequirments(f"Boolean assertions can't be made with this given feature {feature_name}.")

        self.fill_nan_with_specfic_value(df,
                                         df_features,
                                         feature_name=feature_name,
                                         replace_value=int(observed_values[0] == 1),
                                         _add_to_que=_add_to_que)
Exemplo n.º 4
0
    def remove_nans(self,
                    df,
                    df_features,
                    feature_name,
                    _add_to_que=True):
        """

            Remove rows of data based on the given feature.

            Every row whose value for the feature is nan is dropped from the
            dataframe in place and the index is reset afterwards.

        Args:
            df: pd.Dataframe
                Pandas Dataframe

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            _add_to_que: bool
                Pushes the function to pipeline segment parent if set to 'True'.

        Raises:
            KeyError:
                If the feature is not listed in df_features.all_features().
        """
        check_if_feature_exists(df,
                                feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        if not self.__test_cleaning_methods:

            print(f"Remove data from rows where the feature {feature_name} is equal to nan")

            # Drop on the dataframe itself. Calling dropna(inplace=True) on
            # df[feature_name] only shrinks an intermediate Series and
            # leaves every row of df in place.
            df.dropna(subset=[feature_name],
                      inplace=True)
            df.reset_index(drop=True,
                           inplace=True)

            # NOTE(review): the column still exists in df after this call,
            # yet the feature is unregistered from df_features. Kept as-is
            # to preserve existing behaviour -- confirm this is intended.
            df_features.remove_feature(feature_name)

            if _add_to_que:

                # locals() is snapshotted before any further local is bound
                # so the dict only holds the call's arguments.
                params_dict = locals()
                parameters = get_parameters(self.remove_nans)

                self._DataPipelineSegment__add_function_to_que("remove_nans",
                                                               parameters,
                                                               params_dict)
Exemplo n.º 5
0
    def fill_nan_by_average(self,
                            df,
                            df_features,
                            feature_name,
                            z_score=None,
                            _add_to_que=True):
        """

            Fill the nan values of a feature with the mean of that feature.

        Args:
            df: pd.Dataframe
                Pandas Dataframe

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            z_score: float or int
                When truthy, the mean is taken from the series returned by
                zcore_remove_outliers(df, feature_name, z_score) rather
                than from the raw column.

            _add_to_que: bool
                Forwarded to fill_nan_with_specfic_value.

        Raises:
            KeyError:
                If the feature is not listed in df_features.all_features().

            UnsatisfiedRequirments:
                If the feature is not listed in
                df_features.continuous_numerical_features().

            ValueError:
                If z_score is truthy but neither a float nor an int.
        """

        check_if_feature_exists(df,
                                feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        if feature_name not in df_features.continuous_numerical_features():
            raise UnsatisfiedRequirments(f"{feature_name} must be a saved as float or integer in df_features")

        if z_score:
            if not isinstance(z_score, (float, int)):
                raise ValueError("Z-Score must be at numerical value.")

            series_obj = zcore_remove_outliers(df,
                                               feature_name,
                                               z_score).dropna()
        else:
            series_obj = df[feature_name].dropna()

        replace_value = series_obj.mean()

        # The fill is only performed when the private
        # __test_cleaning_methods flag is falsy.
        if not self.__test_cleaning_methods:
            print("Fill nan based on the average of the distribution.")
            self.fill_nan_with_specfic_value(df,
                                             df_features,
                                             feature_name=feature_name,
                                             replace_value=replace_value,
                                             _add_to_que=_add_to_que)
Exemplo n.º 6
0
    def drop_feature(self, df, df_features, feature_name, _add_to_que=True):
        """
            Drop a single feature from the dataframe and from df_features.

        Args:
            df: pd.Dataframe
                Pandas Dataframe; the column is dropped in place and the
                index is reset.

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            _add_to_que: bool
                Pushes the function to pipeline segment parent if set to 'True'.

        Raises:
            KeyError:
                If the feature is not listed in df_features.all_features().
        """
        check_if_feature_exists(df, feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        # With the private __test_cleaning_methods flag set, only the
        # validation above is performed.
        if self.__test_cleaning_methods:
            return

        print("Droping Feature: ", feature_name)

        df.drop(columns=feature_name, inplace=True)
        df.reset_index(drop=True, inplace=True)
        df_features.remove_feature(feature_name)

        if not _add_to_que:
            return

        # Snapshot the arguments before binding any other local so the dict
        # handed to the queue is unchanged.
        params_dict = locals()
        parameters = get_parameters(self.drop_feature)

        self._DataPipelineSegment__add_function_to_que(
            "drop_feature", parameters, params_dict)
Exemplo n.º 7
0
    def remove_features(self, df, df_features, feature_names,
                        _add_to_que=True):
        """

            Drop one or more features from the dataframe and df_features,
            then record the call on the pipeline segment when _add_to_que
            is set to True.

            A KeyError raised while handling a feature is ignored and the
            next feature is processed.

        Args:
            df:
                Pandas Dataframe to update.

            df_features:
                DataFrameTypes object to update.

            feature_names:
                Features to remove; a single string is treated as a list
                with one element.

            _add_to_que:
                Pushes the function to pipeline segment parent if set to 'True'.
        """
        feature_names = ([feature_names]
                         if isinstance(feature_names, str)
                         else feature_names)

        # The loop variable keeps its name because locals() is snapshotted
        # below and handed to the queue.
        for feature_n in feature_names:
            try:
                if feature_n in df_features.all_features():
                    df_features.remove_feature(feature_n)

                check_if_feature_exists(df, feature_n)
                df.drop(columns=[feature_n], inplace=True)
            except KeyError:
                # Best effort: move on to the next feature.
                continue

        if not _add_to_que:
            return

        params_dict = locals()
        parameters = get_parameters(self.remove_features)
        self._DataPipelineSegment__add_function_to_que("remove_features",
                                                       parameters,
                                                       params_dict)
Exemplo n.º 8
0
    def fill_nan_by_occurance_percentaile(self,
                                          df,
                                          df_features,
                                          feature_name,
                                          percentaile,
                                          z_score=None,
                                          _add_to_que=True):
        """

            Fill the nan values of a feature with the value whose relative
            frequency is closest to the requested percentage.

        Args:
            df: pd.Dataframe
                Pandas Dataframe

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            percentaile: float or int
                Target occurrence rate, expressed out of 100 (it is divided
                by 100 before being compared with the relative
                frequencies).

            z_score: float or int
                When truthy, frequencies are taken from the series returned
                by zcore_remove_outliers(df, feature_name, z_score) rather
                than from the raw column.

            _add_to_que: bool
                Forwarded to fill_nan_with_specfic_value.

        Raises:
            KeyError:
                If the feature is not listed in df_features.all_features().
        """
        check_if_feature_exists(df,
                                feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        if z_score:
            series_obj = zcore_remove_outliers(df,
                                               feature_name,
                                               z_score).dropna()
        else:
            series_obj = df[feature_name].dropna()

        # Without any value there is nothing to rank; argmin on an empty
        # array would raise, so skip the fill explicitly.
        if len(series_obj) == 0:
            print(f"Fill nan by occurance percentaile skipped; the feature {feature_name} has no values to count.")
            return

        # Frequencies are relative to the feature's own non-nan values.
        # Dividing by df.dropna().shape[0] counted only rows with no nan in
        # *any* column, which skews the rates and can be zero.
        occurance_rates = series_obj.value_counts(normalize=True)
        idx = (np.abs(np.asarray(occurance_rates) - (percentaile / 100))).argmin()
        replace_value = occurance_rates.index[idx]

        # The fill is only performed when the private
        # __test_cleaning_methods flag is falsy.
        if not self.__test_cleaning_methods:
            print("Fill nan by occurance percentaile.")
            self.fill_nan_with_specfic_value(df,
                                             df_features,
                                             feature_name=feature_name,
                                             replace_value=replace_value,
                                             _add_to_que=_add_to_que)
Exemplo n.º 9
0
    def fill_nan_by_distribution(self,
                                 df,
                                 df_features,
                                 feature_name,
                                 percentile,
                                 z_score=None,
                                 _add_to_que=True):
        """

            Fill nan by the distribution of data.

            The replacement value is np.percentile of the feature's non-nan
            values at the requested percentile.

        Args:
            df: pd.Dataframe
                Pandas Dataframe

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            percentile: float or int
                Percentile passed to np.percentile.

            z_score: float or int
                When truthy, the percentile is taken from the series
                returned by zcore_remove_outliers rather than from the raw
                column.

            _add_to_que: bool
                Pushes the function to pipeline segment parent if set to 'True'.

        Raises:
            KeyError:
                If the feature is not listed in df_features.all_features().

            ValueError:
                If z_score is truthy but neither a float nor an int.
        """
        check_if_feature_exists(df,
                                feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        if z_score:
            if not isinstance(z_score, (float, int)):
                raise ValueError("Z-Score must be at numerical value.")

            # The sorted series is only consumed by the outlier removal, so
            # it is built here instead of up front where it used to be
            # discarded whenever z_score was not given.
            if feature_name in df_features.continuous_numerical_features():
                sorted_series = df[feature_name].sort_values()
            else:
                # NOTE(review): head(float("inf")) appears intended to keep
                # every row of each group -- confirm it is accepted by the
                # pandas version in use.
                sorted_series = df.sort_values([feature_name],
                                               ascending=True).groupby(feature_name).head(float("inf"))[feature_name]

            series_obj = zcore_remove_outliers(sorted_series.to_frame(),
                                               feature_name,
                                               z_score).dropna()
        else:
            series_obj = df[feature_name].dropna()

        replace_value = np.percentile(series_obj, percentile)

        # The fill is only performed when the private
        # __test_cleaning_methods flag is falsy.
        if not self.__test_cleaning_methods:
            print(f"Fill nan on distribution; {percentile}% of {feature_name}")
            self.fill_nan_with_specfic_value(df,
                                             df_features,
                                             feature_name=feature_name,
                                             replace_value=replace_value,
                                             _add_to_que=_add_to_que)