Exemplo n.º 1
0
 def remove_outliers(self, df, outlier_removal_col):
     '''Return the rows of ``self._data_frame`` that lie inside the outlier bounds.

     The lower and upper bounds come from ``Stats.detect_outliers_z`` run on
     ``self._data_frame`` for ``outlier_removal_col``. A row is kept only when
     its value is strictly greater than the lower bound AND strictly less
     than the upper bound. ``self._data_frame`` itself is not reassigned.

     Args:
         df: Currently unused; the method always reads ``self._data_frame``.
             Kept so existing callers keep working.
             NOTE(review): confirm whether filtering ``df`` was intended.
         outlier_removal_col: Name of the column whose outliers are removed.

     Returns:
         A filtered frame derived from ``self._data_frame``.

     TODO: still need to check how this behaves for multiple columns.
     '''
     _, ol_lower_range, ol_upper_range = Stats.detect_outliers_z(
         self._data_frame, outlier_removal_col)
     column = self._data_frame[outlier_removal_col]
     # Apply both bounds in a single filter. Previously the upper-bound
     # filter was run on the unfiltered frame, which silently discarded the
     # lower-bound filter and left low outliers in the result.
     return self._data_frame.filter(
         (column > ol_lower_range) & (column < ol_upper_range))
Exemplo n.º 2
0
 def cap_outliers(self, outlier_replacement_col):
     '''Clamp ``outlier_replacement_col`` to the detected outlier bounds.

     Bounds are taken from ``Stats.detect_outliers_z`` on
     ``self._data_frame``. Values below the lower bound are replaced by the
     lower bound first; values above the upper bound are then replaced by
     the upper bound. The result is stored in ``self._data_frame`` and also
     returned.

     Args:
         outlier_replacement_col: Name of the column to cap.

     Returns:
         The updated ``self._data_frame``.
     '''
     _, lower_bound, upper_bound = Stats.detect_outliers_z(
         self._data_frame, outlier_replacement_col)

     # Pass 1: lift everything below the lower bound up to it.
     source = self._data_frame
     floored_value = when(
         source[outlier_replacement_col] < lower_bound,
         lower_bound,
     ).otherwise(source[outlier_replacement_col])
     self._data_frame = source.withColumn(
         outlier_replacement_col, floored_value)

     # Pass 2: pull everything above the upper bound down to it, working on
     # the frame produced by pass 1.
     floored = self._data_frame
     capped_value = when(
         floored[outlier_replacement_col] > upper_bound,
         upper_bound,
     ).otherwise(floored[outlier_replacement_col])
     self._data_frame = floored.withColumn(
         outlier_replacement_col, capped_value)

     return self._data_frame
Exemplo n.º 3
0
 def mode_impute_outliers(self, outlier_imputation_col):
     '''Replace outliers in ``outlier_imputation_col`` with a mode value.

     Bounds are taken from ``Stats.detect_outliers_z`` on
     ``self._data_frame``. The replacement value is whatever
     ``self.get_mode`` returns when given ``self._data_frame`` and the
     column taken from the frame returned by ``self.remove_outliers``.
     Every value strictly below the lower bound or strictly above the upper
     bound is overwritten with it. The result is stored in
     ``self._data_frame`` and also returned.

     Args:
         outlier_imputation_col: Name of the column to impute.

     Returns:
         The updated ``self._data_frame``.
     '''
     _, lower_bound, upper_bound = Stats.detect_outliers_z(
         self._data_frame, outlier_imputation_col)

     inliers = self.remove_outliers(self._data_frame, outlier_imputation_col)
     # NOTE(review): get_mode receives the full frame together with a column
     # reference from the filtered frame -- confirm this is what it expects.
     replacement = self.get_mode(
         self._data_frame, inliers[outlier_imputation_col])

     current = self._data_frame
     value = current[outlier_imputation_col]
     is_outlier = (value < lower_bound) | (value > upper_bound)
     self._data_frame = current.withColumn(
         outlier_imputation_col,
         when(is_outlier, replacement).otherwise(value))
     return self._data_frame
Exemplo n.º 4
0
 def mean_impute_outliers(self, outlier_imputation_col):
     '''Replace outliers in ``outlier_imputation_col`` with a mean value.

     Bounds are taken from ``Stats.detect_outliers_z`` on
     ``self._data_frame``. The replacement value is the average of the
     column over the frame returned by ``self.remove_outliers``. Every value
     strictly below the lower bound or strictly above the upper bound is
     overwritten with it. The result is stored in ``self._data_frame`` and
     also returned.

     Args:
         outlier_imputation_col: Name of the column to impute.

     Returns:
         The updated ``self._data_frame``.
     '''
     _, lower_bound, upper_bound = Stats.detect_outliers_z(
         self._data_frame, outlier_imputation_col)

     inliers = self.remove_outliers(self._data_frame, outlier_imputation_col)
     # Single-row aggregate; the first field of that row is the average.
     mean_row = inliers.agg(avg(outlier_imputation_col)).first()
     replacement = mean_row[0]

     current = self._data_frame
     value = current[outlier_imputation_col]
     is_outlier = (value < lower_bound) | (value > upper_bound)
     self._data_frame = current.withColumn(
         outlier_imputation_col,
         when(is_outlier, replacement).otherwise(value))
     return self._data_frame