def balance_data(self, data_model: DataModel, target_column_name: str) -> DataModel:
    """Attach a weights column to the dataframe held by ``data_model``.

    The weights are obtained in two steps: ``map_weights_by_cat_label``
    produces a mapping for the target column, and ``get_weights_list`` turns
    that mapping into the values stored under ``WEIGHTS_COLUMN``.
    (Presumably one weight per row, keyed by class label -- confirm against
    those helpers.)

    Args:
        data_model: Model whose dataframe receives the weights column.
        target_column_name: Name of the target column the weights are based on.

    Returns:
        The same ``data_model`` instance, with its dataframe updated.

    Warns:
        UserWarning: Always emitted, to remind the caller that the new
            column must be passed on to the estimator and data builder.
    """
    label_weights = self.map_weights_by_cat_label(data_model, target_column_name)
    frame = data_model.get_dataframe()
    row_weights = self.get_weights_list(frame, target_column_name, label_weights)

    # The extra column is easy to overlook downstream, so announce it loudly.
    message_template = (
        'note: adding weights column ({}), make sure it is passed to the estimator- and '
        'data builder!'
    )
    warnings.warn(message_template.format(WEIGHTS_COLUMN))

    frame[WEIGHTS_COLUMN] = row_weights
    data_model.set_dataframe(frame)
    return data_model
def balance_data(self, data_model: DataModel, target_column_name: str) -> DataModel:
    """Balance the data by repeating the shorter stack up to the longer one's length.

    ``prepare_data`` splits the data into a long and a short stack
    (presumably the majority and minority class rows -- confirm against
    ``prepare_data``). The short stack is repeated whole until it is at least
    as long as the long stack, trimmed with ``cut_df_to_length``, checked with
    ``validate_result`` and finally combined with the long stack through
    ``merge_stacks``.

    Args:
        data_model: Model whose dataframe is to be balanced.
        target_column_name: Name of the target column used to split the data.

    Returns:
        The same ``data_model`` instance, holding the merged dataframe.

    Raises:
        ValueError: If the short stack is empty while the long stack is not;
            no amount of repetition can reach the required length.
    """
    long_stack, short_stack = self.prepare_data(
        data_model=data_model, target_column_name=target_column_name)
    length_to_have = len(long_stack)
    short_length = len(short_stack)

    if short_length < length_to_have:
        if short_length == 0:
            # Repeating an empty frame never grows it; the previous loop
            # would have spun forever here.
            raise ValueError(
                'cannot balance data on column {!r}: the short stack is empty'.format(
                    target_column_name))
        # Ceiling division: the smallest number of whole copies that reaches
        # the target length. One concat avoids re-copying a growing frame.
        repeats = -(-length_to_have // short_length)
        short_stack = pd.concat([short_stack] * repeats)

    short_stack = self.cut_df_to_length(short_stack, length_to_have)
    self.validate_result(long_stack, short_stack)
    new_df = self.merge_stacks(long_stack, short_stack)
    data_model.set_dataframe(new_df)
    return data_model
def randomize_data(data: DataModel, seed: int):
    """Shuffle the rows of the dataframe held by ``data``.

    The full frame is resampled (``frac=1``) with ``seed`` as the random
    state, and the result is stored back on ``data``. Nothing is returned.

    Args:
        data: Model whose dataframe is shuffled.
        seed: Value passed to ``sample`` as ``random_state``.
    """
    shuffled = data.get_dataframe().sample(frac=1, random_state=seed)
    data.set_dataframe(shuffled)