# Repeat the sampling experiment once per run (`repetitions` iterations).
for run in range(repetitions):
    logger.log_message(f'Starting run {run}')

    # Load the normalized train/test partitions. train_data is re-read on
    # every iteration because it is overwritten by the sampled frame below.
    # NOTE(review): test_data is not modified in the visible part of this
    # loop, so it could probably be loaded once before the loop - confirm.
    train_data = pd.read_hdf(data_path, 'partB_train_normalized')
    test_data = pd.read_hdf(data_path, 'partB_test_normalized')

    logger.log_message('Data imbalance levels before sampling')
    logger.log_message(
        get_binary_imbalance_ratio(train_data['exclusion']))

    # Split the training rows on the binary 'exclusion' label, then build
    # the resampled training frame from the two partitions.
    pos_train, neg_train = split_on_binary_attribute(
        train_data, attribute='exclusion', pos_label=1, neg_label=0)
    # NOTE(review): rus_rate is fed from `minority_ratio`; verify against
    # apply_ros_rus that this is the intended argument.
    train_data = apply_ros_rus(
        pos_train, neg_train, ros_rate=ros_rate, rus_rate=minority_ratio)

    # Drop the per-class frames once the resampled frame has been built.
    del pos_train
    del neg_train

    logger.log_message('Minority class ratio after sampling: ')
    logger.log_message(
        get_binary_imbalance_ratio(train_data['exclusion']))

    # separate features from labels
    train_y = train_data['exclusion']
    train_x = train_data.drop(columns=['exclusion'])
    test_y = test_data['exclusion']
    test_x = test_data.drop(columns=['exclusion'])

    # create subset of features
# -------------------------------------------------- #
# Read the training frame stored under `train_key` and log its class
# balance and row count before any resampling is applied.
train_data = pd.read_hdf(data_path, key=train_key)
logger.log_message('Data imbalance levels before sampling')
logger.log_message(get_imbalance_description(train_data['class']))
logger.log_message(f'Size of train data = {len(train_data)}')

# LOAD NORMALIZED TEST DATA
# -------------------------------------------------- #
test_data = pd.read_hdf(data_path, key=test_key)

# APPLY SAMPLING TO THE TRAINING DATA
# --------------------------------------------------
# Split the training rows on the binary 'class' label (1 = positive,
# 0 = negative) and replace train_data with the resampled frame.
pos_train, neg_train = split_on_binary_attribute(
    train_data, attribute='class', pos_label=1, neg_label=0)
train_data = apply_ros_rus(
    pos_train, neg_train, ros_rate=ros_rate, rus_rate=rus_rate)

# Drop the per-class frames once the resampled frame has been built.
del pos_train
del neg_train

# SEPARATE FEATURES/LABELS
# --------------------------------------------------
train_y = train_data['class']
train_x = train_data.drop(columns=['class'])
test_y = test_data['class']
test_x = test_data.drop(columns=['class'])

logger.log_message('Training data imbalance levels after sampling')
logger.log_message(get_imbalance_description(train_y))
logger.log_message('Test data imbalance levels')
logger.log_message(get_imbalance_description(test_y))