def _load_labelled_frames(file_names, folder, classes):
    """Read and stack the CSVs in *folder*, keeping only rows of *classes*."""
    frames = [pd.read_csv('/'.join((folder, name))) for name in file_names]
    merged = pd.concat(frames).drop(['Date', 'Time'], axis=1)
    return merged.loc[merged['Label'].isin(classes)]


def create_tr_tst_dfs(tr_set, tr_path, tst_set, tst_path, cl1, cl2, cl3, cl4,
                      cl1_name, cl2_name, cl3_name, cl4_name):
    """Build train/test features and labels for a four-class problem.

    The CSV files named in ``tst_set`` (under ``tst_path``) and ``tr_set``
    (under ``tr_path``) are concatenated, the 'Date' and 'Time' columns are
    dropped, and only rows whose 'Label' is one of ``cl1``..``cl4`` are kept.
    In each frame the last column is used as the target and every other
    column as a feature (presumably the last column is 'Label' -- confirm
    against the CSV layout).

    If the four classes do not all have the same number of training rows,
    the training set is randomly under-sampled so every class has as many
    rows as the rarest one. The test set is never resampled.

    Args:
        tr_set: Iterable of training CSV file names.
        tr_path: Directory containing the training files.
        tst_set: Iterable of test CSV file names.
        tst_path: Directory containing the test files.
        cl1, cl2, cl3, cl4: Label values of the four classes to keep.
        cl1_name, cl2_name, cl3_name, cl4_name: Display names forwarded to
            ``show_class_distribution``.

    Returns:
        ``(X_train, y_train, X_test, y_test)``; the training pair is the
        resampled one when under-sampling was applied.

    Raises:
        KeyError: If one of the four classes has no row in the training set.
    """
    classes = [cl1, cl2, cl3, cl4]

    tst_df = _load_labelled_frames(tst_set, tst_path, classes)
    X_test, y_test = tst_df.iloc[:, :-1], tst_df.iloc[:, -1]

    tr_df = _load_labelled_frames(tr_set, tr_path, classes)
    # Show the training set class distribution
    show_class_distribution(tr_df, cl1, cl2, cl3, cl4,
                            cl1_name, cl2_name, cl3_name, cl4_name)
    X_train, y_train = tr_df.iloc[:, :-1], tr_df.iloc[:, -1]

    target_count = tr_df.Label.value_counts()
    per_class = [target_count[label] for label in classes]
    min_n_samples = min(per_class)

    # A chained `a != b != c != d` only compares neighbours and is False as
    # soon as one adjacent pair matches, so e.g. 100/100/500/500 used to be
    # treated as balanced. Compare the extremes instead.
    if max(per_class) == min_n_samples:
        return X_train, y_train, X_test, y_test

    sampler = RandomUnderSampler(
        sampling_strategy={label: min_n_samples for label in classes})
    X_new, y_new = sampler.fit_resample(X_train, y_train)
    print('Original dataset shape {}'.format(Counter(y_train)))
    print('Resampled dataset shape {}'.format(Counter(y_new)))

    # Show the resampled training set class distribution. Work on a copy so
    # the returned feature frame never carries the temporary 'Label' column.
    resampled_df = X_new.copy()
    resampled_df.insert(len(resampled_df.columns), 'Label', y_new)
    show_class_distribution(resampled_df, cl1, cl2, cl3, cl4,
                            cl1_name, cl2_name, cl3_name, cl4_name)
    return X_new, y_new, X_test, y_test
def _smote(self, df):
    """Return a SMOTE-oversampled training split of ``df``.

    Every column not named 'y' is treated as a feature; the 'y' column is
    selected as a one-column frame. The data is split 70/30 with
    ``random_state=0`` and SMOTE (also ``random_state=0``) is fitted on the
    training portion only. The held-out portion is discarded.

    Returns:
        The ``(features, target)`` pair produced by ``SMOTE.fit_resample``.
    """
    target_mask = df.columns == 'y'
    features = df.loc[:, ~target_mask]
    target = df.loc[:, target_mask]

    smote = SMOTE(random_state=0)
    train_features, _, train_target, _ = train_test_split(
        features, target, test_size=0.30, random_state=0)

    resampled_features, resampled_target = smote.fit_resample(
        train_features, train_target)
    return resampled_features, resampled_target
def create_tr_tst_dfs(df, class1, class2):
    """Split ``df`` into train/test sets and balance the training part.

    The last column of ``df`` is the target and all preceding columns are
    features. The split is stratified on the target with a 33% test share.
    When ``class1`` and ``class2`` have different row counts in
    ``df.Label``, the majority class of the training split is randomly
    under-sampled; otherwise the split is returned untouched.

    NOTE(review): ``class1_name`` and ``class2_name`` are not parameters;
    they are resolved from the enclosing scope when the resampled
    distribution is shown.

    Returns:
        ``(X_train, y_train, X_test, y_test)``, with the training pair
        replaced by the resampled one when under-sampling was applied.
    """
    # Split into train and test
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, stratify=labels)

    counts = df.Label.value_counts()
    n_first, n_second = counts[class1], counts[class2]
    if n_first == n_second:
        # Already balanced: nothing to resample.
        return X_train, y_train, X_test, y_test

    sampler = RandomUnderSampler(sampling_strategy='majority')
    X_new, y_new = sampler.fit_resample(X_train, y_train)
    print(f'Original dataset shape {Counter(y_train)}')
    print(f'Resampled dataset shape {Counter(y_new)}')

    # Show the resampled training set class distribution: the label column is
    # attached only for the call and removed again before returning.
    X_new.insert(X_new.shape[1], 'Label', y_new)
    show_class_distribution(X_new, class1, class2, class1_name, class2_name)
    X_new.drop(columns=['Label'], inplace=True)
    return X_new, y_new, X_test, y_test
def overSampling(self, ratio):
    """Randomly over-sample the stored training split.

    ``self.X_train_`` and ``self.Y_train_`` are replaced by the output of
    ``RandomOverSampler.fit_resample``, and the resulting per-class counts
    are printed.

    Args:
        ratio: Forwarded to ``RandomOverSampler`` as ``sampling_strategy``.
    """
    from imblearn.over_sampling import RandomOverSampler
    from collections import Counter

    # Pass the strategy by keyword: current imbalanced-learn releases declare
    # the constructor arguments keyword-only, so a positional value raises
    # TypeError there.
    sampler = RandomOverSampler(sampling_strategy=ratio)
    self.X_train_, self.Y_train_ = sampler.fit_resample(
        self.X_train_, self.Y_train_)
    print('Over Sampled Training Dataset Shape {}'.format(
        Counter(self.Y_train_)))
# Keep only the selected columns and report how many there are.
data_final = df[to_keep]
print(data_final.shape[1])
#data_final.to_csv(f"{path}data_lr5")

# Separate the predictors from the 'MCQ010' target (kept as a one-column
# frame), then hold out 30% of the rows.
_is_target = data_final.columns == 'MCQ010'
X = data_final.loc[:, ~_is_target]
y = data_final.loc[:, _is_target]
# NOTE(review): `os` hides the stdlib module of the same name if this file
# imports it -- name kept because later code may read it.
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
columns = X_train.columns

# Oversample the training split only, then re-wrap the result as frames.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
os_data_X = pandas.DataFrame(data=os_data_X, columns=columns)
os_data_y = pandas.DataFrame(data=os_data_y, columns=['MCQ010'])

# we can Check the numbers of our data
_n_rows = len(os_data_X)
_n_zero = len(os_data_y[os_data_y['MCQ010'] == 0])
_n_one = len(os_data_y[os_data_y['MCQ010'] == 1])
print("length of oversampled data is ", _n_rows)
print("Number of no subscription in oversampled data", _n_zero)
print("Number of subscription", _n_one)
print("Proportion of no subscription data in oversampled data is ",
      _n_zero / _n_rows)
print("Proportion of subscription data in oversampled data is ",
      _n_one / _n_rows)

data_final_vars = data_final.columns.values.tolist()
# From here on `y` is the list of target column names, not the target frame.
y = ['MCQ010']