def create_tr_tst_dfs(tr_set, tr_path, tst_set, tst_path, cl1, cl2, cl3, cl4,
                      cl1_name, cl2_name, cl3_name, cl4_name):
    """Build four-class train/test feature and label sets from CSV files.

    Every file named in ``tst_set`` (under ``tst_path``) and ``tr_set`` (under
    ``tr_path``) is read with ``pd.read_csv`` and concatenated. The ``Date``
    and ``Time`` columns are dropped and only rows whose ``Label`` equals one
    of ``cl1``..``cl4`` are kept. Features are every column but the last; the
    target is the last column.

    The training class distribution is displayed through
    ``show_class_distribution``. When the four training classes do not all
    have the same number of rows, each class is randomly under-sampled to the
    size of the smallest one, the before/after counts are printed and the
    resampled distribution is displayed as well.

    Args:
        tr_set: Iterable of training CSV file names.
        tr_path: Directory prefix joined to each training file name with '/'.
        tst_set: Iterable of test CSV file names.
        tst_path: Directory prefix joined to each test file name with '/'.
        cl1, cl2, cl3, cl4: The four ``Label`` values to keep.
        cl1_name, cl2_name, cl3_name, cl4_name: Display names forwarded to
            ``show_class_distribution``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``; the training pair is the
        under-sampled one whenever resampling took place.

    Raises:
        KeyError: If one of the four classes has no row in the training data.
    """
    classes = [cl1, cl2, cl3, cl4]

    def _load(file_names, base_path):
        # Read, concatenate, drop the timestamp columns and keep only the
        # rows that belong to one of the four requested classes.
        frames = [pd.read_csv('/'.join((base_path, f))) for f in file_names]
        merged = pd.concat(frames).drop(['Date', 'Time'], axis=1)
        return merged.loc[merged['Label'].isin(classes)]

    tst_df = _load(tst_set, tst_path)
    X_test, y_test = tst_df.iloc[:, :-1], tst_df.iloc[:, -1]

    tr_df = _load(tr_set, tr_path)

    # Show the training set class distribution
    show_class_distribution(tr_df, cl1, cl2, cl3, cl4, cl1_name, cl2_name,
                            cl3_name, cl4_name)

    X_train, y_train = tr_df.iloc[:, :-1], tr_df.iloc[:, -1]
    target_count = tr_df['Label'].value_counts()
    per_class = [target_count[c] for c in classes]

    # A chained ``a != b != c != d`` only compares neighbours and would treat
    # counts such as 5/3/3/5 as balanced; compare all four counts at once.
    if len(set(per_class)) == 1:
        return X_train, y_train, X_test, y_test

    min_n_samples = min(per_class)
    sampler = RandomUnderSampler(
        sampling_strategy={c: min_n_samples for c in classes})
    X_new, y_new = sampler.fit_resample(X_train, y_train)

    print('Original dataset shape {}'.format(Counter(y_train)))
    print('Resampled dataset shape {}'.format(Counter(y_new)))

    # Show the resampled training set class distribution. Work on a copy so
    # the feature frame handed back to the caller is never touched.
    resampled_df = X_new.copy()
    resampled_df.insert(len(resampled_df.columns), 'Label', y_new)
    show_class_distribution(resampled_df, cl1, cl2, cl3, cl4, cl1_name,
                            cl2_name, cl3_name, cl4_name)

    return X_new, y_new, X_test, y_test
Exemplo n.º 2
0
    def _smote(self, df):
        """Return SMOTE-oversampled training features and target from ``df``.

        The column named ``'y'`` is the target and every other column is a
        feature. The data is split 70/30 with a fixed seed; only the training
        part is oversampled and returned, the test part is discarded.
        """
        feature_mask = df.columns != 'y'
        target_mask = df.columns == 'y'
        features = df.loc[:, feature_mask]
        target = df.loc[:, target_mask]

        sampler = SMOTE(random_state=0)
        # train_test_split returns (X_train, X_test, y_train, y_test).
        parts = train_test_split(features,
                                 target,
                                 test_size=0.30,
                                 random_state=0)
        train_features, train_target = parts[0], parts[2]

        balanced_X, balanced_y = sampler.fit_resample(train_features,
                                                      train_target)
        return balanced_X, balanced_y
Exemplo n.º 3
0
def create_tr_tst_dfs(df,class1,class2):
    """Split ``df`` into train/test sets and balance the two training classes.

    Features are every column but the last; the target is the last column.
    The split keeps 33% of the rows for testing and is stratified on the
    target. When ``class1`` and ``class2`` have different row counts in
    ``df['Label']``, the majority class of the training part is randomly
    under-sampled, the before/after counts are printed and the resampled
    distribution is displayed through ``show_class_distribution``.

    NOTE(review): ``class1_name`` and ``class2_name`` are not parameters of
    this function; they are looked up in the enclosing module scope and must
    be defined there before the resampling branch runs -- confirm.

    Args:
        df: DataFrame with a ``Label`` column, with the target in the last
            column.
        class1: First ``Label`` value.
        class2: Second ``Label`` value.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``; the training pair is the
        under-sampled one whenever resampling took place.

    Raises:
        KeyError: If ``class1`` or ``class2`` does not occur in ``df['Label']``.
    """
    # Split into train and test
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, stratify=y)

    target_count = df['Label'].value_counts()
    if target_count[class1] == target_count[class2]:
        return X_train, y_train, X_test, y_test

    sampler = RandomUnderSampler(sampling_strategy='majority')
    X_new, y_new = sampler.fit_resample(X_train, y_train)

    print('Original dataset shape {}'.format(Counter(y_train)))
    print('Resampled dataset shape {}'.format(Counter(y_new)))

    # Show the resampled training set class distribution. Work on a copy so
    # the feature frame handed back to the caller is never touched.
    resampled_df = X_new.copy()
    resampled_df.insert(len(resampled_df.columns), 'Label', y_new)
    show_class_distribution(resampled_df, class1, class2, class1_name,
                            class2_name)

    return X_new, y_new, X_test, y_test
 def overSampling(self,ratio):
     """Randomly over-sample the stored training set in place.

     ``self.X_train_`` and ``self.Y_train_`` are replaced by their
     over-sampled versions and the resulting class counts are printed.

     Args:
         ratio: Sampling strategy handed to ``RandomOverSampler``.
     """
     from collections import Counter

     from imblearn.over_sampling import RandomOverSampler

     # Pass the strategy by keyword: current imbalanced-learn releases make
     # the constructor arguments keyword-only, so a positional value fails.
     sampler = RandomOverSampler(sampling_strategy=ratio)
     self.X_train_, self.Y_train_ = sampler.fit_resample(self.X_train_,
                                                         self.Y_train_)
     print('Over Sampled  Training Dataset Shape {}'.format(Counter(self.Y_train_)))
Exemplo n.º 5
0
# Restrict the frame to the columns listed in `to_keep` and report how many
# columns remain.
data_final = df[to_keep]
print(len(data_final.columns.values))

#data_final.to_csv(f"{path}data_lr5")

# Separate the features from the target column 'MCQ010'. Both selections use
# a boolean column mask, so `y` is a one-column DataFrame rather than a Series.
X = data_final.loc[:, data_final.columns != 'MCQ010']
y = data_final.loc[:, data_final.columns == 'MCQ010']

# Seeded SMOTE oversampler and a seeded 70/30 train/test split.
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
columns = X_train.columns

# Oversample the training split only; the test split is left as it is.
# The results are then wrapped in DataFrames carrying the original feature
# names and the target column name.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
os_data_X = pandas.DataFrame(data=os_data_X, columns=columns)
os_data_y = pandas.DataFrame(data=os_data_y, columns=['MCQ010'])

# Report the size and class balance of the oversampled training data.
# NOTE(review): the messages say "subscription", but the counts are of rows
# with MCQ010 == 0 and MCQ010 == 1 -- the wording may be carried over from
# another dataset; confirm.
print("length of oversampled data is ", len(os_data_X))
print("Number of no subscription in oversampled data",
      len(os_data_y[os_data_y['MCQ010'] == 0]))
print("Number of subscription", len(os_data_y[os_data_y['MCQ010'] == 1]))
print("Proportion of no subscription data in oversampled data is ",
      len(os_data_y[os_data_y['MCQ010'] == 0]) / len(os_data_X))
print("Proportion of subscription data in oversampled data is ",
      len(os_data_y[os_data_y['MCQ010'] == 1]) / len(os_data_X))

# List of every column name in `data_final`. From here on `y` no longer holds
# the target frame: it is rebound to a list containing the target column name.
data_final_vars = data_final.columns.values.tolist()
y = ['MCQ010']