def preprocess(data_path, save_path):
    """Read raw data, split it, oversample the training set, and save CSVs.

    Pipeline:
      1. Load the raw CSV from ``data_path``.
      2. Apply the project-level ``process_by_column`` transform.
      3. Stratified 80/20 train/test split on the ``stroke`` label.
      4. SMOTE-oversample the training partition only (never the test set,
         to keep evaluation honest).
      5. Write four CSVs (train/test data/label) under ``save_path``.

    Args:
        data_path: Path to the raw input CSV.
        save_path: Output prefix for the four CSV files. NOTE: file names are
            produced by plain string concatenation, so this must end with a
            path separator (e.g. ``"out/"``) to land inside a directory.

    Side effects:
        Writes ``train_data.csv``, ``train_label.csv``, ``test_data.csv``
        and ``test_label.csv`` (UTF-8, no index column) under ``save_path``.
    """
    data = pd.read_csv(data_path)
    preprocessed_data = process_by_column(data)

    # Stratify on the target so both partitions keep the class ratio.
    # NOTE(review): no random_state is set, so the split is not reproducible
    # across runs — confirm whether that is intentional.
    train, test = train_test_split(
        preprocessed_data,
        test_size=0.2,
        stratify=preprocessed_data[["stroke"]],
    )

    # Convention in this pipeline: the last column is the label,
    # everything before it is a feature.
    columns = list(preprocessed_data.columns)
    x_columns = columns[:-1]
    y_columns = [columns[-1]]

    # Oversample ONLY the training set; the test set stays untouched.
    # (Fixed typo: was ``final_trian_data``.)
    final_train_data, final_train_label = SMOTE().fit_resample(
        train[x_columns], train[y_columns]
    )
    final_test_data = test[x_columns]
    final_test_label = test[y_columns]

    final_train_data.to_csv(save_path + "train_data.csv", encoding="utf-8", index=False)
    final_train_label.to_csv(save_path + "train_label.csv", encoding="utf-8", index=False)
    final_test_data.to_csv(save_path + "test_data.csv", encoding="utf-8", index=False)
    final_test_label.to_csv(save_path + "test_label.csv", encoding="utf-8", index=False)
# Separate features from the churn target; user_id is an identifier, not a feature.
X = dataset_train.drop(['user_id', 'is_churned'], axis=1)
y = dataset_train['is_churned']

# Scale features to [0, 1] and persist the fitted scaler so the exact same
# transform can be applied at inference time.
scaler = MinMaxScaler()
X_mm = scaler.fit_transform(X)
with open('source/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Stratified 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_mm, y, test_size=0.3, shuffle=True, stratify=y, random_state=100
)

# Reduce class imbalance: oversample the minority class (label 1) with SMOTE
# up to 30% of the training-set size. Applied to the training partition only,
# so the test set reflects the real class distribution.
smote_on_1 = int(X_train.shape[0] * 3 / 10)
# BUGFIX: ``fit_sample`` was removed from imbalanced-learn (renamed to
# ``fit_resample`` in 0.4, removed in 0.8).
X_train_balanced, y_train_balanced = SMOTE(
    random_state=42, sampling_strategy={1: smote_on_1}
).fit_resample(X_train, y_train)

# Reassemble balanced train set as a labeled DataFrame and persist it.
X_train_balanced = pd.DataFrame(X_train_balanced, columns=X.columns)
X_train_balanced['is_churned'] = y_train_balanced.values
X_train_balanced.to_csv('dataset/dataset_train_balanced.csv', sep=';', index=False)

# Persist the (un-balanced) test set with its labels in the same format.
X_test = pd.DataFrame(X_test, columns=X.columns)
X_test['is_churned'] = y_test.values
X_test.to_csv('dataset/dataset_test_balanced.csv', sep=';', index=False)