def train_xgboost(train_X, train_y, valid_X, valid_y, test_X, test_y, learning_rate=None, mask=None):
    """Train a GPU XGBoost binary classifier and report metrics on all splits.

    Args:
        train_X, train_y: training features and labels (labels as a DataFrame/Series).
        valid_X, valid_y: validation features and labels.
        test_X, test_y: test features and labels.
        learning_rate: step-size shrinkage; defaults to 0.01 when None.
        mask: optional column mask applied to all three feature sets via
            mask_columns before training.

    Returns:
        Tuple of (train, valid, test) results from get_metrics (verbose).
    """
    # BUG FIX: the passed-in learning_rate used to be unconditionally
    # overwritten with 0.01, making the parameter dead; now it is only
    # defaulted when the caller did not supply one.
    if learning_rate is None:
        learning_rate = 0.01
    n_estimators = 350
    subsample = 0.3

    if mask:
        train_X = mask_columns(train_X, mask)
        valid_X = mask_columns(valid_X, mask)
        test_X = mask_columns(test_X, mask)

    xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                                  tree_method='gpu_hist',
                                  predictor='gpu_predictor')
    xgb_model.learning_rate = learning_rate
    xgb_model.n_estimators = n_estimators
    xgb_model.subsample = subsample

    xgb_model.fit(train_X, train_y.values.ravel())

    train_pred = xgb_model.predict(train_X)
    valid_pred = xgb_model.predict(valid_X)
    test_pred = xgb_model.predict(test_X)

    return (get_metrics(train_y, train_pred, verbose=True),
            get_metrics(valid_y, valid_pred, verbose=True),
            get_metrics(test_y, test_pred, verbose=True))
def train_neural_net(train_X, train_y, valid_X, valid_y, test_X, test_y, plotting=False, save_model=False):
    """Train a small dense Keras binary classifier; report metrics on all splits.

    Args:
        train_X, train_y: training features (DataFrame) and labels.
        valid_X, valid_y: validation features and labels (also used by Keras
            as validation_data during fit).
        test_X, test_y: test features and labels.
        plotting: when True, plot the training/validation loss history.
        save_model: when True, save the trained model under `path`.

    Returns:
        Tuple of (train, valid, test) results from get_metrics (verbose).
    """
    # BUG FIX: '~' is not expanded by os.path.join, so saving previously
    # targeted a literal "~/Desktop/..." directory (the original author had
    # flagged this line as wrong). expanduser resolves it to the real home.
    path = os.path.expanduser('~/Desktop/keras_models')

    def scheduler(epoch, lr):
        # Hold the learning rate for the first 20 epochs, then decay it
        # exponentially each epoch.
        if epoch < 20:
            return lr
        return lr * math.exp(-0.2)

    learning_rate = 0.0006
    epochs = 22
    batch_size = 8500

    model = Sequential()
    model.add(Input(shape=(len(train_X.columns), )))
    model.add(Dense(1000, activation='relu'))
    model.add(Dense(1500, activation='relu'))
    model.add(Dense(400, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

    history = model.fit(train_X, train_y,
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks=[lr_scheduler],
                        verbose=0,
                        validation_data=(valid_X, valid_y))

    # BUG FIX: Sequential.predict_classes was removed in TensorFlow 2.6.
    # With a single sigmoid output, thresholding predict() at 0.5 produces
    # the same boolean labels the old `[1 in lst for lst in pred]`
    # conversion yielded.
    def _predict_labels(features):
        return [bool(p) for p in (model.predict(features) > 0.5).ravel()]

    train_pred = _predict_labels(train_X)
    valid_pred = _predict_labels(valid_X)
    test_pred = _predict_labels(test_X)

    if plotting:
        plot_history_loss(history, "Artificial Neural Network")
    if save_model:
        model.save(os.path.join(path, next_name(path, "model-{0}")))

    return (get_metrics(train_y, train_pred, verbose=True),
            get_metrics(valid_y, valid_pred, verbose=True),
            get_metrics(test_y, test_pred, verbose=True))
def train_svm_grid(train_path, test_path):
    """Sweep SVC hyper-parameters exhaustively, reporting metrics per combo.

    Loads train/test splits from the given paths, then fits one svm.SVC per
    (kernel, class_weight, gamma, C) combination. For each fit, metrics are
    computed on the test split first and the train split second; nothing is
    returned — get_metrics does the reporting.
    """
    train_X, train_y = prepare_data(train_path)
    test_X, test_y = prepare_data(test_path)

    kernels = ['rbf', 'linear', 'poly', 'sigmoid']
    class_weights = [None, 'balanced']
    gammas = ['auto', 'scale']
    c_values = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]

    # Flatten labels once; the same array is reused for every fit.
    labels = train_y.values.ravel()

    for kernel in kernels:
        for class_weight in class_weights:
            for gamma in gammas:
                for C in c_values:
                    model = svm.SVC(kernel=kernel, class_weight=class_weight,
                                    gamma=gamma, C=C)
                    model.fit(train_X, labels)
                    test_pred = model.predict(test_X)
                    train_pred = model.predict(train_X)
                    get_metrics(test_y, test_pred)
                    get_metrics(train_y, train_pred)
def train_cumulative_voting(train_X, train_y, valid_X, valid_y, test_X, test_y, threshold=0.5, mask=None):
    """Evaluate the CumulativeVoting baseline on the train/valid/test splits.

    No fitting step is performed — CumulativeVoting is used predict-only here
    (presumably a rule-based voter; confirm against its definition).

    Args:
        train_X, train_y, valid_X, valid_y, test_X, test_y: split features/labels.
        threshold: decision threshold forwarded to CumulativeVoting.
        mask: optional column mask applied to all feature sets.

    Returns:
        Tuple of (train, valid, test) results from get_metrics (verbose).
    """
    if mask:
        train_X = mask_columns(train_X, mask)
        valid_X = mask_columns(valid_X, mask)
        test_X = mask_columns(test_X, mask)

    voter = CumulativeVoting(threshold)

    # Predict every split first, then score them in the same order.
    predictions = [voter.predict(features) for features in (train_X, valid_X, test_X)]
    return tuple(
        get_metrics(labels, pred, verbose=True)
        for labels, pred in zip((train_y, valid_y, test_y), predictions)
    )
def train_svm(kernel, train_X, train_y, valid_X, valid_y, test_X, test_y, mask=None):
    """Fit an svm.SVC with the given kernel and report metrics on all splits.

    Args:
        kernel: SVC kernel name (e.g. 'rbf', 'linear').
        train_X, train_y, valid_X, valid_y, test_X, test_y: split features/labels.
        mask: optional column mask applied to all feature sets.

    Returns:
        Tuple of (train, valid, test) results from get_metrics (verbose).
    """
    if mask:
        train_X = mask_columns(train_X, mask)
        valid_X = mask_columns(valid_X, mask)
        test_X = mask_columns(test_X, mask)

    model = svm.SVC(kernel=kernel)
    model.fit(train_X, train_y.values.ravel())

    # Predict every split first, then score them in the same order.
    predictions = [model.predict(features) for features in (train_X, valid_X, test_X)]
    return tuple(
        get_metrics(labels, pred, verbose=True)
        for labels, pred in zip((train_y, valid_y, test_y), predictions)
    )
def train_majority_voting(train_X, train_y, valid_X, valid_y, test_X, test_y, vote_threshold=0.5, count_threshold=0.5, mask=None):
    """Evaluate the MajorityVoting baseline on the train/valid/test splits.

    No fitting step is performed — MajorityVoting is used predict-only here
    (presumably a rule-based voter; confirm against its definition).

    Args:
        train_X, train_y, valid_X, valid_y, test_X, test_y: split features/labels.
        vote_threshold: per-vote threshold forwarded to MajorityVoting.
        count_threshold: vote-count threshold forwarded to MajorityVoting.
        mask: optional column mask applied to all feature sets.

    Returns:
        Tuple of (train, valid, test) results from get_metrics (verbose).
    """
    if mask:
        train_X = mask_columns(train_X, mask)
        valid_X = mask_columns(valid_X, mask)
        test_X = mask_columns(test_X, mask)

    voter = MajorityVoting(vote_threshold, count_threshold)

    # Predict every split first, then score them in the same order.
    predictions = [voter.predict(features) for features in (train_X, valid_X, test_X)]
    return tuple(
        get_metrics(labels, pred, verbose=True)
        for labels, pred in zip((train_y, valid_y, test_y), predictions)
    )
def train_xgboost_gridsearch(train_X, train_y, valid_X, valid_y, test_X, test_y, learning_rate=None, mask=None):
    """Grid-search an XGBoost classifier (4-fold CV) and report split metrics.

    NOTE(review): `learning_rate` is accepted for signature parity with
    train_xgboost but is not used — the grid below fixes its value.
    NOTE(review): 'neg_mean_squared_error' scoring on a classifier looks
    unusual — confirm this is intentional.

    Args:
        train_X, train_y, valid_X, valid_y, test_X, test_y: split features/labels.
        learning_rate: unused (see note above).
        mask: optional column mask applied to all feature sets.

    Returns:
        Tuple of (train, valid, test) results from get_metrics (verbose).
    """
    if mask:
        train_X = mask_columns(train_X, mask)
        valid_X = mask_columns(valid_X, mask)
        test_X = mask_columns(test_X, mask)

    # Single-point grid: effectively one fixed configuration run through CV.
    search_space = {
        'learning_rate': [0.01],
        'n_estimators': [350],
        'subsample': [0.3]
    }

    estimator = xgb.XGBClassifier()
    search = GridSearchCV(estimator=estimator,
                          param_grid=search_space,
                          scoring='neg_mean_squared_error',
                          cv=4,
                          verbose=2)
    search.fit(train_X, train_y.values.ravel())

    # Predict every split with the refit best estimator, in train/valid/test order.
    predictions = [search.predict(features) for features in (train_X, valid_X, test_X)]

    print("Best parameters:", search.best_params_)

    return tuple(
        get_metrics(labels, pred, verbose=True)
        for labels, pred in zip((train_y, valid_y, test_y), predictions)
    )