def get_model(inputSize, classWeight):
    """Build an unfitted model for the backend selected by ``aiLib``.

    ``aiLib`` is read from the enclosing module scope and selects a Keras
    network, a scikit-learn ``SVC`` or an ``XGB`` model.

    Args:
        inputSize: Number of input features; only used for the input layer
            of the Keras network.
        classWeight: Forwarded as ``class_weight`` to the ``SVC`` / ``XGB``
            constructors. The Keras branch does not use it.

    Returns:
        The constructed model (already compiled in the Keras case).

    Raises:
        ValueError: If ``aiLib`` is not ``'keras'``, ``'sklearn'`` or
            ``'xgboost'``.
    """
    if aiLib == 'keras':
        model = Sequential()
        model.add(Dense(
            units=1000,
            activation='tanh',
            input_shape=(inputSize,),
            kernel_initializer='lecun_normal',
            kernel_regularizer=regularizers.l2(0.01),
        ))
        model.add(Dropout(rate=0.5))
        model.add(Dense(
            units=1000,
            activation='tanh',
            bias_initializer='lecun_normal',
            bias_regularizer=regularizers.l2(0.01),
        ))
        model.add(Dropout(rate=0.5))
        model.add(Dense(units=10, activation='softmax'))
        # NOTE: an SGD optimizer used to be instantiated here but was never
        # handed to compile(); the network has always been trained with
        # 'adam', so the dead object was dropped.
        model.compile(optimizer='adam', loss='categorical_crossentropy')
    elif aiLib == 'sklearn':
        model = SVC(probability=True, class_weight=classWeight)
    elif aiLib == 'xgboost':
        # NOTE(review): `probability` and `class_weight` are SVC-style
        # keywords - confirm that the XGB wrapper in use accepts them.
        model = XGB(
            probability=True,
            class_weight=classWeight,
            eta=1e-3,
            objective='multi:softprob',
            num_class=10,
            max_depth=20,
        )
    else:
        # Previously this fell through to `return model` with `model`
        # unbound, producing an opaque UnboundLocalError.
        raise ValueError(
            "Unknown aiLib {!r}; expected 'keras', 'sklearn' or "
            "'xgboost'".format(aiLib))
    return model
def combinedTwo(X_train, X_test, y_train):
    """Predict classes by combining an XGB and a random-forest classifier.

    An ``XGB`` model is trained on log10 FFT magnitudes and an ``RFC`` model
    on per-channel mean/std features. For every test sample the class with
    the highest probability reported by either classifier wins.

    Args:
        X_train: Training samples; indexed as a 3-D array
            (``X[:, 4:]`` and reductions over axis 2).
        X_test: Test samples with the same layout as ``X_train``.
        y_train: Training labels.

    Returns:
        1-D array with, per test sample, the index of the winning
        probability column.
    """

    def fft_log10_features(X):
        # Second-axis entries from index 4 on, first 63 FFT bins,
        # log-compressed magnitude (+1 keeps log10 finite at zero),
        # flattened to one row per sample.
        spectrum = np.log10(np.abs(np.fft.fft(X[:, 4:]))[:, :, :63] + 1)
        return spectrum.reshape([
            np.shape(spectrum)[0],
            np.shape(spectrum)[1] * np.shape(spectrum)[2],
        ])

    def mean_std_features(X):
        # Mean and standard deviation over the last axis, side by side.
        return np.hstack([np.mean(X, 2), np.std(X, 2)])

    # Predict with XGB for fftlog10
    clf1 = XGB(n_estimators=1000, gamma=0.87)
    clf1.fit(fft_log10_features(X_train), y_train)
    p1 = clf1.predict_proba(fft_log10_features(X_test))

    # Predict with RF for mean_std
    clf2 = RFC(n_estimators=1000)
    clf2.fit(mean_std_features(X_train), y_train)
    p2 = clf2.predict_proba(mean_std_features(X_test))

    # Take the prediction of whichever classifier is most sure.
    # NOTE(review): argmax yields column indices, which equal the labels
    # only if the labels are 0..k-1 - confirm against the caller.
    p = np.stack([p1, p2])
    predicted_classes = np.argmax(np.max(p, 0), 1)
    return predicted_classes
def try_params(n_iterations, params, data, get_predictions=False):
    """Train and evaluate an XGB classifier for one hyper-parameter sample.

    Args:
        n_iterations: Budget multiplier; the number of trees is
            ``round(n_iterations * trees_per_iteration)``.
        params: Keyword arguments forwarded to the ``XGB`` constructor.
        data: Passed unchanged to ``train_and_eval_sklearn_classifier``.
        get_predictions: Accepted but not used by this function.

    Returns:
        Whatever ``train_and_eval_sklearn_classifier`` returns.
    """
    tree_count = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", tree_count)
    pprint(params)
    return train_and_eval_sklearn_classifier(
        XGB(n_estimators=tree_count, nthread=-1, **params),
        data,
    )
def try_params(n_iterations, params, data, get_predictions=False):
    """Train and evaluate an XGB regressor for one hyper-parameter sample.

    Args:
        n_iterations: Budget multiplier; the number of trees is
            ``round(n_iterations * trees_per_iteration)``.
        params: Keyword arguments forwarded to the ``XGB`` constructor.
        data: Passed unchanged to ``train_and_eval_sklearn_regressor``.
        get_predictions: Accepted but not used by this function.

    Returns:
        Whatever ``train_and_eval_sklearn_regressor`` returns.
    """
    tree_count = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", tree_count)
    pprint(params)
    return train_and_eval_sklearn_regressor(
        XGB(n_estimators=tree_count, nthread=-1, **params),
        data,
    )
def fast_gbtree_classifier(
    X,
    y,
    *,
    learning_rate: float = 1.0,
    n_estimators: int = 100,
    subsample: float = 0.8,
    max_depth: Optional[int] = None,
    reg_alpha: Optional[float] = None,  # L1
    reg_lambda: Optional[float] = 1e-05,  # L2
    gamma: Optional[float] = None,
    missing: Optional[Any] = np.nan,
    objective: Objectives = 'binary:logistic',
    grow_policy: Literal['depthwise', 'lossguide'] = 'depthwise',
    tree_method: Literal['auto', 'exact', 'approx', 'hist',
                         'gpu_hist'] = 'auto',
    importance_type: Literal['gain', 'weight', 'cover', 'total_gain',
                             'total_cover'] = 'gain',
    random_state: int = 1,
    n_jobs: Optional[int] = None,
    framework: Literal['auto', 'xgboost', 'sklearn'] = 'auto',
    **kwargs,
) -> GradientBoostingClassifier:
    """Shared interface for XGBoost and sklearn Gradient Boosting Tree Classifier

    Fits and returns a tree-ensemble classifier. Unless ``framework`` is
    ``'sklearn'`` the xgboost implementation is tried first; when xgboost
    cannot be imported a warning is issued and sklearn's
    ``GradientBoostingClassifier`` is used instead.

    Args:
        X: Training features, passed to ``fit``.
        y: Training targets, passed to ``fit``.
        framework: ``'sklearn'`` forces the sklearn estimator; ``'auto'``
            and ``'xgboost'`` both prefer xgboost with the sklearn fallback.
        **kwargs: Extra estimator keywords; they override the named
            keyword arguments of the same name.

    All remaining keyword-only arguments are forwarded to the estimator
    constructor. For the sklearn estimator only those keywords that appear
    in its ``__init__`` signature are kept.

    Returns:
        The fitted estimator.
        NOTE(review): on the xgboost path this is an ``XGBRFClassifier``,
        not the annotated ``GradientBoostingClassifier``.
    """
    # Must stay the first statement: locals() is used as a snapshot of the
    # call arguments, so no other local may exist yet.
    kw = dict(locals())
    kwargs = kw.pop('kwargs')
    X = kw.pop('X')
    y = kw.pop('y')
    kw.update(kwargs)
    framework = kw.pop('framework')
    ### XGBOOST
    is_xgboost = False
    if framework == 'sklearn':
        XGB = GradientBoostingClassifier
    else:
        try:
            from xgboost import XGBRFClassifier as XGB
            is_xgboost = True
        except ImportError:
            warn('Run `pip install xgboost` to get significant '
                 'faster GradientBoostingTree')
            XGB = GradientBoostingClassifier
    ### fine-tune the keywords for sklearn
    if not is_xgboost:
        # Drop every keyword the sklearn constructor does not declare
        # (e.g. xgboost-only options such as `tree_method`).
        spec = inspect.getfullargspec(XGB.__init__)
        org = dict(kw)
        kw = {k: org[k] for k in spec.args + spec.kwonlyargs if k in org}
    ### training
    tree = XGB(**kw).fit(X, y)
    return tree
def baseline(X_train, X_test, y_train):
    """Predict test labels with a single XGB model on log10 FFT features.

    Args:
        X_train: Training samples; indexed as a 3-D array (``X[:, 4:]``
            followed by a slice of the last axis).
        X_test: Test samples with the same layout as ``X_train``.
        y_train: Training labels.

    Returns:
        The output of ``clf1.predict`` for the test features.
    """

    def fft_log10_features(X):
        # Second-axis entries from index 4 on, first 63 FFT bins,
        # log-compressed magnitude (+1 keeps log10 finite at zero),
        # flattened to one row per sample.
        spectrum = np.log10(np.abs(np.fft.fft(X[:, 4:]))[:, :, :63] + 1)
        return spectrum.reshape([
            np.shape(spectrum)[0],
            np.shape(spectrum)[1] * np.shape(spectrum)[2],
        ])

    # Predict with XGB for fftlog10
    clf1 = XGB(n_estimators=300, gamma=0.87)
    clf1.fit(fft_log10_features(X_train), y_train)
    return clf1.predict(fft_log10_features(X_test))
def main(args):
    """Load the pickled dataset, then run k-fold validation or one split.

    Reads from ``args``: ``all_feats``, ``pruned_ds``, ``full_ds``,
    ``num_folds``, ``algorithm`` and ``seed``.

    With ``args.num_folds > 0`` the data goes through ``kfold_validation``
    and the fold scores are printed. Otherwise a single 85/15 split is
    made, the model named by ``args.algorithm`` (``'NB'``, ``'RF'`` or
    ``'XGB'``) is fitted and handed to ``test_model``.

    Returns:
        ``0`` on completion.

    Raises:
        ValueError: If ``args.algorithm`` is not a known name on the
            single-split path.
    """
    start = time.time()
    dataset_path = args.full_ds if args.all_feats else args.pruned_ds
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file - only point this at trusted dataset files.
    with open(dataset_path, 'rb') as infile:
        data = pickle.load(infile)
    # Keep the second element of every record; column 0 of the resulting
    # array is the label, the remaining columns are the features.
    data = np.array([feats[1] for feats in data])
    X = data[:, 1:]
    y = data[:, 0]
    if args.num_folds > 0:
        print(f'Performing {args.num_folds}-fold validation')
        f_scores = kfold_validation(X, y, algorithm=args.algorithm,
                                    num_folds=args.num_folds)
        accs = kfold_scores(f_scores)
        print(f_scores)
        print(f'Average accuracy of {args.num_folds}-folds: {100*accs[0]:.2f}%')
        print(f'Best accuracy of {args.num_folds}-folds: {100*accs[1]:.2f}%')
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.15, random_state=args.seed)
        print(f'Train data: {X_train.shape}, train labels: {y_train.shape}')
        # Was printing y_train.shape under the "test labels" caption.
        print(f'Test data: {X_test.shape}, test labels: {y_test.shape}')
        if args.algorithm == 'NB':
            model = BernoulliNB().fit(X_train, y_train)
        elif args.algorithm == 'RF':
            model = RandomForest(n_estimators=100, max_depth=10,
                                 n_jobs=os.cpu_count(),
                                 verbose=2).fit(X_train, y_train)
        elif args.algorithm == 'XGB':
            model = XGB(verbosity=1, n_estimators=1000, max_depth=8,
                        reg_lambda=1e-2, reg_alpha=4).fit(
                            X_train, y_train,
                            eval_set=[(X_test, y_test)],
                            eval_metric='logloss',
                            verbose=True,
                            early_stopping_rounds=20)
        else:
            # Previously an unknown name left `model` unbound and crashed
            # later with a NameError.
            raise ValueError(f'Unknown algorithm: {args.algorithm!r}')
        # test model
        test_model(model, X_test, y_test)
    print(f'Script completed in {time.time()-start:.2f} secs')
    return 0
def try_params(n_iterations, params):
    """Evaluate one of the configurations drawn from the sample.

    The classifier is trained and evaluated on ``data``, which is not a
    parameter: it is read from the enclosing scope.

    :param n_iterations: Increase in tree estimators that will be added per
        iteration.
    :param params: Configuration for the corresponding model.
    :returns: The configuration with the corresponding treatment, as
        returned by ``train_and_eval_sklearn_classifier``.
    """
    tree_count = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", tree_count)
    pprint(params)
    return train_and_eval_sklearn_classifier(
        XGB(n_estimators=tree_count, nthread=-1, **params),
        data,
    )
def kfold_validation(features, labels, algorithm='XGB', num_folds=2):
    """Run k-fold validation and collect train/validation scores per fold.

    Args:
        features: Indexable feature array (indexed with integer arrays).
        labels: Indexable label array aligned with ``features``.
        algorithm: ``'NB'``, ``'RF'`` or ``'XGB'``.
        num_folds: Number of ``KFold`` splits.

    Returns:
        ``{'train': [...], 'val': [...]}`` holding ``model.score`` for each
        fold. The same dict is also pickled to
        ``fold_accs_random_forest.npy``.

    Raises:
        ValueError: If ``algorithm`` is not a known name.
    """
    kf = KFold(n_splits=num_folds)
    fold_scores = {'train': [], 'val': []}
    for fold_num, (train_idx, val_idx) in enumerate(kf.split(features),
                                                    start=1):
        print(f'Training on fold {fold_num}')
        X_train, y_train = features[train_idx], labels[train_idx]
        X_val, y_val = features[val_idx], labels[val_idx]
        # Dispatch on the `algorithm` parameter; the code used to read
        # `args.algorithm`, which is not defined in this function.
        if algorithm == 'NB':
            model = BernoulliNB().fit(X_train, y_train)
        elif algorithm == 'RF':
            model = RandomForest(n_estimators=100, max_depth=10,
                                 n_jobs=os.cpu_count(),
                                 verbose=2).fit(X_train, y_train)
        elif algorithm == 'XGB':
            model = XGB(verbosity=1, n_estimators=1000, max_depth=3,
                        reg_lambda=1, reg_alpha=1e-4).fit(
                            X_train, y_train,
                            eval_set=[(X_val, y_val)],
                            eval_metric='logloss',
                            verbose=True,
                            early_stopping_rounds=20)
        else:
            raise ValueError(f'Unknown algorithm: {algorithm!r}')
        train_score = model.score(X_train, y_train)
        fold_scores['train'].append(train_score)
        val_score = model.score(X_val, y_val)
        fold_scores['val'].append(val_score)
        print(f'Fold {fold_num}: training score = {train_score}, validation score = {val_score}')
    # NOTE(review): the file name is fixed regardless of `algorithm`, and
    # the content is a pickle despite the .npy suffix.
    with open('fold_accs_random_forest.npy', 'wb') as outfile:
        pickle.dump(fold_scores, outfile)
    return fold_scores
clf = GridSearchCV(RFC(), RFC_tuned_parameter, cv=7, scoring='%s' % score) elif (x == 9): clf = GridSearchCV(ABC(), ABC_tuned_parameter, cv=7, scoring='%s' % score) elif (x == 10): clf = GridSearchCV(GBC(), GBC_tuned_parameter, cv=7, scoring='%s' % score) elif (x == 11): clf = GridSearchCV(XGB(), XGB_tuned_parameter, cv=7, scoring='%s' % score) print("Check Point"), y_train) print() print("Grid scores on development set:") print() means = clf.cv_results_['mean_test_score'] stds = clf.cv_results_['std_test_score'] # for mean, std, params in zip(means, stds, clf.cv_results_['params']): # print("%0.3f (+/-%0.03f) for %r"
# Histogram bin edges: BINS[i] = NUM * i for i = 0..NUM (float array).
BINS = np.zeros((NUM + 1))
for i in range(1, NUM + 1):
    BINS[i] = NUM * i

#Loading data
X, Y = Loader.data_load(CLASS, PARTS, PATH)

#Using colour histograms if needed
X = Loader.histogram(X, BINS, NUM)

#preprocessing and data split if needed
X, TRAIN_IND, TEST_IND = Loader.preproc(X)
print(X.shape)
print(X[TRAIN_IND].shape, X[TEST_IND].shape)
# NOTE(review): eval_set is built but not passed to fit() below.
eval_set = [X[TEST_IND], Y[TEST_IND]]

#Creating and fitting the model
model = XGB(max_depth=DEPTH,
            n_estimators=ESTIMATORS,
            learning_rate=RATE,
            nthread=4).fit(X[TRAIN_IND], Y[TRAIN_IND])

# Predict on a copy of the test rows rather than on X itself.
NANI = np.copy(X[TEST_IND])

#Predictions for train data
#Y_P = model.predict(X[TRAIN_IND])
#accuracy = score(Y[TRAIN_IND], Y_P.round())
#print('TRAIN ACCURACY = ', accuracy*100, '%')

#Predictions for test data
Y_P = model.predict(NANI)
accuracy = score(Y[TEST_IND], Y_P.round())
print('TEST ACCURACY = ', accuracy * 100, '%')
print(f"Saving tfidf count vector to {file_name}") joblib.dump(cv, file_name) # Feature Scaling # ============================================================================= # from sklearn.preprocessing import StandardScaler # sc = StandardScaler() # X_train_scaled = sc.fit_transform(X_train_cv) # X_test_scaled = sc.transform(X_test_cv) # ============================================================================= classifiers = { 'MultinomialNB' : MNB(), 'RandomForest': RF(n_jobs=-1), 'GradientBoosting': GBC(), 'xgb': XGB()} score_list_columns = ['model_name', 'accuracy', 'precision', 'recall', 'f1_score'] score_list = [] best_score = 0 for model_name, model in classifiers.items(): # Fitting MultinomialNB print("="*60) y_score =, y_train) print(model) #predicting the test results y_pred = model.predict(X_test) y_pred_proba = model.predict_proba(X_test) #Making the confusion Matrix
# Cost/benefit matrix handed to get_model_profits for every model.
cost_benefit = np.array([[100, -10], [0, 0]])

X_train_raw, X_test_raw, y_train, y_test = get_train_test(corpus_filepath)
print("Training & Test",
      X_train_raw.shape, X_test_raw.shape,
      y_train.shape, y_test.shape)

# Bag of words model: TF-IDF over the 1000 most frequent terms, densified
# with toarray() before being handed to the models.
cv = TfidfVectorizer(max_features=1000, stop_words="english")
print(cv)
X_train = cv.fit_transform(X_train_raw).toarray()
X_test = cv.transform(X_test_raw).toarray()

#models = [RF(n_jobs=-1), LR(n_jobs=-1), GBC(), SVC(probability=True)]
models = [
    MultinomialNB(),
    GaussianNB(),
    RF(n_jobs=-1),
    GBC(),
    XGB(),
]

# One (model, profits, thresholds) triple per candidate model.
model_profits = []
for model in models:
    print(type(model).__name__)
    profits, thresholds = get_model_profits(
        model, cost_benefit, X_train, X_test, y_train, y_test)
    model_profits.append((model, profits, thresholds))

plot_model_profits(model_profits, "./presentation/proft_curve.png")
#plot_model_profits(model_profits)

max_model, max_thresh, max_profit, summary_list = find_best_threshold(
    model_profits)
# Boolean mask of predicted probabilities at or above the best threshold,
# and the fraction of entries that are flagged.
max_labeled_positives = max_model.predict_proba(X_test) >= max_thresh
proportion_positives = max_labeled_positives.mean()
model2 = Sequential() model2.add(LSTM(units=32, input_shape=(1, x_train.shape[1]))) model2.add(Dense(1, activation='sigmoid')) model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) ### 第一层模型 clfs = [ # GBDT(n_estimators=100), # RF(n_estimators=100), model1, XGB(n_estimators=100) # SVM() ] X_train_stack = np.zeros((x_train.shape[0], len(clfs))) X_test_stack = np.zeros((x_test.shape[0], len(clfs))) # 5折stacking n_folds = 5 skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1) print("longding...") for i,clf in enumerate(clfs): # print("分类器:{}".format(clf))
y = data[1] groups = data[2] limit = 'all' feature = 'mean_std' X = getRelevantData(X, limit) X_feats = getFeatures(X, feature, False, 0) params = { 'max_depth': np.arange(2, 10, 1), 'min_child_weight': [0.1, 0.5, 1, 2], 'gamma': [0, 0.001, 0.01, 0.1, 0.2], 'learning_rate': [0.1, 0.2, 0.3, 0.5, 0.7, 1] } cv = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=0) clf = RandomizedSearchCV(XGB(n_jobs=-1), params, cv=cv), y, groups) printScores(clf) #%% test best found estimator bestclf = clf.best_estimator_ bestclf.n_estimators = 200 testcv = GroupShuffleSplit(n_splits=30, test_size=0.2, random_state=0) accuracies = cross_val_score(bestclf, X_feats, y, groups, cv=testcv) print('Real score for best found estimator is {}'.format(np.mean(accuracies))) #%% Make submission chosenclf = XGB(base_score=0.5,
def XGB_ModelBuilder(X_train, y_train, X_test, y_test, X_unknown=None):
    """Select features, tune an XGBoost classifier and score it by log loss.

    Steps: recursive feature elimination with cross-validation (RFECV), a
    randomized hyper-parameter search on the selected features, log-loss
    evaluation on the test set and, when ``X_unknown`` is supplied, a refit
    on train+test followed by predictions for ``X_unknown``.

    Created by KAC on 02/12/2020.

    Args:
        X_train: Training features; a DataFrame (``.columns`` is read).
        y_train: Training targets (concatenated with ``pd.concat``).
        X_test: Hold-out features.
        y_test: Hold-out targets.
        X_unknown: Optional unlabeled features to predict after the final
            refit. ``None`` or an empty collection skips that step.

    Returns:
        Tuple ``(predictions, predictions_final)``: class probabilities for
        ``X_test`` and predicted classes for ``X_unknown`` (``[]`` when
        ``X_unknown`` is not given).
    """
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import log_loss
    from xgboost import XGBClassifier as XGB
    from sklearn.model_selection import cross_val_score, RandomizedSearchCV

    XGB_model = XGB()
    selector = RFECV(estimator=XGB_model, scoring='neg_log_loss',
                     cv=5).fit(X_train, y_train)

    CV_score = cross_val_score(selector, X_train, y_train,
                               scoring='neg_log_loss', cv=5)
    scr = np.mean(CV_score)
    print(
        pd.DataFrame({
            'Variable': X_train.columns,
            'Importance': selector.ranking_
        }).sort_values('Importance', ascending=True).head(50))
    print("Optimal number of features: ", selector.n_features_)
    print("Log Loss for All Features: ", scr)

    if selector.n_features_ < len(X_train.columns):
        X_train_transformed = selector.transform(X_train)
        X_test_transformed = selector.transform(X_test)
        CV_score = cross_val_score(selector, X_train_transformed, y_train,
                                   scoring='neg_log_loss', cv=5)
        scr = np.mean(CV_score)
        print("Log Loss for Selected Features on Training Data: ", scr)
    else:
        X_train_transformed = X_train
        X_test_transformed = X_test
        print(
            "Not optimal to remove features. Proceeding to parameter tuning.")

    # Current Best: {'subsample': 0.9, 'n_estimators': 250, 'min_child_weight': 2, 'max_depth': 8, 'learning_rate': 0.02, 'colsample_bytree': 0.85}
    # Trailing comments list the grids that were tried previously.
    parameters = {
        "learning_rate": [0.01, 0.015, 0.02, 0.025, 0.03],  #[0.01, 0.05, 0.1],
        "n_estimators": [250, 500, 600],  #[500, 750, 1000],
        "max_depth": [8, 9, 10, 12],  #[3, 6, 9],
        "min_child_weight": [2, 5, 8],  #[1, 2],
        "colsample_bytree": [0.7, 0.75, 0.8, 0.85],  #[0.5, 0.75, 1],
        "subsample": [0.9, 1]  #[0.5, 0., 1]
    }
    rsearch = RandomizedSearchCV(estimator=XGB_model,
                                 param_distributions=parameters,
                                 scoring='neg_log_loss',
                                 n_iter=250,
                                 cv=5)
    rsearch.fit(X_train_transformed, y_train)
    print(rsearch.best_params_)

    CV_score = cross_val_score(rsearch, X_train_transformed, y_train,
                               scoring='neg_log_loss', cv=5)
    scr = np.mean(CV_score)
    print(
        "Log Loss for Selected Features and Parameter Tuning on Training Data: ",
        scr)

    predictions = rsearch.predict_proba(X_test_transformed)
    pred_scr = round(log_loss(y_test, predictions), 5)
    print("2019 Score: ", pred_scr)

    if X_unknown is not None and len(X_unknown) > 0:
        # Refit on all labeled data before predicting the unlabeled rows.
        # transform() must be called on the fitted selector; the code used
        # to call it on the RFECV class itself, which cannot work.
        X_final = pd.concat([X_train, X_test])
        X_final = selector.transform(X_final)
        y_final = pd.concat([y_train, y_test])
        X_unknown = selector.transform(X_unknown)
        rsearch.fit(X_final, y_final)
        predictions_final = rsearch.predict(X_unknown)
    else:
        predictions_final = []
    return predictions, predictions_final
import numpy as np
from own_functions import loadData, getRelevantData, getFeatures, getMaxpeaks
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier as XGB
from xgboost import plot_importance
import matplotlib.pyplot as plt

folder = getcwd() + '/robotsurface/'
data = loadData(folder)
X = data[0]
X = getRelevantData(X, 'all')
X_f = getFeatures(X, 'mean_std')
y = data[1]

# Fit on the mean/std features so that the importance plot below refers
# to those feature columns.
clf = XGB().fit(X_f, y)

# %%
plt.figure(figsize=(20, 10))
ax = plt.axes()
plot_importance(clf, ax)

# %% see feature order
# Magnitude spectrum limited to the first 63 bins, plus mean/std summaries
# over the last axis (kept as a trailing axis of length 1).
fft = np.abs(np.fft.fft(X))[:, :, :63]
fftmean = np.expand_dims(np.mean(fft, 2), axis=2)
fftstd = np.expand_dims(np.std(fft, 2), axis=2)
mean = np.expand_dims(np.mean(X, 2), axis=2)
std = np.expand_dims(np.std(X, 2), axis=2)
peaks = getMaxpeaks(fft, 2)
warnings.filterwarnings("ignore", category=ConvergenceWarning) ################################## ## 3.1 train and test models using GridSearchCV models = { 'DT': DTC(), 'LR': LR(), 'MLP': MLPC(), 'SVC': SVC(), 'NB': NB(), 'KNN': KNNC(), 'Bagging': BaggingC(), 'RF': RFC(), 'AdaBoost': AdaBoostC(), 'GB': GBC(), 'XGB': XGB(), } param_dict = { # 0.67 {'max_depth': 1, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2} 'DT': { 'max_depth': [1,2,3,None], 'max_leaf_nodes': [4,6,8,10,None], 'min_samples_leaf': [1,2,3], 'min_samples_split': [2,4,6] }, # LR 0.64 {'C': 5.0, 'class_weight': None, 'fit_intercept': False, 'penalty': 'l2', 'solver': 'sag'} 'LR': { "solver": ['lbfgs', 'liblinear', 'sag', 'saga'], "penalty": ['l2'], "C": [1.0, 1.5, 2.0, 5.0, 10],
y_train.shape, y_test.shape) # Bag of words model cv = TfidfVectorizer(max_features=1000, stop_words="english") print(cv) X_train = cv.fit_transform(X_train_raw).toarray() X_test = cv.transform(X_test_raw).toarray() #models = [RF(n_jobs=-1), LR(n_jobs=-1), GBC(), SVC(probability=True)] models = [ MultinomialNB(), GaussianNB(), RF(n_jobs=-1), GBC(), XGB(n_jobs=-1) ] model_profits = [] for model in models: print(model.__class__.__name__) profits, thresholds = get_model_profits(model, cost_benefit, X_train, X_test, y_train, y_test) model_profits.append((model, profits, thresholds)) plot_model_profits(model_profits, "./presentation/proft_curve.png") #plot_model_profits(model_profits) max_model, max_thresh, max_profit, summary_list = find_best_threshold( model_profits) max_labeled_positives = max_model.predict_proba(X_test) >= max_thresh
from xgboost import XGBClassifier as XGB

data = pd.read_csv("Train.csv")
# Identifier and date columns are dropped together with the target.
X = data.drop(['INCIDENT_ID', 'DATE', 'MULTIPLE_OFFENSE'], axis=1)
Y = data['MULTIPLE_OFFENSE']
x = X.values
y = Y.values
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
clf = XGB(seed=0).fit(x_train, y_train)

# rfc = RandomForestClassifier()
# rfc.fit(x_train,y_train)
# y_pred = rfc.predict(x_test)
# from sklearn.metrics import confusion_matrix
# import matplotlib.pyplot as plt
# import seaborn as sns
# LABELS = ['Normal', 'Fraud']
# conf_matrix = confusion_matrix(y_test, y_pred)
# plt.figure(figsize =(12, 12))
# sns.heatmap(conf_matrix, xticklabels = LABELS,
#             yticklabels = LABELS, annot = True, fmt ="d");
# In[86]:

# Resampled (balanced) copy of the training set.
trainB = resample_data(train, target=target)
print('Number of clients in the dataset is : {}'.format(len(dataset)))
print('Number of clients in the balanced train set is : {}'.format(
    len(trainB)))
print('Number of clients in the test set is : {}'.format(len(test)))

# In[88]:

# Train XGBoost on the balanced set, then predict on the untouched test set.
model_XGB = XGB(
    max_depth=6,
    learning_rate=.1,
    n_estimators=100,
    reg_lambda=0.5,
    reg_alpha=0,
    verbosity=1,
    n_jobs=-1,
    tree_method='exact',
)
model_XGB.fit(trainB[features], trainB[target])

pred = model_XGB.predict(test[features])
# Second column of predict_proba.
predp = model_XGB.predict_proba(test[features])[:, 1]

# Horizontal bar chart of the feature importances, sorted ascending.
importances = model_XGB.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(15, 8))
plt.title('Feature Importances: Balanced Extreme Gradient Boosting (XGBoost)')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
def XGB_ModelBuilder(X_train, y_train, X_test, y_test, X_unknown):
    """Select features, tune an XGBoost regressor and score it by MAE.

    Steps: recursive feature elimination with cross-validation (RFECV), a
    randomized hyper-parameter search on the selected features, MAE
    evaluation on the test set and, when ``X_unknown`` is not ``None``, a
    refit on train+test followed by predictions for ``X_unknown``.

    Created by KAC on 02/12/2020.

    Args:
        X_train: Training features; a DataFrame (``.columns`` is read).
        y_train: Training targets (concatenated with ``pd.concat``).
        X_test: Hold-out features.
        y_test: Hold-out targets.
        X_unknown: Unlabeled features to predict after the final refit, or
            ``None`` to skip that step.

    Returns:
        Tuple ``(predictions, predictions_final)``: predictions for
        ``X_test`` and for ``X_unknown`` (``[]`` when ``X_unknown`` is
        ``None``).
    """
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import mean_absolute_error
    from xgboost import XGBRegressor as XGB
    from sklearn.model_selection import cross_val_score, RandomizedSearchCV
    from sklearn.metrics import make_scorer

    # greater_is_better=False negates the MAE so that a higher score is
    # better; the "MAE" values printed below are therefore negated.
    scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    XGB_model = XGB(objective='reg:squarederror')
    # Named `selector` so that the fitted instance no longer shadows the
    # imported RFECV class.
    selector = RFECV(estimator=XGB_model, scoring=scorer).fit(X_train, y_train)

    CV_score = cross_val_score(selector, X_train, y_train, scoring=scorer)
    scr = np.mean(CV_score)
    print(
        pd.DataFrame({
            'Variable': X_train.columns,
            'Importance': selector.ranking_
        }).sort_values('Importance', ascending=True).head(50))
    print("Optimal number of features: ", selector.n_features_)
    print("MAE for All Features: ", scr)

    X_train_transformed = selector.transform(X_train)
    X_test_transformed = selector.transform(X_test)
    CV_score = cross_val_score(selector, X_train_transformed, y_train,
                               scoring=scorer)
    scr = np.mean(CV_score)
    print("MAE for Selected Features on Training Data: ", scr)

    parameters = {
        "learning_rate": [0.01, 0.015, 0.02],
        "n_estimators": [650, 700, 750],
        "max_depth": [8, 9, 10],
        "min_child_weight": [1, 2],
        "gamma": [0.15, 0.2, 0.25],
        "colsample_bytree": [0.75, 0.8, 0.85],
        "subsample": [0.3, 0.4, 0.5]
    }
    rsearch = RandomizedSearchCV(estimator=XGB_model,
                                 param_distributions=parameters,
                                 n_iter=250)
    rsearch.fit(X_train_transformed, y_train)
    # print(rsearch.best_params_)

    CV_score = cross_val_score(rsearch, X_train_transformed, y_train,
                               scoring=scorer)
    scr = np.mean(CV_score)
    print("MAE for Selected Features and Parameter Tuning on Training Data: ",
          scr)

    predictions = rsearch.predict(X_test_transformed)
    pred_scr = round(mean_absolute_error(y_test, predictions), 3)
    print("MAE for Selected Features and Parameter Tuning on 2019 Data: ",
          pred_scr)

    if X_unknown is not None:
        # Refit on all labeled data before predicting the unlabeled rows.
        X_final = pd.concat([X_train, X_test])
        X_final = selector.transform(X_final)
        y_final = pd.concat([y_train, y_test])
        X_unknown = selector.transform(X_unknown)
        rsearch.fit(X_final, y_final)
        predictions_final = rsearch.predict(X_unknown)
    else:
        predictions_final = []
    return predictions, predictions_final