def test():
    """Fit the model on the training data and print its cross-validated score.

    The dataset is loaded through ``DataManager`` using the ``data_name`` and
    ``data_dir`` names from the enclosing scope.  After fitting, the model is
    asked to predict on the train, validation and test splits (the predictions
    themselves are discarded), then a 5-fold cross-validation is run with the
    metric returned by ``get_metric()``.  The mean score and twice its standard
    deviation are printed; nothing is returned.
    """
    matplotlib.rcParams['backend'] = 'Qt5Agg'
    matplotlib.get_backend()

    D = DataManager(data_name, data_dir)

    # Load the model.
    mdl = model()
    # Outlier removal (Preprocessor.removeOutliers on X_train / Y_train) is
    # currently disabled; the preprocessor is still instantiated.
    preprocessor = prepro.Preprocessor()

    X_train = D.data['X_train']
    Y_train = D.data['Y_train'].ravel()

    # Training check.
    mdl.fit(X_train, Y_train)

    # Prediction check on every split; only the calls matter, not the outputs.
    for split in ('X_train', 'X_valid', 'X_test'):
        mdl.predict(D.data[split])

    metric_name, scoring_function = get_metric()
    scorer = make_scorer(scoring_function)
    scores = cross_val_score(mdl, X_train, Y_train, cv=5, scoring=scorer)

    average = scores.mean()
    half_width = scores.std() * 2
    print('\nCV score (95 perc. CI): %0.2f (+/- %0.2f)' % (average, half_width))
def find_best_params(self, speed):
    """
    Grid-search the number of selected features and the number of PCA
    dimensions that give the best cross-validated score.

    For every feature count ``i`` (1, 1 + speed, ... < 200) the ``i`` best
    features are kept with ``SelectKBest(chi2)``; the selected features are
    then projected with PCA onto ``j`` dimensions (1, 1 + speed, ...), where
    ``j`` cannot exceed the size of the selected data.  Each combination is
    scored with a 5-fold cross-validation of a random forest.

    DO NOT USE THIS FUNCTION UNLESS YOU HAVE A POWERFUL COMPUTER AND/OR A
    LOT OF TIME, OR USE A HIGH SPEED (> 4).

    :param speed: Step used for both the feature count and the dimension
        count on each loop; must be a positive integer.
    :return: ``(best_features_nb, best_dim_nb)``, also stored on ``self``.
    :raises ValueError: If ``speed`` is smaller than 1.
    """
    if speed < 1:
        raise ValueError("speed must be a positive integer")
    print(speed)

    grid_size = 200
    # A real 2-D array: the former ``[[0] * 200] * 200`` repeated the SAME row
    # object 200 times, so every write landed in all rows and the arg-max was
    # always found in row 0.  Untested cells stay at -inf so they never win.
    scores = np.full((grid_size, grid_size), -np.inf)

    X_full = D.data['X_train']
    Y_train = D.data['Y_train'].ravel()

    # Loop-invariant: the metric does not change between grid cells.
    metric_name, scoring_function = get_metric()
    scorer = make_scorer(scoring_function)

    for i in range(1, grid_size, speed):
        feature_selection = SelectKBest(chi2, k=i)
        feature_selection.fit(X_full, Y_train)
        X_selected = feature_selection.transform(X_full)

        # PCA cannot produce more components than min(n_samples, n_features)
        # of the data it is fitted on.
        max_components = min(X_selected.shape)
        for j in range(1, min(grid_size, max_components + 1), speed):
            # Fit the PCA on the SELECTED features; it used to be fitted on
            # the raw training matrix, which threw the selection away.
            pca = PCA(n_components=j)
            pca.fit(X_selected)
            X_reduced = pca.transform(X_selected)

            # cross_val_score fits its own clones of the estimator, so no
            # separate fit on the whole training set is needed.
            M = RandomForestClassifier(n_estimators=136, max_depth=None,
                                       min_samples_split=2, random_state=1)
            scrs = cross_val_score(M, X_reduced, Y_train, cv=5,
                                   scoring=scorer)
            scores[i][j] = scrs.mean()

    best_i, best_j = np.unravel_index(np.argmax(scores), scores.shape)
    self.best_features_nb = int(best_i)
    self.best_dim_nb = int(best_j)
    print(self.best_features_nb, self.best_dim_nb)
    return self.best_features_nb, self.best_dim_nb
def f_test_models(X_train, Y_train):
    """Cross-validate five reference classifiers and plot their scores.

    Each classifier is evaluated with a 5-fold ``cross_validate`` using the
    metric returned by ``get_metric()``.  The mean train and test scores are
    shown side by side in a bar chart, one group of bars per model.

    Parameters
    ----------
    X_train: training samples passed to ``cross_validate``
    Y_train: training labels passed to ``cross_validate``
    """
    metric_name, scoring_function = get_metric()

    # Display name -> estimator; insertion order defines the plotting order.
    candidates = {
        "Nearest Neighbors": KNeighborsClassifier(3),
        "Decision Tree": DecisionTreeClassifier(max_depth=10),
        "Random Forest": RandomForestClassifier(max_depth=10, n_estimators=20),
        "AdaBoost": AdaBoostClassifier(),
        "Naive Bayes": GaussianNB(),
    }

    train_means = []
    test_means = []
    for estimator in candidates.values():
        cv_result = cross_validate(estimator, X_train, Y_train, cv=5,
                                   scoring=make_scorer(scoring_function),
                                   return_train_score=True)
        train_means.append(cv_result['train_score'].mean())
        test_means.append(cv_result['test_score'].mean())

    # Plot
    summary = pd.DataFrame(
        {'Score_train': train_means, 'Score_test': test_means},
        index=list(candidates),
    )
    axes = summary.plot.bar()
    axes.set_ylabel("Score")
    axes.set_xlabel("Model")
    plt.show()
def f_test_estimator(X_train, Y_train):
    """Plot the cross-validated score of a random forest against its size.

    Six tree counts evenly spaced between 15 and 120 are tried.  For each one
    a ``RandomForestClassifier`` is wrapped in ``model`` and scored with a
    5-fold cross-validation using the metric returned by ``get_metric()``.
    The mean score is drawn with the standard deviation as error bar.

    Parameters
    ----------
    X_train: training samples passed to ``cross_val_score``
    Y_train: training labels passed to ``cross_val_score``
    """
    # Numbers of trees under test.
    tree_counts = np.linspace(15, 120, num=6).astype(int)
    metric_name, scoring_function = get_metric()

    fold_scores = [
        cross_val_score(
            model(RandomForestClassifier(random_state=42,
                                         n_estimators=n_trees)),
            X_train, Y_train, cv=5,
            scoring=make_scorer(scoring_function),
        )
        for n_trees in tree_counts
    ]
    means = [per_fold.mean() for per_fold in fold_scores]
    spreads = [per_fold.std() for per_fold in fold_scores]

    # Plot
    plt.figure(figsize=(6, 6))
    plt.xlabel("n_estimator")
    plt.ylabel('Score')
    plt.title('Score results of RandomForest with cross-validation')
    plt.errorbar(tree_counts, means, spreads, label='Test set')
def best_param_MODEL(logistic, distributions, X=None, y=None):
    """
    Run a randomized hyper-parameter search for an estimator and return the
    fitted search object.

    Parameters
    ----------
    logistic: estimator whose parameters are searched
    distributions: dictionary of the parameters (and their candidate
        values or distributions) that will be sampled
    X: training samples; when omitted, the ``X_train`` name from the
        enclosing scope is used, as this function always did
    y: training labels; when omitted, the ``Y_train`` name from the
        enclosing scope is used, as this function always did

    Returns
    ------
    search: the fitted ``RandomizedSearchCV``; the best parameters are
        available as ``search.best_params_``
    """
    # Explicit data arguments remove the hidden dependency on outer names
    # while keeping the two-argument call working unchanged.
    if X is None:
        X = X_train
    if y is None:
        y = Y_train

    metric_name, scoring_function = get_metric()
    clf = RandomizedSearchCV(logistic, distributions, random_state=0,
                             scoring=make_scorer(scoring_function))
    search = clf.fit(X, y)
    return search
def __init__(self, X_train, y_train, models_list, models_name, preprocessing_name = None, scoring_function = None):
    '''
    Store the dataset, the scoring function and the models so that the
    other methods can use them.

    :param X_train: training samples, stored as ``self.X_train``
    :param y_train: training labels, stored as ``self.y_train``
    :param models_list: models to work with, stored as ``self.models_list``
    :param models_name: names of the models, stored as ``self.models_name``
    :param preprocessing_name: optional preprocessing name, stored as-is
    :param scoring_function: scoring function to use; when left to None the
        function returned by ``get_metric()`` is used instead
    '''
    self.X_train = X_train
    self.y_train = y_train
    # Identity test: ``== None`` would go through the argument's __eq__,
    # which is not what "was an argument supplied?" means.
    if scoring_function is None:
        _, self.scoring_function = get_metric()
    else:
        self.scoring_function = scoring_function
    self.models_list = models_list
    self.models_name = models_name
    self.preprocessing_name = preprocessing_name
def find_best_pca(self):
    """
    Find the best number of dimensions using PCA (Principal Component
    Analysis).

    The training data is projected onto 1 to 199 principal components; each
    projection is scored with a 5-fold cross-validation of a random forest.
    Every mean score is appended to ``self.pca_scores`` and the number of
    components with the highest score is stored in ``self.best_dim_nb``.

    :return: The best dimensions number
    """
    X_full = D.data['X_train']
    Y_train = D.data['Y_train'].ravel()

    # Loop-invariant: the metric does not change between iterations.
    metric_name, scoring_function = get_metric()
    scorer = make_scorer(scoring_function)

    tested_dims = range(1, 200, 1)
    # Only the scores appended by this call are compared below.
    first_new = len(self.pca_scores)

    for n_dims in tested_dims:
        pca = PCA(n_components=n_dims)
        pca.fit(X_full)
        X_train = pca.transform(X_full)

        # cross_val_score fits its own clones of the estimator, so no
        # separate fit on the whole training set is needed.
        M = RandomForestClassifier(n_estimators=136, max_depth=None,
                                   min_samples_split=2, random_state=1)
        scores = cross_val_score(M, X_train, Y_train, cv=5, scoring=scorer)
        self.pca_scores.append(scores.mean())

    run_scores = self.pca_scores[first_new:]
    best_position = run_scores.index(max(run_scores))
    # Map the list position back to the component count: position 0 is
    # n_components=1.  Storing the raw index was off by one.
    # NOTE(review): confirm no caller relied on the 0-based value.
    self.best_dim_nb = tested_dims[best_position]
    return self.best_dim_nb
def find_best_features(self):
    """
    Evaluate the model with different numbers of selected features (1 to
    199) and keep the number of features that gives the best score.

    For each count ``k`` the ``k`` best features are selected with
    ``SelectKBest(chi2)`` and scored with a 5-fold cross-validation of a
    random forest.  Every mean score is appended to ``self.features_scores``
    and the best count is stored in ``self.best_features_nb``.

    :return: The best features number
    """
    X_full = D.data['X_train']
    # The flattened labels are used for the selection as well as the scoring.
    Y_train = D.data['Y_train'].ravel()

    # Loop-invariant: the metric does not change between iterations.
    metric_name, scoring_function = get_metric()
    scorer = make_scorer(scoring_function)

    tested_counts = range(1, 200, 1)
    # Only the scores appended by this call are compared below.
    first_new = len(self.features_scores)

    for n_features in tested_counts:
        feature_selection = SelectKBest(chi2, k=n_features)
        feature_selection.fit(X_full, Y_train)
        X_train = feature_selection.transform(X_full)

        # cross_val_score fits its own clones of the estimator, so no
        # separate fit on the whole training set is needed.
        M = RandomForestClassifier(n_estimators=136, max_depth=None,
                                   min_samples_split=2, random_state=1)
        scores = cross_val_score(M, X_train, Y_train, cv=5, scoring=scorer)
        self.features_scores.append(scores.mean())

    run_scores = self.features_scores[first_new:]
    best_position = run_scores.index(max(run_scores))
    # Map the list position back to the feature count: position 0 is k=1.
    # Storing the raw index was off by one.
    # NOTE(review): confirm no caller relied on the 0-based value.
    self.best_features_nb = tested_counts[best_position]
    return self.best_features_nb
def main():
    """Interactive entry point: train a classifier and save diagnostic plots.

    Steps, in order:

    1. Ask on stdin whether to use the preprocessed (0, default) or raw (1)
       data; an empty answer selects the default, a non-numeric answer prints
       an error and terminates the program.
    2. Load the training data, hold out 20 % as a test split, fit the chosen
       model, print its accuracy and plot the ROC curve and confusion matrix.
    3. Cross-validate five reference classifiers (3 folds) and plot their
       mean train/test scores.
    4. Cross-validate random forests of four different sizes (5 folds) and
       plot mean score with standard deviation.
    5. Plot the decision surfaces of the tree-based classifiers.

    All plots are written through the ``plot_*`` helpers into a result
    directory that is created if missing.
    """
    import sys  # function-scope import: only needed for the early exit

    # Choose while executing: Preprocessed or Raw.
    print("Here we goooo ! \n Preprocessed (0, default) or Raw (1) ?")
    answer = input().strip()
    try:
        # The prompt advertises 0 as the default, so an empty answer must
        # select it instead of being rejected as "not a number".
        choice = int(answer) if answer else 0
    except ValueError:
        print("ERREUR: Saisissez un NOMBRE, Fermeture du programme.")
        # sys.exit() rather than the site-provided exit() helper; the exit
        # status is unchanged.
        sys.exit()

    warnings.filterwarnings("ignore")
    np.seterr(divide='ignore', invalid='ignore')
    metric_name, scoring_function = get_metric()
    scorer = make_scorer(scoring_function)

    # Choose the appropriate directory and model.
    if choice == 1:
        directory = DIRECTORY + "Raw_Results/"
        data_dir = DATA_DIR_RAW
        clf = ModelRaw()
    else:
        directory = DIRECTORY + "Preprocessed_Results/"
        data_dir = DATA_DIR_PRE
        clf = ModelPreprocessed()

    # Create the result directory; a no-op when it already exists.
    os.makedirs(directory, exist_ok=True)

    # Load the data as a pandas frame, then convert to numpy.
    d_train = load_train(data_dir, DATA_NAME)
    X = d_train.drop(columns=['target']).to_numpy()
    y = d_train['target'].to_numpy()

    # Hold out 20 % of the training data for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Train the classifier.
    clf.fit(X_train, y_train)

    # Score and ROC curve; column 1 of predict_proba is used as the score
    # for the label given as pos_label.
    accuracy = clf.score(X_test, y_test)
    print("accuracy =", accuracy)
    y_proba = clf.predict_proba(X_test)
    y_decision = y_proba[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_decision, pos_label=1)
    plot_ROC(fpr, tpr, directory=directory)

    # Confusion matrix.
    y_test_pre = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_test_pre)
    plot_mat_conf(cm, directory=directory)

    # Plot the score of 5 different models.
    # NOTE(review): the comparisons below always wrap the classifier in
    # ModelPreprocessed, even when the raw data was chosen - confirm intended.
    model_name = ["Nearest Neighbors", "Decision Tree", "Random Forest",
                  "AdaBoost", "Naive Bayes"]
    model_list = [
        KNeighborsClassifier(3),
        DecisionTreeClassifier(max_depth=10),
        RandomForestClassifier(max_depth=10, n_estimators=20),
        AdaBoostClassifier(),
        GaussianNB(),
    ]
    s_train = []
    s_test = []
    for candidate in model_list:
        s_prime = cross_validate(ModelPreprocessed(classifier=candidate),
                                 X, y, cv=3, scoring=scorer,
                                 return_train_score=True)
        s_train.append(s_prime['train_score'].mean())
        s_test.append(s_prime['test_score'].mean())
    plot_test_model(s_train, s_test, model_name, directory=directory)

    # Plot the score for different values of n_estimators.
    s = []
    var = []
    n_est = np.linspace(15, 120, num=4).astype(int)
    for n_trees in n_est:
        clf_prime = RandomForestClassifier(random_state=42,
                                           n_estimators=n_trees)
        scores = cross_val_score(ModelPreprocessed(classifier=clf_prime),
                                 X, y, cv=5, scoring=scorer)
        s.append(scores.mean())
        var.append(scores.std())
    plot_test_estimator(n_est, s, var, directory=directory)

    # Plot the decision surface for Decision tree, RandForest and Adaboost.
    plot_decision_surface_tree_classif(X_train, y_train, directory=directory)
def cross_validation_Classifier(self):
    """Return the 5-fold cross-validation scores of ``self.M``.

    The estimator ``self.M`` is evaluated on ``self.x`` / ``self.y`` with the
    metric returned by ``get_metric()``; the result of ``cross_val_score`` is
    returned unchanged.
    """
    _, metric = get_metric()
    scorer = make_scorer(metric)
    return cross_val_score(self.M, self.x, self.y, scoring=scorer, cv=5)
KNeighborsClassifier(1), DecisionTreeClassifier(max_depth=10), #RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1), RandomForestClassifier(n_estimators=116, max_depth=None, min_samples_split=2, random_state=1), MLPClassifier(alpha=1, max_iter=1000), AdaBoostClassifier(), GaussianNB(), QuadraticDiscriminantAnalysis(), ExtraTreesClassifier() ] X_train = D.data['X_train'] Y_train = D.data['Y_train'].ravel() metric_name, scoring_function = get_metric() #compareModel(model_name, model_list) #M_Model = model(RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)) #doBestModel(M_Model) model_listS = [ ('rf', ExtraTreesClassifier()), #('knb', KNeighborsClassifier(1), ('rfc', RandomForestClassifier(n_estimators=116, max_depth=None, min_samples_split=2, random_state=1)), # ('rfc1', MLPClassifier(alpha=1, max_iter=1000)), # ('rfc2', GaussianNB()), #('rfc3', QuadraticDiscriminantAnalysis()),