# Imports assumed by this snippet:
import math

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, train_test_split


def rfmodel(X, y):
    # Single hold-out split for a train/test evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(X_train, y_train)
    y_train_pred = rf.predict(X_train)
    y_test_pred = rf.predict(X_test)
    # 5-fold cross-validation on the full data for comparison.
    scores = cross_validate(rf, X, y, cv=5,
                            scoring=('r2', 'neg_mean_squared_error'))
    cv_mse = -scores['test_neg_mean_squared_error']
    cv_rmse = np.sqrt(cv_mse).mean()
    cv_r2 = scores['test_r2'].mean()
    print('R^2 train: %.3f, test: %.3f'
          % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
    print('RMSE train: %.3f, test: %.3f'
          % (math.sqrt(mean_squared_error(y_train, y_train_pred)),
             math.sqrt(mean_squared_error(y_test, y_test_pred))))
    print('CV R^2: %.3f, RMSE: %.3f' % (cv_r2, cv_rmse))
    return y_train, y_train_pred, y_test, y_test_pred
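# A minimal usage sketch for rfmodel above; the California housing loader is
# only stand-in data, not part of the original snippet.
from sklearn.datasets import fetch_california_housing

X_demo, y_demo = fetch_california_housing(return_X_y=True)
y_tr, y_tr_pred, y_te, y_te_pred = rfmodel(X_demo, y_demo)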
def cv_evaluate(clf, train_data, train_labels):
    """Evaluate a classifier with cross-validation (10-fold by default)
    and report the precision, recall, and F1-measure.

    Args:
    --------
        clf: the classifier
        train_data: the training data
        train_labels: the training labels

    Returns:
    --------
        None; the averaged scores are printed.
    """
    print("Using cross validation")
    scoring = ['precision', 'recall', 'f1']
    scores_cv = cross_validate(clf, train_data, train_labels.ravel(),
                               cv=10, scoring=scoring)
    # Average each metric over the 10 folds.
    precision, recall, f1_score = np.mean(
        [scores_cv['test_precision'],
         scores_cv['test_recall'],
         scores_cv['test_f1']], axis=1)
    print('=' * 20, 'RESULT', '=' * 20)
    print("Precision: %.6f, Recall: %.6f, F1_score: %.6f"
          % (precision, recall, f1_score))
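# A minimal usage sketch for cv_evaluate; breast_cancer is only stand-in
# binary-classification data, and the imports it relies on are assumed.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X_demo, y_demo = load_breast_cancer(return_X_y=True)
cv_evaluate(LogisticRegression(max_iter=1000), X_demo, y_demo)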
def train_and_test_dnn(args):
    for a in args:
        print(a)
    primitive = args[1]
    # The original mixed sys.argv and args here; args[2] keeps them consistent.
    res = pickle.load(open(args[2], "rb"))
    notes_with_truth_labels_for_query_primitives = pd.read_csv(args[3])
    dl_results = pd.DataFrame(
        columns=['primitive', 'avg_fit_time', 'avg_score_time', 'avg_test_score'])
    X = get_doc_term_matrix(res)
    y = notes_with_truth_labels_for_query_primitives.loc[:, primitive]
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(128, 5, 2), random_state=1)
    try:
        sm = SMOTE(random_state=357)
        # fit_sample was removed from imbalanced-learn; fit_resample is the
        # current name.
        X_sm, y_sm = sm.fit_resample(X, y)
    except ValueError:
        print("value error, smote")
        X_sm = X
        y_sm = y
    cv_results = cross_validate(clf, X_sm, y_sm, cv=3, return_train_score=False)
    print(cv_results)
    # Fit before persisting: cross_validate works on clones, so clf itself
    # would otherwise be dumped unfitted.
    clf.fit(X_sm, y_sm)
    dump(clf, './models/{}_trained_dnn.joblib'.format(primitive))
    dl_results.loc[0, 'primitive'] = primitive
    dl_results.loc[0, 'avg_fit_time'] = np.mean(cv_results['fit_time'])
    dl_results.loc[0, 'avg_score_time'] = np.mean(cv_results['score_time'])
    dl_results.loc[0, 'avg_test_score'] = np.mean(cv_results['test_score'])
    with open(args[4], 'a') as f:
        f.write("{}, {}, {}, {}\n".format(dl_results.loc[0, 'primitive'],
                                          dl_results.loc[0, 'avg_fit_time'],
                                          dl_results.loc[0, 'avg_score_time'],
                                          dl_results.loc[0, 'avg_test_score']))
        #f.write(dl_results.loc[0, :])
        #f.write("\n")
    print("DONE w/ {}".format(primitive))
# <div class="admonition note alert alert-info">
# <p class="first admonition-title" style="font-weight: bold;">Note</p>
# <p class="last">Here, we need to increase the maximum number of iterations to obtain a fully
# converged <tt class="docutils literal">LogisticRegression</tt> and silence a <tt class="docutils literal">ConvergenceWarning</tt>. Contrary
# to the numerical features, the one-hot encoded categorical features are all
# on the same scale (values are 0 or 1), so they would not benefit from
# scaling. In this case, increasing <tt class="docutils literal">max_iter</tt> is the right thing to do.</p>
# </div>

# Finally, we can check the model's statistical performance using only the
# categorical columns.

# In[ ]:

from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, data_categorical, target)
cv_results

# In[ ]:

scores = cv_results["test_score"]
print(f"The accuracy is: {scores.mean():.3f} +/- {scores.std():.3f}")

# As you can see, this representation of the categorical variables is
# slightly more predictive of the revenue than the numerical variables
# that we used previously.
#
# In this notebook we have:
# * seen two common strategies for encoding categorical features: **ordinal
#   encoding** and **one-hot encoding**;
# Build a multi-metric scorer dict; specificity_score is presumably the
# imbalanced-learn helper (imblearn.metrics.specificity_score).
scorerMCC = metrics.make_scorer(metrics.matthews_corrcoef)
scorerSP = metrics.make_scorer(specificity_score)
scorerPR = metrics.make_scorer(metrics.precision_score)
scorerSE = metrics.make_scorer(metrics.recall_score)
scorer = {
    'ACC': 'accuracy',
    'recall': scorerSE,
    'roc_auc': 'roc_auc',
    'MCC': scorerMCC,
    'SP': scorerSP
}
five_fold = model_selection.cross_validate(clf, gram_train, y_train,
                                           cv=cv, scoring=scorer)
mean_ACC = np.mean(five_fold['test_ACC'])
mean_sensitivity = np.mean(five_fold['test_recall'])
mean_AUC = np.mean(five_fold['test_roc_auc'])
mean_MCC = np.mean(five_fold['test_MCC'])
mean_SP = np.mean(five_fold['test_SP'])
#print('five fold:')
# Printed in order: sensitivity, specificity, accuracy, MCC, AUC.
print(mean_sensitivity)
print(mean_SP)
print(mean_ACC)
print(mean_MCC)
print(mean_AUC)
# sklearn.externals.joblib was removed from scikit-learn; import joblib directly.
import joblib
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.linear_model import LogisticRegression
#warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
import sweetviz
from sklearn.metrics import accuracy_score
import pandas as pd  # needed for read_csv below

data = pd.read_csv(
    "/home/gulshan/Desktop/Diseaseprediction/Diseaseprediction/Datasets/diabetes.csv"
)
#print(data)
#my_report = sweetviz.analyze(data)
#my_report.show_html('report.html')
target = data['Outcome']
data = data.drop(['Outcome'], axis=1)
sc = StandardScaler()
data = sc.fit_transform(data)
lr = LogisticRegression()
lr.fit(data, target)
cv_results = cross_validate(lr, data, target, cv=10)
print(lr.predict(data))
#joblib.dump(lr, "Diabetes_Model")
#joblib.dump(sc, 'dscaler')
y_pred = lr.predict(data)
#print(accuracy_score(target, y_pred))  # 78.38% accuracy
# Note: despite the *_err names, these lists store accuracy scores.
train_err = [0] * len(ks)
test_err = [0] * len(ks)
train_err2 = [0] * len(ks)
test_err2 = [0] * len(ks)
cv_scores = [0] * len(ks)
cv_scores2 = [0] * len(ks)
for i, k in enumerate(ks):
    print('kNN: learning a kNN classifier with k = ' + str(k))
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    clf2 = KNeighborsClassifier(n_neighbors=k, weights='distance')
    clf2.fit(X_train, y_train)
    train_err[i] = accuracy_score(y_train, clf.predict(X_train))
    cv_results = cross_validate(clf, X_train, y_train, cv=5,
                                scoring='accuracy', return_train_score=True,
                                return_estimator=True)
    cv_scores[i] = cv_results['test_score'].mean()
    # Evaluate the best-scoring fold's estimator on the held-out test set.
    index = np.argmax(cv_results['test_score'])
    estimator = cv_results['estimator'][index]
    YpredTest = estimator.predict(X_test)
    test_err[i] = accuracy_score(y_test, YpredTest)
    train_err2[i] = accuracy_score(y_train, clf2.predict(X_train))
    cv_results = cross_validate(clf2, X_train, y_train, cv=5,
                                scoring='accuracy', return_train_score=True,
                                return_estimator=True)
    cv_scores2[i] = cv_results['test_score'].mean()
    index = np.argmax(cv_results['test_score'])
    estimator = cv_results['estimator'][index]
    YpredTest = estimator.predict(X_test)
    test_err2[i] = accuracy_score(y_test, YpredTest)
    print('---')

# Plot results
########## Prediction and Reporting ################
boston_Y_pred = boston_ridge_reg.predict(boston_X_test)
boston_Y_train_pred = boston_ridge_reg.predict(boston_X_train)
test_error_boston = mean_squared_error(boston_Y_test, boston_Y_pred)
train_error_boston = mean_squared_error(boston_Y_train, boston_Y_train_pred)
r2_score_boston = r2_score(boston_Y_train, boston_Y_train_pred)

########## Cross validation K = 5 ##################
cross_val_boston = np.abs(
    np.mean(
        cross_validate(boston_ridge_reg,
                       boston_X_train,
                       boston_Y_train,
                       cv=5,
                       scoring='neg_mean_squared_error')['test_score']))

boston_test_error_ridge.append(test_error_boston)
boston_train_error_ridge.append(train_error_boston)
boston_r2_score_ridge.append(r2_score_boston)
boston_cv.append(cross_val_boston)

boston_test_error_ridge_global.append(boston_test_error_ridge)
boston_train_error_ridge_global.append(boston_train_error_ridge)
boston_r2_score_ridge_global.append(boston_r2_score_ridge)
boston_cv_global.append(boston_cv)

########### Plotting the reports ####################
ymin_error = np.min(
print(le_embarked.classes_)
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

le_sex = preprocessing.LabelEncoder()
le_sex.fit(titanic_train['Sex'])
print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

features = ['Pclass', 'Parch', 'SibSp', 'Age', 'Fare', 'Embarked', 'Sex']
X_train = titanic_train[features]
y_train = titanic_train['Survived']

knn_estimator = neighbors.KNeighborsClassifier()
knn_estimator.fit(X_train, y_train)
# return_train_score=True is required for "train_score" to be present in
# the result dict (it defaults to False).
scores = model_selection.cross_validate(knn_estimator, X_train, y_train,
                                        cv=10, return_train_score=True)
test_scores = scores.get("test_score")
print(test_scores.mean())
train_scores = scores.get("train_score")
print(train_scores.mean())

#read test data
titanic_test = pd.read_csv(
    "C:\\Users\\Algorithmica\\Downloads\\titanic_test.csv")
print(titanic_test.info())
titanic_test[imputable_cont_features] = cont_imputer.transform(
    titanic_test[imputable_cont_features])
titanic_test['Embarked'] = cat_imputer.transform(titanic_test['Embarked'])
titanic_test['Embarked'] = le_embarked.transform(titanic_test['Embarked'])
("imputer", MostFrequentImputer()), ("cat_encoder", OneHotEncoder(sparse=False)), ]) # Union all the pipeline preprocess_pipeline = FeatureUnion(transformer_list=[ ("num_pipeline", num_pipeline), ("cat_pipeline", cat_pipeline), ]) X_train = preprocess_pipeline.fit_transform(train_data) y_train = train_data["Survived"] from sklearn.svm import SVC from sklearn import tree decision_tree = tree.DecisionTreeClassifier() cv_results = cross_validate(decision_tree, X_train, y_train, cv=5, return_train_score=True) # decision_tree.fit(X_train, y_train) # X_test = preprocess_pipeline.transform(test_data) # y_pred = decision_tree.predict(X_test) print(cv_results)
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy=i, random_state=1)
#print(Counter(Y))
X_ov, Y_ov = oversample.fit_resample(X_train, dummy_y_train)
#print(Counter(Y_ov))
under = RandomUnderSampler(sampling_strategy=1, random_state=1)
X_un, Y_un = under.fit_resample(X_ov, Y_ov)
#print(Counter(Y_un))
model = create_model()
model.fit(X_un, Y_un, validation_split=0.2, epochs=num_epochs,
          batch_size=batch_size, verbose=1)
y_pred = model.predict(X_test, batch_size=32)
# Note: for cross_validate to accept a Keras model, it must be wrapped in a
# scikit-learn-compatible estimator (e.g. a KerasClassifier wrapper).
scores = cross_validate(estimator=model, X=X_train, y=dummy_y_train, cv=10,
                        return_train_score=True)
# class_names = ['normal', 'dos', 'probe', 'u2r', 'r2l']
report_dict = classification_report(dummy_y_Test.argmax(axis=1),
                                    y_pred.argmax(axis=1),
                                    target_names=class_names,
                                    output_dict=True)
normal_f.append(report_dict['normal']['f1-score'])
dos_f.append(report_dict['dos']['f1-score'])
probe_f.append(report_dict['probe']['f1-score'])
u2r_f.append(report_dict['u2r']['f1-score'])
r2l_f.append(report_dict['r2l']['f1-score'])

ratios = [0, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6,
          0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
fig, ax = plt.subplots()
ax.set_xlabel("Oversampling Ratio")
ax.set_ylabel("F1-score")
    num_rows - num_rows_clean))

# Feature Importance
input_features = [
    column for column in list(individuals_train) if column != 'individuals'
]
X = individuals_train[input_features].copy()  # copy to avoid chained-assignment warnings
X['random_noise'] = np.random.normal(size=X.shape[0])
y = individuals_train['individuals']

# RF K-Fold train
classifier = RandomForestRegressor(n_jobs=-1)
cv = cross_validate(estimator=classifier, X=X, y=y, cv=5,
                    return_estimator=True)

# Collect and average the per-fold feature importances.
feature_importance = {}
for k in range(0, len(cv['estimator'])):
    feature_importance['k_{}'.format(
        k + 1)] = cv['estimator'][k].feature_importances_
feature_importance = pd.DataFrame(feature_importance, index=X.columns)
feature_importance = feature_importance.mean(axis=1).to_frame('importance') \
    .sort_values('importance', ascending=False)
feature_selection = feature_importance.to_dict()

# Get importance concentration score
importance_concentration = (feature_importance.iloc[1] /
Documentation
'''

if __name__ == '__main__':
    from Models_data_prep.NYTaxi_cross_ref_data_split_train_test import X_train, Y_train

    mse = make_scorer(mean_squared_error, greater_is_better=False)
    seed = 42
    n_split = 10
    dirname = 'linear_regression'
    reportname = 'RMSE_scores_{}.csv'.format(datetime)
    try:
        os.mkdir(os.path.join(Path_Reports, dirname))
    except FileExistsError:
        pass
    cv = ShuffleSplit(n_splits=n_split, test_size=0.2, random_state=seed)
    model = LinearRegression()
    scores = cross_validate(model, X_train.reshape(-1, 1), Y_train,
                            scoring=mse, cv=cv, verbose=1,
                            return_train_score=True)
    df = pd.DataFrame.from_dict(scores)
    # df.test_mean_squared_error = np.sqrt(df.test_score)
    # df.train_mean_squared_error = np.sqrt(df.train_score)
    df.to_csv(os.path.join(Path_Reports, dirname, reportname))
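    # A hedged sketch of the conversion the commented lines above gesture at:
    # with greater_is_better=False the scorer returns negated MSE, so RMSE is
    # the square root of the negated scores (the column names added here are
    # assumptions, not part of the original).
    df['test_RMSE'] = np.sqrt(-df['test_score'])
    df['train_RMSE'] = np.sqrt(-df['train_score'])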
    SVC(),
    KNeighborsClassifier(),
    LogisticRegression(),
    RandomForestClassifier(),
    GaussianNB(),
    LinearSVC(),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
]

# Cross validation
cv_train_score = list()
cv_test_score = list()
for model in mod:
    # return_train_score=True is needed for 'train_score' to be in the result.
    cv_result = cross_validate(model, train_data, train_Survived,
                               cv=cv_split, return_train_score=True)
    cv_train_score.append(cv_result['train_score'].mean())
    cv_test_score.append(cv_result['test_score'].mean())

cv_model = pd.DataFrame({
    'Model': [
        'Support Vector Machines', 'KNN', 'Logistic Regression',
        'Random Forest', 'Naive Bayes', 'Linear SVC', 'Decision Tree',
        'AdaBoost Classifier', 'Gradient Boosting Classifier'
    ],
    'CVTrainScore': cv_train_score,
    'CVTestScore': cv_test_score
})
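# A small follow-up sketch: rank the models in the comparison table above
# by cross-validated test score.
print(cv_model.sort_values('CVTestScore', ascending=False))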
def cross_validation(model, x, y, cv=3):
    cv_res = cross_validate(model, x, y, return_train_score=True,
                            scoring=score_fn, cv=cv)
    return cv_res
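# A minimal usage sketch for the helper above; score_fn is defined elsewhere
# in the original, so a plain accuracy string stands in for it here.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

score_fn = 'accuracy'  # assumption: stand-in for the original score_fn
X_demo, y_demo = load_iris(return_X_y=True)
res = cross_validation(DecisionTreeClassifier(random_state=0), X_demo, y_demo)
print(res['test_score'].mean())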
print(f"[INFO] Reading data from {arg['dataset']}") X, y = data_to_model(pd.read_csv(arg["dataset"])) ## PLAIN RANDOM FOREST report.write("ESPERIMENTO 1. PLAIN MULTILAYER PERCEPTRON REGRESSOR:\n") report.write("\t\t Dati non riscalati\n\n") scoring = { 'r2': 'r2', "explained_variance_score": 'explained_variance', "max error": 'max_error' } #scoring=make_scorer(explained_variance_score,max_error,mean_absolute_error,r2_score) regr = MLPRegressor() scores = cross_validate(regr, X, y, cv=10, n_jobs=-1, verbose=1) print(scores) report.write(f"10 fold-cross validation: \n{scores}\n") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42) print(f"[INFO] Fitting model") regr.fit(X_train, y_train) y_pred = regr.predict(X_test)
rf_RSCV_end_time = time.time()
duration = rf_RSCV_end_time - rf_RSCV_start_time
print(f'Randomized CV search done. {search_iters} iterations took '
      f'{int(duration // 3600):02d}:{int((duration % 3600) // 60):02d}:'
      f'{int((duration % 3600) % 60):02d}')

# print the best parameters chosen by CV
pprint.pprint(rf_RSCV.best_params_)

# get CV results with best parameters
rf_clf.set_params(**rf_RSCV.best_params_)
rf_cv = cross_validate(rf_clf, X_train, y_train, n_jobs=32,
                       scoring={
                           'log_loss': log_loss_scorer,
                           'accuracy': accuracy_scorer
                       })
print('RF 5-fold Validation Performance')
# note test_log_loss is negated due to how scorers work
# in parameter searches in sklearn
print('Mean Log Loss\t{}'.format(np.mean(-rf_cv['test_log_loss'])))
print('Mean Accuracy\t{}'.format(np.mean(rf_cv['test_accuracy'])))

# get performance on test set
rf_clf.fit(X_train, y_train)
rf_y_test_pred = rf_clf.predict(X_test)
print('RF Test Set Performance')
# :class:`~sklearn.linear_model.QuantileRegressor` than
# :class:`~sklearn.linear_model.LinearRegression`. In contrast, MSE is
# lower for :class:`~sklearn.linear_model.LinearRegression` than
# :class:`~sklearn.linear_model.QuantileRegressor`. These results confirm that
# MAE is the loss minimized by :class:`~sklearn.linear_model.QuantileRegressor`
# while MSE is the loss minimized by
# :class:`~sklearn.linear_model.LinearRegression`.
#
# We can make a similar evaluation but looking at the test error obtained by
# cross-validation.
from sklearn.model_selection import cross_validate

cv_results_lr = cross_validate(
    linear_regression,
    X,
    y_pareto,
    cv=3,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
)
cv_results_qr = cross_validate(
    quantile_regression,
    X,
    y_pareto,
    cv=3,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
)
print(f"""Test error (cross-validated performance)
{linear_regression.__class__.__name__}:
MAE = {-cv_results_lr["test_neg_mean_absolute_error"].mean():.3f}
MSE = {-cv_results_lr["test_neg_mean_squared_error"].mean():.3f}
{quantile_regression.__class__.__name__}:
# In[ ]:

Y = dftrain["Survived"]
Y_model = dftrain["Survived_Model"]
print("recall score on training set", recall_score(Y, Y_model))

# In[ ]:

print("precision score on training set", precision_score(Y, Y_model))

# In[ ]:

scores = cross_validate(clf, dftrain, dftrain["Survived"],
                        scoring=["f1", "accuracy"], cv=10,
                        return_train_score=False)

# In[ ]:

def display_cross_validate(scores):
    print("cross val scores")
    print("f1 scores", scores["test_f1"])
    print("f1 mean", scores["test_f1"].mean())
    print("f1 std", scores["test_f1"].std())
    print("accuracy scores", scores["test_accuracy"])
    print("accuracy mean", scores["test_accuracy"].mean())
    print("accuracy std", scores["test_accuracy"].std())
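# In[ ]:

# Report the cross-validation run above with the helper just defined.
display_cross_validate(scores)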
}

merged_data_set = merged_data_set.dropna()
print('merged:' + str(type(merged_data_set)))
print('df:' + str(type(df)))

columns = [
    'date_time', 'srch_ci', 'srch_co', 'user_id',
    'disc_orig_destination_distance', 'std_srch_children_cnt',
    'std_srch_adults_cnt'
]
merged_data_set = merged_data_set.drop(columns=columns, axis=1)
print('merged:' + str(type(merged_data_set)))

y = merged_data_set['hotel_cluster']
merged_data_set = merged_data_set.drop(['hotel_cluster'], axis=1)
X = merged_data_set

print('Going into the classifier')
resultMNB = cross_validate(LogisticRegression(multi_class='multinomial',
                                              solver='newton-cg'),
                           X, y,
                           cv=KFold(n_splits=5, shuffle=True),
                           scoring=scoring)
print('Accuracy per fold =', resultMNB['test_accuracy'])
print('Mean Accuracy =', np.mean(resultMNB['test_accuracy']))
print('Mean Precision =', np.mean(resultMNB['test_precision']))
print('Mean Recall =', np.mean(resultMNB['test_recall']))
print('Mean F1 Score =', np.mean(resultMNB['test_f1_score']))
activity.replace([np.inf, -np.inf], np.nan, inplace=True)

data_concat = pd.concat(data)
y = data_concat['original']['power']
X = data_concat.drop('power', axis=1, level=1)
X.fillna(X.mean(), inplace=True)

groups = []
for group_idx, activity in enumerate(data):
    groups += [group_idx] * activity.shape[0]
groups = np.array(groups)

# Note: GradientBoostingRegressor has no n_jobs parameter (boosting is
# sequential); the original passed one, which raises a TypeError.
scores = cross_validate(GradientBoostingRegressor(random_state=42),
                        X, y, groups=groups,
                        scoring=['r2', 'neg_median_absolute_error'],
                        cv=GroupKFold(n_splits=3),
                        n_jobs=1,
                        return_train_score=True,
                        verbose=0)
print('The obtained scores on training and testing in terms of '
      'R2 and MAE are: \n')
print(scores)

# Store the prediction for visualization
y_pred = cross_val_predict(GradientBoostingRegressor(random_state=42),
                           X, y, groups=groups,
# histogram generation
#x.hist(bins=10, figsize=(9, 10))
#pl.savefig('histogramas')

# correlation matrix
#sns.heatmap(x.corr(), annot=True).figure.savefig('corr.png')

log_file = open('classification_scores-naive.txt', 'w+')


def report(scores, experimentName):
    print(experimentName)
    print('Mean accuracy on train: %0.2f' % (scores['train_score'].mean()))
    print('Standard deviation accuracy on train: %0.2f' %
          (scores['train_score'].std()))
    print('Mean accuracy on test: %0.2f' % (scores['test_score'].mean()))
    print('Standard deviation accuracy on test: %0.2f' %
          (scores['test_score'].std()))
    # writing test scores
    log_file.write('{} score per fold\n'.format(experimentName))
    for s in scores['test_score']:
        log_file.write('{}\n'.format(s))


naive = GaussianNB()
experimento = '*** NAIVE BAYES - No Scaler ***'
x_n = x

# training
cv_scores = cross_validate(naive, x, y, scoring='accuracy',
                           cv=KFold(n_splits=10), return_train_score=True)

# results
report(cv_scores, experimento)

log_file.close()
""" reg_H = linear_model.LinearRegression().fit(X, H) reg_I = linear_model.LinearRegression().fit(X, I) reg_J = linear_model.LinearRegression().fit(X, J) """ #print(r2(H, reg_H.predict(X)), r2(I, reg_I.predict(X)), r2(J, reg_J.predict(X))) # Lasso is Linear Regression with Regularization parameter reg_H = linear_model.Lasso(alpha=0.1).fit(X, H) reg_I = linear_model.Lasso(alpha=0.1).fit(X, I) reg_J = linear_model.Lasso(alpha=0.1).fit(X, J) H_results = cross_validate(reg_H, X, H, cv=13, scoring=('r2', 'neg_mean_squared_error')) I_results = cross_validate(reg_I, X, I, cv=13, scoring=('r2', 'neg_mean_squared_error')) J_results = cross_validate(reg_H, X, J, cv=13, scoring=('r2', 'neg_mean_squared_error')) print("H_neg_MSE", np.mean(H_results['test_neg_mean_squared_error'])) print("I_neg_MSE", np.mean(I_results['test_neg_mean_squared_error']))
def _train(self):
    """Trains one iteration of the model called when ``tune.run`` is called.

    Different routines are run depending on whether the ``early_stopping``
    attribute is set.

    If ``self.early_stopping`` is not None, each fold is fit with
    ``partial_fit``, which stops training the model if the validation
    score is not improving for a particular fold.

    Otherwise, the full cross-validation procedure is run.

    In both cases, the average test accuracy is returned over all folds,
    as well as the individual folds' accuracies as a dictionary.

    Returns:
        ret (:obj:`dict`): Dictionary of results as a basis for
            ``cv_results_`` for one of the cross-validation interfaces.
    """
    if self.early_stopping:
        for i, (train, test) in enumerate(self.cv.split(self.X, self.y)):
            X_train, y_train = _safe_split(self.estimator[i], self.X,
                                           self.y, train)
            X_test, y_test = _safe_split(self.estimator[i],
                                         self.X,
                                         self.y,
                                         test,
                                         train_indices=train)
            self.estimator[i].partial_fit(X_train, y_train,
                                          np.unique(self.y))
            if self.return_train_score:
                self.fold_train_scores[i] = self.scoring(
                    self.estimator[i], X_train, y_train)
            self.fold_scores[i] = self.scoring(self.estimator[i], X_test,
                                               y_test)

        ret = {}
        total = 0
        for i, score in enumerate(self.fold_scores):
            total += score
            key_str = f"split{i}_test_score"
            ret[key_str] = score
        self.mean_score = total / len(self.fold_scores)
        ret["average_test_score"] = self.mean_score

        if self.return_train_score:
            total = 0
            for i, score in enumerate(self.fold_train_scores):
                total += score
                key_str = f"split{i}_train_score"
                ret[key_str] = score
            self.mean_train_score = total / len(self.fold_train_scores)
            ret["average_train_score"] = self.mean_train_score

        return ret
    else:
        try:
            scores = cross_validate(
                self.estimator,
                self.X,
                self.y,
                cv=self.cv,
                n_jobs=self.n_jobs,
                fit_params=self.fit_params,
                groups=self.groups,
                scoring=self.scoring,
                return_train_score=self.return_train_score,
            )
        except PicklingError:
            warnings.warn("An error occurred in parallelizing the cross "
                          "validation. Proceeding to cross validate with "
                          "one core.")
            scores = cross_validate(
                self.estimator,
                self.X,
                self.y,
                cv=self.cv,
                fit_params=self.fit_params,
                groups=self.groups,
                scoring=self.scoring,
                return_train_score=self.return_train_score,
            )

        ret = {}
        for i, score in enumerate(scores["test_score"]):
            key_str = f"split{i}_test_score"
            ret[key_str] = score
        self.test_accuracy = sum(scores["test_score"]) / len(
            scores["test_score"])
        ret["average_test_score"] = self.test_accuracy

        if self.return_train_score:
            for i, score in enumerate(scores["train_score"]):
                key_str = f"split{i}_train_score"
                ret[key_str] = score
            self.train_accuracy = sum(scores["train_score"]) / len(
                scores["train_score"])
            ret["average_train_score"] = self.train_accuracy

        return ret
    sgd = SGD(lr=0.02, momentum=0.01, decay=0, nesterov=False)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])
    return model


classifier = KerasClassifier(build_fn=create_model,
                             epochs=10,
                             batch_size=15,
                             verbose=1)
y = np_utils.to_categorical(y, 10)
scores = cross_validate(classifier, X, y, return_train_score=True)
print("Train Accuracy: %0.2f (+/- %0.2f)" %
      (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Test Accuracy: %0.2f (+/- %0.2f)" %
      (scores['test_score'].mean(), scores['test_score'].std() * 2))
print("Time: %0.6f (+/- %0.6f)" %
      (scores['score_time'].mean(), scores['score_time'].std() * 2))
exit()


def plot_learning_curve(estimator, title, X, y, ylim=None,
print(os.listdir(".")) # # ライブラリで解く # In[ ]: X = train.get(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']) y = train['Survived'] clf = LogisticRegression() skf = StratifiedKFold(shuffle=True) scoring = { 'acc': 'accuracy', 'auc': 'roc_auc', } scores = cross_validate(clf, X, y, cv=skf, scoring=scoring) print('Accuracy (mean):', scores['test_acc'].mean()) print('AUC (mean):', scores['test_auc'].mean()) # In[ ]: X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True) clf = LogisticRegression() clf.fit(X, y) print(clf.intercept_) print(clf.coef_) # In[ ]:
arrayOfYears = pickle.load(open("arrayOfYears.p", "rb"))
arrayOfGrades = pickle.load(open("arrayOfGrades.p", "rb"))
featuresToRemove = pickle.load(open("featuresToRemove.p", "rb"))
arrayOfGraduation = pickle.load(open("arrayOfGraduation.p", "rb"))

arrayOfJustGrades = []
for studYear in arrayOfYears:
    arrayOfJustGrades.append([studYear["GPA"]])

vec = DictVectorizer()
vectorizedArrayOfYears = vec.fit_transform(arrayOfYears).toarray()

reg2 = linear_model.BayesianRidge()
reg2.fit(vectorizedArrayOfYears, arrayOfGrades)
a = zip(reg2.coef_, vec.get_feature_names())
gpaFeatures = list(a)
gpaFeatures.sort()

clf2 = svm.SVC(kernel="linear")
clf2.fit(vectorizedArrayOfYears, arrayOfGraduation)
b = zip(clf2.coef_[0], vec.get_feature_names())
gradFeatures = list(b)
gradFeatures.sort()

#regr = linear_model.BayesianRidge()
#clf = svm.SVC(kernel="linear")
# clf2 is reused here: from this point on it is a logistic regression.
clf2 = linear_model.LogisticRegression()
#clf3 = naive_bayes.GaussianNB()
scoring = ['precision', 'recall', 'accuracy', 'f1']
scores = cross_validate(clf2, vectorizedArrayOfYears, arrayOfGraduation,
                        cv=10, scoring=scoring)
scores2 = cross_validate(clf2, arrayOfJustGrades, arrayOfGraduation,
                         cv=10, scoring=scoring)
print("run time = " + str(timer() - start))
    'max_depth': 8,
    'learning_rate': 0.25,
    'n_estimators': 200,
    'reg_alpha': 1.12,
    'lambda': 18.51,
    'subsample': 0.9,
}

model = XGBClassifier(**p_grid)
# clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=10)
scoring = {'accuracy', 'precision', 'recall', 'f1', 'roc_auc'}
nested_scores = cross_validate(estimator=model, X=inp, y=y, cv=10,
                               scoring=scoring)
op = pd.DataFrame(nested_scores)
op.to_csv(
    '/Users/shreyaspatel/Desktop/Machine_Learning/Aduri/Scores/Scores_2333.csv'
)
# clf.fit(X, y)
print(op)
# print(clf.best_params_)
print("--- %s seconds ---" % (time.time() - start_time))
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    SVC(kernel="linear", random_state=0),
    RidgeClassifier(random_state=0),
    LogisticRegression(solver='lbfgs', tol=1e-3, max_iter=400, random_state=0),
    SGDClassifier(loss="log", random_state=0),
    MLPClassifier(early_stopping=True, random_state=0),
    AdaBoostClassifier(random_state=0),
    KNeighborsClassifier(3)
]

cv_scores = pd.DataFrame(columns=['Classifier', 'Precision', 'Recall', 'F1'])
for i, clf in enumerate(classifiers):
    s = cross_validate(clf, X_train, y_train,
                       scoring=['recall', 'precision', 'f1'],
                       cv=3, return_train_score=False)
    cv_scores.loc[i] = [
        clf.__class__.__name__,
        s['test_precision'].mean(),
        s['test_recall'].mean(),
        s['test_f1'].mean()
    ]

clf = MLPClassifier(early_stopping=True, random_state=0)
clf.fit(X_train, y_train)

instances_ = []
for i in range(200, 300):
    text_file = os.path.join(TEST_DOCS_DIR, f'{i}.txt')
start_time = time.time()
model = model.fit(images, labels)
print("Train LINEAR SVC --- %s seconds ---" % (time.time() - start_time))

start_time = time.time()
basic_score = model.score(images_validation, labels_validation)
print("Validation LINEAR SVC --- %s seconds ---" % (time.time() - start_time))
print("Linear SVC scikit learn basic score: %0.4f" % basic_score)

# Validating the model and evaluation
start_time = time.time()
scores = cross_validate(model, images_validation, labels_validation,
                        cv=5, scoring=('f1', 'roc_auc_ovo'),
                        return_train_score=True)
print("Cross Validation LINEAR SVC --- %s seconds ---" %
      (time.time() - start_time))
cross_score = model.score(images_validation, labels_validation)
print("Linear SVC scikit learn cross-val score: %0.4f" % cross_score)
print(scores)

pickle.dump(model, open(model_file, 'wb'))

# calculate the fpr and tpr for all thresholds of the classification;
# note this requires an estimator with predict_proba (LinearSVC itself has
# none, so model is presumably SVC(probability=True) or similar)
probs = model.predict_proba(images_validation)
subset = rich[rich['Taxon'] == taxa[j]]
x = subset[covar]
y = subset[yval]

# find the best model params
tuner = aei.model.tune(x, y, n_splits=3)
tuner.GradientBoostRegressor(scoring='neg_mean_squared_error')

# clean up a deprecated param
del tuner.best_params['min_impurity_split']

# set up the model
gbr = ensemble.GradientBoostingRegressor(**tuner.best_params)

# run cross validation metrics
cv_score = model_selection.cross_validate(
    gbr, x, y, scoring=['r2', 'neg_mean_squared_error'])

# fit the model on all the data
gbr.fit(x, y)

# calculate the metrics
y_eval = gbr.predict(x)
rsq = metrics.r2_score(y, y_eval)
mse = metrics.mean_squared_error(y, y_eval)

# set the linear fit
z = np.polyfit(y_eval, y, 1)
f = np.poly1d(z)
x_new = np.linspace(y.min(), y.max(), 50)
y_new = f(x_new)
                                random_state=rand_st)
    rgr.fit(data_train, target_train)
    # The original had placeholder strings here; computing the metrics from
    # held-out predictions is an assumed reconstruction (data_test and
    # target_test are presumed split names, not from the original).
    target_pred = rgr.predict(data_test)
    scores_RMSE = np.sqrt(mean_squared_error(target_test, target_pred))
    print('Decision Tree RMSE:', scores_RMSE)
    scores_Expl_Var = explained_variance_score(target_test, target_pred)
    print('Decision Tree Expl Var:', scores_Expl_Var)

####Cross-Val Regressors####
if binning == 0 & cross_val == 1:
    #Setup Crossval regression scorers; the key names are assumptions
    #(the original line was a placeholder string).
    scorers = {'Neg_MSE': 'neg_mean_squared_error',
               'expl_var': 'explained_variance'}

    #SciKit Decision Tree Regressor - Cross Val
    start_ts = time.time()
    rgr = DecisionTreeRegressor(criterion='mse',
                                splitter='best',
                                max_depth=None,
                                min_samples_split=3,
                                min_samples_leaf=1,  # was the typo mean_samples_leaf
                                max_features=None,
                                random_state=rand_st)
    scores = cross_validate(rgr, data_np, target_np, scoring=scorers, cv=5)
    scores_RMSE = np.sqrt(-scores['test_Neg_MSE'])
    scores_Expl_Var = scores['test_expl_var']
    print("Decision Tree RMSE: %0.2f (+/- %0.2f)" %
          ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
    print("Decision Tree Expl Var: %0.2f (+/- %0.2f)" %
          ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time() - start_ts)
Y = Y[index].astype('float64')
print(X.shape, Y.shape)
print("train_model!")
clf = RandomForestClassifier(n_estimators=100,
                             min_samples_leaf=2,
                             class_weight="balanced",
                             n_jobs=10)
#clf = LogisticRegression(class_weight="balanced", max_iter=500)
#clf = SVC(probability=True, class_weight="balanced")
#clf = KNeighborsClassifier(n_neighbors=5)
scoring = ['roc_auc', 'recall', 'f1', 'average_precision', 'accuracy']
scores = cross_validate(clf, X, Y, cv=10, n_jobs=10,
                        scoring=scoring,
                        return_train_score=True)  #, scoring='roc_auc'
auc_v = scores['test_roc_auc']
train_auc = scores['train_roc_auc']
recall = scores['test_recall']
f1 = scores['test_f1']
aupr = scores['test_average_precision']
acc = scores['test_accuracy']
# The original printed the undefined name `auc`; the per-fold AUC arrays
# are presumably what was intended.
print(str(auc_v))
print(str(train_auc))
print("test_AUC: %0.4f (+/- %0.2f)" % (auc_v.mean(), auc_v.std() * 2))
#print("train_AUC: %0.4f (+/- %0.2f)" % (train_auc.mean(), train_auc.std() * 2))
print("recall: %0.4f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))
print("f1: %0.4f (+/- %0.2f)" % (f1.mean(), f1.std() * 2))
edgecolor="none", linewidth=0) ax.legend([extra], [scores], loc="upper left") title = title + "\n Evaluation in {:.2f} seconds".format(elapsed_time) ax.set_title(title) fig, axs = plt.subplots(2, 2, figsize=(9, 7)) axs = np.ravel(axs) for ax, (name, est) in zip( axs, estimators + [("Stacking Regressor", stacking_regressor)]): start_time = time.time() score = cross_validate(est, X, y, scoring=["r2", "neg_mean_absolute_error"], n_jobs=-1, verbose=0) elapsed_time = time.time() - start_time y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0) plot_regression_results( ax, y, y_pred, name, (r"$R^2={:.2f} \pm {:.2f}$" + "\n" + r"$MAE={:.2f} \pm {:.2f}$").format( np.mean(score["test_r2"]), np.std(score["test_r2"]),