def Random_forest(features,target,test_size_percent=0.2,cv_split=3):
    """Fit a random-forest regressor and report cross-validated R^2 scores.

    features -> Pandas Dataframe with attributes as columns
    target -> Pandas Dataframe with target column for prediction
    test_size_percent -> Fraction of the data points held out for testing
    cv_split -> Number of folds handed to the cross-validation helpers

    Prints the mean cross-validation score on the training split and the
    R^2 of the out-of-fold predictions on the test split, plots the
    predictions through ``plot_model`` and returns the regressor fitted on
    the training split.
    """
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    X_array = features.values
    y_array = target.values

    model_rdf = RandomForestRegressor()
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(),
        test_size=test_size_percent, random_state=4)
    model_rdf.fit(X_train, y_train)

    # The original built a TimeSeriesSplit but only ever passed its
    # ``n_splits`` integer on, so the fold count is passed directly.
    training_score = cross_val_score(model_rdf, X_train, y_train, cv=cv_split)
    print("Cross-val Training score: {}".format(training_score.mean()))

    training_predictions = cross_val_predict(
        model_rdf, X_train, y_train, cv=cv_split)
    testing_predictions = cross_val_predict(
        model_rdf, X_test, y_test, cv=cv_split)

    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    print("Test-predictions accuracy: {}".format(test_accuracy))

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return model_rdf
def svm_regressor(features,target,test_size_percent=0.2,cv_split=5):
    """Fit an RBF support-vector regressor on min-max scaled data.

    features -> Pandas Dataframe with attributes as columns
    target -> Pandas Dataframe with target column for prediction
    test_size_percent -> Fraction of the data points held out for testing
    cv_split -> Number of folds handed to the cross-validation helpers

    Prints the mean cross-validation score on the training split and the
    R^2 of the out-of-fold predictions on the test split. Returns the SVR
    fitted on the (scaled) training split.
    """
    # One scaler per array: the original refit a single scaler on the target,
    # which overwrote the feature scaling it had just learned.
    X_array = preprocessing.MinMaxScaler().fit_transform(features)
    y_array = preprocessing.MinMaxScaler().fit_transform(target)

    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(),
        test_size=test_size_percent, random_state=4)

    svr = SVR(kernel='rbf', C=10, gamma=1)
    svr.fit(X_train, y_train.ravel())

    # The original built a TimeSeriesSplit but only ever passed its
    # ``n_splits`` integer on, so the fold count is passed directly.
    training_score = cross_val_score(svr, X_train, y_train, cv=cv_split)
    print("Cross-val Training score: {}".format(training_score.mean()))

    testing_predictions = cross_val_predict(svr, X_test, y_test, cv=cv_split)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    print("Test-predictions accuracy: {}".format(test_accuracy))
    return svr
def test_cross_val_predict_with_method():
    """Compare cross_val_predict(method=...) with a hand-written fold loop."""
    iris = load_iris()
    X, y = shuffle(iris.data, iris.target, random_state=0)
    n_classes = len(set(y))
    folds = KFold(len(iris.target))

    for method in ('decision_function', 'predict_proba', 'predict_log_proba'):
        est = LogisticRegression()

        # Default CV: only the number of returned rows is checked.
        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        # Reference result: fit on each training fold and fill in the
        # corresponding test rows by hand.
        expected = np.zeros([len(y), n_classes])
        predict_fn = getattr(est, method)
        for train_idx, test_idx in folds.split(X, y):
            est.fit(X[train_idx], y[train_idx])
            expected[test_idx] = predict_fn(X[test_idx])

        predictions = cross_val_predict(est, X, y, method=method, cv=folds)
        assert_array_almost_equal(expected, predictions)
def linear_regression(features,target,test_size_percent=0.2,cv_split=5):
    '''Fit an ordinary-least-squares model and report cross-validated R^2.

    Features -> Pandas Dataframe with attributes as columns
    target -> Pandas Dataframe with target column for prediction
    Test_size_percent -> Fraction of data points to be used for testing
    cv_split -> Number of folds handed to the cross-validation helpers

    Prints the mean cross-validation score on the training split and the
    R^2 of the out-of-fold predictions on the test split, plots the
    predictions through ``plot_model`` and returns the fitted model.
    '''
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    X_array = features.values
    y_array = target.values

    ols = linear_model.LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(),
        test_size=test_size_percent, random_state=4)
    ols.fit(X_train, y_train)

    # The original built a TimeSeriesSplit but only ever passed its
    # ``n_splits`` integer on, so the fold count is passed directly.
    training_score = cross_val_score(ols, X_train, y_train, cv=cv_split)
    print("Cross-val Training score: {}".format(training_score.mean()))

    training_predictions = cross_val_predict(ols, X_train, y_train, cv=cv_split)
    testing_predictions = cross_val_predict(ols, X_test, y_test, cv=cv_split)

    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    print("Test-predictions accuracy: {}".format(test_accuracy))

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return ols
def scan2D(X, y, window=(10, 10), estimator_params=None, cv=3):
    """2D scanning.

    Slides a ``window[0] x window[1]`` patch over every 2-D sample in *X*,
    flattens each patch, and obtains out-of-fold class probabilities for all
    patches from two random forests (one built from *estimator_params*, one
    with ``max_features=1`` added).

    estimator_params -> keyword arguments for RandomForestClassifier;
        ``None`` means ``{'n_jobs': -1}``. The mapping is copied, never
        modified (the original mutated its shared default dict, so every
        later call silently ran both forests with ``max_features=1``).
    cv -> passed through to ``cross_val_predict``.

    Returns a list with one 1-D array per sample that produced at least one
    patch (in sample order): the flattened probabilities of the first forest
    followed by those of the second.
    """
    if estimator_params is None:
        base_params = {'n_jobs': -1}
    else:
        base_params = dict(estimator_params)
    random_params = dict(base_params, max_features=1)

    win_rows, win_cols = window[0], window[1]
    inputs, labels, instances = [], [], []
    for instance_id, (sample, label) in enumerate(zip(X, y)):
        # NOTE(review): these ranges stop one position short of the last
        # full window (no "+ 1"); kept as-is because changing it alters the
        # length of the returned vectors.
        for s1 in range(sample.shape[0] - win_rows):
            for s2 in range(sample.shape[1] - win_cols):
                part = sample[s1:s1 + win_rows, s2:s2 + win_cols]
                inputs.append(part.flatten())
                labels.append(label)
                instances.append(instance_id)

    rf = RandomForestClassifier(**base_params)
    cf = RandomForestClassifier(**random_params)
    probas1 = cross_val_predict(rf, inputs, labels, cv=cv, method='predict_proba')
    probas2 = cross_val_predict(cf, inputs, labels, cv=cv, method='predict_proba')

    # One vectorised comparison per sample instead of a Python list scan,
    # and a sorted walk so the output order is deterministic.
    instance_ids = np.asarray(instances)
    probas = []
    for instance in sorted(set(instances)):
        mask = instance_ids == instance
        p = np.concatenate(
            [probas1[mask].flatten(), probas2[mask].flatten()], axis=0)
        probas.append(p)
    return probas
def neural_net(features,target,test_size_percent=0.2,cv_split=3,n_iter=100,learning_rate=0.01):
    '''Fit a small multi-layer perceptron regressor on min-max scaled data.

    Features -> Pandas Dataframe with attributes as columns
    target -> Pandas Dataframe with target column for prediction
    Test_size_percent -> Fraction of data points to be used for testing
    cv_split -> Number of folds handed to the cross-validation helpers
    n_iter -> Number of training iterations given to the regressor
    learning_rate -> Learning rate given to the regressor

    Prints the mean cross-validation score on the training split and the
    R^2 of the out-of-fold predictions on the test split, plots the
    predictions through ``plot_model`` and returns the fitted regressor.
    '''
    X_array = preprocessing.MinMaxScaler().fit_transform(features)
    y_array = preprocessing.MinMaxScaler().fit_transform(target)

    mlp = Regressor(
        layers=[Layer("Rectifier", units=5),   # Hidden Layer1
                Layer("Rectifier", units=3),   # Hidden Layer2
                Layer("Linear")],              # Output Layer
        n_iter=n_iter,
        # The original hard-coded 0.01 here and ignored the argument.
        learning_rate=learning_rate)

    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(),
        test_size=test_size_percent, random_state=4)
    mlp.fit(X_train, y_train)

    # The original built a TimeSeriesSplit but only ever passed its
    # ``n_splits`` integer on, so the fold count is passed directly.
    training_score = cross_val_score(mlp, X_train, y_train, cv=cv_split)
    print("Cross-val Training score: {}".format(training_score.mean()))

    training_predictions = cross_val_predict(mlp, X_train, y_train, cv=cv_split)
    testing_predictions = cross_val_predict(mlp, X_test, y_test, cv=cv_split)

    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    print("Test-predictions accuracy: {}".format(test_accuracy))

    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return mlp
def fit(self, X, y):
    """Grow a cascade of random-forest layers, then refit it on all data.

    A layer is added for as long as the validation score improves by more
    than ``self.tolerance``. Each new layer sees the raw features plus the
    class probabilities produced by the previous layer. The accepted layers
    end up in ``self.layers_`` and their scores in ``self.scores_``.
    Returns ``self``.
    """
    def new_layer():
        # Two forests limited to one candidate feature per split, two with
        # the classifier's default feature selection.
        random_kw = dict(max_features=1, n_estimators=self.n_estimators,
                         min_samples_split=10, criterion='gini', n_jobs=-1)
        default_kw = dict(n_estimators=self.n_estimators, n_jobs=-1)
        return [RandomForestClassifier(**random_kw),
                RandomForestClassifier(**random_kw),
                RandomForestClassifier(**default_kw),
                RandomForestClassifier(**default_kw)]

    def fit_and_validate(layer, train_in, valid_in):
        # Fit every forest, then score the averaged validation probabilities.
        for forest in layer:
            forest.fit(train_in, y_tr)
        valid_proba = [forest.predict_proba(valid_in) for forest in layer]
        winners = np.argmax(np.array(valid_proba).mean(axis=0), axis=1)
        predicted = [self.classes_[i] for i in winners]
        return valid_proba, self.scoring(y_vl, predicted)

    def out_of_fold(layer, train_in, targets):
        # Out-of-fold probabilities that feed the next layer.
        return [cross_val_predict(forest, train_in, targets, cv=self.cv,
                                  method='predict_proba')
                for forest in layer]

    # Check data
    X, y = np.array(X), np.array(y)
    X, y = check_X_y(X, y)

    # Split to grow cascade and validate.
    # NOTE(review): rows where the random draw is below
    # ``validation_fraction`` go to the *training* part - confirm that this
    # direction is intended.
    mask = np.random.random(y.shape[0]) < self.validation_fraction
    X_tr, X_vl = X[mask], X[~mask]
    y_tr, y_vl = y[mask], y[~mask]

    self.classes_ = unique_labels(y)
    self.layers_ = []
    self.scores_ = []

    # First layer: always kept.
    forests = new_layer()
    p_vl, score = fit_and_validate(forests, X_tr, X_vl)
    self.layers_.append(forests)
    self.scores_.append(score)
    p_tr = out_of_fold(forests, X_tr, y_tr)

    last_score = score
    inp_tr = np.concatenate([X_tr] + p_tr, axis=1)
    inp_vl = np.concatenate([X_vl] + p_vl, axis=1)

    # Further layers: keep growing while the score improves enough.
    while True:
        forests = new_layer()
        p_vl, score = fit_and_validate(forests, inp_tr, inp_vl)
        if not (score - last_score > self.tolerance):
            break
        self.layers_.append(forests)
        p_tr = out_of_fold(forests, inp_tr, y_tr)
        inp_tr = np.concatenate([X_tr] + p_tr, axis=1)
        inp_vl = np.concatenate([X_vl] + p_vl, axis=1)
        self.scores_.append(score)
        last_score = score
        print(self.scores_)

    # Retrain on entire dataset
    inp_ = X
    for forests in self.layers_:
        for forest in forests:
            forest.fit(inp_, y)
        p = out_of_fold(forests, inp_, y)
        inp_ = np.concatenate([X] + p, axis=1)
    return self
def test_cross_val_predict_sparse_prediction():
    """cross_val_predict must agree for sparse and dense X / y."""
    X, y = make_multilabel_classification(
        n_classes=2, n_labels=1, allow_unlabeled=False,
        return_indicator=True, random_state=1)
    sparse_inputs = csr_matrix(X)
    sparse_targets = csr_matrix(y)

    classif = OneVsRestClassifier(SVC(kernel='linear'))
    dense_preds = cross_val_predict(classif, X, y, cv=10)
    sparse_preds = cross_val_predict(
        classif, sparse_inputs, sparse_targets, cv=10).toarray()

    assert_array_almost_equal(sparse_preds, dense_preds)
def test_cross_val_predict_pandas():
    """cross_val_predict must hand dataframe-like inputs through unchanged."""
    type_pairs = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
    except ImportError:
        # pandas is optional; the mock pair is still exercised.
        pass
    else:
        type_pairs.append((Series, DataFrame))

    for TargetType, InputFeatureType in type_pairs:
        # X dataframe, y series
        X_df = InputFeatureType(X)
        y_ser = TargetType(y2)

        def check_df(x):
            return isinstance(x, InputFeatureType)

        def check_series(x):
            return isinstance(x, TargetType)

        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cross_val_predict(clf, X_df, y_ser)
def fit(self, x, y, **params):
    """Fit the base classifiers and the meta classifier on training data.

    Every estimator in ``self.clfs`` contributes out-of-fold predictions
    (``predict_proba`` when the estimator has it, ``predict`` otherwise),
    which become the training features of ``self.metaclf``. Each base
    estimator is afterwards refitted on the full data so that it can predict
    unseen samples.

    params -> extra keyword arguments forwarded to ``cross_val_predict``.
        A ``verbose`` entry is not forwarded directly (the call already sets
        ``verbose=0``); for estimators whose ``str()`` contains "Keras" it is
        passed on as ``fit_params`` instead.

    Returns ``self``.
    """
    # ``verbose`` is passed explicitly below; forwarding it again through
    # **params raised "got multiple values for keyword argument".
    shared_kwargs = {k: v for k, v in params.items() if k != "verbose"}

    preds = []
    for i, clf in enumerate(self.clfs):
        log.info("fit %s", i)

        # Per-estimator copy so Keras-only fit_params do not leak into the
        # cross-validation of the estimators that follow.
        cv_kwargs = dict(shared_kwargs)
        if "Keras" in str(clf) and "verbose" in params:
            cv_kwargs["fit_params"] = dict(verbose=params["verbose"])

        # save out-of-fold predictions to fit metaclf
        # (the original called ``clf.hasattr(...)``, which does not exist)
        method = "predict_proba" if hasattr(clf, "predict_proba") else "predict"
        pred = cross_val_predict(clf, x, y, cv=self.cv, verbose=0,
                                 method=method, **cv_kwargs)
        preds.append(pred)

        # fully fitted to predict test data
        # NOTE(review): ``verbose=0`` is passed to every estimator's fit();
        # estimators whose fit() lacks that keyword will reject it - confirm.
        clf.fit(x, y, verbose=0)

    # fit metaclf on out-of-fold predictions
    # NOTE(review): 1-D ``predict`` outputs are concatenated end-to-end by
    # hstack rather than stacked as columns - verify against the predict path.
    log.info("fit metaclf")
    self.metaclf.fit(np.hstack(preds), y)
    return self
def crossval(self, verbose=0, seed=0, method="predict", **params):
    """Cross-validate ``self.clf`` on the training data and return the score.

    The out-of-fold predictions are kept on ``self.preds``. When
    ``self.runs`` is truthy, the parameters (extended with the estimator
    name, run name, score and elapsed time) and the predictions are appended
    to it.
    """
    # track time spent per run
    began = time()
    np.random.seed(seed)

    # verbose is useful for keras but throws an exception for others
    if "Keras" in get_clfname(self.clf):
        self.clf.set_params(verbose=verbose)
    self.clf.set_params(**params)

    self.preds = cross_val_predict(
        self.clf, self.xtrain, self.ytrain, method=method)
    raw_score = self.scorer._score_func(self.ytrain, self.preds)
    score = raw_score * self.scorer._sign

    # log results
    params.update(clf=get_clfname(self.clf),
                  name=self.name,
                  score=score,
                  elapsed=time() - began)
    if self.runs:
        self.runs.append(params, self.preds)
    return score
def test_cross_val_predict_input_types():
    """cross_val_predict accepts dense, sparse, list and 3-D inputs."""
    clf = Ridge()

    # Smoke test
    predictions = cross_val_predict(clf, X, y)
    assert_equal(predictions.shape, (10,))

    # test with multioutput y
    predictions = cross_val_predict(clf, X_sparse, X)
    assert_equal(predictions.shape, (10, 2))

    predictions = cross_val_predict(clf, X_sparse, y)
    assert_array_equal(predictions.shape, (10,))

    # test with multioutput y
    predictions = cross_val_predict(clf, X_sparse, X)
    assert_array_equal(predictions.shape, (10, 2))

    # test with X and y as list
    def list_check(x):
        return isinstance(x, list)

    clf = CheckingClassifier(check_X=list_check)
    predictions = cross_val_predict(clf, X.tolist(), y.tolist())

    clf = CheckingClassifier(check_y=list_check)
    predictions = cross_val_predict(clf, X, y.tolist())

    # test with 3d X
    def check_3d(x):
        return x.ndim == 3

    X_3d = X[:, :, np.newaxis]
    clf = CheckingClassifier(check_X=check_3d)
    predictions = cross_val_predict(clf, X_3d, y)
    assert_array_equal(predictions.shape, (10,))
def crossVertifyTestData(self):
    """Cross-validate the model on the test data and return the result.

    :param self: the instance itself
    :returns: the true y and the predicted y, with the true y first
    """
    # Run 10-fold cross-validation. The targets are passed explicitly: the
    # original omitted them, so the model was fitted with ``y=None``.
    predict_y = cross_val_predict(self.model, self.test_X, self.test_y, cv=10)
    return self.test_y, predict_y
def _get_estimator_mse(self, x, y, estimator):
    """Return the cross-validated mean squared error (MSE) for *estimator*.

    Use GroupKFold where a group is a combination of input size and number
    of workers. The prediction of a group is done when it is out of the
    training set.

    The value is the plain MSE from ``metrics.mean_squared_error``; no
    square root is taken.
    """
    groups = self._groups.loc[x.index]
    cv = GroupKFold(n_splits=3)
    # ``groups`` and ``cv`` are keyword-only in current scikit-learn
    # releases; keywords work on old and new versions alike.
    prediction = cross_val_predict(estimator, x, y, groups=groups, cv=cv)
    return metrics.mean_squared_error(y, prediction)
def evaluate(self, exp):
    """Split data, fit, transform features, tf*idf, svd, report.

    Vectorises ``exp.data``, holds out a test set (or transforms
    ``self.test_data`` when that attribute exists), grid-searches and fits
    the pipeline, then stores the train report, the test report and the
    duration in ``self.res``.
    """
    t1 = time()
    exp.seed = 42
    exp.nj = -1
    exp.test_size = 0.3 if not hasattr(exp, 'test_size') else exp.test_size
    # Seed NumPy's global generator. The original instantiated
    # ``np.random.RandomState(exp.seed)`` and discarded it, which seeds
    # nothing, so the split below was not reproducible.
    np.random.seed(exp.seed)

    # report features
    if hasattr(exp.pln[0], 'features'):
        exp.log.head(exp.pln.features, exp.name, exp.seed)

    # stream data to features
    X, y = exp.vec.fit_transform(exp.data)

    # if no test data, split
    if not hasattr(self, 'test_data'):
        X, Xi, y, yi = train_test_split(
            X, y, test_size=exp.test_size, stratify=y)
    else:
        Xi, yi = exp.vec.transform(self.test_data)

    av = self.average

    # grid search and fit best model choice
    exp.pln = self.grid_search(exp.pln, X, y, exp.seed)
    print("\n Training model...")
    exp.pln.fit(X, y)
    print(" done!")

    labs = exp.vec.encoder.classes_
    exp.log.data('sparse', 'train', X)

    # out-of-fold predictions on the training data for the train report
    sco = cross_val_predict(exp.pln, X, y, cv=self.cv, n_jobs=exp.nj)
    self.res['train'] = exp.log.report('train', y, sco, av, labs)

    exp.log.data('sparse', 'test', Xi, dump=True)
    res = exp.pln.predict(Xi)
    self.res['test'] = exp.log.report('test', yi, res, av, labs)

    if hasattr(self, 'proportions'):
        self._run_proportions((X, Xi, y, yi), exp)

    print("\n # ------------------------------------------ \n")

    t2 = time()
    dur = round(t2 - t1, 1)
    self.res['dur'] = dur
    print("\n Experiment took {0} seconds".format(dur))
    exp.store()
    print("\n" + '-' * 10, "\n")
def save_fit_plot(x, y, fit, name, folder):
    """Save a measured-vs-predicted scatter plot for *fit* as a PDF.

    The predictions are 10-fold out-of-fold predictions. The plot shows the
    identity line (black) and a first-degree polynomial fit of predicted on
    measured values (green). The file name comes from
    ``timed_filename(name, "pdf")`` and is written into *folder*.
    """
    predicted = cross_val_predict(fit, x, y, cv=10)
    linfit = np.polyfit(y, predicted, 1)

    fig, ax = plt.subplots()
    try:
        ax.scatter(y, predicted, s=1, alpha=0.1)
        ax.plot([y.min(), y.max()], [y.min(), y.max()], "k--", lw=2)
        ax.plot(y, np.poly1d(linfit)(y), "g--", lw=2)
        ax.set_xlabel("Measured")
        ax.set_ylabel("Predicted")
        f_name = timed_filename(name, "pdf")
        fig.savefig(os.path.join(folder, f_name))
    finally:
        # Release the figure; the original left every figure open, so
        # repeated calls kept accumulating them.
        plt.close(fig)
def test_cross_val_predict():
    """Test cross_val_predict with predict_proba."""
    from sklearn.base import BaseEstimator, clone
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import cross_val_predict

    class Classifier(BaseEstimator):
        """Mock class that does not have classes_ attribute."""

        def __init__(self):
            self.base_estimator = LinearDiscriminantAnalysis()

        def fit(self, X, y):
            self.estimator_ = clone(self.base_estimator).fit(X, y)
            return self

        def predict_proba(self, X):
            return self.estimator_.predict_proba(X)

    rng = np.random.RandomState(42)
    X = rng.randn(10, 1, 3)
    y = rng.randint(0, 2, 10)

    # Plain regression: only has to run.
    estimator = SlidingEstimator(LinearRegression())
    cross_val_predict(estimator, X, y, cv=2)

    # An estimator without ``classes_`` must be rejected.
    with pytest.raises(AttributeError, match="classes_ attribute"):
        estimator = SlidingEstimator(Classifier())
        cross_val_predict(estimator, X, y, method='predict_proba', cv=2)

    # A regular classifier: only has to run.
    estimator = SlidingEstimator(LinearDiscriminantAnalysis())
    cross_val_predict(estimator, X, y, method='predict_proba', cv=2)
def test_cross_val_predict():
    """FMClassifier must reach at least 90% accuracy via cross_val_predict."""
    features, labels = load_iris(return_X_y=True)
    features = StandardScaler().fit_transform(features)

    model = FMClassifier(rank=2, solver='L-BFGS-B', random_state=4567)
    model = model.fit(features, labels)

    splitter = KFold(n_splits=4, random_state=457, shuffle=True)
    out_of_sample = cross_val_predict(
        model, features, labels, cv=splitter, method='predict')

    acc = accuracy_score(labels, out_of_sample)
    assert acc >= 0.90, "accuracy is too low for iris in cross_val_predict!"
def cross_val_pred_plot(model,X,y,consum_col,consum_col_pred,denorm_target,model_name=None,print_plot=False,cv=5):
    """Cross-validate *model* on the whole dataset and report the errors.

    X, y -> pandas objects holding the features and the target
    consum_col -> name of the measured column in *y*
    consum_col_pred -> name given to the prediction column
    denorm_target -> un-normalised target; its min/max are used to scale the
        results back when model_name is exactly 'svr' or 'mlp'
    model_name -> label used in the plot title and to choose the code path
    print_plot -> plot the measured and predicted values when True
    cv -> cross-validation setting handed to ``cross_val_predict``

    Returns ``(whole, r2, mae, mse)`` where *whole* is a DataFrame holding
    the prediction column joined with *y*.
    """
    # The original tested ``'multi' or 'mlp' or 'preceptron' in name``,
    # which is always true; each tag is now actually looked for. The
    # original (misspelt) 'preceptron' tag is kept next to the correct one.
    name_lower = (model_name or '').lower()
    wants_arrays = any(
        tag in name_lower
        for tag in ('multi', 'mlp', 'preceptron', 'perceptron'))

    if wants_arrays:
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        whole_pred = cross_val_predict(model, X.values, y.values, cv=cv)
    else:
        whole_pred = cross_val_predict(model, X, y, cv=cv)

    whole_predictions = pd.Series(whole_pred.ravel(), index=y.index)
    whole_predictions = whole_predictions.rename(consum_col_pred)
    whole = pd.DataFrame(whole_predictions).join(y)
    # NOTE(review): this zeroes the whole row (measured value included)
    # wherever the prediction is negative - confirm that is intended.
    whole[whole[consum_col_pred] < 0.0] = 0
    r2 = metrics.r2_score(y, whole_pred)

    if print_plot:
        if model_name is None:
            model_name = 'Model'
            print("\nInsert model name\n")
        whole.plot(title=model_name + '-Whole dataset predictions - score {}'.format(r2))
        plt.ylabel('Power consumption in Watts')

    if model_name in ('svr', 'mlp'):
        # Undo the min-max normalisation before computing the errors.
        target_min = denorm_target.min().values[0]
        target_max = denorm_target.max().values[0]
        denorm_whole = whole * (target_max - target_min) + target_min
        mae = metrics.mean_absolute_error(denorm_whole[consum_col], denorm_whole[consum_col_pred])
        mse = metrics.mean_squared_error(denorm_whole[consum_col], denorm_whole[consum_col_pred])
        whole = denorm_whole
    else:
        print('calculating metrics of LNR or RDF')
        mae = metrics.mean_absolute_error(y, whole_pred)
        mse = metrics.mean_squared_error(y, whole_pred)
    return whole, r2, mae, mse
def test_cross_val_predict_class_subset():
    """cross_val_predict must cope with folds that miss some classes."""
    X = np.arange(8).reshape(4, 2)
    y = np.array([0, 0, 1, 2])
    classes = 3

    kfold3 = KFold(n_splits=3)
    kfold4 = KFold(n_splits=4)
    le = LabelEncoder()

    for method in ('decision_function', 'predict_proba', 'predict_log_proba'):
        est = LogisticRegression()

        # Same comparison for both fold counts: cross_val_predict against
        # the naive loop implemented by get_expected_predictions.
        for splitter in (kfold3, kfold4):
            predictions = cross_val_predict(est, X, y, method=method,
                                            cv=splitter)
            expected_predictions = get_expected_predictions(
                X, y, splitter, classes, est, method)
            assert_array_almost_equal(expected_predictions, predictions)

        # Testing unordered labels. ``y`` stays re-encoded for the
        # following loop iterations, exactly as before.
        y = [1, 1, -4, 6]
        predictions = cross_val_predict(est, X, y, method=method, cv=kfold3)
        y = le.fit_transform(y)
        expected_predictions = get_expected_predictions(
            X, y, kfold3, classes, est, method)
        assert_array_almost_equal(expected_predictions, predictions)
def test_cross_val_predict():
    """Check cross_val_predict against a manual loop and on edge inputs."""
    dataset = load_boston()
    X, y = dataset.data, dataset.target
    cv = KFold()
    est = Ridge()

    # Naive loop (should be same as cross_val_predict):
    manual_preds = np.zeros_like(y)
    for train_idx, test_idx in cv.split(X, y):
        est.fit(X[train_idx], y[train_idx])
        manual_preds[test_idx] = est.predict(X[test_idx])

    preds = cross_val_predict(est, X, y, cv=cv)
    assert_array_almost_equal(preds, manual_preds)

    # Default cv
    preds = cross_val_predict(est, X, y)
    assert_equal(len(preds), len(y))

    # Leave-one-out
    cv = LeaveOneOut()
    preds = cross_val_predict(est, X, y, cv=cv)
    assert_equal(len(preds), len(y))

    # Sparse input: entries not above the median are zeroed first
    Xsp = X.copy()
    Xsp *= (Xsp > np.median(Xsp))
    Xsp = coo_matrix(Xsp)
    preds = cross_val_predict(est, Xsp, y)
    assert_array_almost_equal(len(preds), len(y))

    # Estimator fitted without y
    preds = cross_val_predict(KMeans(), X)
    assert_equal(len(preds), len(y))

    class BadCV():
        """Splitter that yields the same train/test indices four times."""

        def split(self, X, y=None, labels=None):
            for _ in range(4):
                yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])

    assert_raises(ValueError, cross_val_predict, est, X, y, cv=BadCV())
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    """Weighted RidgeCV must match grouped k-fold on repeated samples.

    Integer sample weights are emulated by repeating each row that many
    times and cross-validating with one group per original row.
    """
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    # Positive integer weights, used as repeat counts for the tiled data.
    raw_weight = 3 * rng.randn(len(X))
    int_weight = (raw_weight - raw_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), int_weight)
    sample_weight = int_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    # A fresh generator is needed: the first one was consumed by RidgeCV.
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)

    # Sum the squared errors of all copies of each original row.
    squared_errors = (y_tiled - predictions) ** 2
    kfold_errors = np.asarray([
        np.sum(squared_errors[indices == i], axis=0)
        for i in np.arange(X.shape[0])])

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)

    best_alpha_pos = alphas.index(kfold.alpha_)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, best_alpha_pos]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, best_alpha_pos]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
def cv_BIKE_Ridge( A_list, yV, alpha = 0.5, XX = None, n_splits = 5, n_jobs = -1, grid_std = None):
    """Cross-validate a BIKE_Ridge model and return the predictions.

    A_list -> matrices handed to ``binary_model.BIKE_Ridge``; the row count
        of ``A_list[0]`` is used as the number of samples
    yV -> target values
    alpha, XX -> forwarded to ``binary_model.BIKE_Ridge``
    n_splits -> number of shuffled K-fold splits
    n_jobs -> forwarded to ``cross_val_predict``
    grid_std -> forwarded to ``jutil.cv_show``

    The estimator is cross-validated on a column of row indices rather than
    on the matrices themselves. Returns the out-of-fold predictions.
    """
    clf = binary_model.BIKE_Ridge( A_list, XX, alpha = alpha)
    ln = A_list[0].shape[0]  # ln is the number of molecules.

    kf_n_c = model_selection.KFold( n_splits = n_splits, shuffle=True)
    # The original split with ``kf5_ext_c``, a name this function never
    # defines, and left ``kf_n_c`` (and therefore ``n_splits``) unused.
    kf_n = kf_n_c.split( A_list[0])

    AX_idx = np.array([list(range( ln))]).T
    yV_pred = model_selection.cross_val_predict( clf, AX_idx, yV, cv = kf_n, n_jobs = n_jobs)

    print('The prediction output using cross-validation is given by:')
    jutil.cv_show( yV, yV_pred, grid_std = grid_std)
    return yV_pred
def fit(self,proba_exclude=False,proba_threshold=0.5,n_jobs=1,cv=None,clf=None):
    """Estimate a cross-validated probability for every automatic label.

    proba_exclude -> when True, labels whose probability is below
        *proba_threshold* are set to 0
    proba_threshold -> cut-off used with *proba_exclude*
    n_jobs -> forwarded to the classifier and to ``cross_val_predict``
    cv -> cross-validation splitter; defaults to a shuffled 5-fold KFold
    clf -> classifier; defaults to a scaled LogisticRegressionCV pipeline

    Stores the (possibly edited) labels on ``self.auto_labels`` and the
    probability of the last class column on ``self.auto_proba``.
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.model_selection import cross_val_predict,KFold
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    decision_features = self.decision_features
    auto_labels = self.auto_labels

    if cv is None:
        cv = KFold(n_splits=5,shuffle=True,random_state=12345)
    if clf is None:
        clf = LogisticRegressionCV(Cs=np.logspace(-4,6,11),
                                   cv=cv,
                                   tol=1e-5,
                                   max_iter=int(1e4),
                                   scoring='roc_auc',
                                   class_weight='balanced',
                                   n_jobs=n_jobs)
        # NOTE(review): the scaler pipeline is assumed to wrap only the
        # default classifier - confirm whether a caller-supplied clf should
        # be wrapped as well.
        clf = Pipeline([('scaler',StandardScaler()),
                        ('estimator',clf)])

    # Try the requested splitter first, then plain 5-fold and 3-fold.
    # Only the last attempt is allowed to fail. ``Exception`` replaces the
    # original bare ``except`` so Ctrl-C and SystemExit are not swallowed.
    cv_attempts = (cv, 5, 3)
    for attempt, cv_option in enumerate(cv_attempts):
        try:
            auto_proba = cross_val_predict(clf,decision_features,auto_labels,
                                           cv=cv_option,
                                           method='predict_proba',
                                           n_jobs=n_jobs)
            auto_proba = auto_proba[:,-1]
            break
        except Exception:
            if attempt == len(cv_attempts) - 1:
                raise

    if proba_exclude:
        idx_ = np.where(auto_proba < proba_threshold)
        auto_labels[idx_] = 0

    self.auto_labels = auto_labels
    self.auto_proba = auto_proba
def check_cross_val_predict_with_method(est):
    """Compare cross_val_predict(method=...) for *est* with a manual loop.

    Also checks that shifted and string-typed versions of the labels give
    the same predictions.
    """
    iris = load_iris()
    X, y = shuffle(iris.data, iris.target, random_state=0)
    n_classes = len(set(y))
    folds = KFold(len(iris.target))

    for method in ('decision_function', 'predict_proba', 'predict_log_proba'):
        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        # Naive loop (should be same as cross_val_predict):
        expected = np.zeros([len(y), n_classes])
        predict_fn = getattr(est, method)
        for train_idx, test_idx in folds.split(X, y):
            est.fit(X[train_idx], y[train_idx])
            expected[test_idx] = predict_fn(X[test_idx])

        predictions = cross_val_predict(est, X, y, method=method, cv=folds)
        assert_array_almost_equal(expected, predictions)

        # Test alternative representations of y
        predictions_y1 = cross_val_predict(est, X, y + 1, method=method,
                                           cv=folds)
        assert_array_equal(predictions, predictions_y1)

        predictions_y2 = cross_val_predict(est, X, y - 2, method=method,
                                           cv=folds)
        assert_array_equal(predictions, predictions_y2)

        predictions_ystr = cross_val_predict(est, X, y.astype('str'),
                                             method=method, cv=folds)
        assert_array_equal(predictions, predictions_ystr)
def scan1D(X, y, window=100, estimator_params=None, cv=3):
    """Sliding scanner for variable length input samples.

    Slides a window of *window* items over every sample in *X*, flattens
    each slice, and obtains out-of-fold class probabilities for all slices
    from two random forests (one built from *estimator_params*, one with
    ``max_features=1`` added).

    estimator_params -> keyword arguments for RandomForestClassifier;
        ``None`` means ``{'n_jobs': -1}``. The mapping is copied, never
        modified (the original mutated its shared default dict, so every
        later call silently ran both forests with ``max_features=1``).
    cv -> passed through to ``cross_val_predict``.

    Returns a list with one 1-D array per sample that produced at least one
    slice (in sample order): the flattened probabilities of the first forest
    followed by those of the second.
    """
    if estimator_params is None:
        base_params = {'n_jobs': -1}
    else:
        base_params = dict(estimator_params)
    random_params = dict(base_params, max_features=1)

    inputs, labels, instances = [], [], []
    for instance_id, (sample, label) in enumerate(zip(X, y)):
        # NOTE(review): the range stops one position short of the last full
        # window (no "+ 1"); kept as-is because changing it alters the
        # length of the returned vectors.
        for s in range(len(sample) - window):
            inputs.append(sample[s: s + window].flatten())
            labels.append(label)
            instances.append(instance_id)

    rf = RandomForestClassifier(**base_params)
    cf = RandomForestClassifier(**random_params)
    probas1 = cross_val_predict(rf, inputs, labels, cv=cv, method='predict_proba')
    probas2 = cross_val_predict(cf, inputs, labels, cv=cv, method='predict_proba')

    # One vectorised comparison per sample instead of a Python list scan,
    # and a sorted walk so the output order is deterministic.
    instance_ids = np.asarray(instances)
    probas = []
    for instance in sorted(set(instances)):
        mask = instance_ids == instance
        p = np.concatenate(
            [probas1[mask].flatten(), probas2[mask].flatten()], axis=0)
        probas.append(p)
    return probas
def XValidatePredict(self, labels, values, folds, stratified=True):
    '''
    Cross-validated predictions of ``self.classifier`` for every sample.

    :param labels: class of each sample
    :param values: feature values for each sample
    :param folds: number of folds
    :param stratified: when True the fold count itself is passed as ``cv``;
        otherwise a ``KFold`` object is built
    :return: cross-validated estimates for each input data point, as an array
    '''
    num_samples = values.shape[0]
    # NOTE(review): ``KFold(num_samples, folds)`` matches the legacy
    # ``sklearn.cross_validation`` signature (n, n_folds) - confirm which
    # module this file imports KFold from.
    CV = folds if stratified else KFold(num_samples, folds)
    return np.array(cross_val_predict(self.classifier, X=values, y=labels,
                                      cv=CV, n_jobs=1))
def test_cross_val_predict():
    """MLPClassifier must work in cross_val_predict for multiclass data."""
    features, labels = load_iris(return_X_y=True)
    labels = LabelBinarizer().fit_transform(labels)
    features = StandardScaler().fit_transform(features)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567)
    mlp = mlp.fit(features, labels)

    splitter = KFold(n_splits=4, random_state=457, shuffle=True)
    out_of_sample = cross_val_predict(
        mlp, features, labels, cv=splitter, method='predict_proba')

    # One AUC per class column; every one has to clear the bar.
    auc = roc_auc_score(labels, out_of_sample, average=None)
    assert np.all(auc >= 0.96)
def test_cross_validation_is_finite(estimator, build_dataset):
    """Tests that validation on metric-learn estimators returns something
    finite
    """
    input_data, labels, preprocessor, _ = build_dataset()
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)

    scores = cross_val_score(
        estimator, *remove_y_quadruplets(estimator, input_data, labels))
    assert np.isfinite(scores).all()

    predictions = cross_val_predict(
        estimator, *remove_y_quadruplets(estimator, input_data, labels))
    assert np.isfinite(predictions).all()
def cv_SVR( xM, yV, svr_params, n_splits = 5, n_jobs = -1, grid_std = None, graph = True, shuffle = True):
    """
    Cross-validate a support-vector regressor built from svr_params.

    Cross validation is performed so as to generate prediction output for
    all input molecules.

    xM, yV -> feature matrix and target values
    svr_params -> keyword arguments for ``svm.SVR``
    n_splits, shuffle -> settings of the K-fold splitter
    n_jobs -> forwarded to ``cross_val_predict``
    grid_std -> forwarded to ``jutil.cv_show``
    graph -> show the result through ``jutil.cv_show`` when True

    Returns the out-of-fold predictions.
    """
    print(xM.shape, yV.shape)

    clf = svm.SVR( **svr_params)
    kf_n_c = model_selection.KFold( n_splits=n_splits, shuffle=shuffle)
    # The original split with ``kf5_ext_c``, a name this function never
    # defines, and left ``kf_n_c`` (hence n_splits and shuffle) unused.
    kf_n = kf_n_c.split( xM)
    yV_pred = model_selection.cross_val_predict( clf, xM, yV, cv = kf_n, n_jobs = n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show( yV, yV_pred, grid_std = grid_std)

    return yV_pred
# Training and 5-fold validation error for polynomial degrees 1 to 3.
# ``traningError`` is appended to here and is expected to exist already.
validationErrors = []
for num in range(1, 4):
    print(f'Degree {num} ')

    poly_reg = PolynomialFeatures(degree=num)
    X_poly = poly_reg.fit_transform(X_train)
    # NOTE(review): this refits the transformer on its own output; X_poly is
    # not recomputed afterwards - confirm the call is needed.
    poly_reg.fit(X_poly, y_train)

    lin_reg_2 = LinearRegression(n_jobs=-1)
    lin_reg_2.fit(X_poly, y_train)

    # mean squared error on the training data itself
    y_pred = lin_reg_2.predict(X_poly)
    traniningMSE = np.sum((y_pred - y_train)**2) / len(y_pred)
    print(f'traning error {traniningMSE}')
    traningError.append(traniningMSE)

    # mean squared error of the out-of-fold predictions
    prediction = cross_val_predict(lin_reg_2, X_poly, y_train, cv=5)
    validationError = np.sum((prediction - y_train)**2) / len(prediction)
    print(f'valiadtion error {validationError} \n')
    validationErrors.append(validationError)

validationErrors = np.array(validationErrors)
pos = validationErrors.argmin()

# plot training and validation error and mark the best degree
fig, ax = plt.subplots()
ax.plot(list(range(1, 4)), traningError, '-', label='training data')
ax.plot(list(range(1, 4)), validationErrors, '-', label='validation data')
ax.axvline(x=list(range(1, 4))[pos], linestyle='--', label='best fit')
ax.set_xlabel('Model Complexity (Degree)')
ax.set_ylabel('Mean Squared Error')
# Fit on the training split and report hold-out accuracy.
model.fit(train_x, train_y)
prediction = model.predict(test_x)
# accuracy_score expects (y_true, y_pred); the value is the same either way.
print('The accuracy of the Random Forests is',
      metrics.accuracy_score(test_y, prediction))

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# ``random_state`` is dropped: without ``shuffle=True`` it has no effect on
# the folds, and current scikit-learn releases reject the combination.
kfold = KFold(n_splits=10)
cv_result = cross_val_score(model, x, y, cv=kfold, scoring='accuracy')
print(cv_result.mean(), cv_result.std())

from sklearn.metrics import confusion_matrix

# Confusion matrix of 10-fold out-of-fold predictions from a fresh forest.
y_pred = cross_val_predict(RandomForestClassifier(n_estimators=100), x, y, cv=10)
sns.heatmap(confusion_matrix(y, y_pred), annot=True, fmt='2.0f')
plt.title('Random Forest Confusion Matrix')

test.info()
train.info()

# Predict the ``test`` frame with the fitted model and write the result.
y_pred = model.predict(test)
df = pd.DataFrame(y_pred)
df.to_csv('pred.csv')
def regression():
    """Fit a linear price model on vehicles.csv and store results in MongoDB.

    Reads ``vehicles.csv`` from the directory named by the ``DIR-WORKING``
    environment variable, fits a linear regression of ``priceEur`` on the
    selected feature columns, computes 5-fold cross-validated price
    predictions and writes the predicted price, the saving relative to the
    observed price and a few derived fields back to the ``mobilede``
    collection of the MongoDB instance named by ``MONGODB_URL``.

    Side effects: silences all warnings, changes the pandas float display
    format, prints progress to stdout and replaces documents in MongoDB.
    """
    warnings.filterwarnings('ignore')
    warnings.simplefilter('ignore')
    pd.options.display.float_format = '{:.2f}'.format
    # init_notebook_mode(connected=True)

    connection = MongoClient(os.environ["MONGODB_URL"])
    db = connection.admin.mobilede

    # The original read the same file a second time into a variable that was
    # never used; a single read is enough.
    allData = pd.read_csv(os.environ['DIR-WORKING'] + '/vehicles.csv',
                          delimiter=';')

    # Print some basic data
    print("Price of all vehicles: {:6.2f} EUR".format(
        allData['priceEur'].mean()))

    # decide on columns - features and target - with which we are going to work
    target_column = 'priceEur'
    feature_columns = [
        'id', 'kmState', 'numOfPrevOwners', 'makeModel', 'power',
        'firstRegistration', 'derivedKmPerYear', 'emissionSticker', 'plz',
        'numOfSeats'
    ]
    feature_and_target_columns = list(feature_columns)
    feature_and_target_columns.append(target_column)
    regressionData = allData[feature_and_target_columns]

    # transform placeholder strings to NaN and drop the rows where at least
    # one feature value is missing (np.nan: the np.NaN alias is gone in
    # NumPy 2.0)
    prevSize = regressionData.shape[0]
    regressionData = regressionData.replace('unknown', np.nan)
    regressionData = regressionData.replace('NaN', np.nan)
    regressionData = regressionData.replace('nan', np.nan)
    regressionData = regressionData.dropna()

    # encode the categorical features as integer category codes
    for column in ('makeModel', 'plz', 'firstRegistration',
                   'emissionSticker'):
        regressionData[column] = regressionData[column].astype(
            'category').cat.codes
    regressionData['numOfPrevOwners'] = regressionData[
        'numOfPrevOwners'].astype('float64')

    # remember the order of IDs, and do not use it in regression
    idValues = regressionData['id']
    del regressionData['id']
    feature_columns.remove('id')
    feature_and_target_columns.remove('id')

    print("Left {}/{} entries after clean up and drop".format(
        regressionData.shape[0], prevSize))

    featuresData = regressionData[feature_columns]
    y = regressionData[target_column]

    # Calculate Regression
    # NOTE(review): the `normalize` argument was removed from
    # LinearRegression in newer scikit-learn releases - confirm against the
    # pinned version before upgrading.
    regr = linear_model.LinearRegression(normalize=True)
    regr.fit(featuresData, y)

    print()
    for name, coef in zip(feature_columns, regr.coef_):
        print("{}: {}".format(name, coef))

    # out-of-fold price prediction for every remaining vehicle
    predicted = cross_val_predict(regr, featuresData, y, cv=5)

    # # x - predicted values
    # # y - actual values
    # fix, ax = plt.subplots()
    # ax.scatter(y, predicted, color='green', s=9)
    # ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=1)
    # ax.set_xlabel("Observed")
    # # TODO: add regression coefficient to the plot
    # ax.set_ylabel("Predicted")
    # # plt.savefig(os.environ['DIR-DATA'] + '/price_observed_vs_predicted.png')

    for itemId, observedValue, predictedValue in zip(idValues, y, predicted):
        itemIdStr = str(itemId)
        origRow = allData.loc[allData['id'] == itemId]

        # .iloc[0] replaces Series.asobject[0], which no longer exists in
        # current pandas; the formatted text is the same.
        goneOnStr = "{}".format(origRow['goneOn'].iloc[0])
        firstSeenOn = "{}".format(origRow['firstSeenOn'].iloc[0])
        if "nan" not in goneOnStr:
            daysOnline = (parser.parse(goneOnStr) -
                          parser.parse(firstSeenOn)).days
        else:
            # vehicle is still listed (no goneOn date)
            daysOnline = -1

        try:
            derivedKmPerYear = "{:.0f}".format(
                origRow['derivedKmPerYear'].iloc[0])
        except Exception as e:
            print("Failed on derivedKmPerYear, id {} with {}".format(
                itemId, e))
            # Previously the variable kept the value of the preceding row
            # (or was unbound on the first row); store an explicit blank.
            derivedKmPerYear = ""

        dbItem = db.find_one({"id": itemIdStr})
        if dbItem is None:
            # Nothing to update; the original crashed with TypeError here.
            print("No database entry for id {}, skipping".format(itemIdStr))
            continue

        dbItem["daysOnline"] = daysOnline
        dbItem["derivedKmPerYear"] = derivedKmPerYear
        dbItem["predictedPrice"] = '{:.0f}'.format(predictedValue)
        dbItem["diffSaving"] = int(predictedValue - observedValue)
        dbItem["diffSavingPercent"] = '{:.03f}'.format(
            (predictedValue - observedValue) / observedValue)
        dbItem["inputIsRegressionExcluded"] = ""
        dbItem["inputIsFavoured"] = ""

        # Collection.update() was removed in PyMongo 4; replace_one performs
        # the same whole-document replacement of the first match.
        db.replace_one({"id": itemIdStr}, dbItem)

    print("*** DONE **** DONE ***")
# Wrap the Keras model factory so it can be used with scikit-learn's
# cross-validation helpers.
neural_network = KerasClassifier(build_fn=create_network, epochs=250)
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
# Binarize the labels and keep the first column of each encoded row.
clinicalOutput = np.array([number[0] for number in lb.fit_transform(clinicalOutput)])
# 10-fold cross-validation collecting four metrics per fold.
results = cross_validate(neural_network, clinicalInput, clinicalOutput,cv=10,scoring=("accuracy","f1","recall","precision"))
import matplotlib.pyplot as plt
# One curve per metric, one point per fold.
plt.plot(results["test_accuracy"],color="c")
plt.plot(results["test_f1"],color="m")
plt.plot(results["test_recall"],color="y")
plt.plot(results["test_precision"],color="k")
plt.title("Model Information (CNNLSTM)")
plt.ylabel("Model Performance")
plt.xlabel("Number of Folds")
plt.legend(["Accuracy","F1-Score","Recall","Precision"], loc="lower right")
plt.show()
# Out-of-fold predictions (this runs a second, independent 10-fold pass).
y_pred = cross_val_predict(neural_network, clinicalInput, clinicalOutput, cv=10)
# Report the mean of each metric and the ROC AUC.
from sklearn.metrics import roc_auc_score
print("Accuracy result: ", np.mean(results["test_accuracy"]))
print("Recall result: ", np.mean(results["test_recall"]))
print("Precision result: ", np.mean(results["test_precision"]))
print("F1 result: ", np.mean(results["test_f1"]))
# NOTE(review): y_pred comes from the default `predict` method, so the AUC
# is presumably computed from hard labels rather than scores - confirm
# whether method='predict_proba' was intended.
print("ROC: ", roc_auc_score(clinicalOutput, y_pred))
]
# Autoscaling: centre the training data and scale it by its standard deviation
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
# Kernel selection by cross-validation
cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True)  # cross-validation split settings
r2cvs = []  # empty list; collects the cross-validated r2 of each kernel
for index, kernel in enumerate(kernels):
    print(index + 1, '/', len(kernels))
    model = GaussianProcessRegressor(alpha=0, kernel=kernel)
    estimated_y_in_cv = np.ndarray.flatten(
        cross_val_predict(model, autoscaled_x_train, autoscaled_y_train, cv=cross_validation))
    # undo the autoscaling so r2 is computed on the original scale
    # NOTE(review): ddof=1 is explicit here but not in the scaling above -
    # the two agree only if y_train.std() already defaults to ddof=1; confirm.
    estimated_y_in_cv = estimated_y_in_cv * y_train.std(
        ddof=1) + y_train.mean()
    r2cvs.append(r2_score(y_train, estimated_y_in_cv))
optimal_kernel_number = np.where(
    r2cvs == np.max(r2cvs))[0][0]  # index of the kernel with the largest cross-validated r2
optimal_kernel = kernels[optimal_kernel_number]  # kernel with the largest cross-validated r2
print('クロスバリデーションで選択されたカーネル関数の番号 :', optimal_kernel_number)
print('クロスバリデーションで選択されたカーネル関数 :', optimal_kernel)
# Model building
model = GaussianProcessRegressor(alpha=0, kernel=optimal_kernel)  # declare the GPR model
model.fit(autoscaled_x_train, autoscaled_y_train)  # fit the model
# Estimation for the training data
# Dump the complete dataset (no element truncation) to a text file.
np.set_printoptions(threshold=np.inf)
with open("dataset.txt", "w") as f:
    f.write(str(dataset))
### Machine learning
# Split into training and test sets
X = dataset["data"]
y = dataset["target"]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=0)
# Build the prediction model
print("開始建立預測模型...")
time.sleep(2)
reg = LinearRegression()
# out-of-fold predictions over the full dataset (10 folds)
predicted = cross_val_predict(reg, X, y, cv=10)
reg.fit(X_train, y_train)
# score() of the fitted regressor on each split
accuracy_train = reg.score(X_train, y_train)
accuracy_test = reg.score(X_test, y_test)
predict_y = reg.predict(X_test)
time.sleep(1)
print("建立完成")
print()
time.sleep(2)
print("訓練集預測分數為 %s" % (accuracy_train))
print("測試集預測分數為 %s" % (accuracy_test))
# Plot the prediction results
plt.scatter(predicted, y, s=2)
# NOTE(review): this plots the test predictions against themselves, so the
# red points always lie on the diagonal - confirm whether y_test was meant.
plt.plot(predict_y, predict_y, 'ro')
# dashed identity line spanning the range of y
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
)
# The scorer returns negated MSE, so multiply by -1 to print the MSE per fold.
print(-1 * cross_val_score(lre,
                           x_data[['horsepower']],
                           y_data,
                           cv=4,
                           scoring='neg_mean_squared_error'))
print(
    'Calculate the average R^2 using two folds, find the average R^2 for the second fold utilizing the horsepower as a feature :'
)
# Two-fold cross-validation with the estimator's default score; print fold 2.
Rcross1 = cross_val_score(lre, x_data[['horsepower']], y_data, cv=2)
print(Rcross1[1])
print(
    'You can also use the function <cross_val_predict> to predict the output. The function splits up the data into the specified number of folds, using one fold to get a prediction while the rest of the folds are used as test data. First import the function:'
)
# Out-of-fold predictions for every row, using four folds.
yhat = cross_val_predict(lre, x_data[['horsepower']], y_data, cv=4)
print(yhat[0:5])
#part 2: Over/Under fitting and model selection
# Fit a multi-feature linear model and show the first predictions on the
# training and on the test data.
lr = LinearRegression()
lr.fit(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']],
       y_train)
yhat_train = lr.predict(
    x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print(yhat_train[0:5])
yhat_test = lr.predict(
    x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print(yhat_test[0:5])
scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=scoring) # outer print('Scores: {}'.format(scores['test_score'])) print('Mean score: {}'.format(np.mean(scores['test_score']))) # Creamos clasificador 'tonto' y obtenemos resultados también con validación cruzada (CV=5) para tener resultados más realistas dummy_clf = DummyClassifier(strategy='most_frequent', random_state=random_state) dummy_scores = cross_validate(estimator=dummy_clf, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=scoring) print('Dummy scores: {}'.format(dummy_scores['test_score'])) print('Dummy mean score: {}'.format(np.mean(dummy_scores['test_score']))) # Matriz de confusion results = cross_val_predict(grid_search, X=X, y=y, cv=5) conf_m = confusion_matrix(y, results, labels=[1, 0]) print(conf_m) # F1_Score print(f1_score(y, results))
'model__gamma': [3],
'model__max_depth': [4]
}
# Grid search over the pipeline parameters, scored by ROC AUC.
model = GridSearchCV(pipe, param_grid, cv=cfg["folds"], scoring='roc_auc')
logger.info("Getting best model...")
model.fit(X, y)
logger.info("Best Params: {}".format(model.best_params_))
# Keep only the best pipeline found by the search.
model = model.best_estimator_
logger.info("Fitting model on upscaled X...")
model.fit(X_up, y_up)
logger.info("Predicting score (w/Cross-Val) on X...")
# Out-of-fold probability of the positive class (column 1) on X.
# NOTE(review): cross_val_predict refits the estimator per fold on X/y, so
# this score presumably does not reflect the fit on X_up/y_up - confirm.
results = cross_val_predict(model,
                            X,
                            y,
                            cv=cfg["folds"],
                            method='predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info("normalized gini score on training set is {}".format(score))
logger.info("Loading and predicting on Test set...")
test = load_file("test")
# Predict with the model fitted on the upscaled data and write a submission.
test['target'] = model.predict_proba(test)[:, 1]
write_submission_file(test, columns=['target'], name='xgb-ups')
logger.info("Finished with time {:.3f} minutes".format(
    (time.time() - start) / 60.0))
trueY_60_20 = trueY(splitPseudonym(selectData(60, 20))) trueY_60_10 = trueY(splitPseudonym(selectData(60, 10))) trueY_60_30 = trueY(splitPseudonym(selectData(60, 30))) # for train_ix, test_ix in kfold.split(X_60_20, y_60_20): # # select rows # train_X, test_X = X_60_20[train_ix], X_60_20[test_ix] # train_y, test_y = y_60_20[train_ix], y_60_20[test_ix] # print("train_X: ", train_X, type(train_X), train_X.shape) # print("test_X: ", test_X, type(test_X), test_X.shape) # print("train_y: ", train_y, type(train_y), train_y.shape) # print("test_y: ", test_y, type(test_y), test_y.shape) predictions_20_20_hat = cross_val_predict(clf, X_20_20, y_20_20, cv=kfold, method='predict_proba') predictions_40_20_hat = cross_val_predict(clf, X_40_20, y_40_20, cv=kfold, method='predict_proba') predictions_60_20_hat = cross_val_predict(clf, X_60_20, y_60_20, cv=kfold, method='predict_proba') predictions_60_10_hat = cross_val_predict(clf, X_60_10, y_60_10,
# Compare the model's prediction for one sample with the recorded target.
print("\n")
print("Model predicted for house {0} value {1}".format(did, linear_regression_prediction))
print("\n")
print("Real value for house {0} is {1}".format(did, bmd_test_target[did]))
# model evaluation on the held-out data
bmd_mean_square_error = mean_squared_error(bmd_test_target, linear_regression.predict(bmd_test_data))
print("\n")
print("Mean square error of a learned model: %.3f " % bmd_mean_square_error)
bmd_r2_score = r2_score(bmd_test_target, linear_regression.predict(bmd_test_data))
print("\n")
# The value printed under "Variance score" is the r2_score computed above.
print(f"Variance score: %.3f" % bmd_r2_score)
print("\n")
print('Coefficients of a learned model: \n', linear_regression.coef_)
# 4-fold cross-validation of a fresh LinearRegression on the full dataset.
scores = cross_val_score(LinearRegression(), bmd['data'], bmd['target'], cv=4)
print("\n")
print(f"Cross-validation score: {scores}")
# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(linear_regression, bmd['data'], bmd['target'], cv=4)
# Measured vs. predicted scatter with a dashed identity line.
fig, ax = plt.subplots()
ax.scatter(bmd['target'], predicted, edgecolors=(0, 0, 0))
ax.plot([bmd['target'].min(), bmd['target'].max()],
        [bmd['target'].min(), bmd['target'].max()],
        'k--',
        lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
scores = cross_val_score(logreg, iris.data, iris.target ,cv = 3) # In[418]: scores # In[414]: from sklearn.model_selection import cross_val_predict iris = load_iris() logreg = LogisticRegression() scores = cross_val_predict(logreg, iris.data, iris.target ,cv = 5) # In[415]: scores # In[420]: from sklearn.model_selection import KFold kfold = KFold(n_splits = 3) cross_val_score(logreg, iris.data, iris.target, cv = kfold)
y_train_folds = y_train[train_index] x_test_fold = x_train[test_index] y_test_fold = y_train[test_index] clone_clf.fit(x_train_folds, y_train_folds) y_pred = clone_clf.predict(x_test_fold) n_correct = sum(y_pred == y_test_fold) print('accuracy:', (n_correct / len(y_pred))) my_cross_val_score(sgd_clf, x_train, y_train_9, cv=3) # %% # 混淆矩阵 from sklearn.model_selection import cross_val_predict y_train_predict = cross_val_predict(sgd_clf, x_train, y_train_9, cv=3) print(y_train_predict.shape) y_train_predict[:5] # %% from sklearn.metrics import confusion_matrix confusion_matrix(y_train_9, y_train_predict) # %% from sklearn.metrics import precision_score, recall_score, f1_score print('precision_score:', precision_score(y_train_9, y_train_predict)) print('recall_score:', recall_score(y_train_9, y_train_predict)) print('f1_score:', f1_score(y_train_9, y_train_predict)) # %%
# In[ ]: lreg = LogisticRegression() lreg_yhat= lreg.fit(X, y).predict(X) lreg_sas = accuracy_score(y, lreg_yhat) lreg_cv5s = cross_val_score(lreg, X, y, cv=5, n_jobs=-1).mean() lreg_l1os = cross_val_score(lreg, X, y, cv=LeaveOneOut().split(X), n_jobs=-1).mean() print('Self Accuracy Score : {}'.format(lreg_sas)) print('CV5 Score : {}'.format(lreg_cv5s)) print('CVLeave1Out Score : {}'.format(lreg_l1os)) lreg_pvsa_survival = np.column_stack((cross_val_predict(lreg, X, y, cv=5, n_jobs=-1), y)) print('Predicted Survival : {}'.format(lreg_pvsa_survival[:,0].mean())) print('Actual Survival : {}'.format(lreg_pvsa_survival[:,1].mean())) print(classification_report(y, lreg_pvsa_survival[:,0], target_names=['dead','notdead'])) cm = confusion_matrix(y,lreg_pvsa_survival[:,0]) ax = plt.axes() sns.heatmap(cm, ax=ax, fmt='d', square=True, annot=True, vmin=0) ax.set_xlabel('Predicted') ax.set_ylabel('Actual') ax.set_title('LREG - Survival - Confusion Matrix') # In[ ]:
# pprint(y) # iris = sns.load_dataset("iris") # X = iris.values[50:150, 0:4] # y = iris.values[50:150, 4] # sns.pairplot(iris,hue='species') # sns.plt.show() # 2-nd logistic regression using sklearn # log-regression lib model log_model = LogisticRegression() m = np.shape(X)[0] # 10-folds CV y_pred = cross_val_predict(log_model, X, y, cv=10) print(metrics.accuracy_score(y, y_pred)) # LOOCV # loo = LeaveOneOut() # accuracy = 0 # for train, test in loo.split(X): # log_model.fit(X[train], y[train]) # fitting # y_p = log_model.predict(X[test]) # if y_p == y[test]: accuracy += 1 # print(accuracy / np.shape(X)[0]) # m = np.shape(X)[0] # scores_loo = cross_val_score(log_model, X, y, cv=m) # print(scores_loo)
# For the regression case, scan the number of PLS components and plot the
# training r2 and the cross-validated r2 for each candidate.
# NOTE(review): indentation was reconstructed from a collapsed line; the
# plotting calls are assumed to belong to the `if` body - confirm.
if regression_flag == 1:
    # candidates run from 1 up to min(rank of X, configured maximum)
    PLScomponents = np.arange(
        1,
        min(
            np.linalg.matrix_rank(autoscaled_Xtrain) + 1,
            maxPLScomponentnumber + 1), 1)
    r2all = list()
    r2cvall = list()
    for PLScomponent in PLScomponents:
        plsmodelincv = PLSRegression(n_components=PLScomponent)
        plsmodelincv.fit(autoscaled_Xtrain, autoscaled_ytrain)
        # fitted values on the training data
        calculatedyincv = np.ndarray.flatten(
            plsmodelincv.predict(autoscaled_Xtrain))
        # out-of-fold estimates
        estimatedyincv = np.ndarray.flatten(
            model_selection.cross_val_predict(plsmodelincv,
                                              autoscaled_Xtrain,
                                              autoscaled_ytrain,
                                              cv=fold_number))
        # map both back to the original scale of ytrain
        calculatedyincv = calculatedyincv * ytrain.std(ddof=1) + ytrain.mean()
        estimatedyincv = estimatedyincv * ytrain.std(ddof=1) + ytrain.mean()
        # r2 = 1 - residual sum of squares / total sum of squares
        r2all.append(
            float(1 - sum((ytrain - calculatedyincv)**2) / sum(
                (ytrain - ytrain.mean())**2)))
        r2cvall.append(
            float(1 - sum((ytrain - estimatedyincv)**2) / sum(
                (ytrain - ytrain.mean())**2)))
    plt.plot(PLScomponents, r2all, 'bo-')
    plt.plot(PLScomponents, r2cvall, 'ro-')
    plt.ylim(0, 1)
    plt.xlabel('Number of PLS components')
    plt.ylabel('r2(blue), r2cv(red)')
features_tfidf = pandas.DataFrame(tfidfX.todense()) # Assign column names to make it easier to print most useful features later features_tfidf.columns = tfidf.get_feature_names() features_combined = pandas.concat([features_tfidf, derived_features], axis=1) logging.info('Combined features shape:') logging.info(features_combined.shape) svm_object = LogisticRegression() classifier = OneVsRestClassifierBalance(svm_object) logging.info('Getting per-class scores') y_pred = cross_val_predict(classifier, features_combined.values, labels_matrix, cv=10) logging.info('Computing overall results') scores_f1 = cross_val_score(classifier, features_combined.values, labels_matrix, cv=10, scoring='f1_weighted').mean() logging.info(classification_report(labels_matrix, y_pred, digits=3)) logging.info('f1_weighted : {0}'.format(scores_f1)) end = time.time() runtime_in_seconds = end - start logging.info('Processing completed in {0}'.format(runtime_in_seconds))
random_state=1)
# Perform cross-validation
scores = cross_val_score(cv=kf,
                         estimator=clf,
                         X=X_train,
                         y=y_train,
                         scoring='accuracy'
                         )
print('Scores: ' + str(scores))
# mean accuracy with a +/- two-standard-deviation band
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2*scores.std()))
# Gather out-of-fold predictions using the same splitter
predictions = cross_val_predict(cv=kf,
                                estimator=clf,
                                X=X_train,
                                y=y_train
                                )
accuracy_score = metrics.accuracy_score(y_train, predictions)
print('accuracy score: '+str(accuracy_score))
confusion_matrix = metrics.confusion_matrix(y_train, predictions)
class_names = encoder.classes_.tolist()
# Train the classifier on the full training set
clf.fit(X=X_train, y=y_train)
# Bundle the fitted classifier with the label classes and the feature scaler
model = {'classifier': clf, 'classes': encoder.classes_, 'scaler': X_scaler}
def predict(self):
    '''Predict with the tuned extra-trees classifier and log basic stats.

    Stores test-set predictions in ``self.y_pred`` / ``self.y_pred_proba``
    and 3-fold cross-validated training-set predictions in
    ``self.y_train_CV_pred`` / ``self.y_train_CV_pred_proba``, then appends
    accuracy and class-balance statistics for the test set to
    ``extreme_randomforest_randomsearch.txt`` in ``self.output_dir``.
    '''
    log_path = os.path.join(self.output_dir,
                            'extreme_randomforest_randomsearch.txt')

    print('*' * 80)
    print('* Predict using new forest and test/train_CV set')
    print('*' * 80)

    # try out how well the classifier works to predict from the test set
    self.y_pred = self.extra_clf_rand_new.predict(self.X_metrix_test)
    self.y_pred_proba = self.extra_clf_rand_new.predict_proba(
        self.X_metrix_test)
    with open(log_path, 'a') as text_file:
        text_file.write(
            'Saving predictions and probabilities for X_metrix_test in y_pred and probabilities in y_pred_proba \n'
        )

    # alternative way to not have to use the test set
    self.y_train_CV_pred = cross_val_predict(self.extra_clf_rand_new,
                                             self.X_metrix_train,
                                             self.y_train,
                                             cv=3)
    self.y_train_CV_pred_proba = cross_val_predict(self.extra_clf_rand_new,
                                                   self.X_metrix_train,
                                                   self.y_train,
                                                   cv=3,
                                                   method='predict_proba')
    with open(log_path, 'a') as text_file:
        text_file.write(
            'Saving predictions and probabilities for X_metrix_train with 3-fold CV in y_train_pred \n'
        )

    print('*' * 80)
    print('* Calculate prediction stats')
    print('*' * 80)

    def prediction_stats(y_test, y_pred, directory):
        # The original ignored the y_test argument and always read
        # self.y_test; the helper now uses what it is given.

        # calculate accuracy
        y_accuracy = metrics.accuracy_score(y_test, y_pred)

        # examine the class distribution of the testing set (using a Pandas Series method)
        class_dist = y_test.value_counts()

        # because y_test only contains ones and zeros, the mean is the
        # fraction of ones and its complement the fraction of zeros
        ones = y_test.mean()
        zeros = 1 - ones

        # null accuracy: accuracy of always predicting the majority class
        # (only for binary classification problems coded as 0/1)
        null_acc = max(ones, zeros)

        with open(
                os.path.join(directory,
                             'extreme_randomforest_randomsearch.txt'),
                'a') as text_file:
            text_file.write(
                'Accuracy score or agreement between y_test and y_pred: %s \n'
                % y_accuracy)
            text_file.write('Class distribution for y_test: %s \n' %
                            class_dist)
            text_file.write('Percent 1s in y_test: %s \n' % ones)
            text_file.write('Percent 0s in y_test: %s \n' % zeros)
            text_file.write('Null accuracy in y_test: %s \n' % null_acc)

    prediction_stats(self.y_test, self.y_pred, self.output_dir)
}
# Grid search over the logistic regression parameters.
grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
grid_log_reg.fit(X_train, y_train)
# Take the logistic regression model with the best parameters found
log_reg = grid_log_reg.best_estimator_

# In[30]:

from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# Cross-validated decision-function scores for the logistic regression
# (the original note described this as building a dataframe of all scores
# and classifiers)
log_reg_pred = cross_val_predict(log_reg,
                                 X_train,
                                 y_train,
                                 cv=5,
                                 method="decision_function")

# In[31]:

from sklearn.metrics import roc_auc_score
print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))

# In[32]:

# Compute the ROC curve, which measures the model's performance. Logistic
# regression is the model retained here to apply to our data.
log_fpr, log_tpr, log_thresold = roc_curve(y_train, log_reg_pred)
#train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))]) if test and train data are not separate #do the PCA first input_train, input_validate, output_train, output_validate = train_test_split( data_train_nzv, classe_col_train, test_size=0.3, random_state=0) std_scale = preprocessing.StandardScaler().fit(input_train) input_train_std = std_scale.transform(input_train) input_valid_std = std_scale.transform(input_validate) input_test_std = std_scale.transform(data_test_nzv) pca_std = PCA(n_components=0.9).fit(input_train_std) input_train_std = pca_std.transform(input_train_std) input_valid_std = pca_std.transform(input_valid_std) data_test_nzv_std = pca_std.transform(data_test_nzv) dt = tree.DecisionTreeClassifier() #can provide depth value in () model = dt.fit(X=input_train_std, y=output_train) predictions = cross_val_predict(model, X=input_valid_std, y=output_validate, cv=10) #njobs for number of CPUs print('Decision Tree Score:', metrics.accuracy_score(output_validate, predictions)) #print “Score:”, model.score(X_valid, y_valid) #Visualize Tree tree.export_graphviz(model, out_file='tree.dot') dot_data = StringIO() tree.export_graphviz(model, out_file=dot_data) graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) graph.write_png("decision_tree.png") #fit random forest rf = RandomForestClassifier() model_rf = rf.fit(X=input_train_std, y=output_train) predictions_rf = cross_val_predict(model_rf, X=input_valid_std,
def sample_and_cross_val_clf(train_size=200, noise_corr=2, dim=3, sep=.5,
                             random_state=0):
    """Run one experiment and return its rows for the results dataframe.

    Simulated data are split into a training part of ``train_size`` samples
    and a test part of 10000 samples. The ROC AUC on the test part is
    compared with the cross-validated ROC AUC obtained on the training part
    for each cross-validation strategy.

    Args:
        train_size: Number of training samples.
        noise_corr: Forwarded to ``mk_data``.
        dim: Forwarded to ``mk_data``.
        sep: Forwarded to ``mk_data`` as ``separability``.
        random_state: Seed for the data generation and the CV splitters.

    Returns:
        A list with one dict per cross-validation strategy holding the
        strategy name, the experiment parameters, the test-set score, the
        difference between mean CV score and test-set score
        (``score_error``) and the standard error of the CV scores
        (``score_sem``). Scores are NaN when cross-validation failed.
    """
    clf = LinearSVC_continuous(penalty='l2', fit_intercept=True)

    n_samples = train_size + 10000
    X, y = mk_data(n_samples=n_samples,
                   separability=sep,
                   random_state=random_state,
                   noise_corr=noise_corr,
                   dim=dim)
    X_train = X[:train_size]
    y_train = y[:train_size]
    X_test = X[train_size:]
    y_test = y[train_size:]

    validation_score = roc_auc_score(
        y_test,
        clf.fit(X_train, y_train).predict(X_test))

    # Create 10 blocks of evenly-spaced labels for GroupShuffleSplit
    groups = np.arange(train_size) // (train_size // 10)

    scores = list()
    for name, cv in [('10 repeated 10-fold',
                      RepeatedKFold(n_splits=10,
                                    n_repeats=10,
                                    random_state=random_state)),
                     ('50 splits',
                      GroupShuffleSplit(n_splits=50,
                                        random_state=random_state))]:
        # Cross-validation is best effort: a failing strategy yields NaN
        # instead of aborting the experiment. `except Exception` (rather
        # than the original bare `except`) keeps that behaviour without
        # swallowing KeyboardInterrupt or SystemExit.
        try:
            cv_scores = cross_val_score(clf,
                                        X_train,
                                        y_train,
                                        groups=groups,
                                        scoring='roc_auc',
                                        cv=cv)
        except Exception:
            if name == '10 repeated 10-fold':
                # Fall back to one AUC over pooled 10-fold predictions.
                try:
                    cv_scores = [
                        roc_auc_score(
                            y_train,
                            cross_val_predict(clf,
                                              X_train,
                                              y_train,
                                              groups=groups,
                                              cv=10))
                    ]
                except Exception:
                    cv_scores = [np.nan]
            else:
                cv_scores = [np.nan]

        scores.append(
            dict(cv_name=name,
                 validation_score=validation_score,
                 train_size=train_size,
                 dim=dim,
                 noise_corr=noise_corr,
                 sep=sep,
                 score_error=(np.mean(cv_scores) - validation_score),
                 score_sem=(np.std(cv_scores) / np.sqrt(len(cv_scores)))))

    return scores
# Print every parameter combination tried by the grid search together with
# its mean test score rounded to four decimals.
for i, dicionario in enumerate(grid.cv_results_['params']):
    z = dicionario.copy()
    z.update({'mean': round(medias_teste[i], 4)})
    print(z)

# Best parameters and accuracy of the best GaussianNB (bare expressions:
# they are only displayed when run interactively)
grid.best_params_
grid.best_score_

# Create the classifier
clf = GaussianNB(var_smoothing=1e-10)

# Score, accuracy and metrics of the model using cross-validation
resultado = cross_val_score(clf, previsores, classe, cv=5, scoring='accuracy')
resultados = cross_val_predict(clf, previsores, classe, cv=5)
valor_classes = sorted(np.unique(classe))
print(
    f'O desvio padrão da soma de todos os folds do modelo GausianNB é de {round(resultado.std(), 4)}'
)
print(
    f'A acurácia do modelo GausianNB é de {round(metrics.accuracy_score(classe,resultados) * 100, 2)}%'
)
# `labels` is keyword-only in current scikit-learn; passing it positionally
# (as the original did) raises TypeError there.
print(
    f'As métricas do modelo GausianNB é:\n {metrics.classification_report(classe, resultados, labels=valor_classes)}'
)

# Build the model using a hold-out split
p_treinamento, p_teste, c_treinamento, c_teste = train_test_split(
    previsores, classe, test_size=0.2, random_state=0)
clf.fit(p_treinamento, c_treinamento)
# Single-participant analysis: for each n-back window and each classifier,
# get leave-one-out predictions and relate them to response correctness.
# NOTE(review): indentation was reconstructed from a collapsed line; every
# statement after the inner `for` is assumed to belong to it - confirm.
participant = 'fcm'
df_sub = df[df['participant'] == participant]
# for 1-back to 4-back
for n_back in np.arange(1,5):
    X,y,groups = utils.get_features_targets_groups(
        df_sub.dropna(),
        n_back = n_back,
        names = name_for_scale,
        independent_variables = feature_names,
        dependent_variable = [target_name,'correctness'])
    X,y,groups = shuffle(X,y,groups)
    # first target column is the label to decode, second is correctness
    y,correctness = y[:,0],y[:,1]
    for model_name,model in utils.make_clfs().items():
        cv = LeaveOneOut()
        print('{}-back,{}'.format(n_back,model_name))
        preds = cross_val_predict(model,X,y,groups=groups,cv=cv,method='predict',verbose=2,n_jobs=4)
        df_pred_ = pd.DataFrame(np.vstack([preds,correctness]).T,columns = ['preds','correct'])
        # marginal and joint proportions; 1 is added to both the count and
        # the total in every estimate
        p_correct = float(np.sum(correctness == 1)+1) / (len(correctness)+1)
        p_incorrect = float(np.sum(correctness == 0)+1) / (len(correctness)+1)
        p_aware = float(np.sum(preds == 1)+1) / (len(preds)+1)
        p_unaware = float(np.sum(preds == 0)+1) / (len(preds)+1)
        p_correct_aware = float(np.sum(np.logical_and(correctness == 1, preds == 1))+1) / (len(df_pred_)+1)
        p_correct_unaware = float(np.sum(np.logical_and(correctness == 1, preds == 0))+1) / (len(df_pred_)+1)
        p_incorrect_aware = float(np.sum(np.logical_and(correctness == 0, preds == 1))+1) / (len(df_pred_)+1)
        p_incorrect_unaware = float(np.sum(np.logical_and(correctness == 0, preds == 0))+1) / (len(df_pred_)+1)
        # rank correlation between predictions and correctness
        correlation,pval = stats.spearmanr(preds,correctness)
        results['sub'].append(participant)
        results['model'].append(model_name)
        results['corre'].append(correlation)
        results['pval'].append(pval)
        # conditional proportion: joint divided by marginal
        results['p(correct|awareness)'].append(p_correct_aware/p_aware)
def predict(self, X): return np.zeros((len(X), 1), dtype=bool) never_5_clf = Never5Classifier() cross_val_score2 = cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy") print("cross_val_score_never5", cross_val_score2) # 计算混淆矩阵 from sklearn.model_selection import cross_val_predict y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3) from sklearn.metrics import confusion_matrix confusion_matrix1 = confusion_matrix(y_train_5, y_train_pred) print("confusion_matrix", confusion_matrix1) # 计算查准率,查全率,F1 from sklearn.metrics import precision_score, recall_score precision_score1 = precision_score(y_train_5, y_train_pred) print("precision_score1", precision_score1) recall_score1 = recall_score(y_train_5, y_train_pred) print("recall_score1", recall_score1) from sklearn.metrics import f1_score f1_score = f1_score(y_train_5, y_train_pred) print("f1_score", f1_score)
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std() autoscaled_x_test = (x_test - x_train.mean()) / x_train.std() if method_name == 'pls': # CV による成分数の最適化 components = [] # 空の list の変数を作成して、成分数をこの変数に追加していきます同じく成分数をこの変数に追加 r2_in_cv_all = [] # 空の list の変数を作成して、成分数ごとのクロスバリデーション後の r2 をこの変数に追加 for component in range( 1, min(np.linalg.matrix_rank(autoscaled_x_train), max_number_of_principal_components) + 1): # PLS model = PLSRegression(n_components=component) # PLS モデルの宣言 estimated_y_in_cv = pd.DataFrame( cross_val_predict( model, autoscaled_x_train, autoscaled_y_train, cv=fold_number)) # クロスバリデーション推定値の計算し、DataFrame型に変換 estimated_y_in_cv = estimated_y_in_cv * y_train.std() + y_train.mean( ) # スケールをもとに戻す r2_in_cv = metrics.r2_score(y_train, estimated_y_in_cv) # r2 を計算 print(component, r2_in_cv) # 成分数と r2 を表示 r2_in_cv_all.append(r2_in_cv) # r2 を追加 components.append(component) # 成分数を追加 # 成分数ごとの CV 後の r2 をプロットし、CV 後のr2が最大のときを最適成分数に optimal_component_number = sample_functions.plot_and_selection_of_hyperparameter( components, r2_in_cv_all, 'number of components', 'cross-validated r2') print('\nCV で最適化された成分数 :', optimal_component_number) # PLS model = PLSRegression(n_components=optimal_component_number) # モデルの宣言 elif method_name == 'svr':
# Probabilidades de cada classe do conjunto de dados. # Usamos o predict_proba para visualizar os dados de probabilidade. # A visualização será um array com cada frase em sua respectiva posição. print (modelo.classes_) modelo.predict_proba(freq_testes).round(2) # In[ ]: # Vamos realizar uma avaliação do modelo utilizando a técnica de "Cross Validation" # Utilizaremos com 10 folds. Mais informações no link abaixo: # https://scikit-learn.org/stable/modules/cross_validation.html resultados = cross_val_predict(modelo, freq_tweets, classes, cv=10) # In[ ]: # Utilizando a matriz de confusão. Essa técnica é excelente para validação. # Temos como analisar os dados que foram interpretados em classes "incorretas". print (pd.crosstab(classes, resultados, rownames=['Real'], colnames=['Predito'], margins=True)) # In[ ]: # Vamos umsar o método metrics do sklearn. Vale a pena uma leitura na documentação.
# Hold out 30% of the data for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=1)
print(x_train.shape)
print(x_test.shape)
# Alternative estimators that were tried; only the Lasso line is active.
# linearReg = linear_model.ElasticNet(alpha=0.001,l1_ratio=0.1)
# linearReg = linear_model.LinearRegression()
# linearReg = linear_model.LassoCV(alphas=[0.001,0.1, 0.01, 0.5, 1, 3, 5, 7, 10, 20, 100], cv=5)
linearReg = linear_model.Lasso(alpha=0.01)
# linearReg = linear_model.Ridge(alpha=0.001)
# linearReg.fit(x_train,y_train)
# linearReg = Ridge()
# y_train is flattened to 1-D before fitting
linearReg.fit(x_train,y_train.values.ravel())
# print(linearReg.alpha_)
print(linearReg.intercept_)
print(linearReg.coef_)
# hold-out predictions
y_pred = linearReg.predict(x_test)
# out-of-fold predictions over the full dataset
# NOTE(review): cv=100 here, while the messages printed below say 10-fold -
# confirm which one is intended.
Y_pred = cross_val_predict(linearReg, x, y,cv=100)
print('MSE:')
print(metrics.mean_squared_error(y_test,y_pred))
print('RMSE:')
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
# Y_pred = cross_val_predict(linearReg, x, y, cv=50)
print("10折交叉验证MSE:", metrics.mean_squared_error(y, Y_pred))
print("10折交叉验证RMSE:", np.sqrt(metrics.mean_squared_error(y, Y_pred)))
plt.figure()
plt.title("Model Star")
plt.xlabel("Measured")
plt.ylabel("Predicted")
plt.ylim(0,3.5)
plt.grid(True)
# Standardize features: fit the scaler on the training data only and reuse
# it for the test data.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
#ML Algorithm
from sklearn import linear_model
clf = linear_model.LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
#Cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
# The returned scores are not stored (only visible when run interactively).
cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
# Out-of-fold predictions on the training data, used for the metrics below.
y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
cm = confusion_matrix(y_train, y_train_pred)
print(cm)
from sklearn.metrics import precision_score, recall_score
print("precision score = {0:.4f}".format(precision_score(
    y_train, y_train_pred)))
print("recall score = {0:.4f}".format(recall_score(y_train, y_train_pred)))
#Predicting results, confusion matrix
# Same metrics on the held-out test data, using the classifier fitted above.
y_pred = clf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("precision score = {0:.4f}".format(precision_score(y_test, y_pred)))
print("recall score = {0:.4f}".format(recall_score(y_test, y_pred)))
# Fit a logistic regression on every column except the target and predict
# on the same rows it was trained on.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
cols = loans.columns
train_cols = cols.drop("loan_status")
features = loans[train_cols]
target = loans["loan_status"]
lr.fit(features, target)
predictions = lr.predict(features)
## Cross Validation ##
# Replace the in-sample predictions with 3-fold out-of-fold predictions.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
lr = LogisticRegression()
predictions = cross_val_predict(lr, features, target, cv=3)
# NOTE(review): the new Series gets a default 0..n-1 index; the comparisons
# below align on index, so they presumably require `loans` to have the same
# default index - confirm.
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives