def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    """End-to-end check of StackingClassifier on iris, with and without
    feature passthrough.

    Parametrized (by the test harness) over ``cv``, ``final_estimator`` and
    ``passthrough``; asserts transform width before and after dropping a
    base estimator.
    """
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, y_test = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf = StackingClassifier(
        estimators=estimators, final_estimator=final_estimator, cv=cv,
        passthrough=passthrough
    )
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    X_trans = clf.transform(X_test)
    # 2 estimators x 3 classes = 6 stacked columns; passthrough appends the
    # 4 original iris features (checked below against X_test's last 4 cols).
    expected_column_count = 10 if passthrough else 6
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])

    # Dropping 'lr' must halve the stacked columns but keep passthrough intact.
    clf.set_params(lr='drop')
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)

    X_trans = clf.transform(X_test)
    expected_column_count_drop = 7 if passthrough else 3
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])
def test_stacking_classifier_iris(cv, final_estimator):
    """Smoke-test StackingClassifier on scaled iris data.

    Fits, predicts, checks the transform width, then re-checks everything
    after dropping one base estimator.
    """
    # Scaling up front avoids convergence warnings without needing a pipeline.
    splits = train_test_split(scale(X_iris), y_iris, stratify=y_iris,
                              random_state=42)
    X_train, X_test, y_train, y_test = splits

    base_learners = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf = StackingClassifier(estimators=base_learners,
                             final_estimator=final_estimator, cv=cv)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    # Two base estimators x three iris classes -> six stacked columns.
    X_trans = clf.transform(X_test)
    assert X_trans.shape[1] == 6

    # Dropping a base estimator must still fit and predict cleanly.
    clf.set_params(lr='drop')
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # Default final estimator (LogisticRegression) exposes decision_function.
        clf.decision_function(X_test)

    X_trans = clf.transform(X_test)
    assert X_trans.shape[1] == 3
def run():
    """Train a stacked XGBoost + GradientBoosting ensemble on the Australian
    credit dataset and report classification metrics plus a ROC curve.

    Reads ``./data/australian.csv`` (whitespace-separated, no header, column
    14 is the binary target), prints a classification report, ROC AUC and
    accuracy, and shows the ROC plot.
    """
    # Removed the duplicated/unused imports of the original (numpy, seaborn
    # twice, confusion_matrix) and the unused SVC instance.
    import pandas as pd
    import matplotlib.pyplot as pyplot
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier
    from sklearn.metrics import (accuracy_score, classification_report,
                                 roc_curve, roc_auc_score)
    from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression

    # Whitespace-separated file with no header row; column 14 is the label.
    df = pd.read_table("./data/australian.csv", sep=r'\s+', header=None)
    y = df[14]
    X = df.drop(columns=14)

    # Split features and target into train and test sets (stratified).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, stratify=y, test_size=0.4)

    # Instantiate the classifiers: two boosted-tree base learners stacked
    # under a logistic-regression meta-learner.
    estimators = [('xgb', XGBClassifier()),
                  ('gbdt', GradientBoostingClassifier(random_state=1))]
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=LogisticRegression())
    clf.fit(X_train, y_train)

    # Make predictions for the test set and report quality.
    y_pred_test = clf.predict(X_test)
    print(classification_report(y_test, y_pred_test))

    clf_probs = clf.predict_proba(X_test)
    # keep probabilities for the positive outcome only
    clf_probs = clf_probs[:, 1]
    clf_auc = roc_auc_score(y_test, clf_probs)
    print('ensemble: ROC AUC=%.3f' % (clf_auc))
    print("accuracy_score is %.3f" %
          (accuracy_score(y_test, y_pred_test, normalize=True)))

    # Plot the ROC curve for the ensemble.
    clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_probs)
    pyplot.plot(clf_fpr, clf_tpr, marker='.', label='Ensemble')
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    pyplot.show()
def week10(C, random_state, criterion, min_samples_leaf, max_leaf_samples,
           n_estimators, solver, cv, clazz, images):
    """Train bagging, SVM and random-forest base models on the cats-vs-dogs
    histograms, stack them under a logistic regression, and score test images.

    Returns a dict with the training accuracy of the stack and, for each
    name in ``images``, the stacked probability of class ``clazz``.
    """
    trainData, Y = catsvsdogs.train
    # Flip labels 0<->1; presumably so class indices match the expected
    # cat/dog convention downstream — TODO confirm against the dataset.
    Y = [(y + 1) % 2 for y in Y]

    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(
        criterion=criterion,                 # split criterion
        min_samples_leaf=min_samples_leaf,   # minimum number of samples per leaf
        max_leaf_nodes=max_leaf_samples,     # maximum number of leaves
        random_state=random_state)
    bagging = BaggingClassifier(
        tree,                                # base estimator
        n_estimators=n_estimators,           # number of trees
        random_state=random_state)
    bagging.fit(trainData, Y)

    from sklearn.svm import LinearSVC
    svm = LinearSVC(random_state=random_state, C=C)
    svm.fit(trainData, Y)

    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(
        n_estimators=n_estimators,           # number of trees
        criterion=criterion,                 # split criterion
        min_samples_leaf=min_samples_leaf,   # minimum number of samples per leaf
        max_leaf_nodes=max_leaf_samples,     # maximum number of leaves
        random_state=random_state)
    forest.fit(trainData, Y)

    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(solver=solver, random_state=random_state)

    # Stack the three (already fitted) base models; StackingClassifier refits
    # clones internally during fit.
    from sklearn.ensemble import StackingClassifier
    base_estimators = [('SVM', svm), ('Bagging DT', bagging),
                       ('DecisionForest', forest)]
    sclf = StackingClassifier(estimators=base_estimators, final_estimator=lr,
                              cv=cv)
    sclf.fit(trainData, Y)
    accuracy = sclf.score(trainData, Y)

    # Probability of the requested class for each named test image.
    probas = []
    for img in images:
        histt = catsvsdogs.test[img].reshape(1, -1)
        probas += [(img, sclf.predict_proba(histt)[0][clazz])]
    return {'accuracy': accuracy, 'probas': probas}
def test_stacking_classifier_drop_estimator():
    """A base estimator set to 'drop' must behave exactly as if omitted."""
    # Prescaling avoids convergence warnings without wrapping in a pipeline.
    X_train, X_test, y_train, _ = train_test_split(
        scale(X_iris), y_iris, stratify=y_iris, random_state=42
    )

    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    with_drop = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))]
    clf = StackingClassifier(
        estimators=[('svc', LinearSVC(random_state=0))],
        final_estimator=rf,
        cv=5,
    )
    clf_drop = StackingClassifier(estimators=with_drop, final_estimator=rf, cv=5)

    for model in (clf, clf_drop):
        model.fit(X_train, y_train)

    # Both models must agree on every exposed output.
    for method in ('predict', 'predict_proba', 'transform'):
        assert_allclose(getattr(clf, method)(X_test),
                        getattr(clf_drop, method)(X_test))
def Model_1(train, test):
    '''
    Trains the model and Saves the predictions in a CSV file

    train : Training set (needs 'Sequence' and 'label' columns)
    test : Test set (needs 'Sequence' and 'ID' columns)

    Writes Submission_1.csv (positive-class probabilities) and
    Predictions_1.csv (hard labels).
    '''
    # Preprocessing: one-hot encode each character position of the sequence.
    # BUG FIX: the original fitted a *separate* encoder on the test set, so
    # train/test feature columns could differ in count or order whenever the
    # character sets differ. Fit once on train and reuse for test;
    # handle_unknown='ignore' zero-fills characters unseen during training.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_train = encoder.fit_transform([[x for x in s] for s in train['Sequence']])
    X_test = encoder.transform([[x for x in s] for s in test['Sequence']])
    Y_train = train['label']

    # Balance classes by undersampling, then shuffle away the sampler's order.
    X_train, Y_train = RandomUnderSampler(random_state=100).fit_resample(
        X_train, Y_train)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=100)

    # Training: RF / MLP / KNN stacked under a logistic-regression meta-learner.
    estimators = [('rf', RandomForestClassifier(n_estimators=300, max_depth=45,
                                                min_samples_leaf=7,
                                                random_state=100)),
                  ('mlp', MLPClassifier(max_iter=200, random_state=100)),
                  ('knn', KNeighborsClassifier(n_neighbors=4))]
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(random_state=100),
        n_jobs=-1, verbose=1)
    clf.fit(X_train, Y_train)

    # Predicting: probabilities for the submission, hard labels separately.
    Y_pred = clf.predict(X_test)
    Y_prob = [x[1] for x in clf.predict_proba(X_test)]
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_1.csv", index=False)
    result["Label"] = Y_pred
    result.to_csv("Predictions_1.csv", index=False)
def ensemble_predictions(members, X_te, params):
    """Combine member classifiers' probability outputs.

    params["type"] selects the strategy:
      - "weighted": weighted mean of each member's predict_proba matrix,
        using params["weights"].
      - "stacked": wrap the members in a StackingClassifier with a
        LogisticRegression meta-learner, fit on params["X_tr"]/["y_tr"],
        and return its predict_proba on X_te.
    """
    assert params["type"] in ("weighted", "stacked")

    if params["type"] == "weighted":
        member_probs = np.array([m.predict_proba(X_te) for m in members])
        # Mean across ensemble members, weighted per member.
        return np.average(member_probs, weights=params["weights"], axis=0)

    # Stacked ensemble: only the final estimator is intended to be fitted here.
    named_members = [(f'expert_{idx}', member)
                     for idx, member in enumerate(members)]
    clf = StackingClassifier(estimators=named_members,
                             final_estimator=LogisticRegression())
    X_tr = params["X_tr"]
    print(X_tr.columns.tolist())
    y_tr = params["y_tr"]
    clf.fit(X_tr, y_tr.values.ravel())
    return clf.predict_proba(X_te)
# ('1_5', DecisionTreeClassifier(max_depth=9)), # ('1_6', RandomForestClassifier(max_depth=12, n_estimators=13, max_features=11)), ] stack_clf = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression(C=0.123456789, solver="lbfgs", max_iter=5000), cv=10) # stack_clf.fit(xtrain, ytrain) # stack_acc=stack_clf.score(xtest, ytest) # print('stack_acc',stack_acc) print('1') score=cross_validate(stack_clf, X_train, y_train, cv=3, scoring="roc_auc")["test_score"].mean() print(f"{score:.6f}") stack_clf.fit(X_train, y_train) pred=stack_clf.predict_proba(X_test)[:,1] pd.DataFrame({"id": original_test["id"], "target": pred}).to_csv("stacking_submission.csv", index=False) from sklearn.ensemble import VotingClassifier base_learners = [ # ('1_1',AdaBoostClassifier()), # ('1_2',GaussianNB()), #('1_1',MLPClassifier(alpha=1, max_iter=1000)), ('1_2',LogisticRegression(C=0.123456789, solver="lbfgs", max_iter=5000)), ('1_4', KNeighborsClassifier(n_neighbors=7,weights='distance')) # ('1_5', DecisionTreeClassifier(max_depth=9)), # ('1_6', RandomForestClassifier(max_depth=12, n_estimators=13, max_features=11)), ] v_clf = VotingClassifier(estimators=base_learners,voting='soft')
# Build a meta-feature frame: one column per upstream model's output
# (Yamnet predictions and SVM predictions).
X = pd.DataFrame({'Yamnet': y_predicted_yamnet, 'SVM': pd.Series(y_pred_svm)})
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svr', make_pipeline(StandardScaler(),
                                    LinearSVC(random_state=42)))]
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)
clf.fit(X_train, y_train).score(X_test, y_test)
# y_test == y_real.iloc[X_test.index]
y_pred_combined = clf.predict_proba(
    X_test)[:, 1]  # The probability of getting the output as 1 (cough)
Confusion_Matrix(y_test, y_pred_combined, pred_prob=True)

# Re-score the full dataset for the combined confusion matrix.
y_pred_combined = clf.predict_proba(X)[:, 1]
y_real, y_predicted_combined = Confusion_Matrix(y, y_pred_combined,
                                                pred_prob=True)

# Sanity check on a single synthetic sample (Yamnet=0, SVM=0.95).
X_new = pd.DataFrame({'Yamnet': [0], 'SVM': [0.95]})
clf.predict_proba(X_new)[:, 1]

# Import Joblib Module from Scikit Learn
import joblib
# Save RL_Model to file in the current working directory
def run(dataset, config):
    """Benchmark-framework entry point: fit a sklearn stacking ensemble
    (classifier or regressor depending on ``config.type``) on the encoded
    dataset and return a ``result`` record with predictions and timing.
    """
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    # Framework params without a leading underscore are forwarded verbatim
    # to the ensemble constructor; underscored ones configure this runner.
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config
    # Per-estimator overrides, e.g. `_rf_params`, `_final_params`.
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = StackingClassifier(
            estimators=[
                ('rf',
                 RandomForestClassifier(n_jobs=n_jobs,
                                        random_state=config.seed,
                                        **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingClassifier(random_state=config.seed,
                                            **estimators_params['gbm'])),
                ('linear',
                 SGDClassifier(n_jobs=n_jobs,
                               random_state=config.seed,
                               **estimators_params['linear'])),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf',
                 RandomForestRegressor(n_jobs=n_jobs,
                                       random_state=config.seed,
                                       **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingRegressor(random_state=config.seed,
                                           **estimators_params['gbm'])),
                ('linear',
                 SGDRegressor(random_state=config.seed,
                              **estimators_params['linear'])),
                ('svc',
                 LinearSVR(random_state=config.seed,
                           **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            # NOTE(review): LinearRegression takes no random_state in sklearn —
            # confirm this branch actually runs without a TypeError.
            final_estimator=LinearRegression(n_jobs=n_jobs,
                                             random_state=config.seed,
                                             **estimators_params['final']),
            n_jobs=n_jobs,
            **training_params)

    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    # models_count counts the fitted base estimators plus the final estimator.
    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
# Stratified train/test split (default 75/25).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# AdaBoost (default base estimator) and GradientBoosting seeded with a
# shallow decision tree as its initial estimator.
basetree = DecisionTreeClassifier(criterion="gini", min_samples_split=0.4)
clf1 = AdaBoostClassifier(n_estimators=50, learning_rate=0.5)  # 1
# basetree= DecisionTreeClassifier( criterion="entropy" , min_samples_split=0.4)
# AdaBoostClassifier(basetree,n_estimators=50 , learning_rate=0.5) # 0.86
clf2 = GradientBoostingClassifier(init=basetree, subsample=0.8,
                                  max_features=0.8)

# Baseline AUC for AdaBoost alone.
clf1.fit(X_train, y_train)
roc_auc_score(y_test, clf1.predict_proba(X_test)[:, 1])

# Stack both boosters (default LogisticRegression meta-learner) and compare.
from sklearn.ensemble import StackingClassifier
clf = StackingClassifier(n_jobs=-1, estimators=[('gbc', clf2), ('abc', clf1)])
clf.fit(X_train, y_train)
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
# Stack the (externally defined) base estimators under a KNN meta-learner.
reg = StackingClassifier(estimators=estimators,
                         final_estimator=KNeighborsClassifier(n_neighbors=11))
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
acc = accuracy_score(y_test, y_pred)
print("accuracy score %0.2f%%" % (acc * 100))

# ROC and AUC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Positive-class probabilities for the ROC computation.
clf_probs = reg.predict_proba(x_test)
clf_probs = clf_probs[:, 1]
print(clf_probs)
ras = roc_auc_score(y_test, clf_probs)
print("Logistic : ROC AUC = %.3f" % (ras))

# Binarize labels (classes 1/2) to a single 0/1 column for roc_curve.
from sklearn.preprocessing import label_binarize
y = label_binarize(y_test, classes=[1, 2])
n_classes = y.shape[1]
fpr, tpr, _ = roc_curve(y, clf_probs)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color="orange", lw=lw,
from sklearn.linear_model import SGDClassifier
# loss='log' gives logistic loss so predict_proba is available.
# NOTE(review): newer sklearn renames this to 'log_loss' — confirm version.
sgdClass = SGDClassifier(loss='log', random_state=2, verbose=2)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=11)

# Base learners for the stack; logreg and dtc are assumed to be defined
# earlier in this file.
models_considered = [('Logistic Regression', logreg), ('Dtree', dtc),
                     ('sgd', sgdClass), ('knn', knn)]

from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=2000, n_estimators=500, verbosity=2)

from sklearn.ensemble import StackingClassifier
# passthrough=True feeds the original features to the XGBoost meta-learner
# alongside the base learners' predicted probabilities.
stack = StackingClassifier(estimators=models_considered,
                           final_estimator=xgb,
                           stack_method="predict_proba",
                           passthrough=True)
stack.fit(X_train, y_train)
y_pred_prob = stack.predict_proba(X_test)[:, 1]  # positive-class probability

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred_prob)
# varimp.sort_values(by="values", ascending=True, inplace=True) # sns.scatterplot(x=varimp["values"],y=varimp["features"]) #_____________________________________________________________________________________________________________________________ #Stacking Classifier est=estimators = [('rf', RandomForestClassifier(random_state=4, max_features="auto", min_samples_leaf=5,min_samples_split=4, n_estimators=1500)), ('lr', LogisticRegression(penalty="l2", max_iter=500))] meta=GradientBoostingClassifier(learning_rate=0.01, n_estimators=500, max_features="sqrt", min_samples_leaf=5) ms=StackingClassifier(estimators=est, final_estimator=meta, stack_method="predict_proba", passthrough=True).fit(TrainX, TrainY) Validpred_prob=ms.predict_proba(ValidX)[:,1] Validpred=np.where(Validpred_prob>0.50,1,0) confusion_matrix(Validpred, ValidY) f1_score(Validpred, ValidY) RFS=sum(np.diagonal(confusion_matrix(Validpred, ValidY)))/ValidY.shape[0]*100 #_______________________________________________________________________________ #_____________________________________________________________________________________________________________________________ #finaloutput Test_pred_prob=ms.predict_proba(TestX)[:,1] Survived=np.where(Test_pred_prob>0.50,1,0) submission=pd.DataFrame({"PassengerId":id, "Survived":Survived}) submission.to_csv("titanic01_RFLRGB_GB.csv", index=False)
# Fit the stacking classifier and report accuracy on both splits.
sclf.fit(train_data, train_label)
tra_label = sclf.predict(train_data)  # predicted labels for the training set
tes_label = sclf.predict(test_data)   # predicted labels for the test set
print("训练集:", accuracy_score(train_label, tra_label))
print("测试集:", accuracy_score(test_label, tes_label))

# Sensitivity (sn) and specificity (sp) from the binary confusion matrix.
matrix = confusion_matrix(test_label, tes_label, labels=[0, 1])
TP = matrix[1, 1]
TN = matrix[0, 0]
FP = matrix[0, 1]
FN = matrix[1, 0]
sn = TP / (TP + FN)
sp = TN / (TN + FP)

# ROC from the positive-class probabilities.
decision_score = sclf.predict_proba(test_data)
fprs, tprs, thresholds = roc_curve(test_label, decision_score[:, 1])
# plt.plot(fprs, tprs)
# plt.show()
roc_auc = auc(fprs, tprs)

plt.figure()
lw = 2
plt.plot(fprs, tprs, color='darkorange', lw=lw,
         label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
from sklearn.utils.estimator_checks import parametrize_with_checks


# Run every sklearn API-compliance check against each listed estimator.
@parametrize_with_checks([LogisticRegression(), DecisionTreeRegressor()])
def test_sklearn_compatible_estimator(estimator, check):
    """Each (estimator, check) pair is generated by the decorator."""
    check(estimator)


# %%
# ROC AUC now supports multiclass classification
# ----------------------------------------------
# The :func:`roc_auc_score` function can also be used in multi-class
# classification. Two averaging strategies are currently supported: the
# one-vs-one algorithm computes the average of the pairwise ROC AUC scores, and
# the one-vs-rest algorithm computes the average of the ROC AUC scores for each
# class against all other classes. In both cases, the multiclass ROC AUC scores
# are computed from the probability estimates that a sample belongs to a
# particular class according to the model. The OvO and OvR algorithms support
# weighting uniformly (``average='macro'``) and weighting by the prevalence
# (``average='weighted'``).
#
# Read more in the :ref:`User Guide <roc_metrics>`.
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_classes=4, n_informative=16)
clf = SVC(decision_function_shape='ovo', probability=True).fit(X, y)
print(roc_auc_score(y, clf.predict_proba(X), multi_class='ovo'))
# NOTE(review): the `]` below closes an `estimators` list opened above this
# chunk (notebook cell boundaries preserved as In[...] markers).
]
#('rf', grid_search_rf.best_estimator_)]

# In[257]:

clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(
        random_state=20202020),  # logreg is better than gbm
    stack_method='predict_proba')
clf.fit(X_tr, y_tr)

# In[258]:

# Positive-class probabilities on the validation set, scored by AUC.
results = clf.predict_proba(X_val)[:, 1]
act = y_val.array
roc_auc_score(act, results)

# #### 71.246 best on validation
# with lgbm and gbm as base learners

# ## CatBoost (left out of Stack model - takes forever to train)

# In[111]:

from catboost import CatBoostClassifier

# In[116]:
allmodmeanaucs.append((RFmean_auc, 'RF'))
print("RFmodels done.")

# For every stored stacking model, compute its ROC on its own test split,
# interpolate the TPR onto a common FPR grid, and accumulate summary stats.
for k in Stackmodels.keys():
    modname = k
    modelresults = Stackmodels[k]
    Stack = modelresults['Stack']
    X_test = modelresults['X_test']
    Y_test = modelresults['Y_test']
    X_train = modelresults['X_train']
    Y_train = modelresults['Y_train']
    try:
        Y_pred = Stack.predict_proba(X_test)[:, 1]
    except Exception as e:
        # Fallback: rebuild an equivalent stack from the stored params and
        # refit, presumably to recover from an unfitted/stale model — TODO
        # confirm which failure mode this guards against.
        params = Stack.get_params()
        Stack = StackingClassifier(estimators=params['estimators'],
                                   final_estimator=params['final_estimator'],
                                   cv=params['cv'],
                                   stack_method=params['stack_method'],
                                   n_jobs=-1).fit(X_train, Y_train)
        Y_pred = Stack.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
    # Interpolate onto the shared FPR grid; pin the first point to 0.
    Stacktprs.append(np.interp(Stackmean_fpr, fpr, tpr))
    Stacktprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    Stackaucs.append(roc_auc)
    Stackrocs.append((fpr, tpr, roc_auc, modname))

# Aggregate: sort by AUC, mean curve, and a +/-1 std band clipped to [0, 1].
Stackrocs_sorted = sorted(Stackrocs, key=lambda x: x[2], reverse=True)
Stackmean_tpr = np.mean(Stacktprs, axis=0)
Stackmean_tpr[-1] = 1.0
Stackmean_auc = auc(Stackmean_fpr, Stackmean_tpr)
Stackstd_auc = np.std(Stackaucs, ddof=1)
Stackstd_tpr = np.std(Stacktprs, axis=0)
Stacktprs_upper = np.minimum(Stackmean_tpr + Stackstd_tpr, 1)
Stacktprs_lower = np.maximum(Stackmean_tpr - Stackstd_tpr, 0)
Stackrocresults = {'Stackmodels': Stackmodels,
                   'Stackrocs': Stackrocs_sorted,
                   'Stackmean_tpr': Stackmean_tpr,
                   'Stackmean_fpr': Stackmean_fpr,
                   'Stacktprs_upper': Stacktprs_upper,
                   'Stacktprs_lower': Stacktprs_lower}
train_scores2 = []
clfs = []

# K-fold loop: refit clf on each training fold, keep every fitted copy and
# its validation/train scores.
# NOTE(review): `clf = clf.fit(...)` reuses one estimator object, so all
# entries of `clfs` may alias the same (last-fitted) model — confirm intent.
for train_index, validate_index in kf.split(X):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = t[train_index], t[validate_index]
    clf = clf.fit(X_train, y_train)
    clfs.append(clf)
    score = clf.score(X_validate, y_validate)
    scores.append(score)
    print(score)
    score = clf.score(X_train, y_train)
    train_scores2.append(score)
    index.append([train_index, validate_index])
    print(score)

# Pick the fold with the best validation score and predict the test set.
max_index = scores.index(max(scores))
clf = clfs[max_index]
result = clf.predict_proba(values_test1)

# Prepend the listing_id column and write the submission.
p = features2.index('listing_id')
list_id = values2[:, p].reshape((values2.shape[0], 1))
result = np.append(list_id, result, axis=1)
print(result)
data = pd.DataFrame(result, columns=['listing_id', 'low', 'medium', 'high'])
# data = pd.DataFrame(result, columns=['listing_id', 'high', 'low', 'medium'])
# cols = list(data)
# cols.insert(2, cols.pop(cols.index('medium')))
# data = data.loc[:, cols]
data.to_csv('submission.csv', index=None)
# Fit the standalone random forest and get its multiclass ROC.
rf_only.fit(X_train, y_train)
pred_prob_rf = rf_only.predict_proba(X_validation)

# %%
plot, random_forest_roc = roc_multiclass(pred_prob_rf, y_validation, 0)

# %%
random_forest_roc

# %% [markdown]
# We evaluate a stacking-type model:

# %%
# The stacking fit is slow, so it is gated behind the `long_run` flag.
if long_run:
    stacking_model.fit(X_train, y_train)
    pred_prob_stack = stacking_model.predict_proba(X_validation)
    plot, stacking_roc = roc_multiclass(pred_prob_stack, y_validation, 0)
    print(stacking_roc)
    print(
        "Accurary \nKnn solo: {:.2f} \nRandom forest solo: {:.2f} \nStacking: {:.3f}"
        .format(knn_roc, random_forest_roc, stacking_roc))

# %%

# %%

# %%

# %%

# %% [markdown]
duration = 1000 # milliseconds freq = 440 # Hz winsound.Beep(freq, duration) #%% stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), passthrough=False, stack_method='auto', n_jobs=-1) X_train=features_train_scaled.iloc[:,feature_imp_union] X_test=features_test_scaled.iloc[:,feature_imp_union] y_train=response_train stacking_fit=stacking_clf.fit(X_train, y_train) stacking_proba=stacking_clf.predict_proba(X_test) pd.DataFrame(stacking_proba).to_csv('stacking_proba.csv') #%% import winsound duration = 1000 # milliseconds freq = 440 # Hz winsound.Beep(freq, duration) # =============================================================================
# NOTE(review): the arguments below close a model constructor call opened
# above this chunk.
    n_jobs = -1,
)

#--------------------
# Model training
#--------------------
model.fit(X_train, y_train)

#--------------------
# Model inference
#--------------------
# `fixed` output means hard labels; otherwise class probabilities.
if( args.output_type == "fixed" ):
    y_preds_train = model.predict(X_train)
    y_preds_test = model.predict(X_test)
else:
    y_preds_train = model.predict_proba(X_train)
    y_preds_test = model.predict_proba(X_test)

print( "y_preds_train.shape: ", y_preds_train.shape )
print( "y_preds_test.shape: ", y_preds_test.shape )

# Training accuracy (element-wise label match).
accuracy = (y_train == y_preds_train).sum()/len(y_preds_train)
print( "accuracy [k-fold CV train-valid] : {:0.5f}".format(accuracy) )

#================================
# Visualization
#================================
# Distribution plot of the classification target
fig = plt.figure()
axis = fig.add_subplot(111)
sns.distplot(df_train['Survived'], label='correct' )
# Base estimators XGBoost and LGBM Classifier, final estimator Logistic Regression
estimators = [('xgb', clf_xgb), ('lgb', clf_lgb)]
clf_stacking1 = StackingClassifier(estimators=estimators,
                                   final_estimator=LogisticRegression())

# Base estimators XGBoost, LGBM Classifier and CatBoost, final estimator Logistic Regression
estimators = [('xgb', clf_xgb), ('lgb', clf_lgb), ('catboost', clf_cat)]
clf_stacking2 = StackingClassifier(estimators=estimators,
                                   final_estimator=LogisticRegression())

# Base estimators Catboost and LGBM Classifier, final estimator Xgboost
estimators = [('catboost', clf_cat), ('lgb', clf_lgb)]
clf_stacking3 = StackingClassifier(estimators=estimators,
                                   final_estimator=clf_xgb)

# Train the first stacking model and evaluate on the validation split.
clf_stacking1.fit(xtrain, ytrain)
predictions_probas = clf_stacking1.predict_proba(xvalid)
# BUG FIX: the original passed an undefined `predictions` name to the
# confusion matrix; derive hard labels from the fitted model.
predictions = clf_stacking1.predict(xvalid)
score = gini_normalized(yvalid, predictions_probas)
print(score)
print('Confusion matrix\n', confusion_matrix(yvalid, predictions))

# After analysing gini score fit the model on complete train data.
# BUG FIX: the original referenced undefined `clf_stacking` and `result1`;
# use the trained clf_stacking1 and its `result` probabilities.
clf_stacking1.fit(X_train1, Y_train1)
result = clf_stacking1.predict_proba(X_out)
id = test1['id']
submit = pd.DataFrame({'id': id, 'target': result[:, 1]})
submit = submit[['id', 'target']]
# NOTE(review): these arguments close a constructor call (apparently for the
# `glm` final estimator) opened above this chunk.
    max_iter = 1500,
    random_state = 0
)

# Calibrate the SVM's outputs by stacking it under the GLM meta-learner.
clf = StackingClassifier(
    estimators = [('svm', svm)],
    final_estimator = glm,
    cv = 5,
    n_jobs = 5
)
clf.fit(X_train, y_train)
dump(clf, 'output/svm_model.joblib')  # persist the fitted model

## Test fitted model
logging.info("Predicting test set...")
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

# Report accuracy, F1 (micro/macro) and log-loss on the held-out set.
logging.info("Overall Accuracy: {:.2f}%".format(
    100 * metrics.accuracy_score(y_test, y_pred)
))
logging.info("Balanced Accuracy: {:.2f}%".format(
    100 * metrics.balanced_accuracy_score(y_test, y_pred)
))
logging.info("Micro F1-score: {:.2f}%".format(
    100 * metrics.f1_score(y_test, y_pred, average = "micro")
))
logging.info("Macro F1-score: {:.2f}%".format(
    100 * metrics.f1_score(y_test, y_pred, average = "macro")
))
logging.info("Log-loss: {:.5f}".format(
    metrics.log_loss(y_test, y_prob)
))
def find_steady_coalition():
    """Search party coalitions for the one whose predicted-supporter set best
    matches actual coalition voters (by V-measure), then report test-set
    performance of the underlying stacked classifier.

    Prints the winning coalition, its vote split, and accuracy/F1 scores.
    """
    x_train, y_train, x_val, y_val, x_test, y_test = load_data()
    # trying to implement LDA with Least Squares solver
    #clf = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto', store_covariance=True)
    #clf = RandomForestClassifier(criterion='gini', max_depth=50, min_samples_split=5, n_estimators=50)
    estimators = [
        ('Random Forest',
         RandomForestClassifier(criterion='gini', max_depth=50,
                                min_samples_split=5, n_estimators=50)),
        ('SVC', SVC(kernel='poly', degree=4, probability=True)),
        ('Percepton',
         MLPClassifier(activation="relu", alpha=0.1,
                       hidden_layer_sizes=(10, 10, 10),
                       learning_rate="constant", max_iter=2000)),
    ]
    clf = StackingClassifier(estimators)
    clf.fit(x_train, y_train)

    parties_list = np.unique(y_train.values)
    # Map each class label to its column in predict_proba's output.
    feature_to_index_map = {
        clf.classes_[i]: i
        for i in range(len(clf.classes_))
    }
    probabilities_per_voter = clf.predict_proba(x_val)

    best_coalition = []
    best_coalition_v_score = float(-np.inf)
    best_coalition_homo = float(-np.inf)
    best_voters_likely_to_vote = []
    for possible_coalition in get_possible_coalitions(parties_list):
        y_coalition = np.isin(y_val.values.ravel(), possible_coalition)
        # Total predicted probability mass a voter assigns to the coalition.
        probabilities_coalition = np.sum(
            probabilities_per_voter[:, [feature_to_index_map[feat]
                                        for feat in possible_coalition]],
            axis=1)
        coalition_score = np.mean(probabilities_coalition)
        # Skip coalitions without a (strict) majority of expected support.
        if (coalition_score < 0.51):
            continue
        voters_likely_to_vote = [
            voter > 0.5 for voter in probabilities_coalition
        ]
        v_score = v_measure_score(y_coalition, voters_likely_to_vote)
        homo_score = homogeneity_score(y_coalition, voters_likely_to_vote)
        if v_score > best_coalition_v_score:
            best_coalition = possible_coalition
            # BUG FIX: the original stored homo_score in the v-score tracker,
            # so subsequent comparisons used the wrong metric.
            best_coalition_v_score = v_score
            best_coalition_homo = homo_score
            best_voters_likely_to_vote = voters_likely_to_vote

    plot_coalition(x_train, y_train, best_coalition)
    print(best_coalition)
    print('Coalition: {}'.format(best_coalition))
    # BUG FIX: report the vote split of the *winning* coalition; the original
    # reused loop variables left over from the last iteration examined.
    print('{} coalition votes vs {} '.format(
        np.sum(best_voters_likely_to_vote),
        len(y_val) - np.sum(best_voters_likely_to_vote)))

    #lda_tst = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto', store_covariance=True)
    prediction = clf.predict(x_test)
    performance_data = [
        ('Random Forest + SVC + Neueron Network stacking method', prediction,
         y_test)
    ]
    print_accuracy_scores(performance_data)
    print_f1_score(performance_data)
# Third forest = RandomForestClassifier(n_estimators=13, criterion='entropy', min_samples_leaf=10, max_leaf_nodes=20, random_state=80) forest.fit(trainData, Y) # Logistic regression lr = LogisticRegression(solver='lbfgs', random_state=80) base_estimators = [('SVM', svm), ('Bagging DT', bagging), ('DecisionForest', forest)] sclf = StackingClassifier(estimators=base_estimators, final_estimator=lr, cv=2) sclf.fit(trainData, Y) score1 = sclf.score(trainData, Y) print('Logistic regression score:', score1) # Predictions names = ['dog.1049.jpg', 'dog.1028.jpg', 'dog.1011.jpg', 'cat.1016.jpg'] for name in names: singleImage = cv2.imread('data/test/' + name) histt = extract_histogram(singleImage) histt2 = histt.reshape(1, -1) prediction = sclf.predict(histt2) proba = sclf.predict_proba(histt2) print(f'Predictions for {name}:', proba)