def test_sparse_inputs_with_features_in_secondary():
    """Check training accuracy for dense and CSR inputs.

    The stack is built with ``use_features_in_secondary=True``. The
    expected rounded accuracy depends on the installed scikit-learn
    version: 1.0 before 0.21, 0.99 otherwise.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42,
                                 use_features_in_secondary=True)
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)

    if Version(sklearn_version) < Version("0.21"):
        expected_value = 1.0
    else:
        expected_value = 0.99

    def rounded_train_accuracy():
        # Scoring always uses the dense training matrix, whatever was fitted.
        return round(stclf.score(X_train, y_train), 2)

    # dense
    stclf.fit(X_train, y_train)
    dense_accuracy = rounded_train_accuracy()
    assert dense_accuracy == expected_value, dense_accuracy

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    sparse_accuracy = rounded_train_accuracy()
    assert sparse_accuracy == expected_value, sparse_accuracy
def test_train_meta_features_():
    """Stored training meta-features have one column per base classifier."""
    base_learners = [KNeighborsClassifier(), GaussianNB()]
    stclf = StackingCVClassifier(classifiers=base_learners,
                                 meta_classifier=LogisticRegression(),
                                 store_train_meta_features=True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3)
    stclf.fit(X_train, y_train)

    expected_shape = (X_train.shape[0], 2)
    assert stclf.train_meta_features_.shape == expected_shape
def test_no_weight_support_meta():
    """Fitting with ``sample_weight`` raises TypeError for a KNN meta-classifier."""
    weights = np.array([random.random() for _ in range(len(y_iris))])
    base_learners = [RandomForestClassifier(n_estimators=10), GaussianNB()]
    sclf = StackingCVClassifier(classifiers=base_learners,
                                meta_classifier=KNeighborsClassifier(),
                                shuffle=False)

    with pytest.raises(TypeError):
        sclf.fit(X_iris, y_iris, sample_weight=weights)
def test_verbose():
    """Smoke test: fitting with ``verbose=3`` runs to completion."""
    np.random.seed(123)
    stack_params = dict(
        classifiers=[RandomForestClassifier(n_estimators=10), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        shuffle=False,
        verbose=3,
    )
    StackingCVClassifier(**stack_params).fit(X_iris, y_iris)
def test_no_weight_support():
    """Fitting with ``sample_weight`` raises TypeError when KNN is a base learner."""
    weights = np.array([random.random() for _ in range(len(y_iris))])
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    base_learners = [
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    sclf = StackingCVClassifier(classifiers=base_learners,
                                meta_classifier=meta,
                                shuffle=False)

    with pytest.raises(TypeError):
        sclf.fit(X_iris, y_iris, sample_weight=weights)
def test_verbose():
    """Smoke test: fitting on the iris data with ``verbose=3`` runs to completion."""
    np.random.seed(123)
    stack_params = dict(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=3,
    )
    StackingCVClassifier(**stack_params).fit(iris.data, iris.target)
def test_train_meta_features_():
    """Stored training meta-features have one column per base classifier."""
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=meta,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(X_iris, y_iris, test_size=0.3)
    stclf.fit(X_train, y_train)

    n_rows = X_train.shape[0]
    assert stclf.train_meta_features_.shape == (n_rows, 2)
def _build_model(self, X_train, y_train):
    """Fit and return a stacked classifier on the given training data.

    Five base learners (KNN, random forest, logistic regression,
    polynomial SVC, XGBoost) feed a Gaussian naive Bayes meta-classifier.

    :param X_train: training features passed to ``fit``
    :param y_train: training labels passed to ``fit``
    :return: the fitted ``StackingCVClassifier``
    """
    base_learners = [
        KNeighborsClassifier(n_neighbors=1),
        RandomForestClassifier(max_depth=3, max_features=6,
                               n_estimators=50, random_state=0),
        LogisticRegression(C=10.0, dual=False, max_iter=100,
                           solver='lbfgs'),
        svm.SVC(C=1.0, kernel='poly', degree=5),
        XGBClassifier(alpha=15, colsample_bytree=0.1, learning_rate=1,
                      max_depth=5, reg_lambda=10.0),
    ]
    stacked = StackingCVClassifier(classifiers=base_learners,
                                   meta_classifier=GaussianNB(),
                                   random_state=42)
    stacked.fit(X_train, y_train)
    return stacked
def test_predict_meta_features():
    """``predict`` returns one class label per test row (default settings)."""
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    X_train, X_test, y_train, _ = train_test_split(X_iris, y_iris,
                                                   test_size=0.3)

    # test default (class labels)
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=meta,
        store_train_meta_features=True,
    )
    stclf.fit(X_train, y_train)

    predictions = stclf.predict(X_test)
    assert predictions.shape == (X_test.shape[0],)
def test_pandas():
    """Fit on a DataFrame; if a KeyError is raised it must carry the NumPy hint.

    The test also passes when ``fit`` succeeds without raising.
    """
    frame = pd.DataFrame(X)
    sclf = StackingCVClassifier(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=0,
    )
    try:
        sclf.fit(frame, iris.target)
    except KeyError as err:
        message = str(err)
        assert 'are NumPy arrays. If X and y are pandas DataFrames' in message
def test_meta_feat_reordering():
    """With ``shuffle=True``, stored meta-features still line up with ``y_train``.

    Alignment is checked through the ROC AUC of the second meta-feature
    column against the training labels.
    """
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=LogisticRegression(),
        shuffle=True,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)
    stclf.fit(X_train, y_train)

    second_column = stclf.train_meta_features_[:, 1]
    assert round(roc_auc_score(y_train, second_column), 2) == 0.88
def test_no_weight_support_with_no_weight():
    """Fitting without ``sample_weight`` works with KNN as meta or base learner."""
    logit = LogisticRegression()
    forest = RandomForestClassifier()
    bayes = GaussianNB()
    neighbors = KNeighborsClassifier()

    # (base learners, meta-classifier) pairs, fitted one after the other.
    configurations = (
        ([logit, forest, bayes], neighbors),
        ([logit, neighbors, bayes], forest),
    )
    for base_learners, meta in configurations:
        sclf = StackingCVClassifier(classifiers=base_learners,
                                    meta_classifier=meta,
                                    shuffle=False)
        sclf.fit(X_iris, y_iris)
def test_pandas():
    """Fit on a DataFrame; if a KeyError is raised it must carry the NumPy hint.

    The test also passes when ``fit`` succeeds without raising.
    """
    frame = pd.DataFrame(X_iris)
    sclf = StackingCVClassifier(
        classifiers=[RandomForestClassifier(n_estimators=10), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        shuffle=False,
        verbose=0,
    )
    try:
        sclf.fit(frame, y_iris)
    except KeyError as err:
        message = str(err)
        assert 'are NumPy arrays. If X and y are pandas DataFrames' in message
def test_no_weight_support_with_no_weight():
    """Fitting without ``sample_weight`` works with KNN as meta or base learner."""
    logit = LogisticRegression(multi_class='ovr', solver='liblinear')
    forest = RandomForestClassifier(n_estimators=10)
    bayes = GaussianNB()
    neighbors = KNeighborsClassifier()

    # (base learners, meta-classifier) pairs, fitted one after the other.
    configurations = (
        ([logit, forest, bayes], neighbors),
        ([logit, neighbors, bayes], forest),
    )
    for base_learners, meta in configurations:
        sclf = StackingCVClassifier(classifiers=base_learners,
                                    meta_classifier=meta,
                                    shuffle=False)
        sclf.fit(X_iris, y_iris)
def test_sparse_inputs():
    """Dense and CSR training inputs both reach 0.99 rounded training accuracy."""
    forest = RandomForestClassifier(random_state=1)
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=LogisticRegression())
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)

    def rounded_train_accuracy():
        # Scoring always uses the dense training matrix.
        return round(stclf.score(X_train, y_train), 2)

    # dense
    stclf.fit(X_train, y_train)
    assert rounded_train_accuracy() == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    assert rounded_train_accuracy() == 0.99
def test_list_of_lists():
    """Fit on a list of rows; if a TypeError is raised it must carry the NumPy hint.

    The test also passes when ``fit`` succeeds without raising.
    """
    rows = list(X)
    sclf = StackingCVClassifier(
        classifiers=[RandomForestClassifier(), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(),
        shuffle=False,
        verbose=0,
    )
    try:
        sclf.fit(rows, iris.target)
    except TypeError as err:
        message = str(err)
        assert 'are NumPy arrays. If X and y are lists' in message
def test_list_of_lists():
    """Fit on a list of rows; if a TypeError is raised it must carry the NumPy hint.

    The test also passes when ``fit`` succeeds without raising.
    """
    rows = list(X_iris)
    sclf = StackingCVClassifier(
        classifiers=[RandomForestClassifier(n_estimators=10), GaussianNB()],
        use_probas=True,
        meta_classifier=LogisticRegression(multi_class='ovr',
                                           solver='liblinear'),
        shuffle=False,
        verbose=0,
    )
    try:
        sclf.fit(rows, y_iris)
    except TypeError as err:
        message = str(err)
        assert 'are NumPy arrays. If X and y are lists' in message
def stacking(X_train_log, X_test_log, y_train, y_test):
    """Fit a stacked classifier and report its scores in the ``text`` widget.

    Base learners are an RBF ``SVC`` (with probabilities), an ``MLPClassifier``
    and ``GaussianNB``; their class probabilities feed a ``LogisticRegression``
    meta-classifier through 5-fold stacking. Training accuracy, training AUC
    and test accuracy are appended to the module-level ``text`` object.

    :param X_train_log: training features
    :param X_test_log: test features
    :param y_train: training labels; label ``1`` is the positive class for the AUC
    :param y_test: test labels
    :return: None
    """
    svc = SVC(probability=True, C=100, gamma=0.001)
    mlp = MLPClassifier(activation='tanh', learning_rate='adaptive',
                        solver='sgd')
    gnb = GaussianNB()
    meta = LogisticRegression()

    sclf = StackingCVClassifier(classifiers=[svc, mlp, gnb],
                                shuffle=False,
                                use_probas=True,
                                cv=5,
                                n_jobs=-1,
                                meta_classifier=meta)
    model_sclf_pred = sclf.fit(X_train_log, y_train)

    train_accuracy = model_sclf_pred.score(X_train_log, y_train)
    text.insert(END, 'Stacking Accuracy on whole training data: '
                + str(train_accuracy) + "\n")

    # Column 1 holds the positive-class probability; roc_curve takes a
    # 1-D score vector.
    train_proba = model_sclf_pred.predict_proba(X_train_log)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_train, train_proba, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    text.insert(END, 'Stacking AUC: ' + str(auc) + "\n")

    test_accuracy = model_sclf_pred.score(X_test_log, y_test)
    text.insert(END, 'stacking Accuracy on whole testing data: '
                + str(test_accuracy) + "\n")
def test_sparse_inputs():
    """Dense and CSR training inputs both reach 0.99 rounded training accuracy."""
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta)
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)

    def rounded_train_accuracy():
        # Scoring always uses the dense training matrix.
        return round(stclf.score(X_train, y_train), 2)

    # dense
    stclf.fit(X_train, y_train)
    assert rounded_train_accuracy() == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    assert rounded_train_accuracy() == 0.99
def test_sparse_inputs():
    """Dense and CSR training inputs both reach 0.99 rounded training accuracy.

    The stacking classifier is given a fixed ``random_state``.
    """
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42)
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)

    def rounded_train_accuracy():
        # Scoring always uses the dense training matrix.
        return round(stclf.score(X_train, y_train), 2)

    # dense
    stclf.fit(X_train, y_train)
    assert rounded_train_accuracy() == 0.99

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    assert rounded_train_accuracy() == 0.99
def test_meta_feat_reordering():
    """With ``shuffle=True``, stored meta-features still line up with ``y_train``.

    Alignment is checked through the ROC AUC of the second meta-feature
    column against the training labels.
    """
    np.random.seed(123)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=meta,
        shuffle=True,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)
    stclf.fit(X_train, y_train)

    rounded_auc = round(
        roc_auc_score(y_train, stclf.train_meta_features_[:, 1]), 2)
    assert rounded_auc == 0.87, rounded_auc
def test_sparse_inputs_with_features_in_secondary():
    """Dense and CSR inputs both give 1.0 rounded training accuracy.

    The stack is built with ``use_features_in_secondary=True``.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42,
                                 use_features_in_secondary=True)
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              test_size=0.3)

    def rounded_train_accuracy():
        # Scoring always uses the dense training matrix.
        return round(stclf.score(X_train, y_train), 2)

    # dense
    stclf.fit(X_train, y_train)
    dense_accuracy = rounded_train_accuracy()
    assert dense_accuracy == 1.0, dense_accuracy

    # sparse
    stclf.fit(sparse.csr_matrix(X_train), y_train)
    sparse_accuracy = rounded_train_accuracy()
    assert sparse_accuracy == 1.0, sparse_accuracy
def stack(self, X, y, test_X):
    """Fit a stacked ensemble and predict labels for the test data.

    The stack uses ``self.clfArr`` as base learners and ``self.lr`` as the
    meta-classifier. After predicting, a 5-fold cross-validated accuracy of
    the stack on the training data is printed.

    :param X: training samples, array or list
    :param y: ground-truth labels, array or list (flattened to 1-D)
    :param test_X: test samples, array or list
    :return: result_Y: labels predicted for the test samples
    """
    logging.info('------Stacking之后的模型效果')
    sclf = StackingCVClassifier(classifiers=self.clfArr,
                                meta_classifier=self.lr,
                                cv=4)
    features = np.array(X)
    labels = np.array(y).flatten()

    sclf.fit(features, labels)
    result_Y = sclf.predict(test_X)

    scores = model_selection.cross_val_score(sclf, features, labels,
                                             cv=5, scoring='accuracy')
    report = 'The Accuracy , mean: {:.5f} , std:+/- {:.5f}'.format(
        scores.mean(), scores.std())
    print(report)
    return result_Y
def test_works_with_df_if_fold_indexes_missing():
    """Regression test: fitting works when row labels do not match CV positions.

    Training data may carry ids that cannot be looked up with the positional
    indexes produced by the CV splitter (e.g. skf), for example:

    + fold output that is not neatly consecutive
      (i.e. [341, 345, 543, ...] instead of [0, 1, ... n])
    + labels that start at a number greater than the size of the input
      (the case exercised here)

    Row selection based on the CV folds must not break in those cases.
    This is fixed in the code using `safe_indexing`.
    """
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42,
                                 use_features_in_secondary=True)

    # Shift both indexes by 1000 so no label is a valid row position.
    X_modded = pd.DataFrame(X_breast,
                            index=np.arange(X_breast.shape[0]) + 1000)
    y_modded = pd.Series(y_breast,
                         index=np.arange(y_breast.shape[0]) + 1000)
    X_train, _, y_train, _ = train_test_split(X_modded, y_modded,
                                              test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)

    if Version(sklearn_version) < Version("0.22"):
        expected_value = 0.99
    else:
        expected_value = 0.98
    rounded_accuracy = round(stclf.score(X_train, y_train), 2)
    assert rounded_accuracy == expected_value, rounded_accuracy
def test_StackingClassifier_drop_last_proba():
    """``drop_last_proba`` removes one probability column per base classifier."""
    np.random.seed(123)
    lr1 = LogisticRegression(solver='liblinear', multi_class='ovr')

    def meta_feature_shape(drop_last, X_fit, y_fit):
        # Fit a two-learner stack and return the meta-feature shape
        # produced for the first two iris rows.
        sclf = StackingCVClassifier(classifiers=[lr1, lr1],
                                    use_probas=True,
                                    drop_last_proba=drop_last,
                                    meta_classifier=lr1)
        sclf.fit(X_fit, y_fit)
        return sclf.predict_meta_features(X_iris[:2]).shape

    shape_full = meta_feature_shape(False, X_iris, y_iris)
    assert shape_full == (2, 6)

    shape_dropped = meta_feature_shape(True, X_iris, y_iris)
    assert shape_dropped == (2, 4), shape_dropped

    # only 2 classes
    shape_binary = meta_feature_shape(True, X_iris[0:100], y_iris[0:100])
    assert shape_binary == (2, 2), shape_binary
def test_sample_weight():
    """Unit weights match the unweighted fit; random weights change the output."""

    def fitted_proba(**fit_kwargs):
        # Reseed and rebuild every estimator so the runs differ only in
        # the keyword arguments handed to fit().
        np.random.seed(123)
        meta = LogisticRegression(multi_class='ovr', solver='liblinear')
        forest = RandomForestClassifier(n_estimators=10)
        bayes = GaussianNB()
        sclf = StackingCVClassifier(classifiers=[forest, bayes],
                                    meta_classifier=meta,
                                    shuffle=False)
        return sclf.fit(X_iris, y_iris, **fit_kwargs).predict_proba(X_iris)

    # with no weight given
    prob1 = fitted_proba()

    # with weight = 1
    prob2 = fitted_proba(sample_weight=np.ones(len(y_iris)))

    # with random weight
    random.seed(87)
    random_weights = np.array([random.random() for _ in range(len(y_iris))])
    prob3 = fitted_proba(sample_weight=random_weights)

    diff12 = np.max(np.abs(prob1 - prob2))
    diff23 = np.max(np.abs(prob2 - prob3))
    assert diff12 < 1e-3, "max diff is %.4f" % diff12
    assert diff23 > 1e-3, "max diff is %.4f" % diff23
def test_works_with_df_if_fold_indexes_missing():
    """Regression test: fitting works when row labels do not match CV positions.

    Training data may carry ids that cannot be looked up with the positional
    indexes produced by the CV splitter (e.g. skf), for example:

    + fold output that is not neatly consecutive
      (i.e. [341, 345, 543, ...] instead of [0, 1, ... n])
    + labels that start at a number greater than the size of the input
      (the case exercised here)

    Row selection based on the CV folds must not break in those cases.
    This is fixed in the code using `safe_indexing`.
    """
    np.random.seed(123)
    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(classifiers=[forest, forest],
                                 meta_classifier=meta,
                                 random_state=42,
                                 use_features_in_secondary=True)

    # Shift both indexes by 1000 so no label is a valid row position.
    X_modded = pd.DataFrame(X_breast,
                            index=np.arange(X_breast.shape[0]) + 1000)
    y_modded = pd.Series(y_breast,
                         index=np.arange(y_breast.shape[0]) + 1000)
    X_train, _, y_train, _ = train_test_split(X_modded, y_modded,
                                              test_size=0.3)

    # dense
    stclf.fit(X_train, y_train)
    rounded_accuracy = round(stclf.score(X_train, y_train), 2)
    assert rounded_accuracy == 0.99, rounded_accuracy
def test_meta_feat_reordering():
    """With ``shuffle=True``, stored meta-features still line up with ``y_train``.

    Alignment is checked through the ROC AUC of the second meta-feature
    column; the expected value depends on the scikit-learn version
    (0.86 before 0.21, 0.87 otherwise).
    """
    meta = LogisticRegression(multi_class='ovr', solver='liblinear')
    stclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=meta,
        shuffle=True,
        random_state=42,
        store_train_meta_features=True,
    )
    X_train, _, y_train, _ = train_test_split(X_breast, y_breast,
                                              random_state=0,
                                              test_size=0.3)
    stclf.fit(X_train, y_train)

    if Version(sklearn_version) < Version("0.21"):
        expected_value = 0.86
    else:
        expected_value = 0.87

    rounded_auc = round(
        roc_auc_score(y_train, stclf.train_meta_features_[:, 1]), 2)
    assert rounded_auc == expected_value, rounded_auc
# Weighted soft-voting ensemble; used below as the stacking meta-classifier.
eclf = VotingClassifier(
    estimators=[('rf', rf), ('lr', lr), ('gb', gb)],
    voting='soft',
    weights=[3, 2, 3],
)

# Build the stacked model, cross-validate it, then fit and predict the test data.
from mlxtend.classifier import StackingCVClassifier

sclf = StackingCVClassifier(
    classifiers=[rf, lr, gb, et, gnb, svc, knn, xgb, ada, mlp, lda, qda],
    use_features_in_secondary=True,
    use_probas=True,
    meta_classifier=eclf,
)

# Mean 5-fold score per metric, in this order: accuracy, precision,
# recall, ROC AUC.
cmetrics = [
    cross_val_score(sclf, X.values, y.values, cv=5,
                    scoring=metric_name).mean()
    for metric_name in ('accuracy', 'precision', 'recall', 'roc_auc')
]

sclf.fit(X.values, y.values)
pred = sclf.predict(Xt.values)

# ROC curve on the test data, from the second predict_proba column.
pred_proba = sclf.predict_proba(Xt.values)[:, 1]
fpr, tpr, threshold = roc_curve(yt, pred_proba)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig('ROC_curve_test.png', bbox_inches='tight')
setClass=clas, show=False) elif (META == False): HeldOutDataPredictions = pf.Classification_Model( data_training=vec_training, target_training=out_train, data_testing=vec_testing, Classifier=EnsembleCustom[0][1], target_testing=None, ModelName=EnsembleCustom[0][0], accur=False, grph=False, setClass=clas, show=False) else: MetaClass.fit(vec_training, out_train) HeldOutDataPredictions = MetaClass.predict(vec_testing) runingTime = timeit.default_timer( ) - tStart #Stopping clock and getting time spent print("Fitting and predictions done in %0.4fs." % runingTime) print("=" * 100) """ PRINTING THE PREDICTIONS MADE AND SAVING CSV FILE """ Preds = pd.DataFrame({"Category": HeldOutDataPredictions}) Results = pd.concat([dataTest["id"], Preds], axis=1, sort=False) print(Results) pf.Write_File_DF(Data_Set=Results, File_Name="Predictions_Group_4", separation=",", head=True, ind=False)
clf3 = RFC()
meta_clf = RC()


# In[ ]:

# Stack the base learners (clf1 is listed twice) on top of meta_clf.
stacker = SCVC(
    classifiers=[clf1, clf2, clf3, clf1],
    meta_classifier=meta_clf,
    use_probas=True,
    use_features_in_secondary=True,
)


# In[ ]:

# Fill missing values in train and test with the training-column median.
for c in train.columns:
    train_median = train[c].median()
    train[c] = train[c].fillna(train_median)
    test[c] = test[c].fillna(train_median)
stacker.fit(train.values, np.array(Y))


# In[ ]:

my_prediction = stacker.predict(test.values)


# In[ ]:

# PassengerId,Survived
submission = pd.DataFrame({
    'PassengerId': test.index.tolist(),
    'Survived': my_prediction,
})


# In[ ]:

submission.to_csv("submission.csv", index=False)
# Voting ensemble: equally weighted soft voting over `estimators`.
ensemble = VotingClassifier(estimators, voting='soft', weights=[1, 1, 1])
ensemble.fit(X_train, y_train)
pred = ensemble.predict(X_test)
print("predicted values----------:", pred)
# Use a context manager so the model file is flushed and closed
# deterministically instead of leaking the handle.
with open('ensemble-clf.sav', 'wb') as model_file:
    pickle.dump(ensemble, model_file)
# pred_op = ensemble.predict(otpt)
# print("Predicted values:" ,pred_op)
print('fscore:{0:.3f}'.format(f1_score(y_test, pred, average='micro')))

# Meta-classifier ensemble: class probabilities of mlp/xgb/rf feed `lr`.
stack = StackingCVClassifier(classifiers=[mlp, xgb, rf],
                             meta_classifier=lr,
                             use_probas=True)
stack.fit(X_train.values, y_train.values)
pred2 = stack.predict(X_test.values)
print("predicted values: ", pred2)
print('fscore:{0:.3f}'.format(f1_score(y_test, pred2, average='micro')))

from sklearn.metrics import confusion_matrix

# NOTE(review): this confusion matrix is computed from `pred` (the voting
# ensemble), not from the stacked model's `pred2` -- confirm that is intended.
confusion_lr = confusion_matrix(y_test, pred)
with open('stack-clf.sav', 'wb') as model_file:
    pickle.dump(stack, model_file)
print(confusion_lr)

####################################################################################################################
# #REPORT AND PLOT MICRO-AVERAGE ROC AUC FOR EACH MODEL
# from sklearn.preprocessing import label_binarize
# import matplotlib.pyplot as plt
# from itertools import cycle
# from sklearn.multiclass import OneVsRestClassifier
# from scipy import interp
def main_leave_one_week(offline, mall_ids=-1, save_offline_predict=False):
    """Train and evaluate one stacking classifier per mall (Python 2 code).

    For every mall id a feature matrix is built from the selected wifi
    columns, longitude/latitude and weekday/hour. A StackingCVClassifier is
    fitted on the training split returned by ``get_last_one_week_index`` and
    its predictions on the validation split are recorded. When ``offline``
    is falsy, a second stack is fitted on all training rows of the mall and
    used to predict the mall's test rows.

    After the loop, per-mall and overall accuracies are printed and saved
    with ``save_acc``. If fewer than 50 malls were processed the function
    calls ``exit(1)`` right after printing the overall accuracy, so nothing
    is saved in that case.

    :param offline: when falsy, also produce and save test-set predictions
    :param mall_ids: mall ids to process; ``-1`` (default) means every
        ``mall_id`` found in the shop info table
    :param save_offline_predict: when truthy, write one CSV of predicted
        vs. real shop ids per mall under ``../result/offline_predict/``
    """
    model_name = "stack_balance_strong_matrix_lonlat_wh"
    train_all = load_train()
    test_all = load_testA()
    shop_info = load_shop_info()
    # NOTE(review): if a NumPy array is passed, this comparison is
    # elementwise -- confirm callers pass a list or leave the default.
    if mall_ids == -1:
        mall_ids = shop_info.mall_id.unique()
    # Per-mall results, keyed by mall id.
    offline_predicts = {}
    all_rowid = {}
    offline_reals = {}
    all_predicts = {}
    for _index, mall_id in enumerate(mall_ids):
        print "train: ", mall_id, " {}/{}".format(_index + 1, len(mall_ids))
        shops = shop_info[shop_info.mall_id == mall_id].shop_id.unique()
        train = train_all[train_all.mall_id == mall_id]
        test = test_all[test_all.mall_id == mall_id]
        # y label encoder
        y = train.shop_id.values
        label_encoder = LabelEncoder().fit(y)
        y = label_encoder.transform(y)
        num_class = len(shops)
        print "num_class", num_class
        # all wifi matrix
        # NOTE(review): the cache layout (index 0 = row index, index 2 =
        # matrix) is inferred from how it is used below -- confirm against
        # get_wifi_cache2.
        df, train_cache, test_cache = get_wifi_cache2(mall_id)
        train_matrix_origin_all = train_cache[2]
        test_matrix_origin_all = test_cache[2]
        test_index = test_cache[0]
        # choose_strong_wifi_index
        strong_wifi_index = choose_strong_wifi_index(-90, 6, train_matrix_origin_all)
        train_strong_matrix = train_matrix_origin_all[:, strong_wifi_index]
        test_strong_matrix = test_matrix_origin_all[:, strong_wifi_index]
        # train valid split and get index
        _train_index, _valid_index = get_last_one_week_index(train)
        # weekday and hour
        # presumably these helpers add the weekday/hour/is_weekend and
        # basic_wifi_info columns read below -- verify in their definitions
        preprocess_basic_time(train)
        preprocess_basic_time(test)
        preprocess_basic_wifi(train)
        preprocess_basic_wifi(test)
        # The *_time_features arrays are not used in the matrices below.
        train_time_features = train[["weekday", "hour", "is_weekend"]].values
        test_time_features = test[["weekday", "hour", "is_weekend"]].values
        train_wh_features = train[["weekday", "hour"]].values
        test_wh_features = test[["weekday", "hour"]].values
        # whether the record is connected to a wifi (currently commented
        # out of the feature matrices below)
        train_connect_wifi = (
            train.basic_wifi_info.map(lambda x: len(x[1])).values > 0).astype(int).reshape(-1, 1)
        test_connect_wifi = (
            test.basic_wifi_info.map(lambda x: len(x[1])).values > 0).astype(int).reshape(-1, 1)
        # number of wifi found in the scan (currently commented out of the
        # feature matrices below)
        train_search_wifi_size = train.basic_wifi_info.map(
            lambda x: x[0]).values.reshape(-1, 1)
        test_search_wifi_size = test.basic_wifi_info.map(
            lambda x: x[0]).values.reshape(-1, 1)
        # lon lat
        train_lonlats = train[["longitude", "latitude"]].values
        test_lonlats = test[["longitude", "latitude"]].values
        # concatenate train/test features
        train_matrix = np.concatenate(
            [
                train_strong_matrix,
                train_lonlats,
                train_wh_features,
                # train_connect_wifi,
                # train_search_wifi_size
            ],
            axis=1)
        test_matrix = np.concatenate(
            [
                test_strong_matrix,
                test_lonlats,
                test_wh_features,
                # test_connect_wifi,
                # test_search_wifi_size
            ],
            axis=1)
        # train valid get
        _train_x = train_matrix[_train_index]
        _train_y = y[_train_index]
        _valid_x = train_matrix[_valid_index]
        _valid_y = y[_valid_index]

        # stack base model
        # Factories, so the offline and online stacks each get fresh,
        # unfitted estimators.
        def get_model1():
            model1 = RandomForestClassifier(n_estimators=500, n_jobs=-1, class_weight="balanced")
            return model1

        def get_model2():
            model2 = OneVsRestClassifier(estimator=RandomForestClassifier(
                n_estimators=188, n_jobs=-1, class_weight="balanced"))
            return model2

        # stack meta model
        def get_meta_model():
            meta_model = RandomForestClassifier(n_estimators=777, n_jobs=-1, class_weight="balanced")
            return meta_model

        # stack cv
        cv = 3
        # offline
        # expansion train
        # NOTE(review): expansion() is defined elsewhere; presumably it
        # adjusts the data so every class can be split into `cv` folds --
        # confirm.
        _x, _y = expansion(_train_x, _train_y, cv)
        stack = StackingCVClassifier([get_model1(), get_model2()], get_meta_model(), use_probas=True, use_features_in_secondary=True, cv=cv)
        stack.fit(_x, _y)
        best_predict = stack.predict(_valid_x)
        # Map encoded labels back to shop ids before storing/comparing.
        predict = label_encoder.inverse_transform(best_predict)
        offline_predicts[mall_id] = predict
        _real_y = label_encoder.inverse_transform(_valid_y)
        offline_reals[mall_id] = _real_y
        print mall_id + "'s acc is", acc(predict, _real_y)
        # online
        if not offline:
            # expansion train
            _x, _y = expansion(train_matrix, y, cv)
            stack = StackingCVClassifier(
                [get_model1(), get_model2()],
                get_meta_model(),
                use_probas=True,
                use_features_in_secondary=True,
                cv=cv)
            stack.fit(_x, _y)
            predict = stack.predict(test_matrix)
            predict = label_encoder.inverse_transform(predict)
            all_predicts[mall_id] = predict
            # row ids of the test rows whose index appears in test_index
            all_rowid[mall_id] = test_all[np.in1d(test_all.index, test_index)].row_id.values
    # offline acc result
    result = {}
    for _mall_id in mall_ids:
        _acc = acc(offline_predicts[_mall_id], offline_reals[_mall_id])
        print _mall_id + "'s acc is", _acc
        result[_mall_id] = _acc
        if save_offline_predict:
            pd.DataFrame({
                "predict": offline_predicts[_mall_id],
                "real": offline_reals[_mall_id]
            }).to_csv("../result/offline_predict/{}.csv".format(_mall_id), index=None)
    # NOTE(review): the two names look swapped -- all_predict holds the real
    # labels and all_true the predictions. Harmless only if acc() is
    # symmetric in its arguments; confirm.
    all_predict = np.concatenate(offline_reals.values())
    all_true = np.concatenate(offline_predicts.values())
    _acc = acc(all_predict, all_true)
    print "all acc is", _acc
    # Runs on fewer than 50 malls stop here with exit status 1: nothing
    # below (accuracy file, online result) is written.
    if len(mall_ids) < 50:
        exit(1)
    result["all_acc"] = _acc
    path = "../result/offline/{}".format(model_name)
    save_acc(result, path, None)
    # online save result
    if not offline:
        all_rowid = np.concatenate(all_rowid.values())
        all_predict = np.concatenate(all_predicts.values())
        result = pd.DataFrame(data={
            "row_id": all_rowid,
            "shop_id": all_predict
        })
        result.sort_values(by="row_id", inplace=True)
        path = "../result/online/{}".format(model_name)
        save_result(result, path, None)