def kfold_cv(X_train, y_train,idx,k):
    """Evaluate an XGBClassifier on the FIRST stratified fold only.

    Returns (ypred, yreal, idx): predictions, true labels and row ids of the
    first fold's validation split.  NOTE: the `break` below stops after one
    fold, so the printed mean/std cover a single score.
    """
    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]  # per-fold log-loss scores
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()  # free memory between folds; the matrices are large
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0  # xgb bagging disabled: range(0) means the loop body never runs
        for j in range(m):
            clf=xgb_classifier(eta=0.05,min_child_weight=20,col=0.5,subsample=0.7,depth=7,num_round=400,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred*(1.0/(j+1))  # running average over the j+1 seeds so far
            print j,llfun(y_test_cv,yqq)
        #y_pred/=m;
        clf=XGBClassifier(max_depth=10,colsample_bytree=0.8,learning_rate=0.02,n_estimators=500,nthread=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        # eval_set lets xgboost report validation logloss while training
        clf.fit(X_train_cv,(y_train_cv),eval_metric="logloss",eval_set=[(X_test_cv, y_test_cv)])
        y_pred=clf.predict_proba(X_test_cv).T[1]  # probability of the positive class
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]  # shrink ids to this fold's validation rows
        print xx[-1]#,y_pred.shape
        break  # deliberate: evaluate only the first fold
    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
def xgb_meta_predict(data_base_dir,data_meta_random_dir,submission_dir):
    """Train three xgb stackers on random-split meta features and write submissions.

    Loads pickled meta features (RF probabilities, SVC decision values,
    numerical features), trains one xgb_classifier per hyper-parameter set,
    and saves one gzipped submission per model.  The "private LB" comments
    record leaderboard scores observed for each configuration.
    """
    test_id=pickle.load(open(data_base_dir+"test_id.p","rb"))
    y_meta=pickle.load(open(data_meta_random_dir+"y_meta.p","rb"))
    X_numerical_random=pickle.load(open(data_meta_random_dir+"X_numerical_meta.p","rb"))
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    X_random_rf=pickle.load(open(data_meta_random_dir+ "X_meta_random_rf.p", "rb" ) )
    X_test_rf=pickle.load(open(data_meta_random_dir+ "X_test_meta_rf.p", "rb" ) )
    X_random_svc=pickle.load(open(data_meta_random_dir+ "X_meta_random_svc.p", "rb" ) )
    X_test_svc=pickle.load(open(data_meta_random_dir+ "X_test_meta_svc.p", "rb" ) )
    # private LB 0.0054101
    xgb_clf=xgb_classifier(eta=0.2,min_child_weight=1,depth=10,num_round=70,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_random_rf,X_random_svc,X_numerical_random]), y_meta,np.hstack([ X_test_rf,X_test_svc,X_test_numerical]),predict_y14=True)
    save_predictions(submission_dir+'xgb-random-d10-e0.2-min1-tree70.csv.gz', test_id , X_xgb_predict)
    # private LB 0.0053053
    xgb_clf=xgb_classifier(eta=0.2,min_child_weight=6,depth=12,num_round=80,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_random_rf,X_random_svc,X_numerical_random]), y_meta,np.hstack([X_test_rf,X_test_svc,X_test_numerical]),predict_y14=True)
    save_predictions(submission_dir+'xgb-random-d12-e0.2-min6-tree80.csv.gz', test_id , X_xgb_predict)
    # private LB 0.0052910
    xgb_clf=xgb_classifier(eta=0.09,min_child_weight=6,depth=25,num_round=100,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_random_rf,X_random_svc,X_numerical_random]), y_meta,np.hstack([X_test_rf,X_test_svc,X_test_numerical]),predict_y14=True)
    save_predictions(submission_dir+'xgb-random-d25-svc-e0.09-min6-tree100.csv.gz', test_id , X_xgb_predict)
def kfold_cv(X_train, y_train,idx,k):
    """Evaluate an ExtraTreesClassifier on the FIRST stratified fold only.

    Returns (ypred, yreal, idx) for that fold's validation split; the
    `break` below makes this a single-fold experiment despite the k-fold
    iterator.
    """
    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]  # per-fold log-loss scores
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()  # keep peak memory down between folds
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0  # xgb bagging disabled: the loop below never runs
        for j in range(m):
            clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=200,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
        #y_pred/=m;
        clf=ExtraTreesClassifier(n_estimators=700,max_features= 50,criterion= 'entropy',min_samples_split= 3, max_depth= 60, min_samples_leaf= 4,verbose=1,n_jobs=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv))
        y_pred=clf.predict_proba(X_test_cv).T[1]  # positive-class probability
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]  # ids of this fold's validation rows
        print xx[-1]#,y_pred.shape
        break  # deliberate: first fold only
    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
def kfold_cv(X_train, y_train,idx,k):
    """Run full stratified k-fold CV of a single xgb model and print scores.

    Unlike the sibling variants, this one iterates ALL folds (the `break`
    is commented out) and returns nothing — it is used for its printed
    per-fold and aggregate log-loss only.
    """
    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]  # per-fold log-loss scores
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=1  # number of bagged xgb runs (single run here)
        for j in range(m):
            clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.7,subsample=1,depth=10,num_round=50,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
        y_pred/=m;  # average the bagged predictions
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        #ypred=y_pred
        #yreal=y_test_cv
        #idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        #break
    print xx,'average:',np.mean(xx),'std',np.std(xx)
def kfold_cv(X_train, y_train, k): kf = StratifiedKFold(y_train, n_folds=k) xx = [] zz = [] ypred = np.zeros((y_train.shape[0], 3)) for train_index, test_index in kf: X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :] y_train_cv, y_test_cv = y_train[train_index], y_train[test_index] #clf=RandomForestClassifier(n_jobs=-1,max_depth=21,max_features=30,n_estimators=100) #clf.fit(X_train_cv,y_train_cv) #y_pred=clf.predict_proba(X_test_cv) clf = xgb_classifier(eta=0.25, col=0.1, min_child_weight=1, depth=6, num_round=70) y_pred = clf.multi(X_train_cv, y_train_cv, X_test_cv, 3, y_test=y_test_cv) xx.append(multiclass_log_loss(y_test_cv, y_pred)) print xx[-1] #,y_pred.shape,zz[-1] ypred[test_index] = y_pred print xx print 'average:', np.mean(xx), 'std', np.std(xx) return ypred, np.mean(xx)
def kfold_cv(X_train, y_train, k): kf = StratifiedKFold(y_train, n_folds=k) xx = [] zz = [] ypred = np.zeros((y_train.shape[0], 3)) for train_index, test_index in kf: X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :] y_train_cv, y_test_cv = y_train[train_index], y_train[test_index] clf = xgb_classifier(eta=0.1, gamma=1e-3, col=0.3, min_child_weight=0.5, depth=7, num_round=160) y_pred = clf.multi(X_train_cv, y_train_cv, X_test_cv, 3, y_test=y_test_cv) xx.append(multiclass_log_loss(y_test_cv, y_pred)) print xx[-1] #,y_pred.shape,zz[-1] ypred[test_index] = y_pred print xx print 'average:', np.mean(xx), 'std', np.std(xx) return ypred, np.mean(xx)
def kfold_cv(X_train, y_train,idx,k):
    """Full stratified k-fold CV of a bagged xgb model; returns OOF predictions.

    `ypred` collects the validation predictions of every fold, so the
    returned vector is aligned with X_train's rows (out-of-fold stacking
    input).  `idx` is accepted for signature compatibility but unused.
    """
    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]  # per-fold log-loss scores
    count=0
    ypred=np.zeros(X_train.shape[0])  # out-of-fold prediction buffer
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=1  # number of bagged seeds (single run here)
        for j in range(m):
            clf=xgb_classifier(eta=0.01,min_child_weight=10,col=0.7,subsample=0.68,depth=5,num_round=500,seed=j*77,gamma=0)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred/(1+j)  # running average over seeds so far
            print j,llfun(y_test_cv,yqq)
        y_pred/=m;  # final bagged average
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        #clf.fit(X_train_cv,(y_train_cv))
        #y_pred=clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred[test_index]=y_pred  # store this fold's OOF slice
        print xx[-1]#,y_pred.shape
    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred
def xgb_meta_predict(data_base_dir,data_meta_part1_dir,submission_dir):
    """Train a series of xgb stackers on part-1 meta features and (optionally) save submissions.

    Each configuration block below records its observed private-leaderboard
    score; most `save_predictions` calls are commented out, so by default
    only the models are trained.  Meta features come from pickles produced
    by the pre-processing scripts (online model, RF, SGD, xgb, numericals).
    NOTE: `/2` on shapes is Python-2 integer division (first half = part 1).
    """
    test_id=pickle.load(open(data_base_dir+"test_id.p","rb"))
    y_all=pickle.load(open(data_base_dir+"y.p","rb"))
    y_part1=y_all[:y_all.shape[0]/2,:]  # labels for the first half
    X_numerical=pickle.load(open(data_base_dir+"X_numerical.p","rb"))
    X_numerical_part1=X_numerical[:X_numerical.shape[0]/2,:]
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    X_part1_xgb=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_xgb.p", "rb" ) )
    X_test_xgb =pickle.load(open(data_meta_part1_dir+ "X_test_meta_xgb_all.p", "rb" ) )
    X_part1_rf=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_rf.p", "rb" ) )
    X_test_rf=pickle.load(open(data_meta_part1_dir+ "X_test_meta_rf.p", "rb" ) )
    X_part1_sgd=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_sgd.p", "rb" ) )
    X_test_sgd=pickle.load(open(data_meta_part1_dir+ "X_test_meta_sgd.p", "rb" ) )
    X_part1_best_online=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_online.p", "rb" ) )
    X_test_best_online=pickle.load(open(data_meta_part1_dir+ "X_test_meta_online.p", "rb" ) )
    X_test_online_ensemble=pickle.load(open(data_meta_part1_dir+ "X_test_meta_online_ensemble.p", "rb" ) )
    # best single model submitted, private LB 0.0044595, X_test_meta
    xgb_clf=xgb_classifier(eta=0.09,min_child_weight=6,depth=18,num_round=120,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_part1_xgb,X_numerical_part1]), y_part1,np.hstack([X_test_online_ensemble, X_test_rf,X_test_sgd,X_test_xgb,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d18-e0.09-min6-tree120-xgb_base.csv.gz', test_id , X_xgb_predict)
    # best single model (not submitted by itself), private LB 0.0044591, not submitted alone
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=20,num_round=150,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_part1_xgb,X_numerical_part1]), y_part1,np.hstack([X_test_online_ensemble, X_test_rf,X_test_sgd,X_test_xgb,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d20-e0.07-min6-tree150-xgb_base.csv.gz', test_id , X_xgb_predict)
    # private LB 0.0047360 correct! try "boosting from existing predictions"
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=20,num_round=20,threads=16,exist_prediction=True,exist_num_round=150)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_best_online, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d20-e0.07-min6-tree20-extree-150.csv.gz', test_id , X_xgb_predict)
    # private LB 0.0047103,
    xgb_clf=xgb_classifier(eta=0.09,min_child_weight=6,depth=18,num_round=1,threads=16,exist_prediction=True,exist_num_round=120)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_online_ensemble, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    # save_predictions(submission_dir+'xgb-part1-d18-e0.09-min6-tree1-extree-120.csv.gz', test_id , X_xgb_predict)
    # private LB 0.0047000, using ensembled online predictions as meta feature for test sets!
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=20,num_round=150,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_online_ensemble, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d20-e0.07-min6-tree150.csv.gz', test_id , X_xgb_predict)
    # private LB 0.0047313, correct!
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=19,num_round=150,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_best_online, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d19-e0.07-min6-tree150.csv.gz', test_id , X_xgb_predict)
    # private LB 0.0047446, correct!
    xgb_clf=xgb_classifier(eta=0.09,min_child_weight=6,depth=18,num_round=120,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_best_online, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    # save_predictions(submission_dir+'xgb-part1-d18-e0.09-min6-tree120.csv.gz', test_id , X_xgb_predict)
def train_predict(X, y, Xt, yt=[], c=1):
    """Train a fixed xgb model on (X, y) and return predictions for Xt.

    Only mode c == 1 is implemented; any other value returns None.
    """
    if c != 1:
        return None
    booster = xgb_classifier(num_round=45, eta=0.1, min_child_weight=20,
                             depth=20, subsample=0.1, col=0.7)
    return booster.train_predict(X, y, Xt, yt)
def xgb_meta_predict(data_base_dir,data_meta_part1_dir,submission_dir):
    """Train xgb models on raw features and on sparse+meta stacks; save submissions.

    Produces three submissions: an all-label model on raw features, a
    label-33-only deep model on raw features, and a label-33-only model on
    the sparse matrix horizontally stacked with part-1 meta features.
    NOTE: `/2` on shapes is Python-2 integer division (first half = part 1).
    """
    test_id=pickle.load(open(data_base_dir+"test_id.p","rb"))
    y_all=pickle.load(open(data_base_dir+"y.p","rb"))
    X_all=pickle.load(open(data_base_dir+"X_all.p","rb"))
    X_test=pickle.load(open(data_base_dir+"X_test_all.p","rb"))
    y_part1=y_all[:y_all.shape[0]/2,:]
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=20,num_round=150,threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(X_all, y_all,X_test,predict_y14=True)
    save_predictions(submission_dir+'xgb-raw-d20-e0.07-min6-tree150.csv.gz', test_id , X_xgb_predict)
    xgb_clf=xgb_classifier(eta=0.1,min_child_weight=7,depth=100,num_round=150,threads=16)
    X_xgb_predict = xgb_clf.train_predict_label(X_all, y_all,X_test,label=33) # predict label 33 only
    save_predictions(submission_dir+'xgb-y33-d100-e0.1-min7-tree150.csv.gz', test_id , X_xgb_predict)
    X_part1_best_online=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_online.p", "rb" ) )
    X_test_best_online=pickle.load(open(data_meta_part1_dir+ "X_test_meta_online.p", "rb" ) )
    X_numerical=pickle.load(open(data_base_dir+"X_numerical.p","rb"))
    X_numerical_part1=X_numerical[:X_numerical.shape[0]/2,:]
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    X_part1_xgb=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_xgb.p", "rb" ) )
    X_test_xgb =pickle.load(open(data_meta_part1_dir+ "X_test_meta_xgb_all.p", "rb" ) )
    X_part1_rf=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_rf.p", "rb" ) )
    X_test_rf=pickle.load(open(data_meta_part1_dir+ "X_test_meta_rf.p", "rb" ) )
    X_part1_sgd=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_sgd.p", "rb" ) )
    X_test_sgd=pickle.load(open(data_meta_part1_dir+ "X_test_meta_sgd.p", "rb" ) )
    X_sparse=pickle.load(open(data_base_dir+"X_sparse.p","rb"))
    X_test_sparse=pickle.load(open(data_base_dir+"X_test_sparse.p","rb"))
    X_sparse_part1=X_sparse[:X_sparse.shape[0]/2,:]
    # Dense meta features are wrapped in a coo_matrix so they can be
    # hstacked with the sparse features, then converted to CSR for slicing.
    X=sparse.csr_matrix(sparse.hstack((X_sparse_part1,sparse.coo_matrix(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_part1_xgb,X_numerical_part1]).astype(float)))))
    Xt=sparse.csr_matrix(sparse.hstack((X_test_sparse,sparse.coo_matrix(np.hstack([X_test_best_online,X_test_rf,X_test_sgd,X_test_xgb,X_test_numerical]).astype(float)))))
    xgb_clf=xgb_classifier(eta=0.1,min_child_weight=6,depth=30,num_round=80,threads=16)
    X_xgb_predict = xgb_clf.train_predict_label(X, y_part1,Xt,label=33) # predict label 33 only
    save_predictions(submission_dir+'xgb-y33-d30-e0.1-min6-tree80-all-sparse.csv.gz', test_id , X_xgb_predict)
def train_predict(X, y, Xt, yt=[], c=1):
    """Train a model selected by the string code `c` and predict Xt.

    'xgb' -> gradient-boosted trees (returns train_predict output);
    'rf'  -> tuned RandomForestClassifier, positive-class probabilities;
    'rf1' -> large default RandomForestClassifier, positive-class probabilities.
    Any other code returns None.
    """
    if c == 'xgb':
        booster = xgb_classifier(num_round=200, eta=0.1, min_child_weight=2,
                                 depth=20, subsample=1, col=0.6)
        return booster.train_predict(X, y, Xt, yt)
    if c == 'rf':
        forest = RandomForestClassifier(n_estimators=200, n_jobs=-1, max_depth=13,
                                        min_samples_split=4, min_samples_leaf=9,
                                        max_leaf_nodes=1100)
    elif c == 'rf1':
        forest = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    else:
        return None
    forest.fit(X, y)
    return forest.predict_proba(Xt).T[1]
def train_predict(X, y, Xt, yt=[], c=1):
    """Train one of three models chosen by `c` and predict Xt.

    1 -> xgb classifier (train_predict output);
    2 -> RandomForestRegressor point predictions;
    3 -> RankSVM predictions.  Other codes return None.
    """
    if c == 1:
        booster = xgb_classifier(num_round=60, eta=0.1, min_child_weight=5,
                                 depth=7, subsample=1, col=1)
        return booster.train_predict(X, y, Xt, yt)
    if c == 2:
        model = RandomForestRegressor(n_estimators=200, n_jobs=-1, max_depth=13,
                                      min_samples_split=4, min_samples_leaf=9,
                                      max_leaf_nodes=1100)
    elif c == 3:
        model = RankSVM()
    else:
        return None
    model.fit(X, y)
    return model.predict(Xt)
def train_predict(X, y, Xt, yt=[], c=1):
    """Train one of three classifiers chosen by `c` and predict Xt.

    1 -> xgb classifier (train_predict output);
    2 -> LDA positive-class probabilities;
    3 -> LogisticRegression positive-class probabilities.
    Other codes return None.
    """
    if c == 1:
        booster = xgb_classifier(num_round=45, eta=0.1, min_child_weight=20,
                                 depth=20, subsample=0.1, col=0.7)
        return booster.train_predict(X, y, Xt, yt)
    if c == 2:
        model = LDA()
    elif c == 3:
        model = LogisticRegression()
    else:
        return None
    model.fit(X, y)
    return model.predict_proba(Xt)[:, 1]
def train_predict(X, y, Xt, yt=[], c=1):
    """Train one of three classifiers chosen by `c` and predict Xt.

    1 -> slow-eta/500-round xgb classifier (train_predict output);
    2 -> LDA positive-class probabilities;
    3 -> LogisticRegression positive-class probabilities.
    Other codes return None.
    """
    if c == 1:
        booster = xgb_classifier(num_round=500, eta=0.01, min_child_weight=20,
                                 depth=10, subsample=0.1, col=0.7)
        return booster.train_predict(X, y, Xt, yt)
    if c == 2:
        linear_model = LDA()
    elif c == 3:
        linear_model = LogisticRegression()
    else:
        return None
    linear_model.fit(X, y)
    return linear_model.predict_proba(Xt)[:, 1]
def kfold_cv(X_train, y_train, k): kf = StratifiedKFold(y_train, n_folds=k) xx = [] zz = [] ypred = np.zeros((y_train.shape[0], 3)) for train_index, test_index in kf: X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :] y_train_cv, y_test_cv = y_train[train_index], y_train[test_index] clf = xgb_classifier(eta=0.1, col=0.4, min_child_weight=10, depth=6, num_round=50) # good! y_pred = clf.multi(X_train_cv, y_train_cv, X_test_cv, 3, y_test=y_test_cv) xx.append(multiclass_log_loss(y_test_cv, y_pred)) print xx[-1] # ,y_pred.shape,zz[-1] ypred[test_index] = y_pred print "average:", np.mean(xx), "std", np.std(xx) return ypred, np.mean(xx)
def kfold_cv(X_train, y_train,k):
    """Stratified k-fold CV of a 3-class xgb model; returns (OOF preds, mean loss)."""
    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]  # per-fold multiclass log-loss scores
    zz=[]  # unused scratch list
    ypred=np.zeros((y_train.shape[0],3))  # out-of-fold class probabilities
    for train_index, test_index in kf:
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        clf=xgb_classifier(eta=0.1,gamma=1e-3,col=0.35,min_child_weight=0.5,depth=7,num_round=160)
        y_pred=clf.multi(X_train_cv,y_train_cv,X_test_cv,3,y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv,y_pred))
        print xx[-1]#,y_pred.shape,zz[-1]
        ypred[test_index]=y_pred  # store this fold's OOF slice
    print xx
    print 'average:',np.mean(xx),'std',np.std(xx)
    return ypred,np.mean(xx)
def train_predict(X, y, Xt, yt=[], c=1):
    """Train one of three classifiers chosen by `c` and predict Xt.

    1 -> slow-eta/500-round xgb classifier (train_predict output);
    2 -> LDA positive-class probabilities;
    3 -> LogisticRegression positive-class probabilities.
    Other codes return None.
    """
    if c == 1:
        clf = xgb_classifier(num_round=500, eta=0.01, min_child_weight=20,
                             depth=10, subsample=0.1, col=0.7)
        return clf.train_predict(X, y, Xt, yt)
    # Simple sklearn-style fallbacks share one fit/predict_proba path.
    factories = {2: LDA, 3: LogisticRegression}
    if c not in factories:
        return None
    estimator = factories[c]()
    estimator.fit(X, y)
    return estimator.predict_proba(Xt)[:, 1]
def kfold_cv(X_train, y_train,k):
    """Stratified k-fold CV of a 3-class xgb model; returns (OOF preds, mean loss)."""
    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]  # per-fold multiclass log-loss scores
    zz=[]  # unused scratch list
    ypred=np.zeros((y_train.shape[0],3))  # out-of-fold class probabilities
    for train_index, test_index in kf:
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        #clf=RandomForestClassifier(n_jobs=-1,max_depth=21,max_features=30,n_estimators=100)
        #clf.fit(X_train_cv,y_train_cv)
        #y_pred=clf.predict_proba(X_test_cv)
        clf=xgb_classifier(eta=0.25,col=0.4,min_child_weight=1,depth=6,num_round=70)
        y_pred=clf.multi(X_train_cv,y_train_cv,X_test_cv,3,y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv,y_pred))
        print xx[-1]#,y_pred.shape,zz[-1]
        ypred[test_index]=y_pred  # store this fold's OOF slice
    print xx
    print 'average:',np.mean(xx),'std',np.std(xx)
    return ypred,np.mean(xx)
# Stacking script fragment: appends base-model prediction columns (read from
# submission/CV CSVs) to the train matrix X and test matrix Xt, joins the
# sparse features, drops two bad columns, trains one xgb model and writes
# the stacked submission.  Assumes X, Xt, y, idx, idname, X_sparse and
# Xt_sparse are defined earlier in the full script — TODO confirm against
# the complete file.
X=np.vstack([X.T,pd.read_csv('../bench/h2ocv.csv',index_col=idname)['PredictedProb'].values]).T
X=np.vstack([X.T,pd.read_csv('../bench/gene1cv.csv',index_col=idname)['PredictedProb'].values]).T
labelname='PredictedProb'
# Test-side columns must be appended in the same order as the train side.
Xt=np.vstack([Xt.T,pd.read_csv('../tian/tian_sub.csv',index_col=idname)[labelname].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../cv/xgb4.csv',index_col=idname)[labelname].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../cv/ridge1.csv',index_col=idname)[labelname].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../cv/xgb11.csv',index_col=idname)[labelname].values]).T
#Xt=np.vstack([Xt.T,pd.read_csv('../cv/ex1.csv',index_col=idname)[labelname].values]).T
#Xt=np.vstack([Xt.T,pd.read_csv('../bench/bsub1.csv',index_col=idname).as_matrix().ravel()]).T
Xt=np.vstack([Xt.T,pd.read_csv('../srv3/sub/ridge1.csv',index_col=idname)[labelname].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../dahei/sub/ftrl2sub.csv',index_col=idname)['target'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/xgb1.csv',index_col=idname)['PredictedProb'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/tree1.csv',index_col=idname)['PredictedProb'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/ex5.csv',index_col=idname)['PredictedProb'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/h2o.csv',index_col=idname)['PredictedProb'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/gene1.csv',index_col=idname)['PredictedProb'].values]).T
print X.shape,y.shape,Xt.shape
X=sparse.hstack([X,X_sparse],format='csr')#.toarray()
Xt=sparse.hstack([Xt,Xt_sparse],format='csr')
print X.shape,y.shape,Xt.shape
bad=[8,114]  # column indices excluded from training
xx=[i for i in range(X.shape[1]) if i not in bad]
assert(X.shape[1]==Xt.shape[1])
clf=xgb_classifier(eta=0.01,min_child_weight=2,col=0.7,subsample=0.68,depth=5,num_round=500,seed=0,gamma=0)
yp=clf.train_predict(X[:,xx],y,Xt[:,xx])
s=pd.DataFrame({idname:idx,'PredictedProb':yp})
s.to_csv('stack21.csv',index=False)
def pre_processing_meta_random(data_base_dir,data_meta_random_dir):
    """Build meta features from a random 50/50 split and pickle them.

    The "base" half trains per-label level-1 models (RF on numerical
    features, LinearSVC on sparse features); their predictions on the
    "meta" half and on the test set are stacked column-wise and dumped.
    """
    # NOTE(review): this xgb_clf is constructed but never used below —
    # presumably left over from an earlier variant of this script.
    xgb_clf=xgb_classifier(eta=0.3,min_child_weight=6,depth=100,num_round=20,threads=16,exist_prediction=True,exist_num_round=20)
    y_all=pickle.load(open(data_base_dir+"y.p","rb"))
    X_numerical=pickle.load(open(data_base_dir+"X_numerical.p","rb"))
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    X_sparse=pickle.load(open(data_base_dir+"X_sparse.p","rb"))
    X_test_sparse=pickle.load(open(data_base_dir+"X_test_sparse.p","rb"))
    # Random half/half split keeping numerical, sparse and label rows aligned.
    X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = train_test_split( X_numerical, X_sparse, y_all, test_size = 0.5 )
    X_meta_rf=[]
    X_meta_svc=[]
    X_test_rf=[]
    X_test_svc=[]
    for i in range(33) :
        predicted = None
        if i==13:
            # label 13 is treated as constant-like and skipped
            print "%d is constant like: " % (i),"not included in meta features"
        else :
            print 'train',i
            y = y_base[:, i]
            rf = RandomForestClassifier(n_estimators = 150, n_jobs = 16)
            rf.fit(X_numerical_base, y)
            X_meta_rf.append(rf.predict_proba(X_numerical_meta))
            X_test_rf.append(rf.predict_proba(X_test_numerical))
            y = y_base[:, i]
            svm = LinearSVC()
            svm.fit(X_sparse_base, y)
            X_meta_svc.append(svm.decision_function(X_sparse_meta))
            X_test_svc.append(svm.decision_function(X_test_sparse))
    X_meta_rf = np.column_stack(X_meta_rf)
    X_test_rf= np.column_stack(X_test_rf)
    pickle.dump( X_meta_rf, open(data_meta_random_dir+ "X_meta_random_rf.p", "wb" ) )
    pickle.dump( X_test_rf, open(data_meta_random_dir+ "X_test_meta_rf.p", "wb" ) )
    X_meta_svc = np.column_stack(X_meta_svc)
    X_test_svc= np.column_stack(X_test_svc)
    pickle.dump( X_meta_svc, open(data_meta_random_dir+ "X_meta_random_svc.p", "wb" ) )
    pickle.dump( X_test_svc, open(data_meta_random_dir+ "X_test_meta_svc.p", "wb" ) )
    pickle.dump( y_meta, open(data_meta_random_dir+ "y_meta.p", "wb" ) )
    pickle.dump( y_base, open(data_meta_random_dir+ "y_base.p", "wb" ) )
    pickle.dump( X_numerical_meta, open(data_meta_random_dir+ "X_numerical_meta.p", "wb" ) )
# Script fragment: one-hot encodes low-cardinality columns of the train
# frame, rebuilds the test matrix with the same vectorizer, then trains a
# 10-seed bagged xgb model and writes a submission.  Assumes train, vec,
# X, y, bad and idname come from earlier in the full script — TODO confirm.
names_categorical = []
for name in train.columns.values :
    # columns with < 200 distinct values are treated as categorical
    if train[name].value_counts().shape[0]<200:
        train[name] = map(str, train[name])
        names_categorical.append(name)
        print name,train[name].value_counts().shape[0]
X_sparse = vec.fit_transform(train[names_categorical].T.to_dict().values())
idx=np.array(train.index)
del train
gc.collect()
X=sparse.hstack([X,X_sparse],format='csr')#.toarray()
train=pd.read_csv('test_clean1.csv',index_col=idname)
train.drop(bad,inplace=True,axis=1)
Xt=train.as_matrix()
# reuse the fitted vectorizer so test columns align with train columns
Xt_sparse = vec.transform(train[names_categorical].T.to_dict().values())
idx=np.array(train.index)
Xt=sparse.hstack([Xt,Xt_sparse],format='csr')
print X.shape,y.shape,Xt.shape
yp=np.zeros(Xt.shape[0])
m=10  # number of bagged seeds
for j in range(m):
    clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=200,seed=j*77,gamma=0.1)
    yp+=clf.train_predict(X,y,Xt)
yp/=m;  # average the bagged predictions
s=pd.DataFrame({idname:idx,'PredictedProb':yp})
s.to_csv('xgb3.csv',index=False)
# Script fragment: reads the test ids, then runs a (currently single-point)
# hyper-parameter grid, training one xgb model per combination and writing
# its predictions.  Assumes X, y and Xt exist from earlier in the script.
test = pd.read_csv('../../../input/patients_test.csv', index_col='patient_id')
idx = np.array(test.index)  # only the index is needed; free the frame
del test
gc.collect()
print X.shape, y.shape, Xt.shape
from xgb_classifier import xgb_classifier
eta = 0.1
myname = sys.argv[0]
# Each list holds one value, so exactly one model is trained; widen the
# lists to sweep the grid.
for seed in [0]: #[i*777 for i in range(1,10)]:
    for depth in [10]:
        for child in [2]:
            for col in [0.4]:
                for sub in [1]:
                    for num in [2000]:
                        clf = xgb_classifier(eta=eta, min_child_weight=child, depth=depth, num_round=num, col=col, subsample=sub, seed=seed)
                        ypred = clf.train_predict(X, y, Xt)
                        s = pd.DataFrame({ 'patient_id': idx, 'predict_screener': ypred })
                        s.to_csv('rxgb5.csv', index=False)
                        #s.to_csv('va_result/%s_eta_%f_depth_%d_child_%d_col_%f_sub_%f_num_%d_seed_%d_score_%f'% (myname,eta,depth,child,col,sub,num,seed,score),index=False)
# Script fragment: loads cleaned train/test CSVs, trains one xgb model and
# writes the first submission.  get_data is a disk-cached svmlight loader
# (cached under ./mycache); it is defined here but not called in this
# fragment.
from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file
mem = Memory("./mycache")
@mem.cache
def get_data(path):
    """Load an svmlight file, returning (features, labels); results are memoized on disk."""
    data = load_svmlight_file(path)
    return data[0], data[1]
idname='ID'
labelname='target'
train=pd.read_csv('train_clean1.csv',index_col=idname)
y=np.array(train[labelname]).astype(float)
train.drop([labelname],inplace=True,axis=1)
X=train.as_matrix()
del train  # free the frame once the matrix is extracted
train=pd.read_csv('test_clean1.csv',index_col=idname)
Xt=train.as_matrix()
idx=np.array(train.index)
del train
clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.7,subsample=1,depth=10,num_round=50,seed=0,gamma=0.1)
yp=clf.train_predict(X,y,Xt)
s=pd.DataFrame({idname:idx,'PredictedProb':yp})
s.to_csv('xgb1.csv',index=False)
def xgb_meta_predict(data_base_dir, data_meta_part2_dir, submission_dir):
    """Train four xgb stackers on part-2 meta features and save submissions.

    Meta features (online model, RF, SVC, SGD) are loaded from pickles and
    horizontally stacked in different combinations; the "private LB"
    comments record the leaderboard score of each configuration.
    NOTE: `/ 2` on shapes is Python-2 integer division (second half = part 2).
    """
    test_id = pickle.load(open(data_base_dir + "test_id.p", "rb"))
    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    y_part2 = y_all[y_all.shape[0] / 2:, :]
    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_numerical_part2 = X_numerical[X_numerical.shape[0] / 2:, :]
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))
    X_part2_rf = pickle.load(
        open(data_meta_part2_dir + "X_meta_part2_rf.p", "rb"))
    X_test_rf = pickle.load(
        open(data_meta_part2_dir + "X_test_meta_rf.p", "rb"))
    X_part2_svc = pickle.load(
        open(data_meta_part2_dir + "X_meta_part2_svc.p", "rb"))
    X_test_svc = pickle.load(
        open(data_meta_part2_dir + "X_test_meta_svc.p", "rb"))
    X_part2_sgd = pickle.load(
        open(data_meta_part2_dir + "X_meta_part2_sgd.p", "rb"))
    X_test_sgd = pickle.load(
        open(data_meta_part2_dir + "X_test_meta_sgd.p", "rb"))
    X_part2_best_online = pickle.load(
        open(data_meta_part2_dir + "X_meta_part2_online.p", "rb"))
    X_test_best_online = pickle.load(
        open(data_meta_part2_dir + "X_test_meta_online.p", "rb"))
    # private LB 0.0048854
    xgb_clf = xgb_classifier(eta=0.09, min_child_weight=6, depth=18, num_round=120, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_part2_best_online, X_part2_rf, X_numerical_part2]),
        y_part2,
        np.hstack([X_test_best_online, X_test_rf, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part2-d18-e0.09-min6-tree120.csv.gz',
        test_id, X_xgb_predict)
    # private LB 0.0048763
    xgb_clf = xgb_classifier(eta=0.07, min_child_weight=6, depth=20, num_round=150, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_part2_best_online, X_part2_rf, X_numerical_part2]),
        y_part2,
        np.hstack([X_test_best_online, X_test_rf, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part2-d20-e0.07-min6-tree150.csv.gz',
        test_id, X_xgb_predict)
    # private LB 0.0048978
    xgb_clf = xgb_classifier(eta=0.09, min_child_weight=6, depth=18, num_round=100, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack(
        [X_part2_best_online, X_part2_rf, X_part2_svc, X_numerical_part2]),
        y_part2,
        np.hstack([
            X_test_best_online, X_test_rf, X_test_svc, X_test_numerical
        ]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part2-d18-svc-e0.09-min6-tree100.csv.gz',
        test_id, X_xgb_predict)
    # private LB 0.0050270
    xgb_clf = xgb_classifier(eta=0.1, min_child_weight=6, depth=20, num_round=110, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_part2_best_online, X_part2_rf, X_part2_svc, X_part2_sgd]),
        y_part2,
        np.hstack([X_test_best_online, X_test_rf, X_test_svc, X_test_sgd]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part2-d20-e0.1-min6-tree110-metaonly.csv.gz',
        test_id, X_xgb_predict)
def pre_processing_meta_part1(data_base_dir,data_meta_part1_dir):
    """Build part-1 meta features from models trained on part 2 and pickle them.

    Level-1 models (xgb on all features, per-label RF on numericals, per-label
    SGD on sparse features) are fit on the second half of the training data and
    predict the first half and the test set.  Online-model predictions are
    read from CSV and reshaped to (rows, 32 labels).
    NOTE: `/2` on shapes is Python-2 integer division.
    """
    y_all=pickle.load(open(data_base_dir+"y.p","rb"))
    y_part2=y_all[y_all.shape[0]/2:,:]
    X_all=pickle.load(open(data_base_dir+"X_all.p","rb"))
    X_test=pickle.load(open(data_base_dir+"X_test_all.p","rb"))
    X_part1=X_all[:X_all.shape[0]/2,:]
    X_part2=X_all[X_all.shape[0]/2:,:]
    X_numerical=pickle.load(open(data_base_dir+"X_numerical.p","rb"))
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    X_numerical_part1=X_numerical[:X_numerical.shape[0]/2,:]
    X_numerical_part2=X_numerical[X_numerical.shape[0]/2:,:]
    X_sparse=pickle.load(open(data_base_dir+"X_sparse.p","rb"))
    X_test_sparse=pickle.load(open(data_base_dir+"X_test_sparse.p","rb"))
    X_sparse_part1=X_sparse[:X_sparse.shape[0]/2,:]
    X_sparse_part2=X_sparse[X_sparse.shape[0]/2:,:]
    X_part1_xgb=[]
    X_part1_rf=[]
    X_part1_sgd=[]
    X_test_xgb=[]
    X_test_rf=[]
    X_test_sgd=[]
    # use pypy to accelerate online model
    X_part1_best_online=np.array(pd.read_csv(data_meta_part1_dir+'part1_online.csv')[['pred']])
    X_part1_best_online=X_part1_best_online.reshape((X_part1_best_online.shape[0]/32,32))
    X_test_best_online=np.array(pd.read_csv(data_meta_part1_dir+'best_online_test.csv')[['pred']])
    X_test_best_online=X_test_best_online.reshape((X_test_best_online.shape[0]/32,32))
    pickle.dump( X_part1_best_online, open(data_meta_part1_dir+ "X_meta_part1_online.p", "wb" ) )
    pickle.dump( X_test_best_online, open(data_meta_part1_dir+ "X_test_meta_online.p", "wb" ) )
    xgb_clf=xgb_classifier(eta=0.3,min_child_weight=6,depth=100,num_round=20,threads=16,exist_prediction=True,exist_num_round=20)
    X_part1_xgb = xgb_clf.train_predict_all_labels(X_part2, y_part2,X_part1,predict_y14=False)
    X_test_xgb = xgb_clf.train_predict_all_labels(X_all, y_all,X_test,predict_y14=False) # a little trick to make test data's meta features more accurate
    pickle.dump( X_part1_xgb, open(data_meta_part1_dir+ "X_meta_part1_xgb.p", "wb" ) )
    pickle.dump( X_test_xgb, open(data_meta_part1_dir+ "X_test_meta_xgb_all.p", "wb" ) )
    for i in range(33) :
        predicted = None
        if i==13:
            # label 13 is treated as constant-like and skipped
            print "%d is constant like: " % (i),"not included in meta features"
        else :
            print 'train',i
            y = y_part2[:, i]
            rf = RandomForestClassifier(n_estimators=200, n_jobs=16, min_samples_leaf = 10,random_state=1,bootstrap=False,criterion='entropy',min_samples_split=5,verbose=1)
            rf.fit(X_numerical_part2, y)
            X_part1_rf.append(rf.predict_proba(X_numerical_part1))
            X_test_rf.append(rf.predict_proba(X_test_numerical))
            y = y_part2[:, i]
            clf=SGDClassifier(loss='log',alpha=0.000001,n_iter=100)
            clf.fit(X_sparse_part2,y)
            X_part1_sgd.append(clf.predict_proba(X_sparse_part1).T[1])
            X_test_sgd.append(clf.predict_proba(X_test_sparse).T[1])
    X_part1_rf = np.column_stack(X_part1_rf)
    X_test_rf= np.column_stack(X_test_rf)
    pickle.dump( X_part1_rf, open(data_meta_part1_dir+ "X_meta_part1_rf.p", "wb" ) )
    pickle.dump( X_test_rf, open(data_meta_part1_dir+ "X_test_meta_rf.p", "wb" ) )
    X_part1_sgd = np.column_stack(X_part1_sgd)
    X_test_sgd= np.column_stack(X_test_sgd)
    pickle.dump( X_part1_sgd, open(data_meta_part1_dir+ "X_meta_part1_sgd.p", "wb" ) )
    pickle.dump( X_test_sgd, open(data_meta_part1_dir+ "X_test_meta_sgd.p", "wb" ) )
# Script fragment: the opening statements reference `i` and `path`, which
# presumably come from an enclosing per-file loop not visible here — TODO
# confirm against the complete script.  After assembling the test matrix,
# an RF provides base predictions that an xgb model boosts from, and the
# result is written as a submission.
df = pd.read_csv(path)
df = df[df.FeedBackEvent != 0]  # keep only feedback-event rows
df = df.drop('FeedBackEvent', axis = 1)
if i == 0:
    test = np.array(df)
else:
    test = np.vstack((test, np.array(df)))
"""
import pickle
pickle.dump(train,open("train.p","wb"))
pickle.dump(test,open("test.p","wb"))
pickle.dump(labels.Prediction.values,open("label.p","wb"))
import pickle
train=pickle.load(open("train.p","rb"))
test=pickle.load(open("test.p","rb"))
label=pickle.load(open("label.p","rb"))
"""
clf = ensemble.RandomForestClassifier(n_jobs = -1, n_estimators=10, min_samples_leaf=10, random_state=42)
xgb_clf=xgb_classifier(eta=0.1,min_child_weight=1,depth=10,num_round=40,threads=8,boost_from_exist_prediction=True,exist_num_round=10)
clf.fit(train, labels.Prediction.values)
# RF probabilities serve as the existing predictions xgb boosts from
base_train_prediction=clf.predict_proba(train).T[1]
base_test_prediction=clf.predict_proba(test).T[1]
preds = xgb_clf.train_predict(train,labels.Prediction.values,test,base_train_prediction,base_test_prediction)
submission['Prediction'] = preds
submission.to_csv('xgb_boost_from_rf.csv', index = False)
def train_predict(X, y, Xt, yt=None, c=1):
    """Train an xgboost classifier on (X, y) and predict on Xt.

    Parameters
    ----------
    X, y : training features and labels.
    Xt   : test features to predict on.
    yt   : optional test labels passed through for eval logging; defaults
           to an empty list (same effective default as before).
    c    : configuration selector; only c == 1 is implemented.

    Returns the predictions from xgb_classifier.train_predict, or None
    when c != 1 (unchanged fall-through behavior).
    """
    # Fix: avoid the mutable-default-argument pitfall (yt=[]) while keeping
    # the identical value seen by callees when yt is omitted.
    if yt is None:
        yt = []
    if c == 1:
        # Earlier tuning kept for reference:
        # clf = xgb_classifier(num_round=45, eta=0.1, min_child_weight=5, depth=10, subsample=0.5, col=1)
        clf = xgb_classifier(num_round=45, eta=0.1, min_child_weight=20,
                             depth=20, subsample=0.1, col=0.7)
        return clf.train_predict(X, y, Xt, yt)
# Carve off one KFold fold (break after the first) to shrink the training
# data, then train a single xgboost configuration and write rxgb5.csv.
kf=KFold(len(y),n_folds=4)
for train_index, test_index in kf:
    Xt=X[test_index]       # held-out fold features
    X=X[train_index]
    idx=idx[test_index]
    yt=y[test_index]
    y=y[train_index]
    break                  # only the first fold is used
# NOTE(review): idx is re-read from the real test CSV below while Xt still
# comes from the fold above — confirm upstream that X actually contains the
# rows matching patients_test.csv.
test=pd.read_csv('../../../input/patients_test.csv',index_col='patient_id')
idx=np.array(test.index)
del test
gc.collect()               # free the DataFrame before training
print X.shape,y.shape,Xt.shape
from xgb_classifier import xgb_classifier
eta=0.1
myname=sys.argv[0]
# Single-point "grid" — the nested loops keep the scaffolding for wider sweeps.
for seed in [0]:#[i*777 for i in range(1,10)]:
    for depth in [10]:
        for child in [2]:
            for col in [0.4]:
                for sub in [1]:
                    for num in [2000]:
                        clf=xgb_classifier(eta=eta,min_child_weight=child,depth=depth,num_round=num,col=col,subsample=sub,seed=seed)
                        ypred=clf.train_predict(X,y,Xt)
                        s=pd.DataFrame({'patient_id':idx,'predict_screener':ypred})
                        s.to_csv('rxgb5.csv',index=False)
                        #s.to_csv('va_result/%s_eta_%f_depth_%d_child_%d_col_%f_sub_%f_num_%d_seed_%d_score_%f'% (myname,eta,depth,child,col,sub,num,seed,score),index=False)
def pre_processing_meta_part2(data_base_dir, data_meta_part2_dir):
    """Build part-2 meta features for stacking.

    Level-1 models (RandomForest on numerical features, LinearSVC and SGD
    on sparse features) are trained on part 1 of the data and predict on
    part 2 and on the test set; the stacked prediction columns are pickled
    under data_meta_part2_dir.  Online-model predictions read from CSV are
    reshaped to (n_samples, 32) and pickled as well.
    """
    # NOTE(review): xgb_clf is constructed but never used in this function
    # body — looks like leftover code; confirm before removing.
    xgb_clf = xgb_classifier(eta=0.3, min_child_weight=6, depth=100,
                             num_round=20, threads=16,
                             exist_prediction=True, exist_num_round=20)
    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    # Integer '/' on shape implies Python 2 semantics — confirm interpreter.
    y_part1 = y_all[:y_all.shape[0] / 2, :]
    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))
    X_numerical_part1 = X_numerical[:X_numerical.shape[0] / 2, :]
    X_numerical_part2 = X_numerical[X_numerical.shape[0] / 2:, :]
    X_sparse = pickle.load(open(data_base_dir + "X_sparse.p", "rb"))
    X_test_sparse = pickle.load(open(data_base_dir + "X_test_sparse.p", "rb"))
    X_sparse_part1 = X_sparse[:X_sparse.shape[0] / 2, :]
    X_sparse_part2 = X_sparse[X_sparse.shape[0] / 2:, :]
    X_part2_rf = []
    X_part2_svc = []
    X_part2_sgd = []
    X_test_rf = []
    X_test_svc = []
    X_test_sgd = []
    # use pypy to accelerate online model
    X_part2_best_online = np.array(
        pd.read_csv(data_meta_part2_dir + 'part2_online.csv')[['pred']])
    # Flat prediction column folded into 32 predictions per sample.
    X_part2_best_online = X_part2_best_online.reshape(
        (X_part2_best_online.shape[0] / 32, 32))
    pickle.dump(X_part2_best_online,
                open(data_meta_part2_dir + "X_meta_part2_online.p", "wb"))
    X_test_best_online = np.array(
        pd.read_csv(data_meta_part2_dir + 'best_online_test.csv')[['pred']])
    X_test_best_online = X_test_best_online.reshape(
        (X_test_best_online.shape[0] / 32, 32))
    pickle.dump(X_test_best_online,
                open(data_meta_part2_dir + "X_test_meta_online.p", "wb"))
    # One level-1 model per label column; label 13 is skipped as constant-like.
    for i in range(33):
        predicted = None
        if i == 13:
            print("%d is constant like: " % (i),
                  "not included in meta features")
        else:
            print('train', i)
            y = y_part1[:, i]
            rf = RandomForestClassifier(n_estimators=200, n_jobs=16,
                                        min_samples_leaf=10, random_state=1,
                                        bootstrap=False, criterion='entropy',
                                        min_samples_split=5, verbose=1)
            rf.fit(X_numerical_part1, y)
            X_part2_rf.append(rf.predict_proba(X_numerical_part2))
            X_test_rf.append(rf.predict_proba(X_test_numerical))
            y = y_part1[:, i]
            svm = LinearSVC(C=0.17)
            svm.fit(X_sparse_part1, y)
            # SVC contributes raw decision values, not probabilities.
            X_part2_svc.append(svm.decision_function(X_sparse_part2))
            X_test_svc.append(svm.decision_function(X_test_sparse))
            y = y_part1[:, i]
            clf = SGDClassifier(loss='log', alpha=0.000001, n_iter=100)
            clf.fit(X_sparse_part1, y)
            X_part2_sgd.append(clf.predict_proba(X_sparse_part2).T[1])
            X_test_sgd.append(clf.predict_proba(X_test_sparse).T[1])
    # Stack per-label columns side by side and persist each feature family.
    X_part2_rf = np.column_stack(X_part2_rf)
    X_test_rf = np.column_stack(X_test_rf)
    pickle.dump(X_part2_rf,
                open(data_meta_part2_dir + "X_meta_part2_rf.p", "wb"))
    pickle.dump(X_test_rf,
                open(data_meta_part2_dir + "X_test_meta_rf.p", "wb"))
    X_part2_svc = np.column_stack(X_part2_svc)
    X_test_svc = np.column_stack(X_test_svc)
    pickle.dump(X_part2_svc,
                open(data_meta_part2_dir + "X_meta_part2_svc.p", "wb"))
    pickle.dump(X_test_svc,
                open(data_meta_part2_dir + "X_test_meta_svc.p", "wb"))
    X_part2_sgd = np.column_stack(X_part2_sgd)
    X_test_sgd = np.column_stack(X_test_sgd)
    pickle.dump(X_part2_sgd,
                open(data_meta_part2_dir + "X_meta_part2_sgd.p", "wb"))
    pickle.dump(X_test_sgd,
                open(data_meta_part2_dir + "X_test_meta_sgd.p", "wb"))
def xgb_meta_predict(data_base_dir, data_meta_random_dir, submission_dir):
    """Train level-2 xgboost models on the random-split meta features and
    write one gzipped submission per hyper-parameter setting.

    Inline comments record the private-leaderboard score each submission
    achieved.
    """
    def _load(path):
        # All inputs are pickled by the pre-processing step.
        return pickle.load(open(path, "rb"))

    test_id = _load(data_base_dir + "test_id.p")
    y_meta = _load(data_meta_random_dir + "y_meta.p")
    X_numerical_random = _load(data_meta_random_dir + "X_numerical_meta.p")
    X_test_numerical = _load(data_base_dir + "X_test_numerical.p")
    X_random_rf = _load(data_meta_random_dir + "X_meta_random_rf.p")
    X_test_rf = _load(data_meta_random_dir + "X_test_meta_rf.p")
    X_random_svc = _load(data_meta_random_dir + "X_meta_random_svc.p")
    X_test_svc = _load(data_meta_random_dir + "X_test_meta_svc.p")

    # Same feature layout for train and test: RF meta + SVC meta + numerical.
    X_meta = np.hstack([X_random_rf, X_random_svc, X_numerical_random])
    X_test = np.hstack([X_test_rf, X_test_svc, X_test_numerical])

    configs = [
        # private LB 0.0054101
        (dict(eta=0.2, min_child_weight=1, depth=10, num_round=70, threads=16),
         'xgb-random-d10-e0.2-min1-tree70.csv.gz'),
        # private LB 0.0053053
        (dict(eta=0.2, min_child_weight=6, depth=12, num_round=80, threads=16),
         'xgb-random-d12-e0.2-min6-tree80.csv.gz'),
        # private LB 0.0052910
        (dict(eta=0.09, min_child_weight=6, depth=25, num_round=100, threads=16),
         'xgb-random-d25-svc-e0.09-min6-tree100.csv.gz'),
    ]
    for params, filename in configs:
        model = xgb_classifier(**params)
        predictions = model.train_predict_all_labels(
            X_meta, y_meta, X_test, predict_y14=True)
        save_predictions(submission_dir + filename, test_id, predictions)
# Continue loading sparse test blocks, add per-row sums as extra features,
# join engineered CSV columns, then fit a 3-class xgboost model (xgb4.csv).
# NOTE(review): Xt, X1, X, y, idname, label are defined upstream of this chunk.
X2, _ = get_data('../sparse/rebuild3_test.svm')
X3, _ = get_data('../sparse/rebuild4_test.svm')
X4, _ = get_data('../sparse/rebuild5_test.svm')
# Per-row totals of each sparse block used as dense side features.
X6 = np.sum(Xt.todense(), axis=1)
X7 = np.sum(X1.todense(), axis=1)
X8 = np.sum(X2.todense(), axis=1)
X9 = np.sum(X3.todense(), axis=1)
X10 = np.sum(X4.todense(), axis=1)
Xt = sparse.hstack([Xt, X1, X2, X3, X4, X6, X7, X8, X9, X10],
                   format='csr').todense()
train = pd.read_csv('../explore/test1.csv')
idx = train[idname].as_matrix().astype(int)
Xt = np.hstack([Xt, train.drop([label, idname], axis=1).as_matrix()])
print X.shape, y.shape
print Xt.shape  #, y.sha
clf = xgb_classifier(eta=0.25, col=0.7, min_child_weight=1, depth=6,
                     num_round=70)
yp = clf.multi(X, y, Xt, 3)  # 3-class probability predictions
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb4.csv', index=False)
def pre_processing_meta_random(data_base_dir, data_meta_random_dir):
    """Build 'random split' meta features for stacking.

    The data is split 50/50 at random; level-1 models (RandomForest on
    numerical, LinearSVC on sparse) train on the base half and predict on
    the meta half and the test set.  All outputs are pickled under
    data_meta_random_dir.
    """
    # NOTE(review): xgb_clf is constructed but never used in this function
    # body — looks like leftover code; confirm before removing.
    xgb_clf = xgb_classifier(eta=0.3, min_child_weight=6, depth=100,
                             num_round=20, threads=16,
                             exist_prediction=True, exist_num_round=20)
    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))
    X_sparse = pickle.load(open(data_base_dir + "X_sparse.p", "rb"))
    X_test_sparse = pickle.load(open(data_base_dir + "X_test_sparse.p", "rb"))
    # Random 50/50 split: base half trains level-1 models, meta half
    # receives their predictions as stacking features.
    X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = train_test_split(
        X_numerical, X_sparse, y_all, test_size=0.5)
    X_meta_rf = []
    X_meta_svc = []
    X_test_rf = []
    X_test_svc = []
    # One model per label column; label 13 skipped as constant-like.
    for i in range(33):
        predicted = None
        if i == 13:
            print "%d is constant like: " % (
                i), "not included in meta features"
        else:
            print 'train', i
            y = y_base[:, i]
            rf = RandomForestClassifier(n_estimators=150, n_jobs=16)
            rf.fit(X_numerical_base, y)
            X_meta_rf.append(rf.predict_proba(X_numerical_meta))
            X_test_rf.append(rf.predict_proba(X_test_numerical))
            y = y_base[:, i]
            svm = LinearSVC()
            svm.fit(X_sparse_base, y)
            # SVC contributes raw decision values, not probabilities.
            X_meta_svc.append(svm.decision_function(X_sparse_meta))
            X_test_svc.append(svm.decision_function(X_test_sparse))
    # Stack per-label columns side by side and persist everything the
    # level-2 step needs (including the labels and numerical meta half).
    X_meta_rf = np.column_stack(X_meta_rf)
    X_test_rf = np.column_stack(X_test_rf)
    pickle.dump(X_meta_rf,
                open(data_meta_random_dir + "X_meta_random_rf.p", "wb"))
    pickle.dump(X_test_rf,
                open(data_meta_random_dir + "X_test_meta_rf.p", "wb"))
    X_meta_svc = np.column_stack(X_meta_svc)
    X_test_svc = np.column_stack(X_test_svc)
    pickle.dump(X_meta_svc,
                open(data_meta_random_dir + "X_meta_random_svc.p", "wb"))
    pickle.dump(X_test_svc,
                open(data_meta_random_dir + "X_test_meta_svc.p", "wb"))
    pickle.dump(y_meta, open(data_meta_random_dir + "y_meta.p", "wb"))
    pickle.dump(y_base, open(data_meta_random_dir + "y_base.p", "wb"))
    pickle.dump(X_numerical_meta,
                open(data_meta_random_dir + "X_numerical_meta.p", "wb"))
""" import pickle pickle.dump(train,open("train.p","wb")) pickle.dump(test,open("test.p","wb")) pickle.dump(labels.Prediction.values,open("label.p","wb")) import pickle train=pickle.load(open("train.p","rb")) test=pickle.load(open("test.p","rb")) label=pickle.load(open("label.p","rb")) """ clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=10, min_samples_leaf=10, random_state=42) xgb_clf = xgb_classifier(eta=0.1, min_child_weight=1, depth=10, num_round=40, threads=8, boost_from_exist_prediction=True, exist_num_round=10) clf.fit(train, labels.Prediction.values) base_train_prediction = clf.predict_proba(train).T[1] base_test_prediction = clf.predict_proba(test).T[1] preds = xgb_clf.train_predict(train, labels.Prediction.values, test, base_train_prediction, base_test_prediction) submission['Prediction'] = preds submission.to_csv('xgb_boost_from_rf.csv', index=False)
# Load the sparse test blocks, assemble the dense test matrix, and fit a
# 3-class xgboost model (xgb1.csv).
# NOTE(review): `train`, `idname`, `label`, `X`, `get_data` come from
# upstream code not visible here.
idx = train[idname].as_matrix()
y = np.array(train[label])
import pickle
#X=np.hstack([X,pickle.load(open('count.p'))])
Xt, _ = get_data('../sparse/rebuild1_test.svm')
X1, _ = get_data('../sparse/rebuild2_test.svm')
X2, _ = get_data('../sparse/rebuild3_test.svm')
X3, _ = get_data('../sparse/rebuild4_test.svm')
X4, _ = get_data('../sparse/rebuild5_test.svm')
Xt = sparse.hstack([Xt, X1, X2, X3, X4], format='csr').todense()
train = pd.read_csv('../input/test.csv')
idx = train[idname].as_matrix().astype(int)  # test ids for the submission
print X.shape, y.shape
print Xt.shape  #, y.sha
clf = xgb_classifier(eta=0.01, col=0.8, min_child_weight=2, depth=4,
                     num_round=50)  #good!
yp = clf.multi(X, y, Xt, 3)  # 3-class probability predictions
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb1.csv', index=False)
# Row-level aggregate features: per-row sum and std of each sparse block,
# appended as dense columns before fitting a 3-class xgboost model (xgb5.csv).
# NOTE(review): xx, Xt, X1..X4, X, y, idname, label come from upstream code.
xx.append(np.sum(Xt.todense(),axis=1))
xx.append(np.sum(X1.todense(),axis=1))
xx.append(np.sum(X2.todense(),axis=1))
xx.append(np.sum(X3.todense(),axis=1))
xx.append(np.sum(X4.todense(),axis=1))
xx.append(np.std(Xt.todense(),axis=1))
xx.append(np.std(X1.todense(),axis=1))
xx.append(np.std(X2.todense(),axis=1))
xx.append(np.std(X3.todense(),axis=1))
xx.append(np.std(X4.todense(),axis=1))
xx=np.hstack(xx)
# Sparse blocks + aggregates + pre-built Xt2 features, all densified.
Xt=sparse.hstack([Xt,X1,X2,X3,X4,xx,pickle.load(open('../explore/Xt2.p'))],format='csr').todense()
train=pd.read_csv('../explore/test1.csv')
idx=train[idname].as_matrix().astype(int)
Xt=np.hstack([Xt,train.drop([label,idname],axis=1).as_matrix()])
print X.shape, y.shape
print Xt.shape#, y.sha
clf=xgb_classifier(eta=0.1,col=0.2,min_child_weight=1,depth=6,num_round=200)
yp=clf.multi(X,y,Xt,3)  # 3-class probability predictions
s=pd.DataFrame({idname:idx,'predict_0':yp[:,0],'predict_1':yp[:,1],'predict_2':yp[:,2]})
s.to_csv('xgb5.csv',index=False)
def xgb_meta_predict(data_base_dir, data_meta_part1_dir, submission_dir):
    """Train level-2 xgboost models on the part-1 meta features and write
    one gzipped submission per hyper-parameter setting.

    Inline comments record the private-leaderboard score observed for each
    submission.  Train features always come from part 1; the test-side
    online feature varies between the single best online model and the
    online ensemble.
    """
    test_id = pickle.load(open(data_base_dir + "test_id.p", "rb"))
    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    # Integer '/' on shape implies Python 2 semantics — confirm interpreter.
    y_part1 = y_all[:y_all.shape[0] / 2, :]
    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_numerical_part1 = X_numerical[:X_numerical.shape[0] / 2, :]
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))
    # Meta features produced by the pre-processing steps.
    X_part1_xgb = pickle.load(
        open(data_meta_part1_dir + "X_meta_part1_xgb.p", "rb"))
    X_test_xgb = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_xgb_all.p", "rb"))
    X_part1_rf = pickle.load(
        open(data_meta_part1_dir + "X_meta_part1_rf.p", "rb"))
    X_test_rf = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_rf.p", "rb"))
    X_part1_sgd = pickle.load(
        open(data_meta_part1_dir + "X_meta_part1_sgd.p", "rb"))
    X_test_sgd = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_sgd.p", "rb"))
    X_part1_best_online = pickle.load(
        open(data_meta_part1_dir + "X_meta_part1_online.p", "rb"))
    X_test_best_online = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_online.p", "rb"))
    X_test_online_ensemble = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_online_ensemble.p", "rb"))
    # best single model submitted, private LB 0.0044595, X_test_meta
    xgb_clf = xgb_classifier(eta=0.09, min_child_weight=6, depth=18,
                             num_round=120, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([
            X_part1_best_online, X_part1_rf, X_part1_sgd, X_part1_xgb,
            X_numerical_part1
        ]), y_part1,
        np.hstack([
            X_test_online_ensemble, X_test_rf, X_test_sgd, X_test_xgb,
            X_test_numerical
        ]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d18-e0.09-min6-tree120-xgb_base.csv.gz',
        test_id, X_xgb_predict)
    # best single model (not submitted by itself), private LB 0.0044591, not submitted alone
    xgb_clf = xgb_classifier(eta=0.07, min_child_weight=6, depth=20,
                             num_round=150, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([
            X_part1_best_online, X_part1_rf, X_part1_sgd, X_part1_xgb,
            X_numerical_part1
        ]), y_part1,
        np.hstack([
            X_test_online_ensemble, X_test_rf, X_test_sgd, X_test_xgb,
            X_test_numerical
        ]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d20-e0.07-min6-tree150-xgb_base.csv.gz',
        test_id, X_xgb_predict)
    # private LB 0.0047360 correct! try "boosting from existing predictions"
    xgb_clf = xgb_classifier(eta=0.07, min_child_weight=6, depth=20,
                             num_round=20, threads=16, exist_prediction=True,
                             exist_num_round=150)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack(
        [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
        y_part1,
        np.hstack([
            X_test_best_online, X_test_rf, X_test_sgd, X_test_numerical
        ]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d20-e0.07-min6-tree20-extree-150.csv.gz',
        test_id, X_xgb_predict)
    # private LB 0.0047103,
    xgb_clf = xgb_classifier(eta=0.09, min_child_weight=6, depth=18,
                             num_round=1, threads=16, exist_prediction=True,
                             exist_num_round=120)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack(
            [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
        y_part1,
        np.hstack(
            [X_test_online_ensemble, X_test_rf, X_test_sgd, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d18-e0.09-min6-tree1-extree-120.csv.gz',
        test_id, X_xgb_predict)
    # private LB 0.0047000, using ensembled online predictions as meta feature for test sets!
    xgb_clf = xgb_classifier(eta=0.07, min_child_weight=6, depth=20,
                             num_round=150, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack(
            [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
        y_part1,
        np.hstack(
            [X_test_online_ensemble, X_test_rf, X_test_sgd, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d20-e0.07-min6-tree150.csv.gz', test_id,
        X_xgb_predict)
    # private LB 0.0047313, correct!
    xgb_clf = xgb_classifier(eta=0.07, min_child_weight=6, depth=19,
                             num_round=150, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack(
        [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
        y_part1,
        np.hstack([
            X_test_best_online, X_test_rf, X_test_sgd, X_test_numerical
        ]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d19-e0.07-min6-tree150.csv.gz', test_id,
        X_xgb_predict)
    # private LB 0.0047446, correct!
    xgb_clf = xgb_classifier(eta=0.09, min_child_weight=6, depth=18,
                             num_round=120, threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack(
        [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
        y_part1,
        np.hstack([
            X_test_best_online, X_test_rf, X_test_sgd, X_test_numerical
        ]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d18-e0.09-min6-tree120.csv.gz', test_id,
        X_xgb_predict)
# Tail of the xgb5 feature build: remaining per-row std features, dense
# matrix assembly, and the 3-class xgboost fit (xgb5.csv).
# NOTE(review): xx, Xt, X1..X4, X, y, idname, label come from upstream code.
xx.append(np.std(X1.todense(), axis=1))
xx.append(np.std(X2.todense(), axis=1))
xx.append(np.std(X3.todense(), axis=1))
xx.append(np.std(X4.todense(), axis=1))
xx = np.hstack(xx)
# Sparse blocks + aggregates + pre-built Xt2 features, all densified.
Xt = sparse.hstack(
    [Xt, X1, X2, X3, X4, xx, pickle.load(open('../explore/Xt2.p'))],
    format='csr').todense()
train = pd.read_csv('../explore/test1.csv')
idx = train[idname].as_matrix().astype(int)
Xt = np.hstack([Xt, train.drop([label, idname], axis=1).as_matrix()])
print X.shape, y.shape
print Xt.shape  #, y.sha
clf = xgb_classifier(eta=0.1, col=0.2, min_child_weight=1, depth=6,
                     num_round=200)
yp = clf.multi(X, y, Xt, 3)  # 3-class probability predictions
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb5.csv', index=False)
# Assemble the dense train matrix from the sparse blocks, read labels, then
# the parallel test blocks, and fit a 3-class xgboost model (xgb1.csv).
# NOTE(review): X, X1..X3, get_data come from upstream code not visible here.
X4, _ =get_data('../sparse/rebuild5.svm')
X=sparse.hstack([X,X1,X2,X3,X4],format='csr').todense()
train=pd.read_csv('../input/train.csv')
idname='id'
label='fault_severity'
idx=train[idname].as_matrix()
y=np.array(train[label])
import pickle
#X=np.hstack([X,pickle.load(open('count.p'))])
Xt, _ = get_data('../sparse/rebuild1_test.svm')
X1, _ =get_data('../sparse/rebuild2_test.svm')
X2, _ = get_data('../sparse/rebuild3_test.svm')
X3, _ =get_data('../sparse/rebuild4_test.svm')
X4, _ =get_data('../sparse/rebuild5_test.svm')
Xt=sparse.hstack([Xt,X1,X2,X3,X4],format='csr').todense()
train=pd.read_csv('../input/test.csv')
idx=train[idname].as_matrix().astype(int)  # test ids for the submission
print X.shape, y.shape
print Xt.shape#, y.sha
clf=xgb_classifier(eta=0.01,col=0.8,min_child_weight=2,depth=4,num_round=50) #good!
yp=clf.multi(X,y,Xt,3)  # 3-class probability predictions
s=pd.DataFrame({idname:idx,'predict_0':yp[:,0],'predict_1':yp[:,1],'predict_2':yp[:,2]})
s.to_csv('xgb1.csv',index=False)
# Append engineered test features, then keep only columns whose Pearson
# correlation with y exceeds 1e-2 (applied to train and test in lockstep),
# and fit a 3-class xgboost model (xgb6.csv).
# NOTE(review): Xt, X1..X4, xx, X, y, idname, label come from upstream code.
Xt=sparse.hstack([Xt,X1,X2,X3,X4,xx,pickle.load(open('../explore/Xt2.p'))],format='csr').todense()
train=pd.read_csv('../explore/test1.csv')
idx=train[idname].as_matrix().astype(int)
Xt=np.hstack([Xt,train.drop([label,idname],axis=1).as_matrix()])
print X.shape, y.shape
print Xt.shape#, y.sha
from scipy.stats import pearsonr
xx=[]
xt=[]
# Column filter: keep train/test column pairs with |corr(col, y)| > 1e-2.
for i,j in zip(X.T,Xt.T):
    score=pearsonr(np.array(i.T).ravel(),y)[0]
    if np.abs(score)>1e-2:
        xx.append(np.array(i.T).ravel())
        xt.append(np.array(j.T).ravel())
X=np.array(xx).T
Xt=np.array(xt).T
print X.shape, y.shape
print Xt.shape
clf=xgb_classifier(eta=0.1,gamma=1e-3,col=0.3,min_child_weight=0.5,depth=7,num_round=160)
yp=clf.multi(X,y,Xt,3)  # 3-class probability predictions
s=pd.DataFrame({idname:idx,'predict_0':yp[:,0],'predict_1':yp[:,1],'predict_2':yp[:,2]})
s.to_csv('xgb6.csv',index=False)
def pre_processing_meta_part1(data_base_dir, data_meta_part1_dir):
    """Build part-1 meta features for stacking.

    Level-1 models (xgboost on all features, RandomForest on numerical,
    SGD on sparse) are trained on part 2 of the data and predict on part 1
    and on the test set; online-model predictions read from CSV are
    reshaped to (n_samples, 32).  Everything is pickled under
    data_meta_part1_dir.
    """
    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    # Integer '/' on shape implies Python 2 semantics — confirm interpreter.
    y_part2 = y_all[y_all.shape[0] / 2:, :]
    X_all = pickle.load(open(data_base_dir + "X_all.p", "rb"))
    X_test = pickle.load(open(data_base_dir + "X_test_all.p", "rb"))
    X_part1 = X_all[:X_all.shape[0] / 2, :]
    X_part2 = X_all[X_all.shape[0] / 2:, :]
    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))
    X_numerical_part1 = X_numerical[:X_numerical.shape[0] / 2, :]
    X_numerical_part2 = X_numerical[X_numerical.shape[0] / 2:, :]
    X_sparse = pickle.load(open(data_base_dir + "X_sparse.p", "rb"))
    X_test_sparse = pickle.load(open(data_base_dir + "X_test_sparse.p", "rb"))
    X_sparse_part1 = X_sparse[:X_sparse.shape[0] / 2, :]
    X_sparse_part2 = X_sparse[X_sparse.shape[0] / 2:, :]
    X_part1_xgb = []
    X_part1_rf = []
    X_part1_sgd = []
    X_test_xgb = []
    X_test_rf = []
    X_test_sgd = []
    # use pypy to accelerate online model
    X_part1_best_online = np.array(
        pd.read_csv(data_meta_part1_dir + 'part1_online.csv')[['pred']])
    # Flat prediction column folded into 32 predictions per sample.
    X_part1_best_online = X_part1_best_online.reshape(
        (X_part1_best_online.shape[0] / 32, 32))
    X_test_best_online = np.array(
        pd.read_csv(data_meta_part1_dir + 'best_online_test.csv')[['pred']])
    X_test_best_online = X_test_best_online.reshape(
        (X_test_best_online.shape[0] / 32, 32))
    pickle.dump(X_part1_best_online,
                open(data_meta_part1_dir + "X_meta_part1_online.p", "wb"))
    pickle.dump(X_test_best_online,
                open(data_meta_part1_dir + "X_test_meta_online.p", "wb"))
    # xgboost level-1 model: trained on part 2 to predict part 1.
    xgb_clf = xgb_classifier(eta=0.3, min_child_weight=6, depth=100,
                             num_round=20, threads=16,
                             exist_prediction=True, exist_num_round=20)
    X_part1_xgb = xgb_clf.train_predict_all_labels(X_part2, y_part2, X_part1,
                                                   predict_y14=False)
    X_test_xgb = xgb_clf.train_predict_all_labels(
        X_all, y_all, X_test, predict_y14=False
    )  # a little trick to make test data's meta features more accurate
    pickle.dump(X_part1_xgb,
                open(data_meta_part1_dir + "X_meta_part1_xgb.p", "wb"))
    pickle.dump(X_test_xgb,
                open(data_meta_part1_dir + "X_test_meta_xgb_all.p", "wb"))
    # One RF + SGD per label column; label 13 skipped as constant-like.
    for i in range(33):
        predicted = None
        if i == 13:
            print "%d is constant like: " % (
                i), "not included in meta features"
        else:
            print 'train', i
            y = y_part2[:, i]
            rf = RandomForestClassifier(n_estimators=200, n_jobs=16,
                                        min_samples_leaf=10, random_state=1,
                                        bootstrap=False, criterion='entropy',
                                        min_samples_split=5, verbose=1)
            rf.fit(X_numerical_part2, y)
            X_part1_rf.append(rf.predict_proba(X_numerical_part1))
            X_test_rf.append(rf.predict_proba(X_test_numerical))
            y = y_part2[:, i]
            clf = SGDClassifier(loss='log', alpha=0.000001, n_iter=100)
            clf.fit(X_sparse_part2, y)
            X_part1_sgd.append(clf.predict_proba(X_sparse_part1).T[1])
            X_test_sgd.append(clf.predict_proba(X_test_sparse).T[1])
    # Stack per-label columns side by side and persist each feature family.
    X_part1_rf = np.column_stack(X_part1_rf)
    X_test_rf = np.column_stack(X_test_rf)
    pickle.dump(X_part1_rf,
                open(data_meta_part1_dir + "X_meta_part1_rf.p", "wb"))
    pickle.dump(X_test_rf,
                open(data_meta_part1_dir + "X_test_meta_rf.p", "wb"))
    X_part1_sgd = np.column_stack(X_part1_sgd)
    X_test_sgd = np.column_stack(X_test_sgd)
    pickle.dump(X_part1_sgd,
                open(data_meta_part1_dir + "X_meta_part1_sgd.p", "wb"))
    pickle.dump(X_test_sgd,
                open(data_meta_part1_dir + "X_test_meta_sgd.p", "wb"))
# Tail of the xgb6 pipeline: Pearson-correlation column filter applied to
# train and test in lockstep, then the 3-class xgboost fit (xgb6.csv).
# NOTE(review): X, Xt, y, idx, idname come from upstream code not visible here.
print Xt.shape  #, y.sha
from scipy.stats import pearsonr
xx = []
xt = []
# Keep train/test column pairs with |corr(col, y)| > 1e-2.
for i, j in zip(X.T, Xt.T):
    score = pearsonr(np.array(i.T).ravel(), y)[0]
    if np.abs(score) > 1e-2:
        xx.append(np.array(i.T).ravel())
        xt.append(np.array(j.T).ravel())
X = np.array(xx).T
Xt = np.array(xt).T
print X.shape, y.shape
print Xt.shape
clf = xgb_classifier(eta=0.1, gamma=1e-3, col=0.3, min_child_weight=0.5,
                     depth=7, num_round=160)
yp = clf.multi(X, y, Xt, 3)  # 3-class probability predictions
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb6.csv', index=False)
# Assemble dense train matrix (sparse blocks + engineered CSV columns),
# then the parallel test matrix, and fit a 3-class xgboost model (xgb3.csv).
# NOTE(review): X, X1..X4, get_data come from upstream code not visible here.
X=sparse.hstack([X,X1,X2,X3,X4],format='csr').todense()
train=pd.read_csv('../explore/train1.csv')
idname='id'
label='fault_severity'
idx=train[idname].as_matrix()
y=np.array(train[label])
import pickle
# Engineered columns from train1.csv appended to the sparse-derived matrix.
X=np.hstack([X,train.drop([label,idname],axis=1).as_matrix()])
#X=np.hstack([X,pickle.load(open('count.p'))])
Xt, _ = get_data('../sparse/rebuild1_test.svm')
X1, _ =get_data('../sparse/rebuild2_test.svm')
X2, _ = get_data('../sparse/rebuild3_test.svm')
X3, _ =get_data('../sparse/rebuild4_test.svm')
X4, _ =get_data('../sparse/rebuild5_test.svm')
Xt=sparse.hstack([Xt,X1,X2,X3,X4],format='csr').todense()
train=pd.read_csv('../explore/test1.csv')
idx=train[idname].as_matrix().astype(int)  # test ids for the submission
Xt=np.hstack([Xt,train.drop([label,idname],axis=1).as_matrix()])
print X.shape, y.shape
print Xt.shape#, y.sha
clf=xgb_classifier(eta=0.25,col=0.7,min_child_weight=1,depth=6,num_round=70)
yp=clf.multi(X,y,Xt,3)  # 3-class probability predictions
s=pd.DataFrame({idname:idx,'predict_0':yp[:,0],'predict_1':yp[:,1],'predict_2':yp[:,2]})
s.to_csv('xgb3.csv',index=False)