Example #1
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0  # bagging rounds; m=0 disables the xgb bagging loop below, so only the XGBClassifier fit that follows is used

        for j in range(m):
            clf=xgb_classifier(eta=0.05,min_child_weight=20,col=0.5,subsample=0.7,depth=7,num_round=400,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred*(1.0/(j+1))  # running average over the j+1 bagged models so far

            print j,llfun(y_test_cv,yqq)

        #y_pred/=m;
        clf=XGBClassifier(max_depth=10,colsample_bytree=0.8,learning_rate=0.02,n_estimators=500,nthread=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv),eval_metric="logloss",eval_set=[(X_test_cv, y_test_cv)])
        y_pred=clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        break  # evaluate the first fold only

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
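
The helper llfun used above is not defined anywhere on this page. A minimal sketch of a binary log-loss scorer matching its call signature llfun(y_true, y_pred), assuming the predictions are probabilities (the clipping constant is an illustrative choice):

import numpy as np

def llfun(act, pred, eps=1e-15):
    # binary log loss; clip probabilities away from 0 and 1 to avoid log(0)
    pred = np.clip(np.asarray(pred, dtype=float), eps, 1 - eps)
    act = np.asarray(act, dtype=float)
    return -np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred))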
def xgb_meta_predict(data_base_dir,data_meta_random_dir,submission_dir):

    test_id=pickle.load(open(data_base_dir+"test_id.p","rb"))
    y_meta=pickle.load(open(data_meta_random_dir+"y_meta.p","rb"))

    X_numerical_random=pickle.load(open(data_meta_random_dir+"X_numerical_meta.p","rb"))
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))

    X_random_rf=pickle.load(open(data_meta_random_dir+ "X_meta_random_rf.p", "rb" ) )
    X_test_rf=pickle.load(open(data_meta_random_dir+ "X_test_meta_rf.p", "rb" ) )

    X_random_svc=pickle.load(open(data_meta_random_dir+ "X_meta_random_svc.p", "rb" ) )
    X_test_svc=pickle.load(open(data_meta_random_dir+ "X_test_meta_svc.p", "rb" ) )

    # private LB 0.0054101
    xgb_clf=xgb_classifier(eta=0.2,min_child_weight=1,depth=10,num_round=70,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_random_rf,X_random_svc,X_numerical_random]), y_meta,np.hstack([ X_test_rf,X_test_svc,X_test_numerical]),predict_y14=True)
    save_predictions(submission_dir+'xgb-random-d10-e0.2-min1-tree70.csv.gz', test_id , X_xgb_predict)
    
    # private LB 0.0053053
    xgb_clf=xgb_classifier(eta=0.2,min_child_weight=6,depth=12,num_round=80,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_random_rf,X_random_svc,X_numerical_random]), y_meta,np.hstack([X_test_rf,X_test_svc,X_test_numerical]),predict_y14=True)
    save_predictions(submission_dir+'xgb-random-d12-e0.2-min6-tree80.csv.gz', test_id , X_xgb_predict)
    
    # private LB  0.0052910
    xgb_clf=xgb_classifier(eta=0.09,min_child_weight=6,depth=25,num_round=100,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_random_rf,X_random_svc,X_numerical_random]), y_meta,np.hstack([X_test_rf,X_test_svc,X_test_numerical]),predict_y14=True)
    save_predictions(submission_dir+'xgb-random-d25-svc-e0.09-min6-tree100.csv.gz', test_id , X_xgb_predict)
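
xgb_classifier is the author's thin wrapper around the low-level xgboost API and is not shown on this page. A minimal sketch of what the binary train_predict path could look like, assuming eta/depth/col/subsample map onto the standard xgboost parameters eta, max_depth, colsample_bytree and subsample; the real class also implements multi, train_predict_all_labels, train_predict_label and the exist_prediction options used below:

import xgboost as xgb

class xgb_classifier:
    def __init__(self, eta=0.1, depth=6, num_round=100, min_child_weight=1,
                 col=1.0, subsample=1.0, gamma=0.0, seed=0, threads=8):
        self.num_round = num_round
        self.params = {'objective': 'binary:logistic', 'eval_metric': 'logloss',
                       'eta': eta, 'max_depth': depth,
                       'min_child_weight': min_child_weight,
                       'colsample_bytree': col, 'subsample': subsample,
                       'gamma': gamma, 'seed': seed, 'nthread': threads}

    def train_predict(self, X, y, Xt, y_test=None):
        # train on (X, y), optionally watch log loss on (Xt, y_test), return P(y=1) for Xt
        dtrain = xgb.DMatrix(X, label=y)
        watchlist = [(dtrain, 'train')]
        if y_test is not None and len(y_test) > 0:
            watchlist.append((xgb.DMatrix(Xt, label=y_test), 'eval'))
        bst = xgb.train(self.params, dtrain, self.num_round, watchlist)
        return bst.predict(xgb.DMatrix(Xt))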
Example #3
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0  # m=0 skips the xgb bagging loop; the ExtraTreesClassifier below is used instead
         
        for j in range(m):
            clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=200,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
        #y_pred/=m;
        clf=ExtraTreesClassifier(n_estimators=700,max_features= 50,criterion= 'entropy',min_samples_split= 3,
                            max_depth= 60, min_samples_leaf= 4,verbose=1,n_jobs=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv))
        y_pred=clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        break  # evaluate the first fold only

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
Example #4
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=1
         
        for j in range(m):
            clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.7,subsample=1,depth=10,num_round=50,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
        y_pred/=m
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        #ypred=y_pred
        #yreal=y_test_cv
        #idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        #break

    print xx,'average:',np.mean(xx),'std',np.std(xx)
Example #5
def kfold_cv(X_train, y_train, k):

    kf = StratifiedKFold(y_train, n_folds=k)

    xx = []
    zz = []
    ypred = np.zeros((y_train.shape[0], 3))
    for train_index, test_index in kf:

        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        #clf=RandomForestClassifier(n_jobs=-1,max_depth=21,max_features=30,n_estimators=100)
        #clf.fit(X_train_cv,y_train_cv)
        #y_pred=clf.predict_proba(X_test_cv)
        clf = xgb_classifier(eta=0.25,
                             col=0.1,
                             min_child_weight=1,
                             depth=6,
                             num_round=70)
        y_pred = clf.multi(X_train_cv,
                           y_train_cv,
                           X_test_cv,
                           3,
                           y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv, y_pred))
        print xx[-1]  #,y_pred.shape,zz[-1]
        ypred[test_index] = y_pred
    print xx
    print 'average:', np.mean(xx), 'std', np.std(xx)
    return ypred, np.mean(xx)
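
multiclass_log_loss is another assumed utility from the author's codebase. A minimal sketch that scores an (n_samples, n_classes) probability matrix, as returned by clf.multi, against integer class labels:

import numpy as np

def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    # y_pred: (n_samples, n_classes) predicted probabilities; y_true: integer labels
    p = np.clip(y_pred, eps, 1 - eps)
    p = p / p.sum(axis=1)[:, np.newaxis]  # renormalise each row
    rows = np.arange(p.shape[0])
    return -np.mean(np.log(p[rows, np.asarray(y_true, dtype=int)]))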
Example #6
def kfold_cv(X_train, y_train, k):

    kf = StratifiedKFold(y_train, n_folds=k)

    xx = []
    zz = []
    ypred = np.zeros((y_train.shape[0], 3))
    for train_index, test_index in kf:

        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        clf = xgb_classifier(eta=0.1,
                             gamma=1e-3,
                             col=0.3,
                             min_child_weight=0.5,
                             depth=7,
                             num_round=160)
        y_pred = clf.multi(X_train_cv,
                           y_train_cv,
                           X_test_cv,
                           3,
                           y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv, y_pred))
        print xx[-1]  #,y_pred.shape,zz[-1]
        ypred[test_index] = y_pred
    print xx
    print 'average:', np.mean(xx), 'std', np.std(xx)
    return ypred, np.mean(xx)
Example #7
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    ypred=np.zeros(X_train.shape[0])
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=1
         
        for j in range(m):
            clf=xgb_classifier(eta=0.01,min_child_weight=10,col=0.7,subsample=0.68,depth=5,num_round=500,seed=j*77,gamma=0)

            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred/(1+j)
            print j,llfun(y_test_cv,yqq)
        y_pred/=m
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        #clf.fit(X_train_cv,(y_train_cv))
        #y_pred=clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred[test_index]=y_pred
        print xx[-1]#,y_pred.shape

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred
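
All of these snippets use the pre-0.18 scikit-learn API, where StratifiedKFold(y_train, n_folds=k) is constructed from the labels and iterated directly. On current scikit-learn the same out-of-fold loop would look roughly like the sketch below (LogisticRegression is only a stand-in for xgb_classifier):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

def kfold_cv(X_train, y_train, k):
    kf = StratifiedKFold(n_splits=k)  # modern replacement for n_folds=k
    ypred = np.zeros(y_train.shape[0])
    for train_index, test_index in kf.split(X_train, y_train):
        clf = LogisticRegression()  # stand-in for xgb_classifier
        clf.fit(X_train[train_index], y_train[train_index])
        ypred[test_index] = clf.predict_proba(X_train[test_index])[:, 1]
    return ypred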
Example #8
def xgb_meta_predict(data_base_dir,data_meta_part1_dir,submission_dir):
    test_id=pickle.load(open(data_base_dir+"test_id.p","rb"))
    y_all=pickle.load(open(data_base_dir+"y.p","rb"))
    y_part1=y_all[:y_all.shape[0]/2,:]
    
    X_numerical=pickle.load(open(data_base_dir+"X_numerical.p","rb"))
    X_numerical_part1=X_numerical[:X_numerical.shape[0]/2,:]
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    
    X_part1_xgb=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_xgb.p", "rb" ) )
    X_test_xgb =pickle.load(open(data_meta_part1_dir+ "X_test_meta_xgb_all.p", "rb" ) )
    
    X_part1_rf=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_rf.p", "rb" ) )
    X_test_rf=pickle.load(open(data_meta_part1_dir+ "X_test_meta_rf.p", "rb" ) )
    
    X_part1_sgd=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_sgd.p", "rb" ) )
    X_test_sgd=pickle.load(open(data_meta_part1_dir+ "X_test_meta_sgd.p", "rb" ) )
    
    X_part1_best_online=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_online.p", "rb" ) )
    X_test_best_online=pickle.load(open(data_meta_part1_dir+ "X_test_meta_online.p", "rb" ) )
    X_test_online_ensemble=pickle.load(open(data_meta_part1_dir+ "X_test_meta_online_ensemble.p", "rb" ) )
    
    
    # best single model submitted, private LB 0.0044595, X_test_meta 
    xgb_clf=xgb_classifier(eta=0.09,min_child_weight=6,depth=18,num_round=120,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_part1_xgb,X_numerical_part1]), y_part1,np.hstack([X_test_online_ensemble, X_test_rf,X_test_sgd,X_test_xgb,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d18-e0.09-min6-tree120-xgb_base.csv.gz', test_id , X_xgb_predict)
    
    # best single model overall, private LB 0.0044591, not submitted by itself
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=20,num_round=150,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_part1_xgb,X_numerical_part1]), y_part1,np.hstack([X_test_online_ensemble, X_test_rf,X_test_sgd,X_test_xgb,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d20-e0.07-min6-tree150-xgb_base.csv.gz', test_id , X_xgb_predict)
    
    # private LB 0.0047360 correct! try "boosting from existing predictions"
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=20,num_round=20,threads=16,exist_prediction=True,exist_num_round=150) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_best_online, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d20-e0.07-min6-tree20-extree-150.csv.gz', test_id , X_xgb_predict)
    
    # private LB 0.0047103, 
    xgb_clf=xgb_classifier(eta=0.09,min_child_weight=6,depth=18,num_round=1,threads=16,exist_prediction=True,exist_num_round=120) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_online_ensemble, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d18-e0.09-min6-tree1-extree-120.csv.gz', test_id , X_xgb_predict)
    
    # private LB 0.0047000, using ensembled online predictions as meta feature for test sets!
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=20,num_round=150,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_online_ensemble, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d20-e0.07-min6-tree150.csv.gz', test_id , X_xgb_predict)
    
    # private LB 0.0047313, correct!
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=19,num_round=150,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_best_online, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d19-e0.07-min6-tree150.csv.gz', test_id , X_xgb_predict)
    
    # private LB 0.0047446, correct!
    xgb_clf=xgb_classifier(eta=0.09,min_child_weight=6,depth=18,num_round=120,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_numerical_part1]), y_part1,np.hstack([X_test_best_online, X_test_rf,X_test_sgd,X_test_numerical]),predict_y14=True)
    #save_predictions(submission_dir+'xgb-part1-d18-e0.09-min6-tree120.csv.gz', test_id , X_xgb_predict)
Example #9
def train_predict(X, y, Xt, yt=[], c=1):
    if c == 1:
        #clf=xgb_classifier(num_round=45,eta=0.1,min_child_weight=5,depth=10, subsample=0.5,col=1)
        clf = xgb_classifier(num_round=45,
                             eta=0.1,
                             min_child_weight=20,
                             depth=20,
                             subsample=0.1,
                             col=0.7)
        return clf.train_predict(X, y, Xt, yt)
Example #10
def xgb_meta_predict(data_base_dir,data_meta_part1_dir,submission_dir):
    test_id=pickle.load(open(data_base_dir+"test_id.p","rb"))
    y_all=pickle.load(open(data_base_dir+"y.p","rb"))
    X_all=pickle.load(open(data_base_dir+"X_all.p","rb"))
    X_test=pickle.load(open(data_base_dir+"X_test_all.p","rb"))
    y_part1=y_all[:y_all.shape[0]/2,:]
    
    
    xgb_clf=xgb_classifier(eta=0.07,min_child_weight=6,depth=20,num_round=150,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_all_labels(X_all, y_all,X_test,predict_y14=True)
    save_predictions(submission_dir+'xgb-raw-d20-e0.07-min6-tree150.csv.gz', test_id , X_xgb_predict)
    
    xgb_clf=xgb_classifier(eta=0.1,min_child_weight=7,depth=100,num_round=150,threads=16) 
    X_xgb_predict = xgb_clf.train_predict_label(X_all, y_all,X_test,label=33) # predict label 33 only
    save_predictions(submission_dir+'xgb-y33-d100-e0.1-min7-tree150.csv.gz', test_id , X_xgb_predict)
    
    
    X_part1_best_online=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_online.p", "rb" ) )
    X_test_best_online=pickle.load(open(data_meta_part1_dir+ "X_test_meta_online.p", "rb" ) )
    
    X_numerical=pickle.load(open(data_base_dir+"X_numerical.p","rb"))
    X_numerical_part1=X_numerical[:X_numerical.shape[0]/2,:]
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    
    X_part1_xgb=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_xgb.p", "rb" ) )
    X_test_xgb =pickle.load(open(data_meta_part1_dir+ "X_test_meta_xgb_all.p", "rb" ) )
    
    X_part1_rf=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_rf.p", "rb" ) )
    X_test_rf=pickle.load(open(data_meta_part1_dir+ "X_test_meta_rf.p", "rb" ) )
    
    X_part1_sgd=pickle.load(open(data_meta_part1_dir+ "X_meta_part1_sgd.p", "rb" ) )
    X_test_sgd=pickle.load(open(data_meta_part1_dir+ "X_test_meta_sgd.p", "rb" ) )
    
    X_sparse=pickle.load(open(data_base_dir+"X_sparse.p","rb"))
    X_test_sparse=pickle.load(open(data_base_dir+"X_test_sparse.p","rb"))
    X_sparse_part1=X_sparse[:X_sparse.shape[0]/2,:]
    
    X=sparse.csr_matrix(sparse.hstack((X_sparse_part1,sparse.coo_matrix(np.hstack([X_part1_best_online,X_part1_rf,X_part1_sgd,X_part1_xgb,X_numerical_part1]).astype(float)))))
    Xt=sparse.csr_matrix(sparse.hstack((X_test_sparse,sparse.coo_matrix(np.hstack([X_test_best_online,X_test_rf,X_test_sgd,X_test_xgb,X_test_numerical]).astype(float)))))
    xgb_clf=xgb_classifier(eta=0.1,min_child_weight=6,depth=30,num_round=80,threads=16)
    X_xgb_predict = xgb_clf.train_predict_label(X, y_part1,Xt,label=33) # predict label 33 only
    save_predictions(submission_dir+'xgb-y33-d30-e0.1-min6-tree80-all-sparse.csv.gz', test_id , X_xgb_predict)
def train_predict(X,y,Xt,yt=[],c='xgb'):  # default must be a string to match the comparisons below
    if c=='xgb':
        clf=xgb_classifier(num_round=200,eta=0.1,min_child_weight=2,depth=20, subsample=1,col=0.6)
        return clf.train_predict(X,y,Xt,yt)
    if c=='rf':
        clf=RandomForestClassifier(n_estimators=200,n_jobs=-1,max_depth=13,min_samples_split=4,min_samples_leaf=9, max_leaf_nodes= 1100)
        clf.fit(X,y)
        return clf.predict_proba(Xt).T[1]    
    if c=='rf1':
        clf=RandomForestClassifier(n_estimators=1000,n_jobs=-1)
        clf.fit(X,y)
        return clf.predict_proba(Xt).T[1]
def train_predict(X,y,Xt,yt=[],c=1):
    if c==1:
        clf=xgb_classifier(num_round=60,eta=0.1,min_child_weight=5,depth=7, subsample=1,col=1)
        return clf.train_predict(X,y,Xt,yt)
    if c==2:
        clf=RandomForestRegressor(n_estimators=200,n_jobs=-1,max_depth=13,min_samples_split=4,min_samples_leaf=9, max_leaf_nodes= 1100)
        clf.fit(X,y)
        return clf.predict(Xt)    
    if c==3:
        clf=RankSVM()
        clf.fit(X,y)
        return clf.predict(Xt)
Example #13
def train_predict(X,y,Xt,yt=[],c=1):
    if c==1:
        #clf=xgb_classifier(num_round=45,eta=0.1,min_child_weight=5,depth=10, subsample=0.5,col=1)
        clf=xgb_classifier(num_round=45,eta=0.1,min_child_weight=20,depth=20, subsample=0.1,col=0.7)
        #clf=xgb_classifier(num_round=300,eta=0.01,min_child_weight=20,depth=8, subsample=0.1,col=0.7)
        return clf.train_predict(X,y,Xt,yt)
    elif c==2:
        clf = LDA()
        clf.fit(X,y)
        preds = clf.predict_proba(Xt)[:,1]
        return preds
    elif c==3:
        clf = LogisticRegression()
        clf.fit(X,y)
        preds = clf.predict_proba(Xt)[:,1]
        return preds
def train_predict(X,y,Xt,yt=[],c=1):
    if c==1:
        #clf=xgb_classifier(num_round=45,eta=0.1,min_child_weight=5,depth=10, subsample=0.5,col=1)
        #clf=xgb_classifier(num_round=55,eta=0.1,min_child_weight=20,depth=20, subsample=0.1,col=0.7)
        clf=xgb_classifier(num_round=500,eta=0.01,min_child_weight=20,depth=10, subsample=0.1,col=0.7)
        #clf=xgb_classifier(num_round=500,eta=0.01,min_child_weight=20,depth=10, subsample=0.1,col=0.7) # First digit touch - 0.966262479533 #BothStartLoadPhase-0.969428966329
        #clf=xgb_classifier(num_round=500,eta=0.01,min_child_weight=20,depth=10, subsample=0.1,col=0.7)  # HandStart - 0.930538668081
        return clf.train_predict(X,y,Xt,yt)
    elif c==2:
        clf = LDA()
        clf.fit(X,y)
        preds = clf.predict_proba(Xt)[:,1]
        return preds
    elif c==3:
        clf = LogisticRegression()
        clf.fit(X,y)
        preds = clf.predict_proba(Xt)[:,1]
        return preds
Example #16
def kfold_cv(X_train, y_train, k):

    kf = StratifiedKFold(y_train, n_folds=k)

    xx = []
    zz = []
    ypred = np.zeros((y_train.shape[0], 3))
    for train_index, test_index in kf:

        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        clf = xgb_classifier(eta=0.1, col=0.4, min_child_weight=10, depth=6, num_round=50)  # good!
        y_pred = clf.multi(X_train_cv, y_train_cv, X_test_cv, 3, y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv, y_pred))
        print xx[-1]  # ,y_pred.shape,zz[-1]
        ypred[test_index] = y_pred
    print "average:", np.mean(xx), "std", np.std(xx)
    return ypred, np.mean(xx)
Example #17
def kfold_cv(X_train, y_train,k):


    kf = StratifiedKFold(y_train,n_folds=k)

    xx=[]
    zz=[]
    ypred=np.zeros((y_train.shape[0],3))
    for train_index, test_index in kf:

        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        clf=xgb_classifier(eta=0.1,gamma=1e-3,col=0.35,min_child_weight=0.5,depth=7,num_round=160)
        y_pred=clf.multi(X_train_cv,y_train_cv,X_test_cv,3,y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv,y_pred))
        print xx[-1]#,y_pred.shape,zz[-1]
        ypred[test_index]=y_pred
    print xx
    print 'average:',np.mean(xx),'std',np.std(xx)
    return ypred,np.mean(xx)
Example #18
def train_predict(X, y, Xt, yt=[], c=1):
    if c == 1:
        #clf=xgb_classifier(num_round=45,eta=0.1,min_child_weight=5,depth=10, subsample=0.5,col=1)
        #clf=xgb_classifier(num_round=55,eta=0.1,min_child_weight=20,depth=20, subsample=0.1,col=0.7)
        clf = xgb_classifier(num_round=500,
                             eta=0.01,
                             min_child_weight=20,
                             depth=10,
                             subsample=0.1,
                             col=0.7)
        #clf=xgb_classifier(num_round=500,eta=0.01,min_child_weight=20,depth=10, subsample=0.1,col=0.7) # First digit touch - 0.966262479533 #BothStartLoadPhase-0.969428966329
        #clf=xgb_classifier(num_round=500,eta=0.01,min_child_weight=20,depth=10, subsample=0.1,col=0.7)  # HandStart - 0.930538668081
        return clf.train_predict(X, y, Xt, yt)
    elif c == 2:
        clf = LDA()
        clf.fit(X, y)
        preds = clf.predict_proba(Xt)[:, 1]
        return preds
    elif c == 3:
        clf = LogisticRegression()
        clf.fit(X, y)
        preds = clf.predict_proba(Xt)[:, 1]
        return preds
Example #19
def kfold_cv(X_train, y_train,k):


    kf = StratifiedKFold(y_train,n_folds=k)

    xx=[]
    zz=[]
    ypred=np.zeros((y_train.shape[0],3))
    for train_index, test_index in kf:

        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        #clf=RandomForestClassifier(n_jobs=-1,max_depth=21,max_features=30,n_estimators=100)
        #clf.fit(X_train_cv,y_train_cv)
        #y_pred=clf.predict_proba(X_test_cv)
        clf=xgb_classifier(eta=0.25,col=0.4,min_child_weight=1,depth=6,num_round=70)
        y_pred=clf.multi(X_train_cv,y_train_cv,X_test_cv,3,y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv,y_pred))
        print xx[-1]#,y_pred.shape,zz[-1]
        ypred[test_index]=y_pred
    print xx
    print 'average:',np.mean(xx),'std',np.std(xx)
    return ypred,np.mean(xx)
Example #20
X=np.vstack([X.T,pd.read_csv('../bench/h2ocv.csv',index_col=idname)['PredictedProb'].values]).T
X=np.vstack([X.T,pd.read_csv('../bench/gene1cv.csv',index_col=idname)['PredictedProb'].values]).T

labelname='PredictedProb'
Xt=np.vstack([Xt.T,pd.read_csv('../tian/tian_sub.csv',index_col=idname)[labelname].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../cv/xgb4.csv',index_col=idname)[labelname].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../cv/ridge1.csv',index_col=idname)[labelname].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../cv/xgb11.csv',index_col=idname)[labelname].values]).T

#Xt=np.vstack([Xt.T,pd.read_csv('../cv/ex1.csv',index_col=idname)[labelname].values]).T
#Xt=np.vstack([Xt.T,pd.read_csv('../bench/bsub1.csv',index_col=idname).as_matrix().ravel()]).T
Xt=np.vstack([Xt.T,pd.read_csv('../srv3/sub/ridge1.csv',index_col=idname)[labelname].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../dahei/sub/ftrl2sub.csv',index_col=idname)['target'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/xgb1.csv',index_col=idname)['PredictedProb'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/tree1.csv',index_col=idname)['PredictedProb'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/ex5.csv',index_col=idname)['PredictedProb'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/h2o.csv',index_col=idname)['PredictedProb'].values]).T
Xt=np.vstack([Xt.T,pd.read_csv('../bench/gene1.csv',index_col=idname)['PredictedProb'].values]).T
print X.shape,y.shape,Xt.shape
X=sparse.hstack([X,X_sparse],format='csr')#.toarray()
Xt=sparse.hstack([Xt,Xt_sparse],format='csr')
print X.shape,y.shape,Xt.shape
bad=[8,114]
xx=[i for i in range(X.shape[1]) if i not in bad]
assert(X.shape[1]==Xt.shape[1])
clf=xgb_classifier(eta=0.01,min_child_weight=2,col=0.7,subsample=0.68,depth=5,num_round=500,seed=0,gamma=0)
yp=clf.train_predict(X[:,xx],y,Xt[:,xx])
s=pd.DataFrame({idname:idx,'PredictedProb':yp})
s.to_csv('stack21.csv',index=False)
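
The repeated np.vstack([X.T, ...]).T idiom above appends one base model's predictions as a new feature column for the stacker. The same step can be written more directly with np.column_stack; a sketch, with the file list purely illustrative:

import numpy as np
import pandas as pd

def add_prediction_columns(X, files, idname, labelname='PredictedProb'):
    # append each base model's predictions (read from CSV) as a feature column
    cols = [pd.read_csv(f, index_col=idname)[labelname].values for f in files]
    return np.column_stack([X] + cols)

# X = add_prediction_columns(X, ['../cv/xgb4.csv', '../cv/ridge1.csv'], idname)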

def pre_processing_meta_random(data_base_dir,data_meta_random_dir):
    xgb_clf=xgb_classifier(eta=0.3,min_child_weight=6,depth=100,num_round=20,threads=16,exist_prediction=True,exist_num_round=20)

    y_all=pickle.load(open(data_base_dir+"y.p","rb"))

    X_numerical=pickle.load(open(data_base_dir+"X_numerical.p","rb"))
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    X_sparse=pickle.load(open(data_base_dir+"X_sparse.p","rb"))
    X_test_sparse=pickle.load(open(data_base_dir+"X_test_sparse.p","rb"))

    X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = train_test_split(
        X_numerical, 
        X_sparse, 
        y_all,
        test_size = 0.5
    )

    X_meta_rf=[]
    X_meta_svc=[]

    X_test_rf=[]
    X_test_svc=[]

    for i in range(33):
        
        predicted = None
     
        if i==13:
            print "%d is constant like: " % (i),"not included in meta features"
        else:
            print 'train',i
                
            y = y_base[:, i]
            rf = RandomForestClassifier(n_estimators = 150, n_jobs = 16)
            rf.fit(X_numerical_base, y)
            X_meta_rf.append(rf.predict_proba(X_numerical_meta))
            X_test_rf.append(rf.predict_proba(X_test_numerical))
    
            y = y_base[:, i]
            svm = LinearSVC()
            svm.fit(X_sparse_base, y)            
            X_meta_svc.append(svm.decision_function(X_sparse_meta))
            X_test_svc.append(svm.decision_function(X_test_sparse))

    X_meta_rf = np.column_stack(X_meta_rf)
    X_test_rf = np.column_stack(X_test_rf)
    pickle.dump( X_meta_rf, open(data_meta_random_dir+ "X_meta_random_rf.p", "wb" ) )
    pickle.dump( X_test_rf, open(data_meta_random_dir+ "X_test_meta_rf.p", "wb" ) )

    X_meta_svc = np.column_stack(X_meta_svc)
    X_test_svc= np.column_stack(X_test_svc)
    pickle.dump( X_meta_svc, open(data_meta_random_dir+ "X_meta_random_svc.p", "wb" ) )
    pickle.dump( X_test_svc, open(data_meta_random_dir+ "X_test_meta_svc.p", "wb" ) )

    pickle.dump( y_meta, open(data_meta_random_dir+ "y_meta.p", "wb" ) )
    pickle.dump( y_base, open(data_meta_random_dir+ "y_base.p", "wb" ) )
    pickle.dump( X_numerical_meta, open(data_meta_random_dir+ "X_numerical_meta.p", "wb" ) )
Example #22
names_categorical = []
for name in train.columns.values :
    if train[name].value_counts().shape[0]<200:
        train[name] = map(str, train[name])
        names_categorical.append(name)
        print name,train[name].value_counts().shape[0] 
X_sparse = vec.fit_transform(train[names_categorical].T.to_dict().values())

idx=np.array(train.index)
del train
gc.collect()
X=sparse.hstack([X,X_sparse],format='csr')#.toarray()

train=pd.read_csv('test_clean1.csv',index_col=idname)
train.drop(bad,inplace=True,axis=1)
Xt=train.as_matrix()
Xt_sparse = vec.transform(train[names_categorical].T.to_dict().values())
idx=np.array(train.index)
Xt=sparse.hstack([Xt,Xt_sparse],format='csr')
print X.shape,y.shape,Xt.shape

yp=np.zeros(Xt.shape[0])
m=10
for j in range(m):
    clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=200,seed=j*77,gamma=0.1)
    yp+=clf.train_predict(X,y,Xt)
yp/=m
s=pd.DataFrame({idname:idx,'PredictedProb':yp})
s.to_csv('xgb3.csv',index=False)

Example #23
test = pd.read_csv('../../../input/patients_test.csv', index_col='patient_id')
idx = np.array(test.index)
del test
gc.collect()
print X.shape, y.shape, Xt.shape

from xgb_classifier import xgb_classifier

eta = 0.1
myname = sys.argv[0]
for seed in [0]:  #[i*777 for i in range(1,10)]:
    for depth in [10]:
        for child in [2]:
            for col in [0.4]:
                for sub in [1]:
                    for num in [2000]:
                        clf = xgb_classifier(eta=eta,
                                             min_child_weight=child,
                                             depth=depth,
                                             num_round=num,
                                             col=col,
                                             subsample=sub,
                                             seed=seed)
                        ypred = clf.train_predict(X, y, Xt)
                        s = pd.DataFrame({
                            'patient_id': idx,
                            'predict_screener': ypred
                        })
                        s.to_csv('rxgb5.csv', index=False)
                        #s.to_csv('va_result/%s_eta_%f_depth_%d_child_%d_col_%f_sub_%f_num_%d_seed_%d_score_%f'% (myname,eta,depth,child,col,sub,num,seed,score),index=False)
Example #24
from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file
mem = Memory("./mycache")

@mem.cache
def get_data(path):
    data = load_svmlight_file(path)
    return data[0], data[1]
idname='ID'
labelname='target'
train=pd.read_csv('train_clean1.csv',index_col=idname)

y=np.array(train[labelname]).astype(float)
train.drop([labelname],inplace=True,axis=1)
X=train.as_matrix()
del train


train=pd.read_csv('test_clean1.csv',index_col=idname)

Xt=train.as_matrix()

idx=np.array(train.index)
del train

clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.7,subsample=1,depth=10,num_round=50,seed=0,gamma=0.1)
yp=clf.train_predict(X,y,Xt)
s=pd.DataFrame({idname:idx,'PredictedProb':yp})
s.to_csv('xgb1.csv',index=False)
Example #25
def xgb_meta_predict(data_base_dir, data_meta_part2_dir, submission_dir):
    test_id = pickle.load(open(data_base_dir + "test_id.p", "rb"))
    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    y_part2 = y_all[y_all.shape[0] / 2:, :]

    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_numerical_part2 = X_numerical[X_numerical.shape[0] / 2:, :]
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))

    X_part2_rf = pickle.load(
        open(data_meta_part2_dir + "X_meta_part2_rf.p", "rb"))
    X_test_rf = pickle.load(
        open(data_meta_part2_dir + "X_test_meta_rf.p", "rb"))

    X_part2_svc = pickle.load(
        open(data_meta_part2_dir + "X_meta_part2_svc.p", "rb"))
    X_test_svc = pickle.load(
        open(data_meta_part2_dir + "X_test_meta_svc.p", "rb"))

    X_part2_sgd = pickle.load(
        open(data_meta_part2_dir + "X_meta_part2_sgd.p", "rb"))
    X_test_sgd = pickle.load(
        open(data_meta_part2_dir + "X_test_meta_sgd.p", "rb"))

    X_part2_best_online = pickle.load(
        open(data_meta_part2_dir + "X_meta_part2_online.p", "rb"))
    X_test_best_online = pickle.load(
        open(data_meta_part2_dir + "X_test_meta_online.p", "rb"))

    # private LB 0.0048854
    xgb_clf = xgb_classifier(eta=0.09,
                             min_child_weight=6,
                             depth=18,
                             num_round=120,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_part2_best_online, X_part2_rf, X_numerical_part2]),
        y_part2,
        np.hstack([X_test_best_online, X_test_rf, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part2-d18-e0.09-min6-tree120.csv.gz', test_id,
        X_xgb_predict)

    # private LB 0.0048763
    xgb_clf = xgb_classifier(eta=0.07,
                             min_child_weight=6,
                             depth=20,
                             num_round=150,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_part2_best_online, X_part2_rf, X_numerical_part2]),
        y_part2,
        np.hstack([X_test_best_online, X_test_rf, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part2-d20-e0.07-min6-tree150.csv.gz', test_id,
        X_xgb_predict)

    # private LB  0.0048978
    xgb_clf = xgb_classifier(eta=0.09,
                             min_child_weight=6,
                             depth=18,
                             num_round=100,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack(
        [X_part2_best_online, X_part2_rf, X_part2_svc, X_numerical_part2]),
                                                     y_part2,
                                                     np.hstack([
                                                         X_test_best_online,
                                                         X_test_rf, X_test_svc,
                                                         X_test_numerical
                                                     ]),
                                                     predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part2-d18-svc-e0.09-min6-tree100.csv.gz',
        test_id, X_xgb_predict)

    # private LB  0.0050270
    xgb_clf = xgb_classifier(eta=0.1,
                             min_child_weight=6,
                             depth=20,
                             num_round=110,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_part2_best_online, X_part2_rf, X_part2_svc, X_part2_sgd]),
        y_part2,
        np.hstack([X_test_best_online, X_test_rf, X_test_svc, X_test_sgd]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part2-d20-e0.1-min6-tree110-metaonly.csv.gz',
        test_id, X_xgb_predict)
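
save_predictions is also part of the author's utilities and its exact submission layout is not shown on this page. A purely illustrative sketch that writes the prediction matrix alongside the test ids as a gzipped CSV (the column naming is an assumption):

import pandas as pd

def save_predictions(path, test_id, X_predict):
    # hypothetical layout: one row per test id, one column per predicted label
    df = pd.DataFrame(X_predict, columns=['y%d' % (i + 1) for i in range(X_predict.shape[1])])
    df.insert(0, 'id', test_id)
    df.to_csv(path, index=False, compression='gzip')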
def pre_processing_meta_part1(data_base_dir,data_meta_part1_dir):

    y_all=pickle.load(open(data_base_dir+"y.p","rb"))
    y_part2=y_all[y_all.shape[0]/2:,:]

    X_all=pickle.load(open(data_base_dir+"X_all.p","rb"))
    X_test=pickle.load(open(data_base_dir+"X_test_all.p","rb"))
    X_part1=X_all[:X_all.shape[0]/2,:]
    X_part2=X_all[X_all.shape[0]/2:,:]

    X_numerical=pickle.load(open(data_base_dir+"X_numerical.p","rb"))
    X_test_numerical=pickle.load(open(data_base_dir+"X_test_numerical.p","rb"))
    X_numerical_part1=X_numerical[:X_numerical.shape[0]/2,:]
    X_numerical_part2=X_numerical[X_numerical.shape[0]/2:,:]

    X_sparse=pickle.load(open(data_base_dir+"X_sparse.p","rb"))
    X_test_sparse=pickle.load(open(data_base_dir+"X_test_sparse.p","rb"))
    X_sparse_part1=X_sparse[:X_sparse.shape[0]/2,:]
    X_sparse_part2=X_sparse[X_sparse.shape[0]/2:,:]


    X_part1_xgb=[]
    X_part1_rf=[]
    X_part1_sgd=[]

    X_test_xgb=[]
    X_test_rf=[]
    X_test_sgd=[]

    # use pypy to accelerate online model
    X_part1_best_online=np.array(pd.read_csv(data_meta_part1_dir+'part1_online.csv')[['pred']])
    X_part1_best_online=X_part1_best_online.reshape((X_part1_best_online.shape[0]/32,32))
    X_test_best_online=np.array(pd.read_csv(data_meta_part1_dir+'best_online_test.csv')[['pred']])
    X_test_best_online=X_test_best_online.reshape((X_test_best_online.shape[0]/32,32))
    pickle.dump( X_part1_best_online, open(data_meta_part1_dir+ "X_meta_part1_online.p", "wb" ) )
    pickle.dump( X_test_best_online, open(data_meta_part1_dir+ "X_test_meta_online.p", "wb" ) )
    
    
    xgb_clf=xgb_classifier(eta=0.3,min_child_weight=6,depth=100,num_round=20,threads=16,exist_prediction=True,exist_num_round=20)
    X_part1_xgb = xgb_clf.train_predict_all_labels(X_part2, y_part2,X_part1,predict_y14=False)
    X_test_xgb = xgb_clf.train_predict_all_labels(X_all, y_all,X_test,predict_y14=False) # a little trick to make test data's meta features more accurate
    
    pickle.dump( X_part1_xgb, open(data_meta_part1_dir+ "X_meta_part1_xgb.p", "wb" ) )
    pickle.dump( X_test_xgb, open(data_meta_part1_dir+ "X_test_meta_xgb_all.p", "wb" ) )
   
    
    
    for i in range(33) :
        
        predicted = None
     
        if i==13:
            print "%d is constant like: " % (i),"not included in meta features"
        else:
            print 'train',i
                
            y = y_part2[:, i]
            rf = RandomForestClassifier(n_estimators=200, n_jobs=16, min_samples_leaf = 10,random_state=1,bootstrap=False,criterion='entropy',min_samples_split=5,verbose=1)
            rf.fit(X_numerical_part2, y)
            X_part1_rf.append(rf.predict_proba(X_numerical_part1))
            X_test_rf.append(rf.predict_proba(X_test_numerical))
    
            y = y_part2[:, i]
            clf=SGDClassifier(loss='log',alpha=0.000001,n_iter=100)
            clf.fit(X_sparse_part2,y)
            X_part1_sgd.append(clf.predict_proba(X_sparse_part1).T[1])
            X_test_sgd.append(clf.predict_proba(X_test_sparse).T[1])

    X_part1_rf = np.column_stack(X_part1_rf)
    X_test_rf = np.column_stack(X_test_rf)
    pickle.dump( X_part1_rf, open(data_meta_part1_dir+ "X_meta_part1_rf.p", "wb" ) )
    pickle.dump( X_test_rf, open(data_meta_part1_dir+ "X_test_meta_rf.p", "wb" ) )

    X_part1_sgd = np.column_stack(X_part1_sgd)
    X_test_sgd= np.column_stack(X_test_sgd)
    pickle.dump( X_part1_sgd, open(data_meta_part1_dir+ "X_meta_part1_sgd.p", "wb" ) )
    pickle.dump( X_test_sgd, open(data_meta_part1_dir+ "X_test_meta_sgd.p", "wb" ) )
Example #27
    df = pd.read_csv(path)
    df = df[df.FeedBackEvent != 0]
    df = df.drop('FeedBackEvent', axis = 1)
    if i == 0:
        test = np.array(df)
    else:
        test = np.vstack((test, np.array(df)))
"""
import pickle
pickle.dump(train,open("train.p","wb"))
pickle.dump(test,open("test.p","wb"))
pickle.dump(labels.Prediction.values,open("label.p","wb"))

import pickle
train=pickle.load(open("train.p","rb"))
test=pickle.load(open("test.p","rb"))
label=pickle.load(open("label.p","rb"))
"""
clf = ensemble.RandomForestClassifier(n_jobs=-1,
                                      n_estimators=10,
                                      min_samples_leaf=10,
                                      random_state=42)
xgb_clf=xgb_classifier(eta=0.1,min_child_weight=1,depth=10,num_round=40,threads=8,boost_from_exist_prediction=True,exist_num_round=10)
clf.fit(train, labels.Prediction.values)
base_train_prediction=clf.predict_proba(train).T[1]
base_test_prediction=clf.predict_proba(test).T[1]
preds = xgb_clf.train_predict(train,labels.Prediction.values,test,base_train_prediction,base_test_prediction)

submission['Prediction'] = preds
submission.to_csv('xgb_boost_from_rf.csv', index = False)
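
boost_from_exist_prediction lives inside the author's wrapper; in raw xgboost this "boost from existing predictions" trick is done by setting the base margin of the DMatrix, so boosting starts from the random forest's scores instead of zero. A minimal sketch of the mechanism (the function and its parameters are assumptions, not the wrapper's real internals):

import numpy as np
import xgboost as xgb

def boost_from_predictions(X, y, Xt, base_train, base_test, params, num_round):
    # convert base probabilities to log-odds, the raw-score scale of binary:logistic
    def logit(p, eps=1e-6):
        p = np.clip(p, eps, 1 - eps)
        return np.log(p / (1 - p))
    dtrain = xgb.DMatrix(X, label=y)
    dtest = xgb.DMatrix(Xt)
    dtrain.set_base_margin(logit(base_train))  # boosting continues from these scores
    dtest.set_base_margin(logit(base_test))
    bst = xgb.train(params, dtrain, num_round)
    return bst.predict(dtest)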
Example #28
def train_predict(X,y,Xt,yt=[],c=1):
    if c==1:
        #clf=xgb_classifier(num_round=45,eta=0.1,min_child_weight=5,depth=10, subsample=0.5,col=1) 
        clf=xgb_classifier(num_round=45,eta=0.1,min_child_weight=20,depth=20, subsample=0.1,col=0.7)
        return clf.train_predict(X,y,Xt,yt)
Example #29
    kf=KFold(len(y),n_folds=4)

    for train_index, test_index in kf:
        Xt=X[test_index]
        X=X[train_index]
        idx=idx[test_index]
        yt=y[test_index]
        y=y[train_index]
        break
test=pd.read_csv('../../../input/patients_test.csv',index_col='patient_id')
idx=np.array(test.index)
del test
gc.collect()
print X.shape,y.shape,Xt.shape

from xgb_classifier import xgb_classifier
eta=0.1
myname=sys.argv[0]
for seed in [0]:#[i*777 for i in range(1,10)]:
    for depth in [10]:
        for child in [2]:
            for col in [0.4]:
                for sub in [1]:
                    for num in [2000]:
                        clf=xgb_classifier(eta=eta,min_child_weight=child,depth=depth,num_round=num,col=col,subsample=sub,seed=seed)
                        ypred=clf.train_predict(X,y,Xt)        
                        s=pd.DataFrame({'patient_id':idx,'predict_screener':ypred})     
                        s.to_csv('rxgb5.csv',index=False)           
                        #s.to_csv('va_result/%s_eta_%f_depth_%d_child_%d_col_%f_sub_%f_num_%d_seed_%d_score_%f'% (myname,eta,depth,child,col,sub,num,seed,score),index=False)
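
The nested single-element loops above are a collapsed hyperparameter grid search. With itertools.product the same sweep reads as one loop; a sketch reusing the names from the snippet:

from itertools import product

eta = 0.1
for seed, depth, child, col, sub, num in product([0], [10], [2], [0.4], [1], [2000]):
    clf = xgb_classifier(eta=eta, min_child_weight=child, depth=depth,
                         num_round=num, col=col, subsample=sub, seed=seed)
    ypred = clf.train_predict(X, y, Xt)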

Example #30
def pre_processing_meta_part2(data_base_dir, data_meta_part2_dir):
    xgb_clf = xgb_classifier(eta=0.3,
                             min_child_weight=6,
                             depth=100,
                             num_round=20,
                             threads=16,
                             exist_prediction=True,
                             exist_num_round=20)

    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    y_part1 = y_all[:y_all.shape[0] / 2, :]

    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))
    X_numerical_part1 = X_numerical[:X_numerical.shape[0] / 2, :]
    X_numerical_part2 = X_numerical[X_numerical.shape[0] / 2:, :]

    X_sparse = pickle.load(open(data_base_dir + "X_sparse.p", "rb"))
    X_test_sparse = pickle.load(open(data_base_dir + "X_test_sparse.p", "rb"))
    X_sparse_part1 = X_sparse[:X_sparse.shape[0] / 2, :]
    X_sparse_part2 = X_sparse[X_sparse.shape[0] / 2:, :]

    X_part2_rf = []
    X_part2_svc = []
    X_part2_sgd = []

    X_test_rf = []
    X_test_svc = []
    X_test_sgd = []

    # use pypy to accelerate online model

    X_part2_best_online = np.array(
        pd.read_csv(data_meta_part2_dir + 'part2_online.csv')[['pred']])
    X_part2_best_online = X_part2_best_online.reshape(
        (X_part2_best_online.shape[0] / 32, 32))

    pickle.dump(X_part2_best_online,
                open(data_meta_part2_dir + "X_meta_part2_online.p", "wb"))

    X_test_best_online = np.array(
        pd.read_csv(data_meta_part2_dir + 'best_online_test.csv')[['pred']])
    X_test_best_online = X_test_best_online.reshape(
        (X_test_best_online.shape[0] / 32, 32))

    pickle.dump(X_test_best_online,
                open(data_meta_part2_dir + "X_test_meta_online.p", "wb"))

    for i in range(33):

        predicted = None

        if i == 13:

            print("%d is constant like: " % (i),
                  "not included in meta features")
        else:
            print('train', i)

            y = y_part1[:, i]
            rf = RandomForestClassifier(n_estimators=200,
                                        n_jobs=16,
                                        min_samples_leaf=10,
                                        random_state=1,
                                        bootstrap=False,
                                        criterion='entropy',
                                        min_samples_split=5,
                                        verbose=1)
            rf.fit(X_numerical_part1, y)
            X_part2_rf.append(rf.predict_proba(X_numerical_part2))
            X_test_rf.append(rf.predict_proba(X_test_numerical))

            y = y_part1[:, i]
            svm = LinearSVC(C=0.17)
            svm.fit(X_sparse_part1, y)
            X_part2_svc.append(svm.decision_function(X_sparse_part2))
            X_test_svc.append(svm.decision_function(X_test_sparse))

            y = y_part1[:, i]
            clf = SGDClassifier(loss='log', alpha=0.000001, n_iter=100)
            clf.fit(X_sparse_part1, y)
            X_part2_sgd.append(clf.predict_proba(X_sparse_part2).T[1])
            X_test_sgd.append(clf.predict_proba(X_test_sparse).T[1])

    X_part2_rf = np.column_stack(X_part2_rf)
    X_test_rf = np.column_stack(X_test_rf)
    pickle.dump(X_part2_rf,
                open(data_meta_part2_dir + "X_meta_part2_rf.p", "wb"))
    pickle.dump(X_test_rf, open(data_meta_part2_dir + "X_test_meta_rf.p",
                                "wb"))

    X_part2_svc = np.column_stack(X_part2_svc)
    X_test_svc = np.column_stack(X_test_svc)
    pickle.dump(X_part2_svc,
                open(data_meta_part2_dir + "X_meta_part2_svc.p", "wb"))
    pickle.dump(X_test_svc,
                open(data_meta_part2_dir + "X_test_meta_svc.p", "wb"))

    X_part2_sgd = np.column_stack(X_part2_sgd)
    X_test_sgd = np.column_stack(X_test_sgd)
    pickle.dump(X_part2_sgd,
                open(data_meta_part2_dir + "X_meta_part2_sgd.p", "wb"))
    pickle.dump(X_test_sgd,
                open(data_meta_part2_dir + "X_test_meta_sgd.p", "wb"))
def xgb_meta_predict(data_base_dir, data_meta_random_dir, submission_dir):

    test_id = pickle.load(open(data_base_dir + "test_id.p", "rb"))
    y_meta = pickle.load(open(data_meta_random_dir + "y_meta.p", "rb"))

    X_numerical_random = pickle.load(
        open(data_meta_random_dir + "X_numerical_meta.p", "rb"))
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))

    X_random_rf = pickle.load(
        open(data_meta_random_dir + "X_meta_random_rf.p", "rb"))
    X_test_rf = pickle.load(
        open(data_meta_random_dir + "X_test_meta_rf.p", "rb"))

    X_random_svc = pickle.load(
        open(data_meta_random_dir + "X_meta_random_svc.p", "rb"))
    X_test_svc = pickle.load(
        open(data_meta_random_dir + "X_test_meta_svc.p", "rb"))

    # private LB  0.0054101
    xgb_clf = xgb_classifier(eta=0.2,
                             min_child_weight=1,
                             depth=10,
                             num_round=70,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_random_rf, X_random_svc, X_numerical_random]),
        y_meta,
        np.hstack([X_test_rf, X_test_svc, X_test_numerical]),
        predict_y14=True)
    save_predictions(submission_dir + 'xgb-random-d10-e0.2-min1-tree70.csv.gz',
                     test_id, X_xgb_predict)

    # private LB 0.0053053
    xgb_clf = xgb_classifier(eta=0.2,
                             min_child_weight=6,
                             depth=12,
                             num_round=80,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_random_rf, X_random_svc, X_numerical_random]),
        y_meta,
        np.hstack([X_test_rf, X_test_svc, X_test_numerical]),
        predict_y14=True)
    save_predictions(submission_dir + 'xgb-random-d12-e0.2-min6-tree80.csv.gz',
                     test_id, X_xgb_predict)

    # private LB  0.0052910
    xgb_clf = xgb_classifier(eta=0.09,
                             min_child_weight=6,
                             depth=25,
                             num_round=100,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([X_random_rf, X_random_svc, X_numerical_random]),
        y_meta,
        np.hstack([X_test_rf, X_test_svc, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-random-d25-svc-e0.09-min6-tree100.csv.gz',
        test_id, X_xgb_predict)
Example #32
X2, _ = get_data('../sparse/rebuild3_test.svm')
X3, _ = get_data('../sparse/rebuild4_test.svm')
X4, _ = get_data('../sparse/rebuild5_test.svm')

X6 = np.sum(Xt.todense(), axis=1)
X7 = np.sum(X1.todense(), axis=1)
X8 = np.sum(X2.todense(), axis=1)
X9 = np.sum(X3.todense(), axis=1)
X10 = np.sum(X4.todense(), axis=1)
Xt = sparse.hstack([Xt, X1, X2, X3, X4, X6, X7, X8, X9, X10],
                   format='csr').todense()
train = pd.read_csv('../explore/test1.csv')
idx = train[idname].as_matrix().astype(int)
Xt = np.hstack([Xt, train.drop([label, idname], axis=1).as_matrix()])
print X.shape, y.shape
print Xt.shape  #, y.sha
clf = xgb_classifier(eta=0.25,
                     col=0.7,
                     min_child_weight=1,
                     depth=6,
                     num_round=70)

yp = clf.multi(X, y, Xt, 3)
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb4.csv', index=False)
def pre_processing_meta_random(data_base_dir, data_meta_random_dir):
    xgb_clf = xgb_classifier(eta=0.3,
                             min_child_weight=6,
                             depth=100,
                             num_round=20,
                             threads=16,
                             exist_prediction=True,
                             exist_num_round=20)

    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))

    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))
    X_sparse = pickle.load(open(data_base_dir + "X_sparse.p", "rb"))
    X_test_sparse = pickle.load(open(data_base_dir + "X_test_sparse.p", "rb"))

    X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = train_test_split(
        X_numerical, X_sparse, y_all, test_size=0.5)

    X_meta_rf = []
    X_meta_svc = []

    X_test_rf = []
    X_test_svc = []

    for i in range(33):

        predicted = None

        if i == 13:

            print "%d is constant like: " % (
                i), "not included in meta features"
        else:
            print 'train', i

            y = y_base[:, i]
            rf = RandomForestClassifier(n_estimators=150, n_jobs=16)
            rf.fit(X_numerical_base, y)
            X_meta_rf.append(rf.predict_proba(X_numerical_meta))
            X_test_rf.append(rf.predict_proba(X_test_numerical))

            y = y_base[:, i]
            svm = LinearSVC()
            svm.fit(X_sparse_base, y)
            X_meta_svc.append(svm.decision_function(X_sparse_meta))
            X_test_svc.append(svm.decision_function(X_test_sparse))

    X_meta_rf = np.column_stack(X_meta_rf)
    X_test_rf = np.column_stack(X_test_rf)
    pickle.dump(X_meta_rf,
                open(data_meta_random_dir + "X_meta_random_rf.p", "wb"))
    pickle.dump(X_test_rf, open(data_meta_random_dir + "X_test_meta_rf.p",
                                "wb"))

    X_meta_svc = np.column_stack(X_meta_svc)
    X_test_svc = np.column_stack(X_test_svc)
    pickle.dump(X_meta_svc,
                open(data_meta_random_dir + "X_meta_random_svc.p", "wb"))
    pickle.dump(X_test_svc,
                open(data_meta_random_dir + "X_test_meta_svc.p", "wb"))

    pickle.dump(y_meta, open(data_meta_random_dir + "y_meta.p", "wb"))
    pickle.dump(y_base, open(data_meta_random_dir + "y_base.p", "wb"))
    pickle.dump(X_numerical_meta,
                open(data_meta_random_dir + "X_numerical_meta.p", "wb"))
Example #34
"""
import pickle
pickle.dump(train,open("train.p","wb"))
pickle.dump(test,open("test.p","wb"))
pickle.dump(labels.Prediction.values,open("label.p","wb"))

import pickle
train=pickle.load(open("train.p","rb"))
test=pickle.load(open("test.p","rb"))
label=pickle.load(open("label.p","rb"))
"""
clf = ensemble.RandomForestClassifier(n_jobs=-1,
                                      n_estimators=10,
                                      min_samples_leaf=10,
                                      random_state=42)
xgb_clf = xgb_classifier(eta=0.1,
                         min_child_weight=1,
                         depth=10,
                         num_round=40,
                         threads=8,
                         boost_from_exist_prediction=True,
                         exist_num_round=10)
clf.fit(train, labels.Prediction.values)
base_train_prediction = clf.predict_proba(train).T[1]
base_test_prediction = clf.predict_proba(test).T[1]
preds = xgb_clf.train_predict(train, labels.Prediction.values, test,
                              base_train_prediction, base_test_prediction)

submission['Prediction'] = preds
submission.to_csv('xgb_boost_from_rf.csv', index=False)
Example #35
idx = train[idname].as_matrix()
y = np.array(train[label])
import pickle
#X=np.hstack([X,pickle.load(open('count.p'))])

Xt, _ = get_data('../sparse/rebuild1_test.svm')
X1, _ = get_data('../sparse/rebuild2_test.svm')
X2, _ = get_data('../sparse/rebuild3_test.svm')
X3, _ = get_data('../sparse/rebuild4_test.svm')
X4, _ = get_data('../sparse/rebuild5_test.svm')
Xt = sparse.hstack([Xt, X1, X2, X3, X4], format='csr').todense()
train = pd.read_csv('../input/test.csv')
idx = train[idname].as_matrix().astype(int)

print X.shape, y.shape
print Xt.shape  #, y.sha

clf = xgb_classifier(eta=0.01,
                     col=0.8,
                     min_child_weight=2,
                     depth=4,
                     num_round=50)  #good!
yp = clf.multi(X, y, Xt, 3)
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb1.csv', index=False)
Example #36
xx.append(np.sum(Xt.todense(),axis=1))
xx.append(np.sum(X1.todense(),axis=1))
xx.append(np.sum(X2.todense(),axis=1))
xx.append(np.sum(X3.todense(),axis=1))
xx.append(np.sum(X4.todense(),axis=1))
xx.append(np.std(Xt.todense(),axis=1))
xx.append(np.std(X1.todense(),axis=1))
xx.append(np.std(X2.todense(),axis=1))
xx.append(np.std(X3.todense(),axis=1))
xx.append(np.std(X4.todense(),axis=1))
xx=np.hstack(xx)

Xt=sparse.hstack([Xt,X1,X2,X3,X4,xx,pickle.load(open('../explore/Xt2.p'))],format='csr').todense()
train=pd.read_csv('../explore/test1.csv')
idx=train[idname].as_matrix().astype(int)
Xt=np.hstack([Xt,train.drop([label,idname],axis=1).as_matrix()])


print X.shape, y.shape
print Xt.shape#, y.sha



clf=xgb_classifier(eta=0.1,col=0.2,min_child_weight=1,depth=6,num_round=200)
yp=clf.multi(X,y,Xt,3)
s=pd.DataFrame({idname:idx,'predict_0':yp[:,0],'predict_1':yp[:,1],'predict_2':yp[:,2]})
s.to_csv('xgb5.csv',index=False)



Example #37
def xgb_meta_predict(data_base_dir, data_meta_part1_dir, submission_dir):
    test_id = pickle.load(open(data_base_dir + "test_id.p", "rb"))
    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    y_part1 = y_all[:y_all.shape[0] / 2, :]

    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_numerical_part1 = X_numerical[:X_numerical.shape[0] / 2, :]
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))

    X_part1_xgb = pickle.load(
        open(data_meta_part1_dir + "X_meta_part1_xgb.p", "rb"))
    X_test_xgb = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_xgb_all.p", "rb"))

    X_part1_rf = pickle.load(
        open(data_meta_part1_dir + "X_meta_part1_rf.p", "rb"))
    X_test_rf = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_rf.p", "rb"))

    X_part1_sgd = pickle.load(
        open(data_meta_part1_dir + "X_meta_part1_sgd.p", "rb"))
    X_test_sgd = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_sgd.p", "rb"))

    X_part1_best_online = pickle.load(
        open(data_meta_part1_dir + "X_meta_part1_online.p", "rb"))
    X_test_best_online = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_online.p", "rb"))
    X_test_online_ensemble = pickle.load(
        open(data_meta_part1_dir + "X_test_meta_online_ensemble.p", "rb"))

    # best single model submitted, private LB 0.0044595, X_test_meta
    xgb_clf = xgb_classifier(eta=0.09,
                             min_child_weight=6,
                             depth=18,
                             num_round=120,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([
            X_part1_best_online, X_part1_rf, X_part1_sgd, X_part1_xgb,
            X_numerical_part1
        ]),
        y_part1,
        np.hstack([
            X_test_online_ensemble, X_test_rf, X_test_sgd, X_test_xgb,
            X_test_numerical
        ]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d18-e0.09-min6-tree120-xgb_base.csv.gz',
        test_id, X_xgb_predict)

    # best single model, private LB 0.0044591 (not submitted by itself)
    xgb_clf = xgb_classifier(eta=0.07,
                             min_child_weight=6,
                             depth=20,
                             num_round=150,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack([
            X_part1_best_online, X_part1_rf, X_part1_sgd, X_part1_xgb,
            X_numerical_part1
        ]),
        y_part1,
        np.hstack([
            X_test_online_ensemble, X_test_rf, X_test_sgd, X_test_xgb,
            X_test_numerical
        ]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d20-e0.07-min6-tree150-xgb_base.csv.gz',
        test_id, X_xgb_predict)

    # private LB 0.0047360, correct! tries "boosting from existing predictions"
    xgb_clf = xgb_classifier(eta=0.07,
                             min_child_weight=6,
                             depth=20,
                             num_round=20,
                             threads=16,
                             exist_prediction=True,
                             exist_num_round=150)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack(
        [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
                                                     y_part1,
                                                     np.hstack([
                                                         X_test_best_online,
                                                         X_test_rf, X_test_sgd,
                                                         X_test_numerical
                                                     ]),
                                                     predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d20-e0.07-min6-tree20-extree-150.csv.gz',
        test_id, X_xgb_predict)

    # private LB 0.0047103
    xgb_clf = xgb_classifier(eta=0.09,
                             min_child_weight=6,
                             depth=18,
                             num_round=1,
                             threads=16,
                             exist_prediction=True,
                             exist_num_round=120)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack(
            [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
        y_part1,
        np.hstack(
            [X_test_online_ensemble, X_test_rf, X_test_sgd, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d18-e0.09-min6-tree1-extree-120.csv.gz',
        test_id, X_xgb_predict)

    # private LB 0.0047000, using ensembled online predictions as meta features for the test set
    xgb_clf = xgb_classifier(eta=0.07,
                             min_child_weight=6,
                             depth=20,
                             num_round=150,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(
        np.hstack(
            [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
        y_part1,
        np.hstack(
            [X_test_online_ensemble, X_test_rf, X_test_sgd, X_test_numerical]),
        predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d20-e0.07-min6-tree150.csv.gz', test_id,
        X_xgb_predict)

    # private LB 0.0047313, correct!
    xgb_clf = xgb_classifier(eta=0.07,
                             min_child_weight=6,
                             depth=19,
                             num_round=150,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack(
        [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
                                                     y_part1,
                                                     np.hstack([
                                                         X_test_best_online,
                                                         X_test_rf, X_test_sgd,
                                                         X_test_numerical
                                                     ]),
                                                     predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d19-e0.07-min6-tree150.csv.gz', test_id,
        X_xgb_predict)

    # private LB 0.0047446, correct!
    xgb_clf = xgb_classifier(eta=0.09,
                             min_child_weight=6,
                             depth=18,
                             num_round=120,
                             threads=16)
    X_xgb_predict = xgb_clf.train_predict_all_labels(np.hstack(
        [X_part1_best_online, X_part1_rf, X_part1_sgd, X_numerical_part1]),
                                                     y_part1,
                                                     np.hstack([
                                                         X_test_best_online,
                                                         X_test_rf, X_test_sgd,
                                                         X_test_numerical
                                                     ]),
                                                     predict_y14=True)
    save_predictions(
        submission_dir + 'xgb-part1-d18-e0.09-min6-tree120.csv.gz', test_id,
        X_xgb_predict)
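The exist_prediction/exist_num_round runs above describe two-stage training: fit a booster for exist_num_round rounds, feed its raw scores back as the base margin, then boost num_round more rounds on top. Below is a sketch of that idea with plain xgboost; the option names come from the snippets, but this implementation is an assumption, not the source's code.

import xgboost as xgb

def boost_from_existing(params, dtrain, dtest, exist_num_round, num_round):
    # stage 1: ordinary boosting
    bst = xgb.train(params, dtrain, exist_num_round)
    # use stage-1 raw margins as the starting point for further rounds
    dtrain.set_base_margin(bst.predict(dtrain, output_margin=True))
    dtest.set_base_margin(bst.predict(dtest, output_margin=True))
    # stage 2: continue boosting from the existing predictions
    bst = xgb.train(params, dtrain, num_round)
    return bst.predict(dtest)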
Example #38
xx = []  # excerpt: collects row-wise std features of each sparse block
xx.append(np.std(X1.todense(), axis=1))
xx.append(np.std(X2.todense(), axis=1))
xx.append(np.std(X3.todense(), axis=1))
xx.append(np.std(X4.todense(), axis=1))
xx = np.hstack(xx)

Xt = sparse.hstack(
    [Xt, X1, X2, X3, X4, xx,
     pickle.load(open('../explore/Xt2.p'))],
    format='csr').todense()
train = pd.read_csv('../explore/test1.csv')
idx = train[idname].as_matrix().astype(int)
Xt = np.hstack([Xt, train.drop([label, idname], axis=1).as_matrix()])

print X.shape, y.shape
print Xt.shape  #, y.shape

clf = xgb_classifier(eta=0.1,
                     col=0.2,
                     min_child_weight=1,
                     depth=6,
                     num_round=200)
yp = clf.multi(X, y, Xt, 3)
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb5.csv', index=False)
Example #39
X4, _ = get_data('../sparse/rebuild5.svm')
X = sparse.hstack([X, X1, X2, X3, X4], format='csr').todense()
train = pd.read_csv('../input/train.csv')
idname = 'id'
label = 'fault_severity'
idx = train[idname].as_matrix()
y = np.array(train[label])
import pickle
#X = np.hstack([X, pickle.load(open('count.p'))])

Xt, _ = get_data('../sparse/rebuild1_test.svm')
X1, _ = get_data('../sparse/rebuild2_test.svm')
X2, _ = get_data('../sparse/rebuild3_test.svm')
X3, _ = get_data('../sparse/rebuild4_test.svm')
X4, _ = get_data('../sparse/rebuild5_test.svm')
Xt = sparse.hstack([Xt, X1, X2, X3, X4], format='csr').todense()
train = pd.read_csv('../input/test.csv')
idx = train[idname].as_matrix().astype(int)

print X.shape, y.shape
print Xt.shape  #, y.shape

clf = xgb_classifier(eta=0.01, col=0.8, min_child_weight=2, depth=4, num_round=50)  # good!
yp = clf.multi(X, y, Xt, 3)
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb1.csv', index=False)
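`get_data` is never defined in these excerpts; given the .svm extensions it presumably wraps scikit-learn's svmlight loader. A plausible sketch (an assumption, not the source's definition):

from sklearn.datasets import load_svmlight_file

def get_data(path, n_features=None):
    # libsvm/svmlight text format -> (CSR feature matrix, label vector);
    # pass n_features when loading separate train/test files so the
    # column counts stay aligned
    return load_svmlight_file(path, n_features=n_features)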



Example #40
Xt = sparse.hstack(
    [Xt, X1, X2, X3, X4, xx,
     pickle.load(open('../explore/Xt2.p'))],
    format='csr').todense()
train = pd.read_csv('../explore/test1.csv')
idx = train[idname].as_matrix().astype(int)
Xt = np.hstack([Xt, train.drop([label, idname], axis=1).as_matrix()])

print X.shape, y.shape
print Xt.shape  #, y.shape

# keep only columns whose absolute Pearson correlation with the target exceeds 1e-2
from scipy.stats import pearsonr
xx = []
xt = []
for i, j in zip(X.T, Xt.T):
    score = pearsonr(np.array(i.T).ravel(), y)[0]
    if np.abs(score) > 1e-2:
        xx.append(np.array(i.T).ravel())
        xt.append(np.array(j.T).ravel())

X = np.array(xx).T
Xt = np.array(xt).T
print X.shape, y.shape
print Xt.shape

clf = xgb_classifier(eta=0.1,
                     gamma=1e-3,
                     col=0.3,
                     min_child_weight=0.5,
                     depth=7,
                     num_round=160)

yp = clf.multi(X, y, Xt, 3)
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb6.csv', index=False)
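The loop above is a univariate filter: it keeps only the columns whose absolute Pearson correlation with the target exceeds 1e-2 and applies the same mask to train and test. The same filter as a reusable function (a sketch; the threshold comes from the snippet, the function name is invented):

from scipy.stats import pearsonr
import numpy as np

def correlation_filter(X, Xt, y, threshold=1e-2):
    keep = []
    for col in range(X.shape[1]):
        r = pearsonr(np.asarray(X[:, col]).ravel(), y)[0]
        # constant columns give r = nan, so the comparison drops them too
        if np.abs(r) > threshold:
            keep.append(col)
    return np.asarray(X)[:, keep], np.asarray(Xt)[:, keep]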



def pre_processing_meta_part1(data_base_dir, data_meta_part1_dir):

    y_all = pickle.load(open(data_base_dir + "y.p", "rb"))
    y_part2 = y_all[y_all.shape[0] / 2:, :]

    X_all = pickle.load(open(data_base_dir + "X_all.p", "rb"))
    X_test = pickle.load(open(data_base_dir + "X_test_all.p", "rb"))
    X_part1 = X_all[:X_all.shape[0] / 2, :]
    X_part2 = X_all[X_all.shape[0] / 2:, :]

    X_numerical = pickle.load(open(data_base_dir + "X_numerical.p", "rb"))
    X_test_numerical = pickle.load(
        open(data_base_dir + "X_test_numerical.p", "rb"))
    X_numerical_part1 = X_numerical[:X_numerical.shape[0] / 2, :]
    X_numerical_part2 = X_numerical[X_numerical.shape[0] / 2:, :]

    X_sparse = pickle.load(open(data_base_dir + "X_sparse.p", "rb"))
    X_test_sparse = pickle.load(open(data_base_dir + "X_test_sparse.p", "rb"))
    X_sparse_part1 = X_sparse[:X_sparse.shape[0] / 2, :]
    X_sparse_part2 = X_sparse[X_sparse.shape[0] / 2:, :]

    X_part1_xgb = []
    X_part1_rf = []
    X_part1_sgd = []

    X_test_xgb = []
    X_test_rf = []
    X_test_sgd = []

    # use pypy to accelerate the online model

    X_part1_best_online = np.array(
        pd.read_csv(data_meta_part1_dir + 'part1_online.csv')[['pred']])
    X_part1_best_online = X_part1_best_online.reshape(
        (X_part1_best_online.shape[0] / 32, 32))
    X_test_best_online = np.array(
        pd.read_csv(data_meta_part1_dir + 'best_online_test.csv')[['pred']])
    X_test_best_online = X_test_best_online.reshape(
        (X_test_best_online.shape[0] / 32, 32))
    pickle.dump(X_part1_best_online,
                open(data_meta_part1_dir + "X_meta_part1_online.p", "wb"))
    pickle.dump(X_test_best_online,
                open(data_meta_part1_dir + "X_test_meta_online.p", "wb"))

    xgb_clf = xgb_classifier(eta=0.3,
                             min_child_weight=6,
                             depth=100,
                             num_round=20,
                             threads=16,
                             exist_prediction=True,
                             exist_num_round=20)
    X_part1_xgb = xgb_clf.train_predict_all_labels(X_part2,
                                                   y_part2,
                                                   X_part1,
                                                   predict_y14=False)
    X_test_xgb = xgb_clf.train_predict_all_labels(
        X_all, y_all, X_test, predict_y14=False
    )  # a little trick to make test data's meta features more accurate

    pickle.dump(X_part1_xgb,
                open(data_meta_part1_dir + "X_meta_part1_xgb.p", "wb"))
    pickle.dump(X_test_xgb,
                open(data_meta_part1_dir + "X_test_meta_xgb_all.p", "wb"))

    for i in range(33):

        if i == 13:
            print "label %d is constant-like: not included in meta features" % i
        else:
            print 'train', i

            y = y_part2[:, i]
            rf = RandomForestClassifier(n_estimators=200,
                                        n_jobs=16,
                                        min_samples_leaf=10,
                                        random_state=1,
                                        bootstrap=False,
                                        criterion='entropy',
                                        min_samples_split=5,
                                        verbose=1)
            rf.fit(X_numerical_part2, y)
            X_part1_rf.append(rf.predict_proba(X_numerical_part1))
            X_test_rf.append(rf.predict_proba(X_test_numerical))

            y = y_part2[:, i]
            clf = SGDClassifier(loss='log', alpha=0.000001, n_iter=100)
            clf.fit(X_sparse_part2, y)
            X_part1_sgd.append(clf.predict_proba(X_sparse_part1).T[1])
            X_test_sgd.append(clf.predict_proba(X_test_sparse).T[1])

    X_part1_rf = np.column_stack(X_part1_rf)
    X_test_rf = np.column_stack(X_test_rf)
    pickle.dump(X_part1_rf,
                open(data_meta_part1_dir + "X_meta_part1_rf.p", "wb"))
    pickle.dump(X_test_rf, open(data_meta_part1_dir + "X_test_meta_rf.p",
                                "wb"))

    X_part1_sgd = np.column_stack(X_part1_sgd)
    X_test_sgd = np.column_stack(X_test_sgd)
    pickle.dump(X_part1_sgd,
                open(data_meta_part1_dir + "X_meta_part1_sgd.p", "wb"))
    pickle.dump(X_test_sgd,
                open(data_meta_part1_dir + "X_test_meta_sgd.p", "wb"))
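pre_processing_meta_part1 is one half of a two-part stacking scheme: base models are fit on part 2 of the training data and predict part 1, giving out-of-fold meta features, while test meta features come from models refit on all labeled data (the "little trick" noted above). The pattern in outline, with a generic scikit-learn base model (a sketch, not the source's exact code):

def make_meta_feature(model, X_part1, X_part2, y_part2, X_all, y_all, X_test):
    # out-of-fold meta feature for part 1: fit on the other half only
    model.fit(X_part2, y_part2)
    meta_part1 = model.predict_proba(X_part1)[:, 1]
    # test meta feature: refit on all labeled data for a stronger model
    model.fit(X_all, y_all)
    meta_test = model.predict_proba(X_test)[:, 1]
    return meta_part1, meta_test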
Example #42
print Xt.shape  #, y.shape
from scipy.stats import pearsonr
xx = []
xt = []
for i, j in zip(X.T, Xt.T):
    score = pearsonr(np.array(i.T).ravel(), y)[0]
    if np.abs(score) > 1e-2:
        xx.append(np.array(i.T).ravel())
        xt.append(np.array(j.T).ravel())

X = np.array(xx).T
Xt = np.array(xt).T
print X.shape, y.shape
print Xt.shape

clf = xgb_classifier(eta=0.1,
                     gamma=1e-3,
                     col=0.3,
                     min_child_weight=0.5,
                     depth=7,
                     num_round=160)

yp = clf.multi(X, y, Xt, 3)
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb6.csv', index=False)
Example #43
X = sparse.hstack([X, X1, X2, X3, X4], format='csr').todense()
train = pd.read_csv('../explore/train1.csv')
idname = 'id'
label = 'fault_severity'
idx = train[idname].as_matrix()
y = np.array(train[label])
import pickle
X = np.hstack([X, train.drop([label, idname], axis=1).as_matrix()])
#X = np.hstack([X, pickle.load(open('count.p'))])

Xt, _ = get_data('../sparse/rebuild1_test.svm')
X1, _ = get_data('../sparse/rebuild2_test.svm')
X2, _ = get_data('../sparse/rebuild3_test.svm')
X3, _ = get_data('../sparse/rebuild4_test.svm')
X4, _ = get_data('../sparse/rebuild5_test.svm')
Xt = sparse.hstack([Xt, X1, X2, X3, X4], format='csr').todense()
train = pd.read_csv('../explore/test1.csv')
idx = train[idname].as_matrix().astype(int)
Xt = np.hstack([Xt, train.drop([label, idname], axis=1).as_matrix()])

print X.shape, y.shape
print Xt.shape  #, y.shape

clf = xgb_classifier(eta=0.25, col=0.7, min_child_weight=1, depth=6, num_round=70)

yp = clf.multi(X, y, Xt, 3)
s = pd.DataFrame({
    idname: idx,
    'predict_0': yp[:, 0],
    'predict_1': yp[:, 1],
    'predict_2': yp[:, 2]
})
s.to_csv('xgb3.csv', index=False)
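These snippets are Python 2 and use DataFrame.as_matrix, which was deprecated and removed in pandas 1.0. On a current stack the data-frame conversions would read as follows (a sketch reusing the snippet's names; X_extra is a stand-in name for the appended dense columns):

idx = train[idname].to_numpy().astype(int)                 # replaces .as_matrix()
X_extra = train.drop([label, idname], axis=1).to_numpy()   # hypothetical name
print(X.shape, y.shape)                                    # print is a function in Python 3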