def make_predictions(load_list=[]):
    seed_list = [1234, 2345, 3456, 4567, 5678, 6789, 7890, 8901, 9012, 123]
    mcw_list = [1, 1, 1, 1, 1, 8, 8, 8, 8, 8]
    seed_list2 = [1242, 5432]
    mcw_list2 = [1, 8]

    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)

    # Stack the level-1 out-of-fold predictions onto the raw features.
    train_meta, test_meta = train_level1(load_list=["xgb1", "xgb2", "knn", "ext1", "ext2", "rf1", "rf2"])
    train = np.column_stack((train, train_meta))
    test = np.column_stack((test, test_meta))

    print "Training model1..."
    pred1 = np.zeros(len(test))
    for i in range(10):
        if "xgb1" not in load_list:
            model1 = xgb.XGBClassifier(n_estimators=550, learning_rate=0.01, max_depth=6,
                                       colsample_bytree=0.95, subsample=1,
                                       min_child_weight=mcw_list[i], seed=seed_list[i])
            model1.fit(train, target)
            pickle.dump(model1, open("final_models/xgb/1/xgb_n_"+str(i)+".pkl", "wb"))
        else:
            model1 = pickle.load(open("final_models/xgb/1/xgb_n_"+str(i)+".pkl", "rb"))
        pred1 += model1.predict_proba(test)[:, 1]
    pred1 /= len(seed_list)

    print "Training model2..."
    pred2 = np.zeros(len(test))
    for i in range(2):
        if "xgb2" not in load_list:
            model2 = xgb.XGBClassifier(n_estimators=550, learning_rate=0.01, max_depth=6,
                                       colsample_bytree=0.95, subsample=1,
                                       min_child_weight=mcw_list2[i], seed=seed_list2[i])
            model2 = sklearn.calibration.CalibratedClassifierCV(model2, method="isotonic", cv=10)
            model2.fit(train, target)
            pickle.dump(model2, open("final_models/xgb/2/xgb_n_"+str(i)+".pkl", "wb"))
        else:
            model2 = pickle.load(open("final_models/xgb/2/xgb_n_"+str(i)+".pkl", "rb"))
        pred2 += model2.predict_proba(test)[:, 1]
    pred2 /= len(seed_list2)

    print "Training model4..."
    pred4 = np.zeros(len(test))
    if "ext1" not in load_list:
        model4 = sklearn.ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=50,
                                                       criterion='entropy', min_samples_split=4,
                                                       max_depth=35, min_samples_leaf=2,
                                                       n_jobs=-1, random_state=1234)
        model4.fit(train, target)
        pred4 = model4.predict_proba(test)[:, 1]
        pred4.dump("final_models/ext/ext1_pred")
    else:
        pred4 = np.load("final_models/ext/ext1_pred")

    # Blend: geometric mean of the two xgb predictions, mixed with the extra-trees prediction.
    result = 0.7*np.sqrt(pred1*pred2) + 0.3*pred4
    return result
def main():
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)
    #tuning.parametr_tuning(train, target, param_grid={})
    #tuning.ensemble_tuning(train, test, target, load_list=["xgb1"])
    result = make_predictions(load_list=["xgb1", "xgb2", "ext1"])
    io.save_result(test_index, result)
def main():
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target)
    #tuning.tune_xgboost(train, target, load_list=[])
    #tuning.parametr_tuning(train, target, param_grid={})
    #tuning.ensemble_tuning(train, target, load_list=[])
    model = sklearn.ensemble.RandomForestClassifier(n_estimators=2000, max_depth=8, criterion="entropy",
                                                    bootstrap=False, min_samples_leaf=4, min_samples_split=2,
                                                    random_state=1234)
    model.fit(train, target)
    result = model.predict_proba(test)[:, 1]
    """
    result = make_predictions(train, target, test, load_list=["rf_entropy", "xgb"])
    """
    io.save_result(test_index, result)
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
from input_output import load_data
import numpy as np

DATAFILE = "SEQC_NB_batchCorr_tr_FAV_TopFeats.txt"
LABELSFILE = "SEQC_NB_batchCorr_tr_FAV.lab"

sample_names_tr, var_names_tr, X_train = load_data(DATAFILE)
y_tr = np.loadtxt(LABELSFILE, dtype=np.int)

# one-hot encode the labels
n_classes = np.max(y_tr) + 1
y_train = np_utils.to_categorical(y_tr, n_classes)

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], init='uniform'))
model.add(Activation('tanh'))
model.add(Dense(64, init='uniform'))
model.add(Activation('tanh'))
model.add(Dense(2, init='uniform'))
model.add(Activation('softmax'))

sgd = SGD(lr=0.003)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=200, batch_size=16)
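# Minimal usage sketch (not part of the original script, kept commented out):
# scoring a held-out matrix with the fitted network. TESTFILE is a hypothetical
# placeholder for a feature file in the same format as DATAFILE.
#
# sample_names_ts, var_names_ts, X_test = load_data(TESTFILE)
# probs = model.predict(X_test, batch_size=16)   # softmax class probabilities
# y_pred = probs.argmax(axis=-1)                 # hard class labels (0/1)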
RANK_METHOD = 'randomForest'
# number of CV folds
#CV_K = 5
# number of CV cycles
#CV_N = 10
# number of Montecarlo CV cycles (for SVM tuning)
TUN_CV_K = 10
# fraction of the dataset to keep apart as test split (for SVM tuning)
TUN_CV_P = 50
# list of C values for SVM tuning
TUN_SVM_C = [10**k for k in np.arange(-7, 5)]
# maximum count of trying k-fold data selection
KFOLD_TRY = 100

sample_names, var_names, x = load_data(DATAFILE)
y = np.loadtxt(LABELSFILE, dtype=np.int)

# build FSTEPS according to dataset size
nfeat = x.shape[1]
ord = np.int(np.log10(nfeat))
fs = np.empty(0, dtype=np.int)
for p in range(ord+1):
    fs = np.concatenate((fs, np.dot(10**p, np.arange(10))))
fs = np.unique(fs)[1:]
# cap FSTEPS at 10000 features, if applicable
FLIM = 10000 if nfeat > 10000 else nfeat
FSTEPS = fs[fs <= FLIM].tolist() + [nfeat]
### FSTEPS = range(1,10) + range(10, 100, 10) + range(100, 1000, 100) + range(1000, 10000, 1000) + [10000] + [x.shape[1]]
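# Illustration (not from the original script): with a hypothetical nfeat = 5000
# the construction above gives
#   fs     = [1, ..., 9, 10, 20, ..., 90, 100, 200, ..., 900, 1000, ..., 9000]
#   FLIM   = 5000
#   FSTEPS = [1, ..., 9, 10, ..., 90, 100, ..., 900, 1000, ..., 5000, 5000]
# i.e. roughly log-spaced feature counts capped at the dataset size; the final
# entry repeats 5000 because nfeat itself is appended after the cap.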
                                  update_momentum=0.9, eval_size=0.01, verbose=0,
                                  max_epochs=100, use_label_encoder=True)
            model3.fit(train, target)
            pickle.dump(model3, open("final_models/nn/nn_n_"+str(i)+".pkl", "wb"))
        else:
            model3 = pickle.load(open("final_models/nn/nn_n_"+str(i)+".pkl", "rb"))
        pred3 += model3.predict_proba(test)[:, 1]
    pred3 /= 10

    # optionally blend on ranks instead of raw probabilities
    if ranking:
        pred1 = scipy.stats.rankdata(pred1)
        pred2 = scipy.stats.rankdata(pred2)
        pred3 = scipy.stats.rankdata(pred3)

    result = 0.21*pred1 + 0.47*pred2 + 0.32*pred3
    return result


train, test, target, test_index = io.load_data()
train, test, target = fe.preprocess_data(train, test, target)
tuning.parametr_tuning(train, target, param_grid={"alpha": [0.01]})
#tuning.ensemble_tuning(train, target, ranking=True, load_list=["linear", "xgb"])
"""
result = make_predictions(train, target, test, ranking=True, load_list=["linear", "xgb", "nn"])
io.save_result(test_index, result)
"""
def train_level1(load_list=[]):
    print "Training xgb1..."
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)
    N_FOLDS = 10
    cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
    train_xgb1 = np.zeros(train.shape[0])
    test_xgb1 = np.zeros(test.shape[0])
    for fold, (train_index, test_index) in enumerate(cv):
        if "xgb1" not in load_list:
            model = xgb.XGBClassifier(n_estimators=180, learning_rate=0.05, max_depth=11,
                                      colsample_bytree=0.8, subsample=0.96, min_child_weight=4, seed=1234)
            model.fit(train[train_index], target[train_index])
            pickle.dump(model, open("level1/xgb/1/xgb_1_fold_"+str(fold)+".pkl", "wb"))
        else:
            model = pickle.load(open("level1/xgb/1/xgb_1_fold_"+str(fold)+".pkl", "rb"))
        # out-of-fold predictions for the train set, fold-averaged predictions for the test set
        train_xgb1[test_index] = model.predict_proba(train[test_index])[:, 1]
        test_xgb1 += model.predict_proba(test)[:, 1]/N_FOLDS
    train_meta = train_xgb1.reshape((train_xgb1.shape[0], 1))
    test_meta = test_xgb1.reshape((test_xgb1.shape[0], 1))

    print "Training xgb2..."
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)
    N_FOLDS = 10
    cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
    train_xgb2 = np.zeros(train.shape[0])
    test_xgb2 = np.zeros(test.shape[0])
    for fold, (train_index, test_index) in enumerate(cv):
        if "xgb2" not in load_list:
            model = xgb.XGBRegressor(n_estimators=600, learning_rate=0.02, max_depth=9,
                                     colsample_bytree=1, subsample=1, min_child_weight=1, seed=1234)
            model.fit(train[train_index], target[train_index])
            pickle.dump(model, open("level1/xgb/2/xgb_1_fold_"+str(fold)+".pkl", "wb"))
        else:
            model = pickle.load(open("level1/xgb/2/xgb_1_fold_"+str(fold)+".pkl", "rb"))
        train_xgb2[test_index] = model.predict(train[test_index])
        test_xgb2 += model.predict(test)/N_FOLDS
    train_meta = np.column_stack((train_meta, train_xgb2))
    test_meta = np.column_stack((test_meta, test_xgb2))

    print "Training knn..."
    if "knn" not in load_list:
        train, test, target, test_index = io.load_data()
        train, test, target = fe.preprocess_data(train, test, target, preprocess_type=4)
        N_FOLDS = 10
        cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
        train_knn = np.zeros(train.shape[0])
        test_knn = np.zeros(test.shape[0])
        for fold, (train_index, test_index) in enumerate(cv):
            model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=128, metric="minkowski",
                                                           weights="distance", n_jobs=-1)
            model.fit(train[train_index], target[train_index])
            train_knn[test_index] = model.predict_proba(train[test_index])[:, 1]
        model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=128, metric="minkowski",
                                                       weights="distance", n_jobs=-1)
        model.fit(train, target)
        test_knn = model.predict_proba(test)[:, 1]
        train_knn.dump("level1/knn/train_knn")
        test_knn.dump("level1/knn/test_knn")
    else:
        train_knn = np.load("level1/knn/train_knn")
        test_knn = np.load("level1/knn/test_knn")
    train_meta = np.column_stack((train_meta, train_knn))
    test_meta = np.column_stack((test_meta, test_knn))

    print "Training ext1..."
if "ext1" not in load_list: train, test, target, test_index = io.load_data() train, test, target = fe.preprocess_data(train, test, target, preprocess_type=3) N_FOLDS = 10 cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234) train_ext1 = np.zeros(train.shape[0]) test_ext1 = np.zeros(test.shape[0]) for fold, (train_index, test_index) in enumerate(cv): model = sklearn.ensemble.ExtraTreesClassifier(n_estimators=1000,max_features=50,criterion='entropy',min_samples_split=4, max_depth=35, min_samples_leaf=2, n_jobs =-1, random_state=1234) model.fit(train[train_index], target[train_index]) train_ext1[test_index] = model.predict_proba(train[test_index])[:, 1] test_ext1 += model.predict_proba(test)[:, 1]/N_FOLDS train_ext1.dump("level1/ext/1/train_ext") test_ext1.dump("level1/ext/1/test_ext") else: train_ext1 = np.load("level1/ext/1/train_ext") test_ext1 = np.load("level1/ext/1/test_ext") train_meta = np.column_stack((train_meta, train_ext1)) test_meta = np.column_stack((test_meta, test_ext1)) print "Training ext2..." if "ext2" not in load_list: train, test, target, test_index = io.load_data() train, test, target = fe.preprocess_data(train, test, target, preprocess_type=5) N_FOLDS = 10 cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234) train_ext2 = np.zeros(train.shape[0]) test_ext2 = np.zeros(test.shape[0]) for fold, (train_index, test_index) in enumerate(cv): model = sklearn.ensemble.ExtraTreesRegressor(n_estimators=1000,max_features=50,min_samples_split=4, max_depth=35, min_samples_leaf=2, n_jobs =-1, random_state=1234) model.fit(train[train_index], target[train_index]) train_ext2[test_index] = model.predict(train[test_index]) test_ext2 += model.predict(test)/N_FOLDS train_ext2.dump("level1/ext/2/train_ext") test_ext2.dump("level1/ext/2/test_ext") else: train_ext2 = np.load("level1/ext/2/train_ext") test_ext2 = np.load("level1/ext/2/test_ext") train_meta = np.column_stack((train_meta, train_ext2)) test_meta = np.column_stack((test_meta, test_ext2)) print "Training rf1..." if "rf1" not in load_list: train, test, target, test_index = io.load_data() train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1) N_FOLDS = 10 cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234) train_rf1 = np.zeros(train.shape[0]) test_rf1 = np.zeros(test.shape[0]) for fold, (train_index, test_index) in enumerate(cv): model = sklearn.ensemble.RandomForestClassifier(n_estimators=500, criterion="entropy", max_depth=32, min_samples_leaf=4, n_jobs=-1, random_state=1234) model.fit(train[train_index], target[train_index]) train_rf1[test_index] = model.predict_proba(train[test_index])[:, 1] test_rf1 += model.predict_proba(test)[:, 1]/N_FOLDS train_rf1.dump("level1/rf/1/train_rf") test_rf1.dump("level1/rf/1/test_rf") else: train_rf1 = np.load("level1/rf/1/train_rf") test_rf1 = np.load("level1/rf/1/test_rf") train_meta = np.column_stack((train_meta, train_rf1)) test_meta = np.column_stack((test_meta, test_rf1)) print "Training rf2..." 
if "rf2" not in load_list: train, test, target, test_index = io.load_data() train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1) N_FOLDS = 10 cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234) train_rf2 = np.zeros(train.shape[0]) test_rf2 = np.zeros(test.shape[0]) for fold, (train_index, test_index) in enumerate(cv): model = sklearn.ensemble.RandomForestRegressor(n_estimators=500, max_depth=32, min_samples_leaf=4, n_jobs=-1, random_state=1234) model.fit(train[train_index], target[train_index]) train_rf2[test_index] = model.predict(train[test_index]) test_rf2 += model.predict(test)/N_FOLDS train_rf2.dump("level1/rf/2/train_rf") test_rf2.dump("level1/rf/2/test_rf") else: train_rf2 = np.load("level1/rf/2/train_rf") test_rf2 = np.load("level1/rf/2/test_rf") train_meta = np.column_stack((train_meta, train_rf2)) test_meta = np.column_stack((test_meta, test_rf2)) return train_meta, test_meta
def preprocess_data(train, test, target, preprocess_type=1):
    if preprocess_type == 1:
        train = add_features(train)
        test = add_features(test)
        train, test = label_cats(train, test, nan_value=-999)
        train = add_features(train)
        test = add_features(test)
        train = drop_features(train)
        test = drop_features(test)
        train, test = input_missing(train, test, input_type="-999")
    elif preprocess_type == 2:
        train, test = label_cats(train, test, nan_value=10000)
        train_cat, test_cat = do_one_hot(train, test, drop_cat=True)
        train = add_features(train)
        test = add_features(test)
        train = drop_features(train)
        test = drop_features(test)
        train, test = input_missing(train, test, input_type="mean")
        train = scipy.sparse.hstack((train, train_cat), format="csr")
        test = scipy.sparse.hstack((test, test_cat), format="csr")
    elif preprocess_type == 3:
        train = add_features(train)
        test = add_features(test)
        train, test = label_cats(train, test, nan_value=-999)
        train, test = MungeData(train, target, test)
        train = drop_features(train)
        test = drop_features(test)
        train, test = input_missing(train, test, input_type="-999")
    elif preprocess_type == 4:
        train = add_features(train)
        test = add_features(test)
        train, test = MungeData(train, target, test)
        train = drop_features(train)
        test = drop_features(test)
        train, test = input_missing(train, test, input_type="-1")
        scaler = sklearn.preprocessing.StandardScaler()
        train = scaler.fit_transform(train)
        test = scaler.transform(test)
    elif preprocess_type == 5:
        train, test, target, test_index = io.load_data(drop="ext")
        train, test = label_cats(train, test, nan_value=-999)
        # Expand the v22 code into one column per character: pad to 4 characters with '@'
        # and take the ordinal of each position (e.g. "AB" -> "@@AB" -> 64, 64, 65, 66).
        train['v22-1'] = train['v22'].fillna('@@@@').apply(lambda x: '@'*(4-len(str(x)))+str(x)).apply(lambda x: ord(x[0]))
        test['v22-1'] = test['v22'].fillna('@@@@').apply(lambda x: '@'*(4-len(str(x)))+str(x)).apply(lambda x: ord(x[0]))
        train['v22-2'] = train['v22'].fillna('@@@@').apply(lambda x: '@'*(4-len(str(x)))+str(x)).apply(lambda x: ord(x[1]))
        test['v22-2'] = test['v22'].fillna('@@@@').apply(lambda x: '@'*(4-len(str(x)))+str(x)).apply(lambda x: ord(x[1]))
        train['v22-3'] = train['v22'].fillna('@@@@').apply(lambda x: '@'*(4-len(str(x)))+str(x)).apply(lambda x: ord(x[2]))
        test['v22-3'] = test['v22'].fillna('@@@@').apply(lambda x: '@'*(4-len(str(x)))+str(x)).apply(lambda x: ord(x[2]))
        train['v22-4'] = train['v22'].fillna('@@@@').apply(lambda x: '@'*(4-len(str(x)))+str(x)).apply(lambda x: ord(x[3]))
        test['v22-4'] = test['v22'].fillna('@@@@').apply(lambda x: '@'*(4-len(str(x)))+str(x)).apply(lambda x: ord(x[3]))
        train = add_features(train)
        test = add_features(test)
        train = drop_features(train)
        test = drop_features(test)
        train, test = input_missing(train, test, input_type="-999")
        rnd = 12
        n_ft = 20
        max_elts = 3
        a = addNearestNeighbourLinearFeatures(n_neighbours=n_ft, max_elts=max_elts, verbose=True, random_state=rnd)
        a.fit(train, target)
        train = a.transform(train)
        test = a.transform(test)
    elif preprocess_type == 6:
        train = add_features(train)
        test = add_features(test)
        train, test = label_cats(train, test, nan_value=-999)
        train = add_features(train)
        test = add_features(test)
        train = drop_features(train)
        test = drop_features(test)
        train.drop(["v50"], axis=1, inplace=True)
        test.drop(["v50"], axis=1, inplace=True)
        train, test = input_missing(train, test, input_type="-999")

    if type(train) is pd.DataFrame:
        train = train.values
        test = test.values
    return train, test, target
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
import pandas as pd
import numpy as np
import input_output
import models
import preprocess

# Define constants
TEST_SIZE_SAMPLE = 0.0001
RANDOM_STATE_SPLIT = 38
NUMBER_OF_ENTRIES = 300000

rawData = input_output.load_data("train.csv")
processedData = preprocess.prepare_data(rawData)

# Separate features and output + scale data
# pandas has some weird column counting
scaler = StandardScaler()
multiBinarizer = MultiLabelBinarizer()
trainData = scaler.fit_transform(processedData[:, 0:2])

# Convert output to binarized array
numbers = np.reshape((processedData[:, 2]), (len(processedData[:, 2]), 1))
predOutput = multiBinarizer.fit_transform(numbers)

X_train, X_test, y_train, y_test = train_test_split(trainData, predOutput,
                                                    test_size=TEST_SIZE_SAMPLE,
                                                    random_state=RANDOM_STATE_SPLIT)
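# Minimal sketch (an assumption, not part of the original fragment): one way the
# imported KNeighborsClassifier could be fit on the split prepared above.
# n_neighbors=5 is an illustrative value only.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))  # accuracy on the held-out split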
def ensemble_tuning(train, test, target, load_list=[]):
    N_FOLDS = 10
    seed_list = [1234, 2345, 6789, 7890]
    mcw_list = [1, 1, 8, 8]
    cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
    scores = np.zeros((N_FOLDS, 101), dtype=np.float32)

    train, test, target, test_index = io.load_data()
    X1, _, target = fe.preprocess_data(train, test, target, preprocess_type=1)
    train_meta, test_meta = main.train_level1(load_list=["xgb1", "xgb2", "knn", "ext1", "ext2", "rf1", "rf2"])
    X1 = np.column_stack((X1, train_meta))

    for fold, (train_index, test_index) in enumerate(cv):
        print("Training model1...")
        pred1 = np.zeros(len(X1[test_index]))
        for i in range(4):
            if "xgb1" not in load_list:
                model1 = xgb.XGBClassifier(n_estimators=550, learning_rate=0.01, max_depth=6,
                                           colsample_bytree=0.95, subsample=1,
                                           min_child_weight=mcw_list[i], seed=seed_list[i])
                model1.fit(X1[train_index], target[train_index])
                pickle.dump(model1, open("cv/xgb/1/xgb_1_fold_"+str(fold)+"_n_"+str(i)+".pkl", "wb"))
            else:
                model1 = pickle.load(open("cv/xgb/1/xgb_1_fold_"+str(fold)+"_n_"+str(i)+".pkl", "rb"))
            pred1 += model1.predict_proba(X1[test_index])[:, 1]
        pred1 /= len(seed_list)

        #print("Training model2...")
        #pred2 = np.zeros(len(X1[test_index]))
        """
        for i in range(6):
            if "xgb2" not in load_list:
                model2 = xgb.XGBRegressor(n_estimators=550, learning_rate=0.01, max_depth=6,
                                          colsample_bytree=0.95, subsample=1,
                                          min_child_weight=mcw_list[i], seed=seed_list[i])
                model2.fit(X1[train_index], target[train_index])
                pickle.dump(model2, open("cv/xgb/2/xgb_1_fold_"+str(fold)+"_n_"+str(i)+".pkl", "wb"))
            else:
                model2 = pickle.load(open("cv/xgb/2/xgb_1_fold_"+str(fold)+"_n_"+str(i)+".pkl", "rb"))
            pred2 += model2.predict(X1[test_index])
        pred2 /= len(seed_list)
        pred2[pred2 >= 0.99] = 0.99
        pred2[pred2 <= 0.01] = 0.01
        """

        print("Training model3...")
        if "ext1" not in load_list:
            model3 = sklearn.ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=50,
                                                           criterion='entropy', min_samples_split=4,
                                                           max_depth=35, min_samples_leaf=2,
                                                           n_jobs=-1, random_state=1234)
            model3.fit(X1[train_index], target[train_index])
            pred3 = model3.predict_proba(X1[test_index])[:, 1]
            pred3.dump("cv/rf/pred_fold_"+str(fold))
        else:
            pred3 = np.load("cv/rf/pred_fold_"+str(fold))

        print("Calculating scores...")
        # Grid-search the blend weight alpha/100 for pred1 (the remainder goes to pred3).
        for alpha in np.ndindex(101):
            scores[fold][alpha] = sklearn.metrics.log_loss(
                target[test_index],
                0.01*alpha[0]*pred1 + max(1 - 0.01*alpha[0], 0)*pred3)
            # alternative blends tried earlier:
            # np.power(pred1**(0.01*alpha[0])*pred2**(0.01*alpha[1]), 1/(0.01*(alpha[0] + alpha[1] + 1)))
            # 0.01*alpha[0]*pred1 + max(1 - 0.01*alpha[0], 0)*pred2
        print("Current fold:", np.min(scores[fold]),
              np.unravel_index(scores[fold].argmin(), scores[fold].shape),
              scores[fold][100], scores[fold][0])
        # running mean over the folds processed so far (unprocessed rows are still zero)
        sc1 = np.mean(scores, axis=0) * 1.0 / (fold+1) * N_FOLDS
        print("Accumulated:", np.min(sc1), np.unravel_index(sc1.argmin(), sc1.shape), sc1[100], sc1[0])

    scores1 = np.mean(scores, axis=0)
    print(np.min(scores1), np.unravel_index(scores1.argmin(), scores1.shape), scores1[100], scores1[0])
    return scores
    X_test = scaler.transform(X_test)
    X_train = scipy.sparse.hstack((text_train_tfidf, X_train), format="csr")
    X_test = scipy.sparse.hstack((text_test_tfidf, X_test), format="csr")
    model = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")
    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)
    return result


def make_dirs(dir_names):
    for name in dir_names:
        if not os.path.exists(name):
            os.makedirs(name)


dir_names = ["input", "output", "w2v", "metafeatures"]
make_dirs(dir_names)

train, target, test = io.load_data()
text_train_tfidf, text_test_tfidf = get_tfidf(train, test)

preds1 = rf_model(train, target, test, text_train_tfidf, text_test_tfidf)
preds2 = linear_model(train, target, test, text_train_tfidf, text_test_tfidf)

result = 0.7*preds1 + 0.3*preds2
io.save_result(test["PostId"], result)