# Concatenate results & data:
predProba_Train2 = preTreatment.concatenate_vectors(predProba_Train2_s)
yTrain2 = preTreatment.concatenate_vectors(train_RM_s_2[2])
weightsTrain2 = preTreatment.concatenate_vectors(train_RM_s_2[3])

# Looking for the best threshold:
if type(train_s[1]) == list:
    best_ams_train2, best_ratio = tresholding.best_ratio_combinaison_global(
            predProba_Train2_s, train_RM_s_2[2], train_RM_s_2[3], 5)
else:
    best_ams_train2, best_ratio = tresholding.best_ratio(
            predProba_Train2, yTrain2, weightsTrain2)

# best_ratio is a list of per-group ratios in the first branch and a single
# float in the second, so format it with str() to cover both cases:
print "Train2 - best ratio : %s - best ams : %f" \
        %(str(best_ratio), best_ams_train2)
print " "

print "Making predictions on the validation set..."
# Prediction of the validation set 2:
predProba_Valid2_s = xgBoost.predict_proba(predictor_s, valid_RM_s_2[1])

# Thresholding the predictions:
predProba_Valid2 = preTreatment.concatenate_vectors(predProba_Valid2_s)
predLabel5_Valid2 = tresholding.get_yPredicted_ratio(predProba_Valid2,
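
# A minimal sketch (an assumption, not the project's implementation) of what
# tresholding.get_yPredicted_ratio is taken to do: label as signal (1) the
# events whose predicted probability lies in the top `ratio` fraction, and as
# background (0) the rest. Roughly:
#     order = np.argsort(proba)[::-1]            # most signal-like first
#     n_signal = int(round(ratio * len(proba)))
#     yPredicted = np.zeros(len(proba), dtype=int)
#     yPredicted[order[:n_signal]] = 1
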
def train(max_depth, n_rounds):
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.
    train_size = 200000
    train_size2 = 25000
    valid_size = 25000
    remove_999 = False

    # Import the training data.
    # Each returned *_s is a list of per-group arrays: [0] = event IDs,
    # [1] = features, [2] = labels, [3] = weights.
    print "Extracting the data sets..."
    start = time.clock()
    train_s, train2_s, valid_s, test_s = tokenizer.extract_data(
            split=split,
            normalize=normalize,
            remove_999=remove_999,
            noise_variance=noise_var,
            n_classes="multiclass",
            train_size=train_size,
            train_size2=train_size2,
            valid_size=valid_size)

    # RANDOM FOREST:
    #kwargs_grad = {}
    #kwargs_rdf = {'n_estimators': 100}

    print "Training on the train set ..."
    #predictor_s = randomForest.train_classifier(train_s[1], train_s[2], kwargs_rdf)

    # XGBOOST
    kwargs_xgb = {'bst_parameters':
                  {'booster_type': 0,
                   #'objective': 'binary:logitraw',
                   'objective': 'multi:softprob',
                   'num_class': 5,
                   'bst:eta': 0.1,        # learning rate: smaller values are more conservative
                   'bst:subsample': 1,    # values < 1 subsample rows and help prevent overfitting
                   'bst:max_depth': max_depth,
                   'eval_metric': 'auc',
                   'silent': 1,
                   'nthread': 8},
                  'n_rounds': n_rounds}

    predictor_s = xgBoost.train_classifier(train_s[1], train_s[2], train_s[3],
                                           550000, kwargs_xgb)

    # TEST / SUBMISSION
    """
    yProbaTest_s = []
    yProbaTestBinary_s = []

    print "Classifying the test set..."
    for i in range(8):
        yProbaTest = xgBoost.predict_proba(predictor_s[i], test_s[1][i])
        yProbaTest_s.append(yProbaTest)

    print "Making the binary proba vector..."
    for i in range(8):
        yProbaTestBinary_s.append(np.zeros(yProbaTest_s[i].shape[0]))
    for i in range(8):
        for j in range(yProbaTest_s[i].shape[0]):
            yProbaTestBinary_s[i][j] = 1 - yProbaTest_s[i][j][0]

    print "Concatenating the vectors..."
    yProbaTestBinary = preTreatment.concatenate_vectors(yProbaTestBinary_s)
    IDs = preTreatment.concatenate_vectors(test_s[0])

    yProbaTestBinaryRanked = submission.rank_signals(yProbaTestBinary)
    yPredictedTest = tresholding.get_yPredicted_ratio(yProbaTestBinary, 0.15)

    s = submission.print_submission(IDs, yProbaTestBinaryRanked, yPredictedTest,
                                    "newAMSmesure")
    """

    # TRAIN AND VALID
    yPredictedTrain2_s = []
    yProbaTrain2_s = []
    yProbaTrain2Binary_s = []

    yPredictedValid_s = []
    yProbaValid_s = []
    yProbaValidBinary_s = []

    print "Classifying the train2 set..."
    for i in range(8):
        yProbaTrain2 = xgBoost.predict_proba(predictor_s[i], train2_s[1][i])
        yProbaTrain2_s.append(yProbaTrain2)

    print "Classifying the valid set..."
    for i in range(8):
        yProbaValid = xgBoost.predict_proba(predictor_s[i], valid_s[1][i])
        yProbaValid_s.append(yProbaValid)

    print "Making the binary proba vector..."
    # Binary signal probability = 1 - P(class 0), class 0 being (assumed) the background class.
    for i in range(8):
        yProbaTrain2Binary_s.append(np.zeros(yProbaTrain2_s[i].shape[0]))
        yProbaValidBinary_s.append(np.zeros(yProbaValid_s[i].shape[0]))
    for i in range(8):
        for j in range(yProbaTrain2_s[i].shape[0]):
            yProbaTrain2Binary_s[i][j] = 1 - yProbaTrain2_s[i][j][0]
        for j in range(yProbaValid_s[i].shape[0]):
            yProbaValidBinary_s[i][j] = 1 - yProbaValid_s[i][j][0]

    print "Concatenating the vectors..."
    yProbaTrain2Binary = preTreatment.concatenate_vectors(yProbaTrain2Binary_s)
    yProbaValidBinary = preTreatment.concatenate_vectors(yProbaValidBinary_s)
    yTrain2 = preTreatment.concatenate_vectors(train2_s[2])
    yValid = preTreatment.concatenate_vectors(valid_s[2])
    weightsTrain2 = preTreatment.concatenate_vectors(train2_s[3])
    weightsValid = preTreatment.concatenate_vectors(valid_s[3])

    print "Putting all the real labels to 1"
    yTrain2 = preTreatment.multiclass2binary(yTrain2)
    yValid = preTreatment.multiclass2binary(yValid)

    print "Getting the best ratios..."
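    # A minimal sketch (an assumption about the tresholding module, not its actual
    # implementation) of what the "best ratio" search below is taken to do: scan a
    # grid of candidate signal ratios, threshold the train2 probabilities at each
    # one, and keep the ratio that maximises the AMS of the resulting selection.
    #     best_ams, best_ratio = 0., 0.
    #     for ratio in np.linspace(0.05, 0.50, 46):
    #         yPred = tresholding.get_yPredicted_ratio(yProbaTrain2Binary, ratio)
    #         s, b = submission.get_s_b(yPred, yTrain2, weightsTrain2)
    #         if hbc.AMS(s, b) > best_ams:
    #             best_ams, best_ratio = hbc.AMS(s, b), ratio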
    best_ams_train2_global, best_ratio_global = tresholding.best_ratio(
            yProbaTrain2Binary, yTrain2, weightsTrain2)
    #best_ams_train2_combinaison, best_ratio_combinaison = tresholding.best_ratio_combinaison_global(yProbaTrain2Binary_s, train2_s[2], train2_s[3], 1)

    yPredictedValid = tresholding.get_yPredicted_ratio(yProbaValidBinary, 0.15)
    yPredictedValid_best_ratio_global = tresholding.get_yPredicted_ratio(
            yProbaValidBinary, best_ratio_global)
    #yPredictedValid_best_ratio_combinaison_s, yPredictedValid_best_ratio_combinaison = tresholding.get_yPredicted_ratio_8(yProbaTrain2Binary_s, best_ratio_combinaison)

    # Let's compute the predicted AMS:
    s, b = submission.get_s_b(yPredictedValid, yValid, weightsValid)
    AMS = hbc.AMS(s, b)

    #s_best_ratio_combinaison, b_best_ratio_combinaison = submission.get_s_b(yPredictedValid_best_ratio_combinaison, yValid, weightsValid)
    #AMS_best_ratio_combinaison = hbc.AMS(s_best_ratio_combinaison, b_best_ratio_combinaison)

    s_best_ratio_global, b_best_ratio_global = submission.get_s_b(
            yPredictedValid_best_ratio_global, yValid, weightsValid)
    AMS_best_ratio_global = hbc.AMS(s_best_ratio_global, b_best_ratio_global)

    print "AMS 0.15 = %f" %AMS
    print " "
    #print "AMS best ratio combi= %f" %AMS_best_ratio_combinaison
    #print "best AMS train2 ratio combinaison= %f" %best_ams_train2_combinaison
    #print "best ratio combinaison train 2 = %s" %str(best_ratio_combinaison)
    print " "
    print "best AMS valid ratio global= %f" %AMS_best_ratio_global
    print "best AMS train2 ratio global= %f" %best_ams_train2_global
    print "best ratio global train2 = %f" %best_ratio_global

    return AMS
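
# For reference, a sketch of the approximate median significance (AMS) from the
# HiggsML challenge, which hbc.AMS is assumed to implement here: with s and b the
# weighted counts of selected signal and background events and b_reg = 10 a
# regularisation term,
#     AMS = sqrt(2 * ((s + b + b_reg) * ln(1 + s / (b + b_reg)) - s))
# The helper below only illustrates that formula and is not used by the pipeline.
def _sketch_ams(s, b, b_reg=10.):
    import math
    return math.sqrt(2. * ((s + b + b_reg) * math.log(1. + s / (b + b_reg)) - s))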