def Blend():
    """Fit a logistic-regression blender on the ``stack_gb`` outputs.

    Reads the pre-processed training CSV to obtain the labels (column 0 of
    each row), gets the train/test blend matrices from
    ``stack_gb.run_stack()``, fits a default ``LogisticRegression`` on the
    training matrix and writes the positive-class probabilities for the test
    matrix to a minute-stamped file under ``../Submissions/``.  Finally prints
    the clamped log loss measured on the same rows the blender was fitted on
    and waits for Enter.
    """
    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2.csv", False)
    target = [row[0] for row in trainBase]

    dataset_blend_train, dataset_blend_test = stack_gb.run_stack()

    clf = LogisticRegression()
    clf.fit(dataset_blend_train, target)

    test_probs = clf.predict_proba(dataset_blend_test)[:, 1]
    submission = ["%f" % p for p in test_probs]

    now = datetime.datetime.now()
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack_" + now.strftime("%Y%m%d%H%M") + ".csv",
        "PreProcessData/test_PatientGuid.csv",
        submission)

    # Attempt to score the training set to predict the score for the blend.
    # This is an in-sample figure: the blender is scored on its own fit data.
    trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
    probSum = 0.0
    for raw_label, probX in zip(target, trainPrediction):
        # Clamp so log() never receives 0 or 1.
        probX = min(max(probX, 0.001), 0.999)
        label = int(raw_label)
        probSum += label * log(probX) + (1 - label) * log(1 - probX)

    # Single-argument print: same bytes as the Python 2 statement
    # `print "Train Score: ", value`.
    print("%s %s" % ("Train Score: ", -probSum / len(trainPrediction)))

    raw_input("Enter to terminate.")
def Blend():
    """Fit a logistic-regression blender on the ``stack_knn`` outputs.

    Labels come from column 0 of the pre-processed training CSV; the blend
    matrices come from ``stack_knn.run_stack()``.  A default
    ``LogisticRegression`` is fitted on the training blend matrix, its
    positive-class probabilities for the test blend matrix are written to a
    minute-stamped ``../Submissions/stack*.csv`` file, and the clamped log
    loss on the fitted rows is printed before waiting for Enter.
    """
    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2.csv", False)
    target = [row[0] for row in trainBase]

    dataset_blend_train, dataset_blend_test = stack_knn.run_stack()

    clf = LogisticRegression()
    clf.fit(dataset_blend_train, target)

    test_probs = clf.predict_proba(dataset_blend_test)[:, 1]
    submission = ["%f" % p for p in test_probs]

    now = datetime.datetime.now()
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack" + now.strftime("%Y%m%d%H%M") + ".csv",
        "PreProcessData/test_PatientGuid.csv",
        submission)

    # Attempt to score the training set to predict the score for the blend
    # (in-sample: measured on the rows the blender was fitted on).
    trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
    probSum = 0.0
    for raw_label, probX in zip(target, trainPrediction):
        # Keep probabilities away from 0 and 1 so both logs stay finite.
        probX = min(max(probX, 0.001), 0.999)
        label = int(raw_label)
        probSum += label * log(probX) + (1 - label) * log(1 - probX)

    # Formatted into one string so the output matches the Python 2
    # statement `print "Train Score: ", value` byte for byte.
    print("%s %s" % ("Train Score: ", -probSum / len(trainPrediction)))

    raw_input("Enter to terminate.")
def Blend():
    """Blend the ``stack`` outputs with six logistic regressions and average.

    The training rows are shuffled with a fixed seed before the labels
    (column 0) are extracted, and the same seed is handed to
    ``stack.run_stack``.  Six ``LogisticRegression`` models (L2 then L1
    penalty, each with C = 1.0, 0.5, 0.1) are fitted on the training blend
    matrix.  Every model writes its own second-stamped submission file and
    prints its clamped in-sample log loss; the row-wise mean of the six
    test-set predictions is written to a ``stack_LG_*`` file at the end.
    """
    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2.csv", False)

    SEED = 448
    random.seed(SEED)
    random.shuffle(trainBase)
    # NOTE(review): presumably run_stack(SEED) applies the same shuffle to its
    # own copy of the training rows so labels and blend rows line up - verify.
    target = [row[0] for row in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)

    clfs = [
        LogisticRegression(penalty=penalty, dual=False, tol=0.0001,
                           C=strength, fit_intercept=True,
                           intercept_scaling=1, class_weight=None)
        for penalty in ('l2', 'l1')
        for strength in (1.0, 0.5, 0.1)
    ]

    # One column of test-set probabilities per blender.  The row count is that
    # of the test blend matrix, which every column written below must match.
    dataset_blend_test_j = np.zeros((len(dataset_blend_test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):
        clf.fit(dataset_blend_train, target)

        test_probs = clf.predict_proba(dataset_blend_test)[:, 1]
        submission = ["%f" % p for p in test_probs]

        # NOTE(review): the stamp has one-second resolution, so two blenders
        # finishing within the same second target the same path.
        now = datetime.datetime.now()
        csv_io.write_delimited_file_GUID(
            "../Submissions/stack" + now.strftime("%Y%m%d%H%M%S") + ".csv",
            "PreProcessData/test_PatientGuid.csv",
            submission)

        # Attempt to score the training set to predict the score for the
        # blend (in-sample figure).
        trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
        probSum = 0.0
        for raw_label, probX in zip(target, trainPrediction):
            # Clamp so log() never receives 0 or 1.
            probX = min(max(probX, 0.001), 0.999)
            label = int(raw_label)
            probSum += label * log(probX) + (1 - label) * log(1 - probX)
        # Same bytes as the Python 2 statement `print "Train Score: ", value`.
        print("%s %s" % ("Train Score: ", -probSum / len(trainPrediction)))

        # Store the numeric probabilities, not the "%f" strings, so the
        # averaged blend is not rounded to six decimals and does not rely on
        # implicit string-to-float casting.
        dataset_blend_test_j[:, ExecutionIndex] = test_probs

    # `now` is the stamp taken for the last blender in the loop above.
    csv_io.write_delimited_file_GUID_numpy(
        "../Submissions/stack_LG_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
        "PreProcessData/test_PatientGuid.csv",
        dataset_blend_test_j.mean(1))

    raw_input("Enter to terminate.")
def Blend():
    """Blend the ``stack`` outputs with six logistic regressions and average.

    Training rows are shuffled with a fixed seed, labels are taken from
    column 0, and ``stack.run_stack`` is called with the same seed to obtain
    the train/test blend matrices.  Six ``LogisticRegression`` models (L2 then
    L1 penalty, each with C = 1.0, 0.5, 0.1) are fitted in turn; each writes a
    second-stamped submission file and prints its clamped in-sample log loss.
    The row-wise mean of their test-set predictions is written to a
    ``stack_LG_*`` file once all six have run.
    """
    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2.csv", False)

    SEED = 448
    random.seed(SEED)
    random.shuffle(trainBase)
    # NOTE(review): presumably run_stack(SEED) shuffles its own copy of the
    # training rows identically so labels match the blend rows - verify.
    target = [row[0] for row in trainBase]

    dataset_blend_train, dataset_blend_test = stack.run_stack(SEED)

    clfs = [
        LogisticRegression(penalty=penalty, dual=False, tol=0.0001,
                           C=strength, fit_intercept=True,
                           intercept_scaling=1, class_weight=None)
        for penalty in ('l2', 'l1')
        for strength in (1.0, 0.5, 0.1)
    ]

    # Per-blender test predictions; sized from the test blend matrix because
    # every column assigned below comes from predictions over that matrix.
    dataset_blend_test_j = np.zeros((len(dataset_blend_test), len(clfs)))

    for ExecutionIndex, clf in enumerate(clfs):
        clf.fit(dataset_blend_train, target)

        test_probs = clf.predict_proba(dataset_blend_test)[:, 1]
        submission = ["%f" % p for p in test_probs]

        # NOTE(review): one-second stamp resolution - blenders that finish in
        # the same second produce the same file name.
        now = datetime.datetime.now()
        csv_io.write_delimited_file_GUID(
            "../Submissions/stack" + now.strftime("%Y%m%d%H%M%S") + ".csv",
            "PreProcessData/test_PatientGuid.csv",
            submission)

        # Attempt to score the training set to predict the score for the
        # blend (in-sample figure).
        trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
        probSum = 0.0
        for raw_label, probX in zip(target, trainPrediction):
            # Keep probabilities away from 0 and 1 so both logs stay finite.
            probX = min(max(probX, 0.001), 0.999)
            label = int(raw_label)
            probSum += label * log(probX) + (1 - label) * log(1 - probX)
        # Same bytes as the Python 2 statement `print "Train Score: ", value`.
        print("%s %s" % ("Train Score: ", -probSum / len(trainPrediction)))

        # Keep full-precision floats in the blend matrix; the "%f" strings
        # are only for the per-model submission file.
        dataset_blend_test_j[:, ExecutionIndex] = test_probs

    # `now` still holds the stamp of the last blender fitted above.
    csv_io.write_delimited_file_GUID_numpy(
        "../Submissions/stack_LG_" + now.strftime("%Y%m%d%H%M%S") + ".csv",
        "PreProcessData/test_PatientGuid.csv",
        dataset_blend_test_j.mean(1))

    raw_input("Enter to terminate.")
def run_stack():
    """Build train/test blend matrices from five tree-ensemble classifiers.

    For every classifier a stratified 5-fold split of the training rows is
    run (label in column 0, features in the remaining columns).  Predictions
    on each held-out fold fill that classifier's column of the training blend
    matrix; the five per-fold predictions for the test rows are averaged into
    the matching column of the test blend matrix.  Per-fold diagnostics and
    the mean clamped log loss are printed, and a combined prediction across
    classifiers is written to ``../Submissions/rf2_5fold_avg.csv``.

    Returns:
        ``(dataset_blend_train, dataset_blend_test)``: numpy arrays with one
        row per training / test row and one column per classifier.
    """
    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2.csv", False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)

    NumFolds = 5
    spanDistance = 12
    bootstrapLists = []

    # NOTE(review): `learn_rate` is the keyword of old scikit-learn releases
    # (later renamed `learning_rate`) - confirm against the pinned version.
    clfs = [
        RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=1,
                               criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
        GradientBoostingClassifier(learn_rate=0.05, subsample=0.5,
                                   max_depth=6, n_estimators=50),
    ]

    # Single-argument prints throughout emit the same bytes as the Python 2
    # statement form `print a, b`.
    print("%s %s" % (len(trainBase), len(test)))

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    labels = [row[0] for row in trainBase]

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0
        predicted_list = []  # one list of test-set probabilities per fold
        dataset_blend_test_j = np.zeros((len(test), NumFolds))

        Folds = cross_validation.StratifiedKFold(labels, k=NumFolds,
                                                 indices=True)
        for foldCount, (train_index, test_index) in enumerate(Folds):
            target = [trainBase[i][0] for i in train_index]
            train = [trainBase[i][1:] for i in train_index]
            targetTest = [trainBase[i][0] for i in test_index]
            trainTest = [trainBase[i][1:] for i in test_index]

            clf.fit(train, target)
            prob = clf.predict_proba(trainTest)
            # Out-of-fold predictions become the blender's training feature.
            dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1]

            probSum = 0
            totalOffByHalf = 0
            totalPositive = 0
            totalPositiveOffByHalf = 0
            totalPositivePredictions = 0
            for row_prob, raw_label in zip(prob, targetTest):
                # Clamp so log() never receives 0 or 1.
                probX = min(max(row_prob[1], 0.001), 0.999)
                label = int(raw_label)
                probSum += label * log(probX) + (1 - label) * log(1 - probX)
                if math.fabs(probX - label) > 0.5:
                    totalOffByHalf += 1
                if label == 1:
                    totalPositive += 1
                    if probX < 0.5:
                        totalPositiveOffByHalf += 1
                if probX > 0.5:
                    totalPositivePredictions += 1

            print("%s %s" % ("Total Off By > 0.5 ", totalOffByHalf))
            print("%s %s" % ("Total Positive ", totalPositive))
            print("%s %s" % ("Total Positive Off By Half ",
                             totalPositiveOffByHalf))
            print("%s %s" % ("Total Positive Predictions ",
                             totalPositivePredictions))
            fold_loss = -probSum / len(prob)
            print(fold_loss)
            avg += fold_loss / NumFolds

            predicted_probs = clf.predict_proba(test)
            predicted_list.append([p[1] for p in predicted_probs])
            dataset_blend_test_j[:, foldCount] = predicted_probs[:, 1]

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_j.mean(1)

        print("%s %s" % ("------------------------Average: ", avg))

        # For N folds, get the average for each prediction item in test set.
        avg_list = np.zeros(len(test))
        for p in range(len(test)):
            avg_list[p] = mean([fold[p] for fold in predicted_list])
        bootstrapLists.append(avg_list)

    # Primitive stacking across classifiers; the blend matrices returned
    # below are what a formal stacker consumes.
    if len(bootstrapLists) > 1:
        finalList = [
            meanSpan([run[p] for run in bootstrapLists], spanDistance)
            for p in range(len(test))
        ]
    else:
        finalList = bootstrapLists[0]

    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file_GUID("../Submissions/rf2_5fold_avg.csv",
                                     "PreProcessData/test_PatientGuid.csv",
                                     avg_values)

    return dataset_blend_train, dataset_blend_test
def run_stack():
    """Build blend matrices from a grid of gradient-boosting classifiers.

    One ``GradientBoostingClassifier`` is trained per (subsample, max_depth)
    pair, subsample varying in the outer loop.  For each, a stratified 5-fold
    split of the training rows is run (label in column 0, features in the
    remaining columns): held-out predictions fill that model's column of the
    training blend matrix and the per-fold test predictions are averaged into
    the matching column of the test blend matrix.  All feature columns are
    used; no feature selection is applied.  Each model's mean clamped log
    loss is appended to ``stack_gb_data.txt`` and a combined prediction is
    written to ``../Submissions/gb_5fold_avg.csv``.

    Returns:
        ``(dataset_blend_train, dataset_blend_test)``: numpy arrays with one
        row per training / test row and one column per grid point.
    """
    # Single-argument prints throughout emit the same bytes as the Python 2
    # statement form `print a, b`.
    print("Running GB Stack")

    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2.csv", False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)

    NumFolds = 5
    spanDistance = 12
    bootstrapLists = []
    rnd_start = 456

    subsample_grid = [0.8, 0.4, 0.2, 0.1, 0.05, 0.025]
    max_depth_grid = [4, 8, 12, 18]
    grid = [(s, d) for s in subsample_grid for d in max_depth_grid]

    print("%s %s" % (len(trainBase), len(test)))

    dataset_blend_train = np.zeros((len(trainBase), len(grid)))
    dataset_blend_test = np.zeros((len(test), len(grid)))

    print("Start Feaure Select")
    print("End Feaure Select")

    labels = [row[0] for row in trainBase]

    for ExecutionIndex, (subsample, max_depth) in enumerate(grid):
        # The printed labels say "n_est"/"learn_r", but the values are the
        # subsample fraction and the tree depth passed to the classifier.
        print("%s %s %s %s" % ("n_est ", subsample, "learn_r ", max_depth))

        # NOTE(review): `learn_rate` is the keyword of old scikit-learn
        # releases (later `learning_rate`) - confirm the pinned version.
        clf = GradientBoostingClassifier(
            loss='deviance', learn_rate=0.05, n_estimators=50,
            subsample=subsample, min_samples_split=1, min_samples_leaf=1,
            max_depth=max_depth, init=None, random_state=rnd_start)
        print(clf)

        avg = 0
        predicted_list = []  # one list of test-set probabilities per fold
        dataset_blend_test_j = np.zeros((len(test), NumFolds))

        Folds = cross_validation.StratifiedKFold(labels, k=NumFolds,
                                                 indices=True)
        for foldCount, (train_index, test_index) in enumerate(Folds):
            target = [trainBase[i][0] for i in train_index]
            train = [trainBase[i][1:] for i in train_index]
            targetTest = [trainBase[i][0] for i in test_index]
            trainTest = [trainBase[i][1:] for i in test_index]

            print("%s %s %s" % ("LEN: ", len(train), len(target)))

            clf.fit(train, target)
            prob = clf.predict_proba(trainTest)
            # Out-of-fold predictions become the blender's training feature.
            dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1]

            probSum = 0
            totalOffByHalf = 0
            totalPositive = 0
            totalPositiveOffByHalf = 0
            totalPositivePredictions = 0
            for row_prob, raw_label in zip(prob, targetTest):
                # Clamp so log() never receives 0 or 1.
                probX = min(max(row_prob[1], 0.001), 0.999)
                label = int(raw_label)
                probSum += label * log(probX) + (1 - label) * log(1 - probX)
                if math.fabs(probX - label) > 0.5:
                    totalOffByHalf += 1
                if label == 1:
                    totalPositive += 1
                    if probX < 0.5:
                        totalPositiveOffByHalf += 1
                if probX > 0.5:
                    totalPositivePredictions += 1

            print("%s %s" % ("Total Off By > 0.5 ", totalOffByHalf))
            print("%s %s" % ("Total Positive ", totalPositive))
            print("%s %s" % ("Total Positive Off By Half ",
                             totalPositiveOffByHalf))
            print("%s %s" % ("Total Positive Predictions ",
                             totalPositivePredictions))
            fold_loss = -probSum / len(prob)
            print(fold_loss)
            avg += fold_loss / NumFolds

            predicted_probs = clf.predict_proba(test)
            predicted_list.append([p[1] for p in predicted_probs])
            dataset_blend_test_j[:, foldCount] = predicted_probs[:, 1]

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_j.mean(1)

        print("%s %s" % (
            "------------------------------------------------Average: ", avg))

        # Append this grid point's result; the context manager closes the
        # handle deterministically instead of leaving it to the collector.
        with open("stack_gb_data.txt", "a") as results_file:
            results_file.write(
                str(subsample) + ',' + str(max_depth) + ',' + str(avg) + "\n")

        # For N folds, get the average for each prediction item in test set.
        avg_list = np.zeros(len(test))
        for p in range(len(test)):
            avg_list[p] = mean([fold[p] for fold in predicted_list])
        bootstrapLists.append(avg_list)

    # Primitive stacking across grid points; the blend matrices returned
    # below are what a formal stacker consumes.
    if len(bootstrapLists) > 1:
        finalList = [
            meanSpan([run[p] for run in bootstrapLists], spanDistance)
            for p in range(len(test))
        ]
    else:
        finalList = bootstrapLists[0]

    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file_GUID("../Submissions/gb_5fold_avg.csv",
                                     "PreProcessData/test_PatientGuid.csv",
                                     avg_values)

    return dataset_blend_train, dataset_blend_test
def _clamped_log_loss(labels, probabilities):
    """Return the mean negative log-likelihood of 0/1 ``labels``.

    Each probability is clamped into [0.001, 0.999] first so ``log`` never
    receives 0 or 1.  Labels are converted with ``int``.
    """
    total = 0.0
    for raw_label, prob in zip(labels, probabilities):
        prob = min(max(prob, 0.001), 0.999)
        label = int(raw_label)
        total += label * log(prob) + (1 - label) * log(1 - prob)
    return -total / len(probabilities)


def Blend():
    """Fit a logistic-regression blender on ``stack_rf`` and try rescalings.

    Training rows are shuffled with a fixed seed, labels are taken from
    column 0 and ``stack_rf.run_stack`` is called with the same seed.  A
    default ``LogisticRegression`` is fitted on the training blend matrix and
    its test-set probabilities are written to a minute-stamped submission
    file.  The clamped in-sample log loss is printed for the raw predictions
    and for three ``stack_rf.SimpleScale`` rescalings; the rescaled test
    predictions are then written to three further submission files before
    waiting for Enter.
    """
    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2.csv", False)

    SEED = 448
    random.seed(SEED)
    random.shuffle(trainBase)
    # NOTE(review): presumably run_stack(SEED) shuffles its own copy of the
    # training rows identically so labels match the blend rows - verify.
    target = [row[0] for row in trainBase]

    dataset_blend_train, dataset_blend_test = stack_rf.run_stack(SEED)

    clf = LogisticRegression()
    clf.fit(dataset_blend_train, target)

    submission = clf.predict_proba(dataset_blend_test)[:, 1]
    submission = ["%f" % p for p in submission]

    now = datetime.datetime.now()
    stamp = now.strftime("%Y%m%d%H%M")
    csv_io.write_delimited_file_GUID(
        "../Submissions/stack" + stamp + ".csv",
        "PreProcessData/test_PatientGuid.csv",
        submission)

    # Attempt to score the training set to predict the score for the blend
    # (in-sample figure).  Single-argument prints emit the same bytes as the
    # Python 2 statement form `print label, value`.
    trainPrediction = clf.predict_proba(dataset_blend_train)[:, 1]
    print("%s %s" % ("Train Score: ",
                     _clamped_log_loss(target, trainPrediction)))

    # (floor, ceiling, score label, submission file suffix)
    scalings = [
        (0.001, 0.999,
         "Train Score for 0.999 and 0.001 with SimpleScale: ",
         "_SimpleScale999.csv"),
        (0.01, 0.99,
         "Train Score for 0.99 and 0.01 with SimpleScale: ",
         "_SimpleScale99.csv"),
        (0.05, 0.95,
         "Train Score for 0.95 and 0.05 with SimpleScale: ",
         "_SimpleScale95.csv"),
    ]

    for floor, ceiling, score_label, _suffix in scalings:
        trainPredictionNew = stack_rf.SimpleScale(
            trainPrediction, floor=floor, ceiling=ceiling)
        print("%s %s" % (score_label,
                         _clamped_log_loss(target, trainPredictionNew)))

    # NOTE(review): `submission` holds "%f"-formatted strings at this point,
    # so SimpleScale receives strings here but floats above - confirm that
    # SimpleScale handles both.
    for floor, ceiling, _score_label, suffix in scalings:
        submissionNew = stack_rf.SimpleScale(
            submission, floor=floor, ceiling=ceiling)
        csv_io.write_delimited_file_GUID(
            "../Submissions/stack" + stamp + suffix,
            "PreProcessData/test_PatientGuid.csv",
            submissionNew)

    raw_input("Enter to terminate.")
def run_stack(SEED):
    """Build train/test blend matrices from GB and RF classifiers.

    The training rows are shuffled with ``random`` seeded by ``SEED``.  For
    each of the four classifiers a stratified 10-fold split is run (label in
    column 0, features in the remaining columns): held-out predictions fill
    that classifier's column of the training blend matrix and the per-fold
    test predictions are averaged into the matching column of the test blend
    matrix.  All feature columns are used; no feature selection is applied.
    Per-fold diagnostics are printed and a combined prediction is written to
    ``../Submissions/rf2_5fold_avg.csv``.

    Args:
        SEED: seed passed to ``random.seed`` before shuffling the training
            rows.

    Returns:
        ``(dataset_blend_train, dataset_blend_test)``: numpy arrays with one
        row per training / test row and one column per classifier.
    """
    # Single-argument prints throughout emit the same bytes as the Python 2
    # statement form `print a, b`.
    print("Running GB, RF, ET stack x2")

    trainBase = csv_io.read_data(
        "PreProcessData/training_PreProcess2.csv", False)
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False)

    random.seed(SEED)
    random.shuffle(trainBase)

    NumFolds = 10
    spanDistance = 12
    bootstrapLists = []

    # Note: 50, 100, 150 or 200 can be used for n_estimators for ET and RF.
    # NOTE(review): `learn_rate` is the keyword of old scikit-learn releases
    # (later renamed `learning_rate`) - confirm against the pinned version.
    clfs = [
        GradientBoostingClassifier(learn_rate=0.05, subsample=0.5,
                                   max_depth=6, n_estimators=50),
        GradientBoostingClassifier(learn_rate=0.02, subsample=0.2,
                                   max_depth=8, n_estimators=125),
        RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=1,
                               criterion='entropy'),
    ]

    print("%s %s" % (len(trainBase), len(test)))

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Start Feaure Select")
    print("End Feaure Select")

    labels = [row[0] for row in trainBase]

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)

        avg = 0
        predicted_list = []  # one list of test-set probabilities per fold
        dataset_blend_test_j = np.zeros((len(test), NumFolds))

        # StratifiedShuffleSplit has much poorer performance than
        # StratifiedKFold.  Shuffle/bootstrap splitters also do not promise
        # that every row lands in a held-out fold, which would leave missing
        # values in the training blend matrix.
        Folds = cross_validation.StratifiedKFold(labels, k=NumFolds,
                                                 indices=True)
        for foldCount, (train_index, test_index) in enumerate(Folds):
            target = [trainBase[i][0] for i in train_index]
            train = [trainBase[i][1:] for i in train_index]
            targetTest = [trainBase[i][0] for i in test_index]
            trainTest = [trainBase[i][1:] for i in test_index]

            print("%s %s %s" % ("LEN: ", len(train), len(target)))

            clf.fit(train, target)
            prob = clf.predict_proba(trainTest)
            # Out-of-fold predictions become the blender's training feature.
            dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1]

            probSum = 0
            totalOffByHalf = 0
            totalPositive = 0
            totalPositiveOffByHalf = 0
            totalPositivePredictions = 0
            for row_prob, raw_label in zip(prob, targetTest):
                # Clamp so log() never receives 0 or 1.
                probX = min(max(row_prob[1], 0.001), 0.999)
                label = int(raw_label)
                probSum += label * log(probX) + (1 - label) * log(1 - probX)
                if math.fabs(probX - label) > 0.5:
                    totalOffByHalf += 1
                if label == 1:
                    totalPositive += 1
                    if probX < 0.5:
                        totalPositiveOffByHalf += 1
                if probX > 0.5:
                    totalPositivePredictions += 1

            print("%s %s" % ("Total Off By > 0.5 ", totalOffByHalf))
            print("%s %s" % ("Total Positive ", totalPositive))
            print("%s %s" % ("Total Positive Off By Half ",
                             totalPositiveOffByHalf))
            print("%s %s" % ("Total Positive Predictions ",
                             totalPositivePredictions))
            fold_loss = -probSum / len(prob)
            print(fold_loss)
            avg += fold_loss / NumFolds

            predicted_probs = clf.predict_proba(test)
            predicted_list.append([p[1] for p in predicted_probs])
            dataset_blend_test_j[:, foldCount] = predicted_probs[:, 1]

        # Mean over folds (a median was considered but is not implemented).
        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_j.mean(1)

        print("%s %s" % ("------------------------Average: ", avg))

        # For N folds, get the average for each prediction item in test set.
        avg_list = np.zeros(len(test))
        for p in range(len(test)):
            avg_list[p] = mean([fold[p] for fold in predicted_list])
        bootstrapLists.append(avg_list)

    # Primitive stacking across classifiers; the blend matrices returned
    # below are what a formal stacker consumes.
    if len(bootstrapLists) > 1:
        finalList = [
            meanSpan([run[p] for run in bootstrapLists], spanDistance)
            for p in range(len(test))
        ]
    else:
        finalList = bootstrapLists[0]

    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file_GUID("../Submissions/rf2_5fold_avg.csv",
                                     "PreProcessData/test_PatientGuid.csv",
                                     avg_values)

    return dataset_blend_train, dataset_blend_test
def run_stack(SEED): print "Running GB, RF, ET stack x2" trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False) test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False) random.seed(SEED) random.shuffle(trainBase) avg = 0 NumFolds = 10 # should be odd for median NumFeatures = 1000 predicted_list = [] spanDistance = 12 bootstrapLists = [] #clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50) # ] # try to vary n_est #clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy')] clfs = [ GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'), RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy') ] # note, can use 50, 100, 150, 200 for n_estimators for ET and RF # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3), # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, 
n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7), # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8), # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200), # GradientBoostingClassifier(lesarn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240), # 
GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3), # RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7), # RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)] #clfs = [GradientBoostingClassifier(learn_rate=0.2, subsample=0.2, max_depth=8, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.1, subsample=0.2, max_depth=8, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.1, subsample=0.2, max_depth=8, n_estimators=320), # GradientBoostingClassifier(learn_rate=0.05, subsample=0.2, max_depth=8, n_estimators=640), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True), # RandomForestClassifier(n_estimators=100, n_jobs=1, 
criterion='entropy',compute_importances=True), # ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='gini', compute_importances=True), # ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True)] print len(trainBase), len(test) dataset_blend_train = np.zeros((len(trainBase), len(clfs))) dataset_blend_test = np.zeros((len(test), len(clfs))) trainNew = [] trainTestNew = [] testNew = [] trainNewSelect = [] trainTestNewSelect = [] testNewSelect = [] print "Start Feaure Select" #f_classif(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase])) #print "done1" #fs = SelectKBest(chi2, k=NumFeatures) #fs.fit(scipy.array([x[1:] for x in trainBase]), scipy.array([x[0] for x in trainBase])) #fs.fit(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase])) print "End Feaure Select" LastClassifier = "" for ExecutionIndex, clf in enumerate(clfs): print clf avg = 0 predicted_list = [] dataset_blend_test_j = np.zeros((len(test), NumFolds)) foldCount = 0 #print [trainBase[i][0] for i in range(len(trainBase))] #Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None) #StratifiedShuffleSplit has much poorer performance than StratifiedKFold #NOTE, the shuffle and bootstrap don't promise all elements are used in training, and then the blend has missing values which means it won't predicte correctly. 
#Folds = StratifiedShuffleSplit([trainBase[i][0] for i in range(len(trainBase))], NumFolds, indices=True) #Folds = cross_validation.Bootstrap(len(trainBase), n_bootstraps=5, train_size=0.8, random_state=0) Folds = cross_validation.StratifiedKFold( [trainBase[i][0] for i in range(len(trainBase))], k=NumFolds, indices=True) for train_index, test_index in Folds: trainBaseTemp = [trainBase[i] for i in train_index] target = [x[0] for x in trainBaseTemp] train = [x[1:] for x in trainBaseTemp] testBaseTemp = [trainBase[i] for i in test_index] targetTest = [x[0] for x in testBaseTemp] trainTest = [x[1:] for x in testBaseTemp] test = [x[0:] for x in test] #rf = RandomForestClassifier(n_estimators=n_est, criterion='entropy', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) # , max_features=None print "LEN: ", len(train), len(target) if (False and LastClassifier != str(clf)[:10] and (str(clf).startswith('RandomForest') or str(clf).startswith('ExtraTrees'))): clf.fit(train, target) LastClassifier = str(clf)[:10] print "Computing Importances" importances = clf.feature_importances_ #print importances importancesTemp = sorted(importances, reverse=True) print len(importancesTemp), "importances" if (len(importancesTemp) > NumFeatures): threshold = importancesTemp[NumFeatures] #print "Sorted and deleted importances" #print importancesTemp for row in train: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) trainNew.append(newRow) for row in trainTest: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) trainTestNew.append(newRow) for row in test: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): #print impIndex, len(importances) 
newRow.append(row[impIndex]) testNew.append(newRow) else: trainNew = train trainTestNew = trainTest testNew = test else: #trainNew = fs.transform(train) #trainTestNew = fs.transform(trainTest) #testNew = fs.transform(test) trainNew = train trainTestNew = trainTest testNew = test clf.fit(trainNew, target) prob = clf.predict_proba(trainTestNew) dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1] probSum = 0 totalOffByHalf = 0 totalPositive = 0 totalPositiveOffByHalf = 0 totalPositivePredictions = 0 for i in range(0, len(prob)): probX = prob[i][1] # [1] if (probX > 0.999): probX = 0.999 if (probX < 0.001): probX = 0.001 #print i, probSum, probX, targetTest[i] #print target[i]*log(probX), (1-target[i])*log(1-probX) probSum += int(targetTest[i]) * log(probX) + ( 1 - int(targetTest[i])) * log(1 - probX) if (math.fabs(probX - int(targetTest[i])) > 0.5): totalOffByHalf = totalOffByHalf + 1 if (int(targetTest[i]) == 1): totalPositive = totalPositive + 1 if (int(targetTest[i]) == 1 and probX < 0.5): totalPositiveOffByHalf = totalPositiveOffByHalf + 1 if (probX > 0.5): totalPositivePredictions = totalPositivePredictions + 1 print "Total Off By > 0.5 ", totalOffByHalf print "Total Positive ", totalPositive print "Total Positive Off By Half ", totalPositiveOffByHalf print "Total Positive Predictions ", totalPositivePredictions print -probSum / len(prob) avg += (-probSum / len(prob)) / NumFolds predicted_probs = clf.predict_proba(testNew) # was test #print [x[1] for x in predicted_probs] predicted_list.append([x[1] for x in predicted_probs]) dataset_blend_test_j[:, foldCount] = predicted_probs[:, 1] foldCount = foldCount + 1 dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_j.mean( 1) # try median here (seems not implemented) now = datetime.datetime.now() #csv_io.write_delimited_file_GUID("../Submissions/stack_avg" + now.strftime("%Y%m%d%H%M") + "_" + str(avg) + ".csv", "PreProcessData/test_PatientGuid.csv", dataset_blend_test_j.mean(1)) print 
"------------------------Average: ", avg avg_list = np.zeros(len(test)) med_list = np.zeros(len(test)) # For N folds, get the average/median for each prediction item in test set. for p in range(0, len(test)): temp_list = [] for q in range(0, len(predicted_list)): temp_list.append(predicted_list[q][p]) avg_list[p] = mean(temp_list) med_list[p] = getMedian(temp_list) #print p, q, temp_list, mean(temp_list), getMedian(temp_list) bootstrapLists.append(avg_list) # This would be used if we ran multiple runs with different training values. # Primitive stacking, should rather save data, and do formal stacking. if (len(bootstrapLists) > 1): finalList = [] for p in range(0, len(test)): temp_list = [] for q in range(0, len(bootstrapLists)): temp_list.append(bootstrapLists[q][p]) finalList.append(meanSpan(temp_list, spanDistance)) #print p, q, temp_list, meanSpan(temp_list, spanDistance) else: finalList = bootstrapLists[0] #finalList = SimpleScale(finalList) avg_values = ["%f" % x for x in finalList] csv_io.write_delimited_file_GUID("../Submissions/rf2_5fold_avg.csv", "PreProcessData/test_PatientGuid.csv", avg_values) #for rec in dataset_blend_train: # print rec return dataset_blend_train, dataset_blend_test
def run_stack(): print "Running KNN Stack" trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False) test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False) avg = 0 NumFolds = 5 # should be odd for median NumFeatures = 1000 predicted_list = [] spanDistance = 12 bootstrapLists = [] #clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True), # ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='gini', compute_importances=True), # ExtraTreesClassifier(n_estimators=100, n_jobs=1, criterion='entropy',compute_importances=True)] rnd_start = 456 CC = [10,30,50,100] gg = [0] print len(trainBase), len(test) dataset_blend_train = np.zeros((len(trainBase), len(CC)*len(gg))) dataset_blend_test = np.zeros((len(test), len(CC)*len(gg))) trainNew = [] trainTestNew = [] testNew = [] trainNewSelect = [] trainTestNewSelect = [] testNewSelect = [] print "Start Feaure Select" #f_classif(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase])) #print "done1" #fs = SelectKBest(chi2, k=NumFeatures) #fs.fit(scipy.array([x[1:] for x in trainBase]), scipy.array([x[0] for x in trainBase])) #fs.fit(np.array([x[1:] for x in trainBase]), np.array([x[0] for x in trainBase])) print "End Feaure Select" LastClassifier = "" ExecutionIndex = 0 #for ExecutionIndex, clf in enumerate(clfs): for g in gg: for C in CC: print "g ", g, " C " ,C clf = KNeighborsClassifier(n_neighbors=C, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2) print clf avg = 0 predicted_list = [] dataset_blend_test_j = np.zeros((len(test), NumFolds)) foldCount = 0 #print [trainBase[i][0] for i in range(len(trainBase))] 
#Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None) Folds = cross_validation.StratifiedKFold([trainBase[i][0] for i in range(len(trainBase))], k=NumFolds, indices=True) for train_index, test_index in Folds: trainBaseTemp = [trainBase[i] for i in train_index] target = [x[0] for x in trainBaseTemp] train = [x[1:] for x in trainBaseTemp] testBaseTemp = [trainBase[i] for i in test_index] targetTest = [x[0] for x in testBaseTemp] trainTest = [x[1:] for x in testBaseTemp] test = [x[0:] for x in test] #rf = RandomForestClassifier(n_estimators=n_est, criterion='entropy', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) # , max_features=None print "LEN: ", len(train), len(target) if (False and LastClassifier != str(clf)[:10] and (str(clf).startswith( 'RandomForest' ) or str(clf).startswith( 'ExtraTrees' ))) : clf.fit(train, target) LastClassifier = str(clf)[:10] print "Computing Importances" importances = clf.feature_importances_ #print importances importancesTemp = sorted(importances, reverse=True) print len(importancesTemp), "importances" if ( len(importancesTemp) > NumFeatures): threshold = importancesTemp[NumFeatures] #print "Sorted and deleted importances" #print importancesTemp for row in train: newRow = [] for impIndex, importance in enumerate(importances): if ( importance > threshold ) : newRow.append(row[impIndex]) trainNew.append(newRow) for row in trainTest: newRow = [] for impIndex, importance in enumerate(importances): if ( importance > threshold ) : newRow.append(row[impIndex]) trainTestNew.append(newRow) for row in test: newRow = [] for impIndex, importance in enumerate(importances): if ( importance > threshold ) : #print impIndex, len(importances) newRow.append(row[impIndex]) testNew.append(newRow) else: trainNew = train trainTestNew = 
trainTest testNew = test else: #trainNew = fs.transform(train) #trainTestNew = fs.transform(trainTest) #testNew = fs.transform(test) trainNew = train trainTestNew = trainTest testNew = test clf.fit(trainNew, target) prob = clf.predict_proba(trainTestNew) dataset_blend_train[test_index, ExecutionIndex] = prob[:,1] probSum = 0 totalOffByHalf = 0 totalPositive = 0 totalPositiveOffByHalf = 0 totalPositivePredictions = 0 for i in range(0, len(prob)): probX = prob[i][1] # [1] if ( probX > 0.999): probX = 0.999; if ( probX < 0.001): probX = 0.001; #print i, probSum, probX, targetTest[i] #print target[i]*log(probX), (1-target[i])*log(1-probX) probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX) if ( math.fabs(probX - int(targetTest[i])) > 0.5 ): totalOffByHalf = totalOffByHalf + 1 if ( int(targetTest[i]) == 1 ): totalPositive = totalPositive + 1 if ( int(targetTest[i]) == 1 and probX < 0.5): totalPositiveOffByHalf = totalPositiveOffByHalf + 1 if (probX > 0.5): totalPositivePredictions = totalPositivePredictions + 1 print "Total Off By > 0.5 ", totalOffByHalf print "Total Positive ", totalPositive print "Total Positive Off By Half ", totalPositiveOffByHalf print "Total Positive Predictions ", totalPositivePredictions print -probSum/len(prob) avg += (-probSum/len(prob))/NumFolds predicted_probs = clf.predict_proba(testNew) # was test #print [x[1] for x in predicted_probs] predicted_list.append([x[1] for x in predicted_probs]) dataset_blend_test_j[:, foldCount] = predicted_probs[:,1] foldCount = foldCount + 1 #break ## ****************************************************************** cut off cross folds to 1. 
dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_j.mean(1) print "------------------------------------------------Average: ", avg open("stack_svm_poly_data.txt","a").write(str(g)+','+str(C)+','+str(avg)+"\n") avg_list = np.zeros(len(test)) med_list = np.zeros(len(test)) # For N folds, get the average/median for each prediction item in test set. for p in range(0, len(test)): temp_list =[] for q in range(0, len(predicted_list)): temp_list.append( predicted_list[q][p]) avg_list[p] = mean(temp_list) med_list[p] = getMedian(temp_list) #print p, q, temp_list, mean(temp_list), getMedian(temp_list) bootstrapLists.append(avg_list) ExecutionIndex = ExecutionIndex + 1 # This would be used if we ran multiple runs with different training values. # Primitive stacking, should rather save data, and do formal stacking. if ( len(bootstrapLists) > 1 ): finalList = [] for p in range(0, len(test)): temp_list =[] for q in range(0, len(bootstrapLists)): temp_list.append( bootstrapLists[q][p]) finalList.append( meanSpan(temp_list, spanDistance) ) #print p, q, temp_list, meanSpan(temp_list, spanDistance) else: finalList = bootstrapLists[0] #finalList = SimpleScale(finalList) avg_values = ["%f" % x for x in finalList] csv_io.write_delimited_file_GUID("../Submissions/gb_5fold_avg.csv", "PreProcessData/test_PatientGuid.csv", avg_values) #for rec in dataset_blend_train: # print rec return dataset_blend_train, dataset_blend_test
def run_rf(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", False) test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", False) test = [x[0:] for x in test] avg = 0 NumFolds = 5 # should be odd for median predicted_list = [] spanDistance = 12 bootstrapLists = [] NEstimators = [150, 250] # [50,100,150,200,300,400,500,600] ExecutionIndex = 0 print len(trainBase), len(test) dataset_blend_train = np.zeros((len(trainBase), len(NEstimators))) dataset_blend_test = np.zeros((len(test), len(NEstimators))) for n_est in NEstimators: predicted_list = [] avg = 0 dataset_blend_test_j = np.zeros((len(test), NumFolds)) foldCount = 0 Folds = cross_validation.KFold(len(trainBase) - 1, k=NumFolds, indices=True, shuffle=False, random_state=None) for train_index, test_index in Folds: trainBaseTemp = [trainBase[i] for i in train_index] target = [x[0] for x in trainBaseTemp] train = [x[1:] for x in trainBaseTemp] testBaseTemp = [trainBase[i] for i in test_index] targetTest = [x[0] for x in testBaseTemp] trainTest = [x[1:] for x in testBaseTemp] rf = RandomForestClassifier(n_estimators=n_est, criterion='entropy', max_depth=None, min_samples_split=1, min_samples_leaf=1, min_density=0.10000000000000001, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=1, random_state=None, verbose=0) # , max_features=None rf.fit(train, target) prob = rf.predict_proba(trainTest) dataset_blend_train[test_index, ExecutionIndex] = prob[:, 1] probSum = 0 totalOffByHalf = 0 totalPositive = 0 totalPositiveOffByHalf = 0 totalPositivePredictions = 0 for i in range(0, len(prob)): probX = prob[i][1] # [1] if (probX > 0.999): probX = 0.999 if (probX < 0.001): probX = 0.001 #print i, probSum, probX, targetTest[i] #print target[i]*log(probX), (1-target[i])*log(1-probX) probSum += int(targetTest[i]) * log(probX) + ( 1 - int(targetTest[i])) * log(1 - probX) if (math.fabs(probX - int(targetTest[i])) > 0.5): totalOffByHalf = totalOffByHalf + 1 if 
(int(targetTest[i]) == 1): totalPositive = totalPositive + 1 if (int(targetTest[i]) == 1 and probX < 0.5): totalPositiveOffByHalf = totalPositiveOffByHalf + 1 if (probX > 0.5): totalPositivePredictions = totalPositivePredictions + 1 print "Total Off By > 0.5 ", totalOffByHalf print "Total Positive ", totalPositive print "Total Positive Off By Half ", totalPositiveOffByHalf print "Total Positive Predictions ", totalPositivePredictions print "NEstimators: ", n_est print -probSum / len(prob) avg += (-probSum / len(prob)) / NumFolds predicted_probs = rf.predict_proba(test) # was test predicted_list.append([x[1] for x in predicted_probs]) dataset_blend_test_j[:, foldCount] = predicted_probs[:, 1] foldCount = foldCount + 1 dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_j.mean(1) print "------------------------Average: ", avg avg_list = np.zeros(len(test)) med_list = np.zeros(len(test)) # For N folds, get the average/median for each prediction item in test set. for p in range(0, len(test)): temp_list = [] for q in range(0, len(predicted_list)): temp_list.append(predicted_list[q][p]) avg_list[p] = mean(temp_list) med_list[p] = getMedian(temp_list) #print p, q, temp_list, mean(temp_list), getMedian(temp_list) bootstrapLists.append(avg_list) ExecutionIndex = ExecutionIndex + 1 # This would be used if we ran multiple runs with different training values. # Primitive stacking, should rather save data, and do formal stacking. 
if (len(bootstrapLists) > 1): finalList = [] for p in range(0, len(test)): temp_list = [] for q in range(0, len(bootstrapLists)): temp_list.append(bootstrapLists[q][p]) finalList.append(meanSpan(temp_list, spanDistance)) #print p, q, temp_list, meanSpan(temp_list, spanDistance) else: finalList = bootstrapLists[0] finalList = SimpleScale(finalList) avg_values = ["%f" % x for x in finalList] csv_io.write_delimited_file_GUID("../Submissions/rf2_5fold_avg.csv", "PreProcessData/test_PatientGuid.csv", avg_values) #for rec in dataset_blend_train: # print rec return dataset_blend_train, dataset_blend_test