def compute_SVR(train_x, train_y, test_x):
    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    ######### SVR - Polynomial/rbf Kernel #########
    # make pipeline
    std_SVR = make_pipeline(StandardScaler(), SVR())
    params = {'svr__kernel': ['poly', 'rbf'], 'svr__degree': [1, 2]}
    gs = GridSearchCV(estimator=std_SVR, param_grid=params, scoring=MAE,
                      n_jobs=-1, cv=5, return_train_score=True)

    # fit grid search
    gs.fit(train_x, train_y)
    print('SVR train score', -gs.cv_results_['mean_train_score'])
    print('SVR test score', -gs.cv_results_['mean_test_score'])
    print('Best Parameter', gs.best_params_)
    print('Best score', -gs.best_score_)
    print('Parameters', gs.cv_results_['params'])

    # Train the best model
    best_SVR = make_pipeline(StandardScaler(), SVR(kernel='poly', degree=1))
    best_SVR.fit(train_x, train_y)

    # Make prediction
    test_y = best_SVR.predict(test_x)

    # Create test output values
    predicted_y = test_y * -1

    # Output file location
    file_name = '../Predictions/SVR_best.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
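# Note: compute_error is defined elsewhere in this project. Based on the "make MAE scoring"
# comment above and the "MAE train error" prints later in this file, it is presumably a
# mean-absolute-error helper. A minimal sketch under that assumption (compute_error_sketch
# is a hypothetical name, not the project's actual function):
def compute_error_sketch(y_hat, y):
    # Mean absolute error between predictions and targets (order does not matter for MAE)
    return np.mean(np.abs(np.asarray(y_hat) - np.asarray(y)))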
def trainer(model, feature_selection1, train_data, test_data, data_set, split):
    # Remove low-variance features: for Boolean features Var(x) = p(1 - p), so a threshold of
    # .8 * (1 - .8) drops features that take the same value in more than 80% of samples.
    if feature_selection1:
        X_all = np.concatenate((train_data[:, 1:], test_data[:, 1:]), axis=0)
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        X_red = sel.fit_transform(X_all)
        X_learn = X_red[:len(X_red) // 2]
        X_test = X_red[len(X_red) // 2:]
        Y_learn = train_data[:, 0]
        X_learnS, Y_learnS = shuffle(X_learn, Y_learn)
    else:
        X_learnS, Y_learnS = shuffle(train_data[:, 1:], train_data[:, 0])
        X_test = test_data[:, 1:]

    # best_model: trained model using optimized hyperparameters; errorF: mean training error;
    # c1: optimized n_estimators; c2: optimized max_features; c3: optimized min_samples_leaf;
    # e1: training errors for n_estimators; e2: training errors for max_features;
    # e3: training errors for min_samples_leaf
    if model == 'RF':
        best_model, errorF, c1, c2, c3, e1, e2, e3 = cv_cv(X_learnS, Y_learnS, model, split)
        print(model, "for", data_set,
              "- Mean Training Error, optimized n_estimators, max_features, min_samples_leaf:",
              errorF, c1, c2, c3)
        plot_line(range(1, 20), e1, 'n_estimators')
        plot_line(np.arange(0.1, 1.1, 0.1), e2, 'max_features')
        plot_line(range(1, 10), e3, 'min_samples_leaf')
    elif model == 'DT':
        best_model, errorF, c1, c2, e1, e2 = cv_cv(X_learnS, Y_learnS, model, split)
        print(model, "for", data_set,
              "- Mean Training Error, optimized min_samples_leaf, max_depth:", errorF, c1, c2)
    else:
        best_model, errorF, c1, c2, e1, e2 = cv_cv(X_learnS, Y_learnS, model, split)
        print(model, "for", data_set,
              "- Mean Training Error, optimized p, n_neighbors:", errorF, c1, c2)

    reel = best_model.predict(X_test)

    # Save prediction file in Kaggle format
    predictions = reel
    kaggle.kaggleize(predictions, "../Predictions/" + data_set + "/test.csv")
def executeTrainDT(self, data, kfold, depthLst, fileTestOutputDT):
    trainX = data[0]
    trainY = data[1]
    testX = data[2]

    tree_para = {'criterion': ['gini'], 'max_depth': depthLst}
    clf = GridSearchCV(DecisionTreeClassifier(), tree_para, cv=kfold, n_jobs=12)
    clf.fit(trainX, trainY)

    meanTestAccuracy = clf.cv_results_['mean_test_score']
    bestPara = clf.best_estimator_
    print("DT cvResult : ", bestPara.max_depth, 1.0 - meanTestAccuracy)

    # Retrain on the full training set with the best depth and predict the test set
    kwargs = {'criterion': 'gini', 'max_depth': bestPara.max_depth}
    predY = self.trainTestWholeData(trainX, trainY, testX, DecisionTreeClassifier, kwargs)

    # Output to file
    if fileTestOutputDT != "":
        kaggle.kaggleize(predY, fileTestOutputDT)

    return (min(1.0 - meanTestAccuracy), kfold, bestPara.max_depth)
def indoor_localization_best():
    train_x, train_y, test_x = read_data_localization_indoors()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    print('Best model for Indoor Localization Dataset')
    parameter = 9
    print("Training with full train data with Model: KNN for n=" + str(parameter))
    neigh = KNeighborsRegressor(n_neighbors=parameter)
    neigh = neigh.fit(train_x, train_y)
    y_hat = neigh.predict(train_x)
    e = compute_error(y_hat, train_y)
    print("MAE train error=" + str(e))

    # Predict the test set
    y_pred = neigh.predict(test_x)
    predicted_y = y_pred

    # Output file location
    file_name = '../Predictions/IndoorLocalization/best.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    # Reset global bookkeeping
    error_mat[:] = []
    time_mat[:] = []
    min_e = 10000000
    min_n = 3
    print('\n\n')
def testDataOutputFile(weights, test_x, unflatten, fileTestOutput):
    (W, b, V, c) = unflatten(weights)
    out = feedForward(W, b, V, c, test_x)
    predY = np.argmax(out, axis=1)

    # Output to file
    if fileTestOutput != "":
        kaggle.kaggleize(predY, fileTestOutput)
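# Note: unflatten and feedForward are defined elsewhere. Based on the (W, b, V, c) weight tuple
# and the argmax over class scores above, feedForward is presumably a one-hidden-layer forward
# pass. A minimal sketch under that assumption (hidden tanh, linear class scores;
# feed_forward_sketch is a hypothetical name, not the project's actual function):
def feed_forward_sketch(W, b, V, c, x):
    # x: (n_samples, n_inputs); W: (n_inputs, n_hidden); V: (n_hidden, n_classes)
    hidden = np.tanh(np.dot(x, W) + b)   # hidden-layer activations
    return np.dot(hidden, V) + c         # per-class scores; argmax gives the predicted label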
def executeTrainLinearReg(self, data, kfold, alphaLst, fileTestOutputDT):
    trainX = data[0]
    trainY = data[1]
    testX = data[2]

    logReg_para = {'loss': ['hinge', 'log'], 'alpha': alphaLst}
    clf = GridSearchCV(linear_model.SGDClassifier(), logReg_para, cv=kfold, n_jobs=12)
    clf.fit(trainX, trainY)

    meanTestAccuracy = clf.cv_results_['mean_test_score']
    bestPara = clf.best_estimator_
    print("Logistic Regression cvResult : ", bestPara.loss, bestPara.alpha,
          1.0 - meanTestAccuracy)

    # Retrain on the full training set with each loss and the best alpha
    kwargs = {'loss': 'hinge', 'alpha': bestPara.alpha}
    predY = self.trainTestWholeData(trainX, trainY, testX, linear_model.SGDClassifier, kwargs)
    if fileTestOutputDT != "":
        kaggle.kaggleize(predY, fileTestOutputDT + 'hinge')

    kwargs = {'loss': 'log', 'alpha': bestPara.alpha}
    predY = self.trainTestWholeData(trainX, trainY, testX, linear_model.SGDClassifier, kwargs)
    if fileTestOutputDT != "":
        kaggle.kaggleize(predY, fileTestOutputDT + 'log')

    return (min(1.0 - meanTestAccuracy), kfold, bestPara.alpha)
def distance_effect(train_x, train_y, test_x):
    # KNN with the Manhattan distance (p=1)
    regr = KNeighborsRegressor(n_neighbors=3, p=1)

    # Fit model
    regr.fit(train_x, train_y)

    # Make prediction
    test_y = regr.predict(test_x)

    # Create test output values
    predicted_y = test_y * -1

    # Output file location
    file_name = '../Predictions/KNN_Manhattan.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    # KNN with the Chebyshev distance
    regr = KNeighborsRegressor(n_neighbors=3, metric='chebyshev')

    # Fit model
    regr.fit(train_x, train_y)

    # Make prediction
    test_y = regr.predict(test_x)

    # Create test output values
    predicted_y = test_y * -1

    # Output file location
    file_name = '../Predictions/KNN_chebyshev.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
def power_plant_best():
    train_x, train_y, test_x = read_data_power_plant()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    print('Best model for Power Output Dataset')
    parameter = 13
    print("Training with full train data: Model=Decision Tree for depth=" + str(parameter))
    clf = tree.DecisionTreeRegressor(criterion='mse', max_depth=parameter)
    clf = clf.fit(train_x, train_y)
    y_hat = clf.predict(train_x)
    e = compute_error(y_hat, train_y)
    print("MAE train error=" + str(e))

    # Predict the test set
    y_pred = clf.predict(test_x)
    predicted_y = y_pred

    # Output file location
    file_name = '../Predictions/PowerOutput/best.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    # Reset global bookkeeping
    error_mat[:] = []
    time_mat[:] = []
    min_e = 10000000
    depth_for_min_e = 3
    print('\n\n')
def trainSVMExtra(fileTestOutput, resultFile):
    '''
    for extra credit 1
    try different SVM kernels; how to select an effective kernel
    '''
    train_x, train_y, test_x = read_tumor_data()
    print('Train=', train_x.shape, type(train_x))
    print('Test=', test_x.shape)

    fd = open(resultFile, 'a')
    kfoldLst = range(3, 12)
    biggestAccuracy = -2**32
    for kfold in kfoldLst:
        parameters = {
            'kernel': ('linear', 'rbf', 'sigmoid', 'poly'),
            'C': np.linspace(1, 5, 5),
            'gamma': [0.001, 0.1, 20],
            'degree': np.linspace(1, 5, 5)
        }
        clf = GridSearchCV(SVC(), parameters, cv=kfold, n_jobs=10)
        clf.fit(train_x, train_y)

        meanTestError = clf.cv_results_['mean_test_score']
        bestPara = clf.best_estimator_
        if clf.best_score_ > biggestAccuracy:
            biggestAccuracy = clf.best_score_
            parametersBest = [bestPara.C, bestPara.gamma, bestPara.degree,
                              bestPara.kernel, kfold, clf.best_score_]

        writeToFile(fd, [bestPara.C, bestPara.gamma, bestPara.degree, bestPara.kernel,
                         kfold, clf.best_score_] + list([meanTestError]))

        # Retrain on the whole training set with the best parameters for this kfold
        clf = SVC(C=bestPara.C, gamma=bestPara.gamma, degree=bestPara.degree,
                  kernel=bestPara.kernel)
        clf.fit(train_x, train_y)
        predY = clf.predict(test_x)

        # Output to file
        if fileTestOutput != "":
            kaggle.kaggleize(predY, fileTestOutput + str(kfold), False)

    print("best final trainSVMExtra result: ", parametersBest)
def stratifyDataTrainTest3layerNN():
    data = read_image_data()
    train_x = data[0]
    train_y_integers = data[1]
    test_x = data[2]

    # Normalize: fit the scaler on the training data and apply the same transform to the test data
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)
    print("train_x.shape:", train_x.shape)

    # Split off a stratified validation set
    xsplitTrain, xsplitTest, ysplitTrain_integer, ysplitTest_integer = train_test_split(
        train_x, train_y_integers, test_size=0.2, random_state=0,
        stratify=train_y_integers)

    hidden_layer_sizes_lst = [(5, 5, 5), (10, 10, 10), (40, 40, 40), (70, 70, 70),
                              (100, 100, 100)]
    largestAccuracy = -2**32
    best_hidden_layer_size = None
    for hidden_layer_sizes in hidden_layer_sizes_lst:
        beginTime = time.time()
        mlp = MLPClassifier(hidden_layer_sizes, activation='tanh', max_iter=1000,
                            momentum=0.9, epsilon=1e-8)
        mlp.fit(xsplitTrain, ysplitTrain_integer)
        # Score on the held-out validation set
        meanAccuracy = mlp.score(xsplitTest, ysplitTest_integer)
        if meanAccuracy > largestAccuracy:
            largestAccuracy = meanAccuracy
            best_hidden_layer_size = hidden_layer_sizes
        print("stratifyDataTrainTest3layerNN. accuracy:", time.time() - beginTime, meanAccuracy)
    print("stratifyDataTrainTest3layerNN. best accuracy:", largestAccuracy,
          best_hidden_layer_size)

    # Retrain on the whole training set with the best hidden layer sizes and predict the test set
    mlp = MLPClassifier(best_hidden_layer_size, activation='tanh', max_iter=1000,
                        momentum=0.9, epsilon=1e-8)
    mlp.fit(train_x, train_y_integers)
    predyTest = mlp.predict(test_x)

    # Output to file
    fileTestOutput3LayerNN = "../Predictions/best_3HiddenNN.csv"
    if fileTestOutput3LayerNN != "":
        kaggle.kaggleize(predyTest, fileTestOutput3LayerNN)
def kFold(features_train, labels_train, features_test, labels_test, path, DataSet):
    n = features_train.shape
    k = 10  # make it 10 fold
    size = n[0] // k  # size of each fold
    errors1 = {}  # to save cross-validation errors
    errors2 = {}  # to save training errors

    for p in range(1, 20):  # considering neighbors in the range 1 to 20
        print("Considering Neighbor=" + str(p))
        e1 = []  # mean squared errors on the cross-validation folds
        e2 = []  # mean squared errors on the training folds
        for i in range(1, k):
            # Select the cross-validation fold; it changes with i
            Feature_CrossVal = features_train[i * size:][:size]
            Label_CrossVal = labels_train[i * size:][:size]

            # Use the rest of the data as the training fold
            Feature_Train = np.append(features_train[:i * size],
                                      features_train[(i + 1) * size:], axis=0)
            Label_Train = np.append(labels_train[:i * size],
                                    labels_train[(i + 1) * size:])

            # K nearest neighbors: train on the training fold, test on the cross-validation fold
            neigh = KNeighborsClassifier(n_neighbors=p)
            neigh.fit(Feature_Train, Label_Train)
            predict = neigh.predict(Feature_CrossVal)

            # K nearest neighbors: train and test on the same fold
            ne = KNeighborsClassifier(n_neighbors=p)
            ne.fit(Feature_Train, Label_Train)
            pre = ne.predict(Feature_Train)

            # Mean squared error against the cross-validation and training labels
            e1.append(mean_squared_error(Label_CrossVal, predict))
            e2.append(mean_squared_error(Label_Train, pre))
            print(e1, e2)  # progress output so the program does not look hung

        # Mean of the k-fold errors for this number of neighbors
        errors1[p] = np.mean(e1)
        errors2[p] = np.mean(e2)

    # Refit with the neighbor count that gave the minimum cross-validation error
    neigh = KNeighborsClassifier(n_neighbors=min(errors1, key=errors1.get))
    neigh.fit(features_train, labels_train)
    predict = neigh.predict(features_test)
    kaggle.kaggleize(
        predict,
        str(path) + "/Submission/Predictions/" + str(DataSet) + "/CrossValidation_KNN.csv")

    # Return errors to run_me.py for plotting cross-validation and training error
    return errors1, errors2
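# For comparison, the manual k-fold loop above can also be expressed with scikit-learn's
# cross_val_score. This is only a sketch of the equivalent model selection, not the code used
# to produce the submissions (knn_cv_sketch is a hypothetical name):
def knn_cv_sketch(features_train, labels_train, k=10, neighbor_range=range(1, 20)):
    from sklearn.model_selection import cross_val_score
    cv_errors = {}
    for p in neighbor_range:
        neigh = KNeighborsClassifier(n_neighbors=p)
        # Negate neg_mean_squared_error to recover the MSE used above
        scores = cross_val_score(neigh, features_train, labels_train,
                                 cv=k, scoring='neg_mean_squared_error')
        cv_errors[p] = -scores.mean()
    return min(cv_errors, key=cv_errors.get), cv_errors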
def kernelRidgeSkLearnCV(kfold=8, fileTestOutput="best_cv"):
    '''
    call kernel ridge for different parameters on the credit card activity data;
    use cross validation to get the out-of-sample mean squared error
    '''
    train_x, train_y, test_x = read_creditcard_data()
    print('Train=', train_x.shape, type(train_x))
    print('Test=', test_x.shape)

    # train_x = normalize(train_x, axis=0)
    # test_x = normalize(test_x, axis=0)
    # train_x = StandardScaler().fit_transform(train_x)
    # test_x = StandardScaler().fit_transform(test_x)

    alphaParaLst = [1, 0.0001]
    gammaParaLst = [None, 1, 0.001]
    kernelParaLst = ["rbf", "polynomial", "linear"]

    mseErrorSmallest = 2**32
    for alpha in alphaParaLst:
        for gamma in gammaParaLst:
            for kernel in kernelParaLst:
                clf = KernelRidge(alpha=alpha, gamma=gamma, degree=3, kernel=kernel)
                mseError = -1 * np.mean(
                    cross_val_score(clf, train_x, train_y, cv=kfold,
                                    scoring="neg_mean_squared_error"))
                print("mseError: ", alpha, gamma, kernel, mseError)
                if mseError < mseErrorSmallest:
                    mseErrorSmallest = mseError
                    parametersBest = [alpha, gamma, kernel]

    print("best mseError: ", kfold, parametersBest, mseErrorSmallest)

    # Train on the whole data with the best parameters
    clf = KernelRidge(alpha=parametersBest[0], gamma=parametersBest[1], degree=3,
                      kernel=parametersBest[2])
    clf.fit(train_x, train_y)
    yPred = clf.predict(test_x)

    # Output file
    if fileTestOutput != "":
        kaggle.kaggleize(yPred, fileTestOutput, True)
def compute_KNN(train_x, train_y, test_x):
    # Different number of neighbors to run
    neighbor = [3, 5, 10, 20, 25]

    # Initialize variables
    train_score = [0] * len(neighbor)
    test_score = [0] * len(neighbor)
    fit_time = [0] * len(neighbor)
    score_time = [0] * len(neighbor)

    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    # indexing
    index = 0
    for n in neighbor:
        # Create the model
        regr = KNeighborsRegressor(n_neighbors=n)

        # Cross validation
        cv_score = cross_validate(regr, train_x, train_y, return_train_score=True,
                                  scoring=MAE, cv=2)

        # Extract statistics; the scorer negates compute_error output
        train_score[index] = -cv_score['train_score'].mean()
        test_score[index] = -cv_score['test_score'].mean()

        # Print statistics
        print('Number of Neighbors:', n)
        print('train score', train_score[index])
        print('test score', test_score[index])
        print('===============================')

        # Fit model
        regr.fit(train_x, train_y)

        # Make prediction
        test_y = regr.predict(test_x)

        # Create test output values
        predicted_y = test_y * -1

        # Output file location
        file_name = '../Predictions/KNearestN_Neighbors_%d.csv' % n

        # Writing output in Kaggle format
        print('Writing output to ', file_name)
        kaggle.kaggleize(predicted_y, file_name)

        # Increase indexing
        index = index + 1
def predictNextBallBayesInference(file_name):
    iterations = np.arange(10000, 1000000, 10000)
    for iters in iterations:
        prediction_prob = list()
        lengths = [10, 15, 20, 25]
        for l in lengths:
            BArray = np.loadtxt('../../Data/B_sequences_%s.txt' % (l), delimiter=',',
                                dtype=float)
            for b in np.arange(BArray.shape[0]):
                prob = getNextBallMCMCQuestionl(BArray[b, :], iters)
                prediction_prob.append(prob)
                # print('Prob of next entry in', BArray[b, :], 'is black is', prediction_prob[-1])

        print('Writing output to ', file_name + "_iteration_" + str(iters))
        kaggle.kaggleize(np.array(prediction_prob), file_name + "_iteration_" + str(iters))
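# Note: getNextBallMCMCQuestionl is defined elsewhere. If the sampler approximates a
# Beta-Bernoulli posterior predictive for "next ball is black", the closed form
# (alpha + #black) / (alpha + beta + n) gives a cheap sanity check on the MCMC estimate.
# Minimal sketch under that assumption (the Beta(1, 1) prior and the coding of black as 1
# are illustrative choices, not taken from this project):
def next_ball_closed_form_sketch(sequence, alpha=1.0, beta=1.0):
    n_black = np.sum(sequence)   # assumes black entries are coded as 1
    n = len(sequence)
    return (alpha + n_black) / (alpha + beta + n)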
def svmSklearnCV(kfold=7, fileTestOutput="best_cv"):
    '''
    call svm train and predict for different parameters on the tumor data;
    use cross validation to get the out-of-sample accuracy
    '''
    train_x, train_y, test_x = read_tumor_data()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    cLst = [1, 0.01, 0.0001]
    gammaLst = [1, 0.01, 0.001]
    kernelParaLst = ['rbf', 'poly=3', 'poly=5', 'linear']
    degree = 3

    accuracyLargest = -2**32
    for c in cLst:
        for gamma in gammaLst:
            for kernel in kernelParaLst:
                if kernel.split("=")[0] == "poly":
                    degree = int(kernel.split("=")[1])
                    kernel = kernel.split("=")[0]
                clf = SVC(C=c, kernel=kernel, degree=degree, gamma=gamma)
                accuracy = np.mean(
                    cross_val_score(clf, train_x, train_y, cv=kfold, scoring="accuracy"))
                print("accuracy: ", c, gamma, kernel, degree, accuracy)
                if accuracy > accuracyLargest:
                    accuracyLargest = accuracy
                    parametersBest = [c, gamma, degree, kernel]

    print("best accuracy parameters: ", kfold, parametersBest, accuracyLargest)

    # Train on the whole data with the best parameters
    clf = SVC(C=parametersBest[0], gamma=parametersBest[1], degree=parametersBest[2],
              kernel=parametersBest[3])
    clf.fit(train_x, train_y)
    yPred = clf.predict(test_x)

    # Output file
    if fileTestOutput != "":
        kaggle.kaggleize(yPred, fileTestOutput, False)
def power_plant_LinearModel(k_fold):
    train_x, train_y, test_x = read_data_power_plant()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    alphas = [pow(10, -6), pow(10, -4), pow(10, -2), 1, 10]
    y_pred = choose_best_LinearModel(train_x, train_y, test_x, alphas, k_fold)

    # Plot average validation time against log10(alpha) for Lasso and Ridge
    avg_time = []
    log_alphas = []
    for i in time_mat:
        avg_time.append(i[k_fold])
    for i in alphas:
        log_alphas.append(math.log10(i))
    plt.plot(log_alphas, avg_time[0:len(alphas)], 'ro', label='Lasso')
    plt.plot(log_alphas, avg_time[len(alphas):2 * len(alphas)], 'bo', label='Ridge')
    plt.legend(loc='upper center')
    plt.ylabel('Avg Time for validation')
    plt.xlabel('Alpha for Linear Model (Log)')
    plt.title('Power Plant dataset (Model=Linear Model)')
    plt.show()

    # Write predictions in Kaggle format
    predicted_y = y_pred
    file_name = '../Predictions/PowerOutput/power_plant_LM.csv'
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    # Log file
    file2 = open('PPlogfile_LM.txt', 'w')
    file2.write(str(error_mat) + '\n\n\n')
    file2.write(str(time_mat))
    file2.close()

    # Clear global variables
    error_mat[:] = []
    time_mat[:] = []
    alpha_for_min_e = 3
    flag = True
    min_e = 100000000
    avg_time[:] = []
    log_alphas[:] = []
    print('\n\n')
def compute_NN(train_x, train_y, test_x):
    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    ######### Neural Network #########
    # make pipeline
    std_NN = make_pipeline(StandardScaler(), MLPRegressor())
    params = {
        'mlpregressor__hidden_layer_sizes': [(10, ), (20, ), (30, ), (40, )],
        'mlpregressor__max_iter': [1000]
    }
    gs = GridSearchCV(estimator=std_NN, param_grid=params, scoring=MAE, n_jobs=-1,
                      cv=5, return_train_score=True)

    # fit grid search
    gs.fit(train_x, train_y)
    print('NN train score', -gs.cv_results_['mean_train_score'])
    print('NN test score', -gs.cv_results_['mean_test_score'])
    print('Best Parameter', gs.best_params_)
    print('Best score', -gs.best_score_)
    print('Parameters', gs.cv_results_['params'])

    # Train the best model
    best_NN = make_pipeline(StandardScaler(), MLPRegressor(hidden_layer_sizes=(20, )))
    best_NN.fit(train_x, train_y)

    # Make prediction
    test_y = best_NN.predict(test_x)

    # Create test output values
    predicted_y = test_y * -1

    # Output file location
    file_name = '../Predictions/NN_best.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
def classifierKnn(train1, label1, train2, label2):
    metric = 'euclidean'
    k = 2
    clf = neighbors.KNeighborsClassifier(n_neighbors=k, weights='distance', metric=metric)
    clf.fit(train1, label1)

    # Compute a prediction for every test point
    for i in range(len(train2)):
        x = train2[i, :].reshape(1, -1)
        predictions = clf.predict(x)
        label2[i] = predictions

    # Save prediction file in Kaggle format
    for i in range(len(label2)):
        print(train2[i, :], label2[i])
    kaggle.kaggleize(label2, "../Predictions/Digits/test.csv")
def pipe_line_final(path, param_grid, nfs, data_set):
    if data_set == 'Blog':
        train_m = np.load(path + 'train.npy')
        X_all = train_m[:, 0:train_m.shape[1] - 1]
        Y_all = train_m[:, -1]

        f = SelectKBest(f_regression)
        X_new = f.fit_transform(X_all, Y_all)
        train_x, train_y = shuffle(X_new, Y_all)
        selected_params = f.get_support(indices=True)

        reg = GridSearchCV(GradientBoostingRegressor(), param_grid,
                           scoring='neg_mean_squared_error')
        reg.fit(train_x, train_y)
        print("For", data_set, "- best params:", reg.best_params_,
              "Feature_subset:", f.get_support(indices=True))

        test = np.load(path + 'test_distribute.npy')
        test_x = test[:, 0:test.shape[1] - 1]

        # Build a boolean mask selecting the same features that SelectKBest kept
        tr = []
        for i in range(0, 280):
            if i in selected_params:
                tr.append(True)
            else:
                tr.append(False)
        tr = np.array(tr)
        X_masked = test_x[:, tr]

        predictions = reg.predict(X_masked)
        kaggle.kaggleize(predictions, "../Predictions/BlogFeedback/test.csv")
    else:
        train = np.load(path + 'train.npy')
        test = np.load(path + 'test_private.npy')
        train_x, train_y = shuffle(train[:, 0:train.shape[1] - 1], train[:, -1])
        test_x = test[:, 0:test.shape[1] - 1]
        test_y = test[:, -1]

        rfe = RFE(DecisionTreeRegressor(), n_features_to_select=nfs, step=2)
        reg = GridSearchCV(rfe, param_grid)
        reg.fit(train_x, train_y)
        print("For", data_set, "- best params:", reg.best_params_,
              "Feature_subset:", reg.best_estimator_.ranking_,
              "RMSE:", np.sqrt(mean_squared_error(test_y, reg.predict(test_x))))
def Experiment_DataSet(features_train, labels_train, features_test, labels_test, path, i):
    # Support vector regression
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    y_rbf = svr_rbf.fit(features_train, labels_train).predict(features_test)
    kaggle.kaggleize(y_rbf, path + "Submission/Predictions/" + str(i) + "/RFE_SVR.csv")
    print("Support Vector Regression")

    # K nearest neighbors
    neigh = KNeighborsRegressor(n_neighbors=6)
    neigh.fit(features_train, labels_train)
    predict = neigh.predict(features_test)
    kaggle.kaggleize(predict, path + "Submission/Predictions/" + str(i) + "/KNN_REG.csv")
    print("K Nearest Neighbor")

    # LassoCV with RFE, merged in a pipeline
    alpha = np.arange(0.1, 2, 0.1)  # range of alphas
    lasso = linear_model.LassoCV(alphas=alpha)
    rfe = RFE(estimator=lasso, step=1)
    Lasso_Pipeline = make_pipeline(rfe, svr_rbf)
    Lasso_Pipeline.fit(features_train, labels_train)
    predict = Lasso_Pipeline.predict(features_test)
    kaggle.kaggleize(predict, path + "Submission/Predictions/" + str(i) + "/LassoCV.csv")
    print("LassoCV with RFE and pipeline")
def final_model(train_x, train_y, test_x):
    np.random.seed(2018)

    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    # Standardize features using the training statistics
    scaler = StandardScaler()
    scaler.fit(train_x)
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    # Create model
    model = Sequential()
    model.add(Dense(20, input_shape=(52, ), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='relu'))
    opti_adam = optimizers.Adam(lr=0.1, beta_1=0.9)

    # Compile model
    model.compile(loss='MAE', optimizer=opti_adam, metrics=['accuracy'])

    # Fit the model
    model.fit(train_x, train_y, epochs=150, batch_size=200)

    # Evaluate the model on the training data
    scores = model.evaluate(train_x, train_y)
    print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

    test_y = model.predict(test_x)
    predicted_y = test_y * -1

    # Output file location
    file_name = '../Predictions/NN_best_competition.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
def pipe_line(path, test_y_p, param_grid, nfs, data_set):
    if test_y_p:
        train = np.load(path + 'train.npy')
        test = np.load(path + 'test_private.npy')
    else:
        # No private test labels available: split the training data in half
        train_m = np.load(path + 'train.npy')
        train = train_m[:len(train_m) // 2]
        test = train_m[len(train_m) // 2:]

    train_x, train_y = shuffle(train[:, 0:train.shape[1] - 1], train[:, -1])
    test_x = test[:, 0:test.shape[1] - 1]
    test_y = test[:, -1]

    rfe = RFE(DecisionTreeRegressor(), n_features_to_select=nfs, step=2)
    reg = GridSearchCV(rfe, param_grid)
    reg.fit(train_x, train_y)
    print("For", data_set, "- best params:", reg.best_params_,
          "Feature_subset:", reg.best_estimator_.ranking_,
          "RMSE:", np.sqrt(mean_squared_error(test_y, reg.predict(test_x))))

    if not test_y_p:
        test_f = np.load(path + 'test_distribute.npy')
        predictions = reg.predict(test_f[:, 0:test.shape[1] - 1])
        kaggle.kaggleize(predictions, "../Predictions/BlogFeedback/test.csv")
def indoor_localization_KNN():
    train_x, train_y, test_x = read_data_localization_indoors()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    k_fold = 5
    neighbour_list = [3, 5, 10, 20, 25]
    y_pred = choose_best_KNN(train_x, train_y, test_x, k_fold, neighbour_list)
    predicted_y = y_pred

    # Output file location
    file_name = '../Predictions/IndoorLocalization/indoor_localization_KNN.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    # Log file
    file2 = open('ILlogfile_KNN.txt', 'w')
    file2.write(str(error_mat) + '\n')
    file2.write(str(time_mat))
    file2.close()

    # Plot average validation time against the number of neighbours
    avg_time = []
    for i in time_mat:
        avg_time.append(i[k_fold])
    graph = plt.plot(neighbour_list, avg_time, 'rs')
    plt.ylabel('Avg Time for Validation(ms)')
    plt.xlabel('Number of neighbours')
    plt.title('Indoor Localization dataset (Model=KNN)')
    plt.show()

    # Reset global bookkeeping
    error_mat[:] = []
    time_mat[:] = []
    min_e = 10000000
    min_n = 3
    avg_time[:] = []
    print('\n\n')
def power_plant_DT(k_fold):
    train_x, train_y, test_x = read_data_power_plant()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    depths = [3, 6, 9, 12, 15]
    y_pred = choose_best_DT(train_x, train_y, test_x, depths, k_fold)

    # Plot average validation time against tree depth
    avg_time = []
    for i in time_mat:
        avg_time.append(i[k_fold])
    graph = plt.plot(depths, avg_time, 'rs')
    plt.ylabel('Avg Time for Validation(ms)')
    plt.xlabel('Depth of Tree')
    plt.title('Power plant dataset (Model=Decision Tree)')
    plt.show()

    # Write predictions to a CSV file
    predicted_y = y_pred
    file_name = '../Predictions/PowerOutput/power_plant_DT.csv'
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    # Logs
    file2 = open('PPlogfile_DT.txt', 'w')
    file2.write(str(error_mat) + '\n\n')
    file2.write(str(time_mat) + '\n\n')
    file2.write(str(avg_time) + '\n')
    file2.close()

    # Reset global bookkeeping
    error_mat[:] = []
    time_mat[:] = []
    avg_time[:] = []
    min_e = 10000000
    depth_for_min_e = 3
    print('\n\n')
def indoor_localization_DT(k_fold):
    train_x, train_y, test_x = read_data_localization_indoors()
    print('Train=', train_x.shape)
    print('Test=', test_x.shape)

    depths = [3, 6, 9, 12, 15]
    y_pred = choose_best_DT(train_x, train_y, test_x, depths, k_fold)

    # Plot average validation time against tree depth
    avg_time = []
    for i in time_mat:
        avg_time.append(i[k_fold])
    plt.plot(depths, avg_time, 'bo')
    plt.ylabel('Avg Time for Validation')
    plt.xlabel('Depth of Tree')
    plt.title('Indoor Localisation Dataset')
    plt.show()

    # Write predictions to a CSV file
    predicted_y = y_pred
    file_name = '../Predictions/IndoorLocalization/indoor_localization_DT.csv'
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)

    # Log file
    file2 = open('ILlogfile_DT.txt', 'w')
    file2.write(str(error_mat) + '\n\n')
    file2.write(str(time_mat) + '\n\n')
    file2.write(str(avg_time) + '\n')
    file2.close()

    # Reset global bookkeeping
    error_mat[:] = []
    time_mat[:] = []
    avg_time[:] = []
    min_e = 10000000
    depth_for_min_e = 3
    print('\n\n')
def executeTrainKNN(self, data, kfold, knnLst, fileTestOutputDT):
    trainX = data[0]  # use a smaller slice, e.g. [0:1000, :], first for debugging
    trainY = data[1]  # [0:1000]
    testX = data[2]

    knn_para = {'n_neighbors': knnLst}
    clf = GridSearchCV(KNeighborsClassifier(), knn_para, cv=kfold, n_jobs=12)
    clf.fit(trainX, trainY)

    meanTestAccuracy = clf.cv_results_['mean_test_score']
    bestPara = clf.best_estimator_
    print("KNN cvResult : ", bestPara.n_neighbors, 1.0 - meanTestAccuracy)

    # Retrain on the full training set with the best number of neighbors and predict the test set
    kwargs = {'n_neighbors': bestPara.n_neighbors}
    predY = self.trainTestWholeData(trainX, trainY, testX, KNeighborsClassifier, kwargs)

    # Output to file
    if fileTestOutputDT != "":
        kaggle.kaggleize(predY, fileTestOutputDT)
def CrossValidation_Robot(features_train, labels_train, features_test, labels_test, path):
    n = features_train.shape
    k = 2  # make it 2 fold
    size = n[0] // k  # size of each fold
    errors1 = {}
    e1 = []
    errors2 = {}
    Cvalue = np.arange(100, 500, 100)
    degree = np.arange(1, n[1], 2)
    for d in degree:
        for p in Cvalue:
            print("Considering C=" + str(p))
            for i in range(1, k):
                # Select the cross-validation fold; it changes with i
                Feature_CrossVal = features_train[i * size:][:size]
                Label_CrossVal = labels_train[i * size:][:size]

                # Use the rest of the data as the training fold
                Feature_Train = np.append(features_train[:i * size],
                                          features_train[(i + 1) * size:], axis=0)
                Label_Train = np.append(labels_train[:i * size],
                                        labels_train[(i + 1) * size:])

                # Support vector regression with degree and regularization constant C as
                # hyperparameters (note: scikit-learn's SVR ignores degree for the rbf kernel)
                svr_rbf = SVR(kernel='rbf', C=p, degree=d)
                y_rbf = svr_rbf.fit(Feature_Train, Label_Train).predict(Feature_CrossVal)

                # Root mean squared error on the cross-validation fold
                e1.append(np.sqrt(mean_squared_error(Label_CrossVal, y_rbf)))
                print(e1)

            # Mean of the k-fold errors for this hyperparameter setting
            errors1[p] = np.mean(e1)  # error keyed by the regularization constant
            errors2[d] = np.mean(e1)  # error keyed by the polynomial degree

    # Hyperparameters with minimum error
    print(min(errors1, key=errors1.get))

    # Support vector regression with the optimum regularization constant and degree
    svr_rbf = SVR(kernel='rbf', C=min(errors1, key=errors1.get),
                  degree=min(errors2, key=errors2.get))
    # Fit on the full training data and predict the test set
    predict = svr_rbf.fit(features_train, labels_train).predict(features_test)

    # Save the output in CSV format
    kaggle.kaggleize(predict, path + "Submission/Predictions/RobotArm/SupportVectorRegression_CrossValidated.csv")
    kaggle.kaggleize(predict, path + "Submission/Predictions/RobotArm/best.csv")

    # Plot regularization constant vs error
    plt.figure(1, figsize=(6, 4))
    plt.plot(list(errors1.keys()), list(errors1.values()), 'sb-', linewidth=3)
    plt.ylabel("Error")
    plt.xlabel("Regression Coefficient")
    plt.title("Error vs Regression Coefficient for Robot Dataset")
    plt.savefig(path + "/Submission/Figures/ErrorVsRegressionCoefficient_Robot.pdf")
    plt.show()

    # Plot degree vs error
    plt.figure(2, figsize=(6, 4))
    plt.plot(list(errors2.keys()), list(errors2.values()), 'or-', linewidth=3)
    plt.ylabel("Error")
    plt.xlabel("Degree")
    plt.title("Degree vs Mean Square Error for Robot Dataset")
    plt.savefig(path + "/Submission/Figures/ErrorVsDegree_Robot.pdf")
    plt.show()
# model_selection.credit_card(train_x, train_y)
result_cc = model_selection.credit_card(train_x, train_y)
print(result_cc, "\n")
print("Best parameter is: ", min(result_cc, key=result_cc.get))

clf = KernelRidge(alpha=0.0001, kernel='rbf', gamma=None)
clf.fit(train_x, train_y)
predicted_y = clf.predict(test_x)

# Output file location
file_name = '../Predictions/CreditCard/best.csv'

# Writing output in Kaggle format
print('Writing output to ', file_name)
kaggle.kaggleize(predicted_y, file_name, True)

######################### 2.a
train_x, train_y, test_x = read_tumor_data()
print('Train=', train_x.shape)
print('Test=', test_x.shape)

result_t = model_selection.tumor(train_x, train_y)
print(result_t, "\n")
print("Best parameter is: ", max(result_t, key=result_t.get))

clf = SVC(C=1.0, kernel='rbf', gamma=0.001)
clf.fit(train_x, train_y)
predicted_y = clf.predict(test_x)
def kagglizing(predicted_y, best):
    file_name = '../Predictions/' + best + '.csv'

    # Writing output in Kaggle format
    print('Writing output to ', file_name)
    kaggle.kaggleize(predicted_y, file_name)
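# Note: kaggle.kaggleize is the course-provided helper used throughout this file and is not
# defined here. A minimal sketch of the expected behavior, assuming it writes an
# "Id,Prediction" CSV in Kaggle submission format (the column names are an assumption, and
# some calls in this file pass a third flag that this sketch omits):
def kaggleize_sketch(predictions, file_name):
    import csv
    with open(file_name, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['Id', 'Prediction'])          # header row
        for idx, pred in enumerate(predictions):
            writer.writerow([idx, pred])               # one row per prediction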
def compute_DT(train_x, train_y, test_x):
    # Different values of max_depth to run
    depth = [3, 6, 9, 12, 15]

    # Initialize variables
    train_score = [0] * len(depth)
    test_score = [0] * len(depth)
    fit_time = [0] * len(depth)
    score_time = [0] * len(depth)

    # make MAE scoring
    MAE = make_scorer(compute_error, greater_is_better=False)

    # indexing
    index = 0
    for d in depth:
        # Create the model
        regr = DecisionTreeRegressor(criterion="mae", max_depth=d)

        # Cross validation
        cv_score = cross_validate(regr, train_x, train_y, return_train_score=True,
                                  scoring=MAE, cv=5)

        # Extract statistics; the scorer negates compute_error output
        train_score[index] = -cv_score['train_score'].mean()
        test_score[index] = -cv_score['test_score'].mean()
        fit_time[index] = 1000 * cv_score['fit_time'].sum()
        score_time[index] = 1000 * cv_score['score_time'].sum()

        # Print statistics
        print('Depth of Decision Tree:', d)
        print('train score', train_score[index])
        print('test score', test_score[index])
        print('fit time', fit_time[index])
        print('score time', score_time[index])
        print('===============================')

        # Fit model
        regr.fit(train_x, train_y)

        # Make prediction
        test_y = regr.predict(test_x)

        # Create test output values
        predicted_y = test_y * -1

        # Output file location
        file_name = '../Predictions/Decision_Tree_depth_%d.csv' % d

        # Writing output in Kaggle format
        print('Writing output to ', file_name)
        kaggle.kaggleize(predicted_y, file_name)

        # Increase indexing
        index = index + 1

    # Plot CV time
    plt.figure(num=None, figsize=(16, 8), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(depth, fit_time, '.')
    plt.xlabel('Depth of Decision Tree')
    plt.ylabel('Cross Validation Time [msec]')
    plt.savefig('../Figures/DT_cv_time.png')
def choose_regression_model(problem_instance):
    if problem_instance == 1:
        # Load the Computer Activity data
        path = '../../Data/ComputerActivity/'
    elif problem_instance == 2:
        # Load the Housing data
        path = '../../Data/Housing/'

    data = np.load(path + 'Data.npz')
    features_train = data['X_train']
    labels_train = data['y_train']
    features_test = data['X_test']
    labels_test = data['y_test']
    n_estimator = []
    print("Data shapes:", features_train.shape, labels_train.shape,
          features_test.shape, labels_test.shape)

    # Regression method
    if problem_instance == 1:
        print("Executing Computer Activity problem")
        # transform = feature_selection.SelectKBest(feature_selection.f_regression)
        transform = feature_selection.RFECV(estimator=RidgeCV())
        pipeline = Pipeline([('anova', transform),
                             ('adr', ensemble.GradientBoostingRegressor(random_state=404))])
        n_estimator = np.arange(75, 86, 1)
        depth = range(6, 8)
        parameters = {'anova__cv': [5, 10],
                      'adr__n_estimators': n_estimator,
                      'adr__max_depth': depth}
    elif problem_instance == 2:
        print("Executing Housing problem")
        transform = feature_selection.SelectKBest(feature_selection.f_regression)
        # transform = feature_selection.RFECV(estimator=RidgeCV())
        n_estimator = np.arange(130, 160, 10)
        depth = range(6, 8)
        pipeline = Pipeline([('anova', transform),
                             ('adr', ensemble.GradientBoostingRegressor(random_state=404))])
        parameters = {'anova__k': np.arange(5, 9, 1),
                      'adr__n_estimators': n_estimator,
                      'adr__max_depth': depth}

    grid = grid_search.GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    grid.fit(features_train, labels_train)
    predictions = grid.predict(features_test)
    print(grid.best_params_, grid.best_score_, grid.best_estimator_, grid.grid_scores_)

    # Collect the mean validation score for each parameter setting
    scores = grid.grid_scores_
    mean_score_list = []
    parameters_list = []
    for x in range(0, len(scores)):
        mean_score_list.append(scores[x][1])
        parameters_list.append(scores[x][0])
    print(mean_score_list)

    scores_list = np.array(mean_score_list)
    plot_score_linechart(scores_list, problem_instance)

    if problem_instance == 1:
        kaggle.kaggleize(predictions, "../Predictions/ComputerActivity/test.csv")
    else:
        kaggle.kaggleize(predictions, "../Predictions/Housing/test.csv")