def parksParamTuning():
    """Grid-evaluate the parks radius/area table variants with a random forest.

    For each (radius, area) combination the table variant named
    "_parksRadius<radius>_area<area>" is loaded through MainTable, split
    80/20 into train and test sets, and a RandomForestRegressor is fitted
    five times. The mean of the five train scores and the mean of the five
    test scores are stored under a "radius <r>\\narea <a>" label, and both
    dictionaries are finally passed to graph_paramTuning.
    """
    runs = 5
    mean_train_by_label = {}
    mean_test_by_label = {}

    for radius in [0.5, 1]:
        for area in [100, 200]:
            # Load the table that was built with this radius/area setting.
            suffix = "_parksRadius" + str(radius) + "_area" + str(area)
            table = MainTable(extra=suffix).getDB()

            # Separate the feature columns from the target column.
            inputs = selectCols(table, features)
            target = table['SQR_FEET_PRICE']
            X_train, X_test, y_train, y_test = train_test_split(
                inputs, target, test_size=0.2, random_state=42)

            # The forest is not seeded, so scores are averaged over several fits.
            train_sum = 0
            test_sum = 0
            for _ in range(runs):
                forest = RandomForestRegressor(n_estimators=N_ESTIMATORS,
                                               min_impurity_decrease=200)
                forest.fit(X_train, y_train)
                train_sum += forest.score(X_train, y_train)
                test_sum += forest.score(X_test, y_test)

            label = "radius " + str(radius) + "\narea " + str(area)
            mean_train_by_label[label] = train_sum / runs
            mean_test_by_label[label] = test_sum / runs

    graph_paramTuning(mean_train_by_label, mean_test_by_label,
                      'Tuning parks radius and area with Desicion Trees',
                      'Parks radius and area')
def paramTuning(file_name, param_values_list, param_name):
    """Plot mean random-forest train/test scores for each value of a parameter.

    Args:
        file_name: Prefix of the table variant; each parameter value is
            appended to it (as ``file_name + str(p)``) and passed to
            MainTable as ``extra``.
        param_values_list: Iterable of parameter values to evaluate.
        param_name: Human-readable parameter name used in the graph title
            and as the axis label handed to graph_paramTuning.

    For every value the table is loaded, split 75/25, and a
    RandomForestRegressor is fitted ``n`` times; the mean train and test
    scores are stored under that value and plotted via graph_paramTuning.
    """
    train_scores_dict = {}
    test_scores_dict = {}
    n = 5

    for p in param_values_list:
        # Get the table built with this parameter value.
        all_data = MainTable(extra=file_name + str(p))
        df = all_data.getDB()

        # Split to data and actual results.
        X = selectCols(df, features)
        y = df['SQR_FEET_PRICE']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42)

        tot_train_score = 0
        tot_test_score = 0
        # Run exactly n fits so that dividing the totals by n yields a true
        # mean (range(1, n) ran only n - 1 fits and under-reported scores).
        for _ in range(n):
            regressor = RandomForestRegressor(n_estimators=N_ESTIMATORS,
                                              min_impurity_decrease=200)
            regressor.fit(X_train, y_train)
            tot_train_score += regressor.score(X_train, y_train)
            tot_test_score += regressor.score(X_test, y_test)

        train_scores_dict[p] = tot_train_score / n
        test_scores_dict[p] = tot_test_score / n

    # Leading space before "with" keeps the parameter name from running
    # into the rest of the title.
    graph_paramTuning(train_scores_dict, test_scores_dict,
                      'Tuning ' + param_name + ' with Desicion Trees',
                      param_name)
def compareFeatures():
    """Compare external features by adding each one alone to the base features.

    The base table is loaded through MainTable, baseline scores for the
    base features are obtained from getBaseFeatsScores, and then, for each
    external feature, a DecisionTreeRegressor is fitted five times on
    base features + that single feature (75/25 split). The mean train and
    test scores per feature are plotted with graph_barsForFeatures together
    with the baseline scores.
    """
    # Get the base table.
    all_data = MainTable()
    df = all_data.getDB()

    base_feats = ['BOROUGH', 'BUILDING_AGE']
    external_feats = [
        'CRIMES', 'HI_ED', 'HIGH_SCHOOLS', 'BUS_STOPS', 'SUBWAY_STOPS',
        'NUM_OF_PARKS', 'AREA_OF_PARKS', 'NOISE', 'HEALTH', 'GALLERIES',
        'MUSEUMS',
    ]

    mean_train_score_b, mean_test_score_b = getBaseFeatsScores(df, base_feats)

    train_scores_dict = {}
    test_scores_dict = {}
    n = 5
    y = df['SQR_FEET_PRICE']

    for feat in external_feats:
        # Build a NEW list each iteration. Assigning base_feats directly and
        # appending to it mutated base_feats, so every later feature was
        # scored together with all previously tried external features.
        curr_feats = base_feats + [feat]

        # Split to data and actual results.
        X = selectCols(df, curr_feats)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42)

        tot_train_score = 0
        tot_test_score = 0
        for _ in range(n):
            regressor = DecisionTreeRegressor(min_impurity_decrease=200)
            regressor.fit(X_train, y_train)
            tot_train_score += regressor.score(X_train, y_train)
            tot_test_score += regressor.score(X_test, y_test)

        train_scores_dict[feat] = tot_train_score / n
        test_scores_dict[feat] = tot_test_score / n

    graph_barsForFeatures(train_scores_dict, test_scores_dict,
                          'Comparing features using Desicion Trees',
                          'Feature Name',
                          mean_train_score_b, mean_test_score_b)
def paramTuning(file_name, param_values_list, param_name):
    """Plot KNN train/test scores for each value of a table parameter.

    Each value in ``param_values_list`` is appended to ``file_name`` to pick
    the table variant loaded through MainTable. The table is split 75/25, a
    16-neighbour KNeighborsRegressor is fitted once, and its train and test
    scores are recorded under that value. Both score dictionaries are then
    passed to graph_paramTuning with the title 'KNN' and ``param_name``.
    """
    scores_on_train = {}
    scores_on_test = {}

    for value in param_values_list:
        # Load the table variant that belongs to this parameter value.
        table = MainTable(extra=file_name + str(value)).getDB()

        # Feature columns versus the target column.
        inputs = selectCols(table, features)
        target = table['SQR_FEET_PRICE']
        X_train, X_test, y_train, y_test = train_test_split(
            inputs, target, test_size=0.25, random_state=42)

        knn = neighbors.KNeighborsRegressor(n_neighbors=16)
        knn.fit(X_train, y_train)

        scores_on_train[value] = knn.score(X_train, y_train)
        scores_on_test[value] = knn.score(X_test, y_test)

    graph_paramTuning(scores_on_train, scores_on_test, 'KNN', param_name)
# NOTE(review): this physical line holds two things run together with their
# line breaks lost:
#   1. A trailing fragment of a KNN tuning loop: a 75/25 train_test_split, a
#      fit of KNeighborsRegressor(n_neighbors=16), the train/test scores stored
#      under key `p`, and a call to graph_paramTuning(..., 'KNN', param_name).
#      The enclosing `def`/`for` header is not on this line, so the scope this
#      fragment belongs to cannot be determined from here.
#   2. The `if __name__ == '__main__':` script: loads the table via
#      MainTable().getDB(), builds X with selectCols(df, features) and y from
#      df['SQR_FEET_PRICE'], prints the scores returned by
#      predictionAndScore(X, y, 16), then runs cross_validate with cv=3 and
#      return_train_score=True on the training part of a 75/25 split and
#      prints cv_results['train_score'] / cv_results['test_score'].
# NOTE(review): the first print concatenates str(train_score) directly with
# "Test score: " (no separator), unlike the second print which uses
# ", Test score: " - confirm and align when the formatting is restored.
# TODO: restore the original line breaks and indentation; as written,
# everything after the first '#' on the line below is parsed as a comment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) regressor = neighbors.KNeighborsRegressor(n_neighbors=16) regressor.fit(X_train, y_train) train_score = regressor.score(X_train, y_train) test_score = regressor.score(X_test, y_test) train_scores_dict[p] = train_score test_scores_dict[p] = test_score graph_paramTuning(train_scores_dict, test_scores_dict, 'KNN', param_name) if __name__ == '__main__': # # Get the base table all_data = MainTable() df = all_data.getDB() # # Split to Data and Actual results X = selectCols(df, features) y = df['SQR_FEET_PRICE'] # Basic prediction with optimal K y_pred, test_score, train_score = predictionAndScore(X, y, 16) print("KNN: Training score: " + str(train_score) + "Test score: " + str(test_score)) # Cross validation X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) knn_regressor = neighbors.KNeighborsRegressor(n_neighbors=16) cv_results = cross_validate(knn_regressor, X_train, y_train, cv=3, return_train_score=True) print("KNN w/ Cross Validation: Training score: " + str(cv_results['train_score']) + ", Test score: " + str(cv_results['test_score']))