示例#1
0
def parksParamTuning():
    """Compare random-forest scores across park radius / area tables.

    For every (radius, area) combination a table is loaded through
    ``MainTable`` with the suffix ``_parksRadius<radius>_area<area>``, split
    80/20 with a fixed seed, and a ``RandomForestRegressor`` is fitted
    ``repeats`` times.  The averaged train and test scores, keyed by a
    two-line "radius / area" label, are passed to ``graph_paramTuning``.
    """
    mean_train_by_label = {}
    mean_test_by_label = {}
    repeats = 5

    for radius in [0.5, 1]:
        for area in [100, 200]:
            suffix = "_parksRadius{}_area{}".format(radius, area)
            df = MainTable(extra=suffix).getDB()

            # Feature matrix and target column
            X = selectCols(df, features)
            y = df['SQR_FEET_PRICE']

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42)

            # The forest itself is not seeded, so several fits are averaged.
            train_sum = 0
            test_sum = 0
            for _ in range(repeats):
                forest = RandomForestRegressor(n_estimators=N_ESTIMATORS,
                                               min_impurity_decrease=200)
                forest.fit(X_train, y_train)

                train_sum += forest.score(X_train, y_train)
                test_sum += forest.score(X_test, y_test)

            label = "radius {}\narea {}".format(radius, area)
            mean_train_by_label[label] = train_sum / repeats
            mean_test_by_label[label] = test_sum / repeats

    graph_paramTuning(mean_train_by_label, mean_test_by_label,
                      'Tuning parks radius and area with Desicion Trees',
                      'Parks radius and area')
示例#2
0
def paramTuning(file_name, param_values_list, param_name):
    """Plot mean random-forest scores for each value of a data parameter.

    For every value ``p`` a table is loaded via
    ``MainTable(extra=file_name + str(p))``, split 75/25 with a fixed seed,
    and a ``RandomForestRegressor`` is fitted ``n`` times.  The mean train
    and test scores per value are passed to ``graph_paramTuning``.

    Args:
        file_name: Prefix of the ``extra`` table suffix; ``str(p)`` is
            appended to it for each parameter value.
        param_values_list: Iterable of parameter values to evaluate.  Each
            value is also used as the key in the score dictionaries.
        param_name: Name of the parameter, used in the plot title and
            passed as the last argument of ``graph_paramTuning``.
    """
    train_scores_dict = {}
    test_scores_dict = {}
    n = 5  # number of fits averaged per parameter value
    for p in param_values_list:
        # Get the base table
        all_data = MainTable(extra=file_name + str(p))
        df = all_data.getDB()

        # Split to Data and Actual results
        X = selectCols(df, features)
        y = df['SQR_FEET_PRICE']

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=42)

        tot_train_score = 0
        tot_test_score = 0
        # Run exactly n fits so that dividing the totals by n gives a true
        # mean (the loop previously ran n - 1 times, deflating every score).
        for _ in range(n):
            regressor = RandomForestRegressor(n_estimators=N_ESTIMATORS,
                                              min_impurity_decrease=200)
            regressor.fit(X_train, y_train)

            tot_train_score += regressor.score(X_train, y_train)
            tot_test_score += regressor.score(X_test, y_test)

        train_scores_dict[p] = tot_train_score / n
        test_scores_dict[p] = tot_test_score / n
    # NOTE(review): the title has no space between param_name and 'with';
    # left unchanged here - confirm against callers before adjusting.
    graph_paramTuning(train_scores_dict, test_scores_dict,
                      'Tuning ' + param_name + 'with Desicion Trees',
                      param_name)
示例#3
0
def compareFeatures():
    """Plot how adding each external feature to the baseline changes scores.

    The baseline scores are obtained from ``getBaseFeatsScores`` using only
    ``base_feats``.  Then, for every external feature, a
    ``DecisionTreeRegressor`` is fitted ``n`` times on the baseline features
    plus that single feature (75/25 split, fixed seed) and the mean train and
    test scores are recorded.  Results are passed to
    ``graph_barsForFeatures`` together with the baseline scores.
    """
    # Get the base table
    all_data = MainTable()
    df = all_data.getDB()

    base_feats = ['BOROUGH',
                  'BUILDING_AGE']

    external_feats = ['CRIMES',
                      'HI_ED',
                      'HIGH_SCHOOLS',
                      'BUS_STOPS',
                      'SUBWAY_STOPS',
                      'NUM_OF_PARKS',
                      'AREA_OF_PARKS',
                      'NOISE',
                      'HEALTH',
                      'GALLERIES',
                      'MUSEUMS']

    mean_train_score_b, mean_test_score_b = getBaseFeatsScores(df, base_feats)

    train_scores_dict = {}
    test_scores_dict = {}
    n = 5  # number of fits averaged per feature
    for feat in external_feats:
        # Build a fresh list each time.  Aliasing base_feats and appending to
        # it would make the features accumulate across iterations, so each
        # bar would no longer measure "baseline + this one feature".
        curr_feats = base_feats + [feat]

        # Split to Data and Actual results
        X = selectCols(df, curr_feats)
        y = df['SQR_FEET_PRICE']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42)

        tot_train_score = 0
        tot_test_score = 0
        for _ in range(n):
            regressor = DecisionTreeRegressor(min_impurity_decrease=200)
            regressor.fit(X_train, y_train)

            tot_train_score += regressor.score(X_train, y_train)
            tot_test_score += regressor.score(X_test, y_test)

        train_scores_dict[feat] = tot_train_score / n
        test_scores_dict[feat] = tot_test_score / n
    graph_barsForFeatures(train_scores_dict, test_scores_dict, 'Comparing features using Desicion Trees',
                          'Feature Name', mean_train_score_b, mean_test_score_b)
示例#4
0
    def __init__(self, save):
        """Load the main table and pre-split it into features and prices.

        Stores ``save`` as given, loads the table via ``MainTable().getDB()``,
        splits it into training and test sets with ``TEST_SIZE``, and keeps
        the ``SQR_FEET_PRICE`` column apart from the remaining columns for
        the training set, the test set and the full table.
        """
        price_col = 'SQR_FEET_PRICE'

        self.save = save
        self.data = MainTable().getDB()

        train_part, test_part = train_test_split(self.data, test_size=TEST_SIZE)

        # Everything except the price column
        self.training_features = removeCols(train_part, [price_col])
        self.test_features = removeCols(test_part, [price_col])

        # The price column only
        self.training_prices = selectCols(train_part, [price_col])
        self.test_prices = selectCols(test_part, [price_col])

        # Same separation applied to the unsplit table
        self.all_data_without_prices = removeCols(self.data, [price_col])
        self.all_data_only_prices = selectCols(self.data, [price_col])
示例#5
0
def paramTuning(file_name, param_values_list, param_name):
    """Plot KNN train/test scores for each value of a data parameter.

    For every value ``p`` the table ``MainTable(extra=file_name + str(p))``
    is loaded, split 75/25 with a fixed seed, and a 16-neighbour
    ``KNeighborsRegressor`` is fitted once.  Train and test scores keyed by
    ``p`` are passed to ``graph_paramTuning`` under the title ``'KNN'``.
    """
    scores_on_train = {}
    scores_on_test = {}

    for p in param_values_list:
        # Table built for this parameter value
        table = MainTable(extra=file_name + str(p)).getDB()

        # Feature matrix and target column
        X = selectCols(table, features)
        y = table['SQR_FEET_PRICE']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42)

        knn = neighbors.KNeighborsRegressor(n_neighbors=16)
        knn.fit(X_train, y_train)

        scores_on_train[p] = knn.score(X_train, y_train)
        scores_on_test[p] = knn.score(X_test, y_test)

    graph_paramTuning(scores_on_train, scores_on_test, 'KNN', param_name)
示例#6
0
        y = df['SQR_FEET_PRICE']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        regressor = neighbors.KNeighborsRegressor(n_neighbors=16)
        regressor.fit(X_train, y_train)

        train_score = regressor.score(X_train, y_train)
        test_score = regressor.score(X_test, y_test)

        train_scores_dict[p] = train_score
        test_scores_dict[p] = test_score
    graph_paramTuning(train_scores_dict, test_scores_dict, 'KNN', param_name)

if __name__ == '__main__':
    # Script entry point: score a KNN regressor (K = 16) on the base table,
    # then report 3-fold cross-validation scores on the training split.

    # Get the base table
    all_data = MainTable()
    df = all_data.getDB()

    # Split to Data and Actual results
    X = selectCols(df, features)
    y = df['SQR_FEET_PRICE']

    # Basic prediction with optimal K
    y_pred, test_score, train_score = predictionAndScore(X, y, 16)
    # Separator added before "Test score" so the two values do not run
    # together, matching the cross-validation message below.
    print("KNN: Training score: " + str(train_score) + ", Test score: " + str(test_score))

    # Cross validation (performed on the training split only)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    knn_regressor = neighbors.KNeighborsRegressor(n_neighbors=16)
    cv_results = cross_validate(knn_regressor, X_train, y_train, cv=3, return_train_score=True)
    print("KNN w/ Cross Validation: Training score: " + str(cv_results['train_score']) + ", Test score: " + str(cv_results['test_score']))