Exemplo n.º 1
0
    def __init__(self, save):
        """Load the main table and split it into train/test features and prices.

        Args:
            save: Stored unchanged on the instance as ``self.save``.
        """
        price_column = 'SQR_FEET_PRICE'

        self.save = save
        self.data = MainTable().getDB()

        # TEST_SIZE controls how much of the table is held out for testing.
        train_part, test_part = train_test_split(self.data, test_size=TEST_SIZE)

        # Apartment features are the table without the price column; the
        # price column alone is kept separately for each part.
        self.training_features = removeCols(train_part, [price_column])
        self.test_features = removeCols(test_part, [price_column])
        self.training_prices = selectCols(train_part, [price_column])
        self.test_prices = selectCols(test_part, [price_column])

        # The same features/prices separation, applied to the unsplit table.
        self.all_data_without_prices = removeCols(self.data, [price_column])
        self.all_data_only_prices = selectCols(self.data, [price_column])
Exemplo n.º 2
0
 def pushMuseumsDB(self, radius):
     """Compute the museum count per address for *radius* and write it to CSV.

     Stores the extracted museums in ``self.museums``, leaves the
     de-duplicated ADDRESS/MUSEUMS table in ``self.data`` and writes that
     table to ``museums_db<radius>.csv`` under ``DATASETS_PATH``.

     Args:
         radius: Forwarded to ``self._countMuseumsInRadius`` for every row
             and embedded in the output file name.
     """
     self.museums = self._extractMuseumsData()

     # Only the address and its coordinates are needed for the counting.
     self.data = Apartments.getInstance().getData()
     self.data = selectCols(self.data, ['ADDRESS', 'LAT', 'LON'])
     located = self.data

     # Row-wise count; the extra positional argument is the radius.
     museum_counts = located.apply(self._countMuseumsInRadius,
                                   axis=1,
                                   args=(radius, ))
     located['MUSEUMS'] = museum_counts

     address_counts = selectCols(located, ['ADDRESS', 'MUSEUMS'])
     self.data = address_counts.drop_duplicates()

     csv_path = f"{DATASETS_PATH}/museums_db{radius}.csv"
     self.data.to_csv(path_or_buf=csv_path, index=False)
Exemplo n.º 3
0
def parksParamTuning():
    """Plot mean decision-tree scores for every parks radius/area combination.

    For each (radius, area) pair the matching table is loaded through
    ``MainTable(extra=...)``, split 80/20 with a fixed ``random_state``, and a
    ``DecisionTreeRegressor`` is fitted and scored ``repeats`` times. The
    averaged train and test scores, keyed by a "radius/area" label, are
    passed to ``graph_paramTuning``.
    """
    radius_list = [0.5, 1]
    area_list = [100, 200]
    repeats = 5

    mean_train_by_label = {}
    mean_test_by_label = {}

    # Radius is the outer dimension and area the inner one.
    combos = ((radius, area) for radius in radius_list for area in area_list)
    for radius, area in combos:
        table = MainTable(extra=f"_parksRadius{radius}_area{area}")
        df = table.getDB()

        # Separate the model inputs from the value being predicted.
        inputs = selectCols(df, features)
        target = df['SQR_FEET_PRICE']

        split = train_test_split(inputs, target, test_size=0.2, random_state=42)
        X_train, X_test, y_train, y_test = split

        train_total = 0
        test_total = 0
        for _ in range(repeats):
            model = DecisionTreeRegressor(min_impurity_decrease=200)
            model.fit(X_train, y_train)

            train_total += model.score(X_train, y_train)
            test_total += model.score(X_test, y_test)

        label = f"radius {radius}\narea {area}"
        mean_train_by_label[label] = train_total / repeats
        mean_test_by_label[label] = test_total / repeats

    graph_paramTuning(mean_train_by_label,
                      mean_test_by_label,
                      'Tuning parks radius and area with Desicion Trees',
                      'Parks radius and area')
Exemplo n.º 4
0
def paramTuning(file_name, param_values_list, param_name):
    """Plot mean decision-tree train/test scores for each value of a parameter.

    For every value in *param_values_list* the table
    ``MainTable(extra=file_name + str(value))`` is loaded, split 75/25 with a
    fixed ``random_state``, and a ``DecisionTreeRegressor`` is fitted and
    scored ``n`` times. The averaged scores, keyed by the parameter value, are
    passed to ``graph_paramTuning``.

    Args:
        file_name: Prefix of the ``extra`` argument given to ``MainTable``;
            each parameter value is appended to it as a string.
        param_values_list: Parameter values to evaluate; each must be
            hashable because it is used as a dictionary key.
        param_name: Name of the tuned parameter, used in the graph title and
            passed as the last argument of ``graph_paramTuning``.
    """
    train_scores_dict = {}
    test_scores_dict = {}
    n = 5
    for p in param_values_list:
        # Get the table built for this parameter value
        all_data = MainTable(extra=file_name + str(p))
        df = all_data.getDB()

        # Split to Data and Actual results
        X = selectCols(df, features)
        y = df['SQR_FEET_PRICE']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

        tot_train_score = 0
        tot_test_score = 0
        for _ in range(n):
            regressor = DecisionTreeRegressor(min_impurity_decrease=200)
            regressor.fit(X_train, y_train)

            tot_train_score += regressor.score(X_train, y_train)
            tot_test_score += regressor.score(X_test, y_test)

        train_scores_dict[p] = tot_train_score / n
        test_scores_dict[p] = tot_test_score / n

    # The separating space before "with" was missing, which glued the
    # parameter name to the rest of the title.
    title = 'Tuning ' + param_name + ' with Desicion Trees'
    graph_paramTuning(train_scores_dict, test_scores_dict, title, param_name)
Exemplo n.º 5
0
 def loadMuseumsDB(self, radius):
     """Load the museum counts for *radius* from CSV into ``self.data``.

     The file ``museums_db<radius>.csv`` under ``DATASETS_PATH`` is read and
     reduced to one ADDRESS/MUSEUMS row per address (first occurrence kept).
     If the file does not exist, ``self.pushMuseumsDB(radius)`` is called
     instead.

     Args:
         radius: Identifies which CSV file to read; also forwarded to
             ``pushMuseumsDB`` when the file is missing.
     """
     csv_path = f"{DATASETS_PATH}/museums_db{radius}.csv"
     try:
         self.data = pd.read_csv(csv_path)
         address_counts = selectCols(self.data, ['ADDRESS', 'MUSEUMS'])
         self.data = address_counts.drop_duplicates(subset='ADDRESS',
                                                    keep='first')
     except FileNotFoundError:
         # No CSV for this radius: delegate to pushMuseumsDB.
         self.pushMuseumsDB(radius)
Exemplo n.º 6
0
def compareFeatures():
    """Plot how adding each external feature to the base features affects scores.

    The base features are scored first with ``getBaseFeatsScores``. Then, for
    every external feature, the same procedure is run on the base features
    plus that single feature. The per-feature mean train/test scores and the
    base scores are passed to ``graph_barsForFeatures``.
    """
    # Get the base table
    df = MainTable().getDB()

    base_feats = ['BOROUGH',
                  'BUILDING_AGE']

    external_feats = ['CRIMES',
                      'HI_ED',
                      'HIGH_SCHOOLS',
                      'BUS_STOPS',
                      'SUBWAY_STOPS',
                      'NUM_OF_PARKS',
                      'AREA_OF_PARKS',
                      'NOISE',
                      'HEALTH',
                      'GALLERIES',
                      'MUSEUMS']

    mean_train_score_b, mean_test_score_b = getBaseFeatsScores(df, base_feats)

    train_scores_dict = {}
    test_scores_dict = {}
    for feat in external_feats:
        # Build a new list for every feature. Appending to base_feats itself
        # would make the features accumulate from one iteration to the next,
        # so each bar would no longer reflect a single added feature.
        curr_feats = base_feats + [feat]

        # getBaseFeatsScores runs the same split / fit / score procedure for
        # whatever column list it is given, so it is reused here.
        mean_train_score, mean_test_score = getBaseFeatsScores(df, curr_feats)

        train_scores_dict[feat] = mean_train_score
        test_scores_dict[feat] = mean_test_score

    graph_barsForFeatures(train_scores_dict, test_scores_dict, 'Comparing features using Desicion Trees',
                          'Feature Name', mean_train_score_b, mean_test_score_b)
Exemplo n.º 7
0
def getBaseFeatsScores(df, base_feats):
    """Return the mean train and test scores of a decision tree on *base_feats*.

    The selected columns are split 75/25 with a fixed ``random_state`` against
    the 'SQR_FEET_PRICE' column, and a ``DecisionTreeRegressor`` is fitted and
    scored ``repeats`` times.

    Args:
        df: Table providing the feature columns and 'SQR_FEET_PRICE'.
        base_feats: Column names passed to ``selectCols`` to form the inputs.

    Returns:
        A ``(mean_train_score, mean_test_score)`` tuple averaged over the
        repeated fits.
    """
    repeats = 5

    # Separate the model inputs from the value being predicted.
    inputs = selectCols(df, base_feats)
    target = df['SQR_FEET_PRICE']

    split = train_test_split(inputs, target, test_size=0.25, random_state=42)
    train_X, test_X, train_y, test_y = split

    train_total = 0
    test_total = 0
    for _ in range(repeats):
        tree = DecisionTreeRegressor(min_impurity_decrease=200)
        tree.fit(train_X, train_y)

        train_total += tree.score(train_X, train_y)
        test_total += tree.score(test_X, test_y)

    return train_total / repeats, test_total / repeats