示例#1
0
    def train(self, model, save=False, make_chart=False):
        """
        Fit a grid-searched random forest and report its errors.

        Loads the input data through ``DataReader``, passes it through
        ``self.preprocess`` and ``self.split_data``, fits a ``GridSearchCV``
        over a ``RandomForestRegressor`` on the training split, prints the
        feature importances and error metrics, and optionally saves the
        best estimator and a real-vs-predicted chart.

        Parameters
        ----------
        model : object
            Currently unused: a ``RandomForestRegressor`` grid search is
            always fitted, whatever is passed here. The parameter is kept
            so existing callers keep working.
        save : bool
            When True, dump the best estimator found by the grid search to
            ``../models/<self.name>_2017.joblib``.
        make_chart : bool
            When True, plot every 5th row of the first 100 test rows
            (true vs. predicted) and save the figure as
            ``<self.name>_real_v_predicted``.

        Returns
        -------
        float, float, float
            The average cross-validated mean squared error (non-negative),
            the test mean absolute error, and the test mean squared error.
        """
        # Get and split the data.
        reader = DataReader()
        df = reader.create_input_data()
        df = self.preprocess(df)
        self.X_train, self.X_test, self.y_train, self.y_test = self.split_data(
            df)

        parameters = {
            'n_estimators': [1, 5, 10, 20, 30],
            'max_depth': [1, 5, 10],
        }
        self.model = GridSearchCV(RandomForestRegressor(), parameters, cv=10)
        # Train the model.
        self.model.fit(self.X_train, self.y_train)

        # Feature importance of the best estimator, one line per column.
        best_estimator = self.model.best_estimator_
        for column, importance in zip(self.X_train.columns,
                                      best_estimator.feature_importances_):
            print(column, importance)

        if save:
            joblib.dump(best_estimator,
                        "../models/" + self.name + "_2017.joblib")

        print("------------------------")
        # self.model is the GridSearchCV wrapper, so the grid search is
        # repeated inside each of the 8 outer folds.
        neg_mses = cross_val_score(estimator=self.model,
                                   X=self.X_train,
                                   y=self.y_train,
                                   scoring='neg_mean_squared_error',
                                   cv=8)
        # 'neg_mean_squared_error' scores are negated MSEs; take the
        # magnitude so the printed and returned values are a real MSE.
        cv_mse = abs(np.mean(neg_mses))

        # Predict once and reuse the result for every metric and the chart.
        predicted = self.model.predict(self.X_test)
        test_mae = mean_absolute_error(self.y_test, predicted)
        test_mse = mean_squared_error(self.y_test, predicted)
        print("Average CV Mean Squared Error: ", cv_mse)
        print("Testing Mean Absolute Error: ", test_mae)
        print("Testing MSE: ", test_mse)

        if make_chart:
            print("Generating Chart...")
            # Every 5th row of the first 100 test rows. The number of points
            # is derived from the data, so a test split shorter than 100
            # rows still plots instead of failing on a shape mismatch.
            sample = slice(0, 100, 5)
            predicted_sample = predicted[sample]
            positions = np.arange(len(predicted_sample))

            plt.style.use('dark_background')
            fig, ax = plt.subplots(nrows=1, ncols=1)
            ax.set_ylabel('HDI')
            ax.set_xlabel("Municipality Codmun ID")
            ax.set_title(self.name + ' Real vs Predicted')
            green, = ax.plot(positions,
                             self.y_test[sample],
                             'g',
                             label='True')
            red, = ax.plot(positions,
                           predicted_sample,
                           'r',
                           label='Predicted')
            ax.set_xticks(positions)
            x_labels = self.X_test.iloc[sample]['codmun'].tolist()
            ax.set_xticklabels([str(int(label)) for label in x_labels],
                               rotation='vertical')
            plt.legend(handles=[green, red], labels=["True", "Predicted"])
            plt.tight_layout()
            fig.savefig(self.name + "_real_v_predicted")
            for value, label in zip(predicted_sample, x_labels):
                print(value, label)
            print(x_labels, predicted_sample)

        return cv_mse, test_mae, test_mse
示例#2
0
        print("Please add --train or --test after py Regressor.py")
        options = None

    # Run the mode selected on the command line.
    if options == "--train":
        # Train without saving the model or producing a chart, then print
        # the three values returned by `train`.
        # NOTE(review): a second Regressor instance is passed as the `model`
        # argument of `train` -- confirm that this is intended.
        r = Regressor("Random Forest", load_model=False)
        mod = Regressor("Random Forest", load_model=False)
        cv, ma, mse = r.train(mod, save=False, make_chart=False)
        print(cv, ma, mse)

    elif options == "--test":
        # The model name is given as two arguments, e.g.
        # "Random Forest_2017" or "Random Forest_2016".
        model_name = f"{sys.argv[2]} {sys.argv[3]}"
        # The text after the last underscore is parsed as the year.
        year = int(model_name.rsplit("_", 1)[-1])
        r = Regressor(model_name, load_model=True)
        reader = DataReader()
        df = reader.create_input_data()
        predictions = r.predict(df, year)
        print("Actual || Predicted")
        # Print each row's 'hdi' value next to the matching prediction.
        for i in range(len(predictions)):
            actual = df.iloc[i]['hdi']
            print(actual, "||", predictions[i])

######Training Code#########

#cv_error = []
#testing_ma_error = []
#testing_mse = []
#mod = RandomForestRegressor(bootstrap=True, criterion='mae', n_estimators=100)
#mod = RandomForestRegressor()
#r = Regressor("Random Forest_2017", load_model=True)
#importances = r.model.feature_importances_
#reader = DataReader()