def make_ols_sklearn(X, y, test_size=0.20, fit_intercept=False, standardize=False):
    """
    
    Makes an OLS in sklearn
    
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    if standardize:
        ss = StandardScaler()
        ss.fit(X_train)
        X_train = ss.transform(X_train)
        X_test = ss.transform(X_test)
    ols = LinearRegression(fit_intercept=fit_intercept)  # 'normalize' was removed from LinearRegression in recent scikit-learn
    ols.fit(X_train, y_train)
    train_score = ols.score(X_train, y_train)
    test_score = ols.score(X_test, y_test)
   
    # note: cross-validation is scored on the full, unscaled X and y
    cvmae_5 = -np.mean(cross_val_score(ols, X, y, cv=5, scoring='neg_mean_absolute_error'))
    cvmae_10 = -np.mean(cross_val_score(ols, X, y, cv=10, scoring='neg_mean_absolute_error'))
    print(f"train R2 score = {train_score}")
    print(f"test R2 score = {test_score}\n")
   
    print(f"cv5 MSE score = {cvmae_5}")
    print(f"cv10  MSE score = {cvmae_10}")

    return ols
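
The function above assumes the usual scikit-learn and NumPy imports; below is a minimal usage sketch on a small synthetic dataset (the data and variable names here are made up for illustration, not from the original code).

import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# synthetic regression data just to exercise the helper
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 3))
y_demo = X_demo @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.1, size=200)

fitted_ols = make_ols_sklearn(X_demo, y_demo, fit_intercept=True, standardize=True)
print(fitted_ols.coef_)
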
Example No. 2
    def cleanRP_SG(self):
        NoneOrZero = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                      'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'Alley',
                      'Fence', 'GarageType', 'GarageQual',
                      'GarageCond', 'GarageFinish', 'GarageCars',
                      'GarageArea', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'PoolQC',
                      'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF']
        mode = ['Electrical', 'Exterior1st', 'Exterior2nd', 'FireplaceQu', 'Functional',
                'KitchenQual', 'MSZoning', 'SaleType', 'Utilities']
        mean = ['TotalBsmtSF']
        columns_with_missing_data = [name for name in self.all.columns
                                     if np.sum(self.all[name].isnull()) != 0]
        columns_with_missing_data.remove('SalePrice')
        for column in columns_with_missing_data:
            col_data = self.all[column]
            # print('Cleaning ' + str(np.sum(col_data.isnull())) + ' data entries for column: ' + column)
            if column == 'LotFrontage':
                # impute missing LotFrontage from a regression on log(LotArea)
                ols = linear_model.LinearRegression()
                my_ind = self.all[self.all['LotFrontage'].isnull()].index

                y_train = self.all['LotFrontage'].loc[self.all['LotFrontage'].isnull() == False].values
                x_train = self.all['LotArea'].loc[self.all['LotFrontage'].isnull() == False].values
                x_train = np.log(x_train)
                ols.fit(x_train.reshape(-1, 1), y_train)  # sklearn expects a 2-D feature array, hence the reshape
                for i in my_ind:
                    print(np.log(self.all['LotArea'].loc[i]) * ols.coef_)  # debug: regression contribution
                    self.all.loc[i, 'LotFrontage'] = ((np.log(self.all.loc[i, 'LotArea']) * ols.coef_) + ols.intercept_)[0]

            # fill missing GarageYrBlt with a sentinel year (1871)
            elif column == 'GarageYrBlt':
                missing_index = self.all[self.all['GarageYrBlt'].isnull()].index
                for i in missing_index:
                    self.all.loc[i, 'GarageYrBlt'] = 1871
            elif column in mode:
                # fill with the column mode; drop the [0] if mode() misbehaves
                self.all[column] = [self.all[column].mode()[0] if pd.isnull(x) else x for x in self.all[column]]
            elif column in mean:
                self.all[column].fillna(self.all[column].mean(), inplace=True)
            elif column in NoneOrZero:
                if col_data.dtype == 'object':
                    no_string = 'None'
                    self.all[column] = [no_string if pd.isnull(x) else x for x in self.all[column]]
                else:
                    self.all[column] = [0 if pd.isnull(x) else x for x in self.all[column]]
            else:
                print('Uh oh!!! No cleaning strategy for: ' + column)
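
The LotFrontage branch above can also be written without the row-by-row loop. A minimal standalone sketch, assuming a DataFrame named all_df with LotFrontage and LotArea columns (the function and variable names here are illustrative, not from the original class).

import numpy as np
from sklearn import linear_model

def impute_lot_frontage(all_df):
    """Fill missing LotFrontage from a linear fit on log(LotArea)."""
    known = all_df['LotFrontage'].notnull()
    ols = linear_model.LinearRegression()
    # fit on the rows where LotFrontage is observed
    ols.fit(np.log(all_df.loc[known, ['LotArea']]), all_df.loc[known, 'LotFrontage'])
    # predict the missing rows in one vectorized call
    missing = ~known
    all_df.loc[missing, 'LotFrontage'] = ols.predict(np.log(all_df.loc[missing, ['LotArea']]))
    return all_df
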
Example No. 3
def regression(df):
    # random 75/25 train/test split
    msk = np.random.rand(len(df)) < 0.75
    train = df[msk]
    test = df[~msk]

    # regress the injury/fatality count on all remaining columns
    ols = sm.OLS(train['Number of total People injured and killed'],
                 train.drop('Number of total People injured and killed', axis=1))
    result = ols.fit()

    return result.summary()
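
Note that sm.OLS does not add an intercept on its own, so the design matrix above is just the remaining columns. Below is a minimal sketch of the same fit with an explicit constant term; it is a variant for illustration, not part of the original example.

import numpy as np
import statsmodels.api as sm

def regression_with_intercept(df, target='Number of total People injured and killed'):
    msk = np.random.rand(len(df)) < 0.75
    train = df[msk]
    # sm.add_constant appends a column of ones so the model estimates an intercept
    X = sm.add_constant(train.drop(target, axis=1))
    result = sm.OLS(train[target], X).fit()
    return result.summary()
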
Example No. 5
    def _regression(self):
        '''
        generate a summary table about the parameters from the regression analysis
        '''
        mask = np.random.rand(len(self._data)) < 0.75
        train = self._data[mask]
        test = self._data[~mask]
        ols = sm.OLS(train['TOTAL FATALITIES'],
                     train.drop('TOTAL FATALITIES', axis=1))
        result = ols.fit()
        return result.summary()

    def regression(self):
        '''
        generate a summary table about the parameters from the regression analysis
        '''
        df = self._data
        msk = np.random.rand(len(df)) < 0.75
        train = df[msk]
        test = df[~msk]

        ols = sm.OLS(train['Total Fatalities'], train.drop('Total Fatalities', axis=1))
        result = ols.fit()

        print(result.summary())
Example No. 7
    def sg_simpleLM(self):
        self.test_train_split()

        x = np.asarray(self.x_train)
        # x = sm.add_constant(x)
        ols = linear_model.LinearRegression()
        # ols = sm.OLS(np.asarray(self.y_train),)
        model = ols.fit(x, np.asarray(self.y_train))

        sm_model_pred = model.predict(self.x_test)
        self.plot_results(sm_model_pred)
        print('RMSLE from Kaggle: ' +
              str(self.rmsle(y_pred=sm_model_pred, y_test=self.y_test)))
        # compute the cross-validated RMSE once, then both report and store it
        cv_rmse = self.rmse_cv(model, self.x_train, self.y_train)
        print(cv_rmse)
        self.results_dict['simpleLM'] = cv_rmse
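
The rmse_cv helper used above is not shown in this snippet. A plausible minimal implementation, assuming it wraps scikit-learn cross-validation, is sketched below; this is a guess at the author's helper, not their actual code.

import numpy as np
from sklearn.model_selection import cross_val_score

def rmse_cv(model, X, y, cv=5):
    """Cross-validated root-mean-squared error."""
    neg_mse = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
    return np.sqrt(-neg_mse).mean()
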
Example No. 8
def make_ols(df, x_columns, target='price'):
    """Pass in a DataFrame & your predictive columns to return an OLS regression model."""
    # set your x and y variables
    X = df[x_columns]
    y = df[target]
    # pass them into the statsmodels OLS class
    ols = sm.OLS(y, X)
    # fit your model
    model = ols.fit()
    # display the model summary
    display(model.summary())
    # plot the residuals as a Q-Q plot
    fig = sm.graphics.qqplot(model.resid, dist=stats.norm, line='r', alpha=.65, fit=True, markerfacecolor="y")
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    # return the fitted model for later use
    return model
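
A quick usage sketch for make_ols, assuming the same notebook environment and imports the function relies on (pandas, statsmodels, scipy.stats, matplotlib, IPython display); the DataFrame and column names here are placeholders invented for illustration.

import pandas as pd

# hypothetical example data; the column names are made up
example_df = pd.DataFrame({
    'sqft': [1200, 1500, 1700, 2100, 2500],
    'bedrooms': [2, 3, 3, 4, 4],
    'price': [250000, 310000, 330000, 410000, 470000],
})
fitted = make_ols(example_df, ['sqft', 'bedrooms'], target='price')
print(fitted.params)
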
# standardize the features, then fit a scikit-learn linear regression
sc = StandardScaler()
X_train_S = sc.fit_transform(X_train)
X_test_S = sc.transform(X_test)

model_lr = LinearRegression()
model_lr.fit(X_train_S, y_train)

y_pred_lr = model_lr.predict(X_test_S)

print(model_lr.intercept_)
print(model_lr.coef_)

# refit with statsmodels to get p-values and a full summary
# (note: no constant is added here, so this OLS has no intercept term)
ols = sm.OLS(y_train, X_train_S)
lr = ols.fit()
pvalues = lr.pvalues
print(lr.summary())

# repeat the pipeline on the full dataset
yF = fulldata_NMiss['Premium']
XF = fulldata_NMiss[['ins_Cancelation', 'Ticket', 'Accident']]

XF_train, XF_test, yF_train, yF_test = train_test_split(XF, yF, test_size=0.25, random_state=100)
sc = StandardScaler()
XF_train_S = sc.fit_transform(XF_train)
XF_test_S = sc.transform(XF_test)

model_lr = LinearRegression()
model_lr.fit(XF_train_S, yF_train)
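
To check how this second model does, a minimal follow-up sketch that scores it on the held-out split; this step is not in the original snippet and assumes sklearn.metrics is available.

from sklearn.metrics import mean_absolute_error, r2_score

# evaluate the refit model on the standardized test features
yF_pred = model_lr.predict(XF_test_S)
print('test R2 :', r2_score(yF_test, yF_pred))
print('test MAE:', mean_absolute_error(yF_test, yF_pred))
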