X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Fitting the linear model
ML_Regression = LinearRegression()
ML_Regression.fit(X_train, Y_train)

# Predicting the results
Y_pred = ML_Regression.predict(X_test)

# Backward elimination: prepend a column of ones so column 0 is the intercept
X = np.append(arr=np.ones((50, 1)), values=X, axis=1)

X_opt = X[:, [0, 1, 2, 3, 4, 5]]
OLS_Regression = sm.OLS(endog=Y, exog=X_opt).fit()
OLS_Regression.summary()

X_opt = X[:, [0, 1, 3, 4, 5]]
OLS_Regression = sm.OLS(endog=Y, exog=X_opt).fit()
OLS_Regression.summary()

X_opt = X[:, [0, 3, 4, 5]]
OLS_Regression = sm.OLS(endog=Y, exog=X_opt).fit()
OLS_Regression.summary()

X_opt = X[:, [0, 3, 5]]
OLS_Regression = sm.OLS(endog=Y, exog=X_opt).fit()
OLS_Regression.summary()
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
print(y_pred)

# Backward elimination
import statsmodels.api as sm  # sm.OLS lives in statsmodels.api, not statsmodels.formula.api

# Prepend a column of ones and select predictors from the augmented X,
# so the intercept column actually enters the model (the original built X but never used it)
X = np.append(arr=np.ones((14, 1)).astype(int), values=sonveriler.iloc[:, :-1], axis=1)
X_l = X[:, [0, 1, 2, 3, 4, 5, 6]]
r_ols = sm.OLS(endog=sonveriler.iloc[:, -1:], exog=X_l)
r = r_ols.fit()
print(r.summary())

# Drop the first feature column and refit
sonveriler = sonveriler.iloc[:, 1:]
X = np.append(arr=np.ones((14, 1)).astype(int), values=sonveriler.iloc[:, :-1], axis=1)
X_l = X[:, [0, 1, 2, 3, 4, 5]]
r_ols = sm.OLS(endog=sonveriler.iloc[:, -1:], exog=X_l)
r = r_ols.fit()
print(r.summary())

x_train = x_train.iloc[:, 1:]
# Splitting the data set into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fitting Multiple Linear Regression to the training set
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the test result
y_pred = regressor.predict(X_test)

# Building the optimal model using the backward elimination method
# X = np.append(arr=X, values=np.ones((50, 1)).astype(int), axis=1)  # would append the ones at the end
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)    # prepend the intercept column instead

X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
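# The five manual refits above repeat one fixed recipe, so the search can be
# automated. A minimal sketch, assuming X already carries the intercept column
# in position 0 and 0.05 as the significance level (the helper name is
# illustrative, not part of the original code):
import numpy as np
import statsmodels.api as sm

def backward_elimination(X, y, sl=0.05):
    """Drop the predictor with the highest p-value until all p-values <= sl."""
    cols = list(range(X.shape[1]))
    while cols:
        model = sm.OLS(endog=y, exog=X[:, cols]).fit()
        pvals = np.asarray(model.pvalues)
        worst = int(pvals.argmax())   # position of the weakest predictor
        if pvals[worst] <= sl:
            break                     # every remaining predictor is significant
        cols.pop(worst)               # eliminate it and refit
    return X[:, cols], cols

# e.g. X_opt, kept_columns = backward_elimination(X, y)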
r2 = LinearRegression()
r2.fit(x_train, y_train)
y_prediction2 = r2.predict(x_test)

# Prepend the intercept column and select predictors from the augmented X,
# so the column of ones actually enters the model (column 0 below)
X = np.append(arr=np.ones((22, 1)).astype(int), values=veri, axis=1)
X_list = X[:, [0, 1, 2, 3, 4, 5, 6]]
r = sm.OLS(endog=boy, exog=X_list).fit()
print(r.summary())

# Drop the predictor with the highest p-value (column 4 of veri, i.e. column 5 of X)
X_list = X[:, [0, 1, 2, 3, 4, 6]]
r = sm.OLS(endog=boy, exog=X_list).fit()
print(r.summary())
oneHotDev = pd.concat([oneHotDev, dev[featuresNum], dev.iloc[:, -1]], axis=1)
oneHotTest = pd.concat([oneHotTest, test[featuresNum]], axis=1)

# reduce features by using statsmodels
oneHotTrainSM = pd.concat([pd.DataFrame(np.ones(len(oneHotTrain)), columns=['bias']), oneHotTrain], axis=1)
oneHotDevSM = pd.concat([pd.DataFrame(np.ones(len(oneHotDev)), columns=['bias']), oneHotDev], axis=1)
oneHotTestSM = pd.concat([pd.DataFrame(np.ones(len(oneHotTest)), columns=['bias']), oneHotTest], axis=1)

# initialize variables
pVal = 0.15
count = 0

# reduce features by backward elimination on p-values
while count < len(oneHotTrainSM.columns) - 1:
    count += 1
    regOLS = sm.OLS(endog=oneHotTrainSM.iloc[:, -1], exog=oneHotTrainSM.iloc[:, :-1]).fit()
    if regOLS.pvalues.max() > pVal:
        worst = regOLS.pvalues.idxmax()  # column with the highest p-value
        oneHotTrainSM = oneHotTrainSM.drop([worst], axis=1)
        oneHotDevSM = oneHotDevSM.drop([worst], axis=1)
        oneHotTestSM = oneHotTestSM.drop([worst], axis=1)
    else:
        break

if 'bias' in oneHotTrainSM.columns.values:
    oneHotTrainSM = oneHotTrainSM.drop(['bias'], axis=1)
    oneHotDevSM = oneHotDevSM.drop(['bias'], axis=1)
    oneHotTestSM = oneHotTestSM.drop(['bias'], axis=1)

# reassign training, dev and test data
oneHotTrain = oneHotTrainSM
oneHotDev = oneHotDevSM
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm  # sm.OLS lives in statsmodels.api

data = pd.read_csv("C:/Users/GEU/Downloads/data2.csv")
X = data.iloc[:, :-1].values
Y = data.iloc[:, 4].values

labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])  # old scikit-learn API; removed in newer releases
X = onehotencoder.fit_transform(X).toarray()

# Prepend the intercept column (note: keeping every dummy column plus the
# intercept makes the design matrix collinear; dropping one dummy avoids that)
X = np.append(np.ones([30, 1]).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5, 6]]
print(X_opt)
reg_ols = sm.OLS(endog=Y, exog=X_opt).fit()
print(reg_ols.summary())

X_train, X_test, Y_train, Y_test = train_test_split(X_opt, Y, test_size=1 / 3, random_state=0)
m = LinearRegression()
m.fit(X_train, Y_train)
print(m.score(X_test, Y_test))
"""ADDING b0""" data_length = len(df_train); df_train['b0'] = [1]*data_length; """BACKWARD ELIMINATION""" max_p_value = 1; non_significant_column = None; eliminator = None; num = 0 sm_result = None while max_p_value > 0.05: if not non_significant_column == None: del df_train[non_significant_column]; del df_test[non_significant_column]; sm_result = sm.OLS(endog = df_train["SalePrice"], exog = df_train.loc[:, df_train.columns != "SalePrice"]).fit(); p_values = sm_result.pvalues; max_p_value = np.amax(p_values) i = np.where(p_values == max_p_value); non_significant_column = list(p_values.index[i])[0]; # remove b0 del df_train['b0']; """ LOGISTIC REGRESSION """ regressor = LogisticRegression(random_state=0); regressor.fit(df_train.loc[:, df_train.columns != "SalePrice"], df_train["SalePrice"]); prediction = regressor.predict(df_test.loc[:, df_test.columns != "SalePrice"]); # ################# # SUBMIT ANSWER # #################
regression.fit(x_train2, y_train2)
boy_pred = regression.predict(x_test2)
#print(y_test2)
#print(boy_pred)

#----------------- backward elimination ----------------------------
# An int array of ones is created and joined column-wise onto newData.
# The point: the multiple linear regression model is y = b0 + b1*x1 + b2*x2 + b3*x3 + ...
# We have the independent variables but no b0 constant in the design matrix, so we
# add a column of ones to represent it; the column holds ones because b0 multiplies 1.
X = np.append(arr=np.ones((22, 1)).astype(int), values=newData, axis=1)
#print(X)

# intercept (column 0) plus all predictors, taken as an array so it is easy to modify later
X_l = X[:, [0, 1, 2, 3, 4, 5, 6]]
print(type(X_l))

boyArray = boyDFrame.iloc[:, 0:1].values
# Produces the OLS report (covariance, variance, p-values, etc.) of the predictors
# against the height (boy) data; fit() must be called for the report to be computed.
result_OLS = sm.OLS(endog=boyArray, exog=X_l)
r = result_OLS.fit()
print(r.summary())  # summary of the results; eliminate the predictor with the largest p-value

# Here the first predictor is eliminated; elimination continues like this,
# usually until every remaining p-value is below 0.05.
X_l2 = X[:, [0, 2, 3, 4, 5, 6]]
result_OLS2 = sm.OLS(endog=boyArray, exog=X_l2)
r2 = result_OLS2.fit()
print(r2.summary())
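# Aside: statsmodels ships a helper that does the same job as the hand-built
# column of ones above; a one-line sketch, assuming newData holds the predictors:
import statsmodels.api as sm
X = sm.add_constant(newData)  # equivalent to prepending np.ones((len(newData), 1))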
y_train = sc_y.fit_transform(y_train)"""

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# building the optimal model using Backward Elimination
import statsmodels.api as sm
X = np.append(np.ones((50, 1)).astype(int), X, 1)

X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

# remove the predictor with the highest p-value above 5% (index 2)
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

# remove the predictor with the highest p-value above 5% (index 1)
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

# remove the predictor with the highest p-value above 5% (index 4)
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())
def _fit_ols(y, x, **kwargs):
    """Build one basic OLS model per column of y (models are returned unfitted)."""
    # mixed effects code is obtained here:
    # http://stackoverflow.com/a/22439820/1167475
    return [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns]
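# A hypothetical usage sketch for _fit_ols: `responses` and `predictors` are
# illustrative names, and the smf alias is assumed to expose OLS (e.g. via
# `import statsmodels.api as smf`). The helper leaves its models unfitted,
# so each one still needs .fit():
import numpy as np
import pandas as pd
import statsmodels.api as smf

responses = pd.DataFrame({"a": np.random.rand(20), "b": np.random.rand(20)})
predictors = smf.add_constant(np.random.rand(20, 3))

results = [m.fit() for m in _fit_ols(responses, predictors)]
print(results[0].params)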
def ols_summary(ones_length, x_dataset, y_dataset, columns):
    # Prepend the intercept column of ones, then select the requested columns
    # from the augmented matrix (the original passed x_dataset and never used B0)
    B0 = np.append(arr=np.ones((ones_length, 1)).astype(int), values=x_dataset, axis=1)
    ols = sm.OLS(endog=y_dataset, exog=B0[:, columns]).fit()
    return ols.summary()
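# A hypothetical call, mirroring the elimination steps used throughout this
# section; X_features and y are illustrative names, ones_length must equal the
# number of rows, and column 0 is the prepended intercept:
print(ols_summary(50, X_features, y, [0, 1, 2, 3, 4, 5]))  # all predictors
print(ols_summary(50, X_features, y, [0, 3, 5]))           # after two eliminations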
#-----------------------------------------------------------
##### 6- Build Optimal model using Backward Elimination #####
#-----------------------------------------------------------
import statsmodels.api as sm

# add a column of 1s at the beginning of the matrix
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)

# X_opt will hold the matrix of variables that have a high impact on the profit
X_opt = X[:, [0, 1, 2, 3, 4, 5]]                   # copy entire X
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()  # fit the model with all possible predictors
regressor_OLS.summary()                            # view the summary and p-values

# remove the independent variable with the highest p-value (P>|t|) above the 0.05 = 5% level
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

# remove the independent variable with the highest p-value (P>|t|) above the 0.05 = 5% level
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

# remove the independent variable with the highest p-value (P>|t|) above the 0.05 = 5% level
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
"""ADDING b0""" data_length = len(X_train); X_train['b0'] = [1]*data_length; """BACKWARD ELIMINATION""" max_p_value = 1; non_significant_column = None; eliminator = None; num = 0 sm_result = None while max_p_value > 0.05: if not non_significant_column == None: del X_train[non_significant_column]; del X_test[non_significant_column]; sm_result = sm.OLS(endog = y_train, exog = X_train.loc[:, X_train.columns != "Survived"]).fit(); p_values = sm_result.pvalues; max_p_value = np.amax(p_values) i = np.where(p_values == max_p_value); non_significant_column = list(p_values.index[i])[0]; # remove b0 del X_train['b0']; """ KNN """ from sklearn.neighbors import KNeighborsClassifier classifier = KNeighborsClassifier(n_neighbors = 5, metric= 'minkowski', p =2) classifier.fit(X_train.loc[:, X_train.columns != "Survived"], y_train) prediction = classifier.predict(X_test.loc[:, X_test.columns != "Survived"])
plt.plot(y_test, color='red')

# USE BACKWARD ELIMINATION METHOD
import statsmodels.api as sm

# prepend a column of ones to X
X1 = np.append(arr=np.ones((len(dataset.index), 1)).astype(int), values=X, axis=1)

# step 1: the significance level is 0.05
# step 2: fit the full model with all possible predictors/variables
X1_optimal = X1[:, [0, 1, 2, 3, 4, 5]]
# use the ordinary least squares regressor
regressor_OLS = sm.OLS(endog=y, exog=X1_optimal).fit()
regressor_OLS.summary()

# steps 3, 4, 5: find the variable with the highest p-value, remove it from the model, and fit again
# x2 has the highest p-value, index 2
X1_optimal = X1[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X1_optimal).fit()
regressor_OLS.summary()

# x1 has the highest p-value, index 1
X1_optimal = X1[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X1_optimal).fit()
regressor_OLS.summary()
if deviance < dist.chi2.ppf(0.95, degree_of_freedom):
    print('test_regression_c is rejected for best_regression_b')

# Finance
# a)
r_royal = helper.yf_log_yield_extractor('./data/royalbank_monthly.csv', number_datapoint=30)
r_royal.to_excel('./output/log_yield_royalbank.xlsx')

# b)
plt.scatter([x for x in range(30)], r_royal['Yield']**2)
plt.title('Yield squared vs Time')

# c)
plt.scatter([x for x in range(30)], r_royal['Yield'])
plt.title('Yield vs Time')

mu = np.mean(r_royal['Yield'])
y = (r_royal['Yield'][:29] - mu)**2
x = (r_royal['Yield'][1:30] - mu)**2
x = sti.add_constant(x)
regression = stt.OLS(list(y), np.array(x))
results = regression.fit()
results.summary()

h31 = results.predict((1, y[0]))[0]
h32 = results.predict((1, h31))[0]
x[:, 3] = labelencoder_X.fit_transform(x[:, 3])
onehotencoder = OneHotEncoder(categorical_features=[3])  # old scikit-learn API; removed in newer releases
x = onehotencoder.fit_transform(x).toarray()

# Avoiding the dummy variable trap
x = x[:, 1:]

# Splitting into training and test data sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer releases
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)  # 20% test / 80% train

# Fitting Multiple Linear Regression to the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(x_test)

# Backward elimination
import statsmodels.api as sm
x = np.append(arr=np.ones((50, 1)).astype(int), values=x, axis=1)

x_opt = x[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()  # fit with all possible predictors
regressor_OLS.summary()

x_opt = x[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=x_opt).fit()
regressor_OLS.summary()
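# The categorical_features argument above only exists in old scikit-learn
# releases; a sketch of the modern equivalent using ColumnTransformer
# (the sparse_output argument was called sparse before scikit-learn 1.2).
# The LabelEncoder step becomes unnecessary, since the modern OneHotEncoder
# accepts string categories directly:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer([("cat", OneHotEncoder(sparse_output=False), [3])],
                       remainder="passthrough")
x = ct.fit_transform(x)  # the dummy columns come first in the output
x = x[:, 1:]             # still drop one dummy to avoid the dummy variable trap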
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer releases
X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(
    X_new, Y_new, test_size=0.2, random_state=0)

# fitting the multiple linear regression model
regressor.fit(X_train_new, Y_train_new)

# predicting the test set results
Y_pred_new = regressor.predict(X_test_new)

# backward elimination regression
import statsmodels.api as sm
X_new = np.append(arr=np.ones((50, 1)).astype(int), values=X_new, axis=1)
X_optimal = X_new[:, [0, 1, 2, 3, 4, 5]]
regressorfromols = sm.OLS(endog=Y_new, exog=X_optimal).fit()  # parameters are the dependent and independent variables
regressorfromols.summary()  # show the summary, including p-values

# note: each step below re-indexes the previous X_optimal, so the
# column indices are relative to the matrix from the step before
X_optimal = X_optimal[:, [0, 2, 3, 4]]
regressorfromols = sm.OLS(endog=Y_new, exog=X_optimal).fit()
regressorfromols.summary()

X_optimal = X_optimal[:, [0, 1, 3]]
regressorfromols = sm.OLS(endog=Y_new, exog=X_optimal).fit()
regressorfromols.summary()

X_optimal = X_optimal[:, [0, 1]]
regressorfromols = sm.OLS(endog=Y_new, exog=X_optimal).fit()
regressorfromols.summary()
# Now we will split the data to train and test for prediction:
# test_size=0.2 sends 20% of the data to the test set, the rest to training;
# random_state=0 makes the split reproducible
training_x, test_x, training_y, test_y = train_test_split(real_x, real_y, test_size=0.2, random_state=0)

# now we will fit the regression to the training data
MLR = LinearRegression()
MLR.fit(training_x, training_y)

# now we will predict
pred_y = MLR.predict(test_x)
pred_y   # predicted values
# now you can compare the predicted values with the exact values
test_y   # real values

MLR.coef_       # the coefficient values
MLR.intercept_  # the intercept value

# now to calculate the value through the formula
# y = b0 + b1x1 + b2x2 + ... + bnxn  (we add x0 = 1 so that b0 becomes the coefficient of the ones column)
real_x = np.append(arr=np.ones((50, 1)).astype(int), values=real_x, axis=1)

x_opt = real_x[:, [0, 1, 2, 3, 4, 5]]
reg_OLS = sm.OLS(endog=real_y, exog=x_opt).fit()
reg_OLS.summary()  # if a p-value is greater than 0.05, remove that column index

# we remove index 2 from the list; its p-value is above 0.05
x_opt = real_x[:, [0, 1, 3, 4, 5]]
reg_OLS = sm.OLS(endog=real_y, exog=x_opt).fit()
reg_OLS.summary()
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

import statsmodels.api as sm
x = np.append(arr=np.ones((50, 1)).astype(int), values=x, axis=1)

x_opt = x[:, [0, 1, 2, 3, 4, 5]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()

x_opt = x[:, [0, 1, 3, 4, 5]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()

x_opt = x[:, [0, 3, 4, 5]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()

x_opt = x[:, [0, 3, 5]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()

x_opt = x[:, [0, 3]]
regressor_ols = sm.OLS(endog=y, exog=x_opt).fit()
regressor_ols.summary()
from sklearn.linear_model import LinearRegression
linerModel = LinearRegression()
linerModel.fit(X_train, y_train)
predictVal = linerModel.predict(X_test)
#print("============Predicted Value==============")
#print(predictVal)

# Build the optimal model using backward elimination
import statsmodels.api as sm

# The model y = b0 + b1x1 + b2x2 + ... + bnxn is rewritten as
# y = b0x0 + b1x1 + ... + bnxn with x0 = 1, so we prepend a column of ones.
#X = np.append(X, np.ones((48, 1)).astype(int), axis=1)  # would append in the last position
X = np.append(np.ones((48, 1)).astype(int), X, axis=1)   # prepend the ones column instead

xOpt = X[:, [0, 1, 2, 3, 4, 5]]
regressorOls = sm.OLS(endog=y, exog=xOpt).fit()
# See the summary; if P > 0.05, eliminate the predictor with the highest p-value
print(regressorOls.summary())

# Eliminate the predictor at index 2: it has p = 0.959
xOpt = X[:, [0, 1, 3, 4, 5]]
regressorOls = sm.OLS(endog=y, exog=xOpt).fit()
print(regressorOls.summary())

# Now remove the next weakest predictor: it has p = 0.897
xOpt = X[:, [0, 3, 4, 5]]
regressorOls = sm.OLS(endog=y, exog=xOpt).fit()
print(regressorOls.summary())

# Backward elimination continues
xOpt = X[:, [0, 3, 5]]
regressorOls = sm.OLS(endog=y, exog=xOpt).fit()
print(regressorOls.summary())
index = aff.index(max_prob)
print("percentage of total women actually had an affair is : ", affair[1])
print("Prediction of women : ", index)

# optimal model
import statsmodels.api as sm
features = np.append(arr=np.ones((6366, 1)).astype(int), values=features, axis=1)

features_opt = features[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

features_opt = features[:, [0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

features_opt = features[:, [0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

features_opt = features[:, [0, 6, 7, 8, 9, 10, 11, 12, 13, 15]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()

features_opt = features[:, [0, 11, 12, 13, 15]]
regressor_OLS = sm.OLS(endog=labels, exog=features_opt).fit()
regressor_OLS.summary()
# ====> training and test set split
startup_data_train, startup_data_test, profit_train, profit_test = train_test_split(startup_data, profit, test_size=0.2)

# ====> linear regression model
regressor = LinearRegression()
regressor.fit(startup_data_train, profit_train)

# ====> predict the test results
profit_predict = regressor.predict(startup_data_test)

# ====> building the optimal model using backward elimination
startup_data = np.append(arr=np.ones((50, 1)).astype(int), values=startup_data, axis=1)

startup_data_opt = startup_data[:, [0, 1, 2, 3, 4, 5]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()

startup_data_opt = startup_data[:, [0, 1, 3, 4, 5]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()

startup_data_opt = startup_data[:, [0, 3, 4, 5]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()

startup_data_opt = startup_data[:, [0, 3, 5]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()

startup_data_opt = startup_data[:, [0, 3]]
regressor_ols = smapi.OLS(profit, startup_data_opt).fit()
regressor_summary = regressor_ols.summary()
                             formula, density * u.grams / u.milliliter))
    for cas, (formula, density) in data[["formula", "density"]].iterrows()))

data["gaff_corrected"] = data.gaff + data.polcorr
data["opls_corrected"] = data.opls + data.polcorr

figure()
plt.plot([0.01, 1], [0.01, 1], 'k')  # guide line
title("Inverse Static Dielectric (Virtual Chemistry; GAFF)")
xlabel("Predicted")
ylabel("Experiment")

x, y = data["gaff"], data["expt"]
ols_model = sm.OLS(y, x)
ols_results = ols_model.fit()
r2 = ols_results.rsquared

#plot(x, y, 'o', label="GAFF (R^2 = %.3f)" % r2)
plot(x**-1, y**-1, 'o', label="GAFF")
xlim((0.01, 1))
ylim((0.01, 1))
plt.gca().set_aspect('equal', adjustable='box')
plt.draw()
savefig("./manuscript/figures/dielectric_virtual_chemistry_gaff_nocorr.pdf", bbox_inches="tight")

x, y = data["gaff_corrected"], data["expt"]
ols_model = sm.OLS(y, x)
ols_results = ols_model.fit()
# 1st way of appending the intercept column, then reordering it to the front
#X1 = np.append(arr=X1, values=np.ones((rows_count, 1)).astype(float), axis=1)
#X1 = X1[:, [5, 0, 1, 2, 3, 4]]
# 2nd way of appending, in which the intercept value is automatically set to 1
# at index 0 (use either the 1st or the 2nd way, not both)
X1 = np.append(arr=np.ones((rows_count, 1)).astype(int), values=X1, axis=1)
"""
for i in X1:
    X2 = X1[:, [5]] / 2
"""

# 9-2) Now we are going to start backward elimination
import statsmodels.api as sm

# a variable holding the collection of independent variables / predictors;
# we will delete indices step by step to keep only the important predictors
X1_optimized = X1[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=Y1, exog=X1_optimized).fit()  # fit the full model with all predictors
regressor_OLS.summary()  # check the model summary and remove the predictor with the highest p-value

X1_optimized = X1[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=Y1, exog=X1_optimized).fit()
regressor_OLS.summary()

X1_optimized = X1[:, [3, 4, 5]]
    else:
        X_no[k, :] = X[i, :].reshape(1, colX)
        temp = temp - 1
        i = i + 1
        k = k + 1

# removing extra rows containing zeros
X_yes = X_yes[:-(rowX - j), :]
X_no = X_no[:-(rowX - k), :]

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_yes = sc_X.fit_transform(X_yes)
X_no = sc_X.transform(X_no)

# building the optimal model using backward elimination
import statsmodels.api as sm

# appending a column of ones (x0) to the X matrix
X = np.append(arr=np.ones((len(X_yes), 1)).astype(int), values=X_yes, axis=1)

#SL = 0.05
# select from the augmented X so the intercept column is actually used
X_opt = X[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
# note: the endog here is a constant vector of ones; the real target labels would normally go here
regressor_OLS = sm.OLS(endog=np.ones((j, 1)), exog=X_opt).fit()
regressor_OLS.summary()

# unfortunately the p-values are almost the same, which means a linear model doesn't fit this data;
# implement non-linear models such as Kernel SVM, Random Forest, or XGBoost
# (Kernel SVM or XGBoost are the suggested ones)
# opt_features = features[:, [0, 1, 2, 3, 4, 5]]                 # Step 1 - All-in
# ols_regressor = sm.OLS(endog=target, exog=opt_features).fit()  # Step 2 - Fit the model
# ols_regressor.summary()                                        # Step 3 - Consider the predictor with the highest p-value
#
# opt_features = features[:, [0, 1, 3, 4, 5]]                    # Step 4 - Remove the predictor
# ols_regressor = sm.OLS(endog=target, exog=opt_features).fit()  # Step 2
# ols_regressor.summary()                                        # Step 3
#
# opt_features = features[:, [0, 3, 4, 5]]                       # Step 4
# ols_regressor = sm.OLS(endog=target, exog=opt_features).fit()  # Step 2
# ols_regressor.summary()                                        # Step 3
#
# Way-1: Using threshold values ----------------------------------------------
opt_features = features[:, [0, 3, 5]]                            # Step 4
ols_regressor = sm.OLS(endog=target, exog=opt_features).fit()    # Step 2
ols_regressor.summary()                                          # Step 3

opt_features = features[:, [3]]                                  # Step 4
# Removing not only feature 5 but also feature 0, as it is the constant column we added.

# ***** Checking the results with opt_features *****
opt_training_features = training_features[:, [2]]
opt_testing_features = testing_features[:, [2]]
opt_regressor = LinearRegression()
opt_regressor.fit(opt_training_features, training_target)
opt_predicted_target = opt_regressor.predict(opt_testing_features)
opt_error = abs(testing_target - opt_predicted_target)

# ***** Visualising results (optimised) *****
# - Visualising the training set results
plt.subplot(121)
x_train, x_test = train_test_split(x, test_size=0.2, random_state=0)
y_train, y_test = train_test_split(y, test_size=0.2, random_state=0)

# Multiple linear regression
from sklearn.linear_model import LinearRegression
regresilinear = LinearRegression()
regresilinear.fit(x_train, y_train)

# Linear regression predictions (on the test set, not the training set)
y_pred = regresilinear.predict(x_test)

import statsmodels.api as sm
x = np.append(arr=np.ones((50, 1)), values=x, axis=1)
x_opt = x[:, [0, 1, 2, 3, 4, 5]]

# Stage 1 ------------------------------------------
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()

# Stage 2 ------------------------------------------
x_opt = x[:, [0, 1, 3, 4, 5]]
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()

# Stage 3 ------------------------------------------
x_opt = x[:, [0, 3, 4, 5]]
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()

# Stage 4 ------------------------------------------
x_opt = x[:, [0, 3, 5]]
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()

# Stage 5 ------------------------------------------
x_opt = x[:, [0, 3]]
hasil = sm.OLS(endog=y, exog=x_opt).fit()
hasil.summary()
                       columns=['kilo', 'yas', 'cinsiyet'])

veri = pd.concat([Ilkpart, Sonpart], axis=1)

x_train, x_test, y_train, y_test = train_test_split(veri, Boy, test_size=0.33, random_state=0)

regression.fit(x_train, y_train)
y_preg = regression.predict(x_test)

import statsmodels.api as sm

# prepend the intercept column and select predictors from the augmented X,
# so the column of ones actually enters the model (column 0 below)
X = np.append(arr=np.ones((22, 1)).astype(int), values=veri, axis=1)

X_l = X[:, [0, 1, 2, 3, 4, 5, 6]]
r_ols = sm.OLS(endog=Boy, exog=X_l)
r = r_ols.fit()
print(r.summary())

X_l = X[:, [0, 1, 2, 3, 4, 6]]
r_ols = sm.OLS(endog=Boy, exog=X_l)
r = r_ols.fit()
print(r.summary())

X_l = X[:, [0, 1, 2, 3, 4]]
r_ols = sm.OLS(endog=Boy, exog=X_l)
r = r_ols.fit()
print(r.summary())
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# trying to predict the results with the testing dataset, to compare against the actual values
y_pred = regressor.predict(X_test)

# Building the optimal model (for performance reasons, keep only the significant variables)
import statsmodels.api as sm
X = X[:, 1:-1]

# appending a column of ones to the independent variables, as the coefficient of x0
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)

X_optimal_features = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_optimal_features).fit()
regressor_OLS.summary()
'''
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.013e+04   6884.820      7.281      0.000    3.62e+04     6.4e+04
x1           198.7888   3371.007      0.059      0.953   -6595.030    6992.607
x2           -41.8870   3256.039     -0.013      0.990   -6604.003    6520.229
x3             0.8060      0.046     17.369      0.000       0.712       0.900
x4            -0.0270      0.052     -0.517      0.608      -0.132       0.078
x5             0.0270      0.017      1.574      0.123      -0.008       0.062
==============================================================================
'''
# since x2 has the highest p-value (> 0.05), we remove it
X_optimal_features = X[:, [0, 1, 3, 4, 5]]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(s3.iloc[:, 1:2], s3.iloc[:, -1:], test_size=0.33, random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
print(y_pred)

import statsmodels.api as sm

# prepend the intercept column and select the single predictor from the augmented X
# (s3 column 1 sits at column 2 of X, after the ones column)
X = np.append(arr=np.ones((14, 1)).astype(int), values=s3.iloc[:, :-1], axis=1)
X_l = X[:, [0, 2]]
r_ols = sm.OLS(endog=s3.iloc[:, -1:], exog=X_l)
r = r_ols.fit()
print(r.summary())

'''X_l = s3.iloc[:, [0, 1, 2, 3, 5]].values
r_ols = sm.OLS(endog=boy, exog=X_l)
r = r_ols.fit()
print(r.summary())
'''

y_pred = regressor.predict(x_test)