def train_var(self, name_y):
    """Fit an autoregressive model for `name_y`, selecting the lag order
    that minimises the configured score."""
    order_list = list(range(self.min_order, self.max_order))
    if isinstance(self.mts, pd.DataFrame):
        # Multivariate case: every other column becomes an exogenous input.
        names_x = list(self.mts.columns.values)
        names_x.remove(name_y)
        temp_x = self.mts[names_x]
        temp_y = self.mts[name_y]
    elif isinstance(self.mts, pd.Series):
        # Univariate case: plain autoregression on the series itself.
        temp_x = None
        temp_y = self.mts
    else:
        raise TypeError("self.mts must be a pandas DataFrame or Series.")

    # Score every candidate lag order with an in-sample fit.
    scores = dict()
    for order in order_list:
        y, X = self.ts_order(temp_y, temp_x, order=order)
        model_fit = Lr().fit(X.values, y.values)
        pred = model_fit.predict(X.values)
        scores[order] = self.get_score[self.score](y.values, pred, X.shape[1])

    # Keep the order with the lowest score and refit on it.
    best_order = min(scores, key=scores.get)
    self.info["best_order"] = best_order
    self.info["score"] = scores[best_order]
    self.y, self.X = self.ts_order(temp_y, temp_x, order=best_order)
    return Lr().fit(self.X.values, self.y.values)
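# The dict that `self.get_score[self.score]` indexes into is not shown above.
# As a hedged sketch (an assumption, not the author's code), it could map a
# criterion name to a callable (y_true, y_pred, n_params) -> score, where a
# lower value is better, e.g. AIC for a least-squares fit:
import numpy as np

def _aic(y_true, y_pred, n_params):
    # AIC for OLS: n * ln(RSS / n) + 2 * k  (hypothetical helper).
    n = len(y_true)
    rss = float(np.sum((np.asarray(y_true) - np.asarray(y_pred)) ** 2))
    return n * np.log(rss / n) + 2 * n_params

get_score = {"aic": _aic}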
def _calculate_rss(X_series: pd.DataFrame, y_series: pd.Series):
    """
    Return the sum of squared residuals (RSS).

    The function first checks that the input arguments are of the correct
    type, then fits a linear regression model on X_series and y_series.
    The predicted values are placed into the 'y_hat' column, after which
    the residuals are calculated. Finally, the sum of squared residuals
    (rss) is computed.

    :param X_series: the series or set of series denoting the X variable. (pd.DataFrame)
    :param y_series: the series denoting the y variable. (pd.Series)
    :return summary_result: a Pandas DataFrame summarising the result. (pd.DataFrame)
    :return rss: the sum of squared residuals. (float)
    """
    if not isinstance(X_series, pd.DataFrame):
        raise TypeError("The 'X_series' argument should be a Pandas DataFrame.")
    if not isinstance(y_series, pd.Series):
        raise TypeError("The 'y_series' argument must be a Pandas Series.")

    model = Lr().fit(X_series, y_series)
    summary_result = pd.DataFrame()
    summary_result['y_hat'] = list(model.predict(X_series))
    summary_result['y_actual'] = y_series.values
    summary_result['residuals'] = summary_result['y_actual'] - summary_result['y_hat']
    summary_result['residuals_sq'] = summary_result['residuals'] ** 2
    rss = float(summary_result['residuals_sq'].sum())
    return summary_result, rss
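# A minimal usage sketch for _calculate_rss (the toy data below is
# illustrative, and Lr is assumed to be sklearn's LinearRegression):
import pandas as pd
from sklearn.linear_model import LinearRegression as Lr

X_demo = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
y_demo = pd.Series([2.1, 3.9, 6.2, 7.8])
summary, rss = _calculate_rss(X_demo, y_demo)
print(rss)  # sum of squared residuals of the fitted line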
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as Lr
from sklearn.preprocessing import PolynomialFeatures as Pr

veri = pd.read_csv("2016dolaralis.csv")
print(veri)
x = veri["Gun"]
y = veri["Fiyat"]
x = np.array(x).reshape(-1, 1)  # 251 rows, one feature column
y = np.array(y).reshape(-1, 1)
plt.scatter(x, y)

# Linear regression ----------------
tahmin_lineer = Lr()
tahmin_lineer.fit(x, y)  # fit the model to the x and y data
tahmin_lineer.predict(x)  # predict from x (day), e.g. what the price is on day 7
# we predict y from the x axis
plt.plot(x, tahmin_lineer.predict(x), color="red")

# Polynomial regression -----------------------
tahmin_polinom = Pr(degree=2)  # use a degree-2 polynomial
xYeni = tahmin_polinom.fit_transform(x)  # new feature matrix built from x, used for prediction
polinom_model = Lr()
polinom_model.fit(xYeni, y)
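# A minimal sketch completing the polynomial fit above (an assumption about
# how the script continues): plot the degree-2 predictions in a new colour.
plt.plot(x, polinom_model.predict(xYeni), color="green")
plt.show()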
# Step 2: Splitting the data
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=1 / 3,
                                                    random_state=0)

# Step 3: Training the model
# Fitting the model to the dataset as per the requirements
# The simplest baseline model
from sklearn.linear_model import LinearRegression as Lr

regressor = Lr()
regressor.fit(X_train, y_train)

# Step 4: Predicting the data
y_pred = regressor.predict(X_test)

# Step 5: Visualizing the dataset
# Training set
import matplotlib.pyplot as plt

plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary Vs Experience (Training Set)')
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.show()
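# A small follow-up sketch (not in the original script): visualize the test
# set against the line fitted on the training data.
plt.scatter(X_test, y_test, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary Vs Experience (Test Set)')
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.show()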
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.linear_model import LinearRegression as Lr
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

# Read in and format the dataset
x_diabetes, y_diabetes = ds.load_diabetes(return_X_y=True)
x_diabetes = x_diabetes[:, np.newaxis, 2]

# Training data (80%)
x_train = x_diabetes[:-88]
y_train = y_diabetes[:-88]

# Test data (20%)
x_test = x_diabetes[-88:]
y_test = y_diabetes[-88:]

# Train the model
model = Lr()  # set up the linear regression
model.fit(x_train, y_train)  # train

# Predict y (on the test data)
y_pred_test = model.predict(x_test)
# Predict y (on the training data)
y_pred_train = model.predict(x_train)

# Create the plot
plt.plot(x_test, y_test, ls="none", marker="o")  # test data (circles, 20%)
plt.plot(x_train, y_train, ls="none", marker="s")  # training data (squares, 80%)
plt.plot(x_test, y_pred_test, 'b-')  # regression line

# Compute the error
print("MSE (Test): ", mse(y_test, y_pred_test))  # mean squared error (test data)
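# A small follow-up sketch (an assumption: the original presumably went on to
# report these, since r2_score is imported above but otherwise unused):
print("MSE (Train):", mse(y_train, y_pred_train))
print("R^2 (Test): ", r2_score(y_test, y_pred_test))
plt.show()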
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as Lr

SEPARATOR = "\n ===============================================================================\n"


def q1():
    """Explore the house dataset and fit two baseline price models."""
    pd.set_option('display.max_columns', None)
    df = pd.read_csv("house.csv", delimiter=",")

    # data type of each column
    print(df.dtypes)
    print(SEPARATOR)

    # top 5 rows
    print(df.head())
    print(SEPARATOR)

    # drop the Unnamed: 0 and id columns from the data frame
    df = df.drop(axis=1, columns=["Unnamed: 0", "id"])
    print(df.head())
    print(SEPARATOR)

    # count of unique values in the floors column
    floor_count = df['floors'].value_counts().to_frame()
    print(floor_count)
    print(SEPARATOR)

    # box plot used to judge whether houses with or without a waterfront
    # view have more price outliers
    df1 = df[['waterfront', 'price']]
    sns.boxplot(x='waterfront', y='price', data=df1)
    # plt.show()
    print(SEPARATOR)

    # scatter plot with sqft_above on the x axis and price on the y axis,
    # with a line of best fit; sqft_above is positively correlated with price
    sns.regplot(x='sqft_above', y='price', data=df)
    plt.ylim(0, )
    # plt.show()
    print(SEPARATOR)

    # predict the price using the feature 'sqft_living', then compute R^2;
    # the model explains roughly 50% of the variation around the mean price
    lm = Lr()
    x = df[['sqft_living']]
    y = df['price']
    lm.fit(x, y)
    r_squared = lm.score(x, y)
    print(r_squared)
    print(SEPARATOR)

    # linear model predicting price from four variables
    lm = Lr()
    x = df[['floors', 'waterfront', 'lat', 'sqft_living']]
    y = df['price']
    lm.fit(x, y)
    r_squared = lm.score(x, y)
    print(r_squared)
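# A minimal usage sketch that could be appended at the end of q1() for the
# four-feature model fitted above (the row values are hypothetical, not
# taken from house.csv):
sample = pd.DataFrame([[2.0, 0, 47.5, 1800]],
                      columns=['floors', 'waterfront', 'lat', 'sqft_living'])
print(lm.predict(sample))  # predicted price for the hypothetical house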
def train(self, data):
    """
    Linear ready-made model: auto-regressive model with exogenous input u(t).

    Args:
        data: Training set for the ARX model.

    Side effects:
        Stores the weights and bias of the identified linear model in
        self.parameters, and the train/test RMSE in self.train_rmse and
        self.test_rmse.
    """
    if self.p is None:
        raise ValueError("You need to set the number of regression lags.")
    data.index = data['t']
    data = data.drop(['t'], axis=1)

    # Subtract the mean from both signals.
    self.avagy = np.mean(data['y'])
    self.avagu = np.mean(data['u'])
    data['u'] = data['u'] - self.avagu
    data['y'] = data['y'] - self.avagy

    dependent_vars = data[['u', 'y']]  # slice/extract dataframe
    var_keys = list(dependent_vars.columns)

    # Divide the dependent vars into train and test sets (75% / 25%).
    ratio = int(data.index.shape[0] / 2 * 1.5)
    train_data = pd.DataFrame(dependent_vars.iloc[:ratio, :])
    test_data = pd.DataFrame(dependent_vars.iloc[ratio:, :])

    if self.n_diff is not None:
        # First, difference the data to make it stationary.
        train_data['u'] = train_data['u'].diff(self.n_diff).values
        train_data['y'] = train_data['y'].diff(self.n_diff).values
    if self.sc is not None:
        train_data[['u', 'y']] = self.sc.fit_transform(train_data[['u', 'y']])

    # Second, lag the data by p orders for each model parameter; lagged
    # columns are appended after the last model-parameter column.
    for key in var_keys:
        for i in range(1, self.p + 1):  # number of shifts
            train_data['{}: Lag {}'.format(key, i)] = train_data[key].shift(i)
            test_data['{}: Lag {}'.format(key, i)] = test_data[key].shift(i)

    # Remove the NaN rows created by shifting.
    train_data = train_data.dropna()
    test_data = test_data.dropna()

    # Train data: take only the lagged columns as regressors.
    x_train = train_data.iloc[:, self.input_dim:].values
    y_train_target = train_data['y'].values
    # Test data
    x_test = test_data.iloc[:, self.input_dim:].values
    y_test_target = test_data['y'].values

    # Optimize the linear-regression parameters with no intercept (the
    # deprecated `normalize=False` kwarg is dropped; it was the default,
    # so behaviour is unchanged).
    lr = Lr(fit_intercept=False)
    lr.fit(x_train, y_train_target)
    self.parameters = {'weights': lr.coef_, 'bias': lr.intercept_}

    # Predict and record the root-mean-squared error.
    y_train_predict = x_train.dot(self.parameters['weights']) + self.parameters['bias']
    y_test_predict = x_test.dot(self.parameters['weights']) + self.parameters['bias']
    self.train_rmse = np.sqrt(mse(y_train_target, y_train_predict))
    self.test_rmse = np.sqrt(mse(y_test_target, y_test_predict))
def train(self, num_iter=100):
    # Multinomial logistic regression solved with L-BFGS (Lr here is
    # sklearn's LogisticRegression).
    self.model = Lr(multi_class="multinomial",
                    solver="lbfgs",
                    max_iter=num_iter,
                    random_state=200).fit(self.features, self.output)
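# A self-contained sketch of the same call (assuming Lr is sklearn's
# LogisticRegression; the iris data here is purely illustrative):
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression as Lr

features, output = load_iris(return_X_y=True)
model = Lr(multi_class="multinomial", solver="lbfgs",
           max_iter=100, random_state=200).fit(features, output)
print(model.score(features, output))  # training accuracy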