示例#1
0
    def train_var(self, name_y):
        """Pick the best lag order for ``name_y`` and fit a linear model with it.

        Every order in ``range(self.min_order, self.max_order)`` (upper bound
        excluded) is tried: the lagged design matrix is built with
        ``self.ts_order``, a linear model is fitted, and the in-sample
        prediction is scored with ``self.get_score[self.score]``. The order
        with the smallest score is kept.

        Side effects: stores ``best_order`` and ``score`` in ``self.info`` and
        the design matrices of the best order in ``self.y`` / ``self.X``.

        :param name_y: name of the target column when ``self.mts`` is a
            DataFrame; not used when ``self.mts`` is a Series.
        :return: the model fitted on the best order.
        :raises TypeError: if ``self.mts`` is neither a DataFrame nor a Series.
        :raises ValueError: if the order range is empty, or if ``name_y`` is
            not a column of the DataFrame.
        """
        if isinstance(self.mts, pd.DataFrame):
            names_x = list(self.mts.columns.values)
            names_x.remove(name_y)
            temp_x = self.mts[names_x]
            temp_y = self.mts[name_y]
        elif isinstance(self.mts, pd.Series):
            temp_x = None
            temp_y = self.mts
        else:
            # Raise instead of print + exit(0): exiting with status 0 would
            # report success to the calling process on an error path.
            raise TypeError(
                "self.mts must be a pandas DataFrame or Series, got {}".format(
                    type(self.mts).__name__))

        order_list = list(range(self.min_order, self.max_order))
        if not order_list:
            raise ValueError(
                "empty order range: min_order={} must be smaller than "
                "max_order={}".format(self.min_order, self.max_order))

        score_fn = self.get_score[self.score]
        scores = {}
        for order in order_list:
            y, X = self.ts_order(temp_y, temp_x, order=order)
            model_fit = Lr().fit(X.values, y.values)
            pred = model_fit.predict(X.values)
            scores[order] = score_fn(y.values, pred, X.shape[1])

        # The smallest score wins (ties resolve to the lowest order because
        # orders are inserted in ascending sequence).
        best_order = min(scores, key=scores.get)
        self.info["best_order"] = best_order
        self.info["score"] = scores[best_order]

        # Rebuild the matrices for the winning order and keep them on self.
        self.y, self.X = self.ts_order(temp_y, temp_x, order=best_order)
        return Lr().fit(self.X.values, self.y.values)
示例#2
0
def _calculate_rss(X_series: pd.DataFrame, y_series: pd.Series):
    """
    Fit a linear regression of y_series on X_series and return the residual sum of squares.

    Both arguments are type-checked first. The model is then fitted, its in-sample predictions
    are stored next to the observed values, and the residuals and squared residuals are derived
    from those two columns.

    :param: X_series: the explanatory variable(s). (pd.DataFrame)
    :param: y_series: the dependent variable. (pd.Series)
    :return: summary_result: table with the columns 'y_hat', 'y_actual', 'residuals' and
        'residuals_sq'. (pd.DataFrame)
    :return: rss: the sum of the squared residuals. (float)
    :raises TypeError: if X_series is not a DataFrame or y_series is not a Series.
    """
    if not isinstance(X_series, pd.DataFrame):
        raise TypeError(
            "The 'X_series' argument should be a Pandas DataFrame.")
    if not isinstance(y_series, pd.Series):
        raise TypeError("The 'y_series' argument must be a Pandas Series.")

    fitted = Lr().fit(X_series, y_series)
    predictions = list(fitted.predict(X_series))

    # Predictions and observations share a fresh positional index.
    summary_result = pd.DataFrame({
        'y_hat': predictions,
        'y_actual': y_series.values,
    })
    errors = summary_result['y_actual'] - summary_result['y_hat']
    summary_result['residuals'] = errors
    summary_result['residuals_sq'] = errors**2

    return summary_result, float(summary_result['residuals_sq'].sum())
示例#3
0
# Compare a straight-line fit and a degree-2 polynomial fit of the price
# ("Fiyat") against the day ("Gun") from the CSV file.
veri = pd.read_csv("2016dolaralis.csv")
print(veri)

# Column vectors of shape (n_samples, 1). The -1 lets NumPy infer the row
# count, so the script is no longer tied to a CSV with exactly 251 rows.
x = np.array(veri["Gun"]).reshape(-1, 1)
y = np.array(veri["Fiyat"]).reshape(-1, 1)

plt.scatter(x, y)

# Linear regression ----------------
tahmin_lineer = Lr()
tahmin_lineer.fit(x, y)  # fit y (price) against x (day)

# Predict y for every x (e.g. what the price is on day 7) and draw the line.
plt.plot(x, tahmin_lineer.predict(x), color="red")

# Polynomial regression -----------------------
tahmin_polinom = Pr(degree=2)  # second-degree function
# Expanded feature matrix built from x, used as input of the polynomial fit.
xYeni = tahmin_polinom.fit_transform(x)

polinom_model = Lr()
polinom_model.fit(xYeni, y)
# Step 2: split the data
#
# One third of the samples is held out as the test set; random_state pins
# the shuffle so the split is reproducible.
# NOTE(review): X (and y) are expected to come from an earlier step that is
# not part of this excerpt - confirm.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 3, random_state=0)

# Step 3: train the model
#
# Plain linear regression, the simplest possible baseline.
from sklearn.linear_model import LinearRegression as Lr
regressor = Lr()
regressor.fit(X_train, y_train)

# Step 4: predict on the held-out data
y_pred = regressor.predict(X_test)

# Step 5: visualise the training set
#
# Red dots are the observations, the blue line is the fitted model
# evaluated on the same training inputs.
train_fit = regressor.predict(X_train)
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, train_fit, color='blue')
plt.title('Salary Vs Experience(Training Set)')
plt.xlabel('Experience')
plt.ylabel('Salary')
plt.show()
示例#5
0
from sklearn.metrics import r2_score

# Load the data set and keep only feature column 2 as an (n_samples, 1) array
x_diabetes, y_diabetes = ds.load_diabetes(return_X_y=True)
x_diabetes = x_diabetes[:, np.newaxis, 2]

# The last 88 samples are held out for testing (the original author labels
# this an 80 % / 20 % split); everything before them is used for training.
split = -88
x_train, y_train = x_diabetes[:split], y_diabetes[:split]
x_test, y_test = x_diabetes[split:], y_diabetes[split:]

# Set up the linear regression and train it
model = Lr()
model.fit(x_train, y_train)

# Predict y for the test data
y_pred_test = model.predict(x_test)

# Predict y for the training data
y_pred_train = model.predict(x_train)

# Plot: test data as circles, training data as squares, regression line in blue
plt.plot(x_test, y_test, ls="none", marker="o")
plt.plot(x_train, y_train, ls="none", marker="s")
plt.plot(x_test, y_pred_test, 'b-')

# Report the mean squared error on the test data
print("MSE (Test): ", mse(y_test, y_pred_test))
示例#6
0
def q1():
    """
    Explore "house.csv" and print the result of each step, separated by a ruler line.

    Steps, in order: column dtypes, first five rows, first five rows after dropping the
    "Unnamed: 0" and "id" columns, value counts of 'floors', a waterfront/price box plot,
    a sqft_above/price regression plot, and the R^2 of two linear price models.

    :return: None
    """
    divider = ("\n" +
               " ==============================================================================="
               + "\n")

    pd.set_option('display.max_columns', None)
    houses = pd.read_csv("house.csv", delimiter=",")

    # Data type of each column
    print(houses.dtypes)
    print(divider)

    # Top 5 rows
    print(houses.head())
    print(divider)

    # Same preview without the "Unnamed: 0" and "id" columns
    houses = houses.drop(axis=1, columns=["Unnamed: 0", "id"])
    print(houses.head())
    print(divider)

    # How often each distinct 'floors' value occurs
    print(houses['floors'].value_counts().to_frame())
    print(divider)

    # Box plot used to compare price outliers of houses with and without a
    # waterfront view (the figure is drawn but not shown here).
    waterfront_prices = houses[['waterfront', 'price']]
    sns.boxplot(x=houses['waterfront'],
                y=houses['price'],
                data=waterfront_prices)
    print(divider)

    # Scatter plot of sqft_above (x) against price (y) with a fitted line;
    # the original author reads it as a positive correlation.
    sns.regplot(x=houses['sqft_above'], y=houses['price'], data=houses)
    plt.ylim(0, )
    print(divider)

    def fit_and_score(columns):
        # Fit price on the given feature columns and return the model's R^2
        # on that same data.
        regression = Lr()
        inputs = houses[columns]
        target = houses['price']
        regression.fit(inputs, target)
        return regression.score(inputs, target)

    # Price from 'sqft_living' alone (original author's note: explains
    # roughly 50 % of the variation around the mean price).
    print(fit_and_score(['sqft_living']))
    print(divider)

    # Price from four features
    print(fit_and_score(['floors', 'waterfront', 'lat', 'sqft_living']))
示例#7
0
    def train(self, data):
        """ Linear Ready-Made model: Auto-regressive model with exogenous input u(t)

        The 'u' and 'y' columns are mean-centred, split 75 % / 25 % into a
        training and a test part, optionally differenced (``self.n_diff``)
        and scaled (``self.sc``), and extended with ``self.p`` lagged copies
        of each column. A linear regression without intercept is then fitted
        on the lagged columns to predict 'y'.

        Args:
            data: Training set for ARX model, a DataFrame with the columns
                't', 'u' and 'y'. The caller's frame is not modified.

        Returns:
            None. Results are stored on the instance: ``parameters``
            (dict with 'weights' and 'bias'), ``train_rmse``, ``test_rmse``
            and the column means ``avagy`` / ``avagu``.

        Raises:
            ValueError: If ``self.p`` (number of lags) is not set.
        """
        if self.p is None:
            raise ValueError("You need to set the number of regress lags.")

        # set_index returns a new frame indexed by 't' (column removed), so
        # the DataFrame passed in by the caller keeps its own index.
        data = data.set_index('t')

        # subtract mean
        self.avagy = np.mean(data['y'])
        self.avagu = np.mean(data['u'])

        data['u'] = (data['u'] - self.avagu)
        data['y'] = (data['y'] - self.avagy)

        dependent_vars = data[['u', 'y']]  # Slice/extract dataframe
        var_keys = list(dependent_vars.columns)

        # Divide dependent vars to train and test data set (first 75 % of the
        # rows for training). Explicit copies keep later column assignments
        # from writing through to the source frame.
        ratio = int(data.index.shape[0] / 2 * 1.5)
        train_data = dependent_vars.iloc[:ratio, :].copy()
        test_data = dependent_vars.iloc[ratio:, :].copy()

        if self.n_diff is not None:
            # Firstly, difference both data sets to get stationary data. The
            # test set needs the same treatment as the training set, otherwise
            # the test RMSE is computed on a different kind of signal.
            for frame in (train_data, test_data):
                frame['u'] = frame['u'].diff(self.n_diff)
                frame['y'] = frame['y'].diff(self.n_diff)

        if self.sc is not None:
            # Fit the scaler on the training data only and reuse it unchanged
            # for the test data.
            # NOTE(review): assumes self.sc offers transform() next to
            # fit_transform() (scikit-learn scaler interface) - confirm.
            train_data[['u',
                        'y']] = self.sc.fit_transform(train_data[['u', 'y']])
            test_data[['u', 'y']] = self.sc.transform(test_data[['u', 'y']])

        # Secondly, lag data by p order for each model parameter.
        # Lagged cols are appended from the last model parameter col.
        for key in var_keys:
            for lag in range(1, self.p + 1):  # No. shifts
                column = '{}: Lag {}'.format(key, lag)
                train_data[column] = train_data[key].shift(lag)
                test_data[column] = test_data[key].shift(lag)

        # Remove the NaN rows introduced by differencing and shifting
        train_data = train_data.dropna()
        test_data = test_data.dropna()

        # TRAIN DATA
        x_train = train_data.iloc[:, self.
                                  input_dim:].values  # take only lagged values!
        y_train_target = train_data['y'].values

        # TEST DATA
        x_test = test_data.iloc[:, self.input_dim:].values
        y_test_target = test_data['y'].values

        # Optimize linear regression parameters. The former normalize=False
        # argument is omitted: it matched the default and the parameter no
        # longer exists in scikit-learn >= 1.2.
        lr = Lr(fit_intercept=False)
        lr.fit(x_train, y_train_target)  # no intercept
        self.parameters = {'weights': lr.coef_, 'bias': lr.intercept_}

        # Predict and save into the table for root mean squared error
        y_train_predict = x_train.dot(
            self.parameters['weights']) + self.parameters['bias']
        y_test_predict = x_test.dot(
            self.parameters['weights']) + self.parameters['bias']

        self.train_rmse = np.sqrt(mse(y_train_target, y_train_predict))
        self.test_rmse = np.sqrt(mse(y_test_target, y_test_predict))
 def train(self, num_iter=100):
     """Fit the classifier on ``self.features`` / ``self.output`` and keep it in ``self.model``.

     The estimator is configured for the multinomial formulation with the
     "lbfgs" solver and a fixed random state of 200.

     :param num_iter: upper bound on solver iterations, passed as ``max_iter``.
     """
     # NOTE(review): Lr is presumably scikit-learn's LogisticRegression here
     # (multi_class / solver arguments) - confirm against the file's imports.
     classifier = Lr(
         multi_class="multinomial",
         solver="lbfgs",
         max_iter=num_iter,
         random_state=200,
     )
     self.model = classifier.fit(self.features, self.output)