示例#1
0
def run():
    """Fit one OLS model per selected outcome and export the results to Excel.

    Reads the module-level frames ``Y``, ``X`` and ``data``, the Tk listboxes
    ``listboxY`` / ``listboxX`` and the output path ``saveFile``.

    Rows of ``data`` without any missing value are used for fitting; rows with
    at least one missing value are the ones predictions are produced for.

    Side effects:
        * sets the globals ``trainY``, ``trainX``, ``result1`` and ``result2``
          (``result1`` / ``result2`` hold the values of the last outcome);
        * writes a ``SUMMARY`` sheet plus one sheet per outcome to ``saveFile``;
        * shows message boxes for validation errors and on completion.
    """
    global trainY, trainX, result1, result2

    # Map listbox positions to column labels (set lookup instead of the
    # quadratic ``columns.tolist().index(x)`` scan).
    chosenY = set(listboxY.curselection())
    chosenX = set(listboxX.curselection())
    varsY = [col for pos, col in enumerate(Y.columns) if pos in chosenY]
    varsX = [col for pos, col in enumerate(X.columns) if pos in chosenX]

    # Validate before doing any work.
    if not varsY:
        messagebox.showinfo('提示', '至少选中一个结果变量!')
        return
    if not varsX:
        messagebox.showinfo('提示', '至少选中一个预测变量!')
        return

    # True for every row of ``data`` that has at least one missing value.
    has_missing = data.isnull().any(axis=1)

    trainY = Y[~has_missing]
    trainX = add_constant(X[~has_missing][varsX])
    testX = X[has_missing][varsX]
    if trainX.shape[1] > testX.shape[1]:
        # Mirror the training design. The default has_constant='skip' would
        # silently omit the intercept when the prediction rows happen to
        # contain a constant column (e.g. a single row).
        testX = add_constant(testX, has_constant='add')

    result0 = DataFrame(columns=varsY)

    # The context manager saves and closes the workbook on exit; no explicit
    # save()/close() is needed (and save() no longer exists in pandas 2).
    with ExcelWriter(saveFile, engine="openpyxl") as writer:
        for varY in varsY:
            # Select the response by label: the position of varY in the
            # selection is not its column position in Y.
            fit = OLS(trainY[varY], trainX).fit()
            result2 = fit.summary2().tables
            print(result2)

            # Point predictions for all outcomes, rewritten after each fit.
            result0[varY] = fit.predict(testX)
            result0.to_excel(writer,
                             sheet_name="SUMMARY",
                             header=True,
                             index=True)

            # Per-outcome sheet: prediction summary frame first ...
            result1 = fit.get_prediction(testX).summary_frame()
            result1.to_excel(writer, sheet_name=varY, header=True, index=True)

            # ... then the first summary table, split into two column pairs
            # placed side by side below the predictions ...
            below_predictions = result1.shape[0] + 2
            result2[0].iloc[:, [0, 1]].to_excel(writer,
                                                sheet_name=varY,
                                                header=False,
                                                index=False,
                                                startrow=below_predictions,
                                                startcol=0)
            result2[0].iloc[:, [2, 3]].to_excel(writer,
                                                sheet_name=varY,
                                                header=False,
                                                index=False,
                                                startrow=below_predictions,
                                                startcol=5)
            # ... and the second summary table underneath.
            result2[1].to_excel(writer,
                                sheet_name=varY,
                                header=True,
                                index=True,
                                startrow=result1.shape[0] +
                                result2[0].shape[0] + 3)

    messagebox.showinfo('提示', '执行完成!')
class PostLasso:
    """Post-Lasso estimator: Lasso for variable selection, then OLS.

    After ``fit``:
        lasso_model: the fitted ``Lasso`` instance.
        coefs: Lasso intercept followed by the Lasso coefficients.
        subset_cols: indices (into the constant-augmented design matrix,
            where 0 is the intercept and feature ``j`` of ``X`` is ``j + 1``)
            of the columns kept for the OLS stage.
        relevant_x: the constant-augmented training matrix restricted to
            ``subset_cols``.
        ols_model: the fitted OLS results on ``relevant_x``.
    """

    def __init__(self, formula=None):
        self.lasso_model = Lasso()
        self.ols_model = None
        self.relevant_x = None
        self.subset_cols = None
        self.coefs = None
        self.formula = formula  # only used by __repr__ in this class

    def __repr__(self):
        return f'PostLasso({self.formula})'

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, X, y, force_include_idx=None):
        """Estimate a model using Post-Lasso.

        X: 2-D array of regressors (without intercept).
        y: response vector.
        force_include_idx: indices that are ALWAYS included in the OLS
            model, regardless of their status in the Lasso stage. They are
            merged with ``subset_cols`` and therefore refer to the
            constant-augmented matrix (0 = intercept, feature j -> j + 1).

        Returns ``self``.
        """
        self.lasso_model = self.lasso_model.fit(X, y)
        # Put the intercept in position 0 so that the indices line up with
        # the columns of the constant-augmented design matrix.
        self.coefs = np.insert(self.lasso_model.coef_, 0,
                               self.lasso_model.intercept_)
        # Keep the columns whose Lasso coefficient is non-zero.
        self.subset_cols = np.where(self.coefs != 0)[0]
        if force_include_idx is not None:
            self.subset_cols = np.union1d(self.subset_cols, force_include_idx)
        # has_constant='add': always prepend the intercept column. With the
        # default 'skip' a constant-valued feature in X suppresses it and
        # every index in subset_cols would be off by one.
        design = add_constant(X, has_constant='add')
        self.relevant_x = design[:, self.subset_cols]
        self.ols_model = OLS(y, self.relevant_x).fit()
        return self

    def predict(self, X=None):
        """Predict using a fitted post-lasso model.

        X: ``None`` to predict on the training design; a matrix with the
            same shape as ``relevant_x`` (used as-is); a raw feature matrix
            with as many columns as the one passed to ``fit`` (the intercept
            column is added before selecting ``subset_cols``); or an already
            constant-augmented matrix (``subset_cols`` is selected directly).
        """
        if X is None:
            return self.ols_model.predict(self.relevant_x)
        # NOTE: shape-based heuristic kept for backward compatibility; a raw
        # matrix that happens to have exactly this shape is treated as an
        # already-selected design matrix.
        if X.shape == self.relevant_x.shape:
            return self.ols_model.predict(X)
        n_features = len(self.coefs) - 1
        if X.shape[1] == n_features:
            # Raw features: subset_cols indexes the augmented matrix, so the
            # intercept column has to be added first.
            X = add_constant(X, has_constant='add')
        return self.ols_model.predict(X[:, self.subset_cols])
示例#3
0
 def fit(self, x, y):
     """Fit a quadratic OLS curve of ``y`` on ``x`` and evaluate it on ``AGES``.

     Stores the fitted values over ``AGES`` in ``self.m`` and the first
     element returned by ``wls_prediction_std`` for the same grid in
     ``self.s``. Returns ``self``.
     """
     train_design = PolynomialFeatures(2).fit_transform(
         array(x).reshape(-1, 1))
     fitted = OLS(y, train_design).fit()

     # Degree-2 polynomial expansion of the evaluation grid.
     grid_design = PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1))
     self.m = fitted.predict(grid_design)
     self.s = wls_prediction_std(fitted, grid_design)[0]
     return self
示例#4
0
 def run_regr(self):
     """Fit OLS on the training split and return R^2 on the test split.

     When ``self.pca_flag`` equals ``True`` both feature sets are first
     replaced by the output of ``self.pca``. The response is the ``Y_M_1``
     column. If predicting on the test features raises, the error is
     printed and ``None`` is returned.
     """
     if self.pca_flag == True:
         reduced = self.pca(self.train_x,
                            self.test_x,
                            n_components=self.n_components)
         self.train_x, self.test_x = reduced

     design = add_constant(self.train_x)
     fitted = OLS(self.train_y['Y_M_1'], design).fit()

     try:
         predictions = fitted.predict(add_constant(self.test_x))
     except Exception as err:
         print(err)
         return None
     else:
         return r2_score(self.test_y.Y_M_1, predictions)
示例#5
0
)
# Relabel the columns of mean_long as 1..12.
mean_long.columns = range(1, 13)



import patsy
from statsmodels.api import OLS

# Build the response and design matrices from a patsy formula: `Mean`
# regressed on bs(...) spline terms of Year and Month, then fit by OLS.
y, X = patsy.dmatrices("Mean ~ bs(Year, 5) + bs(Month, 5)", data=mean)
model = OLS(y, X).fit()
# The summary is built but not used here -- presumably displayed as a
# notebook cell output.
model.summary()




# In-sample fitted values (predict() with no arguments).
mean["Pred"] = model.predict()

# Renames ALL columns positionally.
# NOTE(review): assumes `mean` now has exactly these four columns in this
# order (Mean, Year, Month, Pred) -- confirm.
mean.columns = ['Mean', 'Year', 'Month', 'Fitted mean']

# Wide tables: one row per Month, one column per Year.
m_long = mean.pivot(index="Month", columns="Year", values="Mean")
# NOTE(review): the values here are the former row index (the "index" column
# created by reset_index()), not 'Fitted mean' -- confirm this is intended.
d_long = mean.reset_index().pivot(index="Month", columns="Year", values="index")


# Use 12 colours sampled from the coolwarm colormap as the default line
# colour cycle.
color = plt.cm.coolwarm(np.linspace(0.1, 0.9, 12))
mpl.rcParams['axes.prop_cycle'] = cycler.cycler('color', color)



# Two side-by-side axes sharing the y axis.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 3), sharey=True)

for i in range(12):
示例#6
0
 def obj_fun(x_t, y_t):
     """Return ``r2_score`` of an OLS fit of ``y_t`` on ``x_t``, in-sample."""
     fitted = OLS(y_t, x_t).fit()
     return r2_score(y_t, fitted.predict(x_t))
class WordCountEstimator:
    """
    Model to estimate the word counts of audio files from their syllable
    envelopes.

    Given a syllable envelope, the number of syllable nuclei is determined
    using a peak picking algorithm. Then a linear mapping of the nuclei count
    to the word count is made.
    Both the peak picking and linear mapping can be trained/adapted if the
    number of words per file is provided.

    Attributes
    ----------
    threshold : float
        Minimum value separating a maximum and its left neighbour for this
        maximum to be considered a peak.
    lin_reg : <statsmodels.regression.linear_model>
        Fitted OLS results mapping the nuclei count to the word count.
    alpha : float
        Recall of the SAD to readjust the word counts.
    additional_features : list
        List of features (str) to add to the estimated word count for the linear
        mapping training.

    Methods
    -------
    summary()
        Print a summary of the model.
    save_model(model_file)
        Save the model to a given file.
    load_model(model_file)
        Load the model from a given file.
    add_features(envelope)
        Compute the desired features from a syllable envelope.
    train(envelopes, target_word_counts, model_file, thresholds)
        Train the model given syllable envelopes and their respective target
        word counts. The resulting model is saved to model_file.
    predict(envelopes)
        Predicts the word counts for a given list of syllable envelopes.
    """
    def __init__(self, threshold=0.5, alpha=1, additional_features=None):
        # ``None`` stands for "no additional features"; a fresh list is
        # created per instance so instances never share a default list.
        if additional_features is None:
            additional_features = []

        self.threshold = threshold
        # Placeholder single-parameter fit until train()/load_model() is used.
        self.lin_reg = OLS([1], [1]).fit()
        self.alpha = alpha
        self.additional_features = additional_features

    def summary(self):
        """
        Print a summary of the model.
        """

        print("Summary of WCE model:")
        for attr, value in self.__dict__.items():
            if attr != "lin_reg":
                print(attr, value)
        print("lin_reg coefficients", self.__dict__["lin_reg"].params)

    def save_model(self, model_file):
        """
        Save the model to a given file.

        The instance ``__dict__`` is pickled. Exits the process via
        ``sys.exit`` if the file cannot be written.

        Parameters
        ----------
        model_file : str
            Path to the model's file.
        """

        try:
            with open(model_file, 'wb') as handle:
                pickle.dump(self.__dict__, handle)
        except Exception:
            sys.exit("Error with WCE model file.")

    def load_model(self, model_file):
        """
        Load the model from a given file.

        Every entry of the unpickled dict is set as an attribute on this
        instance. Exits the process via ``sys.exit`` if the file cannot be
        read.

        Parameters
        ----------
        model_file : str
            Path to the model's file.
        """
        # SECURITY NOTE: pickle.load can execute arbitrary code; only load
        # model files from a trusted source.
        try:
            with open(model_file, 'rb') as handle:
                model = pickle.load(handle)
        except Exception:
            sys.exit("Error with WCE model file.")

        for attr in model:
            setattr(self, attr, model[attr])

    def add_features(self, envelope):
        """
        Compute the desired features from a syllable envelope.

        Only the names present in ``self.additional_features`` are computed,
        always in the fixed order duration, total energy, mean energy,
        SD of energy (not in the order given in ``additional_features``).

        Parameters
        ----------
        envelope : ndarray
            1D array containing the values of the syllable envelope.

        Returns
        -------
        features : list
            List of the computed features.
        """

        extractors = (
            # NOTE(review): presumably 100 envelope samples per second, which
            # would make this a duration in seconds -- confirm.
            ('duration', lambda env: len(env) / 100),
            ('sonority_total_energy', np.sum),
            ('sonority_mean_energy', np.mean),
            ('sonority_SD_energy', np.std),
        )
        # TODO: Possibility to add more.

        return [
            compute(envelope) for name, compute in extractors
            if name in self.additional_features
        ]

    def train(self, envelopes, target_word_counts, model_file, thresholds=THR):
        """
        Train the model given syllable envelopes and their respective target
        word counts. The resulting model is then saved to model_file.

        Training works as follows:
            - estimate the number of syllable nuclei per envelope according to
            different thresholds and choose the threshold that produces the
            best correlation between the estimated number of nuclei and the
            target number of word counts.
            - using the estimated number of nuclei resulting from the optimal
            threshold, determine the coefficients of the linear mapping.

        Parameters
        ----------
        envelopes : ndarray
            2D, array of envelope per waveform.
        target_word_counts : list
            List of the word count per envelope.
        model_file: str
            Path of where to save the model file.
        thresholds : list
            List of the thresholds values to test for the model adaptation.
        """

        # NOTE: training always uses this fixed feature set and overwrites
        # whatever was passed to the constructor.
        self.additional_features = [
            "duration", "sonority_mean_energy", "sonority_SD_energy"
        ]

        n_envelopes = len(envelopes)
        n_thresholds = len(thresholds)

        # count syllable nuclei per file, for every candidate threshold
        estimated_nuclei_counts = np.zeros((n_envelopes, n_thresholds))
        for i, envelope in enumerate(envelopes):
            for j, candidate in enumerate(thresholds):
                estimated_nuclei_counts[i, j] = len(
                    peakdet(envelope, candidate)[0])

        # determine best threshold; a threshold yielding no nuclei at all
        # keeps a correlation of 0
        corvals = np.zeros(n_thresholds)
        for k in range(n_thresholds):
            if np.any(estimated_nuclei_counts[:, k]):
                corvals[k] = np.corrcoef(target_word_counts,
                                         estimated_nuclei_counts[:, k],
                                         rowvar=False)[0][1]

        try:
            opti_k = np.nanargmax(corvals)
        except ValueError:
            # every correlation is NaN: fall back to the first threshold
            opti_k = 0
        opti_threshold = thresholds[opti_k]
        nuclei_counts = estimated_nuclei_counts[:, opti_k]

        # create an array X from nuclei_counts and additional features
        X = np.zeros((n_envelopes, 1 + len(self.additional_features)))
        X[:, 0] = nuclei_counts
        for row, envelope in enumerate(envelopes):
            X[row, 1:] = self.add_features(envelope)
        X = add_constant(X, has_constant='add')

        # multiple linear regression on X and target_word_counts
        self.lin_reg = OLS(target_word_counts, X).fit()

        # readjust coefficients by dividing by alpha: the recall of the SAD
        self.lin_reg.params /= self.alpha

        self.threshold = opti_threshold
        self.save_model(model_file)

    def predict(self, envelopes):
        """
        Predicts the word counts for a given list of syllable envelopes.

        Parameters
        ----------
        envelopes : ndarray
            2D, array of envelope per file.

        Returns
        -------
        word_counts : ndarray
            Estimated word count per audio file/envelope, as returned by the
            OLS results' ``predict``.
        """

        n_envelopes = len(envelopes)

        # One row per envelope: nuclei count followed by the extra features.
        # Requires add_features() to return exactly
        # len(self.additional_features) values.
        X = np.zeros((n_envelopes, 1 + len(self.additional_features)))
        for row, envelope in enumerate(envelopes):
            X[row, 0] = len(peakdet(envelope, self.threshold)[0])
            X[row, 1:] = self.add_features(envelope)

        # The placeholder model from __init__ has a single parameter and no
        # intercept; a trained model was fitted with a constant column.
        if len(self.lin_reg.params) > 1:
            X = add_constant(X, has_constant='add')

        word_counts = self.lin_reg.predict(X)

        return word_counts
示例#8
0
文件: code.py 项目: asarantsev/IDY
# Second and third fitted coefficients (indices 1 and 2).
# NOTE(review): presumably index 0 is the intercept of the same regression
# -- confirm against where `coefficients` is built.
trendCoeff = coefficients[1]
heatCoeff = coefficients[2]
avgIDY = trendCoeff / abs(heatCoeff)
print('avgIDY = ', avgIDY)
avgHeat = (intercept - avgIDY) / abs(heatCoeff)
print('long-term average heat measure = ', avgHeat)

# Heat measure: cumulative IDY minus a linear trend with slope avgIDY.
# NOTE(review): `avgIDY * range(T)` only works when avgIDY is a NumPy scalar
# (NumPy converts the range to an array); a plain Python float would raise
# TypeError -- confirm the type of `coefficients`.
Heat = cumIDY - avgIDY * range(T)  # heat measure
plt.figure(figsize=(7, 6))
plt.plot(range(NEW, LAST), Heat)
print('current heat measure = ', Heat[-1])
plt.title('Heat measure')
plt.show()
# Pearson correlation between the heat measure (last value dropped) and
# total returns from offset W onwards.
print('Correlation of heat measure and total returns = ',
      stats.pearsonr(Heat[:-1], TR[W:])[0])
residuals = IDY - Regression.predict(DF)

# Analysis of regression residuals for white noise and normality
stderr = np.std(residuals)
print('stderr = ', stderr)
print('Shapiro-Wilk normality test for residuals', stats.shapiro(residuals))
print('Jarque-Bera normality test for residuals', stats.jarque_bera(residuals))
aresiduals = abs(residuals)
# QQ plot of the residuals against a standardized reference line.
qqplot(residuals, line='s')
plt.title('residuals')
plt.show()
# Autocorrelation of the raw residuals, then of their absolute values.
plot_acf(residuals)
plt.title('original values of residuals')
plt.show()
plot_acf(aresiduals)
plt.title('absolute values of residuals')
示例#9
0
# Feature engineering: selection
# (print(...) with a single argument behaves the same on Python 2 and 3.)
print(boston.corr(method='pearson'))

# Choose which attributes to use in the regression
X = boston['LSTAT']
y = boston['MEDV']

# Regression: MEDV on LSTAT plus an intercept
model = OLS(y, add_constant(X))
model = model.fit()
theta = model.params

print("Estimated parameters:\n%s" % (theta,))

# Prepare plots.
fig, ax = plt.subplots(figsize=(12, 8))

# Scatter plot.
ax.scatter(X, y, label='Dataset', color='Cyan')

# Show the fitted regression line over the observed LSTAT range.
x = np.linspace(X.min(), X.max(), len(X))
# Colour is given once: combining the 'r' format string with color='Green'
# defined it twice (the keyword won, so the line was already green).
ax.plot(x, model.predict(add_constant(x)), label='OLS', color='Green')

# Plot settings.
ax.set_xlabel('LSTAT')
ax.set_ylabel('MEDV')
ax.set_title("MEDV vs LSTAT")
ax.legend()
plt.show()
示例#10
0
    def regression(self, time_model=True, model_by="sk"):
        """Select features by cross-validated MSE, then fit and print models.

        Steps:
            1. Build the data with ``self.census_scatter`` and pick the
               predictors/response for the time-series or census variant.
            2. Rank the predictors by chi2 score; for every prefix of that
               ranking, compute the 5-fold out-of-sample MSE of a
               no-intercept linear model.
            3. Save a diagnostic figure to ``img_time_series.png`` or
               ``img_census.png``.
            4. Refit on the prefix with the lowest average MSE and print the
               result, then fit and print one extra fixed-specification OLS.

        Parameters:
            time_model: use the time-series variant (``Crimes`` on the dummy
                columns and ``Crimes_lag1month``) when true, otherwise the
                census variant (``Avg. Annual Crimes`` on the ``Pct`` columns
                and ``Population Density``).
            model_by: ``"sk"`` for scikit-learn ``LinearRegression`` or
                ``"sm"`` for statsmodels ``OLS``.

        Raises:
            ValueError: if ``model_by`` is neither ``"sk"`` nor ``"sm"``.
        """
        if model_by not in ("sk", "sm"):
            raise ValueError("model_by must be 'sk' or 'sm', got %r" %
                             (model_by,))

        mat = self.census_scatter(time_model=time_model)
        if time_model:
            X = mat[self.dummy_cols + ['Crimes_lag1month']]
            y = mat['Crimes']
            label = 'time_series'
        else:
            X = mat[[
                c for c in mat.columns
                if 'Pct' in c or c == 'Population Density'
            ]]
            y = mat['Avg. Annual Crimes']
            label = 'census'

        target = y.values.astype(int)
        n_splits = 5

        fig, ax = plt.subplots(2)
        kf = KFold(n_splits=n_splits)

        # The chi2 scores do not depend on k, so rank the columns once
        # (best score first) instead of refitting for every prefix length.
        scores = SelectKBest(chi2, k="all").fit(X.values, target).scores_
        ranking = np.argsort(scores)[::-1]

        best_cols = dict()
        # One row per prefix length, one column per fold.
        acc = np.zeros((len(X.columns), n_splits))
        for n_features in range(1, len(X.columns) + 1):
            cols = X.columns[ranking[:n_features]]
            best_cols[n_features] = cols
            for fold, (train, test) in enumerate(kf.split(np.arange(len(y)))):
                Xtrain = X[cols].values[train]
                ytrain = target[train]
                Xtest = X[cols].values[test]
                ytest = target[test]
                if model_by == "sk":
                    LR = LinearRegression(fit_intercept=False)
                    LR.fit(Xtrain, ytrain)
                    predicted = LR.predict(Xtest)
                else:
                    predicted = OLS(ytrain, Xtrain).fit().predict(Xtest)

                # Out-of-sample error for both back ends, so that the two
                # are scored the same way and the score depends on the
                # chosen features.
                acc[n_features - 1, fold] = mean_squared_error(
                    ytest, predicted)

                # NOTE(review): 13 is hard-coded; presumably the full feature
                # count of one of the two variants -- confirm.
                if n_features == 13:
                    ax[0].scatter(ytest, predicted)
                    a = [ytest.min(), ytest.max()]
                    ax[0].plot(a, a, 'k--', lw=4)
                    ax[0].set_xlabel('Measured')
                    ax[0].set_ylabel('Predicted')

        avg_acc = np.mean(acc, axis=1)
        print(avg_acc)
        print(len(avg_acc))
        print(np.argmin(avg_acc) + 1)
        ax[1].plot(np.arange(1, len(avg_acc) + 1), avg_acc, 'g.')
        fig.savefig('img_%s.png' % label)
        plt.close()

        # Refit on all rows with the best-scoring prefix.
        cols = list(best_cols[np.argmin(avg_acc) + 1])
        cols.sort()
        if model_by == "sk":
            LR = LinearRegression(fit_intercept=False)
            LR.fit(X[cols].values, target)
            print('-------------LINEAR REGRESSION-------------')
            print("R^2:  %s" % LR.score(X[cols].values, target))
            print("MSE:  %s" % mean_squared_error(target,
                                                  LR.predict(X[cols].values)))
            print('variable:%scoefficients:\nIntercept%s\n%s' % (
                ' ' * (70 - len('variable:')), ' ' *
                (70 - len('intercept')) + str(LR.intercept_), '\n'.join([
                    '%s%s%s' % (n, ' ' * (70 - len(n)), c)
                    for n, c in zip(cols, LR.coef_)
                ])))
        else:
            result = OLS(target, X[cols]).fit()
            print(result.summary())

        # Extra fixed-specification model. Only the model that is actually
        # fitted is built: interventions only for the time-series variant
        # (a fuller specification with 'Crimes_lag1month' and the month
        # dummies '01'..'12' used to be built here and then immediately
        # overwritten), household-income percentages otherwise.
        if time_model:
            model = OLS(target, mat[['Interventions']])
        else:
            model = OLS(
                target, mat[[
                    c for c in mat.columns
                    if re.match('Household Income.*Pct', c)
                ]])
        result = model.fit()
        print(result.summary())
示例#11
0
ols_result.tvalues  # if you need the t-values
ols_result.rsquared  # if you need R^2
ols_result.rsquared_adj  # if you need the adjusted R^2

# predict
test_X = pd.DataFrame([[1, 2, 3, 4, 5]], columns=['a', 'b', 'c', 'd', 'e'])
pred = ols_result.predict(test_X)
################################################################################

### Linear Regression ##########################################################
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(train_X, train_y)  # train_X must be a matrix (m x n)

model.coef_  # coefficients
model.predict([[1], [2], [10], [50], [100]])  # predict
# NOTE(review): the string values below look like placeholders for one row of
# feature values -- confirm.
model.predict([['a', 'b', 'c']])  # predict
model.score(train_X, train_y)  # coefficient of determination R^2

# mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true, y_pred)
################################################################################

### Logistic Regression ########################################################
from sklearn.linear_model import LogisticRegression

# Select several columns with a list (double brackets). A bare
# comma-separated key is a tuple, which pandas looks up as ONE column label
# and fails with KeyError.
train_X = df_lr[['dep_A', 'dep_B', 'dep_C', 'dep_D', 'dep_E']]
train_y = df_lr['indep']

lr = LogisticRegression(C=100000,
# Drop the rows whose turbo frequency is the literal string "Not found".
data = data[data.proccessor_turbo != "Not found"]

# Convert both processor columns to numeric values.
data["proccessor"] = to_numeric(data["proccessor"])
data["proccessor_turbo"] = to_numeric(data["proccessor_turbo"])
#print(data.info())

# Predictors and response for the price model.
x = data[["size", "proccessor", "proccessor_turbo", "ram", "hdd"]]
y = data["price"]

# Fit with scikit-learn and report intercept, coefficients and the score on
# the training data.
regr = linear_model.LinearRegression()
regr.fit(x, y)

print("Intercept: ", regr.intercept_)
print("Coeff: ", regr.coef_)
print("Score: ", regr.score(x, y))

# Feature values of the single configuration to price, in the same order as
# the columns of `x`.
new_size = 15.6
new_proccessor = 1.6
new_proccessor_turbo = 3.9
new_ram = 12
new_hdd = 1250

predicted = regr.predict(
    [[new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print("Predicted: ", predicted)

# Same model with statsmodels: add the intercept column, fit, and predict for
# the same configuration (the leading 1 is the value of the constant column).
# NOTE: this second `predicted` is computed but never printed.
x = add_constant(x)
model = OLS(y, x).fit()
predicted = model.predict(
    [[1, new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print(model.summary())


import statsmodels.api as sm


# In[65]:

# OLS of `timevncats` on X plus an intercept column.
ols=OLS(timevncats,sm.add_constant(X))


# In[66]:

ols=ols.fit()





# Number of rows in the full Clientes data set.
nclients=Clientes.shape[0]

# Predict at [1, n, n**2] (constant, linear and quadratic term) and divide by
# 60 twice.
# NOTE(review): presumably the model output is in seconds, making this
# hours -- confirm; it also assumes X holds exactly the columns n and n**2.
predtime=(ols.predict([1,nclients,nclients**2])/60/60)[0]

print('Full data set should take %i hours' % int(predtime))