def run():
    # Variables ticked in the two listboxes.
    varsY = [x for x in Y.columns.tolist()
             if Y.columns.tolist().index(x) in listboxY.curselection()]
    varsX = [x for x in X.columns.tolist()
             if X.columns.tolist().index(x) in listboxX.curselection()]
    if len(varsY) == 0:
        messagebox.showinfo('Notice', 'Select at least one outcome variable!')
        return
    if len(varsX) == 0:
        messagebox.showinfo('Notice', 'Select at least one predictor variable!')
        return
    # Complete rows are used for fitting; rows with missing values are predicted.
    complete = ~data.isnull().any(axis=1)
    global trainY, trainX
    trainY = Y[complete]
    trainX = add_constant(X[complete][varsX])
    testX = add_constant(X[~complete][varsX])
    result0 = DataFrame(columns=varsY)
    with ExcelWriter(saveFile, engine="openpyxl") as writer:
        for id, varY in enumerate(varsY):
            # Select the outcome by name: enumerate() indexes varsY, not Y.
            fit = OLS(trainY[varY], trainX).fit()
            # print(fit.summary2().tables)
            result0[varY] = fit.predict(testX)
            global result1
            result1 = fit.get_prediction(testX).summary_frame()
            result1.to_excel(writer, sheet_name=varY, header=True, index=True)
            global result2
            result2 = fit.summary2().tables
            result2[0].iloc[:, [0, 1]].to_excel(
                writer, sheet_name=varY, header=False, index=False,
                startrow=result1.shape[0] + 2, startcol=0)
            result2[0].iloc[:, [2, 3]].to_excel(
                writer, sheet_name=varY, header=False, index=False,
                startrow=result1.shape[0] + 2, startcol=5)
            result2[1].to_excel(
                writer, sheet_name=varY, header=True, index=True,
                startrow=result1.shape[0] + result2[0].shape[0] + 3)
        # Write the combined predictions once, after all models are fitted.
        result0.to_excel(writer, sheet_name="SUMMARY", header=True, index=True)
    # The context manager saves and closes the workbook on exit;
    # explicit writer.save()/writer.close() calls are not needed.
    messagebox.showinfo('Notice', 'Done!')
import numpy as np
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso
from sklearn.utils._testing import ignore_warnings  # sklearn >= 0.22
from statsmodels.api import OLS, add_constant


class PostLasso:
    def __init__(self, formula=None):
        self.lasso_model = Lasso()
        self.ols_model = None
        self.relevant_x = None
        self.subset_cols = None
        self.coefs = None
        self.formula = formula

    def __repr__(self):
        return f'PostLasso({self.formula})'

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, X, y, force_include_idx=None):
        '''
        Estimate a model using Post-Lasso.

        X: X matrix (without intercept)
        y: y vector
        force_include_idx: column indexes that are ALWAYS included in the
            OLS model, regardless of their status in the lasso stage.
        '''
        self.lasso_model = self.lasso_model.fit(X, y)
        # Insert the intercept as the first coefficient.
        self.coefs = np.insert(self.lasso_model.coef_, 0,
                               self.lasso_model.intercept_)
        # Keep the variables whose lasso coefficient is non-zero.
        self.subset_cols = np.where(self.coefs != 0)[0]
        if force_include_idx is not None:
            # Add the columns named in force_include_idx to subset_cols.
            self.subset_cols = np.union1d(self.subset_cols, force_include_idx)
        # Add a constant to X and keep only the selected columns.
        self.relevant_x = add_constant(X)[:, self.subset_cols]
        self.ols_model = OLS(y, self.relevant_x).fit()
        return self

    def predict(self, X=None):
        '''
        Predict using a fitted post-lasso model.
        '''
        if X is None:
            return self.ols_model.predict(self.relevant_x)
        if X.shape == self.relevant_x.shape:
            return self.ols_model.predict(X)
        return self.ols_model.predict(X[:, self.subset_cols])
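# Usage sketch for PostLasso (added; not from the original module). The data
# and names below are synthetic and purely illustrative. Note that predict()
# on new data expects the constant column to be present so that subset_cols
# line up with the training design matrix.
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 10))
y_demo = 2.0 * X_demo[:, 0] - 1.5 * X_demo[:, 3] + rng.normal(size=200)

pl = PostLasso().fit(X_demo, y_demo, force_include_idx=[0])  # index 0 keeps the intercept
in_sample = pl.predict()                                     # predictions on the training rows
new_preds = pl.predict(add_constant(rng.normal(size=(50, 10))))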
def fit(self, x, y):
    # Fit a quadratic (degree-2 polynomial) regression of y on x. AGES is
    # assumed to be a module-level 1D array giving the evaluation grid.
    x = array(x).reshape(-1, 1)
    model = OLS(y, PolynomialFeatures(2).fit_transform(x)).fit()
    grid = PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1))
    self.m = model.predict(grid)                 # fitted mean over the grid
    self.s = wls_prediction_std(model, grid)[0]  # prediction std over the grid
    return self
def run_regr(self):
    if self.pca_flag:
        self.train_x, self.test_x = self.pca(
            self.train_x, self.test_x, n_components=self.n_components)
    regr = OLS(self.train_y['Y_M_1'], add_constant(self.train_x)).fit()
    # print(regr.summary())
    try:
        y_pred = regr.predict(add_constant(self.test_x))
    except Exception as e:
        print(e)
        return None
    # print(f'R-square is {r2_score(self.test_y.Y_M_1, y_pred)}')
    # print(f'Mean - y_pred {np.mean(y_pred)}, Mean - y {np.mean(self.test_y.Y_M_1)}')
    return r2_score(self.test_y.Y_M_1, y_pred)
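# Note (added): by default add_constant() skips adding the intercept if the
# data already contains a constant column, so a degenerate test set can end
# up one column short of the training design. Passing has_constant='add'
# to both calls above would make the two design matrices line up regardless.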
)
mean_long.columns = range(1, 13)

import patsy
from statsmodels.api import OLS

# Regress the monthly means on spline bases in Year and Month.
y, X = patsy.dmatrices("Mean ~ bs(Year, 5) + bs(Month, 5)", data=mean)
model = OLS(y, X).fit()
print(model.summary())
mean["Pred"] = model.predict()
mean.columns = ['Mean', 'Year', 'Month', 'Fitted mean']
m_long = mean.pivot(index="Month", columns="Year", values="Mean")
d_long = mean.reset_index().pivot(index="Month", columns="Year", values="index")
color = plt.cm.coolwarm(np.linspace(0.1, 0.9, 12))
mpl.rcParams['axes.prop_cycle'] = cycler.cycler('color', color)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 3), sharey=True)
for i in range(12):
def obj_fun(x_t, y_t):
    # Objective: in-sample R^2 of an OLS fit of y_t on x_t.
    model = OLS(y_t, x_t).fit()
    pred = model.predict(x_t)
    return r2_score(y_t, pred)
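# Quick check of obj_fun on toy data (added; names are illustrative). When
# x_t includes a constant column, the value returned matches the fitted
# model's rsquared attribute.
import numpy as np
from statsmodels.api import add_constant

x_demo = add_constant(np.arange(20.0))
y_demo = 3.0 + 2.0 * np.arange(20.0) + np.random.default_rng(1).normal(size=20)
print(obj_fun(x_demo, y_demo))  # close to 1.0 for this nearly linear data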
class WordCountEstimator:
    """
    Model to estimate the word counts of audio files from their syllable
    envelopes.

    Given a syllable envelope, the number of syllable nuclei is determined
    using a peak-picking algorithm. The nuclei count is then mapped linearly
    to the word count. Both the peak picking and the linear mapping can be
    trained/adapted if the number of words per file is provided.

    Attributes
    ----------
    threshold : float
        Minimum value separating a maximum from its left neighbour for this
        maximum to be considered a peak.
    lin_reg : statsmodels.regression.linear_model.RegressionResults
        Fitted OLS model mapping the nuclei count to the word count.
    alpha : float
        Recall of the SAD, used to readjust the word counts.
    additional_features : list
        List of features (str) to add to the estimated word count for the
        linear-mapping training.

    Methods
    -------
    summary()
        Print a summary of the model.
    save_model(model_file)
        Save the model to a given file.
    load_model(model_file)
        Load the model from a given file.
    add_features(envelope)
        Compute the desired features from a syllable envelope.
    train(envelopes, target_word_counts, model_file, thresholds)
        Train the model given syllable envelopes and their respective target
        word counts. The resulting model is saved to model_file.
    predict(envelopes)
        Predict the word counts for a given list of syllable envelopes.
    """

    def __init__(self, threshold=0.5, alpha=1, additional_features=None):
        self.threshold = threshold
        # Placeholder fit so that predict() is usable before training.
        self.lin_reg = OLS([1], [1]).fit()
        self.alpha = alpha
        # Avoid a mutable default argument.
        self.additional_features = additional_features or []

    def summary(self):
        """
        Print a summary of the model.
        """
        print("Summary of WCE model:")
        for attr in self.__dict__:
            if attr != "lin_reg":
                print(attr, self.__dict__[attr])
        print("lin_reg coefficients", self.__dict__["lin_reg"].params)

    def save_model(self, model_file):
        """
        Save the model to a given file.

        Parameters
        ----------
        model_file : str
            Path to the model's file.
        """
        try:
            pickle.dump(self.__dict__, open(model_file, 'wb'))
        except Exception:
            sys.exit("Error with WCE model file.")

    def load_model(self, model_file):
        """
        Load the model from a given file.

        Parameters
        ----------
        model_file : str
            Path to the model's file.
        """
        try:
            model = pickle.load(open(model_file, 'rb'))
        except Exception:
            sys.exit("Error with WCE model file.")
        for attr in model:
            setattr(self, attr, model[attr])

    def add_features(self, envelope):
        """
        Compute the desired features from a syllable envelope.

        Parameters
        ----------
        envelope : ndarray
            1D array containing the values of the syllable envelope.

        Returns
        -------
        features : list
            List of the computed features.
        """
        features = []
        if 'duration' in self.additional_features:
            durs = len(envelope) / 100
            features.append(durs)
        if 'sonority_total_energy' in self.additional_features:
            en_sonor_total = np.sum(envelope)
            features.append(en_sonor_total)
        if 'sonority_mean_energy' in self.additional_features:
            en_sonor_mean = np.mean(envelope)
            features.append(en_sonor_mean)
        if 'sonority_SD_energy' in self.additional_features:
            en_sonor_sd = np.std(envelope)
            features.append(en_sonor_sd)
        # TODO: Possibility to add more.
        return features

    def train(self, envelopes, target_word_counts, model_file, thresholds=THR):
        """
        Train the model given syllable envelopes and their respective target
        word counts. The resulting model is then saved to model_file.

        Training works as follows:
        - estimate the number of syllable nuclei per envelope according to
          different thresholds and choose the threshold that produces the
          best correlation between the estimated number of nuclei and the
          target word counts.
        - using the estimated number of nuclei resulting from the optimal
          threshold, determine the coefficients of the linear mapping.

        Parameters
        ----------
        envelopes : ndarray
            2D, array of envelope per waveform.
        target_word_counts : list
            List of the word count per envelope.
        model_file : str
            Path of where to save the model file.
        thresholds : list
            List of the threshold values to test for the model adaptation.
        """
        self.additional_features = [
            "duration", "sonority_mean_energy", "sonority_SD_energy"
        ]
        n_envelopes = len(envelopes)
        n_thresholds = len(thresholds)

        # Count syllable nuclei per file for each candidate threshold.
        estimated_nuclei_counts = np.zeros((n_envelopes, n_thresholds))
        for i in range(n_envelopes):
            for j in range(n_thresholds):
                n_syl_nuclei = len(peakdet(envelopes[i], thresholds[j])[0])
                estimated_nuclei_counts[i, j] = n_syl_nuclei

        # Determine the best threshold.
        corvals = np.zeros(n_thresholds)
        for k in range(n_thresholds):
            all_zeros = not np.any(estimated_nuclei_counts[:, k])
            if not all_zeros:
                corvals[k] = np.corrcoef(target_word_counts,
                                         estimated_nuclei_counts[:, k],
                                         rowvar=False)[0][1]
        try:
            opti_k = np.nanargmax(corvals)
        except ValueError:  # all correlations are NaN
            opti_k = 0
        opti_threshold = thresholds[opti_k]
        nuclei_counts = estimated_nuclei_counts[:, opti_k]

        # Create an array X from nuclei_counts and the additional features.
        X = np.zeros((n_envelopes, 1 + len(self.additional_features)))
        for l in range(n_envelopes):
            X[l, 0] = nuclei_counts[l]
            X[l, 1:] = self.add_features(envelopes[l])
        X = add_constant(X, has_constant='add')

        # Multiple linear regression of target_word_counts on X.
        self.lin_reg = OLS(target_word_counts, X).fit()
        # Readjust coefficients by dividing by alpha, the recall of the SAD.
        self.lin_reg.params /= self.alpha

        self.threshold = opti_threshold
        self.save_model(model_file)

    def predict(self, envelopes):
        """
        Predict the word counts for a given list of syllable envelopes.

        Parameters
        ----------
        envelopes : ndarray
            2D, array of envelope per file.

        Returns
        -------
        word_counts : ndarray
            1D array containing the estimated word count per audio
            file/envelope.
        """
        n_envelopes = len(envelopes)
        X = np.zeros((n_envelopes, 1 + len(self.additional_features)))
        for k in range(n_envelopes):
            n_syl_nuclei = len(peakdet(envelopes[k], self.threshold)[0])
            X[k, 0] = n_syl_nuclei
            X[k, 1:] = self.add_features(envelopes[k])
        if len(self.lin_reg.params) > 1:
            X = add_constant(X, has_constant='add')
        word_counts = self.lin_reg.predict(X)
        return word_counts
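# Hedged usage sketch (added): synthetic envelopes and word counts, purely
# illustrative; peakdet and THR are assumed to be defined in this module as
# above.
import numpy as np

rng = np.random.default_rng(0)
envelopes = [np.abs(np.sin(np.linspace(0, 6 * np.pi, 400)))
             + 0.05 * rng.random(400) for _ in range(10)]
target_word_counts = [3, 4, 3, 5, 4, 3, 4, 5, 3, 4]

wce = WordCountEstimator(alpha=0.9)
wce.train(envelopes, target_word_counts, model_file="wce_demo.pkl")
print(wce.predict(envelopes))  # estimated word count per envelope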
trendCoeff = coefficients[1]
heatCoeff = coefficients[2]
avgIDY = trendCoeff / abs(heatCoeff)
print('avgIDY = ', avgIDY)
avgHeat = (intercept - avgIDY) / abs(heatCoeff)
print('long-term average heat measure = ', avgHeat)

# Heat measure (np.arange, not range: a float cannot multiply a range object).
Heat = cumIDY - avgIDY * np.arange(T)
plt.figure(figsize=(7, 6))
plt.plot(range(NEW, LAST), Heat)
print('current heat measure = ', Heat[-1])
plt.title('Heat measure')
plt.show()
print('Correlation of heat measure and total returns = ',
      stats.pearsonr(Heat[:-1], TR[W:])[0])

# Analysis of regression residuals for white noise and normality.
residuals = IDY - Regression.predict(DF)
stderr = np.std(residuals)
print('stderr = ', stderr)
print('Shapiro-Wilk normality test for residuals', stats.shapiro(residuals))
print('Jarque-Bera normality test for residuals', stats.jarque_bera(residuals))
aresiduals = abs(residuals)
qqplot(residuals, line='s')
plt.title('residuals')
plt.show()
plot_acf(residuals)
plt.title('original values of residuals')
plt.show()
plot_acf(aresiduals)
plt.title('absolute values of residuals')
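# Optional complement (added sketch): a formal white-noise check alongside
# the ACF plots, using the Ljung-Box test from statsmodels.
from statsmodels.stats.diagnostic import acorr_ljungbox
print('Ljung-Box test for residuals\n', acorr_ljungbox(residuals, lags=[10]))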
# Feature engineering: selection
print(boston.corr(method='pearson'))

# Choose which attributes to use in the regression.
X = boston['LSTAT']
y = boston['MEDV']

# Regression
model = OLS(y, add_constant(X))
model = model.fit()
theta = model.params
print("Estimated parameters:\n", theta)

# Prepare plots.
fig, ax = plt.subplots(figsize=(12, 8))
# Scatter plot.
ax.scatter(X, y, label='Dataset', color='Cyan')
# Show the linear regression.
x = np.linspace(X.min(), X.max(), len(X))
ax.plot(x, model.predict(add_constant(x)), label='OLS', color='Green')
# Plot settings.
ax.set_xlabel('LSTAT')
ax.set_ylabel('MEDV')
ax.set_title("MEDV vs LSTAT")
ax.legend()
plt.show()
def regression(self, time_model=True, model_by="sk"):
    mat = self.census_scatter(time_model=time_model)
    # X = mat[[c for c in mat.columns if c not in ('COMMUNITY', 'COMMUNITY AREA NAME', 'Community Area', 'Avg. Annual Crimes', 'General Population: Population Change, 2000-10', 'Population: 2000 Census', 'SHAPE_AREA')]]
    if time_model:
        X = mat[self.dummy_cols + ['Crimes_lag1month']]
        y = mat['Crimes']
        label = 'time_series'
    else:
        X = mat[[c for c in mat.columns
                 if 'Pct' in c or c == 'Population Density']]
        y = mat['Avg. Annual Crimes']
        label = 'census'
    significant_cols = list()
    fig, ax = plt.subplots(2)
    kf = KFold(n_splits=5)
    best_cols = dict()
    # One MSE per (n_features, fold); sizing the second axis by len(y)
    # would skew the fold average with zero padding.
    acc = np.zeros((len(X.columns), kf.get_n_splits()))
    for n_features in range(1, len(X.columns) + 1):
        SK = SelectKBest(chi2, k=n_features)
        SK.fit(X.values, y.values.astype(int))
        cols = X.columns[np.argsort(SK.scores_)[::-1][0:n_features]]
        best_cols[n_features] = cols
        for fold, (train, test) in enumerate(kf.split(np.arange(len(y)))):
            Xtrain = X[cols].values[train]
            ytrain = y.values[train].astype(int)
            Xtest = X[cols].values[test]
            ytest = y.values[test].astype(int)
            if model_by == "sk":
                LR = LinearRegression(fit_intercept=False)
                LR.fit(Xtrain, ytrain)
                mse = mean_squared_error(ytest, LR.predict(Xtest))
            elif model_by == "sm":
                model = OLS(ytrain, Xtrain)
                result = model.fit()
                mse = result.mse_total
            acc[n_features - 1, fold] = mse
            if n_features == 13:
                if model_by == "sk":
                    predicted = LR.predict(Xtest)
                elif model_by == "sm":
                    predicted = np.zeros(len(ytest))
                    # Use a separate index so `label` is not clobbered.
                    for row_idx, x in enumerate(Xtest):
                        predicted[row_idx] = model.predict(result.params, exog=x)
                ax[0].scatter(ytest, predicted)
                a = [ytest.min(), ytest.max()]
                ax[0].plot(a, a, 'k--', lw=4)
                ax[0].set_xlabel('Measured')
                ax[0].set_ylabel('Predicted')
    avg_acc = np.mean(acc, axis=1)
    print(avg_acc)
    print(len(avg_acc))
    print(np.argmin(avg_acc) + 1)
    ax[1].plot(np.arange(1, len(avg_acc) + 1), avg_acc, 'g.')
    fig.savefig('img_%s.png' % label)
    plt.close()
    cols = list(best_cols[np.argmin(avg_acc) + 1])
    cols.sort()
    if model_by == "sk":
        LR.fit(X[cols].values, y.values.astype(int))
        print('-------------LINEAR REGRESSION-------------')
        print("R^2: %s" % LR.score(X[cols].values, y.values.astype(int)))
        print("MSE: %s" % mean_squared_error(y.values.astype(int),
                                             LR.predict(X[cols].values)))
        print('variable:%scoefficients:\nIntercept%s\n%s' % (
            ' ' * (70 - len('variable:')),
            ' ' * (70 - len('intercept')) + str(LR.intercept_),
            '\n'.join(['%s%s%s' % (n, ' ' * (70 - len(n)), c)
                       for n, c in zip(cols, LR.coef_)])))
    elif model_by == "sm":
        model = OLS(y.values.astype(int), X[cols])
        result = model.fit()
        print(result.summary())
        # Additional exploratory specifications.
        model = OLS(y.values.astype(int),
                    mat[[c for c in mat.columns
                         if re.match('Household Income.*Pct', c)]])
        if time_model:
            model = OLS(y.values.astype(int),
                        mat[['Crimes_lag1month', 'Interventions',
                             '01', '02', '03', '04', '05', '06',
                             '07', '08', '09', '10', '11', '12']])
            model = OLS(y.values.astype(int), mat[['Interventions']])
        result = model.fit()
        print(result.summary())
ols_result.tvalues       # if you need the t-values
ols_result.rsquared      # if you need R^2
ols_result.rsquared_adj  # if you need adjusted R^2

# predict
test_X = pd.DataFrame([[1, 2, 3, 4, 5]], columns=['a', 'b', 'c', 'd', 'e'])
pred = ols_result.predict(test_X)

################################################################################
### Linear Regression ##########################################################
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(train_X, train_y)  # train_X must be a matrix (m x n)
model.coef_                                   # coefficients
model.predict([[1], [2], [10], [50], [100]])  # predict
model.score(train_X, train_y)                 # coefficient of determination R^2

# mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true, y_pred)

################################################################################
### Logistic Regression ########################################################
from sklearn.linear_model import LogisticRegression

train_X = df_lr[['dep_A', 'dep_B', 'dep_C', 'dep_D', 'dep_E']]
train_y = df_lr['indep']
lr = LogisticRegression(C=100000,
data = data[data.proccessor_turbo != "Not found"]
data["proccessor"] = to_numeric(data["proccessor"])
data["proccessor_turbo"] = to_numeric(data["proccessor_turbo"])
# print(data.info())

x = data[["size", "proccessor", "proccessor_turbo", "ram", "hdd"]]
y = data["price"]

regr = linear_model.LinearRegression()
regr.fit(x, y)
print("Intercept: ", regr.intercept_)
print("Coeff: ", regr.coef_)
print("Score: ", regr.score(x, y))

new_size = 15.6
new_proccessor = 1.6
new_proccessor_turbo = 3.9
new_ram = 12
new_hdd = 1250
predicted = regr.predict(
    [[new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print("Predicted: ", predicted)

# Same regression with statsmodels; the leading 1 is the constant term.
x = add_constant(x)
model = OLS(y, x).fit()
predicted = model.predict(
    [[1, new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]])
print(model.summary())
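# Possible follow-up (added sketch): statsmodels can also report a prediction
# interval for the new laptop, which sklearn's LinearRegression does not
# expose directly.
pred_frame = model.get_prediction(
    [[1, new_size, new_proccessor, new_proccessor_turbo, new_ram, new_hdd]]
).summary_frame(alpha=0.05)
print(pred_frame[["mean", "obs_ci_lower", "obs_ci_upper"]])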
import statsmodels.api as sm

# In[65]:

ols = OLS(timevncats, sm.add_constant(X))

# In[66]:

ols = ols.fit()
nclients = Clientes.shape[0]
# Extrapolate the timing model (quadratic in the number of clients) and
# convert seconds to hours.
predtime = (ols.predict([1, nclients, nclients**2]) / 60 / 60)[0]
print('Full data set should take %i hours' % int(predtime))