Example #1
def ols_plot(y, x, add_intercept=True, alpha=0.05, xlim=None, ax=None):
    """
    Generate a scatter plot with the OLS fit plus prediction intervals
    :param y: response values
    :param x: single regressor
    :param add_intercept: if True, include an intercept in the fit
    :param alpha: significance level for the prediction interval
    :param xlim: optional x-axis limits, also used as the prediction grid range
    :param ax: optional matplotlib axes to draw on
    :return: fitted OLS results and the axes
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    try:
        x = x.astype(float)
    except Exception:
        pass

    if add_intercept:
        X = sm.add_constant(x)
    else:
        X = x

    model = sm.OLS(y, X)
    res = model.fit()

    # plot data
    ax.scatter(x, y, marker='o')
    if xlim is None:
        xlim = np.array(ax.get_xlim())

    xx = np.linspace(xlim[0], xlim[1], 100)

    # compute prediction and confidence intervals
    if add_intercept:
        b0, b1 = res.params
        sdev, lower, upper = wls_prediction_std(res,
                                                sm.add_constant(xx),
                                                alpha=alpha)
        # b0_min, b0_max = res.conf_int(alpha=alpha)[0]
        # b1_min, b1_max = res.conf_int(alpha=alpha)[1]

    else:
        b1 = res.params[0]
        b0 = 0.
        # xx must be 2-D (one column) to pass wls_prediction_std's exog shape check
        sdev, lower, upper = wls_prediction_std(res, xx[:, None], alpha=alpha)
        # b0 = b0_min = b0_max = 0.
        # b1_min, b1_max = res.conf_int(alpha=alpha)[0]

    ax.plot(xx, b0 + b1 * xx, 'k-', lw=1.5)
    ax.fill_between(xx, lower, upper, edgecolor='b', facecolor='b', alpha=0.4)

    # lower = b0_min + b1_min * xlim
    # upper = b0_max + b1_max * xlim
    # ax.fill_between(xlim, lower, upper, edgecolor='b', facecolor='b', alpha=0.4)

    ax.set_xlim(xlim)
    return res, ax
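A minimal usage sketch with synthetic data, assuming the imports the function already relies on (numpy as np, matplotlib.pyplot as plt, statsmodels.api as sm, wls_prediction_std):

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

rng = np.random.default_rng(0)
x = np.linspace(0, 10, 50)
y = 2.0 + 0.5 * x + rng.normal(scale=1.0, size=x.size)

res, ax = ols_plot(y, x, add_intercept=True, alpha=0.05)
print(res.params)  # intercept and slope
plt.show()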
Example #2
def plot_regression(_output):
    global analysis

    def to_percent(x, position):
        s = str(100 * x)
        if plt.rcParams['text.usetex'] is True:
            return s + r'$\%$'
        else:
            return s + '%'

    mpl.style.use('classic')
    fig = plt.figure()
    x = np.array(analysis.iloc[:, 1], dtype=float)
    y = np.array(analysis.iloc[:, 2], dtype=float) * 100
    model_x = sm.add_constant(x)
    model = sm.OLS(y, model_x)
    fitted = model.fit()
    fit = np.polyfit(x, y, 1)
    fit_function = np.poly1d(fit)
    sdev, lower, upper = wls_prediction_std(fitted, alpha=0.05)
    plt.fill_between(x, lower, upper, color='#67e0d7', alpha=0.3)
    plt.plot(x, fit_function(x), color='green')

    x = np.array(analysis.iloc[:, 4], dtype=float)
    y = np.array(analysis.iloc[:, 5], dtype=float) * 100
    model_x = sm.add_constant(x)
    model = sm.OLS(y, model_x)
    fitted = model.fit()
    fit = np.polyfit(x, y, 1)
    fit_function = np.poly1d(fit)
    sdev, lower, upper = wls_prediction_std(fitted, alpha=0.05)
    plt.fill_between(x, lower, upper, color='#93e067', alpha=0.3)
    plt.plot(x, fit_function(x), color='black')
    formatter = FuncFormatter(to_percent)
    plt.gca().xaxis.set_major_formatter(formatter)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.xlim(0, 1)
    plt.ylim(0, 10)
    circle1 = mlines.Line2D(
        [], [],
        color='green',
        markerfacecolor="green",
        label='AEP Risk (CDC Definition)')  # Make a circle for the legend
    circle2 = mlines.Line2D(
        [], [],
        color='black',
        markerfacecolor="black",
        label='AEP Risk (Actual)')  # Make a circle for the legend
    plt.legend(handles=[circle1, circle2],
               numpoints=1,
               prop={'size': 15},
               loc=2)
    plt.title('Compliance with CDC Recommendation (%)', fontsize=20)
    plt.tight_layout()
    plt.savefig(_output + '.pdf')
    plt.savefig(_output + '.jpg', bbox_inches='tight')
Example #3
def _regional_prediction(X, X_pred, Y, i):
    mod = sm.OLS(np.log(Y[i]), X)
    res = mod.fit()

    y_pred = res.predict(X_pred)
    _, _, std_u = wls_prediction_std(res, exog=X_pred, alpha=1-0.6827)  # 1 s.d.
    _, ci_l, ci_u = wls_prediction_std(res, exog=X_pred, alpha=1-0.95)  # 95% CI

    return y_pred, std_u, ci_l, ci_u, res.params[1]
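The alpha values above follow wls_prediction_std's two-sided convention: alpha=1-0.6827 yields roughly a one-standard-deviation band, alpha=1-0.95 the usual 95% interval. A usage sketch, with the design matrix and data invented purely for illustration:

import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

rng = np.random.default_rng(1)
t = np.arange(20, dtype=float)
X = sm.add_constant(t)  # rows of [1, t]
Y = {0: np.exp(0.1 + 0.05 * t + rng.normal(scale=0.02, size=t.size))}
X_pred = sm.add_constant(np.arange(20.0, 25.0))

y_pred, std_u, ci_l, ci_u, slope = _regional_prediction(X, X_pred, Y, 0)
print(slope)  # estimated growth rate of log(Y)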
Example #4
    def predict(self, ID, ALPHA=0.5):
        list1 = get_data(ID)
        vector = self.vectorizer.transform([list1[0]])
        vector = self.lsa.transform(vector)
        array = np.array([list1[1:4]])**2.0 / self.sum
        array = array**0.5
        vector = np.hstack([vector, array])
        length = vector.shape[1]
        '''
        for i in range(length):
            tmp = vector[0][i] * vector[0][i]
            tmp = np.array([[tmp]])
            vector = np.hstack([vector, tmp])
        '''

        for i in range(length):
            for j in range(i, length):
                tmp = vector[0][i] * vector[0][j]
                tmp = np.array([[tmp]])
                vector = np.hstack([vector, tmp])
        vector = del_vector(vector, self.dellist)

        estimated = self.results.predict(vector)
        prstdn, infa, supa = wls_prediction_std(self.results,
                                                vector,
                                                alpha=ALPHA)

        if infa[0] < 0:
            infa[0] = 0
        return estimated[0]**2.0, infa[0]**2.0, supa[0]**2.0
Example #5
def visualize_linear_regression(data,
                                fit,
                                stock,
                                benchmark,
                                axis_low=-0.1,
                                axis_high=0.1,
                                show_std=False):
    """Create a scatter plot and linear model of the stock returns vs. the benchmark returns.
    
    Arguments:
        data      -- a tidy DataFrame, with columns stock_ret, bench_ret, and const
        fit       -- a linear regression result
        stock     -- name of the stock
        benchmark -- name of the benchmark index
        axis_low  -- lowest value for x and y axes (default -0.1)
        axis_high -- highest value for x and y axes (default 0.1)
        show_std  -- whether to show upper and lower bands around answer (default False)
    """
    ax = data.plot(kind='scatter',
                   x='bench_ret',
                   y='stock_ret',
                   title='1-Day Returns',
                   xlim=(axis_low, axis_high),
                   ylim=(axis_low, axis_high))
    X_new = pd.DataFrame({'bench_ret': [axis_low, axis_high]})
    X_new['const'] = 1
    preds = fit.predict(X_new)
    plt.plot(X_new['bench_ret'], preds, 'r-')
    if show_std:
        _, lower, upper = wls_prediction_std(fit, X_new)
        plt.plot(X_new['bench_ret'], lower, 'r--', X_new['bench_ret'], upper,
                 'r--')
    ax.set_xlabel(benchmark)
    ax.set_ylabel(stock)
    ax.set_aspect(1)
Example #6
 def get_predicted_y_PI(self, x_pred, alpha=0.05):
     """ :returns prediction interval of the y at the provided x values """
     X_pred = self.f.get_X(x_pred)
     sdev, lower, upper = wls_prediction_std(self.fitted,
                                             exog=X_pred,
                                             alpha=alpha)
     return lower, upper
Example #7
def try_prod24h_before(
        columns=['Tout', 'vWind', 'vWindavg24', 'prod24h_before'],
        add_const=False,
        y=y):
    plt.close('all')
    X = all_data[columns]
    res = mlin_regression(y, X, add_const=add_const)
    timesteps = ens.gen_hourly_timesteps(dt.datetime(2015, 12, 17, 1),
                                         dt.datetime(2016, 1, 15, 0))

    plt.subplot(2, 1, 1)
    plt.plot_date(timesteps, y, 'b', label='Actual production')
    plt.plot_date(timesteps, res.fittedvalues, 'r', label='Weather model')
    prstd, iv_l, iv_u = wls_prediction_std(res)
    plt.plot_date(timesteps, iv_u, 'r--', label='95% conf. int.')
    plt.plot_date(timesteps, iv_l, 'r--')
    plt.ylabel('MW')
    plt.legend(loc=2)
    plt.subplot(2, 1, 2)
    plt.plot_date(timesteps, res.resid, '-', label='Residual')
    plt.ylabel('MW')
    plt.legend()

    print "MAE = " + str(mae(res.resid))
    print "MAPE = " + str(mape(res.resid, y))
    print "RMSE = " + str(rmse(res.resid))

    print res.summary()

    return res
Example #8
def plot_locality_regression(snps, cob, gene_limit=10):
    # Get degree and bootstrap degree
    log('Fetching Empirical Degree')
    degree = cob.locality(
        cob.refgen.candidate_genes(snps, gene_limit=gene_limit,
                                   chain=True)).sort_values('local')
    log('Fetching BS Degree')
    #bsdegree = pd.concat([cob.locality(cob.refgen.bootstrap_candidate_genes(snps,gene_limit=gene_limit,chain=True)) for x in range(50)]).sort('local')
    # get OLS for the bootstrapped degree
    log('Fitting models')
    model = sm.OLS(degree['global'], degree.local)
    res = model.fit()
    std, iv_l, iv_u = wls_prediction_std(res)
    # plot the bootstrapped data
    fig, ax = pylab.subplots(figsize=(8, 6))  # axes hold by default in modern matplotlib
    ax.set_xlim(0, max(degree.local))
    ax.set_ylim(0, max(degree['global']))
    # plot the bootstraps std
    # plot the true data
    log('Plotting Empirical')
    ax.plot(degree.local, degree['global'], 'o', label='Empirical')
    log('Plotting Residuals')
    ax.plot(degree.local, res.fittedvalues, '--')
    ax.plot(degree.local, res.fittedvalues + 2.5 * std, 'r--')
    ax.plot(degree.local, res.fittedvalues - 2.5 * std, 'r--')
    ax.set_xlabel('Number Local Interactions')
    ax.set_ylabel('Number Global Interactions')
    log('Saving Figure')
    fig.savefig('{}_locality.png'.format(cob.name))
Example #9
def linreg_stock(stock_ticker='AAPL', start_date = '2019-12-01',
                 end_date = '2020-02-05', visualize=False):
    """
    Defines an ordinary linear regression, between time and 
    stock price. The function returns the 95% confidence interval
    of the slope beta
    """
    this_stock = Stock(stock_ticker)
    panel_data = this_stock.get_price_history(
        start_date=start_date, end_date=end_date)
    if panel_data.shape[0] == 0:
        return [np.nan, np.nan]

    panel_data['x_val'] = list(range(panel_data.shape[0]))

    X = panel_data['x_val']
    y = panel_data['Open'].values/panel_data['Open'].values[0]
    X = sm.add_constant(X)
    model = sm.OLS(y, X)
    results = model.fit()

    A = results.conf_int(alpha=0.05, cols=None)

    beta_lower = A[0]['x_val']
    beta_upper = A[1]['x_val']

    if visualize:
        prstd, iv_l, iv_u = wls_prediction_std(results)

        plt.plot(panel_data['x_val'], y, 'ro')
        plt.plot(panel_data['x_val'], results.fittedvalues, 'r--.', label="OLS")

        plt.plot(panel_data['x_val'], iv_l, 'b--.')
        plt.plot(panel_data['x_val'], iv_u, 'b--.')
    return [beta_lower, beta_upper]
Example #10
def LR(df, options):
    origsize = len(df)

    ldf = df.copy()
    if 'divisor' in options:
        ldf = df.tail(int(origsize / options['divisor']))

    x = np.arange(len(ldf))
    X = sm.add_constant(x)

    res = sm.OLS(ldf, X).fit()
    print(res.summary())

    std, lower, upper = wls_prediction_std(res)
    middle = res.predict()

    nfill = origsize - len(ldf)
    if nfill < origsize:
        empty = np.full_like(np.arange(nfill), np.nan, dtype=np.double)
        std = np.append(empty, std)
        lower = np.append(empty, lower)
        upper = np.append(empty, upper)
        middle = np.append(empty, middle)

    return std, middle, lower, upper
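A usage sketch for LR, assuming df is a one-column numeric object that sm.OLS accepts as endog (a pandas Series works); the price series is synthetic:

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

prices = pd.Series(100 + np.cumsum(np.random.default_rng(2).normal(size=200)))
std, middle, lower, upper = LR(prices, {'divisor': 2})
# arrays keep the original length; the untrimmed head is NaN-padded
print(len(middle), np.isnan(middle).sum())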
Example #11
def user_model(data):
    """
    This function allows the user to enter their own linear regression
    model formula, which is then run in the statsmodels package and
    returns model results.
    """

    # List available covariates in the data set
    print('The data set contains the following covariates: \n')
    print(list(data.columns), '\n')

    # Prompt user to input model formula, in R type syntax
    userFormula = input('Enter your regression model formula, using syntax as shown: \n \n dependent_variable ~ covariate1 + covariate2 + ... \n \n')

    # Run the user-defined model as a statsmodels linear regression
    userModel = smf.ols(formula=userFormula, data=data).fit()
    print('\n', userModel.summary(), '\n')

    # Retrieve y variable and time variable for plotting
    yvar = userModel.model.endog_names
    y = data[yvar]
    timeVar = list(data.columns[data.dtypes == 'datetime64[ns]'])
    x = data[timeVar]
    # covars = list(userModel.params.keys())


    # Plot dependent variable data and model fitted values vs time
    prstd, iv_l, iv_u = wls_prediction_std(userModel)
    fig = plt.figure(figsize=(12,6))

    plt.plot(x, userModel.fittedvalues, 'r.', alpha=0.2, label='Fitted Values')
    plt.plot(x, y, 'b.', alpha=0.2, label='%s data' % yvar)
    plt.legend(loc='upper left')
    plt.title('%s actual data and model fitted values' % yvar, fontsize='x-large')
Example #12
def get_prediction(res, x):
    """
    得到模型的预测结果以及结果的上下限
    """
    prstd, ci_low, ci_up = wls_prediction_std(res, alpha=0.05)
    pred = res.predict(x)
    return pd.DataFrame({"ci_low": ci_low, "pred": pred, "ci_up": ci_up})
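A usage sketch (synthetic data; res is any fitted OLS result and x the exog the bounds should describe):

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

rng = np.random.default_rng(3)
x = sm.add_constant(np.linspace(0, 1, 30))
y = x @ [1.0, 2.0] + rng.normal(scale=0.1, size=30)
res = sm.OLS(y, x).fit()
print(get_prediction(res, x).head())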
Example #13
def dataframe_ordinary_least_squares(dataframe_in,
                                     x_col_name,
                                     y_col_name,
                                     showplot=False):

    x = dataframe_in[x_col_name].to_numpy()
    X = sm.add_constant(x)
    X = np.array(X, dtype=float)
    y = dataframe_in[y_col_name].to_numpy()

    model = sm.OLS(y, X)
    results = model.fit()

    prstd, iv_l, iv_u = wls_prediction_std(results)

    dataframe_in['OLS Values'] = results.fittedvalues
    dataframe_in['Confidence Upper'] = iv_u
    dataframe_in['Confidence Lower'] = iv_l

    if showplot:
        print(results.summary())
        fig, ax = plt.subplots()
        ax.scatter(x, y, color="#778899", label="Test Volume")
        ax.plot(x,
                dataframe_in['OLS Values'],
                ".--",
                color="#4682B4",
                label="Ordinary Least Squares Regression")
        ax.plot(x, iv_u, color="#F08080", ls=":")
        ax.plot(x, iv_l, color="#F08080", ls=":")
        plt.show()
Example #15
def PlotFit(res, x):

    prstd, iv_l, iv_u = wls_prediction_std(res)

    fig, ax = plt.subplots(figsize=(8, 6))

    ax.plot(x, 'o', label='data')
    #ax.plot(x, y_true, 'b-', label="True")
    ax.plot(x.index, res.fittedvalues, 'r--.', label="OLS")
    ax.plot(x.index, iv_u, 'r--')
    ax.plot(x.index, iv_l, 'r--')

    # Draw the predictions for one year into the future
    time_today = np.datetime64(datetime.today())
    #print((time_today-x.index.values[-1])/np.timedelta64(5724000, 's'))
    dtRange = np.linspace(
        pd.Timestamp(time_today).value,
        pd.Timestamp(time_today + np.timedelta64(31536000, 's')).value, 4)
    dtRange = pd.to_datetime(dtRange)
    Xnew = np.ones((x.shape[0] + 4, 2))  # column 0 is the constant (sm.add_constant would skip it anyway)
    Xnew[:, 1] = np.arange(x.shape[0] + 4)
    #print(Xnew)
    ynewpred = res.predict(Xnew)
    #print(ynewpred)
    ax.plot(x.index.union(dtRange), ynewpred, 'r', label="OLS prediction")
    ax.legend(loc='best')
    if draw: plt.show()
Example #16
def mcp_model(df_sector, sector, thres_v):
    #    print('Check the correlation between site and satellite:')
    #    corr_sit_sat=pearsonr(df_regression['speed_sit'],df_regression['speed_sat'])
    #    if corr_sit_sat[0]<threshold_satcorr or corr_sit_sat[1]>threshold_p:
    #        print("Pearson's test p-value is %f %s %f, correlation between valuables is %f %s %f, therefore this satellite data should be rejected."%(corr_sit_sat[1],'>' if corr_sit_sat[1]>threshold_p else '<=',threshold_p,corr_sit_sat[0],'<' if corr_sit_sat[0]<threshold_satcorr else '>=',threshold_satcorr))
    #        sys.exit()
    #    else:
    #        print("Pearson's test p-value is %f <= %f, and the correlation between valuables is %f >= %f"%(corr_sit_sat[1],threshold_p,corr_sit_sat[0],threshold_satcorr))
    #    corr3_sit_sat=pearsonr(df_regression['speed3_sit'],df_regression['speed3_sat'])
    #    if corr3_sit_sat[0]<threshold_satcorr or corr3_sit_sat[1]>threshold_p:
    #        print("Pearson's test p-value is %f %s %f, correlation between valuables is %f %s %f, therefore this satellite data should be rejected."%(corr3_sit_sat[1],'>' if corr3_sit_sat[1]>threshold_p else '<=',threshold_p,corr3_sit_sat[0],'<' if corr3_sit_sat[0]<threshold_satcorr else '>=',threshold_satcorr))
    #        sys.exit()
    #    else:
    #        print("Pearson's test p-value is %f <= %f, and the correlation between valuables is %f >= %f"%(corr3_sit_sat[1],threshold_p,corr3_sit_sat[0],threshold_satcorr))
    #
    #    print('Linear regression (speed_sit, veer) ~ speed_sat:')
    #    print('Cross validation process.')
    #    df_train,df_test=mcpprocess.holdout(df_regression,0.75)
    #    mcpmodel=lrm(df_train['speed_sat'],df_train['speed_sit'])
    #    print(mcpmodel.summary2())
    df_regression = df_sector[df_sector.speed_sit > thres_v].reset_index(
        drop=True)
    #train:test=4:1
    df_train, df_test = mcpprocess.holdout(df_regression, 0.2)

    # mcpmodel_v and resid_norm (used below) are assumed to be fitted/derived elsewhere
    pre_std, Y_l, Y_u = wls_prediction_std(mcpmodel_v)

    descriptive.rl_fit(df_regression.speed_sat, df_regression.speed_sit,
                       mcpmodel_v.fittedvalues, Y_l, Y_u, sector, True, False,
                       1)
    descriptive.rl_residu(df_regression.speed_sat, mcpmodel_v.resid, sector,
                          True, False, 1)
    descriptive.rl_qqplot(resid_norm, sector, True, False, 1)
Example #18
def plot_best_model():
    plt.close('all')
    columns = ['Tout', 'Toutavg24', 'vWind', 'vWindavg24']#, 'hours', 'hours2','hours3', 'hours4','hours5', 'hours6']#, 'hours7', 'hours8']#,'hours5', 'hours6']
    X = all_data[columns]
    res = mlin_regression(y, X)
    timesteps = ens.gen_hourly_timesteps(dt.datetime(2015,12,17,1), dt.datetime(2016,1,15,0))
    
    plt.subplot(2,1,1)
    plt.plot_date(timesteps, y, 'b', label='Actual production')
    plt.plot_date(timesteps, res.fittedvalues, 'r', label='Weather model')
    prstd, iv_l, iv_u = wls_prediction_std(res)    
    plt.plot_date(timesteps, iv_u, 'r--', label='95% conf. int.')
    plt.plot_date(timesteps, iv_l, 'r--')
    mean_day_resid = [res.resid[i::24].mean() for i in range(24)]
    mean_resid_series = np.tile(mean_day_resid, 29)
    plt.plot_date(timesteps, res.fittedvalues + mean_resid_series, 'g', label='Weather model + avg daily profile')
    plt.ylabel('MW')
    plt.legend(loc=2)
    plt.subplot(2,1,2)
    plt.plot_date(timesteps, res.resid, '-', label='Residual')
    
    plt.plot_date(timesteps, mean_resid_series)
    plt.ylabel('MW')
    plt.legend()
    
    mape = np.mean(np.abs((res.fittedvalues + mean_resid_series-y)/y))
    mape2 = np.mean(np.abs((res.resid)/y))
    mae = np.mean(np.abs((res.fittedvalues + mean_resid_series-y)))
    
    print(mape, mape2, mae)

    print(res.summary())
    return res
Example #19
    def _predict(self, fit, df, **kwargs):
        """
        Return a df with predictions and confidence interval
        The df will contain the following columns:
        - 'predicted': the model output
        - 'interval_u', 'interval_l': upper and lower confidence bounds.

        Parameters
        ----------
        fit : Statsmodels fit
        df : pandas DataFrame or None (default)
            If None, use self.df
        confint : float (default=0.05)
            Confidence level for two-sided hypothesis, if given, overrides the default one.

        Returns
        -------
        df : pandas DataFrame
            same as df with additional columns 'predicted', 'interval_u' and 'interval_l'
        """

        confint = kwargs.get('confint', self.confint)

        # Add model results to data as column 'predictions'
        if 'Intercept' in fit.model.exog_names:
            df['Intercept'] = 1.0
        df['predicted'] = fit.predict(df)
        if not self.allow_negative_predictions:
            df.loc[df['predicted'] < 0, 'predicted'] = 0
        prstd, interval_l, interval_u = wls_prediction_std(
            fit, df[fit.model.exog_names], alpha=confint)
        df['interval_l'] = interval_l
        df['interval_u'] = interval_u

        return df
Example #20
    def plot_vol_surface(self, x, y, contract, put_call):
        """Plot surface using Ordinary Least Square Fit."""

        df = pd.DataFrame(columns=['x', 'y'])
        df['x'] = x
        df['y'] = y
        degree = 3

        try:
            weights = np.polyfit(x, y, degree)
            model = np.poly1d(weights)
            results = smf.ols(formula='y ~ model(x)', data=df).fit()
            prstd, iv_l, iv_u = wls_prediction_std(results)
            fig, ax = plt.subplots(figsize=(8, 6))
            plt.title(
                "Implied Vol for NG European Options = {0}, moneyness plot= log(K/F): tradeDate: {1}"
                .format(contract, self.tradeDate))
            ax.plot(x, y, 'o', label="{0} Implied Vol".format(put_call))
            ax.plot(x, results.fittedvalues, 'r--.', label="OLS")
            ax.plot(x, iv_u, 'r--')
            ax.plot(x, iv_l, 'r--')
            ax.legend(loc='best')
            plt.xlabel("Moneyness: log(K/F)")
            plt.ylabel("Implied Vol.")
            plt.axvline(0, color='k')
            plt.show()

            print(prstd)
        except ValueError:
            logging.info("ValueError!")
Example #21
def visualize_linear_regression(data,
                                fit,
                                response_name,
                                predictor_name,
                                show_std=False):
    """Create a scatter plot and linear model of the response variable (column 1) vs. predictor variable (column 2).
    
    Arguments:
        data           -- a tidy DataFrame, with response in column 1, predictor in column 2, and constant in column 3
        fit            -- a linear regression result
        response_name  -- name of the response variable
        predictor_name -- name of the predictor variable
        show_std       -- whether to show upper and lower bands around answer (default False)
    """
    ax = data.plot(
        kind='scatter',
        x=1,
        y=0,
        title=('Relationship of %s vs. %s' % (response_name, predictor_name)),
        #xlim=(axis_low, axis_high), ylim=(axis_low, axis_high)
    )
    X_new = pd.DataFrame({'predictor': ax.get_xlim()})
    X_new['const'] = 1
    preds = fit.predict(X_new)
    plt.plot(X_new['predictor'], preds, 'r-')
    if show_std:
        _, lower, upper = wls_prediction_std(fit, X_new)
        plt.plot(X_new['predictor'], lower, 'r--', X_new['predictor'], upper,
                 'r--')
    ax.set_xlabel(predictor_name)
    ax.set_ylabel(response_name)
Example #22
    def test_ci(self):
        res_wls = self.res_wls
        prstd, iv_l, iv_u = wls_prediction_std(res_wls)
        pred_res = get_prediction(res_wls)
        ci = pred_res.conf_int(obs=True)

        assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
        assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)

        sf = pred_res.summary_frame()

        col_names = [
            'mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper',
            'obs_ci_lower', 'obs_ci_upper'
        ]
        assert_equal(sf.columns.tolist(), col_names)

        pred_res2 = res_wls.get_prediction()
        ci2 = pred_res2.conf_int(obs=True)

        assert_allclose(pred_res2.se_obs, prstd, rtol=1e-13)
        assert_allclose(ci2, np.column_stack((iv_l, iv_u)), rtol=1e-13)

        sf2 = pred_res2.summary_frame()
        assert_equal(sf2.columns.tolist(), col_names)
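As the test above verifies, the sandbox helper agrees with the newer results.get_prediction API; a sketch of the modern equivalent on made-up data:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(4)
X = sm.add_constant(rng.normal(size=(40, 2)))
y = X @ [1.0, 0.5, -0.25] + rng.normal(scale=0.3, size=40)
res = sm.OLS(y, X).fit()

frame = res.get_prediction().summary_frame(alpha=0.05)
# obs_ci_lower / obs_ci_upper match the bounds wls_prediction_std returns
print(frame[['obs_ci_lower', 'obs_ci_upper']].head())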
Example #23
def visualizeModel(re, data, features, labels):
    """
    模型可视化
    """
    # 计算预测结果的标准差,预测下界,预测上界
    prstd, preLow, preUp = wls_prediction_std(re, alpha=0.05)
    # 为在Matplotlib中显示中文,设置特殊字体
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # 创建一个图形框
    fig = plt.figure(figsize=(6, 6), dpi=80)
    # 在图形框里只画一幅图
    ax = fig.add_subplot(111)
    # 在Matplotlib中显示中文,需要使用unicode
    ax.set_title(u'%s' % "线性回归统计分析示例")
    # 画点图,用蓝色圆点表示原始数据
    ax.scatter(data[features],
               data[labels],
               color='b',
               label=u'%s: $y = x + \epsilon$' % "真实值")
    # 画线图,用红色虚线表示95%置信区间
    ax.plot(data[features], preUp, "r--", label=u'%s' % "95%置信区间")
    ax.plot(data[features], re.predict(data[features]), color='r',
        label=u'%s: $y = %.3fx$'\
        % ("预测值", re.params[features]))

    ax.plot(data[features], preLow, "r--")
    legend = plt.legend(shadow=True)
    legend.get_frame().set_facecolor('#6F93AE')
    plt.show()
Example #24
    def loess_normative_model(self):
        """ Compute classical normative model."""
        if self.bins is None:
            self._create_bins()

        # format data
        data = self.data[[self.conf, self.score]].to_numpy(dtype=np.float64)

        # take the controls
        ctr_mask, _ = self._get_masks()
        ctr = data[ctr_mask]

        self.zm = np.zeros(self.bins.shape[0])  # mean
        self.zstd = np.zeros(self.bins.shape[0])  # standard deviation
        self.zci = np.zeros([self.bins.shape[0], 2])  # confidence interval

        for i, bin_center in enumerate(self.bins):
            mu = np.array(bin_center)  # bin_center value (age or conf)
            bin_mask = (abs(ctr[:, :1] - mu) < self.bin_width) * 1.
            idx = [u for (u, v) in np.argwhere(bin_mask)]

            scores = ctr[idx, 1]
            adj_conf = ctr[idx, 0] - mu  # confound relative to bin center

            # if more than 2 non NaN values do the model
            if (~np.isnan(scores)).sum() > 2:
                mod = sm.WLS(scores,
                             sm.tools.add_constant(adj_conf,
                                                   has_constant='add'),
                             missing='drop',
                             weights=bin_mask.flatten()[idx],
                             hasconst=True).fit()
                self.zm[i] = mod.params[0]  # mean

                # std and confidence intervals
                prstd, iv_l, iv_u = wls_prediction_std(mod, [0, 0])
                self.zstd[i] = prstd
                self.zci[i, :] = mod.conf_int()[0, :]  # [iv_l, iv_u]

            else:
                self.zm[i] = np.nan
                self.zci[i] = np.nan
                self.zstd[i] = np.nan

        dists = [np.abs(conf - self.bins) for conf in self.data[self.conf]]
        idx = [np.argmin(d) for d in dists]
        m = np.array([self.zm[i] for i in idx])
        std = np.array([self.zstd[i] for i in idx])
        nmodel = (self.data[self.score] - m) / std
        self.data['LOESS_pred'] = nmodel
        self.data['LOESS_residuals'] = self.data[
            self.score] - self.data['LOESS_pred']

        score = self._get_score()
        res = self.data['LOESS_residuals'].to_numpy(dtype=np.float64)
        self.SMSE_LOESS = (np.mean(res[ctr_mask]**2)**0.5) / np.std(
            score[ctr_mask])

        self._loess_rank()
Example #25
def summary_obs(res, alpha=0.05):

    from scipy import stats
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    infl = Influence(res)

    #standard error for predicted mean
    #Note: using hat_matrix only works for fitted values
    predict_mean_se = np.sqrt(infl.hat_matrix_diag*res.mse_resid)

    tppf = stats.t.isf(alpha/2., res.df_resid)
    predict_mean_ci = np.column_stack([
                        res.fittedvalues - tppf * predict_mean_se,
                        res.fittedvalues + tppf * predict_mean_se])


    #standard error for predicted observation
    predict_se, predict_ci_low, predict_ci_upp = wls_prediction_std(res)
    predict_ci = np.column_stack((predict_ci_low, predict_ci_upp))

    #standard deviation of residual
    resid_se = np.sqrt(res.mse_resid * (1 - infl.hat_matrix_diag))

    table_sm = np.column_stack([
                                  np.arange(res.nobs) + 1,
                                  res.model.endog,
                                  res.fittedvalues,
                                  predict_mean_se,
                                  predict_mean_ci[:,0],
                                  predict_mean_ci[:,1],
                                  predict_ci[:,0],
                                  predict_ci[:,1],
                                  res.resid,
                                  resid_se,
                                  infl.resid_studentized_internal,
                                  infl.cooks_distance()[0]
                                  ])


    #colnames, data = zip(*table_raw) #unzip
    data = table_sm
    ss2 = ['Obs', 'Dep Var\nPopulation', 'Predicted\nValue', 'Std Error\nMean Predict', 'Mean ci\n95% low', 'Mean ci\n95% upp', 'Predict ci\n95% low', 'Predict ci\n95% upp', 'Residual', 'Std Error\nResidual', 'Student\nResidual', "Cook's\nD"]
    colnames = ss2
    #self.table_data = data
    #data = np.column_stack(data)
    data = np.round(data,4)
    #self.table = data
    from statsmodels.iolib.table import SimpleTable, default_html_fmt
    from statsmodels.iolib.tableformatting import fmt_base
    from copy import deepcopy
    fmt = deepcopy(fmt_base)
    fmt_html = deepcopy(default_html_fmt)
    fmt['data_fmts'] = ["%4d"] + ["%6.3f"] * (data.shape[1] - 1)
    #fmt_html['data_fmts'] = fmt['data_fmts']
    st = SimpleTable(data, headers=colnames, txt_fmt=fmt,
                       html_fmt=fmt_html)

    return st, data, ss2
Example #26
def predict_y_with_model(model,
                         intercept_included: bool,
                         independent_variables: List[List[float]],
                         iv_names: List[str],
                         significance_level=0.1):
    """
    Format of independent_variables:
      x1  [[1, 2, 3, 4, 5],
      x2   [5, 4, 3, 2, 1],
      x3   [6, 6, 6, 6, 6]]
    """
    ivs = independent_variables
    assert significance_level < 0.2

    print('Predicting y with model (significance_level == {}):'.format(
        significance_level))

    model_parameters = {}
    if intercept_included:
        model_parameters['intercept'] = 1
    for i in range(len(ivs)):
        model_parameters[iv_names[i]] = ivs[i]
    print(model_parameters)
    predicted_values = model.predict(pd.DataFrame(model_parameters))

    results = []
    for i in range(len(independent_variables[0])):
        tmp = []
        for j in range(len(iv_names)):
            tmp.append(independent_variables[j][i])
        tmp.append(predicted_values[i])
        results.extend([tmp])

    results_pd = pd.DataFrame(results)
    headers = []
    for i in range(len(iv_names)):
        headers.append(iv_names[i])
    headers.append('Predicted Value')

    results_pd.columns = headers

    if intercept_included:
        exogenous_parameters = []

        for i in range(len(ivs[0])):
            temp_iv = [1]
            for j in range(len(ivs)):
                temp_iv.append(ivs[j][i])
            exogenous_parameters.append(temp_iv)
    # print(f'exogenous_parameters: {exogenous_parameters}')
        results_pd['Prediction Std'], results_pd['Lowers'], results_pd[
            'Uppers'] = wls_prediction_std(model,
                                           exog=exogenous_parameters,
                                           weights=1,
                                           alpha=significance_level)
    else:
        print(
            'Interval prediction not available if intercept is not included.')
    print(results_pd)
Example #27
 def fit(self, x, y):
     x = array(x).reshape(-1, 1)
     model = OLS(y, PolynomialFeatures(2).fit_transform(x)).fit()
     self.m = model.predict(
         PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1)))
     self.s = wls_prediction_std(
         model,
         PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1)))[0]
     return self
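The method above depends on a module-level AGES grid and sklearn's PolynomialFeatures; a self-contained sketch of the same quadratic-fit idea, with AGES as a hypothetical stand-in:

import numpy as np
from statsmodels.api import OLS
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.preprocessing import PolynomialFeatures

AGES = np.arange(18, 90, dtype=float)  # hypothetical evaluation grid

rng = np.random.default_rng(6)
x = rng.uniform(18, 90, size=200)
y = 0.01 * (x - 50) ** 2 + rng.normal(scale=2.0, size=200)

model = OLS(y, PolynomialFeatures(2).fit_transform(x.reshape(-1, 1))).fit()
grid = PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1))
m = model.predict(grid)                 # mean curve on the grid
s = wls_prediction_std(model, grid)[0]  # prediction std on the grid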
Example #28
def plot(ts, trend=True, interval=False, outliers=False, ax=None, **kwargs):
    '''
    Plot a timeseries with optional trend line, 2 standard deviation interval and outliers
    Parameters
    ----------
    ts : A DataFrame or Series with a timeseries index and all numeric columns

    trend : overlay a linear trend?

    interval : overlay a 2 standard deviation interval?

    outliers : overlay outliers?

    kwargs : arguments passed to isoutlier
    
    ax : axes to draw on (optional)
    
    Returns
    -------
    axes object
    '''

    if not ax:
        ax = gca()

    # ols won't accept a date, so create time in seconds from the first date as the independent variable
    if isinstance(ts, pd.Series):
        df = ts.to_frame()  # Unify handling of Series and DataFrame
    else:
        df = ts.copy()

    cols = df.select_dtypes(include=[np.number]).columns
    df['__Seconds'] = (ts.index - ts.index.min()).astype('timedelta64[s]')
    for col in cols:

        res = smf.ols(formula=col + ' ~ __Seconds', data=df).fit()

        # Plot this first to get the better pandas timeseries drawing of dates on x axis
        df[col].plot(ax=ax,
                     label="{} (r^2 = {:2.2})".format(col, res.rsquared)
                     if trend else col)

        if trend:
            res.fittedvalues.plot(ax=ax, style='--g', label="")
        if interval:
            prstd, iv_l, iv_u = wls_prediction_std(res)
            ax.fill_between(iv_l.index,
                            iv_l,
                            iv_u,
                            color='#888888',
                            alpha=0.25)
        if outliers:
            df_outliers = df[col][isoutlier(df[col], **kwargs)]
            if len(df_outliers) > 0:
                df_outliers.plot(ax=ax, style='r*', label="")

    return ax
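A usage sketch for plot, assuming the example's own imports (pandas as pd, numpy as np, statsmodels.formula.api as smf, matplotlib's gca, wls_prediction_std, and the isoutlier helper) are in scope; note the timedelta-to-seconds cast can behave differently across pandas versions:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

idx = pd.date_range('2020-01-01', periods=120, freq='D')
ts = pd.Series(np.linspace(0, 5, 120) + np.random.default_rng(5).normal(size=120),
               index=idx, name='y')
fig, ax0 = plt.subplots()
plot(ts, trend=True, interval=True, ax=ax0)
plt.show()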
Example #29
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int or str
        Name or index of regressor in exog matrix.
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if y_true is not None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(results)
    ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r',
            label='fitted', **kwargs)
    ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k',
            alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                    color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best')

    return fig
Example #30
 def calculate_wls_prediction_std(result):
     """
     :return:
         predstd : array_like, standard error of prediction same length as rows of exog
         iv_l : array_like, lower confidence bound
         iv_u : array_like, upper confidence bound
     """
     # predstd, iv_l, iv_u = wls_prediction_std(result)
     return wls_prediction_std(result)
Example #31
def plot_fit(res, exog_idx, exog_name='', y_true=None, ax=None, fontsize='small'):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    res : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int
        index of regressor in exog matrix
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Notes
    -----
    This is currently very simple, no options or varnames yet.

    """
    fig, ax = utils.create_mpl_ax(ax)

    if exog_name == '':
        exog_name = 'variable %d' % exog_idx

    #maybe add option for wendog, wexog
    y = res.model.endog
    x1 = res.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label='observed')
    if y_true is not None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='true')
    title = 'fitted versus regressor %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(res)
    ax.plot(x1, res.fittedvalues[x1_argsort], 'k-', label='fitted') #'k-o')
    #ax.plot(x1, iv_u, 'r--')
    #ax.plot(x1, iv_l, 'r--')
    ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, color='k')
    ax.set_title(title, fontsize=fontsize)

    return fig
Example #33
def odds_hat_l_u(self: LassoICSelector):
    Xols = self.transform_to_ols(self.X)
    yhat = self.ols.predict(self.ols_results.params, Xols)
    # from equation 5
    odds_hat = np.exp(yhat)
    # the error in yhat is
    (yhat_std, yhat_l, yhat_u) = wls_prediction_std(self.ols_results, Xols)
    oddshat_l = np.exp(yhat - 2 * yhat_std)
    oddshat_u = np.exp(yhat + 2 * yhat_std)
    return odds_hat, oddshat_l, oddshat_u
Example #34
	def plot_OLS_CI(self, model, x, y, y_true):
		prstd, iv_l, iv_u = wls_prediction_std(model)
		fig, ax = plt.subplots(figsize=(8,6))

		ax.plot(x, y, 'o', label="data")
		ax.plot(x, y_true, 'b-', label="True")
		ax.plot(x, model.fittedvalues, 'r--.', label="OLS")
		ax.plot(x, iv_u, 'r--')
		ax.plot(x, iv_l, 'r--')
		ax.legend(loc='best');
Example #35
def lm(x, y):
    "fits an OLS from statsmodels. returns tuple."
    x, y = map(_plot_friendly, [x, y])
    if _isdate(x[0]):
        x = np.array([i.toordinal() for i in x])
    df = pd.DataFrame({'x': x, 'y': y})
    df['const'] = 1.
    fit = sm.OLS(df.y, df[['x', 'const']]).fit()
    df['predicted_y'] = fit.fittedvalues
    df['predstd'], df['interval_l'], df['interval_u'] = wls_prediction_std(fit)
    return (df.predicted_y, df.interval_l, df.interval_u)
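The same fit without the module's private helpers (_plot_friendly, _isdate), written out directly as a sketch:

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

x = np.linspace(0, 1, 25)
y = 3 * x + np.random.default_rng(7).normal(scale=0.2, size=25)
df = pd.DataFrame({'x': x, 'y': y})
df['const'] = 1.0
fit = sm.OLS(df.y, df[['x', 'const']]).fit()
predstd, interval_l, interval_u = wls_prediction_std(fit)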
Example #37
    def _predict(self, fit, df):
        """
        Return a df with predictions and confidence interval

        Notes
        -----
        The df will contain the following columns:
        - 'predicted': the model output
        - 'interval_u', 'interval_l': upper and lower confidence bounds.

        The result will depend on the following attributes of self:
        confint : float (default=0.95)
            Confidence level for two-sided hypothesis
        allow_negative_predictions : bool (default=True)
            If False, correct negative predictions to zero (typically for energy consumption predictions)

        Parameters
        ----------
        fit : Statsmodels fit
        df : pandas DataFrame or None (default)
            If None, use self.df


        Returns
        -------
        df_res : pandas DataFrame
            Copy of df with additional columns 'predicted', 'interval_u' and 'interval_l'
        """

        # Add model results to data as column 'predictions'
        df_res = df.copy()
        if 'Intercept' in fit.model.exog_names:
            df_res['Intercept'] = 1.0
        df_res['predicted'] = fit.predict(df_res)
        if not self.allow_negative_predictions:
            df_res.loc[df_res['predicted'] < 0, 'predicted'] = 0

        def rename(x):
            if x == 'Intercept':
                return x
            else:
                return self.quote(x)

        prstd, interval_l, interval_u = wls_prediction_std(
            fit,
            df_res.rename(columns=rename)[fit.model.exog_names],
            alpha=1 - self.confint)
        df_res['interval_l'] = interval_l
        df_res['interval_u'] = interval_u

        if 'Intercept' in df_res:
            df_res.drop(labels=['Intercept'], axis=1, inplace=True)

        return df_res
Example #39
    def plot_locality(self,gene_list,bootstraps=10,num_windows=100,sd_thresh=2):
        '''
            Make a fancy locality plot.
        '''
        # Generate a blank fig
        fig,ax = plt.subplots(figsize=(8,6))  # axes hold by default in modern matplotlib
        # Y axis is local degree (what we are TRYING to predict)
        degree = self.locality(gene_list).sort_values('global')
        ax.set_ylim(0,max(degree['local']))
        ax.set_xlim(0,max(degree['global']))
        if bootstraps > 0:
            bs = pd.concat(
                [self.locality(
                    self.refgen.bootstrap_candidate_genes(gene_list)
                ) for x in range(bootstraps)]  # honor the bootstraps argument
            ).sort_values('global')
            ax.set_ylim(0,max(bs['local']))
            ax.set_xlim(0,max(bs['global']))
            plt.plot(bs['global'],bs['local'],'ro',alpha=0.05,label='Bootstraps')
        # Plot the bootstraps and the empirical
        plt.plot(degree['global'],degree['local'],'bo',label='Empirical')
        emp_ols = sm.OLS(degree['local'],degree['global']).fit()
        ax.plot(degree['global'],emp_ols.fittedvalues,'k:',label='Empirical OLS')

        if bootstraps > 0:
            # Get the OLS
            bs_ols = sm.OLS(bs['local'],bs['global']).fit()
            bs['resid'] = bs_ols.resid
            bs['fitted'] = bs_ols.fittedvalues
            ax.plot(bs['global'],bs_ols.fittedvalues,'g--',label='bootstrap OLS')
            # Do lowess on the residuals
            # We only care about windows within the empirical part
            window_tick = len(bs)/num_windows
            bs['window'] = [int(x/window_tick) for x in range(len(bs))]
            # get std for each window
            win_std = bs.groupby('window').apply(lambda df: df['resid'].std()).to_dict()
            bs['std_envelope'] = [win_std[x] for x in bs.window.values]
            # Plot confidence intervals
            prstd, iv_l, iv_u = wls_prediction_std(bs_ols)           
            ax.plot(bs['global'], iv_u, 'g--',label='conf int.')
            ax.plot(bs['global'], iv_l, 'g--')
            # plot the s.d. envelope
            ax.plot(
                bs['global'],bs['fitted']+(sd_thresh*bs['std_envelope']),'r--'
                ,label='{} s.d. envelope'.format(sd_thresh)
            )
            ax.plot(bs['global'],bs['fitted']-(sd_thresh*bs['std_envelope']),'r--')
        ax.set_xlabel('Number Global Interactions')
        ax.set_ylabel('Number Local Interactions')
        legend = ax.legend(loc='best')
        return plt
Example #40
 def predict(self, ID, ALPHA=0.5):
     list1 = get_data(ID)
     vector = self.vectorizer.transform([list1[0]])
     vector = self.lsa.transform(vector)
     array = np.array([list1[1:4]])**2.0 / self.sum
     array = array**0.5
     vector = np.hstack([vector, array])
     vector = del_vector(vector, self.dellist)
     
     estimated = self.results.predict(vector)
     prstdn, infa, supa = wls_prediction_std(self.results, vector, alpha=ALPHA)
     if infa[0] < 0:
         infa[0] = 0
     return estimated[0]**2.0, infa[0]**2.0, supa[0]**2.0
Example #41
 def run_ordinary_least_squares(ols_dates, ols_data, statsmodels_settings):
     """
     This method receives the dates and prices of a Quandl data-set, as well as settings for the StatsModels package;
     it then calculates the regression line and / or the confidence lines and returns the objects
     """
     intercept = np.column_stack((ols_dates, ols_dates ** statsmodels_settings.exponent))
     constant = sm.add_constant(intercept)
     statsmodel_regression = sm.OLS(ols_data, constant).fit()
     print(statsmodel_regression.summary())
     if statsmodels_settings.confidence:
         prstd, lower, upper = wls_prediction_std(statsmodel_regression)
         return statsmodel_regression, lower, upper
     else:
         return statsmodel_regression
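A usage sketch; statsmodels_settings only needs .exponent and .confidence attributes, so a SimpleNamespace with hypothetical values stands in (the function is assumed callable at module level):

import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from types import SimpleNamespace

settings = SimpleNamespace(exponent=2, confidence=True)
dates = np.arange(1.0, 101.0)
data = 0.5 * dates + 0.01 * dates ** 2 + np.random.default_rng(8).normal(size=100)
regression, lower, upper = run_ordinary_least_squares(dates, data, settings)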
Example #42
def main():
    df = pickle.load(open('OLS_data', 'rb'))  # binary mode for unpickling
    df = df.sort_values(by='Median household income')
    y = df['Tip Perc']
    X = df[['Median household income','Income2','const']]
    result = sm.OLS(y, X).fit()
    yhat = result.predict(X)
    prstd, iv_l, iv_u = wls_prediction_std(result)
    plt.scatter(X['Median household income'],y,color = 'b', alpha = 0.9)
    plt.plot(X['Median household income'],yhat, color = 'r', alpha = 0.7)
    plt.plot(X['Median household income'], iv_u, '--', color ='r',alpha = 0.7, linewidth = 0.7)
    plt.plot(X['Median household income'], iv_l, '--', color ='r', alpha = 0.7, linewidth = 0.7)
    plt.text(125000, 24.5,'$R^2$=$%.3f$' % result.rsquared, ha='center', va='center')
    plt.xlabel('Median Household Income ($)')
    plt.ylabel('Average Tip Percentage')
    plt.title('Regress Tip Percentage on Median Household Income')
    plt.show()
Example #43
def main():
    df = pickle.loads(open("OLS_data", "r").read())
    df = df.sort(columns="White")
    y = df["Tip Perc"]
    X = df[["White", "const"]]
    result = sm.OLS(y, X).fit()
    yhat = result.predict(X)
    prstd, iv_l, iv_u = wls_prediction_std(result)
    plt.scatter(X["White"], y, color="b", alpha=0.9)
    plt.plot(X["White"], yhat, color="r", alpha=0.7)
    plt.plot(X["White"], iv_u, "--", color="r", alpha=0.7, linewidth=0.7)
    plt.plot(X["White"], iv_l, "--", color="r", alpha=0.7, linewidth=0.7)
    plt.text(1.05, 25, "$R^2$=$%.3f$" % result.rsquared, ha="center", va="center")
    plt.xlabel("White Rate")
    plt.ylabel("Average Tip Percentage")
    plt.title("Regress Tip Percentage on White Rate")
    plt.show()
Example #44
def lm(x, y, alpha=ALPHA):
    "fits an OLS from statsmodels. returns tuple."
    x, y = map(plot_friendly, [x,y])
    if _isdate(x[0]):
        x = np.array([i.toordinal() for i in x])
    X = sm.add_constant(x)
    fit = sm.OLS(y, X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(fit)
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    df = pd.DataFrame(summary_values, columns=map(snakify, summary_names))
    fittedvalues        = df['predicted_value']
    predict_mean_se     = df['std_error_mean_predict']
    predict_mean_ci_low = df['mean_ci_95%_low']
    predict_mean_ci_upp = df['mean_ci_95%_upp']
    predict_ci_low      = df['predict_ci_95%_low']
    predict_ci_upp      = df['predict_ci_95%_upp']
    return (fittedvalues, predict_mean_ci_low, predict_mean_ci_upp)
Example #45
def returnOutliers(results, x, y, alpha=0.05):
    o_x = []
    o_y = []

    #print results.cov_params().shape[0]
    exog = results.model.exog
    #print exog.shape
    #print x.shape[0]
    # the first return value is the prediction standard error, not predicted y
    prstd, iv_l, iv_u = wls_prediction_std(results, exog=x, weights=None, alpha=alpha)

    i = 0
    for val in y:
        if (val > iv_u[i] or val < iv_l[i]):
            o_x.append(x[i][1])
            o_y.append(val)
        i += 1

    return o_x, o_y
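A sketch of how returnOutliers might be driven: fit an OLS with a constant and pass the design matrix, so each row of x lines up with y (synthetic data; o_x assumes the regressor sits in column 1 after the constant):

import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

rng = np.random.default_rng(9)
x1 = rng.uniform(0, 10, 100)
X = sm.add_constant(x1)  # column 0: const, column 1: x1
y = 1.0 + 0.8 * x1 + rng.normal(scale=0.5, size=100)
y[::25] += 5             # plant a few obvious outliers
results = sm.OLS(y, X).fit()
o_x, o_y = returnOutliers(results, X, y, alpha=0.05)
print(len(o_x), 'points outside the 95% prediction band')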
Example #46
    def test_ci(self):
        res_wls = self.res_wls
        prstd, iv_l, iv_u = wls_prediction_std(res_wls)
        pred_res = get_prediction(res_wls)
        ci = pred_res.conf_int(obs=True)

        assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
        assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)

        sf = pred_res.summary_frame()

        col_names = ['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper',
                      'obs_ci_lower', 'obs_ci_upper']
        assert_equal(sf.columns.tolist(), col_names)

        pred_res2 = res_wls.get_prediction()
        ci2 = pred_res2.conf_int(obs=True)

        assert_allclose(pred_res2.se_obs, prstd, rtol=1e-13)
        assert_allclose(ci2, np.column_stack((iv_l, iv_u)), rtol=1e-13)

        sf2 = pred_res2.summary_frame()
        assert_equal(sf2.columns.tolist(), col_names)

        # check that list works, issue 4437
        x = res_wls.model.exog.mean(0)
        pred_res3 = res_wls.get_prediction(x)
        ci3 = pred_res3.conf_int(obs=True)
        pred_res3b = res_wls.get_prediction(x.tolist())
        ci3b = pred_res3b.conf_int(obs=True)
        assert_allclose(pred_res3b.se_obs, pred_res3.se_obs, rtol=1e-13)
        assert_allclose(ci3b, ci3, rtol=1e-13)
        res_df = pred_res3b.summary_frame()
        assert_equal(res_df.index.values, [0])

        x = res_wls.model.exog[-2:]
        pred_res3 = res_wls.get_prediction(x)
        ci3 = pred_res3.conf_int(obs=True)
        pred_res3b = res_wls.get_prediction(x.tolist())
        ci3b = pred_res3b.conf_int(obs=True)
        assert_allclose(pred_res3b.se_obs, pred_res3.se_obs, rtol=1e-13)
        assert_allclose(ci3b, ci3, rtol=1e-13)
        res_df = pred_res3b.summary_frame()
        assert_equal(res_df.index.values, [0, 1])
Example #47
def test_pred_interval(show_plot=False):
    from ml_ext import examples
    (coefs,df)=examples.gen_simplemodel_data(n=50,k=3)
    df.sort_values('X1', inplace=True)
    lr=LinModel()
    X=df[df.columns[df.columns!='y']]
    y=df.y


    lr.fit(X=X,y=y)
    lr.summary()
    df_ci=lr.get_confidence_interval_for_mean(X)
    df_pi=lr.get_prediction_interval(X)

    #Now use statsmodels to compare
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    import statsmodels.api as sm
    re = sm.OLS(y, X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(re)

    if show_plot:
        (fig,ax)=plt.subplots(nrows=2,ncols=1,figsize=[14,12])

        cols=sns.color_palette('husl',n_colors=4)
        ax[0].scatter(X.X1,y,label='y',color=cols[3],alpha=0.4)
        
        ax[0].plot(X.X1,df_pi['upper_pred'],label='pred',color=cols[1],alpha=0.5)
        ax[0].plot(X.X1,df_pi['lower_pred'],color=cols[1],alpha=0.5)
        ax[0].plot(X.X1,df_ci['upper_mean'],color=cols[2],alpha=0.5)
        ax[0].plot(X.X1,df_ci['lower_mean'],label='mean_ci',color=cols[2],alpha=0.5)
        ax[0].scatter(X.X1,df_pi['y_hat'],label='y_hat',color=cols[0],alpha=0.5)
        ax[0].legend(loc='best')

        ax[1].scatter(X.X1,y,label='y',color=cols[3],alpha=0.4)
        ax[1].scatter(X.X1,df_ci['y_hat'],label='y_hat',color=cols[0],alpha=0.5)
        ax[1].plot(X.X1,iv_u,label='wls',color=cols[1],alpha=0.5)
        ax[1].plot(X.X1,iv_l,color=cols[1],alpha=0.5)
        ax[1].legend(loc='best')

    # get the difference between the upper bounds and check they agree to within 0.1%
    overall_diff = 100 * numpy.sum(iv_u - df_pi['upper_pred']) / numpy.sum(iv_u)
    logging.debug("Overall % difference in prediction ranges for upper bound: {}".format(overall_diff))
    assert overall_diff < 0.1
Example #48
def lm(x, y, alpha=ALPHA):
    "fits an OLS from statsmodels. returns tuple."
    x_is_date = _isdate(x.iloc[0])
    if x_is_date:
        x = np.array([i.toordinal() for i in x])
    X = sm.add_constant(x)
    fit = sm.OLS(y, X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(fit)
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    df = pd.DataFrame(summary_values, columns=map(_snakify, summary_names))
    # TODO: indexing w/ data frame is messing everything up
    fittedvalues        = df['predicted_value'].values
    predict_mean_ci_low = df['mean_ci_95%_low'].values
    predict_mean_ci_upp = df['mean_ci_95%_upp'].values
    predict_ci_low      = df['predict_ci_95%_low'].values
    predict_ci_upp      = df['predict_ci_95%_upp'].values

    if x_is_date:
        x = [Timestamp.fromordinal(int(i)) for i in x]
    return (x, fittedvalues, predict_mean_ci_low, predict_mean_ci_upp)
Example #49
def test_nonlinear():
    np.random.seed(111)
    
    n_sample = 50
    max_val = 30
    sig = 0.5

    x = np.linspace(0, max_val, n_sample)
    X = np.c_[x, np.sin(x), (x - 5)**2, np.ones(n_sample)]
    beta = np.array([0.5, 0.5, -0.02, 5.0])
    e = np.random.normal(size=n_sample) 

    #X = sm.add_constant(X, prepend=False)
    y_true = np.dot(X, beta)
    y = y_true + sig * e

    for i in range(5):
        print('%3d: %s %s' % (i, X[i, :], y[i]))

    print()
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
    print()
    print(results.params)
    print(results.rsquared)
    print(results.bse)
    print(results.predict())

    
    plt.figure()
    plt.plot(x, y, 'o', x, y_true, 'b-')
    prstd, iv_l, iv_u = wls_prediction_std(results)
    plt.plot(x, results.fittedvalues, 'r--.')
    plt.plot(x, iv_u, 'r--')
    plt.plot(x, iv_l, 'r--')
    plt.title('blue: true,   red: OLS')
    plt.show()
Example #50
    def predict(self, ID, ALPHA=0.5):
        list1 = get_data(ID)
        vector = self.vectorizer.transform([list1[0]])
        vector = self.lsa.transform(vector)
        array = np.array([list1[1:4]])**2.0 / self.sum
        array = array**0.5
        vector = np.hstack([vector, array])
        length = vector.shape[1]

        # earlier variant kept squared terms only:
        # for i in range(length):
        #     vector = np.hstack([vector, np.array([[vector[0][i]**2]])])

        # pairwise products of the features (quadratic expansion)
        for i in range(length):
            for j in range(i, length):
                tmp = np.array([[vector[0][i] * vector[0][j]]])
                vector = np.hstack([vector, tmp])
        vector = del_vector(vector, self.dellist)

        estimated = self.results.predict(vector)
        prstdn, infa, supa = wls_prediction_std(self.results, vector, alpha=ALPHA)

        # the target was apparently fitted on a square-root scale, so square
        # back; clamp a negative lower bound to zero first
        if infa[0] < 0:
            infa[0] = 0
        return estimated[0]**2.0, infa[0]**2.0, supa[0]**2.0
Example #52
def linear(data):
	# Regression
	x = np.arange(len(data)).reshape(-1, 1)
	y = np.asarray(data[headers[1]]).reshape(-1, 1)
	x_line = np.column_stack((x, x**2))  # add a quadratic trend term
	x_cons = sm.add_constant(x_line)  # and an intercept
	model = sm.OLS(y, x_cons)
	results = model.fit()
	print (results.summary())
	print ('Coefficients: ', results.params) # Save to pdf
	print ('Standard errors: ', results.bse)
	print ('R2: ', results.rsquared)
	# Plot
	prstd, iv_l, iv_u = wls_prediction_std(results)
	fig, ax = plt.subplots()
	title = "Linear Regression: " + str(headers[1])
	plt.title(title)
	ax.spines["top"].set_visible(False)
	ax.spines["right"].set_visible(False)
	ax.get_xaxis().tick_bottom()    
	ax.get_yaxis().tick_left()  
	plt.tick_params(axis="both", which="both", bottom=True, top=False,
	                labelbottom=True, left=False, right=False, labelleft=True)
	ax.plot(x, y, label="data")
	ax.plot(x, results.fittedvalues, 'r--.', label="OLS")
	ax.plot(x, iv_u, 'c--')
	ax.plot(x, iv_l, 'c--')
	ax.legend(loc='best')
	plt.savefig('linear.png', bbox_inches="tight")
Example #53
# Fit and summary:

res = sm.OLS(y, X).fit()
print(res.summary())


# Extract other quantities of interest:

print('Parameters: ', res.params)
print('Standard errors: ', res.bse)
print('Predicted values: ', res.predict())


# Draw a plot to compare the true relationship to OLS predictions. Confidence intervals around the predictions are built using the ``wls_prediction_std`` command.

prstd, iv_l, iv_u = wls_prediction_std(res)

fig, ax = plt.subplots()

ax.plot(x, y, 'o', label="data")
ax.plot(x, y_true, 'b-', label="True")
ax.plot(x, res.fittedvalues, 'r--.', label="OLS")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.legend(loc='best');


# ## OLS with dummy variables
# 
# We generate some artificial data. There are 3 groups which will be modelled using dummy variables. Group 0 is the omitted/benchmark category.
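# A sketch of the setup this describes (the notebook's own code is not shown
# here, so this is an approximation): 3 groups encoded as dummy columns, with
# group 0 dropped as the benchmark. Assumes numpy as np and statsmodels.api
# as sm, as above.

nsample = 50
groups = np.zeros(nsample, int)
groups[20:40] = 1
groups[40:] = 2
dummy = (groups[:, None] == np.unique(groups)).astype(float)
x = np.linspace(0, 20, nsample)
X = np.column_stack((x, dummy[:, 1:]))  # drop group 0, the omitted category
X = sm.add_constant(X, prepend=False)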
Example #54
se = np.round(se,4)
colnames = ['x1', 'const']
rownames = ['WLS', 'OLS', 'OLS_HC0', 'OLS_HC1', 'OLS_HC2', 'OLS_HC3']
tabl = SimpleTable(se, colnames, rownames, txt_fmt=default_txt_fmt)
print(tabl)


# Calculate OLS prediction interval:

covb = res_ols.cov_params()
prediction_var = res_ols.mse_resid + (X * np.dot(covb,X.T).T).sum(1)
prediction_std = np.sqrt(prediction_var)
tppf = stats.t.ppf(0.975, res_ols.df_resid)


prstd_ols, iv_l_ols, iv_u_ols = wls_prediction_std(res_ols)
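# Sanity check (an added sketch, not in the original): for an unweighted OLS
# fit the manual interval above should reproduce wls_prediction_std exactly.
iv_u_manual = res_ols.fittedvalues + tppf * prediction_std
iv_l_manual = res_ols.fittedvalues - tppf * prediction_std
assert np.allclose(iv_u_manual, iv_u_ols)
assert np.allclose(iv_l_manual, iv_l_ols)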


# Draw a plot to compare predicted values in WLS and OLS:

prstd, iv_l, iv_u = wls_prediction_std(res_wls)

fig, ax = plt.subplots()
ax.plot(x, y, 'o', label="Data")
ax.plot(x, y_true, 'b-', label="True")
# OLS
ax.plot(x, res_ols.fittedvalues, 'r--')
ax.plot(x, iv_u_ols, 'r--', label="OLS")
ax.plot(x, iv_l_ols, 'r--')
# WLS
ax.plot(x, res_wls.fittedvalues, 'g--.')
Example #55
def summary_table(res, alpha=0.05):
    '''generate summary table of outlier and influence measures, similar to SAS

    Parameters
    ----------
    res : regression results instance
       results of a fitted OLS or WLS model
    alpha : float
       significance level for confidence intervals

    Returns
    -------
    st : SimpleTable instance
       table with results that can be printed
    data : ndarray
       calculated measures and statistics for the table
    ss2 : list of strings
       column_names for table (Note: rows of table are observations)

    '''

    from scipy import stats
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    infl = OLSInfluence(res)

    #standard error for predicted mean
    #Note: using hat_matrix only works for fitted values
    predict_mean_se = np.sqrt(infl.hat_matrix_diag*res.mse_resid)

    tppf = stats.t.isf(alpha/2., res.df_resid)
    predict_mean_ci = np.column_stack([
                        res.fittedvalues - tppf * predict_mean_se,
                        res.fittedvalues + tppf * predict_mean_se])


    #standard error for predicted observation
    predict_se, predict_ci_low, predict_ci_upp = wls_prediction_std(res)
    predict_ci = np.column_stack((predict_ci_low, predict_ci_upp))

    #standard deviation of residual
    resid_se = np.sqrt(res.mse_resid * (1 - infl.hat_matrix_diag))

    table_sm = np.column_stack([
                                  np.arange(res.nobs) + 1,
                                  res.model.endog,
                                  res.fittedvalues,
                                  predict_mean_se,
                                  predict_mean_ci[:,0],
                                  predict_mean_ci[:,1],
                                  predict_ci[:,0],
                                  predict_ci[:,1],
                                  res.resid,
                                  resid_se,
                                  infl.resid_studentized_internal,
                                  infl.cooks_distance[0]
                                  ])


    #colnames, data = zip(*table_raw) #unzip
    data = table_sm
    ss2 = ['Obs', 'Dep Var\nPopulation', 'Predicted\nValue',
           'Std Error\nMean Predict', 'Mean ci\n95% low', 'Mean ci\n95% upp',
           'Predict ci\n95% low', 'Predict ci\n95% upp', 'Residual',
           'Std Error\nResidual', 'Student\nResidual', "Cook's\nD"]
    colnames = ss2
    #self.table_data = data
    #data = np.column_stack(data)
    from statsmodels.iolib.table import SimpleTable, default_html_fmt
    from statsmodels.iolib.tableformatting import fmt_base
    from copy import deepcopy
    fmt = deepcopy(fmt_base)
    fmt_html = deepcopy(default_html_fmt)
    fmt['data_fmts'] = ["%4d"] + ["%6.3f"] * (data.shape[1] - 1)
    #fmt_html['data_fmts'] = fmt['data_fmts']
    st = SimpleTable(data, headers=colnames, txt_fmt=fmt,
                       html_fmt=fmt_html)

    return st, data, ss2
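# Usage sketch (not from the original source): fit a small synthetic model,
# then print the SAS-style table. Assumes numpy as np, statsmodels.api as sm,
# and that OLSInfluence is importable in the module defining summary_table.
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(42)
exog = sm.add_constant(rng.normal(size=(30, 2)))
endog = exog @ np.array([1.0, 0.5, -0.3]) + rng.normal(size=30)
st, data, ss2 = summary_table(sm.OLS(endog, exog).fit(), alpha=0.05)
print(st)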
Example #56
__author__ = 'Yas'

import numpy as np

import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std


np.random.seed(1024)
X= [1,2,3,4,5,6,7]
#X = range(1,8)
Y = [1,7,3,20,5,6,2]
#X = sm.add_constant(X)
wls_model = sm.WLS(Y, X, weights=[0.1,0.1,0.1,0.0,0.1,0.1,0.1])  # the zero weight drops the outlier (X=4, Y=20)
res_wls = wls_model.fit()
print(res_wls.params)
print(res_wls.tvalues)

#print(results.t_test([1, 0]))
plt.plot(X, res_wls.fittedvalues, 'g--');
prstd, iv_l, iv_u = wls_prediction_std(res_wls)
#print(results.f_test([0, 1]))
plt.ylim(-50,50)
plt.xlim(0,30)
plt.plot(X,Y, 'o')
plt.plot(X, iv_u, 'b--');
plt.plot(X, iv_l, 'r--');
#plt.plot(X,res_wls.f, '.')
plt.show()
Example #57
#Read data
filename = 'griliches.dta'
df = rd_stata(filename)  # rd_stata: a local helper, presumably wrapping pandas.read_stata

#=========== Least squares regression
#=====================
# lw vs. multiple covariates
#=====================
x= df[['rns','mrt','smsa','med','iq','kww','age','s','expr']]
y = df.lw
X = sm.add_constant(x)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

#plot the results
plt.figure();
plt.plot(x, y, 'o');
prstd, iv_l, iv_u = wls_prediction_std(results)
plt.plot(x, results.fittedvalues, 'g--.')
plt.plot(x, iv_u, 'r--')
plt.plot(x, iv_l, 'r--')
plt.xlabel('multivariates')
plt.ylabel('Log Wage')
plt.title('Multivariates');
plt.savefig('1-multivariates.png')
plt.show()

Example #58
def regression_and_scatter(df_x_path, x_name, y_names,
	df_y_path=None,
	roi_normalize=True,
	confidence_intervals=False,
	prediction_intervals=False,
	animals=None,
	):

	df = pd.read_csv(df_x_path, index_col=0)

	if df_y_path:
		dfy = pd.read_csv(df_y_path, index_col=0)
		df = pd.concat([df, dfy], axis=1)

	if animals:
		df = df.loc[animals]

	if roi_normalize:
		# x_cols is presumably a module-level list of the ROI columns
		df[x_cols] = df[x_cols].apply(lambda x: (x / x.mean()))

	fig, ax = plt.subplots()
	ax.set_xmargin(0.1)
	ax.set_ymargin(0.11)

	for ix, y_name in enumerate(y_names):
		x = df[[x_name]].values
		y = df[[y_name]].values

		x_ = sm.add_constant(x) # constant intercept term
		model = sm.OLS(y, x_)
		fitted = model.fit()
		x_pred = np.linspace(x.min(), x.max(), 50)
		x_pred2 = sm.add_constant(x_pred)
		y_pred = fitted.predict(x_pred2)

		y_hat = fitted.predict(x_)
		y_err = y - y_hat
		mean_x = x.mean()
		n = len(x)
		dof = n - fitted.df_model - 1
		t = stats.t.ppf(0.975, df=dof)  # two-sided 95%, consistent with the 95% prediction band below
		s_err = np.sum(np.power(y_err, 2))

		if confidence_intervals:
			conf = t * np.sqrt((s_err/(n-2))*(1.0/n + (np.power((x_pred-mean_x),2) / ((np.sum(np.power(x_pred,2))) - n*(np.power(mean_x,2))))))
			upper_conf = y_pred + abs(conf)
			lower_conf = y_pred - abs(conf)
			ax.fill_between(x_pred, lower_conf, upper_conf, color=qualitative_colorset[ix], alpha=0.3)

		if prediction_intervals:
			sdev_pred, lower_pred, upper_pred = wls_prediction_std(fitted, exog=x_pred2, alpha=0.05)
			ax.fill_between(x_pred, lower_pred, upper_pred, color=qualitative_colorset[ix], alpha=0.08)

		data_points = ax.plot(x,y,'o',color=qualitative_colorset[ix],markeredgecolor=qualitative_colorset[ix])
		ax.tick_params(axis="both", which="both", bottom=False, top=False, length=0)
		ax.plot(x_pred, y_pred, '-', color=qualitative_colorset[ix], linewidth=2, label=y_name)
	plt.legend(loc="best")
Example #59
def first_ens_prod_fig():
    """ This plot is based on a production model taking into account:
        Tout, vWind and the production 24 hours before
        
        """
        
    plt.close('all')
    cols = ['Tout', 'vWind', 'prod24h_before']
        
    ts1 = ens.gen_hourly_timesteps(dt.datetime(2015,12,17,1), dt.datetime(2016,1,15,0))
    ts2 = ens.gen_hourly_timesteps(dt.datetime(2016,1,20,1), dt.datetime(2016,1,28,0))
    
    #load the data
    fit_data = ens.repack_ens_mean_as_df()
    fit_data['prod24h_before'] = sq.fetch_production(dt.datetime(2015,12,16,1), dt.datetime(2016,1,14,0))

    vali_data = ens.repack_ens_mean_as_df(dt.datetime(2016,1,20,1), dt.datetime(2016,1,28,0))
    vali_data['prod24h_before'] = sq.fetch_production(dt.datetime(2016,1,19,1), dt.datetime(2016,1,27,0))   
    
 
    # do the fit
    X = fit_data[cols]
    y = fit_data['prod']
    res = mlin_regression(y, X, add_const=True)    
    
    fig, [ax1, ax2] = plt.subplots(2,1, figsize=(40,20))
    
    # load ensemble data
    ens_data1 = ens.load_ens_timeseries_as_df(ts_start=ts1[0], ts_end=ts1[-1])
    ens_data1['prod24h_before'] = fit_data['prod24h_before']    
    ens_data2 = ens.load_ens_timeseries_as_df(ts_start=ts2[0], ts_end=ts2[-1])
    ens_data2['prod24h_before'] = vali_data['prod24h_before']
    
    all_ens_data = pd.concat([ens_data1, ens_data2])
    all_ts = ts1 + ts2    
    
    
    # calculate production for each ensemble member
    ens_prods = np.zeros((len(all_ts), 25))
    for i in range(25):
        ens_cols = ['Tout' + str(i), 'vWind' + str(i), 'prod24h_before']
        ens_params = pd.Series({'Tout' + str(i):res.params['Tout'],
                                'vWind' + str(i):res.params['vWind'],
                                'const':res.params['const'],
                                'prod24h_before':res.params['prod24h_before']})
        ens_prods[:,i] = linear_map(all_ens_data, ens_params, ens_cols)    
    
    
       
    # calculate the combined confidence interval: model and ensemble
    # uncertainties are treated as independent and added in quadrature
    # (1.9599 is the two-sided 95% normal quantile)
    prstd, iv_l, iv_u = wls_prediction_std(res)
    mean_conf_int_spread = np.mean(res.fittedvalues - iv_l)
    model_std = np.concatenate([prstd, (1./1.9599)*mean_conf_int_spread*np.ones(len(ts2))])
    ens_std = ens_prods.std(axis=1)
    combined_std = np.sqrt(model_std**2 + ens_std**2)
    all_prod_model = np.concatenate([res.fittedvalues, linear_map(vali_data, res.params, cols)])
    combined_ub95 = all_prod_model + 1.9599*combined_std
    combined_lb95 = all_prod_model - 1.9599*combined_std
    
    # plot confint
    ax1.fill_between(all_ts, combined_lb95, combined_ub95, label='Combined 95% conf. int.')
    ax1.fill_between(all_ts, all_prod_model - 1.9599*ens_std, all_prod_model + 1.9599*ens_std, facecolor='grey', label='Ensemble 95% conf. int.')
    
    # plot ensempble models    
    ax1.plot_date(all_ts, ens_prods, '-', lw=0.5)    
    
    ax1.plot_date(ts1, y, 'k-', lw=2, label='Actual production')
    ax1.plot_date(ts1, res.fittedvalues,'r-', lw=2, label='Model on ensemble mean')
         
    ax1.plot_date(ts2, vali_data['prod'], 'k-', lw=2, label='')
    ax1.plot_date(ts2, linear_map(vali_data, res.params, cols), 'r-', lw=2)
    ax1.set_ylabel('[MW]')
    ax1.legend(loc=2)
    
    vali_resid = linear_map(vali_data, res.params, cols) - vali_data['prod']
    ax2.plot_date(ts1, res.resid, '-', label='Residual, fitted data')
    ax2.plot_date(ts2, vali_resid, '-', label='Residual, validation data')
    ax2.set_ylabel('[MW]')
    ax2.legend(loc=2)
    print "MAE = " + str(mae(vali_resid))
    print "MAPE = " + str(mape(vali_resid, vali_data['prod']))
    print "RMSE = " + str(rmse(vali_resid))
    print "ME = " + str(np.mean(vali_resid))
    
    print "MAE (fit) = " + str(mae(res.resid))
    print "MAPE (fit) = " + str(mape(res.resid, fit_data['prod']))
    print "RMSE (fit)= " + str(rmse(res.resid))
    print "ME (fit)= " + str(np.mean(res.resid))

    plt.savefig('figures/ens_prod_models.pdf', dpi=600) 
    plt.figure()
    plt.plot_date(all_ts, ens_std)
    plt.ylabel('Std. of ensemble production models [MW]')
    plt.savefig('figures/std_ens_prod_models.pdf', dpi=600) 
    
    
    sns.jointplot(x=ens_std, y=np.concatenate([res.resid, vali_resid]))
   
        
    return res, all_ens_data, all_ts, fit_data['prod'], vali_data['prod']