Exemplo n.º 1
0
def return_qqplot(data):
    """Draw a normal Q-Q plot of the ``'returns'`` entry of *data*.

    The plot is drawn on a new 9x5 inch figure with a grid and axis labels.
    The figure is neither shown nor saved here; that is left to the caller.

    Parameters
    ----------
    data : mapping or DataFrame
        Object indexable with ``'returns'``; that entry is handed to
        ``sm.qqplot`` and compared against a standardized reference line
        (``line='s'``).
    """
    # sm.qqplot opens a figure of its own unless it is given an axes, which
    # previously left the sized figure created here empty. Create the axes
    # first and plot into them so the requested figsize is actually used.
    _, ax = plt.subplots(figsize=(9, 5))
    sm.qqplot(data['returns'], line='s', ax=ax)
    ax.grid(True)
    ax.set_xlabel('theoretical quantiles')
    ax.set_ylabel('sample quantiles')
Exemplo n.º 2
0
def plot_single_peak(peak, ff = False, num_bins = 50, qq = scipy.stats.norm):
    """Plot the histogram of one peak and a Q-Q plot against distribution *qq*.

    Prefer ``plot_simlist`` unless exactly one peak is to be inspected or a
    histogram view is wanted.

    Parameters
    ----------
    peak : object or str
        Object exposing ``times`` and ``params``; when *ff* is true, the path
        of a pickle file containing such an object.
    ff : bool
        "From file": load *peak* with ``pickle`` before plotting.
    num_bins : int
        Number of histogram bins.
    qq : scipy.stats distribution
        Reference distribution of the Q-Q plot. Only ``scipy.stats.norm`` and
        ``scipy.stats.invgauss`` are handled; anything else prints a notice.
    """
    data = peak
    if ff:
        # NOTE(security): pickle.load can execute arbitrary code; only open
        # files from a trusted source.
        with open(peak, 'rb') as daten:
            data = pickle.load(daten)

    # Normalised histogram of the raw times.
    # NOTE(review): `normed` was replaced by `density` in newer Matplotlib
    # releases - confirm against the Matplotlib version in use.
    plt.hist(data.times, num_bins, normed=1, alpha=0.5)
    plt.suptitle("params:" + str(data.params))

    # Q-Q plot; sm.qqplot opens a new figure, so the suptitle calls below
    # label that figure, not the histogram.
    if qq == scipy.stats.invgauss:
        x = np.arange(1, 250, 0.5)
        mu, loc, scale = scipy.stats.invgauss.fit(data.times)
        logging.log(logging.INFO, "ig-paramss, %s, %s, %s", str(mu), str(loc), str(scale))
        # Overlay the fitted density on the histogram (still the current axes).
        plt.plot(x, scipy.stats.invgauss.pdf(x, mu, loc, scale))
        logging.log(logging.INFO, 'skew, %s', str(scipy.stats.skew(data.times)))
        sm.qqplot(np.array(data.times), qq, distargs=(mu,), line='r')
        # The two branch titles used to be swapped.
        plt.suptitle("params:" + str(data.params) + " qq-Plot mit Inverser Gauss Verteilung: ")
    elif qq == scipy.stats.norm:
        sm.qqplot(np.array(data.times), qq, line='r')
        plt.suptitle("params:" + str(data.params) + " qq-Plot mit Normalverteilung")
    else:
        print("not yet implemented, distribution:", qq)
    plt.show()
Exemplo n.º 3
0
def hist(request, sym):
    """Render distribution diagnostics for one symbol as a PNG HTTP response.

    The 2x3 figure holds: histogram with KDE overlay, box plot, KDE, Q-Q
    plot, 12-period rolling mean/std, and the cumulative sum of the series.

    Parameters
    ----------
    request
        Incoming request object; not inspected by this function.
    sym
        Key used to load the data and to select the plotted column.

    Returns
    -------
    HttpResponse
        Response with ``content_type='image/png'`` containing the figure.

    Notes
    -----
    Reads the module-level name ``start`` for the data start date.
    """
    data = Data(syms=[sym], start=start)

    r = data.panel.r.copy()
    r = r.dropna()

    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 7))
    try:
        ax = axes[0, 0]
        ax.hist(r[sym].values, bins=30)
        r.plot(kind="kde", ax=ax, grid=True)
        r.boxplot(ax=axes[0, 1], grid=True)
        r.plot(kind="kde", ax=axes[1, 0], grid=True)
        sm.qqplot(r[sym], line='r', fit=True, ax=axes[1, 1])

        # pandas.rolling_mean / rolling_std were removed from pandas; the
        # .rolling() accessor computes the same 12-period statistics.
        window = r[sym].rolling(12)
        r['mean'] = window.mean()
        r['std'] = window.std()
        r['cum_ret'] = r[sym].cumsum()
        r[['mean', 'std']].plot(ax=axes[0, 2], grid=True, rot=45)

        r[['cum_ret']].plot(ax=axes[1, 2], grid=True, rot=45)

        fig.tight_layout()

        fig.set_facecolor((1, .8, .6, 0))
        canvas = FigureCanvas(fig)
        response = HttpResponse(content_type='image/png')
        canvas.print_png(response)
    finally:
        # plt.subplots registers the figure with pyplot; without closing it
        # every request would leave one more figure alive in the process.
        plt.close(fig)
    return response
Exemplo n.º 4
0
def plotFit(fit):
  """Draw a 2x2 panel of regression diagnostics, similar to R's plot(fit).

  Panels: residuals vs fitted, Q-Q plot of the residuals, standardised
  residuals vs fitted, and an influence plot (Cook's criterion). The figure
  is displayed with ``plt.show()``.
  """
  residuals = fit.resid
  center = residuals.mean()
  spread = residuals.std(axis=0)

  # Standardise each residual with the sample mean and standard deviation.
  standardized = residuals.apply(lambda value: (value - center) / spread)

  figure, grid = plt.subplots(2, 2, sharex='col', sharey='row')
  (top_left, top_right), (bottom_left, bottom_right) = grid

  # Residuals against fitted values.
  top_left.scatter(fit.fittedvalues, fit.resid)
  top_left.set_xlabel('Fitted Values')
  top_left.set_ylabel('Residuals')
  top_left.set_title('Residuals vs Fitted')

  # Q-Q plot of the raw residuals.
  sm.qqplot(fit.resid, ax=top_right)
  top_right.set_title('QQ plot')

  # Standardised residuals against fitted values.
  bottom_left.scatter(fit.fittedvalues, standardized)
  bottom_left.set_xlabel('Fitted Values')
  bottom_left.set_ylabel('Standardized Residuals')
  bottom_left.set_title('Scale-Location')

  # Influence plot of the fit.
  sm.graphics.influence_plot(fit, ax=bottom_right, criterion="cooks")

  plt.show()
Exemplo n.º 5
0
def plot (sim_liste, histogram_separate, histogram_spec, qq_Plot, fit_qq_Plot, num_bins = 50, vergleich= scipy.stats.invgauss):
    """Plot histograms and Q-Q plots for a list of simulations.

    sim_liste          -- sequence of simulation objects; the code reads
                          ``times``, ``params``, ``mu`` and, for the title,
                          ``length`` and ``number``.
    histogram_spec     -- if true, overlay normalised histograms of all
                          simulations in one figure.
    histogram_separate -- if true, add a plain histogram panel per simulation.
    qq_Plot            -- if true, add a Q-Q panel against scipy.stats.norm.
    fit_qq_Plot        -- if true, add a Q-Q panel against ``vergleich`` with
                          ``distargs=(sim.mu,)``.
    num_bins           -- number of histogram bins.
    vergleich          -- reference distribution for the last Q-Q panel.

    Prints progress messages and elapsed time, then calls ``plt.show()``.
    """
    startzeit = time.clock()   
    if histogram_spec:
        print "Erstelle Spektrum"
        fig, ax = plt.subplots()
        # NOTE(review): length is read from element 0 but number from element 1;
        # a one-element list raises IndexError here - presumably [0] was meant.
        fig.suptitle("Laenge: "+str(sim_liste[0].length)+" Anz Teilchen: " +str(sim_liste[1].number)) #TODO: assumes all simulations share the same conditions
        for sim in sim_liste:
            ax.hist(sim.times, num_bins, alpha=0.5, normed = 1, label = str(sim.params) )
       # plt.show()  
        legend = ax.legend(loc='upper right', shadow=True)

    
    # One output window per simulation with a separate histogram / Q-Q plot with chosen params / Q-Q plot with automatic fit
    # The three flags are summed to get the number of panels per figure.
    number_stats = sum([histogram_separate, qq_Plot, fit_qq_Plot])
    print number_stats
    if histogram_separate or qq_Plot or fit_qq_Plot:
	print "Erstelle separate Dinge"
	for sim in sim_liste:
	    fig = plt.figure(figsize=(4*number_stats, 4))
            gs1 = gridspec.GridSpec(1, number_stats)
            ax_list = [fig.add_subplot(ss) for ss in gs1]
           
	    # akt is the index of the next free panel in ax_list.
	    akt = 0
	    fig.suptitle("ps, pm"+str(sim.params)+str(round(sim.params[0]-sim.params[1],5)), size = 15)
	    if histogram_separate:
		ax_list[akt].hist(sim.times, num_bins)
		ax_list[akt].set_title("Histogramm")
                akt+=1
                
            #print "hist sep", time.clock()-startzeit
	    if qq_Plot:
                sm.qqplot (np.array(sim.times), scipy.stats.norm,  line = 'r', ax=ax_list[akt])
		ax_list[akt].set_title("qq-Plot; norm!! Params: 0.05")
                akt+=1
            #print 'qq 0.05', time.clock()-startzeit
	    if fit_qq_Plot:
		                
                #mu, loc, scale = scipy.stats.invgauss.fit(sim.times)
                #mean, var = scipy.stats.invgauss.stats(mu, loc, scale, moments='mv')
                #print  "params", sim.params, '(mu, loc, scale), mean, var', round(mu, 5), round(loc, 2), round(scale, 2), '\n',  mean, '\n', var
                
                #sm.qqplot (np.array(sim.times), vergleich, fit = True,  line = 'r', ax=ax_list[akt])
		#ax_list[akt].set_title("qq-Plot mit auto Fit")
                #akt+=1 
                sm.qqplot (np.array(sim.times), vergleich, distargs= (sim.mu, ),  line = 'r', ax=ax_list[akt])
		ax_list[akt].set_title("qq-Plot mit mu:" + str(sim.mu))
                akt+=1
            #print "qq plus rechnen", time.clock()-startzeit                

                #fig.subplots_adjust(top=5.85)
            # The rect leaves room at the top of the figure for the suptitle.
            gs1.tight_layout(fig, rect=[0, 0.03, 1, 0.95]) 
            print time.clock()-startzeit
            #plt.tight_layout()
    plt.show()    
       

   
   
    '''x = np.linspace(0, 2*np.pi, 400)
Exemplo n.º 6
0
def plot_single_histqq_ff(datei, num_bins=50):
    """Load a pickled simulation and plot its histogram and an inverse-Gaussian Q-Q plot.

    Parameters
    ----------
    datei : str
        Path of a pickle file holding an object with a ``times`` attribute.
    num_bins : int
        Number of histogram bins.

    The fitted inverse-Gaussian parameters and the sample skewness are
    printed. Nothing is shown or saved here.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only open files
    # from a trusted source.
    with open(datei, 'rb') as daten:
        sim = pickle.load(daten)

    # NOTE(review): `normed` was replaced by `density` in newer Matplotlib
    # releases - confirm against the Matplotlib version in use.
    plt.hist(sim.times, num_bins, normed=1, alpha=0.5)
    x = np.arange(50000, 250000, 100)

    # Fit once and reuse the result; the fit used to run twice, once only to
    # print it.
    mu, loc, scale = scipy.stats.invgauss.fit(sim.times)
    # %-formatting with a one-element tuple prints the same text as the old
    # Python 2 print statements and is valid on Python 2 and 3.
    print("ig-params %s" % ((mu, loc, scale),))
    plt.plot(x, scipy.stats.invgauss.pdf(x, mu, loc, scale))
    print("skew %s" % (scipy.stats.skew(sim.times),))

    sm.qqplot(np.array(sim.times), scipy.stats.invgauss, distargs=(mu,), line='r')
Exemplo n.º 7
0
def ts_diagnostics(y, lags=None, title='', filename=''):
    """Plot time-series diagnostics and print an Augmented Dickey-Fuller test.

    The figure shows the series with its 12-observation rolling mean and
    standard deviation, the ACF and PACF, a normal Q-Q plot and a histogram.

    Parameters
    ----------
    y : pd.Series or array_like
        Series to analyse; converted to ``pd.Series`` if necessary.
    lags : int or array_like, optional
        Forwarded to ``plot_acf`` / ``plot_pacf``.
    title : str
        Title of the time-series panel.
    filename : str
        Currently unused (saving the figure is commented out).
    """
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    # 12-observation rolling statistics. pd.rolling_mean / pd.rolling_std
    # were removed from pandas; the .rolling() accessor is the replacement.
    rolling_mean = y.rolling(window=12).mean()
    rolling_std = y.rolling(window=12).std()

    plt.figure(figsize=(14, 12))
    layout = (3, 2)
    ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
    acf_ax = plt.subplot2grid(layout, (1, 0))
    pacf_ax = plt.subplot2grid(layout, (1, 1))
    qq_ax = plt.subplot2grid(layout, (2, 0))
    hist_ax = plt.subplot2grid(layout, (2, 1))

    # time series plot
    y.plot(ax=ts_ax)
    rolling_mean.plot(ax=ts_ax, color='crimson', label='rolling mean')
    rolling_std.plot(ax=ts_ax, color='darkslateblue', label='rolling std')
    # Attach the legend to the time-series axes explicitly: plt.legend acts
    # on the current axes, which is the last one created (hist_ax).
    ts_ax.legend(loc='best')
    ts_ax.set_title(title, fontsize=24)

    # acf and pacf
    smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
    smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)

    # qq plot
    sm.qqplot(y, line='s', ax=qq_ax)
    qq_ax.set_title('QQ Plot')

    # hist plot
    y.plot(ax=hist_ax, kind='hist', bins=25)
    hist_ax.set_title('Histogram')
    plt.tight_layout()
    # plt.savefig('./img/{}.png'.format(filename))
    plt.show()

    # perform Augmented Dickey Fuller test
    print('Results of Dickey-Fuller test:')
    dftest = adfuller(y, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['test statistic', 'p-value', '# of lags', '# of observations'])
    # dftest[4] maps significance levels to critical values.
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
Exemplo n.º 8
0
def plot_time_series(data, lags=None, title=None, filename=None):
    """
    Draw time series diagnostics for the provided data and save the figure.

    The figure holds the series itself, its ACF and PACF, a normal Q-Q plot
    and a probability plot.

    Parameters
    ==========
    data : series
        One-dimensional ndarray with axis labels (including time series).
        Non-Series input is converted to ``pd.Series`` and NaNs are dropped.
    lags : {int, array_like}, optional
        An int or array of lag values, used on horizontal axis. Custom ticks
        (every second lag) are only applied when a scalar is given.
    title : string, optional
        The title that will be set for the whole figure.
    filename : string, optional
        File to save the plot result (lower-cased before use). When omitted
        the figure is built and closed without being written.
    """

    if not isinstance(data, pd.Series):
        data = pd.Series(data).dropna()

    with plt.style.context('bmh'):
        fig = plt.figure(figsize=(10, 8))
        layout = (3, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
        qq_ax = plt.subplot2grid(layout, (2, 0))
        pp_ax = plt.subplot2grid(layout, (2, 1))

        data.plot(ax=ts_ax)
        ts_ax.set_title(title if title else 'Time Series Analysis Plots')
        smt.graphics.plot_acf(data,
                              lags=lags,
                              ax=acf_ax,
                              alpha=0.5,
                              zero=False)
        smt.graphics.plot_pacf(data,
                               lags=lags,
                               ax=pacf_ax,
                               alpha=0.5,
                               zero=False)
        sm.qqplot(data, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')
        scs.probplot(data, sparams=(data.mean(), data.std()), plot=pp_ax)

        # `lags + 1` is only meaningful for a scalar; with the default None
        # (or an array of lags) the previous code raised a TypeError here.
        if lags is not None and np.isscalar(lags):
            ticks = np.arange(1, lags + 1, 2.0)
            acf_ax.set_xticks(ticks)
            pacf_ax.set_xticks(ticks)
        plt.tight_layout()

    # The default filename=None used to fail on `.lower()` after all the
    # plotting work was done; skip saving instead.
    if filename is not None:
        fig.savefig(filename.lower())
    plt.close(fig)
Exemplo n.º 9
0
def qqplot(dataFrame, columns):
    """Draw Q-Q plots for the given columns, several per figure.

    A new 15x15 figure is started every ``cell_size`` columns; each finished
    figure is saved as ``./output/qqplot<N>.png`` and shown. The grid shape
    comes from the module-level ``col_size`` and ``row_size``.
    """
    saved_pages = 0
    for position, column in enumerate(columns):
        slot = position % cell_size
        if slot == 0:
            # First plot of a page: open a fresh figure.
            fig = plt.figure(figsize=(15, 15))

        axis = fig.add_subplot(col_size, row_size, slot + 1)
        sm.qqplot(dataFrame[column], ax=axis)
        axis.set_title(column)

        page_is_full = (position + 1) % cell_size == 0
        is_last_column = position + 1 == len(columns)
        if page_is_full or is_last_column:
            saved_pages += 1
            plt.subplots_adjust(wspace=0.3, hspace=0.3)
            plt.savefig(f'./output/qqplot{saved_pages!s}.png')
            plt.show()
Exemplo n.º 10
0
def residual():
    """Regress ETF daily returns on three factors and summarise the residuals.

    For every ticker in a fixed list the function downloads prices, merges
    them with the module-level ``ff`` factor data, fits an OLS model on
    ``Mkt_RF``, ``SMB`` and ``HML``, shows a Q-Q plot of the residuals and
    regresses the residuals on their one-step lag.

    Returns
    -------
    result : pd.DataFrame
        Columns ``Code``, ``E_Mean``, ``E_std`` (residual mean and std).
    auto : pd.DataFrame
        Columns ``Code``, ``Alpha``, ``P_Value`` (slope and p-value of the
        lag-1 regression of the residuals).
    """
    code_list = [
        'SPY', 'XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLU', 'XLV', 'XLY'
    ]

    residual_mean = []
    residual_std = []
    auto_alpha = []
    auto_pvalue = []
    resid = pd.DataFrame()
    resid_lag = pd.DataFrame()

    for code in code_list:
        resid_col = code + '_resids'
        lag_col = code + '_resids_lag'

        # Prices -> simple daily returns, with the date as a YYYYMMDD string
        # so it can be joined with the factor table.
        etf = ETF(code, '2010-01-01', '2019-09-14')
        etf.price_acquire()
        etf.data['ETF_Daily_return'] = (
            etf.data['Close'] / etf.data['Close'].shift(1) - 1)
        etf.data['Date'] = etf.data['Date'].apply(
            lambda day: day.strftime("%Y%m%d"))

        merged = pd.DataFrame.merge(etf.data, ff.data, how='left', on='Date')
        merged = merged.dropna(axis=0, how='any')

        model = OLS(y=merged.ETF_Daily_return,
                    x=merged[['Mkt_RF', 'SMB', 'HML']])
        resid[resid_col] = model.resids

        sm.qqplot(resid[resid_col], fit=True, line='45')
        plt.title('Normality test of daily residuals for ETF:' + code)
        plt.show()

        residual_mean.append(np.mean(resid[resid_col]))
        residual_std.append(np.std(resid[resid_col]))

        # Lag-1 autoregression of the residuals.
        resid_lag[lag_col] = resid[resid_col].shift(1)
        lag_pair = pd.concat([resid_lag[lag_col], resid[resid_col]],
                             axis=1).dropna()
        regress_result = stats.linregress(lag_pair.iloc[:, 0],
                                          lag_pair.iloc[:, 1])
        auto_alpha.append(regress_result.slope)
        auto_pvalue.append(regress_result.pvalue)

    result = pd.DataFrame({
        'Code': code_list,
        'E_Mean': residual_mean,
        'E_std': residual_std
    })
    auto = pd.DataFrame({
        'Code': code_list,
        'Alpha': auto_alpha,
        'P_Value': auto_pvalue
    })

    return result, auto
Exemplo n.º 11
0
def plot_ic_qq(ic, theoretical_dist=stats.norm, ax=None):
    """
    Plots Spearman Rank Information Coefficient "Q-Q" plot relative to
    a theoretical distribution.

    Parameters
    ----------
    ic : pd.DataFrame
        DataFrame indexed by date, with IC for each forward return.
    theoretical_dist : scipy.stats._continuous_distns
        Continuous distribution generator. scipy.stats.norm and
        scipy.stats.t are popular options.
    ax : iterable of matplotlib.Axes, optional
        Axes upon which to plot, one per column of ``ic``. When omitted a
        grid with three columns is created.

    Returns
    -------
    ax : iterable of matplotlib.Axes
        The axes that were plotted on.
    """

    ic = ic.copy()

    num_plots = len(ic.columns)

    # Three plots per row, rounded up.
    v_spaces = ((num_plots - 1) // 3) + 1

    if ax is None:
        f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6))
        ax = ax.flatten()

    if isinstance(theoretical_dist, stats.norm.__class__):
        dist_name = 'Normal'
    elif isinstance(theoretical_dist, stats.t.__class__):
        dist_name = 'T'
    else:
        dist_name = 'Theoretical'

    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
    # same (column name, Series) pairs. The loop variable no longer shadows
    # the ``ic`` argument.
    for a, (period_num, period_ic) in zip(ax, ic.items()):
        sm.qqplot(period_ic.replace(np.nan, 0.).values,
                  theoretical_dist,
                  fit=True,
                  line='45',
                  ax=a)
        a.set(title="{} Period IC {} Dist. Q-Q".format(period_num, dist_name),
              ylabel='Observed Quantile',
              xlabel='{} Distribution Quantile'.format(dist_name))

    return ax
Exemplo n.º 12
0
def simple_exponential_smoothing():
    """Run the simple exponential smoothing demo on a sampled series.

    Samples 200 realisations, plots forecasts made from period 160, then
    computes rolling forecasts, prints the mean and standard deviation of
    their residuals and draws the residual diagnostics, including a Q-Q plot
    of the standardised residuals.
    """
    N = 200
    t = 160
    alpha = 0.5
    x0 = 20

    realisations = pd.Series(sample_gaussian_process(20, 5, N), range(N))

    # Forecasts issued at period t, plotted against the realisations.
    plot(realisations, ses(realisations, alpha, x0, t), alpha)

    # One-step-ahead rolling forecasts and their residuals.
    rolling_forecasts = ses_rolling(realisations, alpha, x0)
    res = residuals(realisations, rolling_forecasts)
    print(f"E[e_t] = {statistics.mean(res)!s}")
    print(f"Stdev[e_t] = {statistics.stdev(res)!s}")

    standardised_res = standardised_residuals(realisations, rolling_forecasts)
    residuals_plot(res)
    residuals_histogram(standardised_res)
    residuals_autocorrelation(res, None)
    sm.qqplot(standardised_res, line='45')
    py.show()
Exemplo n.º 13
0
 def QQplot(self, save = False):
     '''
     Draw one Q-Q plot per entry of ``self.array``.

     Each plot gets its own 7x5 figure titled with the matching entry of
     ``self.labels`` and is shown immediately.

     Args:
         save - when equal to True, also write each figure to
                ``Q-Q_plot_<label>.png`` in the working directory.
     '''
     for position, values in enumerate(self.array):
         fig, ax = plt.subplots(figsize=(7, 5))
         label = self.labels[position]
         plt.title(f'Q-Q plot ({label})')
         sm.qqplot(np.array(values), line='45', fit=True, ax=ax)
         # Compared against True explicitly, as before.
         if save == True:
             plt.savefig(f'Q-Q_plot_{label}.png', dpi=200)
         plt.show()
Exemplo n.º 14
0
def plot_qq_checkout():
    """Save a normal Q-Q plot for every entry of ``number_attribute_remove_lost_arr``.

    Each (key, values) pair of that module-level mapping produces
    ``./qq_checkout/<key>.png``; the directory is created when missing.
    """
    path = './qq_checkout'
    if not os.path.exists(path):
        os.mkdir(path)

    # .items() exists on dicts in Python 2 and 3, whereas .iteritems() is
    # gone from Python 3 dicts (and, presumably relevant if this is a pandas
    # object, from pandas 2.0 as well).
    for k, v in number_attribute_remove_lost_arr.items():
        sm.qqplot(np.array(v), line='r')
        plt.title(k)
        plt.grid(True)
        plt.savefig(path + '/' + k + '.png')
        # Close the figure sm.qqplot opened so figures do not pile up.
        plt.close()
Exemplo n.º 15
0
def plot_qq_checkout():
	"""Save a normal Q-Q plot for every entry of ``number_attribute_remove_lost_arr``.

	Each (key, values) pair of that module-level mapping produces
	``./qq_checkout/<key>.png``; the directory is created when missing.
	"""
	path = './qq_checkout'
	if not os.path.exists(path):
		os.mkdir(path)

	# .items() exists on dicts in Python 2 and 3, whereas .iteritems() is
	# gone from Python 3 dicts (and, presumably relevant if this is a pandas
	# object, from pandas 2.0 as well).
	for k, v in number_attribute_remove_lost_arr.items():
		sm.qqplot(np.array(v), line='r')
		plt.title(k)
		plt.grid(True)
		plt.savefig(path + '/' + k + '.png')
		# Close the figure sm.qqplot opened so figures do not pile up.
		plt.close()
Exemplo n.º 16
0
 def regression(self):  # linear regression
     """Fit an OLS regression of ``self.rate`` on ``self.rate2`` and plot diagnostics.

     Prints the model summary, shows residuals vs fitted values, then draws a
     Q-Q plot of the Pearson residuals and a scale-location style scatter.
     The last two figures are not shown here; the caller has to call
     ``plt.show()``.
     """
     rate1 = self.rate
     rate2 = self.rate2
     model = sm.OLS(rate1, sm.add_constant(rate2)).fit()
     print(model.summary())

     # Residuals against fitted values.
     plt.scatter(model.fittedvalues, model.resid)
     plt.show()

     # Normality: when the dependent variable is normally distributed the
     # residuals should follow a normal distribution with mean 0.
     # Q-Q plot
     sm.qqplot(model.resid_pearson, stats.norm, line='45')

     # Homoscedasticity. Start a new figure so the scatter is not drawn on
     # top of the Q-Q plot, and take the absolute value before the square
     # root: negative residuals would otherwise become NaN and disappear.
     plt.figure()
     plt.scatter(model.fittedvalues, abs(model.resid_pearson) ** 0.5)
Exemplo n.º 17
0
def graphics(data):
    """Show a histogram, a box plot and a normal Q-Q plot side by side.

    The histogram uses ``data`` as a whole; the box plot and the Q-Q plot
    use its ``"Close"`` entry.
    """
    fig, (hist_ax, box_ax, qq_ax) = plt.subplots(nrows=1, ncols=3)
    fig.suptitle("Graphical Analysis")

    # Left panel: histogram.
    hist_ax.hist(data, bins=20, alpha=0.8)
    hist_ax.set_title("Histogram")

    # Middle panel: box plot of the closing values.
    sns.boxplot(y=data["Close"], ax=box_ax, orient="vertical")
    box_ax.set_title("Boxplot")

    # Right panel: Q-Q plot with a line through the quartiles.
    sm.qqplot(data["Close"], ax=qq_ax, line="q")
    qq_ax.set_title("Q-Q Plot against a normal distribution")

    plt.show()
Exemplo n.º 18
0
    def qqplot(x, title='', path=None):
        """
        Draw a normal Q-Q plot of *x* and optionally save it.

        Parameters
        ----------
        x : array_like
            Sample to plot; converted with ``np.array``.
        title : str
            Suffix of the output file name.
        path : str, optional
            Directory to save into. When given, the current figure is written
            to ``<path>/qqplot_<title>.png``; otherwise nothing is saved.

        Returns
        -------
        None
        """
        sm.qqplot(np.array(x), line='q')
        # Identity comparison is the idiomatic (and safe) test for None.
        if path is not None:
            plt.savefig(path + '/qqplot_' + title + '.png')
def sklearn_ols_regression(X,y,print_coefficients=True,print_resid=False,plot_resid=False,qqplot_line='s'):
    """
    Fit an ordinary least squares regression with scikit-learn.

    Parameters
    ----------
    X : DataFrame
        Feature matrix; ``X.columns`` is printed with the coefficients.
    y : array_like
        Target values.
    print_coefficients : bool
        Print feature names, coefficients and the intercept.
    print_resid : bool
        Print regression metrics via ``regression_results``.
    plot_resid : bool
        Show a Q-Q plot of the residuals.
    qqplot_line : str
        ``line`` option forwarded to ``sm.qqplot``.

    Returns
    -------
    LinearRegression
        The fitted scikit-learn estimator.
    """
    # initialize a linear regression model in sklearn
    linrig = LinearRegression()
    # fit linear model to training data
    linrig.fit(X, y)
    y_pred = linrig.predict(X)

    if print_coefficients:
        print('Features: ', list(X.columns))
        print('Coefficients: ', linrig.coef_)
        print('y-intercept: ', np.round(linrig.intercept_,3))
        print('\n')

    if print_resid:
        regression_results(y, y_pred)

    if plot_resid:
        # Signed residuals as one flat array. The previous code wrapped the
        # array in a list (a one-element Series) and took absolute values,
        # which is why the Q-Q plot did not work.
        sk_res = np.ravel(np.asarray(y) - y_pred)
        sm.qqplot(sk_res, line=qqplot_line)
        plt.show()
    return linrig
Exemplo n.º 20
0
def check_residuals(resids, **plot_args):
    """Visual residual diagnostics on a 2x2 grid.

    Args:
        resids: residuals, np.array or pd.Series
        **plot_args: forwarded to ``plt.subplots`` when building the figure

    Returns:
        plt.Figure: the figure returned by ``sm.qqplot``
    """
    fig, axes = plt.subplots(nrows=2, ncols=2, **plot_args)
    series_ax, hist_ax, qq_ax, acf_ax = axes.flatten()

    # Residuals in observation order.
    sns.lineplot(x=range(len(resids)), y=resids, ax=series_ax)
    series_ax.set(title="Residuals", xlabel="", ylabel="")

    # Histogram of the residuals with a KDE overlay.
    sns.histplot(x=resids, kde=True, ax=hist_ax)
    hist_ax.set(title="Histogram", xlabel="", ylabel="")

    # Normal Q-Q plot; its return value is what this function returns.
    fig = sm.qqplot(resids, fit=True, line="45", ax=qq_ax)
    qq_ax.set(title="Normal QQ")

    # Autocorrelation plot.
    plot_acf(resids, ax=acf_ax, title="ACF")

    plt.tight_layout()

    return fig
Exemplo n.º 21
0
 def test_qqplot_pltkwargs(self):
     # qqplot must accept matplotlib marker styling keywords; the test only
     # checks that the call completes.
     marker_style = dict(marker='d',
                         markerfacecolor='cornflowerblue',
                         markeredgecolor='white',
                         alpha=0.5)
     fig = sm.qqplot(self.res, line='r', **marker_style)
Exemplo n.º 22
0
def residual_plots():
    """Plot squared OLS residuals against the predictor and a Q-Q plot of the residuals.

    Fits ``y ~ x`` on the data returned by ``load_concatenated_data()``,
    saves the scatter plot to ``squared_residuals_scatterplot.pdf`` and the
    Q-Q plot to ``residuals_QQplot.pdf``, and shows both.
    """
    plt.style.use('ggplot')

    aggregated_data = load_concatenated_data()
    lm_ols = smf.ols(formula='y ~ x', data=aggregated_data).fit() # OLS fit

    #---------------------------------------------
    # Scatter plot of squared OLS residuals vs predictors
    #---------------------------------------------
    plt.scatter(aggregated_data['x'].values, lm_ols.resid**2)
    plt.xlabel('x')
    plt.ylabel('OLS squared residuals')
    plt.savefig('squared_residuals_scatterplot.pdf')
    plt.show()

    #----------------------
    ## QQ plot of residuals
    #----------------------
    fig = sm.qqplot(lm_ols.resid, line='s')
    fig.savefig('residuals_QQplot.pdf')
    # plt.show() takes no figure argument; passing one positionally is
    # rejected by current Matplotlib. The Q-Q figure is already current.
    plt.show()
Exemplo n.º 23
0
def shapiro(data):
    '''
    Normality check for *data*.

    > show a Q-Q plot against the 45-degree line
    > draw a 40-bin density histogram on a new figure
    > print the Shapiro-Wilk test result
    '''

    # Q-Q plot, displayed right away.
    sm.qqplot(np.array(data), line='45')
    pylab.show()

    # Empirical density; this figure is created but not shown here.
    _, hist_ax = plt.subplots()
    hist_ax.hist(data, 40, density=1)

    # Shapiro-Wilk test.
    test_result = stats.shapiro(data)
    print('shapiro test', test_result)
def arima_diag(resids, n_lags = 40):
    """Draw a 2x2 diagnostic figure for model residuals and return it.

    Panels: standardised residuals in order, their distribution against the
    N(0,1) density, a Q-Q plot and the ACF up to ``n_lags``. Residuals are
    standardised with NaN-aware mean and standard deviation; NaNs are
    dropped for the distribution and Q-Q panels only.
    """
    fig, axes = plt.subplots(2, 2)
    (series_ax, dist_ax), (qq_ax, acf_ax) = axes

    standardized = (resids - np.nanmean(resids)) / np.nanstd(resids)
    observed = standardized[~np.isnan(standardized)]

    # Standardised residuals in observation order.
    sns.lineplot(x=np.arange(len(standardized)), y=standardized, ax=series_ax)
    series_ax.set_title('Standardized residuals')

    # Empirical distribution with the standard normal density on top.
    half_width = 1.96 * 2
    x_lim = (-half_width, half_width)
    grid = np.linspace(*x_lim)
    sns.distplot(observed, norm_hist=True, hist=True, kde=True, ax=dist_ax)
    dist_ax.plot(grid, stats.norm.pdf(grid), 'g', lw=2, label='N(0,1)')
    dist_ax.set_title('Distribution of standardized residuals')
    dist_ax.set_xlim(x_lim)
    dist_ax.legend()

    # Q-Q plot of the non-missing standardised residuals.
    sm.qqplot(observed, line='s', ax=qq_ax)
    qq_ax.set_title('Q-Q plot')

    # Autocorrelation of the standardised residuals (NaNs not removed).
    plot_acf(standardized, ax=acf_ax, lags=n_lags, alpha=0.05)
    acf_ax.set_title('ACF plot')

    return fig
Exemplo n.º 25
0
def plot_QQ(model, fit=False, *args, **kwargs):
    """
    Plot the QQ plot of a model's residuals together with the line y = x.

    :param model: The statsmodels model; its ``resid`` attribute is plotted.
    :param fit: Forwarded to ``qqplot`` as ``fit`` unless ``fit`` is also given
                in ``kwargs``. Fitting is useful because a straight line that
                differs from y=x means the distribution is probably the same
                but its parameters are different, for instance a Gaussian with
                a different mean or sigma.
    :param args: Extra positional arguments for statsmodels' ``qqplot``,
                placed after the data. Note that they can only be passed after
                ``fit``; the first one is the reference distribution (use
                scipy.stats.t for a t distribution and so on).
                Check: http://www.statsmodels.org/dev/generated/statsmodels.graphics.gofplots.qqplot.html
    :param kwargs: Other keyword arguments for ``qqplot``. ``line`` defaults
                to ``'r'`` (regression line) but may be overridden.
    :return: None
    """
    res = model.resid  # residuals
    xmin = np.min(res)
    xmax = np.max(res)

    # Use defaults instead of hard-coded keywords so that a caller-supplied
    # ``line=`` no longer raises "got multiple values for keyword argument".
    kwargs.setdefault('fit', fit)
    kwargs.setdefault('line', 'r')

    sm.qqplot(res, *args, **kwargs)
    # Identity line across the range of the residuals, for reference.
    plt.plot([xmin, xmax], [xmin, xmax], 'r')
    plt.show()
Exemplo n.º 26
0
def create_qq_subplots(data, variables):
    """Draw one Q-Q plot per variable on a 3x3 grid and return ``plt``.

    Panels are filled in row-major order; the y-axis label of each panel is
    the variable name.
    """
    fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
    panels = axes.flatten()
    marker_style = dict(marker='o',
                        markerfacecolor='none',
                        markeredgecolor='k',
                        alpha=0.5)

    for index, col_name in enumerate(variables):
        panel = panels[index]
        # Uniform tick label size on both axes.
        for tick_label in panel.get_xticklabels() + panel.get_yticklabels():
            tick_label.set_fontsize(12)
        sm.qqplot(data[col_name], ax=panel, **marker_style)
        panel.set_ylabel(col_name, fontsize=18)
        panel.set_xlabel("Theoretical Quantiles", fontsize=14)
    return plt
Exemplo n.º 27
0
def statistic_plot(log_returns,stock_set):
    """Print summary statistics and draw a normal Q-Q plot for each symbol.

    For every symbol in ``stock_set`` the NaN-free log returns are passed to
    ``stc.print_statistics`` and plotted with ``sm.qqplot``. Figures are not
    shown here.
    """
    separator = "-" * 30
    for sym in stock_set:
        print("\nResults for symbol %s" % sym)
        print(separator)
        cleaned = log_returns[sym].dropna()
        stc.print_statistics(np.array(cleaned))

        # Inspect the symbol's data with a quantile-quantile plot of its
        # log returns.
        sm.qqplot(cleaned, line='s')
        plt.title(sym+'qqplot')
        plt.grid(True)
        plt.xlabel('theoretical quantiles')
        plt.ylabel('sample quantiles')
Exemplo n.º 28
0
def qq_plot(depend,features, df):
    """Q-Q plots of OLS residuals for single-feature regressions.

    Fits ``depend ~ feature`` for the first seven entries of ``features`` on
    a copy of ``df`` and draws the residual Q-Q plots on a 4x2 grid, filled
    row by row; the bottom-right cell stays empty.
    """
    df_copy = df.copy()
    fig, ax = plt.subplots(4, 2, figsize=(30, 30))

    # Seven panels: every cell of the 4x2 grid except (3, 1).
    for i in range(7):
        m, n = divmod(i, 2)
        formula = f'{depend}~{features[i]}'
        model = smf.ols(formula=formula, data=df_copy).fit()
        sm.qqplot(model.resid, dist=sp.stats.norm, line='45', fit=True,
                  ax=ax[m][n])
        ax[m][n].set_title(f'{features[i]}')
Exemplo n.º 29
0
    def RunEstimation(self,request,tsmodelid,tsworkspaceid):
        """Estimate the time-series model, save diagnostic plots and render the summary page.

        Loads the data via ``self.prepdata(tsmodelid)``, builds a
        ``modeler.ModelClass`` with identity transforms, sets the
        ``(AR, I, MA)`` orders from the instance, estimates it and stores the
        fit on ``self.fit``. Three PNG files (Q-Q plot of the residuals,
        residuals, in-sample fit) are written below
        ``files/<tsmodelid>/<tsworkspaceid>/``.

        Returns the rendered ``tsbuild/arimaSummary.html`` template.
        """
        self.data=pandas.DataFrame()
        self.prepdata(tsmodelid)
        print(self.data)
        # Identity transform and inverse transform handed to the model.
        ig=lambda x:x
        g=lambda x:x
 
        tsmodel=modeler.ModelClass(data=self.data,startdate=self.startdate,enddate=self.enddate, dependent=self.depVar,exogenous=self.indepVar ,transform=g,inverstransform=ig)
        tsmodel.setmodel(AR=int(self.AR),I=int(self.I),MA=int(self.MA))
        
        
        tsmodel.estimate()
        self.fit=tsmodel.fit
        print tsmodel.fit.summary()
        #return HttpResponseRedirect('/tsbuild/workspace/%s/%s' % (str(tsmodelid),str(tsworkspaceid)))
        # Columns 0 and 1 of conf_int(); presumably the lower and upper bounds - TODO confirm.
        confint0=self.fit.conf_int()[0]
        confint1=self.fit.conf_int()[1]
        
        self.SaveValues(tsmodelid,tsworkspaceid,tsmodel.fit)
        
        #QQ Plot
        # sm.qqplot draws on a figure of its own; savefig/clf then act on that current figure.
        sm.qqplot(tsmodel.fit.resid)
        plt.savefig('files/%s/%s/qqplot_resid.png' % (tsmodelid, tsworkspaceid))
        plt.clf()
        #In Sample plot
        # NOTE(review): pdframe is assigned but not used again in this method.
        pdframe=pandas.DataFrame()
        self.data['resid']=tsmodel.fit.resid
        self.data['%s_%s' % (self.depVar[0], 'hat')]=self.fit.fittedvalues
        
        #Plot residuals
        print(tsmodel.fit.resid.index)
        print(tsmodel.fit.resid.values)
        plt.plot(tsmodel.fit.resid.index,tsmodel.fit.resid.values)
        plt.savefig('files/%s/%s/resid.png' % (tsmodelid, tsworkspaceid))
        plt.clf()
        
        # Fitted values and the observed dependent variable on the same axes.
        plt.plot(tsmodel.fit.fittedvalues.index,tsmodel.fit.fittedvalues.values)
        plt.plot(self.data[self.depVar[0]].index,self.data[self.depVar[0]].values)
        plt.savefig('files/%s/%s/insample.png' % (tsmodelid, tsworkspaceid))
        plt.clf()

        
        return render(request,'tsbuild/arimaSummary.html', {'fit': self.fit,
                                                            'confint0':confint0,
                                                            'confint1':confint1,
                                                            'tsmodelid':tsmodelid,
                                                            'tsworkspaceid':tsworkspaceid} )
Exemplo n.º 30
0
def plot_ic_qq(ic, theoretical_dist=stats.norm, ax=None):
    """
    Plots Spearman Rank Information Coefficient "Q-Q" plot relative to
    a theoretical distribution.

    Parameters
    ----------
    ic : pd.DataFrame
        DataFrame indexed by date, with IC for each forward return.
    theoretical_dist : scipy.stats._continuous_distns
        Continuous distribution generator. scipy.stats.norm and
        scipy.stats.t are popular options.
    ax : iterable of matplotlib.Axes, optional
        Axes upon which to plot, one per column of ``ic``. When omitted a
        grid with three columns is created.

    Returns
    -------
    ax : iterable of matplotlib.Axes
        The axes that were plotted on.
    """

    ic = ic.copy()

    num_plots = len(ic.columns)

    # Three plots per row, rounded up.
    v_spaces = ((num_plots - 1) // 3) + 1

    if ax is None:
        f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6))
        ax = ax.flatten()

    if isinstance(theoretical_dist, stats.norm.__class__):
        dist_name = 'Normal'
    elif isinstance(theoretical_dist, stats.t.__class__):
        dist_name = 'T'
    else:
        dist_name = 'Theoretical'

    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
    # same (column name, Series) pairs. The loop variable no longer shadows
    # the ``ic`` argument.
    for a, (period_num, period_ic) in zip(ax, ic.items()):
        sm.qqplot(period_ic.replace(np.nan, 0.).values, theoretical_dist,
                  fit=True, line='45', ax=a)
        a.set(title="{} Period IC {} Dist. Q-Q".format(
              period_num, dist_name),
              ylabel='Observed Quantile',
              xlabel='{} Distribution Quantile'.format(dist_name))

    return ax
def bootstrap_qqplot(data_directory_name: str, scenario: str,
                     result_dict_name: str):
    """
    Draw uniform Q-Q plots of bootstrap p-values and save them as PNG files.

    The pickled result dictionary is read from ``results/result_dict/`` and
    is indexed as ``result[sample_size][trial_index]["train_p_value"]`` /
    ``["test_p_value"]``. Two figures are written to ``results/plots/``.

    :param data_directory_name: sub-directory used for both input and output
    :param scenario: scenario name, part of the input file name
    :param result_dict_name: result name, part of input and output file names
    :return: None
    """
    source_path = (
        f'results/result_dict/{data_directory_name}/bootstrap_refit_reduced_{result_dict_name}_{scenario}_'
        f'result_dict.p')
    # NOTE(security): pickle.load can execute arbitrary code; only read
    # result files from a trusted source.
    with open(source_path, 'rb') as fp:
        bootstrap_result_dict = pickle.load(fp)

    # One inner list of p-values per sample size, in dictionary order.
    train_p_value_vet = [
        [trial["train_p_value"] for trial in trials.values()]
        for trials in bootstrap_result_dict.values()
    ]
    test_p_value_vet = [
        [trial["test_p_value"] for trial in trials.values()]
        for trials in bootstrap_result_dict.values()
    ]

    plt.scatter(train_p_value_vet[1], test_p_value_vet[1])

    # NOTE(review): the figure titled "Train" is built from
    # test_p_value_vet[0], not from the train p-values - confirm intent.
    fig_1 = sm.qqplot(data=np.array(test_p_value_vet[0]),
                      dist=dist.uniform,
                      line="45")
    plt.title("Train")

    fig_2 = sm.qqplot(data=np.array(test_p_value_vet[1]),
                      dist=dist.uniform,
                      line="45")
    plt.title("Test")

    output_stem = f"results/plots/{data_directory_name}/bootstrap_refit_reduced_{result_dict_name}"
    fig_1.savefig(f"{output_stem}_train.png")
    fig_2.savefig(f"{output_stem}_test.png")
Exemplo n.º 32
0
def tsplot(y, lags=None, figsize=(10, 8), style='bmh', max_lag=10):
    """Show time-series diagnostic plots and return the significant lags.

    The figure holds the series, ACF, PACF, Q-Q plot and probability plot;
    titles carry the ADF p-value, the Jarque-Bera p-value and kurtosis, and
    the Q-statistic p-value.

    Returns
    -------
    ARord : np.ndarray
        Lags in ``0..max_lag`` whose absolute ACF exceeds ``2 / sqrt(n)``.
    MAord : np.ndarray
        Lags in ``0..max_lag`` whose absolute PACF exceeds ``2 / sqrt(n)``.
    """
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    with plt.style.context(style):
        fig = plt.figure(figsize=figsize)
        layout = (3, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
        qq_ax = plt.subplot2grid(layout, (2, 0))
        pp_ax = plt.subplot2grid(layout, (2, 1))

        # Test statistics shown in the panel titles.
        dful_pvalue = np.around(smt.stattools.adfuller(y)[1], 3)
        acf_result = smt.stattools.acf(y, nlags=max_lag, qstat=True)
        significance_bound = 2 / np.sqrt(y.shape[0])
        candidate_lags = range(0, max_lag + 1)
        ARord = np.array([
            lag for lag in candidate_lags
            if abs(acf_result[0][lag]) > significance_bound
        ])
        pacf_values = smt.stattools.pacf(y, nlags=max_lag)
        MAord = np.array([
            lag for lag in candidate_lags
            if abs(pacf_values[lag]) > significance_bound
        ])
        Qstat_pvalue = np.around(acf_result[2][max_lag - 1], 3)
        jb_result = sm.stats.stattools.jarque_bera(y)
        jb_pvalue = np.around(jb_result[1], 3)
        kurtosis = np.around(jb_result[3], 3)

        # Panels.
        y.plot(ax=ts_ax)
        ts_ax.set_title(
            'Time Series Analysis Plots\nDickey-Fuller Test: {}'.format(
                dful_pvalue))
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)
        sm.qqplot(y, line='s', ax=qq_ax)
        scs.probplot(y, sparams=(y.mean(), y.std()), plot=pp_ax)
        qq_ax.set_title('QQ Plot\nJarque-Bera Test: {}\nKurtosis: {}'.format(
            jb_pvalue, kurtosis))
        acf_ax.set_title(
            "Autocorrelation\nQ({}): {}\nLast Singf Lag: {}".format(
                max_lag, Qstat_pvalue, max(ARord)))
        pacf_ax.set_title("Partial Autocorrelation\nLast Singf Lag: {}".format(
            max(MAord)))

        plt.tight_layout()
    plt.show()
    return ARord, MAord
Exemplo n.º 33
0
def qq_plot(diffs_mean, recall, rt=False):
    """Save a row of three normal Q-Q plots, one per leading column of ``diffs_mean``.

    The panels are titled 'Rest', 'Video' and 'Game' and share a single pair
    of axis labels. The figure is written to ``qq-plot<suffix>`` where the
    suffix is ``recall`` or ``recog`` (chosen by ``recall``), followed by
    ``_rt`` when ``rt`` is truthy.

    Marker face colours come from the module-level ``colors`` sequence.
    """
    sns.set_style('white')

    suffix = ('recall' if recall else 'recog') + ('_rt' if rt else '')

    figure, panels = plt.subplots(1, 3, sharex=False)
    # Frameless axes spanning the whole figure; it only carries the shared
    # x/y labels, so its own ticks and tick labels are hidden.
    figure.add_subplot(111, frameon=False)
    plt.tick_params(labelcolor='none',
                    top=False,
                    bottom=False,
                    left=False,
                    right=False)
    plt.xlabel("Theoretical Quantiles")
    plt.ylabel("Sample Quantiles")

    headings = ['Rest', 'Video', 'Game']
    for index, (panel, heading) in enumerate(zip(panels.ravel(), headings)):
        sm.qqplot(np.array(diffs_mean.iloc[:, index]), line='s', ax=panel)

        # Line 0 holds the sample points, line 1 the reference line.
        drawn = panel.get_lines()
        drawn[0].set_markersize(5)
        drawn[0].set_markeredgewidth(0.3)
        drawn[0].set_markerfacecolor(colors[index])
        drawn[0].set_markeredgecolor('gray')
        drawn[1].set_color('gray')

        # Per-panel labels are blanked in favour of the shared ones.
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(heading)
        panel.set_xlim(-2, 2)

    plt.tight_layout()
    plt.savefig(f'qq-plot{suffix}', dpi=300)
Exemplo n.º 34
0
def qqplotbags(dataframe, bagType, method, regionList):
    """Save and show one normal Q-Q plot of ``bagType`` demand per region.

    For every region in ``regionList`` the rows whose ``region`` matches and
    whose ``type`` equals ``method`` are selected, the ``bagType`` column is
    sorted and plotted, and the figure is written below
    ``Data Analysis/BAGS/QQplotsbags/<method><bagType>/`` (created on demand).
    """
    for reg in regionList:
        selected = (dataframe['region'] == reg) & (dataframe['type'] == method)
        ordered = pd.DataFrame(sorted(dataframe.loc[selected][bagType].tolist()))
        sm.qqplot(ordered, line='s', alpha=0.3)

        # The same text is used as plot title and as file name.
        label = reg + " " + method + " " + bagType
        plt.title(label)

        out_dir = "Data Analysis/BAGS/QQplotsbags/" + method + bagType + "/"
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        plt.savefig(out_dir + label)
        plt.show()
Exemplo n.º 35
0
def q_q_plot(filepath, parameter):
    """Show a Q-Q plot of one CSV column against a fitted Student's t distribution.

    Args:
        filepath: path of the CSV file, read with ``pandas.read_csv``.
        parameter: name of the column to plot.

    Reading the file and selecting the column are not guarded and raise as
    usual. Plotting is best-effort: any ordinary error during plotting is
    reported with a short message instead of being propagated.
    """
    df = pandas.read_csv(filepath)
    array = df[parameter]
    try:
        sm.qqplot(array, scipy.stats.t, fit=True, line='45')
        plt.show()
    except Exception:
        # Catch Exception rather than everything, so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the program.
        print("There was an error.")
def fourinone(res, y_pred2, x):
    """Show a 2x2 'Residual Summary' figure.

    Panels, in order: normal Q-Q plot of ``res``; ``res`` against the fitted
    values ``y_pred2``; a 12-bin histogram of ``res``; ``res`` against ``x``
    (labelled as observation order). Returns ``None``.
    """
    def dashed_zero_line(axis, values):
        # Dashed black reference line at y = 0 spanning min..max of ``values``.
        zeros = np.array([0, 0])
        span = np.array([values.min(), values.max()])
        axis.plot(span, zeros, 'k--')

    fig = plt.figure()
    fig.suptitle('Residual Summary', fontsize=16)
    fig.set_facecolor('tan')

    # Panel 1: Q-Q plot (title set through pyplot on the current axes).
    qq_axis = fig.add_subplot(2, 2, 1)
    sm.qqplot(res, line='s', ax=qq_axis)
    plt.title('QQ plot')

    # Panel 2: residuals against fitted values.
    fitted_axis = fig.add_subplot(2, 2, 2)
    dashed_zero_line(fitted_axis, y_pred2)
    fitted_axis.plot(y_pred2, res, 'o', label="data")
    fitted_axis.set_ylabel('Residual')
    fitted_axis.set_xlabel('Fitted Value')
    fitted_axis.set_title('Residual vs Fitted Value')

    # Panel 3: histogram of the residuals, drawn on the current axes.
    hist_axis = fig.add_subplot(2, 2, 3)
    plt.hist(res, 12, edgecolor="k", alpha=1)
    hist_axis.set_ylabel('Frequency')
    hist_axis.set_xlabel('Residual')
    hist_axis.set_title('Histogram')

    # Panel 4: residuals against observation order.
    order_axis = fig.add_subplot(2, 2, 4)
    dashed_zero_line(order_axis, x)
    order_axis.plot(x, res, '-o', label="data")
    order_axis.set_ylabel('Residual')
    order_axis.set_xlabel('Observation Order')
    order_axis.set_title('Residual vs Observation Order')

    fig.tight_layout()
    fig.show()
Exemplo n.º 37
0
    def target_dist(self, bins=10, dist=stats.norm):
        """Show a histogram of ``self.target`` followed by its Q-Q plot.

        ``bins`` is forwarded to the histogram. ``dist`` is the reference
        distribution for the Q-Q plot; its parameters are fitted to the data
        and a 45-degree line is drawn. Each plot is shown on its own.
        """
        # Distribution histogram of the target variable.
        self.target.hist(bins=bins)
        plt.show()

        # Q-Q plot against the fitted reference distribution.
        sm.qqplot(self.target, dist, fit=True, line='45')
        plt.show()
Exemplo n.º 38
0
def plot_model(prediction, y, x):
    """Draw a 2x2 diagnostic grid for a model's predictions.

    Panels: histogram of the residuals ``y - prediction``; their Q-Q plot;
    a 2-D histogram of residuals against predictions; a 2-D histogram of true
    values against predictions. ``x`` is accepted but not used.
    """
    fig, grid = sns.plt.subplots(2, 2, figsize=(16, 10))
    hist_ax, qq_ax, resid_ax, truth_ax = grid.flatten()

    residuals = pd.Series(y - prediction, index=y.index, name='Residuals')

    residuals.hist(bins=40, ax=hist_ax)
    hist_ax.set_xlabel('Residuals')

    sm.qqplot(residuals, line='q', ax=qq_ax)
    qq_ax.set_xlabel('Residuals')

    # Both 2-D histograms share the horizontal axis and the binning option.
    shared = dict(hlabel='Predicted value', integer_aligned_bins=True)
    tbpd.hist2d(residuals, prediction, ax=resid_ax,
                vlabel='Residuals', **shared)
    tbpd.hist2d(y, prediction, ax=truth_ax,
                vlabel='True value', sqrt=True, **shared)

    fig.tight_layout()
Exemplo n.º 39
0
def hyp_test_pic2(symbol, from_t, to_t):
    """
    Draw a normality-check plot (method two): a normal Q-Q plot of log returns.

    The x axis shows theoretical quantiles and the y axis sample quantiles;
    points that do not lie on a straight line indicate that the data are not
    normally distributed.
    :param symbol: str
    :param from_t: str
    :param to_t: str
    :return: whatever ``plt.show()`` returns
    """
    # NOTE(review): the query is assembled by string interpolation; confirm
    # that symbol/from_t/to_t never come from untrusted input.
    sql = f"select * from stock_candles_day where symbol='{symbol}' and dt>='{from_t}' and dt<='{to_t}' order by symbol,series"
    candles = query_dt(sql)
    # Compute the logarithmic returns from column 'c'.
    log_returns = np.log(candles['c'].pct_change() + 1)
    _, axis = plt.subplots(1, 1, figsize=(10, 12))
    sm.qqplot(log_returns.dropna(), line='s', ax=axis)
    axis.set_title("hypothesis testing")  # a Chinese title causes errors
    return plt.show()
Exemplo n.º 40
0
def do_qqplot(data, data_type, d):
    """Save a Q-Q plot of ``data`` with a 45-degree line as SVG, then close it.

    The output path is hard-coded and built from ``data_type``, ``d`` and
    ``n``; ``n`` is not a parameter and must exist in an enclosing scope.
    """
    figure = sm.qqplot(data, line='45')
    target = f"/home/vmargot/Documents/Jussieu/new/{data_type}_{n}_d={d}_qqplot"
    figure.savefig(target, format="svg", dpi=300)
    plt.close(figure)
Exemplo n.º 41
0
def test_qqplot():
    """Smoke test: ``qqplot`` runs on OLS residuals of the Longley dataset."""
    longley = sm.datasets.longley.load()
    longley.exog = sm.add_constant(longley.exog)
    residuals = sm.OLS(longley.endog, longley.exog).fit().resid

    # Only checks that plotting does not raise; the figure is discarded.
    plt.close(sm.qqplot(residuals))
Exemplo n.º 42
0
def test_qqplot():
    """Smoke test: ``qqplot`` with a regression line runs on Longley OLS residuals."""
    longley = sm.datasets.longley.load()
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    residuals = sm.OLS(longley.endog, longley.exog).fit().resid

    # Only checks that plotting does not raise.
    sm.qqplot(residuals, line='r')

    plt.close('all')
Exemplo n.º 43
0
def plot(file_name,negative_control_gRNAs=None,wald_only=False):
    """Plot beta-value and p-value diagnostics for a ``*.gene_summary.txt`` file.

    The file is tab-separated with one header line. For each row, column 0 is
    the identifier checked against ``negative_control_gRNAs``, column 2 is
    parsed as the beta value, and column 4 as the permutation p-value followed
    by column 6 as the Wald p-value. With ``wald_only`` the Wald p-value is
    read from column 4 and no permutation p-values are collected.

    Written to the working directory, named after the file name without its
    ``.gene_summary.txt`` suffix: a uniform probability plot of the
    permutation p-values (skipped when ``wald_only``), a histogram of the
    beta values with ``abs(beta) < 3``, and a uniform Q-Q plot of the Wald
    p-values.

    Args:
        file_name: path containing ``.gene_summary.txt``.
        negative_control_gRNAs: optional container of identifiers whose rows
            are collected separately from the plotted data.
        wald_only: whether the file has Wald p-values only.

    Raises:
        ValueError: if ``file_name`` does not contain ``.gene_summary.txt``,
            or a numeric field cannot be parsed.
    """
    short_file_name=file_name[:file_name.index(".gene_summary.txt")]
    has_controls = negative_control_gRNAs is not None

    permute_p_value_list=[]
    wald_p_value_list=[]
    beta_value_list=[]

    negative_control_permute_p_value_list=[]
    negative_control_wald_p_value_list=[]
    negative_control_beta_value_list=[]

    # The context manager closes the file even if a row fails to parse.
    with open(file_name,'rb') as data:
        data.readline()  # skip the header line
        for line in data:
            elements=line.decode().strip().split("\t")
            if has_controls and elements[0] in negative_control_gRNAs:
                betas = negative_control_beta_value_list
                walds = negative_control_wald_p_value_list
                permutes = negative_control_permute_p_value_list
            else:
                betas = beta_value_list
                walds = wald_p_value_list
                permutes = permute_p_value_list

            betas.append(float(elements[2]))
            if wald_only:
                walds.append(float(elements[4]))
            else:
                permutes.append(float(elements[4]))
                walds.append(float(elements[6]))

    beta_value_list=[x for x in beta_value_list if not np.isnan(x) and abs(x)<3]
    wald_p_value_list=[x for x in wald_p_value_list if not np.isnan(x)]
    if has_controls:
        # Filter the negative-control lists from themselves; they were
        # previously overwritten with the already-filtered non-control values.
        # NOTE(review): these are not used in the visible part of the function.
        negative_control_beta_value_list=[
            x for x in negative_control_beta_value_list
            if not np.isnan(x) and abs(x)<3]
        negative_control_wald_p_value_list=[
            x for x in negative_control_wald_p_value_list if not np.isnan(x)]

    if not wald_only:
        permute_p_value_list=[x for x in permute_p_value_list if not np.isnan(x)]
        stats.probplot(permute_p_value_list, dist="uniform",plot=pylab)
        pylab.savefig("QQplot of permute_p value %s.png" %short_file_name)
        pylab.close()

    pylab.hist(beta_value_list,bins=1000)
    pylab.savefig("Hist of beta value %s.png" %short_file_name)
    pylab.close()

    fig=sm.qqplot(np.array(wald_p_value_list),stats.uniform,fit=True, line='45')
    pylab.xlim(0,1)
    pylab.ylim(0,1)
    pylab.savefig("QQplot of wald_p value %s.png" %short_file_name)
    pylab.close()
    '''
Exemplo n.º 44
0
 def qqPlot(self):
     """Save a Q-Q plot of this sample's probe signals as ``<name>_qqprob.png``.

     The signals are column ``2 + self.number`` of ``self.array.probes``.
     """
     import statsmodels.api as sm  # presumably also needs pandas and patsy
     import matplotlib.pyplot as plt

     signals = self.array.probes[:, 2 + self.number]  # TODO: add log2
     plt.figure(self.number)
     sm.qqplot(signals)
     plt.xlabel('Theoretical quantiles')
     plt.ylabel('Sample quantiles')
     plt.title('Probe intensities for %s' % (self.name))
     plt.savefig("%s_qqprob.png" % (self.name))
Exemplo n.º 45
0
def print_qqplot_and_residuals_plot(model):
    """Show a Q-Q plot of ``model.resid`` next to a plot of ``model.resid_pearson``.

    The two panels fill the first two slots of a 1x3 subplot row.
    """
    # First slot: Q-Q plot with a regression reference line.
    qq_axis = plt.subplot(1, 3, 1)
    sm.qqplot(model.resid, line = 'r', ax = qq_axis)

    # Second slot: Pearson residuals by observation, with a red zero line.
    plt.subplot(1, 3, 2)
    standardized = pandas.DataFrame(model.resid_pearson)
    plt.plot(standardized, 'o', ls = 'None')
    plt.axhline(y = 0, color = 'r')
    plt.ylabel('Standarized Residual')
    plt.xlabel('Observation Number')

    plt.show()
Exemplo n.º 46
0
    def tsplot(y, lags=None, figsize=(10, 8), style='bmh'):
        """Draw a five-panel time-series diagnostic figure for ``y``.

        Layout (3 rows x 2 columns): the series across the top row, ACF and
        PACF in the middle row, Q-Q plot and probability plot in the bottom
        row. ``y`` is wrapped in a ``pd.Series`` if it is not one already;
        ``lags`` is forwarded to both correlation plots. Returns ``None``.
        """
        series = y if isinstance(y, pd.Series) else pd.Series(y)

        with plt.style.context(style):
            plt.figure(figsize=figsize)
            grid = (3, 2)
            series_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
            acf_panel = plt.subplot2grid(grid, (1, 0))
            pacf_panel = plt.subplot2grid(grid, (1, 1))
            qq_panel = plt.subplot2grid(grid, (2, 0))
            prob_panel = plt.subplot2grid(grid, (2, 1))

            series.plot(ax=series_ax)
            series_ax.set_title('Time Series Analysis Plots')

            smt.graphics.plot_acf(series, lags=lags, ax=acf_panel, alpha=0.5)
            smt.graphics.plot_pacf(series, lags=lags, ax=pacf_panel, alpha=0.5)

            sm.qqplot(series, line='s', ax=qq_panel)
            qq_panel.set_title('QQ Plot')

            # Probability plot parameterised by the sample mean and std.
            scs.probplot(series, sparams=(series.mean(), series.std()),
                         plot=prob_panel)

            plt.tight_layout()
def mult_regression(wine_set):
    """Fit an OLS model of wine quality and show residual diagnostics.

    All columns are mean-centred except ``quality``, which keeps its original
    values. The regressor set depends on the number of rows: fewer than 2000
    rows selects the red-wine formula, otherwise the white-wine one. Prints
    the model summary, then shows a Q-Q plot of the residuals, a plot of the
    Pearson residuals and an influence (leverage) plot.
    """
    # Centre the quantitative predictors; restore the response afterwards.
    quality = wine_set['quality']
    wine_set = wine_set - wine_set.mean()
    wine_set['quality'] = quality

    print ("OLS multivariate regression model")

    # The formulas keep the regressors found most significant for each
    # wine set after a first run with all columns.
    if len(wine_set) < 2000:
        # red wine
        formula = "quality ~ volatile_acidity + chlorides + pH + sulphates + alcohol"
    else:
        # white wine
        formula = "quality ~ volatile_acidity + density + pH + sulphates + alcohol"

    fitted = smf.ols(formula=formula, data=wine_set).fit()
    print(fitted.summary())

    # Q-Q plot of the residuals to check normality.
    sm.qqplot(fitted.resid, line = 'r')
    plt.show()

    # Pearson residuals by observation, with a red zero line.
    pearson = pd.DataFrame(fitted.resid_pearson)
    plt.plot(pearson, 'o', ls = 'None')
    plt.axhline(y=0, color = 'r')
    plt.ylabel('Standardized redisual')
    plt.xlabel('Observation number')
    plt.show()

    # # diagnostic plots
    # figure1 = plt.figure(figsize=(12, 8))
    # figure1 = sm.graphics.plot_regress_exog(results1, "alcohol", fig = figure1)
    # plt.show()
    #
    # figure1 = plt.figure(figsize=(12, 8))
    # figure1 = sm.graphics.plot_regress_exog(results1, "sulphates", fig = figure1)
    # plt.show()

    # Leverage plot.
    sm.graphics.influence_plot(fitted, size=8)
    plt.show()
# Quadratic OLS model of lifeexpectancy on breastcancerper100th_c, fitted on sub1.
reg2 = smf.ols('lifeexpectancy ~ breastcancerper100th_c + I(breastcancerper100th_c**2)', data=sub1).fit()
print (reg2.summary())


####################################################################################
# EVALUATING MODEL FIT
####################################################################################

# adding alcohol consumption
# NOTE(review): the formula below adds breastcancerper100th_c a second time
# rather than an alcohol-consumption variable -- confirm the intended regressor.
reg3 = smf.ols('lifeexpectancy ~ breastcancerper100th_c + I(breastcancerper100th_c**2) + breastcancerper100th_c', 
               data=sub1).fit()
print (reg3.summary())


# Q-Q plot of the reg3 residuals to check normality
fig4=sm.qqplot(reg3.resid, line='r')

# simple plot of the Pearson residuals by observation, with a red zero line
stdres=pandas.DataFrame(reg3.resid_pearson)
plt.plot(stdres, 'o', ls='None')
l = plt.axhline(y=0, color='r')
plt.ylabel('Standardized Residual')
plt.xlabel('Observation Number')


# additional regression diagnostic plots for breastcancerper100th_c
fig2 = plt.figure(figsize=(12,8))
fig2 = sm.graphics.plot_regress_exog(reg3,  "breastcancerper100th_c", fig=fig2)

# leverage plot
fig3=sm.graphics.influence_plot(reg3, size=8)
Exemplo n.º 49
0
def plot_box_resids(fit_model, y_pred, subset=None):
    '''Show residual diagnostics for a fitted model and print normality tests.

    The residuals are standardized (mean removed, divided by their standard
    deviation) and then used for: a scatter plot against the predictions with
    a best-fit line, box plots per "turnout bucket", a Q-Q plot, four
    normality tests (Shapiro-Wilk, D'Agostino-Pearson, Kolmogorov-Smirnov,
    Anderson-Darling), a histogram with a normal curve, and a histogram plus
    count table of the buckets.

    Args:
        fit_model: object exposing the residuals as ``resid``.
        y_pred: predicted values. When their minimum is below -1 they are
            exponentiated before bucketing and plotting (presumably they are
            then on a log scale -- confirm with the caller).
        subset: optional fraction; when truthy, that share of the
            standardized residuals is sampled without replacement.
    '''
    resid = fit_model.resid
    # Standardize with the standard deviation. Dividing by the variance, as
    # before, does not give unit-scale residuals, so the 'norm' KS test and
    # the Q-Q plot compared against the wrong scale.
    s_resid = (resid - np.mean(resid)) / np.std(resid)
    if subset:
        # size must be an integer; math.floor returns a float on Python 2.
        s_resid = np.random.choice(s_resid,
                                   replace=False,
                                   size=int(math.floor(len(s_resid) * subset)))
    df = pd.DataFrame(s_resid, columns=['resids'])
    temp_df = pd.DataFrame(y_pred, columns=['target'])
    df = df.join(temp_df)

    if min(y_pred) < -1:
        df['turnout_bucket'] = df['target']\
        .apply(lambda x: int(math.floor(10 * np.exp(x))))
        y = df['target'].apply(lambda x: np.exp(x))
    else:
        df['turnout_bucket'] = df['target']\
        .apply(lambda x: int(math.floor(10 * x)))
        y = df['target']

    posit = sorted(df['turnout_bucket'].unique())

    plt.scatter(y, s_resid, alpha=.2)
    # Fit once and reuse the coefficients for the line and the printed slope.
    slope, intercept = np.polyfit(y, s_resid, 1)
    plt.plot(y, np.poly1d([slope, intercept])(y))
    plt.title('Studentized Residuals vs Prediction')
    plt.xlabel('Predicted Value')
    plt.ylabel('Studentized Residual')
    print('Slope of best fit line: %s' % slope)
    plt.show()

    df[['resids', 'turnout_bucket']]\
        .boxplot(by='turnout_bucket', positions=posit, widths=.5)
    plt.title('Residuals versus Turnout')
    plt.xlabel('Turnout Bucket')
    plt.ylabel('Studentized Residuals')
    plt.suptitle('')  # drop the automatic "grouped by" super-title
    plt.show()

    sm.qqplot(s_resid, line='s')
    plt.title('Q-Q Plot')
    plt.show()

    w, p_val = shapiro(s_resid)
    print('Shapiro-Wilk P_val is %s, larger the better' % p_val)

    k, p_val = normaltest(s_resid)
    print('D’Agostino and Pearson’s P_val is %s, larger the better' % p_val)

    k, p_val = kstest(s_resid, 'norm')
    print('Kolmogorov–Smirnov P_val is %s, larger the better' % p_val)

    A, critical, sig = anderson(s_resid)
    print('Anderson-Darling A2 is %s, smaller the better' % A)
    print(critical)
    print(sig)

    # NOTE(review): hist(normed=...) and mlab.normpdf are not available in
    # recent Matplotlib releases -- confirm the targeted version.
    n, bins, patches = plt.hist(s_resid, 75, normed=1)
    mu = np.mean(s_resid)
    sigma = np.std(s_resid)
    plt.plot(bins, mlab.normpdf(bins, mu, sigma))
    plt.title('Residuals versus a Normal Dist')
    plt.show()

    df['turnout_bucket'].hist(bins=posit, align='left', color='b')
    plt.title('Histogram of Turnout Bucket')
    plt.ylabel('Count')
    plt.xlim(-.5, - .5 + len(posit))

    temp = df[['resids', 'turnout_bucket']].groupby('turnout_bucket').count()
    temp.columns = ['Count']
    plt.show()
    print(temp)
 def plot(self):
     """Show a Q-Q plot of ``self.data`` with fitted parameters and a standardized line."""
     qq_options = dict(fit=True, line='s')
     sm.qqplot(self.data, **qq_options)
     plt.show()
def azureml_main(BikeShare):
    """Plot residual diagnostics for scored bike-demand data.

    ``BikeShare`` is modified in place: it is sorted by ``dayCount`` and gets
    a ``Resids`` column (``Scored Label Mean`` minus ``cnt``). The function
    then draws a residual scatter plot, actual and predicted demand over time
    for selected hours, residual box plots by ``hr`` and ``xformWorkHr``, a
    normal Q-Q plot and a histogram of the residuals. Figures are written to
    PNG files only when the local ``Azure`` flag is true.

    Args:
        BikeShare: pandas DataFrame with at least the columns ``dayCount``,
            ``cnt``, ``Scored Label Mean``, ``hr`` and ``xformWorkHr``.

    Returns:
        The same DataFrame, sorted and with the ``Resids`` column added.
    """
    import matplotlib
    matplotlib.use('agg')  # Set backend
    matplotlib.rcParams.update({'font.size': 20})

    import matplotlib.pyplot as plt
    import statsmodels.api as sm

    Azure = False

    ## Sort the data frame based on the dayCount.
    ## DataFrame.sort was removed from pandas; prefer sort_values and fall
    ## back to sort only on old installations that lack it.
    if hasattr(BikeShare, 'sort_values'):
        BikeShare.sort_values('dayCount', axis = 0, inplace = True)
    else:
        BikeShare.sort('dayCount', axis = 0, inplace = True)

    ## Compute the residuals.
    BikeShare['Resids'] = BikeShare['Scored Label Mean'] - BikeShare['cnt']

    ## Plot the residuals vs the label, the count of rented bikes.
    fig = plt.figure(figsize=(8, 6))
    fig.clf()
    ax = fig.gca()
    BikeShare.plot(kind = 'scatter', x = 'cnt', y = 'Resids',
                   alpha = 0.05, color = 'red', ax = ax)
    plt.xlabel("Bike demand")
    plt.ylabel("Residual")
    plt.title("Residuals vs demand")
    plt.show()
    if Azure:
        fig.savefig('scatter1.png')

    ## Make time series plots of actual bike demand and
    ## predicted demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        hourly = BikeShare[BikeShare.hr == tm]
        hourly.plot(kind = 'line', x = 'dayCount', y = 'cnt', ax = ax)
        hourly.plot(kind = 'line', x = 'dayCount', y = 'Scored Label Mean',
                    color = 'red', ax = ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()
        if Azure:
            fig.savefig('tsplot' + str(tm) + '.png')

    ## Boxplots for the residuals by hour and transformed hour.
    labels = ["Box plots of residuals by hour of the day \n\n",
              "Box plots of residuals by transformed hour of the day \n\n"]
    xAxes = ["hr", "xformWorkHr"]
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(12, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column = ['Resids'], by = [xaxs], ax = ax)
        plt.xlabel('')
        plt.ylabel('Residuals')
        plt.show()
        if Azure:
            fig.savefig('boxplot' + xaxs + '.png')

    ## QQ Normal plot of residuals
    fig = plt.figure(figsize = (6,6))
    fig.clf()
    ax = fig.gca()
    sm.qqplot(BikeShare['Resids'], ax = ax)
    ax.set_title('QQ Normal plot of residuals')
    if Azure:
        # The same figure is saved under both names.
        fig.savefig('QQ.png')
        fig.savefig('QQ1.png')

    ## Histograms of the residuals
    fig = plt.figure(figsize = (8,6))
    fig.clf()
    ax = fig.gca()
    ## Series.as_matrix was removed from pandas; .values yields the same array.
    ax.hist(BikeShare['Resids'].values, bins = 40)
    ax.set_xlabel("Residuals")
    ax.set_ylabel("Density")
    ax.set_title("Histogram of residuals")
    if Azure:
        fig.savefig('hist.png')

    return BikeShare
# Box plot of column 0 of comb.
comb.boxplot(column=[0])


## Q-Q Plot 

##### In statistics, a Q–Q plot ("Q" stands for quantile) is a probability plot, which is a graphical method for comparing two probability distributions by plotting their quantiles against each other. If the two distributions being compared are similar, the points in the Q–Q plot will approximately lie on the line y = x. If the distributions are linearly related, the points in the Q–Q plot will approximately lie on a line, but not necessarily on the line y = x.

# In[266]:

import statsmodels.api as sm


# In[269]:

# Q-Q plot of column 1 of comb with a 45-degree reference line.
sm.qqplot(comb[1],line='45')


# In[275]:

os.getcwd()


# In[287]:

# Save one histogram per column 0..39 as histograms/histogram<i>.jpg.
for i in np.arange(0,40,1):
    # pieces1 is a tuple of path fragments; it is joined into the file name below.
    pieces1='histograms/histogram',format(i),'.jpg'
    hist=comb[i].hist()
    fig = hist.get_figure()
    fig.savefig(''.join(pieces1))
    # Clear the figure so the next histogram starts from an empty canvas.
    fig.clear()
Exemplo n.º 53
0
import numpy as np
from scipy.interpolate import interp1d
import sys

def load():
    """Read ``Analysis.xls`` from the working directory without a header row."""
    frame = pd.read_excel("Analysis.xls", header=None)
    return frame
    
data = load() 
# Print summary statistics for columns 0-16, ignoring missing values.
for i in range(17):
    print data[i].dropna().describe()


# For columns 3-10 show a histogram, a Q-Q plot and a box plot in turn.
for i in range(3,11):
    plt.hist(data[i].dropna())
    plt.show()
    # NOTE(review): unlike the other two plots, the Q-Q plot receives the
    # column without dropna() -- confirm that missing values are intended here.
    sm.qqplot(data[i], line='q')
    plt.show()
    plt.boxplot(data[i].dropna())
    plt.show()

# For columns 11-17 show side-by-side box plots of the rows whose column 2
# is 'high', 'medium' and 'low'.
for i in range(11,18):
    h = data[data[2].isin(['high'])][i]
    m = data[data[2].isin(['medium'])][i]
    l = data[data[2].isin(['low'])][i]
    d = [np.asarray(h),np.asarray(m),np.asarray(l)]
    plt.boxplot(d)
    plt.show()

def getmaxcorr(dt,index):
    # NOTE(review): the body appears to be truncated in this excerpt. As
    # shown it only initialises two locals and implicitly returns None.
    # The local name `max` shadows the builtin within this function.
    max = -1.0
    pos = 0;
sig_marks = {}

# Iterate over every (speed, event) group key.
for speed, event in groups.groups.keys():

    group = groups.get_group((speed, event))

    # Column names and their one-sample t-test results for this group.
    index = []
    t_vals = []
    p_vals = []

    for col in group.columns:

        # Only columns whose name starts with 'k_' are analysed.
        if col.startswith('k_'):

            # plot the quantiles plot to see if the data is normally distributed
            fig = qqplot(group[col], line='45')
            # The figure is saved as <figures_dir>/quantile-plots/<event>/<structure>/<speed>/<col>.png;
            # `structure` is defined outside this excerpt.
            plot_dir = os.path.join(PATHS['figures_dir'], 'quantile-plots',
                                    event, structure, '{:1.1f}'.format(speed))
            plot_dir = utils.mkdir(plot_dir)
            fig.savefig(os.path.join(plot_dir, '{}.png'.format(col)))
            plt.close(fig)

            # compute the t statistic to see if the value is significantly
            # different than zero
            t_stat, p_val = ttest_1samp(group[col], 0.0)

            index.append(col)
            t_vals.append(t_stat)
            p_vals.append(p_val)

    #mark = np.zeros((num_schedules, num_sensors, num_actuators), dtype=bool)
Exemplo n.º 55
0
def plot_qq(datei, qq_Plot, fit_qq_Plot, vergleich = scipy.stats.invgauss):
    with open(datei, 'rb') as csvfile:
        myreader = csv.reader(csvfile, delimiter = ";",quoting=csv.QUOTE_NONE)
        liste = []
        # Erstelle Liste wie oben
        for row in myreader:
            unterliste = []
            for r in row:
                r2 = float(r)
                unterliste.append(r2)
            liste.append(unterliste)

    # Und einen qq-Plot erstellen, evtl Parameter zur vergleichsfunktion müssen
    # per Hand eingestellt werden
    if qq_Plot:
        print "erstelle qq-Plot",
        fig = plt.figure()
        ax = fig.add_subplot(221)
        sm.qqplot (np.array(liste[0]), vergleich, distargs= (0.005,),  line = 'r', ax =ax)
        #txt = ax.text(-1.8, 3500, str(params[0]) ,verticalalignment='top')
        #txt.set_bbox(dict(facecolor='k', alpha=0.1))
        print "nr2",
        ax = fig.add_subplot(222)
        sm.qqplot (np.array(liste[1]), vergleich, distargs= (0.005,),  line = 'r', ax =ax)
        #txt = ax.text(-1.8, 3500, str(params[1]) ,verticalalignment='top')
        #txt.set_bbox(dict(facecolor='k', alpha=0.1))
        print "nr3",
        ax = fig.add_subplot(223)
        sm.qqplot (np.array(liste[2]), vergleich, distargs= (0.005,),  line = 'r', ax =ax)
        #txt = ax.text(-1.8, 3500, str(params[2]) ,verticalalignment='top')
        #txt.set_bbox(dict(facecolor='k', alpha=0.1))
        print "nr4",
        ax = fig.add_subplot(224)
        sm.qqplot (np.array(liste[3]), vergleich, distargs= (0.005,),  line = 'r', ax =ax)
        #txt = ax.text(-1.8, 3500, str(params[3]) ,verticalalignment='top')
        #txt.set_bbox(dict(facecolor='k', alpha=0.1))
        print "qqplot erstellt"

    # qq-Plot mit automatischem fit zur Vergleichsfunktion
    if fit_qq_Plot:
        print "erstelle fit-qq-plot", 
        fig = plt.figure()
        ax = fig.add_subplot(221)
        sm.qqplot (np.array(liste[0]), vergleich, fit = True,  line = 'r', ax =ax)
        #txt = ax.text(-1.8, 3500, str(params[0]) ,verticalalignment='top')
        #txt.set_bbox(dict(facecolor='k', alpha=0.1))
        print "nr2",
        ax = fig.add_subplot(222)
        sm.qqplot (np.array(liste[1]), vergleich, fit = True,  line = 'r', ax =ax)
        #txt = ax.text(-1.8, 3500, str(params[1]) ,verticalalignment='top')
        #txt.set_bbox(dict(facecolor='k', alpha=0.1))
        print "nr3",
        ax = fig.add_subplot(223)
        sm.qqplot (np.array(liste[2]), vergleich, fit = True,  line = 'r', ax =ax)
        #txt = ax.text(-1.8, 3500, str(params[2]) ,verticalalignment='top')
        #txt.set_bbox(dict(facecolor='k', alpha=0.1))
        print "nr4",
        ax = fig.add_subplot(224)
        sm.qqplot (np.array(liste[3]), vergleich, fit = True,  line = 'r', ax =ax)
        #txt = ax.text(-1.8, 3500, str(params[3]) ,verticalalignment='top')
        #txt.set_bbox(dict(facecolor='k', alpha=0.1))
        print "qqplot erstellt"

    plt.show()
# Adding employment rate to the model
print ("Association Between Urban Rate, Life Expectancy, Income, CO2 Emissions, Alcohol, Employment and Breast Cancers Rate")
reg6 = smf.ols('breastcancer ~ urbanrate_c + lifeexpect_c + co2emissions_c + income_c + alcconsumption_c + employrate_c', data=gapmind1).fit()
print (reg6.summary())
#%%
#%%
# Keep only the significant variables in the model
print ("Association Between Income, Alcohol and Breast Cancers Rate")
reg7 = smf.ols('breastcancer ~ income_c + alcconsumption_c', data=gapmind1).fit()
print (reg7.summary())
####################################################################################
# EVALUATING MODEL FIT
####################################################################################
#%%
# Q-Q plot of the reg7 residuals to check normality
fig1=sm.qqplot(reg7.resid, line='r')
#%%
# simple plot of the Pearson residuals by observation, with a red zero line
stdres=pandas.DataFrame(reg7.resid_pearson)
fig2 = plt.plot(stdres, 'o', ls='None')
l = plt.axhline(y=0, color='r')
plt.ylabel('Standardized Residual')
plt.xlabel('Observation Number')
# fig2 holds the return value of plt.plot, which is what gets printed here.
print (fig2)
#%%
"""
# additional regression diagnostic plots
# For alcohol consumption 
fig3 = plt.figure(figsize=(12,8)) 
fig3 = sm.graphics.plot_regress_exog(reg7, 'alcconsumption_c', fig=fig3)
#%%
Exemplo n.º 57
0
from scipy import stats
import matplotlib.pyplot as plt

# Fit an F distribution to the sample and print the fitted parameters.
params = stats.f.fit(sample)
print(params)
# Probability plot of the sample against the fitted F distribution.
fig = plt.figure(8, figsize=(10, 10))
ax = fig.add_subplot(111)
res = stats.probplot(sample, dist=stats.f, sparams=params, plot=ax)
plt.show()


# In[13]:

import statsmodels.api as sm

# Q-Q plots of the sample against three candidate distributions, each with
# its parameters fitted to the sample and a 45-degree reference line.
fig = sm.qqplot(sample, stats.genextreme, fit=True, line='45')
plt.show()

fig = sm.qqplot(sample, stats.lognorm, fit=True, line='45')
plt.show()

fig = sm.qqplot(sample, stats.f, fit=True, line='45')
plt.show()


# In[16]:

# Computes the Kolmogorov-Smirnov statistic on 2 samples.

#     This is a two-sided test for the null hypothesis that 2 independent samples
#     are drawn from the same continuous distribution.
def itcrossval(kf, X, Y):
    """Run the cross-validation iterations and return the mean fold MSE.

    The data are split into ``kf`` folds. For each fold a linear regression
    without intercept is fitted on the training part and its mean squared
    error is measured on the validation part; the fold errors are summed and
    divided by ``kf``.
    """
    folds = cross_validation.KFold(len(X), kf)
    total_error = 0
    for train_idx, val_idx in folds:
        model = lm.LinearRegression(fit_intercept = False)
        model.fit(X[train_idx], Y[train_idx])
        predictions = model.predict(X[val_idx])
        total_error += mean_squared_error(Y[val_idx], predictions)
    return total_error / kf

# Cross-validation with k=5
print "mse para training con k=5: ", itcrossval(5, Xm, ym)

# Cross-validation with k=10
print "mse para training con k=10: ", itcrossval(10, Xm, ym)


######## Question (j) ############################################################

# Compute the prediction error over the whole training set
errorp = ytrain - yhat_train
print "Error de prediccion sobre training set: \n", errorp

# Draw a quantile-quantile plot
# NOTE(review): the plot uses yhat_train - ytrain, the opposite sign of
# errorp above -- confirm which orientation is intended.
graf = sm.qqplot(yhat_train - ytrain, fit=True, line='45') 
plt.show() 
Exemplo n.º 59
0
 def test_qqplot(self, close_figures):
     """Smoke test: ``qqplot`` with a regression line runs on ``self.res``.

     ``close_figures`` is presumably a fixture that cleans up figures; it is
     not used in the body.
     """
     residuals = self.res
     sm.qqplot(residuals, line='r')
Exemplo n.º 60
0
 def test_qqplot_pltkwargs(self, close_figures):
     """Smoke test: ``qqplot`` accepts extra marker styling keyword arguments.

     ``close_figures`` is presumably a fixture that cleans up figures; it is
     not used in the body.
     """
     marker_style = dict(marker='d',
                         markerfacecolor='cornflowerblue',
                         markeredgecolor='white',
                         alpha=0.5)
     sm.qqplot(self.res, line='r', **marker_style)