def display_qqplot(name: str, df: pd.DataFrame, target: str):
    """Show QQ plot for data against normal quantiles

    Parameters
    ----------
    name : str
        Stock ticker
    df : pd.DataFrame
        Dataframe
    target : str
        Column in data to look at
    """
    data = df[target]

    # Statsmodels has a UserWarning for marker kwarg-- which we dont use.
    # The filter is installed inside catch_warnings() so that it is undone
    # when plotting finishes instead of silencing every UserWarning for the
    # rest of the process.
    with warnings.catch_warnings():
        warnings.filterwarnings(category=UserWarning, action="ignore")

        fig, ax = plt.subplots(figsize=plot_autoscale(), dpi=PLOT_DPI)
        qqplot(data, stats.distributions.norm, fit=True, line="45", ax=ax)

        ax.set_title(f"Q-Q plot for {name} {target}")
        ax.set_ylabel("Sample quantiles")
        ax.set_xlabel("Theoretical quantiles")
        ax.grid(True)

        if gtff.USE_ION:
            plt.ion()

        fig.tight_layout(pad=1)
        plt.show()

    print("")
def _plotQQPlotOtherBars(self, saveDirectory='', showIt=False): # Plot the quantile plot of each asset in the portfolio: for eachAssetName, eachAssetDataFrame in self.ALTERNATIVE_BARS.items(): logger.warning(f'[{self._plotQQPlotOtherBars.__name__}] - Looping for asset <{eachAssetName}>...') # Plot the QQplot: qqplot(eachAssetDataFrame.Returns.values, line='s') # Add more variables: plt.grid(linestyle='dotted') plt.xlabel('Theoretical Quantiles', horizontalalignment='center', verticalalignment='center', fontsize=14, labelpad=20) plt.ylabel('Sample Quantiles', horizontalalignment='center', verticalalignment='center', fontsize=14, labelpad=20) plt.title(f'Asset: {eachAssetName} -- Quantile-Quantile (QQ) Plot') plt.subplots_adjust(left=0.09, bottom=0.20, right=0.94, top=0.90, wspace=0.2, hspace=0) # In PNG: plt.savefig(saveDirectory + f'/QQPlot_{eachAssetName}.png') # Show it: if showIt: plt.show() ######################### PLOTS #########################
def normal(x):
    """Print two normality-test p-values for ``x`` and show its Q-Q plot.

    Prints the p-value of ``stats.shapiro`` and of ``stats.jarque_bera``,
    draws a Q-Q plot with a standardized reference line and always returns 0.
    """
    shapiro_p = stats.shapiro(x)[1]
    print('Shapiro-Wilk p =', shapiro_p)

    jarque_bera_p = stats.jarque_bera(x)[1]
    print('Jarque-Bera p =', jarque_bera_p)

    print('QQ plot')
    qqplot(x, line='s')
    pyplot.show()
    return 0
def test_distribution(dataframe, t=None):
    """Plot and test the standardized, NaN-free values of ``dataframe``.

    The values are flattened, NaNs are dropped and the remainder is centred
    and scaled by its standard deviation. A 15-bin histogram (with ``t`` as
    figure title) and a Q-Q plot are shown, then the ``normaltest`` p-value is
    printed with a verdict at the 5% level.

    Side effect: the module-level name ``arr`` is rebound to the NaN-free
    flattened values.
    """
    global arr
    alpha = 0.05

    def print_res(p, alpha):
        # Report the p-value and the decision at the given significance level.
        print('p = ', p)
        if np.isnan(p):
            print('p is null')
            return
        if p < alpha:
            print(
                "The null hypothesis of normality can be rejected --> NOT NORMAL"
            )
        else:
            print(
                "The null hypothesis of normality cannot be rejected --> LIKELY NORMAL"
            )

    arr = dataframe.values.flatten()
    arr = arr[~np.isnan(arr)]

    centre = np.mean(arr)
    spread = np.std(arr)
    corrected = (arr - centre) / spread

    plt.hist(corrected, bins=15)
    plt.suptitle(t)
    plt.show()

    qqplot(corrected)
    plt.show()

    # test raw values
    print("Raw Data:")
    _, p_value = normaltest(corrected)
    print_res(p_value, alpha)
def plot(data):
    """Histogram ``data`` and report three normality tests, then show plots.

    Draws a 50-bin histogram, prints the mean and standard deviation, prints
    the verdicts of ``shapiro``, ``normaltest`` and ``anderson``, overlays
    ``norm.pdf`` evaluated at the bin edges, and finally opens a Q-Q plot.
    """
    values = np.array(data)

    _, bins, _ = plt.hist(values, 50)
    mu = np.mean(data)
    sigma = np.std(data)
    print("Mean: {}, std: {}".format(mu, sigma))

    # Shapiro test
    _, p = shapiro(values)
    if p > 0.05:
        print("Shapiro: Data is normally distributed")
    else:
        print("Shapiro: Data is NOT normally distributed, p-value: {}".format(p))

    _, p = normaltest(values)
    if p > 0.05:
        print("D'Agostino: Data is normally distributed")
    else:
        print("D'Agostino: Data is NOT normally distributed, p-value: {}".format(p))

    # Anderson reports one critical value per significance level.
    result = anderson(values)
    for sl, cv in zip(result.significance_level, result.critical_values):
        if result.statistic < cv:
            print('Anderson: %.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
        else:
            print('Anderson: %.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))

    density = norm.pdf(bins, mu, sigma)
    plt.plot(bins, density)
    qqplot(values, line='s')
    plt.show()
def _plot_data(self):
    """Draw diagnostic figures for ``self.data`` and return ``self``.

    For each of "MA" and "STD", every column whose name contains that tag is
    plotted against ``DateTime_Stamp`` and shown. Then a histogram and a Q-Q
    plot of ``Y_label`` are drawn, followed by ``OPEN_Bid`` together with all
    columns whose name contains "Band".
    """
    frame = self.data

    for plot in ("MA", "STD"):
        plt.figure()
        matching = [name for name in frame.columns if str(plot) in name]
        for name in matching:
            plt.plot(frame['DateTime_Stamp'], frame[name])
        plt.title(plot)
        plt.xticks(rotation=90)
        plt.subplots_adjust(bottom=0.2)
        plt.show()

    plt.figure()
    bin_edges = np.arange(0.00, 1.01, 0.05)
    plt.hist(frame['Y_label'], bins=bin_edges)
    qqplot(frame['Y_label'], line='s')

    plt.figure()
    plt.plot(frame['DateTime_Stamp'], frame["OPEN_Bid"])
    band_columns = [name for name in frame.columns if "Band" in name]
    for name in band_columns:
        plt.plot(frame['DateTime_Stamp'], frame[name])

    return self
def qq_plot(self, column=None):
    """Show a Q-Q plot (regression reference line) of one column of ``self.data``.

    ``column`` selects the column; when it is falsy, ``self.data_column`` is
    used instead.
    """
    if not column:
        column = self.data_column
    qqplot(self.data[column], line='r')
    plt.show()
def qq_plot(data):
    """Show a Q-Q plot of ``data`` with a standardized reference line."""
    # qqplot is handed a numpy array built from the python list
    samples = np.array(data)
    qqplot(samples, line='s')
    pyplot.show()
def plot(label):
    """Show a Q-Q plot of the first column of a saved CSV.

    Reads ``~/Dropbox/Fundamental Market Research/QQPlots/<label>.csv`` and
    plots its first column against normal quantiles with a standardized
    reference line.

    Parameters
    ----------
    label : str
        File name (without ``.csv``) inside the QQPlots folder.
    """
    QQdata = pd.read_csv(
        f"~/Dropbox/Fundamental Market Research/QQPlots/{label}.csv")

    # Select the first column positionally. The previous code looped the row
    # index up to numpy.size(values) -- rows * columns -- which raises
    # IndexError as soon as the file has more than one column.
    newdata = numpy.asarray(QQdata.iloc[:, 0])

    qqplot(newdata, line='s')
    pyplot.show()
def qq_plot():
    """Show a Q-Q plot of 100 seeded pseudo-random observations (5 * randn + 50)."""
    # fix the generator state so the sample is reproducible
    seed(1)
    # univariate observations
    standard_draws = randn(100)
    data = 5 * standard_draws + 50
    # q-q plot against the normal distribution
    qqplot(data, line='s')
    pyplot.show()
def AR_model(X, data_in, lag, i):
    """Fit an AR model to ``data_in``, print diagnostics and save five figures.

    Figures are written with ``plt.savefig`` using names suffixed by ``i``:
    the fit on the input scale, the fit after ``inverse_difference``, a
    residual scatter plot, a residual Q-Q plot and a residual histogram.
    Nothing is returned.

    Parameters
    ----------
    X : indexable
        Passed element-wise to ``inverse_difference`` together with the input
        and fitted values (presumably the undifferenced series -- confirm).
    data_in : array-like
        Series the AR model is fitted on.
    lag : int
        Passed as ``maxlag`` to the fit.
    i : value convertible with ``str``
        Suffix appended to every saved figure name.
    """
    model = AR(data_in)
    results_AR = model.fit(maxlag=lag, disp=0)
    AR_data = results_AR.fittedvalues
    # NOTE(review): the offset 3 is hard-coded; it looks like it should follow
    # the fitted lag order so that ``act`` lines up with ``AR_data`` -- confirm.
    act = data_in[3:]
    print("Parameters of Autoregressive Model AR(%d) are:" % lag)
    print(results_AR.params)

    # Figure 1: actual vs. fitted values on the scale of ``data_in``.
    plt.figure()
    plt.plot(act, color='blue', label='Actual Value')
    plt.plot(results_AR.fittedvalues, color='red', label="Predicted Value")
    plt.legend(loc='best')
    plt.xlabel("Time")
    plt.ylabel("Time series values")
    # NOTE(review): this title is replaced by the plt.title call on the next
    # line, so the RMSE never appears on the saved figure.
    plt.title('AR(' + str(lag) + ")" + "Model with RMSE:" +
              str(np.sqrt((np.sum(np.square(AR_data - act))) / len(act))))
    plt.title("AR Fit (not scaled)")
    plt.savefig("AR fit not scaled" + str(i))
    #plt.show()

    # Map input and fitted values back through inverse_difference. The
    # comprehension variable is also named ``i``; in Python 3 it is local to
    # the comprehension and does not clobber the parameter.
    inverted_in = [
        inverse_difference(X[i], data_in[i]) for i in range(len(AR_data))
    ]
    inverted_AR = [
        inverse_difference(X[i], AR_data[i]) for i in range(len(AR_data))
    ]

    # Figure 2: the same comparison after inverse_difference.
    plt.figure()
    plt.plot(inverted_in, color='red', label="actual value")
    plt.plot(inverted_AR, color='blue', label="predicted value")
    plt.legend(loc='upper left')
    plt.title(
        "Comparison of predicted and actual values for Autoregression model, lag"
        + str(lag))
    plt.savefig(" AR Fit Final" + str(i))
    print("RMSE on the Data is:" +
          str(np.sqrt((np.sum(np.square(AR_data - act))) / len(act))))

    # Figures 3-5: residual diagnostics.
    residuals = results_AR.resid
    plt.figure()
    plt.title("Residual Scatter Plot")
    plt.scatter(AR_data, residuals)
    plt.savefig("residuals" + str(i))
    #plt.show()
    # NOTE(review): qqplot is called without ``ax``, so it presumably draws on
    # a figure of its own and the figure opened here stays empty -- confirm.
    plt.figure()
    qqplot(residuals)
    plt.title("Residual Q-Q Plot")
    plt.savefig("QQ" + str(i))
    plt.figure()
    plt.hist(residuals)
    plt.title("Residual Histogram")
    plt.savefig("Hist" + str(i))

    # Normality test on the residuals; ``alpha`` is assigned but never used.
    k2, p = stats.normaltest(residuals)
    alpha = 0.001
    print("Chi-Square Test : k2 = %.4f p = %.4f" % (k2, p))
    print("two sided chi squared probability :" + str(p))
def newDeath(request):
    """Render the statistics page for the ``new_deaths`` column of ``df``.

    Builds three images through ``renderMatplotlib`` (box plot, KDE plus
    histogram, Q-Q plot) and three dictionaries of summary statistics, then
    renders ``components/newDead.html`` with them under the ``data`` key.

    Parameters
    ----------
    request
        Request object, passed straight to ``render``.

    Returns
    -------
    Whatever ``render`` returns for the template.
    """
    # Box plot.
    plt.clf()
    boxPlot('new_deaths')
    uri = renderMatplotlib(plt)

    # Descriptive statistics (keys are consumed by the template; unchanged).
    describe = df['new_deaths'].describe()
    describes = {
        "count": describe['count'],
        "mean": describe['mean'],
        "std": describe['std'],
        "min": describe['min'],
        "haiNam": describe['25%'],
        "namMuoi": describe['50%'],
        "bayNam": describe['75%'],
        "max": describe['max'],
        "median": df['new_deaths'].median(),
        "mode": df['new_deaths'].mode()
    }

    # Dispersion measures.
    doPhanTan = {
        "IQR": interquartile_range('new_deaths'),
        "var": df['new_deaths'].var(),
        "std": df['new_deaths'].std(),
    }

    # Shape measures.
    mucDo = {
        "knewness": df['new_deaths'].skew(),
        "kurtosis": df['new_deaths'].kurtosis()
    }

    # Density estimate overlaid on a normalized histogram.
    plt.clf()
    fig, ax = plt.subplots()
    df['new_deaths'].plot.kde(ax=ax, legend=False, title='Histogram new_deaths')
    df['new_deaths'].plot.hist(density=True, ax=ax, color='red')
    ax.set_ylabel('new_deaths')
    ax.grid(axis='y')
    ax.set_facecolor('#d8dcd0')
    hist = renderMatplotlib(plt)

    # Q-Q plot of the observed values. The previous code plotted
    # randn(len(x)) -- fresh random numbers on every request -- so the chart
    # never reflected new_deaths. Missing values are dropped before plotting.
    plt.clf()
    x = df['new_deaths']
    qqplot(x.dropna(), line='s')
    plt.title('Biểu đồ phân phối chuẩn của new_deaths')
    kiemDinh = renderMatplotlib(plt)

    data = {
        "uri": uri,
        "describe": describes,
        "doPhanTan": doPhanTan,
        "mucDo": mucDo,
        "hist": hist,
        "kiemDinh": kiemDinh
    }
    return render(request, 'components/newDead.html', {"data": data})
def normality_test(ts):  # Completed
    """
    Performs a series of hypothesis tests about normality on the time series
    data distribution and shows a quantile plot of the data (qqplot).

    The series is converted with ``ts_to_list`` for the tests and converted
    back with ``list_to_ts`` before being returned.

    Tests run: ``shapiro``, ``normaltest`` (D'Agostino-Pearson, assuming the
    scipy.stats function is the one imported) and ``anderson``.

    Note: these tests can still produce inconsistencies if the data set
    (size) is too small to detect non-normality.
    """
    ts = ts_to_list(ts)
    data = np.array(ts)

    # Shapiro-Wilk test.
    # Rejects the hypothesis of normality when the p-value is below 0.05,
    # i.e. not from a normal distribution.
    stat_sw, p_sw = shapiro(data)  # (1) Normality test

    # D'Agostino-Pearson omnibus test (normaltest): combines skewness and
    # kurtosis. This was previously labelled Kolmogorov-Smirnov, which is not
    # the test being run here.
    stat_k2, p_k2 = normaltest(data)  # (2) Normality test

    # Anderson-Darling: rather than a p-value, we're given an array of
    # critical values at which the hypothesis can be rejected.
    stat_ad = anderson(data)  # (3) Normality test

    # Print results of all 3 tests
    print(f'\nShapiro-Wilk Statistic Test Result: {stat_sw:.3f}')
    print(f'P-value: {p_sw}: ', end='')
    # Check if (SW) from normal distribution or not.
    if p_sw < 0.05:
        print("Null Hypothesis Rejected. Not from normal distribution.\n")
    else:
        print("Accepted Null Hypothesis.\n")

    print(f"D'Agostino-Pearson Statistic Test Result: {stat_k2:.3f}")
    print(f'P-value: {p_k2}: ', end='')
    # Check if (K2) from normal distribution or not.
    if p_k2 < 0.05:
        print("Null Hypothesis Rejected. Not from normal distribution.\n")
    else:
        print("Accepted Null Hypothesis. \nCan occurs if data set is too small.")

    print(f'Anderson-Darling Statistic Test Result: {stat_ad.statistic}')
    # Check if (AD) from normal distribution or not, one line per level.
    for st, cv in zip(stat_ad.significance_level, stat_ad.critical_values):
        if stat_ad.statistic < cv:
            print(f'{st:.3f}: {cv:.3f}: Accepted. From normal distribution')
        else:
            print(f'{st:.3f}: {cv:.3f}: Rejected. Data not normal')

    # Plots a standardized line, scaled by the SD of the time series.
    qqplot(data, line='s')
    plt.show()

    ts = list_to_ts(ts)
    return ts
def residooMultipleRegression(Y, A, B1, B2, B3, B4, x1, x2, x3, x4):
    """Show a Q-Q plot of the residuals of a four-regressor linear model.

    The residual is ``Y - (A + B1*x1 + B2*x2 + B3*x3 + B4*x4)`` over the first
    ``T - 1`` observations, where ``T`` is read from module scope.

    Parameters
    ----------
    Y : array-like
        Observed response (the ChangeNominal/Real series).
    A : float
        Intercept.
    B1, B2, B3, B4 : float
        Coefficients of ``x1`` .. ``x4``.
    x1, x2, x3, x4 : array-like
        Regressors.

    Fixes relative to the earlier loop: the fourth term was ``B4 + x4[i]``
    instead of a product, the intercept was never applied, and entries past
    ``T - 1`` of an uninitialised buffer were included in the plot.
    """
    n_obs = T - 1  # same upper bound as the original loop

    observed = numpy.asarray(Y)[:n_obs]
    predicted = (A
                 + B1 * numpy.asarray(x1)[:n_obs]
                 + B2 * numpy.asarray(x2)[:n_obs]
                 + B3 * numpy.asarray(x3)[:n_obs]
                 + B4 * numpy.asarray(x4)[:n_obs])
    residual = observed - predicted

    qqplot(residual, line='s')
    plt.show()
def QQ_plot(data):
    """Show a Q-Q plot of ``data`` with a standardized reference line.

    All dependencies are imported inside the function.
    """
    from matplotlib import pyplot
    from numpy.random import randn, seed
    from statsmodels.graphics.gofplots import qqplot

    # q-q plot
    qqplot(data, line='s')
    pyplot.show()
def spatial_QQ_plots(ALLdata, timestamps):
    """Draw a grid of Q-Q plots: one row per timestamp, one column per variable.

    For every timestamp the values of ``temp:``, ``dir:``, ``speed:`` and
    ``solar:`` are collected across all stations in ``ALLdata.WSdata`` (read
    from ``station.data_binned.loc[ts]``), sorted, and plotted against normal
    quantiles. The ``sps.shapiro`` p-value is printed and used as the x-label
    of each panel.

    Parameters
    ----------
    ALLdata
        Object exposing ``WSdata``, an iterable of stations that each have a
        ``data_binned`` frame indexed by timestamp.
    timestamps : sequence
        Timestamps to plot, one grid row each.
    """
    labels = ['temp', 'direction', 'speed', 'solar']
    num_times = len(timestamps)

    # squeeze=False keeps ``axes`` two-dimensional even for one timestamp;
    # without it ``axes[row, col]`` fails on the 1-D array matplotlib returns.
    fig, axes = plt.subplots(num_times,
                             4,
                             figsize=(13, int(np.round(num_times * 13 / 4))),
                             dpi=80,
                             facecolor='w',
                             edgecolor='k',
                             squeeze=False)

    for row, ts in enumerate(timestamps):
        temp = []
        wind_dir = []
        wind_speed = []
        solar = []
        for station in ALLdata.WSdata:
            df = station.data_binned.loc[ts]
            temp.append(df['temp:'])
            wind_dir.append(df['dir:'])
            wind_speed.append(df['speed:'])
            solar.append(df['solar:'])

        # One sorted column vector per variable, in the order of ``labels``.
        lst = [
            np.matrix(sorted(temp)).T,
            np.matrix(sorted(wind_dir)).T,
            np.matrix(sorted(wind_speed)).T,
            np.matrix(sorted(solar)).T
        ]

        for col, vals in enumerate(lst):
            ax = axes[row, col]
            qqplot(vals, line='s', ax=ax)
            # Drop the default axis labels set by qqplot.
            ax.set_xlabel('')
            ax.set_ylabel('')

            _, p = sps.shapiro(vals)
            p_str = '{:.2f}'.format(p)
            print('timestamp:{}, var:{}, p: {}'.format(ts, labels[col], p_str))
            ax.set_xlabel('Shap p:{}'.format(p_str))
            #ax.annotate('Shap p:{}'.format(p_str),xy = (0.05,.9),xytext = (0.05,.9), textcoords='axes fraction',horizontalalignment='left', verticalalignment='top')
            #ax.annotate('p'.format(p),xy = (0.05,.9),xytext = (0.05,.9), textcoords='axes fraction',horizontalalignment='left', verticalalignment='top')

            # Row label on the first column, variable name on the first row.
            if col == 0:
                ax.set_ylabel(ts)
            if row == 0:
                ax.title.set_text(labels[col])

    fig.tight_layout(pad=1.1)
def test_qqplot_pltkwargs(self, close_figures):
    """Call qqplot on ``self.res`` with extra marker/alpha keyword arguments."""
    marker_style = {
        "marker": "d",
        "markerfacecolor": "cornflowerblue",
        "markeredgecolor": "white",
        "alpha": 0.5,
    }
    qqplot(self.res, line="r", **marker_style)
def explore_series(df, columns_explore, plot_lags=20):
    """Plot five diagnostics per selected column and tabulate ADF results.

    For each column of ``df`` that is listed in ``columns_explore`` a grid
    column is filled with: time history, histogram, Q-Q plot, ACF and PACF
    (the last two on the NaN-free series, with ``plot_lags`` lags).

    Returns
    -------
    (df_out, fig_out)
        ``df_out`` has one column per explored series and the rows
        ADF statistic, p-value and the 1%/5%/10% critical values returned by
        ``adfuller``; ``fig_out`` is the figure holding the grid.
    """
    # Figure setup
    fig_out = plt.figure(figsize=(22, 24))
    grid_shape = (5, len(columns_explore))

    def grid_axis(row, col):
        # Axis for one cell of the 5 x n diagnostics grid.
        return plt.subplot2grid(grid_shape, (row, col))

    row_names = [
        'ADF_Statistic', 'p-value', 'Critical_1percent',
        'Critical_5_percent', 'Critical_10_percent'
    ]
    # One accumulator list per output row, in the order of ``row_names``.
    adf_rows = [[] for _ in row_names]
    output_labels = []

    position = 0
    for column in df:
        if column not in columns_explore:
            continue

        series = df[column]
        # time history plot
        series.plot(ax=grid_axis(0, position), title=column)
        # histogram
        series.hist(ax=grid_axis(1, position))
        # qqplot to check normality
        qqplot(series, line='r', ax=grid_axis(2, position))

        cleaned = series.dropna()
        # autocorrelation plot
        plot_acf(cleaned, lags=plot_lags, ax=grid_axis(3, position))
        # partial autocorrelation plot
        plot_pacf(cleaned, lags=plot_lags, ax=grid_axis(4, position))
        position += 1

        # run adfuller test and store its results
        result = adfuller(cleaned)
        critical = result[4]
        extracted = (result[0], result[1], critical['1%'], critical['5%'],
                     critical['10%'])
        for bucket, value in zip(adf_rows, extracted):
            bucket.append(value)
        output_labels.append(column)

    # create dataframe for the adfuller results
    df_out = pd.DataFrame(columns=output_labels, index=row_names)
    for row_number, values in enumerate(adf_rows):
        df_out.iloc[row_number] = values

    return df_out, fig_out
def isGaussian(data):
    """Show a histogram and Q-Q plot of ``data`` and print three normality tests.

    Runs ``shapiro``, ``normaltest`` and ``anderson`` from scipy.stats and
    prints each statistic with a verdict (5% level for the first two, one line
    per critical value for Anderson-Darling). Returns nothing.
    """

    def report(stat, p):
        # Print statistic / p-value and interpret at alpha = 0.05.
        print('Statistics=%.3f, p=%.3f' % (stat, p))
        alpha = 0.05
        if p > alpha:
            print('Sample looks Gaussian (fail to reject H0)')
        else:
            print('Sample does not look Gaussian (reject H0)')

    # --- histogram plot
    print("Histogram plot -------------------------------------")
    pyplot.hist(data)
    pyplot.show()

    # --- QQ plot
    print("QQ plot -------------------------------------")
    from statsmodels.graphics.gofplots import qqplot
    qqplot(data, line='s')
    pyplot.show()

    # --- Shapiro-Wilk test
    print("Shapiro-Wilk test -------------------------------------")
    from scipy.stats import shapiro
    shapiro_stat, shapiro_p = shapiro(data)
    report(shapiro_stat, shapiro_p)

    # --- D'Agostino's K^2 test
    print("D'Agostino's K^2 test -------------------------------------")
    from scipy.stats import normaltest
    k2_stat, k2_p = normaltest(data)
    report(k2_stat, k2_p)

    # --- Anderson-Darling test
    print("Anderson-Darling test -------------------------------------")
    from scipy.stats import anderson
    result = anderson(data)
    print('Statistic: %.3f' % result.statistic)
    for sl, cv in zip(result.significance_level, result.critical_values):
        if result.statistic < cv:
            print('%.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
        else:
            print('%.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))
def qq(a, logvalue, lognext, interval, NCHANGES):
    """Q-Q plot of regression residuals; return the Shapiro-Wilk result.

    Calls ``regression`` with the given arguments, forms ``V[k] - r * A[k]``
    for the first ``NCHANGES`` entries, shows their Q-Q plot with a regression
    reference line and returns ``(statistic, p_value)`` from
    ``stats.shapiro``.
    """
    V, A, r, sigma = regression(a, logvalue, lognext, interval, NCHANGES)

    residuals = [V[k] - r * A[k] for k in range(NCHANGES)]
    centralized = numpy.array(residuals)

    # shapiro returns (statistic, p-value)
    s1, s2 = stats.shapiro(centralized)[:2]

    qqplot(centralized, line='r')
    # fig = plt.figure()
    # fig.savefig(im + 'eco_'+ Ecoregions[eid] + '_Raw_Data_residuals.png')
    pyplot.show()
    return s1, s2
def plot_qqplot(
    other_args: List[str],
    ticker: str,
    model_name: str,
    residuals: List[float],
):
    """Qqplot time series against a standard normal curve

    Parameters
    ----------
    other_args : str
        Command line arguments to be processed with argparse
    ticker : str
        Ticker of the stock
    model_name : str
        Model fitting name in use
    residuals : List[float]
        Residuals data
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        prog="qqplot",
        description="""
            Qqplot time series against a standard normal curve
        """,
    )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if ns_parser:
            plt.figure(
                figsize=plot_autoscale(), dpi=PLOT_DPI, constrained_layout=True
            )
            axis = plt.gca()
            qqplot(
                residuals,
                stats.distributions.norm,
                fit=True,
                line="45",
                ax=axis,
            )

            plt.title(f"Q-Q plot residuals from {model_name} on {ticker}")
            plt.ylabel("Sample quantiles")
            plt.xlabel("Theoretical quantiles")
            plt.grid(True)

            if gtff.USE_ION:
                plt.ion()

            plt.show()
            print("")

    except Exception as e:
        # Any failure is reported on stdout and swallowed.
        print(e, "\n")
        return
def display_qqplot(
    name: str,
    df: pd.DataFrame,
    target: str,
    external_axes: Optional[List[plt.Axes]] = None,
):
    """Show QQ plot for data against normal quantiles

    Parameters
    ----------
    name : str
        Stock ticker
    df : pd.DataFrame
        Dataframe
    target : str
        Column in data to look at
    external_axes : Optional[List[plt.Axes]], optional
        External axes (1 axis is expected in the list), by default None.
        When axes are supplied the plot is drawn on them and
        ``theme.visualize_output()`` is not called.
    """
    data = df[target]

    # Statsmodels has a UserWarning for marker kwarg-- which we don't use.
    # The filter lives inside catch_warnings() so it is removed again when the
    # plot is done instead of muting UserWarning for the whole process.
    with warnings.catch_warnings():
        warnings.filterwarnings(category=UserWarning, action="ignore")

        # This plot has 1 axis
        if external_axes is None:
            _, ax = plt.subplots(
                figsize=plot_autoscale(),
                dpi=PLOT_DPI,
            )
        else:
            if len(external_axes) != 1:
                logger.error("Expected list of one axis item.")
                # "\n" was previously typed as "/n" and printed literally.
                console.print("[red]Expected list of 1 axis items.\n[/red]")
                return
            (ax,) = external_axes

        qqplot(
            data,
            stats.distributions.norm,
            fit=True,
            line="45",
            color=theme.down_color,
            ax=ax,
        )
        # Recolor the second line on the axis (the 45-degree reference).
        ax.get_lines()[1].set_color(theme.up_color)

        ax.set_title(f"Q-Q plot for {name} {target}")
        ax.set_ylabel("Sample quantiles")
        ax.set_xlabel("Theoretical quantiles")

        theme.style_primary_axis(ax)

        if external_axes is None:
            theme.visualize_output()
def qqplots(self, epoch, sub=None):
    """Draw one fitted Q-Q plot per group for the given epoch.

    ``epoch`` must lie in ``[0, self.epochs]``. ``sub`` lists the group
    indices to plot; when it is falsy every index in ``range(self.masks)`` is
    used. Each figure gets a super-title built from ``self.name``, the group
    label and the epoch.
    """
    assert 0 <= epoch <= self.epochs, 'epoch must be between 0 and %.0f, got %.0f' % (
        self.epochs, epoch)

    groups = sub if sub else range(self.masks)
    for group in groups:
        samples = self.data[group, :, epoch]
        gofplots.qqplot(samples, fit=True, line='45')
        heading = "%s, %s, Epoch %.0f" % (self.name, self.labels[group], epoch)
        plt.suptitle(heading, fontsize=18)
def plot_QQ_plot(self, series_values):
    """Show a Q-Q plot of ``series_values`` with a standardized reference line.

    Dependencies are imported inside the function; any exception raised while
    importing or plotting is re-raised to the caller.
    """
    try:
        from matplotlib import pyplot
        from numpy.random import randn, seed
        from statsmodels.graphics.gofplots import qqplot

        qqplot(series_values, line='s')
        pyplot.show()
    except Exception:
        raise
def residuals_eval(y_true, y_pred):
    """Summarize and plot the residuals ``y_true - y_pred``.

    Prints ``describe()`` of the residuals, then shows a line plot, histogram,
    density plot and Q-Q plot, one figure at a time.
    """
    y_res = y_true - y_pred
    df_res = DataFrame(y_res)
    print(df_res.describe())

    renderers = (
        lambda: df_res.plot(),            # line
        lambda: df_res.hist(),            # hist
        lambda: df_res.plot(kind='kde'),  # density plot
    )
    for render in renderers:
        render()
        pyplot.show()

    qqplot(numpy.array(y_res), line='r')
    pyplot.show()
def residuals_charts(test_dataset, test_output):
    """Build residuals charts for one experiment.

    The prediction is the mean of ``test_output`` over axis 0; residuals are
    prediction minus ``test_dataset.y_data``. Draws a 30-bin histogram, prints
    ``stats.normaltest``, draws a Q-Q plot with an identity line over
    [-3.5, 3.5], and ACF/PACF plots with 30 lags.
    """
    mean_output = test_output.mean(axis=0)
    prediction_series = mean_output.reshape(test_output.shape[1])

    targets = test_dataset.y_data
    flat_targets = targets.reshape(targets.shape[0])
    residuals = prediction_series - flat_targets

    pd.Series(residuals).hist(bins=30)
    print(stats.normaltest(residuals))

    gofplots.qqplot(residuals)
    # identity line through the Q-Q plot
    diagonal = np.linspace(-3.5, 3.5, 4)
    pyplot.plot(diagonal, diagonal)

    for draw in (tsaplots.plot_acf, tsaplots.plot_pacf):
        draw(residuals, lags=30)
def residual_plot(self):
    '''
    Plot the residual and save it to current directory

    Left panel: counts of ``self._residual`` in 24 bins of width 0.5 centred
    from -5.75 to 5.75, on a log y-axis, with a scaled normal density and its
    shaded central 95% region. Right panel: Q-Q plot of the residual. The
    figure is written to ``residual_plots.png``.
    '''
    import matplotlib.pyplot as plt
    from scipy.stats import norm
    from statsmodels.graphics.gofplots import qqplot

    # set the size of the plot
    plt.figure(figsize=(16, 9))

    # left panel: the distribution
    ax = plt.subplot(121)

    # create bins and count the numbers
    centres = np.arange(-5.75, 6, step=0.5)
    count = pd.DataFrame([0] * 24, index=centres)
    for centre in count.index:
        lower = centre - 0.25
        upper = centre + 0.25
        for value in self._residual:
            if lower <= value < upper:
                count.loc[centre] += 1

    # create a normal distribution reference
    xx = np.linspace(-3, 3, 100)
    mu = np.mean(self._residual)
    sd = np.std(self._residual)
    normal = norm.pdf(xx, mu, sd)
    normalcdf = norm.cdf(xx, mu, sd)

    # first grid index at/above the 2.5% and 97.5% cumulative probability
    low_pending = True
    for idx, cumulative in enumerate(normalcdf):
        if low_pending and cumulative >= 0.025:
            low = idx
            low_pending = False
        if cumulative >= 0.975:
            high = idx
            break

    # plot the distribution
    n_obs = self._residual.shape[0]
    plt.plot(count.index, count, 'o', label="residual")
    plt.plot(xx, normal * n_obs, '--', label="normal")
    plt.fill_between(xx[low:high],
                     0,
                     normal[low:high] * n_obs,
                     alpha=.3,
                     facecolor="grey",
                     label="95% normal")
    plt.xlim([-6, 6])
    ax.set_ylim(bottom=5)
    ax.legend()
    plt.title("Distribution of the residual")
    plt.yscale('log')

    # right panel: the QQ plot
    ax = plt.subplot(122)
    qqplot(self._residual, line='s', ax=ax)
    plt.xlim([-3.5, 3.5])
    plt.ylim([-5, 5])
    plt.title("residual QQ plot")

    #plt.show()
    plt.savefig("residual_plots.png")
def plot_Model_Identify(DataSet, frequency=1, acf_lag=12, pacf_lag=12):
    """
    Show a 3 x 4 grid of identification plots for the first column of DataSet.

    DataSet : dataframe with the type of first column either int() or panda datetime
    frequency : int, Seasonal Component period (in time step)
    acf_lag, pacf_lag : int, number of lags for the ACF / PACF panels
    """
    # Organize plot
    fig, ax = plt.subplots(3, 4)

    # Observed data
    observed_ax = ax[0, 0]
    DataSet.plot(ax=observed_ax)
    observed_ax.set_title('Observed Value')
    observed_ax.set_xlabel("")

    series = DataSet.iloc[:, 0]

    # Autocorrelation plot
    autocorr_ax = ax[1, 0]
    autocorrelation_plot(series, ax=autocorr_ax)
    autocorr_ax.set_title('Autocorrelation')

    # QQ plot
    qq_ax = ax[2, 0]
    qqplot(series, ax=qq_ax)
    qq_ax.set_title('Q-Q Plot')

    # Lag plot
    lag_ax = ax[0, 1]
    lag_plot(series, ax=lag_ax)
    lag_ax.set_title('Lag Plot')
    lag_ax.set_ylabel("")
    lag_ax.set_xlabel("")

    # ACF plot
    acf_ax = ax[1, 1]
    tsa.plot_acf(series, ax=acf_ax, lags=acf_lag, alpha=0.05)
    acf_ax.set_title('ACF')

    # PACF plot
    pacf_ax = ax[2, 1]
    tsa.plot_pacf(series, ax=pacf_ax, lags=pacf_lag, alpha=0.05)
    pacf_ax.set_title('PACF')

    # Decomposition plots
    decomposition = sm.tsa.seasonal_decompose(series, freq=frequency)
    residual_part = decomposition.resid
    residual_part.plot(ax=ax[0, 2])
    residual_part.plot(ax=ax[0, 3], kind='kde')
    decomposition.seasonal.plot(ax=ax[1, 2])
    decomposition.trend.plot(ax=ax[2, 2])

    panel_titles = (
        ((0, 2), 'Residual'),
        ((1, 2), 'Seasonal'),
        ((2, 2), 'Trend'),
        ((0, 3), 'Residual Prob. Distrib'),
    )
    for cell, title in panel_titles:
        ax[cell].set_title(title)

    plt.show()
def correctLin(x, y):
    """Fit ``y`` on ``x`` with ``stats.linregress`` and inspect the residuals.

    Prints the regression result, shows a Q-Q plot of the residuals and
    prints the Shapiro-Wilk and Jarque-Bera p-values.

    Returns
    -------
    (residuals, slope, intercept, stderr)
        ``stderr`` is the square root of the residual sum of squares scaled by
        ``1 / (n - 2)``.
    """
    n = numpy.size(x)
    fit = stats.linregress(x, y)
    s = fit.slope
    i = fit.intercept
    print(fit)

    residuals = numpy.array([y[k] - x[k] * s - i for k in range(n)])
    sum_of_squares = numpy.dot(residuals, residuals)
    stderr = math.sqrt((1 / (n - 2)) * sum_of_squares)

    qqplot(residuals, line='r')
    pyplot.show()

    shapiro_p = stats.shapiro(residuals)[1]
    print('Shapiro-Wilk p = ', shapiro_p)
    jarque_bera_p = stats.jarque_bera(residuals)[1]
    print('Jarque-Bera p = ', jarque_bera_p)

    return (residuals, s, i, stderr)
def simpleLin(x, y):
    """Fit ``y = k * x`` (no intercept) and inspect the residuals.

    Shows a Q-Q plot and a line plot of the residuals, prints the Shapiro-Wilk
    result and returns ``(k, stderr)`` where ``stderr`` is the standard
    deviation of the residuals.
    """
    n = numpy.size(x)
    x_arr = numpy.array(x)
    y_arr = numpy.array(y)

    # least-squares slope through the origin
    k = numpy.dot(x_arr, y_arr) / numpy.dot(x_arr, x_arr)
    residuals = y_arr - k * x_arr
    stderr = numpy.std(residuals)

    qqplot(residuals, line='r')
    pyplot.show()
    pyplot.plot(residuals)
    pyplot.show()

    print('normality', stats.shapiro(residuals))
    return (k, stderr)
def plot_regress_analysis(model, influence=True, annotate=True):
    """Draw regression diagnostic panels for a fitted model on a 3 x 2 grid.

    Panels: residuals vs fitted, normal Q-Q of the Pearson residuals,
    scale-location, residuals vs leverage and (optionally) an influence plot
    spanning the bottom row.

    Parameters
    ----------
    model
        Fitted results object providing ``predict()``, ``resid`` and
        ``resid_pearson``.
    influence : bool
        Draw the influence plot in the bottom row.
    annotate : bool
        Forwarded to ``plot_leverage_resid2``.
    """
    plt.figure(figsize=(15, 16))

    # Residuals vs Fitted
    ax = plt.subplot2grid((3, 2), (0, 0))
    ax.set_title("Residuals vs Fitted")
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('Residuals')
    fitted = model.predict()
    residuals = model.resid
    ax.plot(fitted, residuals, marker='.', linestyle='')
    # Model non-linearity with quadratic
    polyline = np.poly1d(np.polyfit(fitted, residuals, 2))
    max_fitted = np.max(fitted)
    # Unit-step grid over the fitted range, with the maximum appended so the
    # curve reaches the right-most point.
    xs = np.append(np.arange(np.min(fitted), max_fitted), max_fitted)
    ax.plot(xs, polyline(xs), linewidth=2.5)

    # Q-Q plot
    ax = plt.subplot2grid((3, 2), (0, 1))
    ax.set_title("Q-Q")
    # The default ``dist`` is the normal distribution. The string "norm" that
    # was passed before is not a distribution object (it has no ``ppf``), so
    # statsmodels cannot compute theoretical quantiles from it.
    qqplot(model.resid_pearson, line='r', ax=ax)

    # Scale-Location
    ax = plt.subplot2grid((3, 2), (1, 0))
    ax.set_title("Scale-Location")
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('$|$Normalized residuals$|^{1/2}$')
    std_residuals = np.sqrt(np.abs(model.resid_pearson))
    ax.plot(fitted, std_residuals, linestyle='', marker='.')
    # Model non-linearity with quadratic
    polyline = np.poly1d(np.polyfit(fitted, std_residuals, 2))
    ax.plot(xs, polyline(xs), linewidth=2.5)

    # Residuals vs Leverage
    ax = plt.subplot2grid((3, 2), (1, 1))
    plot_leverage_resid2(model, ax, annotate=annotate)

    # Influence plot
    if influence:
        ax = plt.subplot2grid((3, 2), (2, 0), colspan=2)
        ax = influence_plot(model, ax=ax)
def env_corr(self, env_vars, coeff_plot=False, qq_plot=False):
    """
    Determine correlations with environmental/non-discretionary variables
    using an ordinary least squares regression of ``Efficiency`` on them.

    Takes:
        env_vars: A pandas dataframe of environmental variables (converted
            with ``_to_dataframe``). Note that there can be no spaces in the
            variables' names because they are joined into a formula string.
        coeff_plot: when True, draw seaborn's ``coefplot`` of the regression.
        qq_plot: when True, draw a Q-Q plot of the residuals.

    Returns:
        corr_res: the fitted statsmodels results object. Its summary is also
            printed.
    """
    import matplotlib.pyplot as plt
    from statsmodels.regression.linear_model import OLS
    from statsmodels.graphics.gofplots import qqplot

    env_data = _to_dataframe(env_vars)
    corr_data = env_data.join(self['Efficiency'])

    # Build the formula from the converted frame: those are the columns that
    # are actually present in ``corr_data`` (the raw ``env_vars`` argument was
    # used before, which only works if it already is a DataFrame).
    formula = "Efficiency ~ " + " + ".join(env_data.columns)
    corr_mod = OLS.from_formula(formula, corr_data)
    corr_res = corr_mod.fit()

    #plot coeffs
    if coeff_plot:
        # Imported only when requested so the regression still runs on
        # seaborn installations that do not provide ``coefplot``.
        from seaborn import coefplot
        coefplot(formula, data=corr_data)
        plt.xticks(rotation=45, ha='right')
        plt.title('Regression coefficients and standard errors')

    #plot qq of residuals
    if qq_plot:
        qqplot(corr_res.resid, line='s')
        plt.title('Distribution of residuals')

    print(corr_res.summary())
    return corr_res
def draw_figures():
    """Build a 4 x 2 figure of histograms and normal Q-Q plots from bdims.csv.

    The left column holds histograms of four standardized variables (three
    from the rows with ``sex == 0`` and ``age`` from all rows). The right
    column holds their Q-Q plots labelled A-D, placed in a different row order
    than the histograms. Returns the figure.
    """
    bdims = pd.read_csv("bdims.csv")
    fdims = bdims[bdims["sex"] == 0]

    fig, plots = plt.subplots(4, 2)

    biidi = standardize(fdims["bii.di"])
    elbdi = standardize(fdims["elb.di"])
    age = standardize(bdims["age"])
    chede = standardize(fdims["che.de"])

    # (values, histogram bins, title) for rows 0..3 of the left column
    histograms = (
        (biidi, range(-4, 4), "Histogram of female biiliac diameter"),
        (elbdi, range(-3, 5), "Histogram of female elbow diameter"),
        (age, range(-2, 5), "Histogram of general age"),
        (chede, range(-2, 6), "Histogram of female chest depth"),
    )
    for row, (values, bins, _) in enumerate(histograms):
        plots[row][0].hist(values, bins=bins)
    for row, (_, _, title) in enumerate(histograms):
        plots[row][0].set_title(title)

    # (values, target row) for the right column; the row order deliberately
    # differs from the histogram order above.
    qq_targets = ((biidi, 1), (elbdi, 2), (age, 3), (chede, 0))
    for values, row in qq_targets:
        qqplot(values, ax=plots[row][1], line="q")

    for row, letter in enumerate("ABCD"):
        plots[row][1].set_title("Normal Q-Q Plot " + letter)

    for row in range(0, 4):
        plots[row][0].set_xlabel("standarized data")
        plots[row][0].set_ylabel("frequency")

    fig.set_size_inches(12, 12)
    plt.tight_layout()
    return fig
# Script section: report break-even points per folder, then check normality
# of the loss samples and compare the two folders with Welch's t-test.
# NOTE(review): uses Python 2 print statements (`print folder`).
print (final_test_loss)
for folder in folders:
    print
    print folder
    for d in data[folder]:
        # Keep the last break-even point found on this run's curve.
        breakeven_points = util.find_breakeven(d["curve"])
        pr[folder].append(breakeven_points[-1])
        print (breakeven_points[-1])
print ("Loss samples t test")
# Random samples from normal
# NOTE(review): the mean comes from folders[0] but the standard deviation
# from folders[1] -- confirm this mix is intended.
s = np.random.normal(np.mean(lc[folders[0]]), np.std(lc[folders[1]]), 100)
print ("Random samples", scipy.stats.shapiro(s))
fig = qqplot(s, scipy.stats.norm, fit=True, line="45")
plt.show()
# First folder figures
fig = qqplot(np.array(lc[folders[0]]), scipy.stats.norm, fit=True, line="45")
plt.show()
print (folders[0], scipy.stats.shapiro(np.array(lc[folders[0]])))
# Second folder figures
fig = qqplot(np.array(lc[folders[1]]), scipy.stats.norm, fit=True, line="45")
plt.show()
print (folders[1], scipy.stats.shapiro(np.array(lc[folders[1]])))
# Compare the two folders' loss samples.
tstat, pval = perform_welchs_test(lc[folders[0]], lc[folders[1]])
print ("t-statistics = {}".format(tstat))
print ("p-value = {}".format(pval))
# Script section: exploratory diagnostics on a fitted model held in
# ``results`` (presumably a statsmodels VAR fit -- confirm) and its impulse
# response object ``irf``.

# Cumulative (non-orthogonalized) impulse responses.
irf.plot_cum_effects(orth=False)
plt.show()
# Forecast error variance decomposition.
fevd = results.fevd(1)
fevd.summary()
fevd.plot()
plt.show()
# Causality and normality tests; the return values are not stored.
results.test_causality('DJIA', ['SP500'],kind='f')
results.test_causality('SP500', ['DJIA'],kind='f')
results.test_normality(signif=0.05,verbose=False)
# Residuals summed across columns, plotted over time and as a Q-Q plot.
resids = results.resid.sum(axis=1)
resids.plot()
plt.show()
from statsmodels.graphics.gofplots import qqplot, qqline
qqplot(data=resids,line='s')#, dist, distargs, a, loc, scale, fit, line, ax)
plt.show()
# GARCH experiment on the second column of ``rets_bm``.
from statsmodels.sandbox.tsa.garch import Garch
rets1 = rets_bm.iloc[:,1]
Garch(rets1)
#Getting GARCH working -using RPY2
import rpy2
rpy2.__version__
# Date-offset scratch work; the comparison result is not used.
t =datetime(2013,1,5)
from pandas.tseries.offsets import MonthEnd, BusinessMonthBegin
(t + 2*BusinessMonthBegin())> datetime(2013,1,1)
ma['teams'][0], ma['teams'][1], ma['score'][0], ma['score'][1], fitted[mai], ) mfile.write(towrite) mfile.write('\n') print print 'residual_std: %.10f' % (resid.std()) print if PLOT_RESIDUAL_QQ: import statsmodels.graphics.gofplots as sgg sgg.qqplot(resid, fit=True) if PLOT_RESIDUAL_HIST: import pylab import scipy.stats as ss freqs, lefts = np.histogram(resid, bins = 'auto', density = True) centers = (lefts[:-1] + lefts[1:]) / 2 pylab.bar(centers, freqs, width = centers[1] - centers[0]) empirical_dist = ss.norm(*(ss.norm.fit(resid))) pylab.plot(centers, [empirical_dist.pdf(x) for x in centers], 'g-', linewidth = 5) if PLOT_SCATTER_TOT_EXP_RESIDUALS: import pylab pylab.scatter(tot_exps, resid, marker = '.', s = 1)