示例#1
0
def display_qqplot(name: str, df: pd.DataFrame, target: str):
    """Show QQ plot for data against normal quantiles

    Parameters
    ----------
    name : str
        Stock ticker
    df : pd.DataFrame
        Dataframe
    target : str
        Column in data to look at
    """
    data = df[target]
    fig, ax = plt.subplots(figsize=plot_autoscale(), dpi=PLOT_DPI)

    # Statsmodels has a UserWarning for marker kwarg -- which we don't use.
    # Silence it only around the call: a bare filterwarnings() would keep
    # hiding every UserWarning for the rest of the process.
    with warnings.catch_warnings():
        warnings.filterwarnings(category=UserWarning, action="ignore")
        qqplot(data, stats.distributions.norm, fit=True, line="45", ax=ax)

    ax.set_title(f"Q-Q plot for {name} {target}")
    ax.set_ylabel("Sample quantiles")
    ax.set_xlabel("Theoretical quantiles")
    ax.grid(True)

    if gtff.USE_ION:
        plt.ion()
    fig.tight_layout(pad=1)
    plt.show()
    print("")
    def _plotQQPlotOtherBars(self, saveDirectory='', showIt=False):
        """Save a Q-Q plot of the returns of every asset in ``ALTERNATIVE_BARS``.

        One ``QQPlot_<asset>.png`` file is written per asset.

        Parameters
        ----------
        saveDirectory : str
            Directory the PNG files are written to.  An empty string (the
            default) yields a relative file name, i.e. the current working
            directory.
        showIt : bool
            When true, each figure is also displayed with ``plt.show()``.
        """
        # Function-scope import: only this method needs it.
        import os

        # Plot the quantile plot of each asset in the portfolio:
        for eachAssetName, eachAssetDataFrame in self.ALTERNATIVE_BARS.items():

            logger.warning(f'[{self._plotQQPlotOtherBars.__name__}] - Looping for asset <{eachAssetName}>...')

            # Plot the QQplot (keep the figure handle so it can be released):
            fig = qqplot(eachAssetDataFrame.Returns.values, line='s')

            # Add more variables:
            plt.grid(linestyle='dotted')
            plt.xlabel('Theoretical Quantiles', horizontalalignment='center', verticalalignment='center', fontsize=14, labelpad=20)
            plt.ylabel('Sample Quantiles', horizontalalignment='center', verticalalignment='center', fontsize=14, labelpad=20)
            plt.title(f'Asset: {eachAssetName} -- Quantile-Quantile (QQ) Plot')
            plt.subplots_adjust(left=0.09, bottom=0.20, right=0.94, top=0.90, wspace=0.2, hspace=0)

            # In PNG.  os.path.join avoids the old '' + '/QQPlot_...' result,
            # which pointed at the filesystem root when no directory was given.
            plt.savefig(os.path.join(saveDirectory, f'QQPlot_{eachAssetName}.png'))

            if showIt:
                # Show it:
                plt.show()
            else:
                # Nothing left to do with this figure; close it so figures do
                # not pile up across the loop.
                plt.close(fig)

    ######################### PLOTS #########################
def normal(x):
    """Print two normality-test p-values for ``x`` and show its Q-Q plot.

    The Shapiro-Wilk and Jarque-Bera p-values are printed in that order,
    followed by a Q-Q plot with a standardized reference line.

    Returns
    -------
    int
        Always 0.
    """
    shapiro_p = stats.shapiro(x)[1]
    print('Shapiro-Wilk p =', shapiro_p)

    jarque_bera_p = stats.jarque_bera(x)[1]
    print('Jarque-Bera p =', jarque_bera_p)

    print('QQ plot')
    qqplot(x, line='s')
    pyplot.show()
    return 0
示例#4
0
def test_distribution(dataframe, t=None):
    """Standardize all values of ``dataframe`` and check them for normality.

    The non-NaN values are flattened, z-scored, shown as a 15-bin histogram
    (with ``t`` as the figure title) and as a Q-Q plot, and finally tested
    with ``normaltest`` at the 5% level.  The verdict is printed.

    Side effect: the flattened, NaN-free values are stored in the module-level
    name ``arr``.
    """
    def print_res(p, alpha):
        # Print the p-value, then the verdict at significance level ``alpha``.
        print('p = ', p)
        if np.isnan(p):
            verdict = 'p is null'
        elif p < alpha:
            verdict = "The null hypothesis of normality can be rejected --> NOT NORMAL"
        else:
            verdict = "The null hypothesis of normality cannot be rejected --> LIKELY NORMAL"
        print(verdict)

    global arr
    arr = dataframe.values.flatten()
    arr = arr[np.logical_not(np.isnan(arr))]

    centered = arr - np.mean(arr)
    corrected = centered / np.std(arr)

    # Histogram of the standardized values.
    plt.hist(corrected, bins=15)
    plt.suptitle(t)
    plt.show()

    # Q-Q plot of the standardized values.
    qqplot(corrected)
    plt.show()

    # test raw values
    print("Raw Data:")
    significance = 0.05
    _, p_value = normaltest(corrected)
    print_res(p_value, significance)
示例#5
0
def plot(data):
    """Summarise ``data``, run three normality tests and plot the results.

    Prints the mean and standard deviation, then the verdicts of the Shapiro,
    D'Agostino (``normaltest``) and Anderson tests.  Draws a 50-bin histogram
    with the fitted normal density on top, plus a Q-Q plot.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sample.
    """
    values = np.array(data)  # convert once instead of before every call

    # density=True puts the histogram on the same scale as the normal PDF
    # drawn below; with raw counts the curve is flattened against the x-axis.
    _, bins, _ = plt.hist(values, 50, density=True)
    mu = np.mean(values)
    sigma = np.std(values)
    print("Mean: {}, std: {}".format(mu, sigma))

    # Shapiro test
    stat, p = shapiro(values)
    if p > 0.05:
        print("Shapiro: Data is normally distributed")
    else:
        print("Shapiro: Data is NOT normally distributed, p-value: {}".format(p))

    # D'Agostino test
    stat, p = normaltest(values)
    if p > 0.05:
        print("D'Agostino: Data is normally distributed")
    else:
        print("D'Agostino: Data is NOT normally distributed, p-value: {}".format(p))

    # Anderson test: one verdict per (significance level, critical value) pair.
    result = anderson(values)
    for sl, cv in zip(result.significance_level, result.critical_values):
        if result.statistic < cv:
            print('Anderson: %.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
        else:
            print('Anderson: %.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))

    # Fitted normal density on top of the histogram, evaluated at the bin edges.
    plt.plot(bins, norm.pdf(bins, mu, sigma))
    qqplot(values, line='s')
    plt.show()
示例#6
0
    def _plot_data(self):
        """Draw the diagnostic figures for ``self.data`` and return ``self``.

        Figures produced, in order: one per indicator family ("MA", "STD")
        with every matching column plotted against 'DateTime_Stamp'; a
        histogram of 'Y_label' followed by its Q-Q plot; and 'OPEN_Bid'
        together with every column whose name contains "Band".
        """
        def draw_columns_containing(fragment):
            # Plot, against the timestamp column, each column whose name
            # contains ``fragment``.
            matching = [name for name in self.data.columns if fragment in name]
            for name in matching:
                plt.plot(self.data['DateTime_Stamp'], self.data[name])

        for family in ["MA", "STD"]:
            plt.figure()
            draw_columns_containing(str(family))

            plt.title(family)
            plt.xticks(rotation=90)
            plt.subplots_adjust(bottom=0.2)
            plt.show()

        # Distribution of the label: histogram with 0.05-wide bins, then Q-Q.
        plt.figure()
        label_bins = np.arange(0.00, 1.01, 0.05)
        plt.hist(self.data['Y_label'], bins=label_bins)
        qqplot(self.data['Y_label'], line='s')

        # Opening bid with the band columns on top.
        plt.figure()
        plt.plot(self.data['DateTime_Stamp'], self.data["OPEN_Bid"])
        draw_columns_containing("Band")

        return self
 def qq_plot(self, column=None):
     """Show a Q-Q plot (regression reference line) of one column of ``self.data``.

     ``column`` selects the column; any falsy value falls back to
     ``self.data_column``.
     """
     selected = column if column else self.data_column
     qqplot(self.data[selected], line='r')
     plt.show()
def qq_plot(data):
    """Show a Q-Q plot of ``data`` with a standardized reference line.

    ``data`` may be any sequence accepted by ``np.array`` (e.g. a list).
    """
    samples = np.array(data)  # the plotting call is given an ndarray
    qqplot(samples, line='s')
    pyplot.show()
示例#9
0
def plot(label):
    """Show a Q-Q plot of the first column of the CSV file for ``label``.

    Reads ``~/Dropbox/Fundamental Market Research/QQPlots/<label>.csv``.

    Parameters
    ----------
    label : str
        Base name (without extension) of the CSV file to load.
    """
    QQdata = pd.read_csv(
        f"~/Dropbox/Fundamental Market Research/QQPlots/{label}.csv")
    # Take the first column directly.  The previous loop ran over
    # numpy.size(values) -- the number of *cells* -- while indexing rows, so
    # any file with more than one column raised IndexError.
    newdata = QQdata.iloc[:, 0].values
    qqplot(newdata, line='s')
    pyplot.show()
示例#10
0
def qq_plot():
    """Show a Q-Q plot of 100 pseudo-random normal observations.

    The generator is seeded with 1; the draws are scaled by 5 and shifted
    by 50.
    """
    seed(1)
    observations = 50 + 5 * randn(100)
    qqplot(observations, line='s')
    pyplot.show()
def AR_model(X, data_in, lag, i):
    """Fit an autoregressive model and save its diagnostic figures.

    Prints the fitted parameters, the RMSE and a normality test of the
    residuals.  Saves five figures whose file names end in ``i``: the fit,
    the inverse-differenced comparison, the residual scatter plot, the
    residual Q-Q plot and the residual histogram.

    Parameters
    ----------
    X : sequence
        Passed element-wise to ``inverse_difference`` as its first argument.
        # NOTE(review): presumably the undifferenced series -- confirm.
    data_in : array-like
        Series the AR model is fitted on.
    lag : int
        Value passed as ``maxlag`` to the fit.
    i : int or str
        Suffix appended to every saved figure name.
    """
    model = AR(data_in)
    results_AR = model.fit(maxlag=lag, disp=0)
    AR_data = results_AR.fittedvalues

    # Align the observations with the fitted values by length.  The previous
    # hard-coded ``data_in[3:]`` only lined up when three initial observations
    # were consumed by the fit.
    act = data_in[len(data_in) - len(AR_data):]

    print("Parameters of Autoregressive Model AR(%d) are:" % lag)
    print(results_AR.params)

    plt.figure()
    plt.plot(act, color='blue', label='Actual Value')
    plt.plot(results_AR.fittedvalues, color='red', label="Predicted Value")
    plt.legend(loc='best')
    plt.xlabel("Time")
    plt.ylabel("Time series values")
    # Computed once and reused for the printout below.
    rmse = np.sqrt((np.sum(np.square(AR_data - act))) / len(act))
    # An RMSE title used to be set here and was immediately overwritten by
    # this one, so only the effective title is kept.
    plt.title("AR Fit (not scaled)")
    plt.savefig("AR fit not scaled" + str(i))

    # Undo the differencing.  The index is named ``idx`` so that it can never
    # be confused with the ``i`` suffix parameter.
    n_fitted = len(AR_data)
    inverted_in = [
        inverse_difference(X[idx], data_in[idx]) for idx in range(n_fitted)
    ]
    inverted_AR = [
        inverse_difference(X[idx], AR_data[idx]) for idx in range(n_fitted)
    ]

    plt.figure()
    plt.plot(inverted_in, color='red', label="actual value")
    plt.plot(inverted_AR, color='blue', label="predicted value")
    plt.legend(loc='upper left')
    plt.title(
        "Comparison of predicted and actual values for Autoregression model, lag"
        + str(lag))
    plt.savefig(" AR Fit Final" + str(i))

    print("RMSE on the Data is:" + str(rmse))

    residuals = results_AR.resid
    plt.figure()
    plt.title("Residual Scatter Plot")
    plt.scatter(AR_data, residuals)
    plt.savefig("residuals" + str(i))

    plt.figure()
    qqplot(residuals)
    plt.title("Residual Q-Q Plot")
    plt.savefig("QQ" + str(i))

    plt.figure()
    plt.hist(residuals)
    plt.title("Residual Histogram")
    plt.savefig("Hist" + str(i))

    k2, p = stats.normaltest(residuals)
    print("Chi-Square Test : k2 = %.4f  p = %.4f" % (k2, p))
    print("two sided chi squared probability :" + str(p))
示例#12
0
def newDeath(request):
    """Render the descriptive-statistics page for the ``new_deaths`` column.

    Builds three images (box plot, KDE + histogram, Q-Q plot) and three
    dictionaries of statistics (description, dispersion, shape) from the
    module-level ``df`` and passes them to the 'components/newDead.html'
    template.

    Parameters
    ----------
    request
        Request object forwarded to ``render``.
    """
    plt.clf()

    boxPlot('new_deaths')
    uri = renderMatplotlib(plt)

    # Descriptive statistics.
    describe = df['new_deaths'].describe()
    describes = {
        "count": describe['count'],
        "mean": describe['mean'],
        "std": describe['std'],
        "min": describe['min'],
        "haiNam": describe['25%'],
        "namMuoi": describe['50%'],
        "bayNam": describe['75%'],
        "max": describe['max'],
        "median": df['new_deaths'].median(),
        "mode": df['new_deaths'].mode()
    }

    # Dispersion measures.
    doPhanTan = {
        "IQR": interquartile_range('new_deaths'),
        "var": df['new_deaths'].var(),
        "std": df['new_deaths'].std(),
    }

    # Shape measures (the "knewness" key is kept as-is for the template).
    mucDo = {
        "knewness": df['new_deaths'].skew(),
        "kurtosis": df['new_deaths'].kurtosis()
    }

    # Density estimate on top of a normalised histogram.
    plt.clf()
    fig, ax = plt.subplots()
    df['new_deaths'].plot.kde(ax=ax,
                              legend=False,
                              title='Histogram new_deaths')
    df['new_deaths'].plot.hist(density=True, ax=ax, color='red')
    ax.set_ylabel('new_deaths')
    ax.grid(axis='y')
    ax.set_facecolor('#d8dcd0')
    hist = renderMatplotlib(plt)

    # Q-Q plot of the actual column.  This used to plot randn() noise of the
    # same length, which always looks normal and says nothing about the data.
    # Missing values are dropped so they cannot disturb the reference line.
    plt.clf()
    observed = df['new_deaths'].dropna()
    qqplot(observed, line='s')
    plt.title('Biểu đồ phân phối chuẩn của new_deaths')
    kiemDinh = renderMatplotlib(plt)

    data = {
        "uri": uri,
        "describe": describes,
        "doPhanTan": doPhanTan,
        "mucDo": mucDo,
        "hist": hist,
        "kiemDinh": kiemDinh
    }
    return render(request, 'components/newDead.html', {"data": data})
示例#13
0
def normality_test(ts):  # Completed
    """
    Performs a series of hypothesis tests about normality
    on the time series data distribution. Besides
    the result of the statistical test, this also includes
    a quantile plot of the data (qqplot).

    The three tests are Shapiro-Wilk (``shapiro``), D'Agostino-Pearson
    (``normaltest``) and Anderson-Darling (``anderson``).

    Note: these tests can still produce inconsistencies if the data set
    (size) is too small to detect non-normality.

    The input is converted with ``ts_to_list`` for the tests and the value
    returned is ``list_to_ts`` applied to that list.
    """
    ts = ts_to_list(ts)
    data = np.array(ts)
    # Shapiro-Wilk test.
    # Rejects the hypothesis of normality when the p-value is below 0.05,
    # i.e. not from a normal distribution.
    stat_sw, p_sw = shapiro(data)  # (1) Normality test
    # D'Agostino-Pearson test: this is what scipy's ``normaltest`` runs.
    # It was previously reported as Kolmogorov-Smirnov, which it is not.
    # If p < .05 we can reject the null, meaning our sample
    # distribution is not a normal distribution.
    stat_dp, p_dp = normaltest(data)  # (2) Normality test
    # Anderson-Darling: rather than a p-value, we're given an array of
    # critical values where the hypothesis can be rejected.
    stat_ad = anderson(data)  # (3) Normality test
    # Print results of all 3 tests
    print(f'\nShapiro-Wilk Statistic Test Result: {stat_sw:.3f}')
    print(f'P-value: {p_sw}: ', end='')
    # Check if (SW) from normal distribution or not.
    if p_sw < 0.05:
        print("Null Hypothesis Rejected. Not from normal distribution.\n")
    else:
        print("Accepted Null Hypothesis.\n")
    print(f"D'Agostino-Pearson Statistic Test Result: {stat_dp:.3f}")
    # The ': ' separator was missing here, gluing the p-value to the verdict.
    print(f'P-value: {p_dp}: ', end='')
    # Check if (DP) from normal distribution or not.
    if p_dp < 0.05:
        print("Null Hypothesis Rejected. Not from normal distribution.\n")
    else:
        print("Accepted Null Hypothesis. Can occurs if data set is too small.")
    print(f'Anderson-Darling Statistic Test Result: {stat_ad.statistic}')
    # Check if (AD) from normal distribution or not, one level at a time.
    for st, cv in zip(stat_ad.significance_level, stat_ad.critical_values):
        if stat_ad.statistic < cv:
            print(f'{st:.3f}: {cv:.3f}: Accepted. From normal distribution')
        else:
            print(f'{st:.3f}: {cv:.3f}: Rejected. Data not normal')
    # Plots a standardized line, scaled by the SD of the time series.
    qqplot(data, line='s')
    plt.show()

    ts = list_to_ts(ts)
    return ts
def residooMultipleRegression(Y, A, B1, B2, B3, B4, x1, x2, x3, x4):
    """Show a Q-Q plot of the residuals of a four-regressor linear model.

    The prediction is ``A + B1*x1 + B2*x2 + B3*x3 + B4*x4`` and the residual
    is ``Y`` minus that prediction.

    Parameters
    ----------
    Y : array-like
        Observed values (the ChangeNominal/Real series).
    A : float
        Intercept.
    B1, B2, B3, B4 : float
        Coefficients of ``x1`` .. ``x4``.
    x1, x2, x3, x4 : array-like
        Regressors; only their first ``len(Y)`` entries are used.
    """
    observed = numpy.asarray(Y, dtype=float)
    n_obs = observed.shape[0]

    # Fixes relative to the previous element-wise loop:
    #  * the last term was ``B4 + x4[i]`` instead of a product;
    #  * the intercept ``A`` was never added;
    #  * the loop stopped at a global ``T - 1`` and left the remaining
    #    ``empty_like`` slots uninitialised, and ``empty_like`` inherited an
    #    integer dtype from ``Y`` when ``Y`` was integer-valued.
    predicted = numpy.full(n_obs, float(A))
    for coefficient, regressor in ((B1, x1), (B2, x2), (B3, x3), (B4, x4)):
        predicted += coefficient * numpy.asarray(regressor, dtype=float)[:n_obs]

    residual = observed - predicted
    qqplot(residual, line='s')
    plt.show()
示例#15
0
def QQ_plot(data):
    """Show a Q-Q plot of ``data`` with a standardized reference line."""
    # Function-scope imports; the unused ``seed`` / ``randn`` imports that
    # used to sit here were dropped.
    from statsmodels.graphics.gofplots import qqplot
    from matplotlib import pyplot

    qqplot(data, line='s')
    pyplot.show()
示例#16
0
def spatial_QQ_plots(ALLdata, timestamps):
    """
    Draw a grid of Q-Q plots: one row per timestamp, one column per variable
    (temp, direction, speed, solar).  Each panel's x-label carries the
    Shapiro p-value, which is also printed.

    INPUTS
        ALLdata - object whose ``WSdata`` attribute is an iterable of
                  stations; each station's ``data_binned`` is indexed with
                  ``.loc[ts]`` and must provide the entries 'temp:', 'dir:',
                  'speed:' and 'solar:'
        timestamps - sequence of labels looked up in ``data_binned``
    """

    labels = ['temp', 'direction', 'speed', 'solar']

    num_times = len(timestamps)

    # squeeze=False keeps ``axes`` two-dimensional even for one timestamp;
    # otherwise ``axes[row, col]`` below fails on the squeezed 1-D array.
    fig, axes = plt.subplots(num_times,
                             4,
                             figsize=(13, int(np.round(num_times * 13 / 4))),
                             dpi=80,
                             facecolor='w',
                             edgecolor='k',
                             squeeze=False)

    for row, ts in enumerate(timestamps):
        temp = []
        wind_dir = []
        wind_speed = []
        solar = []

        # One reading per station for this timestamp.
        for station in ALLdata.WSdata:
            df = station.data_binned.loc[ts]
            temp.append(df['temp:'])
            wind_dir.append(df['dir:'])
            wind_speed.append(df['speed:'])
            solar.append(df['solar:'])

        # Sorted column vectors, in the same order as ``labels``.
        lst = [
            np.matrix(sorted(temp)).T,
            np.matrix(sorted(wind_dir)).T,
            np.matrix(sorted(wind_speed)).T,
            np.matrix(sorted(solar)).T
        ]

        for col, vals in enumerate(lst):
            ax = axes[row, col]
            qqplot(vals, line='s', ax=ax)
            ax.set_ylabel('')
            k2, p = sps.shapiro(vals)
            p_str = '{:.2f}'.format(p)
            print('timestamp:{}, var:{}, p: {}'.format(ts, labels[col], p_str))
            ax.set_xlabel('Shap p:{}'.format(p_str))
            # Only the first column names the timestamp and only the first
            # row names the variable.
            if col == 0:
                ax.set_ylabel(ts)
            if row == 0:
                ax.title.set_text(labels[col])

    fig.tight_layout(pad=1.1)
示例#17
0
 def test_qqplot_pltkwargs(self, close_figures):
     """Call ``qqplot`` on ``self.res`` with extra marker styling keywords."""
     marker_style = {
         "marker": "d",
         "markerfacecolor": "cornflowerblue",
         "markeredgecolor": "white",
         "alpha": 0.5,
     }
     qqplot(self.res, line="r", **marker_style)
示例#18
0
def explore_series(df, columns_explore, plot_lags=20):
    """Plot five diagnostics per selected column and tabulate ADF results.

    For every column of ``df`` that is listed in ``columns_explore`` the
    figure gets one grid column holding, top to bottom: the time history,
    a histogram, a Q-Q plot, the autocorrelation and the partial
    autocorrelation (both with ``plot_lags`` lags).

    Returns
    -------
    (df_out, fig_out)
        ``df_out`` has one column per explored series and the rows
        'ADF_Statistic', 'p-value', 'Critical_1percent',
        'Critical_5_percent', 'Critical_10_percent'; ``fig_out`` is the
        figure.
    """
    fig_out = plt.figure(figsize=(22, 24))
    grid_shape = (5, len(columns_explore))

    # Accumulators for the adfuller outputs, one entry per explored column.
    adf_stats = []
    p_values = []
    crit_1 = []
    crit_5 = []
    crit_10 = []
    explored = []

    # Columns are visited in dataframe order; ``slot`` is the grid column.
    selected = [name for name in df if name in columns_explore]
    for slot, name in enumerate(selected):
        series = df[name]

        # time history plot
        history_ax = plt.subplot2grid(grid_shape, (0, slot))
        series.plot(ax=history_ax, title=name)

        # histogram
        hist_ax = plt.subplot2grid(grid_shape, (1, slot))
        series.hist(ax=hist_ax)

        # qqplot to check normality
        qq_ax = plt.subplot2grid(grid_shape, (2, slot))
        qqplot(series, line='r', ax=qq_ax)

        # autocorrelation plot
        acf_ax = plt.subplot2grid(grid_shape, (3, slot))
        plot_acf(series.dropna(), lags=plot_lags, ax=acf_ax)

        # partial autocorrelation plot
        pacf_ax = plt.subplot2grid(grid_shape, (4, slot))
        plot_pacf(series.dropna(), lags=plot_lags, ax=pacf_ax)

        # run adfuller test and keep the pieces that are reported
        adf = adfuller(series.dropna())
        critical = adf[4]
        adf_stats.append(adf[0])
        p_values.append(adf[1])
        crit_1.append(critical['1%'])
        crit_5.append(critical['5%'])
        crit_10.append(critical['10%'])
        explored.append(name)

    # create dataframe for the adfuller results
    row_names = [
        'ADF_Statistic', 'p-value', 'Critical_1percent',
        'Critical_5_percent', 'Critical_10_percent'
    ]
    df_out = pd.DataFrame(columns=explored, index=row_names)
    rows = (adf_stats, p_values, crit_1, crit_5, crit_10)
    for position, values in enumerate(rows):
        df_out.iloc[position] = values

    return df_out, fig_out
示例#19
0
def isGaussian(data):
    """Run visual and statistical normality checks on ``data``.

    In order: histogram, Q-Q plot, Shapiro-Wilk test, D'Agostino's K^2 test
    and Anderson-Darling test.  Every result is printed; nothing is returned.
    """
    def interpret(p_value, alpha=0.05):
        # Print the verdict of a p-value based test at level ``alpha``.
        if p_value > alpha:
            print('Sample looks Gaussian (fail to reject H0)')
        else:
            print('Sample does not look Gaussian (reject H0)')

    # Histogram plot
    print("Histogram plot -------------------------------------")
    pyplot.hist(data)
    pyplot.show()

    # Q-Q plot
    print("QQ plot -------------------------------------")
    from statsmodels.graphics.gofplots import qqplot
    qqplot(data, line='s')
    pyplot.show()

    # Shapiro-Wilk test
    print("Shapiro-Wilk test -------------------------------------")
    from scipy.stats import shapiro
    stat, p = shapiro(data)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    interpret(p)

    # D'Agostino's K^2 test
    print("D'Agostino's K^2 test -------------------------------------")
    from scipy.stats import normaltest
    stat, p = normaltest(data)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    interpret(p)

    # Anderson-Darling test: compared against each critical value in turn.
    print("Anderson-Darling test -------------------------------------")
    from scipy.stats import anderson
    result = anderson(data)
    print('Statistic: %.3f' % result.statistic)
    for sl, cv in zip(result.significance_level, result.critical_values):
        if result.statistic < cv:
            print('%.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
        else:
            print('%.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))
示例#20
0
def qq(a, logvalue, lognext, interval, NCHANGES):
    """Q-Q plot and Shapiro-Wilk test of the centralized regression values.

    ``regression`` is called with all five arguments and returns
    ``V, A, r, sigma``; the values examined are ``V[k] - r * A[k]`` for the
    first ``NCHANGES`` indices.

    Returns
    -------
    (s1, s2)
        The Shapiro-Wilk statistic and p-value.
    """
    V, A, r, sigma = regression(a, logvalue, lognext, interval, NCHANGES)
    centralized = numpy.array([V[k] - r * A[k] for k in range(NCHANGES)])
    # Run the test once and unpack; it used to be computed twice, once per
    # returned component.
    s1, s2 = stats.shapiro(centralized)
    qqplot(centralized, line='r')
    pyplot.show()
    return s1, s2
def plot_qqplot(
    other_args: List[str],
    ticker: str,
    model_name: str,
    residuals: List[float],
):
    """Draw a Q-Q plot of model residuals against the normal distribution

    Any exception raised while parsing or plotting is printed and swallowed.

    Parameters
    ----------
    other_args : List[str]
        Command line arguments to be processed with argparse
    ticker : str
        Ticker of the stock
    model_name : str
        Model fitting name in use
    residuals : List[float]
        Residuals data
    """
    qq_parser = argparse.ArgumentParser(
        add_help=False,
        prog="qqplot",
        description="""
            Qqplot time series against a standard normal curve
        """,
    )

    try:
        parsed = parse_known_args_and_warn(qq_parser, other_args)
        if not parsed:
            return

        plt.figure(
            figsize=plot_autoscale(),
            dpi=PLOT_DPI,
            constrained_layout=True,
        )
        target_axes = plt.gca()

        qqplot(
            residuals,
            stats.distributions.norm,
            fit=True,
            line="45",
            ax=target_axes,
        )

        # Decorate the current axes.
        plt.title(f"Q-Q plot residuals from {model_name} on {ticker}")
        plt.xlabel("Theoretical quantiles")
        plt.ylabel("Sample quantiles")
        plt.grid(True)

        if gtff.USE_ION:
            plt.ion()

        plt.show()
        print("")

    except Exception as e:
        print(e, "\n")
        return
示例#22
0
def display_qqplot(
    name: str,
    df: pd.DataFrame,
    target: str,
    external_axes: Optional[List[plt.Axes]] = None,
):
    """Show QQ plot for data against normal quantiles

    Parameters
    ----------
    name : str
        Stock ticker
    df : pd.DataFrame
        Dataframe
    target : str
        Column in data to look at
    external_axes : Optional[List[plt.Axes]], optional
        External axes (1 axis is expected in the list), by default None
    """
    data = df[target]

    # This plot has 1 axis
    if external_axes is None:
        _, ax = plt.subplots(
            figsize=plot_autoscale(),
            dpi=PLOT_DPI,
        )
    else:
        if len(external_axes) != 1:
            logger.error("Expected list of one axis item.")
            # "\n" -- the message used to end in a literal "/n".
            console.print("[red]Expected list of 1 axis items.\n[/red]")
            return
        (ax, ) = external_axes

    # Statsmodels has a UserWarning for marker kwarg -- which we don't use.
    # Silence it only around the call: a bare filterwarnings() would keep
    # hiding every UserWarning for the rest of the process.
    with warnings.catch_warnings():
        warnings.filterwarnings(category=UserWarning, action="ignore")
        qqplot(
            data,
            stats.distributions.norm,
            fit=True,
            line="45",
            color=theme.down_color,
            ax=ax,
        )
    # Recolour the second line drawn on the axes.
    ax.get_lines()[1].set_color(theme.up_color)

    ax.set_title(f"Q-Q plot for {name} {target}")
    ax.set_ylabel("Sample quantiles")
    ax.set_xlabel("Theoretical quantiles")

    theme.style_primary_axis(ax)

    # Only render when this function created the figure itself.
    if external_axes is None:
        theme.visualize_output()
    def qqplots(self, epoch, sub=None):
        """Draw one fitted Q-Q plot per group for the given epoch.

        ``epoch`` must lie between 0 and ``self.epochs`` (checked with an
        assertion).  ``sub`` lists the group indices to plot; when falsy,
        every index in ``range(self.masks)`` is used.
        """
        assert 0 <= epoch <= self.epochs, 'epoch must be between 0 and %.0f, got %.0f' % (
            self.epochs, epoch)
        groups = sub if sub else range(self.masks)

        for group in groups:
            samples = self.data[group, :, epoch]
            gofplots.qqplot(samples, fit=True, line='45')
            heading = "%s, %s, Epoch %.0f" % (self.name, self.labels[group], epoch)
            plt.suptitle(heading, fontsize=18)
    def plot_QQ_plot(self, series_values):
        """Show a Q-Q plot of ``series_values`` with a standardized line.

        Exceptions from importing or plotting propagate to the caller.
        """
        # The surrounding ``try/except Exception: raise`` only re-raised the
        # same exception and was removed, together with the unused
        # ``seed`` / ``randn`` imports.
        from statsmodels.graphics.gofplots import qqplot
        from matplotlib import pyplot

        qqplot(series_values, line='s')
        pyplot.show()
def residuals_eval(y_true, y_pred):
    """Describe and visualise the residuals ``y_true - y_pred``.

    Prints summary statistics, then shows four figures one after another:
    line plot, histogram, kernel density estimate and Q-Q plot.
    """
    residuals = y_true - y_pred
    frame = DataFrame(residuals)
    print(frame.describe())

    # Line plot, histogram and density plot, each shown on its own.
    renderers = (
        frame.plot,
        frame.hist,
        lambda: frame.plot(kind='kde'),
    )
    for render in renderers:
        render()
        pyplot.show()

    qqplot(numpy.array(residuals), line='r')
    pyplot.show()
def residuals_charts(test_dataset, test_output):
    """Build residuals charts for one experiment.

    The prediction is the mean of ``test_output`` over axis 0; the residuals
    are prediction minus ``test_dataset.y_data``.  Draws a 30-bin histogram,
    prints the ``normaltest`` result, then draws a Q-Q plot with an identity
    line and the ACF / PACF plots with 30 lags.
    """
    targets = test_dataset.y_data
    predictions = test_output.mean(axis=0).reshape(test_output.shape[1])
    residuals = predictions - targets.reshape(targets.shape[0])

    pd.Series(residuals).hist(bins=30)
    print(stats.normaltest(residuals))

    gofplots.qqplot(residuals)
    # Identity line from -3.5 to 3.5, drawn on the current axes.
    diagonal = np.linspace(-3.5, 3.5, 4)
    pyplot.plot(diagonal, diagonal)

    for correlogram in (tsaplots.plot_acf, tsaplots.plot_pacf):
        correlogram(residuals, lags=30)
示例#27
0
 def residual_plot(self):
     '''
     Plot the residual and save it to current directory

     The left panel shows binned residual counts (log scale) against a
     fitted normal curve with its central 95% shaded; the right panel is a
     Q-Q plot.  The figure is written to "residual_plots.png".
     '''
     import matplotlib.pyplot as plt
     from scipy.stats import norm
     from statsmodels.graphics.gofplots import qqplot

     residual = np.asarray(self._residual)
     n_obs = residual.shape[0]
     mu = np.mean(residual)
     sigma = np.std(residual)

     # set the size of the plot
     plt.figure(figsize=(16, 9))
     # plot the distribution
     ax = plt.subplot(121)
     # Bin centres every 0.5 from -5.75 to 5.75; a residual belongs to the
     # bin whose centre c satisfies c - 0.25 <= r < c + 0.25.  Counting is
     # vectorised instead of looping over every residual for every bin.
     centers = np.arange(-5.75, 6, step=0.5)
     counts = np.array([
         np.count_nonzero((residual >= c - 0.25) & (residual < c + 0.25))
         for c in centers
     ])
     # create a normal distribution reference
     xx = np.linspace(-3, 3, 100)
     normal = norm.pdf(xx, mu, sigma)
     normalcdf = norm.cdf(xx, mu, sigma)
     # First grid index where the CDF reaches 2.5% / 97.5%.  searchsorted
     # returns len(xx) when a threshold is never reached, so the slices below
     # stay valid; the old loop left ``low``/``high`` unbound in that case.
     low = int(np.searchsorted(normalcdf, 0.025, side='left'))
     high = int(np.searchsorted(normalcdf, 0.975, side='left'))
     # plot the distribution
     plt.plot(centers, counts, 'o', label="residual")
     plt.plot(xx, normal * n_obs, '--', label="normal")
     plt.fill_between(xx[low:high],
                      0,
                      normal[low:high] * n_obs,
                      alpha=.3,
                      facecolor="grey",
                      label="95% normal")
     plt.xlim([-6, 6])
     ax.set_ylim(bottom=5)
     ax.legend()
     plt.title("Distribution of the residual")
     plt.yscale('log')
     # plot the QQ plot
     ax = plt.subplot(122)
     qqplot(self._residual, line='s', ax=ax)
     plt.xlim([-3.5, 3.5])
     plt.ylim([-5, 5])
     plt.title("residual QQ plot")
     plt.savefig("residual_plots.png")
示例#28
0
def plot_Model_Identify(DataSet, frequency=1, acf_lag=12, pacf_lag=12):
    """
    Show a 3x4 grid of model-identification plots for a time series.

    DataSet : dataframe; the first column is the series that is analysed
    frequency : int, Seasonal Component period (in time step)
    acf_lag, pacf_lag : int, number of lags for the ACF / PACF panels
    """
    fig, ax = plt.subplots(3, 4)
    series = DataSet.iloc[:, 0]

    # Column 0: observed data, autocorrelation, Q-Q plot.
    observed_ax = ax[0, 0]
    DataSet.plot(ax=observed_ax)
    observed_ax.set_title('Observed Value')
    observed_ax.set_xlabel("")

    autocorrelation_plot(series, ax=ax[1, 0])
    ax[1, 0].set_title('Autocorrelation')

    qqplot(series, ax=ax[2, 0])
    ax[2, 0].set_title('Q-Q Plot')

    # Column 1: lag plot, ACF, PACF.
    lag_ax = ax[0, 1]
    lag_plot(series, ax=lag_ax)
    lag_ax.set_title('Lag Plot')
    lag_ax.set_ylabel("")
    lag_ax.set_xlabel("")

    tsa.plot_acf(series, ax=ax[1, 1], lags=acf_lag, alpha=0.05)
    ax[1, 1].set_title('ACF')

    tsa.plot_pacf(series, ax=ax[2, 1], lags=pacf_lag, alpha=0.05)
    ax[2, 1].set_title('PACF')

    # Columns 2-3: seasonal decomposition components.
    decomposition = sm.tsa.seasonal_decompose(series, freq=frequency)
    residual_part = decomposition.resid
    residual_part.plot(ax=ax[0, 2])
    residual_part.plot(ax=ax[0, 3], kind='kde')
    decomposition.seasonal.plot(ax=ax[1, 2])
    decomposition.trend.plot(ax=ax[2, 2])

    panel_titles = (
        ((0, 2), 'Residual'),
        ((1, 2), 'Seasonal'),
        ((2, 2), 'Trend'),
        ((0, 3), 'Residual Prob. Distrib'),
    )
    for cell, title in panel_titles:
        ax[cell].set_title(title)

    plt.show()
示例#29
0
def correctLin(x, y):
    """Fit ``y`` on ``x`` by linear regression and inspect the residuals.

    Prints the regression result, shows a Q-Q plot of the residuals and
    prints their Shapiro-Wilk and Jarque-Bera p-values.

    Returns
    -------
    (residuals, slope, intercept, stderr)
        ``stderr`` is ``sqrt(sum(residuals**2) / (n - 2))`` where ``n`` is
        the size of ``x``.
    """
    n = numpy.size(x)
    fit = stats.linregress(x, y)
    slope = fit.slope
    intercept = fit.intercept
    print(fit)

    residuals = numpy.array(
        [y[k] - x[k] * slope - intercept for k in range(n)])
    squared_sum = numpy.dot(residuals, residuals)
    stderr = math.sqrt((1 / (n - 2)) * squared_sum)

    qqplot(residuals, line='r')
    pyplot.show()

    print('Shapiro-Wilk p = ', stats.shapiro(residuals)[1])
    print('Jarque-Bera p = ', stats.jarque_bera(residuals)[1])
    return (residuals, slope, intercept, stderr)
示例#30
0
def simpleLin(x, y):
    """Fit ``y = k * x`` (no intercept) and inspect the residuals.

    Shows a Q-Q plot and a line plot of the residuals and prints their
    Shapiro-Wilk result.

    Returns
    -------
    (k, stderr)
        The slope ``dot(x, y) / dot(x, x)`` and the standard deviation of
        the residuals.
    """
    xs = numpy.array(x)
    ys = numpy.array(y)
    slope = numpy.dot(xs, ys) / numpy.dot(xs, xs)
    residuals = ys - slope * xs
    stderr = numpy.std(residuals)

    qqplot(residuals, line='r')
    pyplot.show()
    pyplot.plot(residuals)
    pyplot.show()

    print('normality', stats.shapiro(residuals))
    return (slope, stderr)
示例#31
0
def plot_regress_analysis(model, influence=True, annotate=True):
    """Draw the regression diagnostic panels for a fitted model.

    Panels on a 3x2 grid: residuals vs fitted, Q-Q plot of the Pearson
    residuals, scale-location, residuals vs leverage and, when ``influence``
    is true, an influence plot spanning the last row.  ``annotate`` is
    forwarded to the leverage plot.
    """
    def quadratic_trend(x_values, y_values):
        # Degree-2 least-squares polynomial through the points, as a callable.
        return np.poly1d(np.polyfit(x_values, y_values, 2))

    plt.figure(figsize=(15, 16))
    layout = (3, 2)

    # Residuals vs Fitted
    resid_ax = plt.subplot2grid(layout, (0, 0))
    resid_ax.set_title("Residuals vs Fitted")
    resid_ax.set_xlabel('Fitted values')
    resid_ax.set_ylabel('Residuals')
    fitted = model.predict()
    residuals = model.resid
    resid_ax.plot(fitted, residuals, marker='.', linestyle='')

    # Quadratic trend; the grid runs in unit steps from the smallest fitted
    # value and always ends on the largest one.
    trend = quadratic_trend(fitted, residuals)
    max_fitted = np.max(fitted)
    xs = np.append(np.arange(np.min(fitted), max_fitted), max_fitted)
    resid_ax.plot(xs, trend(xs), linewidth=2.5)

    # Q-Q plot
    qq_ax = plt.subplot2grid(layout, (0, 1))
    qq_ax.set_title("Q-Q")
    qqplot(model.resid_pearson, dist="norm", line='r', ax=qq_ax)

    # Scale-Location
    scale_ax = plt.subplot2grid(layout, (1, 0))
    scale_ax.set_title("Scale-Location")
    scale_ax.set_xlabel('Fitted values')
    scale_ax.set_ylabel('$|$Normalized residuals$|^{1/2}$')
    std_residuals = np.sqrt(np.abs(model.resid_pearson))
    scale_ax.plot(fitted, std_residuals, linestyle='', marker='.')

    # Quadratic trend of the scale-location points on the same grid.
    scale_trend = quadratic_trend(fitted, std_residuals)
    scale_ax.plot(xs, scale_trend(xs), linewidth=2.5)

    # Residuals vs Leverage
    leverage_ax = plt.subplot2grid(layout, (1, 1))
    plot_leverage_resid2(model, leverage_ax, annotate=annotate)

    # Influence plot
    if influence:
        influence_ax = plt.subplot2grid(layout, (2, 0), colspan=2)
        influence_plot(model, ax=influence_ax)
示例#32
0
    def env_corr(self, env_vars, coeff_plot=False, qq_plot=False):
        """
        Determine correlations with environmental/non-discretionary variables
        using an ordinary least squares regression of 'Efficiency' on the
        columns of ``env_vars``. The regression summary is printed.

        Takes:
            env_vars: A pandas dataframe of environmental variables
            coeff_plot: when true, draw the regression coefficients and
                        their standard errors
            qq_plot: when true, draw a Q-Q plot of the residuals

        Returns:
            corr_res: the fitted statsmodels results of the OLS model.

        Note that there can be no spaces in the variables' names.
        """

        import matplotlib.pyplot as plt
        from statsmodels.regression.linear_model import OLS
        from statsmodels.graphics.gofplots import qqplot

        env_data = _to_dataframe(env_vars)
        corr_data = env_data.join(self['Efficiency'])
        # Built once; the coefficient plot uses the same formula.
        formula = "Efficiency ~ " + " + ".join(env_vars.columns)
        corr_mod = OLS.from_formula(formula, corr_data)
        corr_res = corr_mod.fit()

        #plot coeffs
        if coeff_plot:
            # Imported only when requested, so a seaborn installation without
            # ``coefplot`` no longer breaks calls with coeff_plot=False.
            # NOTE(review): confirm which seaborn releases provide coefplot.
            from seaborn import coefplot

            coefplot(formula, data=corr_data)
            plt.xticks(rotation=45, ha='right')
            plt.title('Regression coefficients and standard errors')

        #plot qq of residuals
        if qq_plot:
            qqplot(corr_res.resid, line='s')
            plt.title('Distribution of residuals')

        print(corr_res.summary())

        return corr_res
def draw_figures():
    """Build a 4x2 figure of histograms and normal Q-Q plots from bdims.csv.

    The left column holds histograms of four standardized measurements
    (three from the rows with ``sex == 0``, age from all rows).  The right
    column holds their Q-Q plots, placed on different rows than their
    histograms and labelled A-D.

    Returns
    -------
    The figure, sized 12 x 12 inches.
    """
    bdims = pd.read_csv("bdims.csv")
    fdims = bdims[bdims["sex"] == 0]

    fig, plots = plt.subplots(4, 2)

    biidi = standardize(fdims["bii.di"])
    elbdi = standardize(fdims["elb.di"])
    age = standardize(bdims["age"])
    chede = standardize(fdims["che.de"])

    # (values, histogram bins, histogram title) per row of the left column.
    histograms = [
        (biidi, range(-4, 4), "Histogram of female biiliac diameter"),
        (elbdi, range(-3, 5), "Histogram of female elbow diameter"),
        (age, range(-2, 5), "Histogram of general age"),
        (chede, range(-2, 6), "Histogram of female chest depth"),
    ]
    for row, (values, bins, _) in enumerate(histograms):
        plots[row][0].hist(values, bins=bins)
    for row, (_, _, title) in enumerate(histograms):
        plots[row][0].set_title(title)

    # Each Q-Q plot goes to the row listed here, not its histogram's row.
    qq_rows = [(biidi, 1), (elbdi, 2), (age, 3), (chede, 0)]
    for values, row in qq_rows:
        qqplot(values, ax=plots[row][1], line="q")

    for row, letter in enumerate("ABCD"):
        plots[row][1].set_title("Normal Q-Q Plot " + letter)

    for row in range(0, 4):
        plots[row][0].set_xlabel("standarized data")
        plots[row][0].set_ylabel("frequency")

    fig.set_size_inches(12, 12)
    plt.tight_layout()

    return fig
示例#34
0
        print (final_test_loss)


# For every folder: print its name, then the last break-even point of each
# entry in data[folder], which is also appended to pr[folder].
# (Python 2 print statements are used below.)
for folder in folders:
    print
    print folder
    for d in data[folder]:
        breakeven_points = util.find_breakeven(d["curve"])
        pr[folder].append(breakeven_points[-1])
        print (breakeven_points[-1])

print ("Loss samples t test")
# Random samples from normal
# NOTE(review): the mean comes from folders[0] but the standard deviation from
# folders[1] -- confirm that this mix is intended.
s = np.random.normal(np.mean(lc[folders[0]]), np.std(lc[folders[1]]), 100)
print ("Random samples", scipy.stats.shapiro(s))
fig = qqplot(s, scipy.stats.norm, fit=True, line="45")
plt.show()

# First folder figures: Q-Q plot, then the Shapiro result
fig = qqplot(np.array(lc[folders[0]]), scipy.stats.norm, fit=True, line="45")
plt.show()
print (folders[0], scipy.stats.shapiro(np.array(lc[folders[0]])))

# Second folder figures: Q-Q plot, then the Shapiro result
fig = qqplot(np.array(lc[folders[1]]), scipy.stats.norm, fit=True, line="45")
plt.show()
print (folders[1], scipy.stats.shapiro(np.array(lc[folders[1]])))

# Compare the two folders' values in lc with perform_welchs_test and report
# the statistic and p-value it returns.
tstat, pval = perform_welchs_test(lc[folders[0]], lc[folders[1]])
print ("t-statistics = {}".format(tstat))
print ("p-value = {}".format(pval))
示例#35
0
文件: CAPM.py 项目: GBelzoni/BigGits
# Exploratory script: cumulative impulse responses, forecast-error variance
# decomposition, causality/normality tests and a residual Q-Q plot for the
# fitted model held in `results`.
irf.plot_cum_effects(orth=False)
plt.show()
fevd = results.fevd(1)
fevd.summary()
fevd.plot()
plt.show()
# The return values of the three tests below are discarded.
results.test_causality('DJIA', ['SP500'],kind='f')
results.test_causality('SP500', ['DJIA'],kind='f')
results.test_normality(signif=0.05,verbose=False)

# Row-wise sum of the residuals, plotted and checked with a Q-Q plot.
resids = results.resid.sum(axis=1)
resids.plot()
plt.show()
from statsmodels.graphics.gofplots import qqplot, qqline

qqplot(data=resids,line='s')#, dist, distargs, a, loc, scale, fit, line, ax)
plt.show()
from statsmodels.sandbox.tsa.garch import Garch
# Second column of rets_bm; the Garch object built from it is discarded.
rets1 = rets_bm.iloc[:,1]
Garch(rets1)

#Getting GARCH working -using RPY2
import rpy2
rpy2.__version__

t =datetime(2013,1,5)
from pandas.tseries.offsets import MonthEnd, BusinessMonthBegin

# Bare comparison: its result is not stored or printed.
(t + 2*BusinessMonthBegin())> datetime(2013,1,1)

示例#36
0
				ma['teams'][0],
				ma['teams'][1],
				ma['score'][0],
				ma['score'][1],
				fitted[mai],
		)
		mfile.write(towrite)
		mfile.write('\n')

# Report the standard deviation of the residuals, framed by blank lines
# (Python 2 print statements).
print
print 'residual_std: %.10f' % (resid.std())
print

# Optional diagnostics, each switched on by its own module-level flag.
if PLOT_RESIDUAL_QQ:
	# Q-Q plot of the residuals, with fit=True.
	import statsmodels.graphics.gofplots as sgg
	sgg.qqplot(resid, fit=True)

if PLOT_RESIDUAL_HIST:
	import pylab
	import scipy.stats as ss

	# Density histogram of the residuals, drawn as bars at the bin centres.
	freqs, lefts = np.histogram(resid, bins = 'auto', density = True)
	centers = (lefts[:-1] + lefts[1:]) / 2
	pylab.bar(centers, freqs, width = centers[1] - centers[0])

	# Normal distribution fitted to the residuals, drawn over the bars.
	empirical_dist = ss.norm(*(ss.norm.fit(resid)))
	pylab.plot(centers, [empirical_dist.pdf(x) for x in centers], 'g-', linewidth = 5)
	
if PLOT_SCATTER_TOT_EXP_RESIDUALS:
	# Scatter of the residuals against tot_exps.
	import pylab
	pylab.scatter(tot_exps, resid, marker = '.', s = 1)