def get_univariate_dist(
    data, kernel="gau", fft=True, bw="scott", gridsize=100, cut=3, clip=None
):
    """Estimate a univariate kernel density with statsmodels.

    Parameters
    ----------
    data : array-like
        Observations handed to ``smnp.KDEUnivariate``.
    kernel, fft, bw, gridsize, cut
        Forwarded unchanged to ``KDEUnivariate.fit``.
    clip : pair of floats or None
        Lower and upper bounds forwarded to ``fit``.  ``None`` (the default)
        means "no clipping".

    Returns
    -------
    grid, y
        ``kde.support`` and ``kde.density`` of the fitted estimator.
    """
    # ``fit`` indexes ``clip`` as a (low, high) pair, so the ``None`` default
    # has to be expanded before it is passed along.
    if clip is None:
        clip = (-np.inf, np.inf)
    kde = smnp.KDEUnivariate(data)
    kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)
    return kde.support, kde.density
def _statsmodels_univariate_kde(data, kernel, bw, gridsize, cut, clip,
                                cumulative=False):
    """Fit a statsmodels univariate KDE and return its support and curve.

    Returns ``(support, cdf)`` when ``cumulative`` is true, otherwise
    ``(support, density)``.  When the fit raises ``RuntimeError`` and the
    data has zero interquartile range, a ``UserWarning`` is issued and two
    empty arrays are returned; otherwise the error is re-raised.
    """
    # Cast up front: statsmodels 0.8 fails on int type data
    values = data.astype(np.float64)

    use_fft = kernel == "gau"
    estimator = smnp.KDEUnivariate(values)
    try:
        estimator.fit(kernel, bw, use_fft,
                      gridsize=gridsize, cut=cut, clip=clip)
    except RuntimeError as err:  # GH#1990
        if stats.iqr(values) > 0:
            raise err
        msg = "Default bandwidth for data is 0; skipping density estimation."
        warnings.warn(msg, UserWarning)
        return np.array([]), np.array([])

    support = estimator.support
    curve = estimator.cdf if cumulative else estimator.density
    return support, curve
def kernel_density_estimation(univariate_dataset, k="gau", bw=1):
    """Fit a KDE with kernel ``k`` and bandwidth ``bw``.

    The FFT code path is requested only when ``k`` is ``"gau"``.
    Returns the estimator's ``(support, density)`` pair.
    """
    estimator = smnp.KDEUnivariate(univariate_dataset)
    estimator.fit(k, bw, k == "gau")
    return estimator.support, estimator.density
def mode(data):
    """Compute a kernel density estimate and return the mode.

    Parameters
    ----------
    data : numpy array or pandas Series
        One-dimensional numeric observations (must support ``astype``
        when more than one distinct value is present).

    Returns
    -------
    The single distinct value when the data is constant; otherwise the
    support point at which the fitted density is largest (the first such
    point if the maximum is attained more than once).
    """
    distinct = np.unique(data)
    if len(distinct) == 1:
        # Return the value itself rather than ``data[0]``: positional ``[0]``
        # is a label lookup on a pandas Series and fails when label 0 is
        # absent from the index.
        return distinct[0]

    kde = smnp.KDEUnivariate(data.astype('double'))
    kde.fit(cut=0)
    grid, y = kde.support, kde.density
    return grid[np.argmax(y)]
def statsmodels_univariate_kde(data, kernel, bw, gridsize, cut, clip):
    """Fit a statsmodels univariate KDE and return ``(support, density)``.

    ``clip=None`` is interpreted as an unbounded ``(-inf, inf)`` range, and
    the FFT code path is requested only for the ``"gau"`` kernel.
    """
    bounds = (-np.inf, np.inf) if clip is None else clip
    estimator = smnp.KDEUnivariate(data)
    estimator.fit(kernel, bw, kernel == "gau",
                  gridsize=gridsize, cut=cut, clip=bounds)
    return estimator.support, estimator.density
def get_ymax(self, data):
    """Return the largest value of the default-fitted KDE density of ``data``.

    Yields 0 when every entry of ``data`` is NaN, or when the maximum of the
    density is itself NaN.
    """
    if np.isnan(data).all():
        return 0

    estimator = smnp.KDEUnivariate(data)
    estimator.fit()
    peak = np.nanmax(estimator.density)
    return 0 if math.isnan(peak) else peak
def _statsmodels_univariate_kde(data, kernel, bw, gridsize, cut, clip,
                                cumulative=False):
    """Fit a statsmodels univariate KDE and return its support and curve.

    The second element of the result is the estimator's ``cdf`` when
    ``cumulative`` is true and its ``density`` otherwise.
    """
    estimator = smnp.KDEUnivariate(data)
    # The FFT implementation is requested only for the Gaussian kernel.
    estimator.fit(kernel, bw, kernel == "gau",
                  gridsize=gridsize, cut=cut, clip=clip)
    support = estimator.support
    curve = estimator.cdf if cumulative else estimator.density
    return support, curve
def Kde(data, bw=args.bw, kernel="gau", gridsize=100., cut=args.cut,
        clip=(-np.inf, np.inf), cumulative=False):
    """Fit a statsmodels univariate KDE and return the fitted estimator.

    Every element of ``data`` is converted with ``float`` before fitting.
    The defaults for ``bw`` and ``cut`` are read from the module-level
    ``args`` object when the function is defined.  ``cumulative`` is accepted
    but not used here.
    """
    as_floats = np.array(list(map(float, data)))
    estimator = smnp.KDEUnivariate(as_floats)
    use_fft = kernel == "gau"
    estimator.fit(kernel, bw, use_fft, gridsize=gridsize, cut=cut, clip=clip)
    return estimator
def get_ymax(self, data):
    """Return the largest value of the default-fitted KDE density of ``data``.

    Yields 0 when every entry is NaN or when the density maximum is NaN.
    When ``data.unique()`` reports exactly one value, that value is returned
    without fitting.
    """
    if np.isnan(data).all():
        return 0

    # if there's just one value in the data, fit breaks
    distinct = data.unique()
    if len(distinct) == 1:
        return distinct[0]

    estimator = smnp.KDEUnivariate(data)
    estimator.fit()
    peak = np.nanmax(estimator.density)
    return 0 if math.isnan(peak) else peak
def _univariate_kdeplot(data, scale=None, shade=False, kernel="gau",
                        bw="scott", gridsize=100, cut=3, clip=None,
                        legend=True, cumulative=False, shade_lowest=True,
                        ax=None, **kwargs):
    """Draw a univariate KDE of ``data``, estimated in a scaled space.

    Parameters
    ----------
    data : array-like
        Observations to estimate the density of.
    scale : callable with an ``inverse`` method, or None
        Applied to ``data`` before the KDE is fitted; ``scale.inverse`` maps
        the KDE support back for plotting.  ``None`` (the default) means no
        transformation.
    shade : bool
        If true, fill the area under the curve.
    kernel, bw, gridsize, cut, clip
        Forwarded to ``KDEUnivariate.fit``; ``clip=None`` means unbounded.
    legend, cumulative, shade_lowest
        Accepted for signature compatibility; not used by this function.
    ax : matplotlib axes or None
        Target axes; defaults to ``plt.gca()``.
    **kwargs
        ``label`` and ``color`` are extracted; the rest go to ``ax.plot``.
        ``alpha`` (default 0.25) is also used for the shaded area.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        ax = plt.gca()

    if clip is None:
        clip = (-np.inf, np.inf)

    # The default ``scale=None`` used to be called unconditionally and
    # crashed; treat it as the identity transformation instead.
    if scale is None:
        scaled_data = np.asarray(data, dtype=float)
        to_data_space = _identity_scale
    else:
        scaled_data = scale(data)
        to_data_space = scale.inverse

    # mask out the data that's not in the scale domain
    scaled_data = scaled_data[~np.isnan(scaled_data)]

    # Calculate the KDE
    fft = (kernel == "gau")
    kde = smnp.KDEUnivariate(scaled_data)
    kde.fit(kernel, bw, fft, gridsize=gridsize, cut=cut, clip=clip)
    x, y = to_data_space(kde.support), kde.density

    # Make sure the density is nonnegative
    y = np.amax(np.c_[np.zeros_like(y), y], axis=1)

    # Check if a label was specified in the call
    label = kwargs.pop("label", None)
    color = kwargs.pop("color", None)

    # Draw the KDE plot and, optionally, shade
    ax.plot(x, y, color=color, label=label, **kwargs)
    alpha = kwargs.get("alpha", 0.25)
    if shade:
        # 1e-12 rather than 0 as the lower edge, as in the original code.
        ax.fill_between(x, 1e-12, y, facecolor=color, alpha=alpha)
    return ax


def _identity_scale(values):
    """Return ``values`` unchanged (inverse used when no scale is given)."""
    return values
def get_kde_threshold(array):
    """Derive a threshold from the two tallest peaks of the KDE of ``array``.

    The KDE is fitted with a fixed bandwidth of 2000 and a grid size equal
    to the integer part of the largest value in ``array``.

    Returns
    -------
    The point one quarter of the way from the second-tallest peak towards
    the tallest peak, or ``np.min(array)`` when fewer than two peaks are
    found.

    Notes
    -----
    The code works on the assumption that there is a small peak followed by
    a large peak in the distribution of the rotated rectangle area.
    ``find_peaks`` is assumed to return the peak indices as the first element
    of its result (as ``scipy.signal.find_peaks`` does) - confirm the import.
    """
    dens = smnp.KDEUnivariate(array)
    dens.fit(gridsize=np.max(array).astype(int), bw=2000)
    x, y = dens.support, dens.density

    peaks = find_peaks(y)[0]
    # indices of the two highest peaks, tallest first
    highest_peaks = peaks[y[peaks].argsort()[-2:][::-1]]

    # Explicit check instead of a bare ``except:`` so that unrelated errors
    # are no longer silently swallowed.
    if len(highest_peaks) < 2:
        return np.min(array)

    tallest, runner_up = x[highest_peaks[0]], x[highest_peaks[1]]
    return (tallest - runner_up) / 4 + runner_up
def kde_sm(data, kernel='gau', bw='scott', gridsize=None, cut=3,
           clip=(-np.inf, np.inf), cumulative=False):
    """Fit a statsmodels univariate KDE and return it as a ``pd.Series``.

    The series is indexed by the estimator's support and holds its ``cdf``
    when ``cumulative`` is true, otherwise its ``density``.  statsmodels is
    imported inside the function.
    """
    import statsmodels.nonparametric.api as smnp

    estimator = smnp.KDEUnivariate(data)
    use_fft = kernel == 'gau'
    # noinspection PyTypeChecker
    estimator.fit(kernel, bw, use_fft, gridsize=gridsize, cut=cut, clip=clip)

    support = estimator.support
    values = estimator.cdf if cumulative else estimator.density
    return pd.Series(values, index=support)
def plot_kde(array, path: str, animal_name):
    """Plot the KDE of ``array`` with its threshold and save it as a PDF.

    The KDE uses a grid size of 2000 and a bandwidth of 2000.  Matplotlib's
    rcParams are reset to their defaults and the seaborn ``whitegrid`` style
    is applied.  The figure is written to
    ``<path>/<animal_name>_eyelid_density.pdf``.
    """
    estimator = smnp.KDEUnivariate(array)
    estimator.fit(gridsize=2000, bw=2000)
    support = estimator.support
    density = estimator.density

    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    threshold = utils.get_kde_threshold(array)
    sns.set_style('whitegrid')

    figure = plt.figure(figsize=(10, 10))
    plt.plot(support, density)
    plt.axvline(
        threshold,
        linestyle='-.',
        color='red',
        label=f'threshold={threshold // 1}',
    )
    plt.xlabel('Rotated Rectangle Area')
    plt.ylabel('Kernel Density')
    plt.title('Rotated Rectangle Area KDE')
    plt.legend()

    output_file = path + '/' + animal_name + '_eyelid_density.pdf'
    figure.savefig(output_file)
def kde(
    self,
    data,
    gridsize=10,
    fft=True,
    kernel="gau",
    bw="scott",
    cut=3,
    clip=(-np.inf, np.inf),
):
    """Fit a Gaussian KDE to ``data`` and return its grid and density.

    Parameters
    ----------
    data : array with ``std`` method
        Observations to estimate the density of.
    gridsize, fft, cut, clip
        Forwarded to ``KDEUnivariate.fit``.
    kernel : str
        Accepted for signature compatibility; the fit always uses ``"gau"``.
    bw : str or float
        ``"scott"`` is converted to a numeric bandwidth (Scott's factor times
        the sample standard deviation); any other value is forwarded as is.

    Returns
    -------
    x, y
        The evaluation grid and the density on that grid; both arrays always
        have the same length.
    """
    if bw == "scott":
        bw = stats.gaussian_kde(data).scotts_factor() * data.std(ddof=1)

    kde = smnp.KDEUnivariate(data)
    kde.fit("gau", bw, fft, gridsize=gridsize, cut=cut, clip=clip)

    # Return the grid the density was actually evaluated on.  A hand-built
    # ``linspace`` does not match it: the FFT path rounds ``gridsize`` up to
    # a power of two, and the estimator derives its own bounds.
    return kde.support, kde.density
def DataTransformation(Numerical_data, Time_name, Outlier_ratio = 50, bw = 1, kernel = "gau", threshold_of_the_number_of_categories = 5):
    """
    Convert every numerical column (except the time column) into categorical
    interval labels derived from the peaks and valleys of its kernel density
    estimate.

    Inputs:
    Numerical_data: Numerical data stored in a DataFrame.
    Time_name: Name of the time variable in the numerical data; this column
        is excluded from the transformation.
    Outlier_ratio: Scale factor of the outlier threshold. Grid points whose
        density is below max(density)/Outlier_ratio are ignored when looking
        for peaks and valleys.
    bw: Bandwidth.
    kernel: Kernel function.
    threshold_of_the_number_of_categories: Threshold of the number of
        categories. Categories are merged while there are more than this.

    Outputs:
    Categorical_data: Deep copy of Numerical_data in which each transformed
        column holds labels of the form "<feature>: <low>-<high>" (bounds
        rounded to 2 decimals), or NaN for values outside every interval.
    """
    Features = list(Numerical_data.keys())
    Features.remove(Time_name)
    Categorical_data = copy.deepcopy(Numerical_data)
    for feature in Features:
        #####################Kernel density estimation#####################
        # NOTE(review): fft is the string "True", not a bool; it is passed
        # straight to kde.fit - presumably only its truthiness matters.
        fft = "True"
        kde = smnp.KDEUnivariate([float(x) for x in Numerical_data[feature]])
        kde.fit(kernel, bw, fft)
        x, y = kde.support, kde.density
        #####################Initial data classification#####################
        outlier_threshold = max(y)/Outlier_ratio #Threshold of the probability density of outliers
        #Obtain valley values
        # valley_values collects grid indices (not x values): local minima of
        # the density at or above the outlier threshold, plus points where the
        # density crosses the threshold next to an outlier region.
        valley_values = []
        for i in range(len(y)):
            if i == 0:
                if y[i] >= outlier_threshold and y[i] < y[i+1]:
                    valley_values.append(i)
            elif i == len(y)-1:
                if y[i] >= outlier_threshold and y[i] < y[i-1]:
                    valley_values.append(i)
            else:
                if y[i] >= outlier_threshold and y[i] < y[i-1] and y[i] < y[i+1]:
                    valley_values.append(i)
                if y[i] >= outlier_threshold and y[i] < y[i-1] and y[i+1] < outlier_threshold:
                    valley_values.append(i)
                if y[i] >= outlier_threshold and y[i] < y[i+1] and y[i-1] < outlier_threshold:
                    valley_values.append(i)
        #Obtain peak values
        # peak_values collects grid indices of local maxima at or above the
        # outlier threshold.
        peak_values = []
        for i in range(len(y)):
            if i == 0:
                if y[i] >= outlier_threshold and y[i] > y[i+1]:
                    peak_values.append(i)
            elif i == len(y)-1:
                if y[i] >= outlier_threshold and y[i] > y[i-1]:
                    peak_values.append(i)
            else:
                if y[i] >= outlier_threshold and y[i] > y[i-1] and y[i] > y[i+1]:
                    peak_values.append(i)
        #Obtain intervals of categories
        # Each entry is [left valley index, peak index, right valley index].
        # NOTE(review): valley[0] / valley[-1] raise IndexError if no valley
        # exists on the required side of a peak - confirm this cannot happen.
        Intervals_of_categories = []
        for i in peak_values:
            if i == 0:
                valley = [x for x in valley_values if x > i]
                Intervals_of_categories.append([i, i, valley[0]])
            elif i == len(y)-1:
                valley = [x for x in valley_values if x < i]
                Intervals_of_categories.append([valley[-1], i, i])
            else:
                left_valley = [x for x in valley_values if x < i]
                right_valley = [x for x in valley_values if x > i]
                Intervals_of_categories.append([left_valley[-1], i, right_valley[0]])
        #####################Merge categories if it is necessary#####################
        while(len(Intervals_of_categories) > threshold_of_the_number_of_categories):
            number_of_categories_old = len(Intervals_of_categories)
            # Pick the narrowest interval that shares a boundary with a neighbour.
            # NOTE(review): category_to_be_merged is never reset; if no interval
            # qualifies it is unbound (first pass) or stale (later passes).
            minimum_interval_size = np.inf
            for i in range(len(Intervals_of_categories)):
                if x[Intervals_of_categories[i][2]]-x[Intervals_of_categories[i][0]] < minimum_interval_size:
                    if i == 0 and Intervals_of_categories[i][2] == Intervals_of_categories[i+1][0]:
                        minimum_interval_size = x[Intervals_of_categories[i][2]]-x[Intervals_of_categories[i][0]]
                        category_to_be_merged = i
                    if i == len(Intervals_of_categories)-1 and Intervals_of_categories[i-1][2] == Intervals_of_categories[i][0]:
                        minimum_interval_size = x[Intervals_of_categories[i][2]]-x[Intervals_of_categories[i][0]]
                        category_to_be_merged = i
                    if i != 0 and i != len(Intervals_of_categories)-1:
                        if Intervals_of_categories[i-1][2] == Intervals_of_categories[i][0] or Intervals_of_categories[i][2] == Intervals_of_categories[i+1][0]:
                            minimum_interval_size = x[Intervals_of_categories[i][2]]-x[Intervals_of_categories[i][0]]
                            category_to_be_merged = i
            # Merge the chosen interval with a neighbour; the merged interval
            # keeps whichever of the two peaks has the higher density.
            if category_to_be_merged == 0:
                # First interval: can only merge with its right neighbour.
                if y[Intervals_of_categories[category_to_be_merged][1]] > y[Intervals_of_categories[category_to_be_merged+1][1]]:
                    Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged][0], Intervals_of_categories[category_to_be_merged][1], Intervals_of_categories[category_to_be_merged+1][2]]
                else:
                    Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged][0], Intervals_of_categories[category_to_be_merged+1][1], Intervals_of_categories[category_to_be_merged+1][2]]
                del Intervals_of_categories[category_to_be_merged+1]
            elif category_to_be_merged == len(Intervals_of_categories)-1:
                # Last interval: can only merge with its left neighbour.
                if y[Intervals_of_categories[category_to_be_merged][1]] > y[Intervals_of_categories[category_to_be_merged-1][1]]:
                    Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged-1][0], Intervals_of_categories[category_to_be_merged][1], Intervals_of_categories[category_to_be_merged][2]]
                else:
                    Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged-1][0], Intervals_of_categories[category_to_be_merged-1][1], Intervals_of_categories[category_to_be_merged][2]]
                del Intervals_of_categories[category_to_be_merged-1]
            else:
                # Interior interval: the side with the smaller index decides
                # the merge direction (left if smaller, right if larger).
                left_consistency_index = y[Intervals_of_categories[category_to_be_merged][1]] - y[Intervals_of_categories[category_to_be_merged][0]]
                right_consistency_index = y[Intervals_of_categories[category_to_be_merged][2]] - y[Intervals_of_categories[category_to_be_merged][1]]
                if left_consistency_index < right_consistency_index:
                    if y[Intervals_of_categories[category_to_be_merged][1]] > y[Intervals_of_categories[category_to_be_merged-1][1]]:
                        Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged-1][0], Intervals_of_categories[category_to_be_merged][1], Intervals_of_categories[category_to_be_merged][2]]
                    else:
                        Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged-1][0], Intervals_of_categories[category_to_be_merged-1][1], Intervals_of_categories[category_to_be_merged][2]]
                    del Intervals_of_categories[category_to_be_merged-1]
                if left_consistency_index > right_consistency_index:
                    if y[Intervals_of_categories[category_to_be_merged][1]] > y[Intervals_of_categories[category_to_be_merged+1][1]]:
                        Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged][0], Intervals_of_categories[category_to_be_merged][1], Intervals_of_categories[category_to_be_merged+1][2]]
                    else:
                        Intervals_of_categories[category_to_be_merged] = [Intervals_of_categories[category_to_be_merged][0], Intervals_of_categories[category_to_be_merged+1][1], Intervals_of_categories[category_to_be_merged+1][2]]
                    del Intervals_of_categories[category_to_be_merged+1]
            # NOTE(review): there is no break here; if nothing was merged
            # (e.g. equal consistency indices) the loop appears to repeat
            # with unchanged state - confirm whether it should stop.
            if len(Intervals_of_categories) == number_of_categories_old:
                print("Error: This variable cannot be merged")
        #####################Data transformation according to categories#####################
        # Label each value with the first interval whose x-range contains it;
        # values outside every interval become NaN.
        Variable_ = []
        for i in range(len(Numerical_data[feature])):
            flag = 0
            for j in range(len(Intervals_of_categories)):
                if x[Intervals_of_categories[j][0]] <= Numerical_data[feature][i] and x[Intervals_of_categories[j][2]] >= Numerical_data[feature][i]:
                    Variable_.append(feature + ": " + str(round(x[Intervals_of_categories[j][0]],2))+ "-" + str(round(x[Intervals_of_categories[j][2]],2)))
                    flag = 1
                    break
            if flag == 0:
                Variable_.append(np.nan)
        Categorical_data[feature] = Variable_
    return Categorical_data
def scaled_1d_kde_plot(data, shade, bandwidth='scott', vertical=False,
                       legend=False, ax=None, density_scale=None, **kwargs):
    """Draw a univariate KDE on ``ax`` with an optional rescaled height.

    Adapted from seaborn's ``_univariate_kdeplot``.  When ``density_scale``
    is truthy the density is rescaled so its maximum equals that value.

    Returns ``(ax, x, y)`` where ``x`` and ``y`` are the plotted coordinates
    (swapped when ``vertical`` is true).
    """
    if ax is None:
        ax = plt.gca()

    # Fit the estimator on a double-precision copy of the data
    estimator = smnp.KDEUnivariate(data.astype('double'))
    estimator.fit(bw=bandwidth)
    x = estimator.support
    y = estimator.density
    if density_scale:
        y = density_scale * y / np.max(y)

    # Clamp the density at zero from below
    stacked = np.c_[np.zeros_like(y), y]
    y = np.amax(stacked, axis=1)

    # For a vertical plot the density runs along the x axis
    if vertical:
        x, y = y, x

    # Resolve the label: explicit keyword first, then the data's own name
    label = kwargs.pop("label", None)
    if label is None and hasattr(data, "name"):
        label = data.name

    # Legend decision (computed as in the seaborn original)
    legend = label is not None and legend
    if label is None:
        label = "_nolegend_"

    # Plot once and discard the line to learn the colour cycle's next colour
    facecolor = kwargs.pop("facecolor", None)
    probe, = ax.plot(x, y, **kwargs)
    color = probe.get_color()
    probe.remove()
    kwargs.pop("color", None)
    if facecolor is None:
        facecolor = color

    # The real curve
    ax.plot(x, y, color=color, label=label, **kwargs)

    shade_kws = {
        "facecolor": facecolor,
        "alpha": kwargs.get("alpha", 0.25),
        "clip_on": kwargs.get("clip_on", True),
        "zorder": kwargs.get("zorder", 1),
    }
    if shade and vertical:
        ax.fill_betweenx(y, 0, x, **shade_kws)
    elif shade:
        ax.fill_between(x, 0, y, **shade_kws)

    # Set the density axis minimum to 0
    ax.set_ylim(0, auto=None)

    # Legend handles are queried here, as in the original
    handles, labels = ax.get_legend_handles_labels()

    return ax, x, y
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.nonparametric.api as npar
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs

# example from test_kde.py mixture of two normal distributions
np.random.seed(12345)
x = mixture_rvs([.25, .75], size=200, dist=[stats.norm, stats.norm],
                kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))

x.sort()  # not needed

# Fit a Gaussian KDE and compute a confidence band around its density
kde = npar.KDEUnivariate(x)
kde.fit('gau')
ci = kde.kernel.density_confint(kde.density, len(x))

# Histogram of the sample, KDE curve, and the confidence band
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(x, bins=15, density=True, alpha=0.25)
ax.plot(kde.support, kde.density, lw=2, color='red')
# alpha must be a number; matplotlib rejects the string '0.7'
ax.fill_between(kde.support, ci[:, 0], ci[:, 1], color='grey', alpha=0.7)
ax.set_title('Kernel Density Gaussian (bw = %4.2f)' % kde.bw)

# use all kernels directly
x_grid = np.linspace(np.min(x), np.max(x), 51)
def plotDistribution(dist):
    r"""Plots the fitted PDF, KDE and CDF as well as the PDF differences
    between fits, binning and KDE.

    The figure has three stacked panels sharing the x axis: the PDF, the
    CDF, and the relative PDF difference.  It contains additional
    information:

    * Kolmogorov-Smirnov test statistic and P-value (in the title)
    * The KDE difference defined by
      $$
      \Delta PDF(x) = 2*[PDF_{KDE}(x) - PDF_{FIT}(x)]/[PDF_{KDE}(x) + PDF_{FIT}(x)]
      $$
      and the integrated KDE difference given by
      $$
      \sqrt{ \int dx [\Delta PDF(x)]^2 }
      $$

    Parameters
    ----------
    dist : array or list, one dimensional

    Returns
    -------
    fig : 'matplotlib.figure'

    Note
    ----
    Abbreviations:

    * KDE : Kernel Density Estimate
    * PDF : Probability Density Function
    * CDF : Cumulative Distribution Function

    This routine uses seaborn to estimate the bins and KDE, scipy for the
    Kolmogorov-Smirnov test
    (https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test) and
    'statsmodels' for estimating the KDE
    '''python
    >>> import statsmodels.nonparametric.api as smnp
    >>> kde = smnp.KDEUnivariate(data)
    >>> kde.fit(kernel="gau", bw="scott", fft=True, gridsize=100, cut=3)
    '''
    'seaborn' itself uses 'numpy' for binning where the number of bins is
    determined by the Freedman Diaconis Estimator
    (https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html).
    """
    # Create the figure
    fig, axs = plt.subplots(dpi=400,
                            figsize=(3, 3),
                            nrows=3,
                            sharex=True,
                            gridspec_kw={'height_ratios': [1, 3, 1]})

    # Set up plot styles
    baseLineStyle = {"color": "gray", "lw": 0.5, "ls": "--", "zorder": -1}
    fitLineStyle = {"lw": 0.9, "color": "red", "label": "Fit"}
    kdeLineStyle = {
        "lw": 0.0,
        "marker": ".",
        "ms": 3,
        "color": "green",
        "label": "KDE",
    }
    histLineStyle = {
        "rwidth": 0.9,
        "label": "Bins",
    }
    styles = {
        "Base": baseLineStyle,
        "Fit": fitLineStyle,
        "KDE": kdeLineStyle,
        "Bins": histLineStyle,
    }

    # Compute distribution fits
    mean, sdev = np.mean(dist), np.std(dist, ddof=1)

    # Kolmogorov-Smirnov test
    ksRes = stats.kstest(dist, 'norm', args=(mean, sdev))

    # Estimate KDE and compare to normal
    kde = smnp.KDEUnivariate(dist)
    kde.fit(kernel="gau", bw="scott", fft=True, gridsize=100, cut=3)
    ## Get infinitesimal step size
    deltaX = kde.support[1] - kde.support[0]
    ## Compute fitted PDF
    normal = stats.norm.pdf(kde.support, loc=mean, scale=sdev)
    ## Compute difference
    kdeDiff = (2 * (kde.density - normal) / (kde.density + normal))
    normKDEDiff = np.sqrt(np.sum(kdeDiff**2) * deltaX)

    # Set title
    axs[0].set_title(
        "KS Test result: Statistic = {stat:1.3f}, P-Value = {pvalue:1.3f}".
        format(stat=ksRes.statistic, pvalue=ksRes.pvalue) +
        ",\nintegrated KDE difference = {normKDEDiff:1.3f}".format(
            normKDEDiff=normKDEDiff))

    # Compute fits
    yb, xb = np.histogram(dist, bins="fd")

    # Plot PDF
    ax = axs[0]
    sns.distplot(dist,
                 hist_kws=styles["Bins"],
                 kde_kws=styles["KDE"],
                 fit_kws=styles["Fit"],
                 ax=ax,
                 norm_hist=True,
                 fit=stats.norm)
    ## Axis styling
    ax.axvline(mean, label=r"$\mu$", **baseLineStyle)
    ax.set_ylabel("PDF")
    ax.set_yticks([])
    ax.legend([])

    # CDFs
    ax = axs[1]
    styles["KDE"].update({"cumulative": True})
    styles["Bins"].update({"cumulative": True})
    ## Plot CDFs
    ecdf = sns.distplot(dist,
                        hist_kws=styles["Bins"],
                        kde_kws=styles["KDE"],
                        ax=ax,
                        norm_hist=True)
    ## Get the x-range
    lines = ecdf.get_lines()[0]
    xl = lines.get_xdata()
    ## Compute the fitted CDF
    cdf = stats.norm.cdf(xl, loc=mean, scale=sdev)
    ax.plot(xl, cdf, **fitLineStyle)
    ## Styling
    ax.set_ylabel("CDF")
    ax.axvline(mean, label=r"$\mu$", **baseLineStyle)
    ax.axhline(0.5, **baseLineStyle)
    ax.set_yticks(np.linspace(0.25, 1, 4))
    ax.legend(loc="upper left", frameon=True)

    # Difference plot
    ax = axs[2]
    for key in ["KDE", "Bins"]:
        styles[key].pop("cumulative")
        styles[key].pop("label")
    # Plot KDE difference
    ax.plot(kde.support, kdeDiff, **styles["KDE"])
    # Plot bin difference
    rwidth = styles["Bins"].pop("rwidth")
    # Some seaborn versions write a normalisation flag ("normed" or
    # "density") into the histogram keywords and others leave the dict
    # untouched.  Drop whichever is present so neither a KeyError nor an
    # unknown keyword for ``ax.bar`` can occur.
    styles["Bins"].pop("normed", None)
    styles["Bins"].pop("density", None)
    midBin = (xb[1:] + xb[:-1]) / 2
    yb = yb / np.sum(yb * (xb[1:] - xb[:-1]))
    pdf = stats.norm.pdf(midBin, loc=mean, scale=sdev)
    diff = 2 * (yb - pdf) / (yb + pdf)
    ax.bar(xb[:-1] + deltaX / 2,
           diff,
           width=(xb[1:] - xb[:-1]) * rwidth,
           align='edge',
           **styles["Bins"])
    ax.set_ylabel(r"$\Delta$PDF")
    ax.set_ylim(min(-0.1, diff.min()) * 1.5, max(diff.max(), 0.1) * 1.5)
    ## Styling
    ax.axvline(mean, **baseLineStyle)
    baseLineStyle["color"] = "black"
    baseLineStyle["ls"] = "-"
    ax.axhline(0, **baseLineStyle)

    # General styling
    for nax, ax in enumerate(axs):
        # Labels right
        ax.yaxis.set_label_position("right")
        # Ticks styling
        ax.tick_params(axis="both",
                       direction='inout',
                       width=0.5,
                       length=2.5,
                       top=(nax != 0))
        # set line width
        for val in ax.spines.values():
            val.set_linewidth(0.5)

    # Remove line width for PDF plot
    for pos in ["left", "top", "right"]:
        axs[0].spines[pos].set_linewidth(0)

    # np.min / np.max so that plain lists (as documented) work as well
    ax.set_xlim(np.min(dist), np.max(dist))

    # Adjust internal plot spacings
    plt.subplots_adjust(hspace=0.0)

    return fig
Created on Sun Jul 26 11:23:09 2020 @author: Chaobo Zhang """ import pandas as pd import numpy as np import matplotlib.pyplot as plt import statsmodels.nonparametric.api as smnp #Draw a probability density plot of a variable Data_for_density_estimation = pd.read_csv(open('Test data for probability density plot.csv')) #用来估计核密度曲线 fft = "True" Feature = "Variable 1" kde = smnp.KDEUnivariate([float(x) for x in Data_for_density_estimation[Feature]]) outlier_ratio = 50 #Scale factor of the outlier threshold bw = 1 #Bandwidth threshold_of_the_number_of_categories = 5 #Threshold of the number of categories kde.fit("gau", bw, fft) x, y = kde.support, kde.density outlier_threshold = max(y)/outlier_ratio #Threshold of the probability density of outliers plt.figure(figsize=(13, 6)) plt.xticks(fontproperties='Times New Roman',fontsize=24) plt.yticks(fontproperties='Times New Roman',fontsize=24) plt.plot(x, y, 'k', linewidth=1.5) plt.ylim(-0.01, max(y)*1.1) plt.xlim(min(x), max(x)) outlier_interval = [[-2.0, -1.0], [11.7, 53.0]]
def density(data):
    """Return ``(support, density)`` of a Gaussian KDE with bandwidth 0.5.

    ``data`` is converted to a float64 array and fitted through the FFT
    code path.
    """
    samples = np.array(data, dtype=np.float64)
    estimator = smnp.KDEUnivariate(samples)
    estimator.fit("gau", bw=.5, fft=True)
    return estimator.support, estimator.density
def getStatisticsFrame(samples, nXStart=0, nXStep=1, obsTitles=None):
    r"""Build a frame of distribution statistics for a sample ensemble.

    For every selected distribution in ``samples`` this routine computes the
    sample mean and standard deviation, runs a Kolmogorov-Smirnov test
    against a normal distribution with those parameters, and fits a Kernel
    Density Estimate (KDE).  Each distribution contributes one row with:

    * 'observable': the observable title
    * 'nX': the index along the nX dimension
    * 'mean': the mean value of the distribution
    * 'sDev': the standard deviation (``ddof=1``) of the distribution
    * 'kdeDiff': the relative vector norm of the KDE and the fitted PDF
      $$
      \sqrt{ \int dx [ 2*(PDF_{KDE} - PDF_{FIT})/(PDF_{KDE} + PDF_{FIT}) ]^2 }
      $$
    * 'Dn' and 'pValue': the statistic and the significance of the
      hypothesis (normal distribution with the given parameters) from the
      Kolmogorov-Smirnov test
      (https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test).

    Parameters
    ----------
    samples : array, shape = (nObservables, nXrange, nSamples)
        The statistical data.
    nXStart : int
        First index along the nX dimension that is evaluated.
    nXStep : int
        Step along the nX dimension; only every 'nXStep'-th entry is used.
    obsTitles : None or list, length = nObservables
        Observable names; defaults to 'O0', 'O1', ...

    Returns
    -------
    df : 'pandas.DataFrame'

    Note
    ----
    For the Kolmogorov-Smirnov test see 'scipy.stats.kstest' and for the
    KDE see
    '''python
    >>> import statsmodels.nonparametric.api as smnp
    >>> kde = smnp.KDEUnivariate(dist)
    >>> kde.fit(kernel="gau", bw="scott", fft=True, gridsize=100, cut=3)
    '''
    """
    columns = ["observable", "nX", "mean", "sDev", "Dn", "pValue", "kdeDiff"]

    # Shape bookkeeping and default titles
    nObs, nXSize, _ = samples.shape
    if obsTitles is None:
        obsTitles = [r"O{0}".format(no) for no in range(nObs)]
    selectedX = np.arange(nXStart, nXSize, nXStep)

    rows = []
    # Outer loop: observables; inner loop: selected nX entries
    for obsIndex, obsSamples in enumerate(samples):
        for nX, dist in zip(selectedX, obsSamples[nXStart::nXStep]):
            # Normal-fit parameters and KS test
            mean = np.mean(dist)
            sDev = np.std(dist, ddof=1)
            ksRes = stats.kstest(dist, "norm", args=(mean, sDev))

            # KDE of the distribution
            estimator = smnp.KDEUnivariate(dist)
            estimator.fit(kernel="gau", bw="scott", fft=True, gridsize=100,
                          cut=3)

            # Integrated relative difference between the KDE and normal PDF
            support = estimator.support
            step = support[1] - support[0]
            fitted = stats.norm.pdf(support, loc=mean, scale=sDev)
            relative = 2 * (estimator.density - fitted) / (
                estimator.density + fitted)
            integrated = np.sqrt(np.sum(relative**2) * step)

            rows.append({
                "observable": obsTitles[obsIndex],
                "nX": nX,
                "mean": mean,
                "sDev": sDev,
                "Dn": ksRes.statistic,
                "pValue": ksRes.pvalue,
                "kdeDiff": integrated,
            })

    return pd.DataFrame(rows, columns=columns)