from scipy.stats import pareto


def maximum_likelihood_estimator_scipy(data_series):
    """
    wrapper for scipy default ML estimator

    :param data_series: samples from a Pareto distribution that are used to
        estimate its parameters
    :return: tuple (alpha, gamma) that contains the estimated parameters of the
        Pareto distribution
    """
    # pareto.fit returns (shape, loc, scale); shape is alpha and scale is gamma (x_m)
    params = pareto.fit(data_series)
    return params[0], params[2]
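A minimal usage sketch for the wrapper above on synthetic data; the true parameters 2.5 and 1.0 and the seed are arbitrary choices for illustration:

from scipy.stats import pareto

# draw samples from a Pareto with shape (alpha) 2.5 and scale (gamma) 1.0
samples = pareto.rvs(b=2.5, scale=1.0, size=10000, random_state=0)

alpha_hat, gamma_hat = maximum_likelihood_estimator_scipy(samples)
# estimates are typically close to 2.5 and 1.0, though the unconstrained
# loc parameter in pareto.fit can perturb them slightly
print(alpha_hat, gamma_hat)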
data[i] = max(data)
print(data)

fig, ax = plt.subplots()

# Plot the histogram
bins = list(range(0, max(data), 100))  # bins should be every 100ms
bins.append(max(data))  # also include the last one
print(bins)
plt.hist(data, bins=bins, density=True, facecolor='green', alpha=1)

# Try to fit a Pareto in the data
# shape is b (alpha in wikipedia), scale is x (x_b in wikipedia)
shape, loc, scale = pareto.fit(data)
y = pareto.pdf(bins, shape, loc=loc, scale=scale)

# Plot the Pareto on top of the bins
l = plt.plot(bins, y, 'r--', linewidth=2)

plt.xticks((0, 500, 1000, 1500) + tuple(range(2500, max(data), 5000)),
           rotation=90)
ax.grid(alpha=0.3)

# plot
plt.xlabel('Milliseconds')
plt.ylabel('Probability')
plt.title(
    "Histogram of %d circuit timeout values fitted against Pareto with "
    "shape=%.3f, loc=%.3f and scale=%.3f" % (len(data), shape, loc, scale))
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("directory", type=str,
                        help="Directory prefix of pcaps and dumps")
    parser.add_argument("emulated_prefix", type=str,
                        help='Prefix of cluster that will be approximated. '
                             'Expect "1.x.".')
    args = parser.parse_args()

    data_dir = args.directory
    emulated_prefix = args.emulated_prefix

    edge_filename = data_dir + '/edges2/edges.raw'
    host_filename = data_dir + '/hosts2/hosts.raw'
    intermimic_filename = data_dir + '/intermimic.dat'

    t = Testers(emulated_prefix)

    with open(host_filename, 'r') as host_file, \
            open(edge_filename, 'r') as edge_file, \
            open(intermimic_filename, 'w') as intermimic_file:
        ing_interarrivals = get_interarrivals(edge_file, t.ingress_tester)
        b, loc, scale = pareto.fit(ing_interarrivals)
        intermimic_file.write(" ".join([str(b), str(loc), str(scale)]) + '\n')

        egr_interarrivals = get_interarrivals(host_file, t.egress_tester)
        b, loc, scale = pareto.fit(egr_interarrivals)
        intermimic_file.write(" ".join([str(b), str(loc), str(scale)]) + '\n')
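Each of the two lines written to intermimic.dat holds a fitted (b, loc, scale) triple as space-separated floats, ingress first and egress second. A minimal sketch of reading them back into frozen scipy distributions, assuming the same intermimic_filename path as above:

from scipy.stats import pareto

with open(intermimic_filename, 'r') as f:
    ingress_b, ingress_loc, ingress_scale = map(float, f.readline().split())
    egress_b, egress_loc, egress_scale = map(float, f.readline().split())

# rebuild frozen Pareto distributions, e.g. to draw synthetic interarrival times
ingress_dist = pareto(ingress_b, loc=ingress_loc, scale=ingress_scale)
egress_dist = pareto(egress_b, loc=egress_loc, scale=egress_scale)
print(ingress_dist.rvs(size=5), egress_dist.rvs(size=5))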
import numpy as np
from scipy.stats import (beta, f as F, gamma, invgauss, laplace, logistic,
                         lognorm, norm, pareto, poisson, t)


def bootstrap(a,
              f=None,
              b=100,
              method="balanced",
              family=None,
              strata=None,
              smooth=False,
              random_state=None):
    """
    Calculate function values from bootstrap samples or
    optionally return bootstrap samples themselves

    Parameters
    ----------
    a : array-like
        Original sample
    f : callable or None
        Function to be bootstrapped
    b : int
        Number of bootstrap samples
    method : string
        * 'ordinary'
        * 'balanced'
        * 'parametric'
    family : string or None
        * 'gaussian'
        * 't'
        * 'laplace'
        * 'logistic'
        * 'F'
        * 'gamma'
        * 'log-normal'
        * 'inverse-gaussian'
        * 'pareto'
        * 'beta'
        * 'poisson'
    strata : array-like or None
        Stratification labels, ignored when method is parametric
    smooth : boolean
        Whether or not to add noise to bootstrap samples,
        ignored when method is parametric
    random_state : int or None
        Random number seed

    Returns
    -------
    y | X : np.array
        Function applied to each bootstrap sample
        or bootstrap samples if f is None
    """
    np.random.seed(random_state)
    a = np.asarray(a)
    n = len(a)

    # stratification not meaningful for parametric sampling
    if strata is not None and (method != "parametric"):
        strata = np.asarray(strata)
        if len(strata) != len(a):
            raise ValueError("a and strata must have"
                             " the same length")
        # recursively call bootstrap without stratification
        # on the different strata
        masks = [strata == x for x in np.unique(strata)]
        boot_strata = [
            bootstrap(a=a[m],
                      f=None,
                      b=b,
                      method=method,
                      strata=None,
                      random_state=random_state) for m in masks
        ]
        # concatenate resampled strata along first column axis
        X = np.concatenate(boot_strata, axis=1)
    else:
        if method == "ordinary":
            # i.i.d. sampling from ecdf of a
            X = np.reshape(a[np.random.choice(range(a.shape[0]),
                                              a.shape[0] * b)],
                           newshape=(b, ) + a.shape)
        elif method == "balanced":
            # permute b concatenated copies of a
            r = np.reshape([a] * b,
                           newshape=(b * a.shape[0], ) + a.shape[1:])
            X = np.reshape(r[np.random.permutation(range(r.shape[0]))],
                           newshape=(b, ) + a.shape)
        elif method == "parametric":
            if len(a.shape) > 1:
                raise ValueError("a must be one-dimensional")

            # fit parameters by maximum likelihood and sample
            if family == "gaussian":
                theta = norm.fit(a)
                arr = norm.rvs(size=n * b,
                               loc=theta[0],
                               scale=theta[1],
                               random_state=random_state)
            elif family == "t":
                theta = t.fit(a, fscale=1)
                arr = t.rvs(size=n * b,
                            df=theta[0],
                            loc=theta[1],
                            scale=theta[2],
                            random_state=random_state)
            elif family == "laplace":
                theta = laplace.fit(a)
                arr = laplace.rvs(size=n * b,
                                  loc=theta[0],
                                  scale=theta[1],
                                  random_state=random_state)
            elif family == "logistic":
                theta = logistic.fit(a)
                arr = logistic.rvs(size=n * b,
                                   loc=theta[0],
                                   scale=theta[1],
                                   random_state=random_state)
            elif family == "F":
                theta = F.fit(a, floc=0, fscale=1)
                arr = F.rvs(size=n * b,
                            dfn=theta[0],
                            dfd=theta[1],
                            loc=theta[2],
                            scale=theta[3],
                            random_state=random_state)
            elif family == "gamma":
                theta = gamma.fit(a, floc=0)
                arr = gamma.rvs(size=n * b,
                                a=theta[0],
                                loc=theta[1],
                                scale=theta[2],
                                random_state=random_state)
            elif family == "log-normal":
                theta = lognorm.fit(a, floc=0)
                arr = lognorm.rvs(size=n * b,
                                  s=theta[0],
                                  loc=theta[1],
                                  scale=theta[2],
                                  random_state=random_state)
            elif family == "inverse-gaussian":
                theta = invgauss.fit(a, floc=0)
                arr = invgauss.rvs(size=n * b,
                                   mu=theta[0],
                                   loc=theta[1],
                                   scale=theta[2],
                                   random_state=random_state)
            elif family == "pareto":
                theta = pareto.fit(a, floc=0)
                arr = pareto.rvs(size=n * b,
                                 b=theta[0],
                                 loc=theta[1],
                                 scale=theta[2],
                                 random_state=random_state)
            elif family == "beta":
                theta = beta.fit(a)
                arr = beta.rvs(size=n * b,
                               a=theta[0],
                               b=theta[1],
                               loc=theta[2],
                               scale=theta[3],
                               random_state=random_state)
            elif family == "poisson":
                theta = np.mean(a)
                arr = poisson.rvs(size=n * b,
                                  mu=theta,
                                  random_state=random_state)
            else:
                raise ValueError("Invalid family")
            X = np.reshape(arr, newshape=(b, n))
        else:
            raise ValueError("method must be either 'ordinary'"
                             " , 'balanced', or 'parametric',"
                             " '{method}' was supplied".format(method=method))

    # samples are already smooth in the parametric case
    if smooth and (method != "parametric"):
        X += np.random.normal(size=X.shape, scale=1 / np.sqrt(n))

    if f is None:
        return X
    else:
        return np.asarray([f(x) for x in X])
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import kstest, lognorm, pareto


def best_fit_scaled(x,
                    plot=False,
                    dist_to_test=None,
                    number_of_bins=100,
                    log_scaled=False,
                    the_label="something",
                    color="red",
                    symbol="o"):
    if dist_to_test is None:
        dist_to_test = [
            "norm", "pareto", "lognorm", "gamma", "expon", "weibull_min",
            "weibull_max"
        ]
    x_min = min(x)
    x_max = max(x)
    dX = (x_max - x_min) * 0.01
    support = np.arange(x_min, x_max, dX)
    best_fit = []
    error = []

    if "pareto" in dist_to_test:
        try:
            dist_param = pareto.fit(x)
            my_dist = pareto(*dist_param)
            kt, p_value = kstest(x, "pareto", dist_param)
            best_fit.append({
                "distribution": "pareto",
                "ktest": kt,
                "pvalue": p_value,
                "parameters": dist_param
            })
            if plot:
                Y = my_dist.pdf(support)
                plt.plot(support, Y, linewidth=2.0)
                if log_scaled:
                    hist, bins = np.histogram(x, bins=number_of_bins,
                                              density=True)
                    plt.plot(bins[:-1], hist, symbol, color=color,
                             label=the_label)
                    # stuff = plt.hist(X,bins=numberOfBins,normed=True)
        except Exception as e:
            error.append(("pareto_err", e))

    if "lognorm" in dist_to_test:
        try:
            dist_param = lognorm.fit(x)
            my_dist = lognorm(*dist_param)
            kt, p_value = kstest(x, "lognorm", dist_param)
            best_fit.append({
                "distribution": "lognorm",
                "ktest": kt,
                "pvalue": p_value,
                "parameters": dist_param
            })
            if plot:
                Y = my_dist.pdf(support)
                plt.plot(support, Y, color=color, linewidth=2.0)
                if log_scaled:
                    hist, bins = np.histogram(x, bins=number_of_bins,
                                              density=True)
                    plt.plot(bins[:-1], hist, symbol, color=color,
                             label=the_label)
        except Exception as e:
            error.append(("lognorm_err", e))

    # FINISH PLOT
    if plot:
        if log_scaled:
            plt.yscale("log")
            plt.xscale("log")
        plt.legend(loc="best")
        plt.show()

    return best_fit, error
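As a usage sketch, the function above can be called on a synthetic sample and the Kolmogorov-Smirnov results inspected; the sample parameters and seed are illustrative, and only the pareto and lognorm branches shown above are requested:

from scipy.stats import pareto

sample = pareto.rvs(b=1.8, scale=2.0, size=2000, random_state=42)

fits, errors = best_fit_scaled(sample, plot=False,
                               dist_to_test=["pareto", "lognorm"])
for fit in fits:
    print(fit["distribution"], fit["ktest"], fit["pvalue"])
if errors:
    print("fit errors:", errors)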
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import expon, gamma, ks_2samp, kstest, lognorm, norm, pareto


def loglog_distribution_plots(data,
                              service_simulated=None,
                              plot_dist=None,
                              x_lim=None,
                              y_lim=None,
                              ax=None,
                              title=None):
    if plot_dist is None:
        plot_dist = [
            "norm", "pareto", "lognorm", "gamma", "expon", "weibull_min",
            "weibull_max"
        ]
    if x_lim is None:
        x_min = min(data)
        x_max = max(data)
    else:
        x_min, x_max = x_lim
    d_x = (x_max - x_min) * 0.001
    support = np.arange(x_min, x_max, d_x)

    if "pareto" in plot_dist:
        dist_param = pareto.fit(data)
        my_dist = pareto(*dist_param)
        y_pareto = my_dist.pdf(support)
        print("Pareto: " + str(ks_2samp(data, y_pareto)[0]))
    if "norm" in plot_dist:
        dist_param = norm.fit(data)
        my_dist = norm(*dist_param)
        y_norm = my_dist.pdf(support)
        print("Norm: " + str(ks_2samp(data, y_norm)[0]))
    if "expon" in plot_dist:
        dist_param = expon.fit(data)
        my_dist = expon(*dist_param)
        y_exp = my_dist.pdf(support)
        # KS test against the fitted exponential
        kt, p_value = kstest(data, "expon", dist_param)
        print("Exp: " + str(ks_2samp(data, y_exp)[0]), kt, p_value)
    if "lognorm" in plot_dist:
        dist_param = lognorm.fit(data)
        my_dist = lognorm(*dist_param)
        y_lognorm = my_dist.pdf(support)
        print("LogNorm: " + str(ks_2samp(data, y_lognorm)[0]))
    if "gamma" in plot_dist:
        dist_param = gamma.fit(data)
        my_dist = gamma(*dist_param)
        y_gamma = my_dist.pdf(support)
        print("Gamma: " + str(ks_2samp(data, y_gamma)[0]))

    if "lognorm" in plot_dist:
        if ax is None:
            plt.plot(support, y_lognorm, label="ln", linewidth=6.0)
        else:
            ax.plot(support, y_lognorm, linewidth=6.0)
    if "gamma" in plot_dist:
        if ax is None:
            plt.plot(support, y_gamma, label="gamma", linewidth=6.0)
        else:
            ax.plot(support, y_gamma, linewidth=6.0)
    if "norm" in plot_dist:
        if ax is None:
            plt.plot(support, y_norm, linewidth=6.0)
        else:
            ax.plot(support, y_norm, linewidth=6.0)
    if "pareto" in plot_dist:
        if ax is None:
            plt.plot(support, y_pareto, label="pareto", linewidth=6.0)
        else:
            ax.plot(support, y_pareto, linewidth=6.0)
    if "expon" in plot_dist:
        if ax is None:
            plt.plot(support, y_exp, linewidth=6.0)
        else:
            ax.plot(support, y_exp, linewidth=6.0)

    if x_lim is not None:
        if ax is None:
            plt.xlim(x_lim)
        else:
            ax.set_xlim(x_lim)
    if y_lim is not None:
        if ax is None:
            plt.ylim(y_lim)
        else:
            ax.set_ylim(y_lim)

    if ax is None:
        plt.yscale("log")
        plt.xscale("log")
    else:
        ax.set_yscale("log")
        ax.set_xscale("log")

    if ax is None:
        plt.ylabel("Frequency")
        plt.xlabel("Service Time (s)")
    else:
        ax.set_ylabel("Frequency")
        ax.set_xlabel("Service Time (s)")

    count, bins = np.histogram(data, bins=100, density=True)
    if ax is None:
        plt.plot(bins[1:], count, "o", label="empirical", markersize=10)
    else:
        ax.plot(bins[1:], count, "o", label="empirical", markersize=10)

    if service_simulated is not None:
        count, _ = np.histogram(service_simulated, bins=bins, density=True)
        if ax is None:
            plt.plot(bins[1:], count, "D", label="model", markersize=10)
        else:
            ax.plot(bins[1:], count, "D", label="model", markersize=10)

    if ax is None:
        plt.grid(True)
        plt.legend(bbox_to_anchor=(0, 1.02, 1, 0.2),
                   loc="lower left",
                   mode="expand",
                   borderaxespad=0,
                   ncol=2)
    else:
        ax.grid(True)
        ax.legend(bbox_to_anchor=(0, 1.02, 1, 0.2),
                  loc="lower left",
                  mode="expand",
                  borderaxespad=0,
                  ncol=2)

    if title is not None:
        if ax is None:
            plt.title(title)
        else:
            ax.set_title(title)

    return ax
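One way the plotting helper above could be driven, with synthetic service times standing in for real measurements; the figure size, Pareto parameters, and seed are arbitrary choices for illustration:

import matplotlib.pyplot as plt
from scipy.stats import pareto

service_times = pareto.rvs(b=2.2, scale=0.05, size=5000, random_state=7)

fig, ax = plt.subplots(figsize=(8, 6))
loglog_distribution_plots(service_times,
                          plot_dist=["pareto", "lognorm"],
                          ax=ax,
                          title="Service time distribution (log-log)")
plt.tight_layout()
plt.show()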