예제 #1
0
def maximum_likelihood_estimator_scipy(data_series):
    """
    Estimate Pareto parameters with scipy's generic maximum-likelihood fit.

    :param data_series: samples assumed to come from a Pareto distribution
    :return: tuple (alpha, gamma) made of the first and third values returned
        by ``pareto.fit`` (shape and scale); the second value (location) is
        discarded
    """
    alpha, _loc, gamma = pareto.fit(data_series)
    return alpha, gamma
예제 #2
0
        data[i] = max(data)

print(data)

fig, ax = plt.subplots()

# Largest observed value; computed once because it is reused for the bin
# edges and for the tick positions below. It must be an integer, since it is
# passed to range().
max_value = max(data)

# Plot the histogram
bins = list(range(0, max_value, 100))  # bins should be every 100ms
bins.append(max_value)  # also include the last one

print(bins)
plt.hist(data, bins=bins, density=True, facecolor='green', alpha=1)

# Try to fit a Pareto to the data
# shape is b (alpha in wikipedia), scale is x (x_b in wikipedia)
shape, loc, scale = pareto.fit(data)
# Evaluate the fitted density at the bin edges so it can be overlaid
y = pareto.pdf(bins, shape, loc=loc, scale=scale)

# Plot the Pareto on top of the bins
l = plt.plot(bins, y, 'r--', linewidth=2)

# Dense ticks at the low end, then one tick every 5000 up to the maximum
plt.xticks((0, 500, 1000, 1500) + tuple(range(2500, max_value, 5000)),
           rotation=90)
ax.grid(alpha=0.3)

# Axis labels and a title reporting the fitted parameters
plt.xlabel('Milliseconds')
plt.ylabel('Probability')
plt.title(
    "Histogram of %d circuit timeout values fitted against Pareto with shape=%.3f, loc=%.3f and scale=%.3f"
    % (len(data), shape, loc, scale))
예제 #3
0
if __name__ == '__main__':
    # Command line: the data directory and the prefix of the emulated cluster.
    parser = argparse.ArgumentParser()
    parser.add_argument("directory", type=str,
                        help="Directory prefix of pcaps and dumps")
    parser.add_argument(
        "emulated_prefix",
        type=str,
        help='Prefix of cluster that will be approximated. Expect "1.x.".')
    args = parser.parse_args()

    data_dir = args.directory
    emulated_prefix = args.emulated_prefix

    # Two input files and one output file, all located under the data directory.
    edge_filename = '{}/edges2/edges.raw'.format(data_dir)
    host_filename = '{}/hosts2/hosts.raw'.format(data_dir)
    intermimic_filename = '{}/intermimic.dat'.format(data_dir)

    t = Testers(emulated_prefix)

    with open(host_filename, 'r') as host_file, \
         open(edge_filename, 'r') as edge_file, \
         open(intermimic_filename, 'w') as intermimic_file:
        # First output line: Pareto fit (b, loc, scale) of the interarrivals
        # selected from the edge file by the ingress tester.
        ing_interarrivals = get_interarrivals(edge_file, t.ingress_tester)
        ing_params = pareto.fit(ing_interarrivals)
        intermimic_file.write(" ".join(map(str, ing_params)) + '\n')

        # Second output line: same fit for the interarrivals selected from
        # the host file by the egress tester.
        egr_interarrivals = get_interarrivals(host_file, t.egress_tester)
        egr_params = pareto.fit(egr_interarrivals)
        intermimic_file.write(" ".join(map(str, egr_params)) + '\n')
예제 #4
0
def _parametric_bootstrap_draws(sample, family, size, random_state):
    """Fit ``family`` to ``sample`` by maximum likelihood and draw ``size`` variates."""
    if family == "poisson":
        # the Poisson rate is estimated by the sample mean
        return poisson.rvs(size=size,
                           mu=np.mean(sample),
                           random_state=random_state)

    # family name -> (scipy distribution, constraints applied while fitting)
    families = {
        "gaussian": (norm, {}),
        "t": (t, {"fscale": 1}),
        "laplace": (laplace, {}),
        "logistic": (logistic, {}),
        "F": (F, {"floc": 0, "fscale": 1}),
        "gamma": (gamma, {"floc": 0}),
        "log-normal": (lognorm, {"floc": 0}),
        "inverse-gaussian": (invgauss, {"floc": 0}),
        "pareto": (pareto, {"floc": 0}),
        "beta": (beta, {}),
    }
    entry = families.get(family) if isinstance(family, str) else None
    if entry is None:
        raise ValueError("Invalid family")

    dist, constraints = entry
    # fit() returns (shape parameters..., loc, scale), which is exactly the
    # positional order rvs() expects
    theta = dist.fit(sample, **constraints)
    return dist.rvs(*theta, size=size, random_state=random_state)


def bootstrap(a,
              f=None,
              b=100,
              method="balanced",
              family=None,
              strata=None,
              smooth=False,
              random_state=None):
    """
    Calculate function values from bootstrap samples or
    optionally return bootstrap samples themselves

    Parameters
    ----------
    a : array-like
        Original sample
    f : callable or None
        Function to be bootstrapped
    b : int
        Number of bootstrap samples
    method : string
        * 'ordinary'
        * 'balanced'
        * 'parametric'
    family : string or None
        Only used when method is parametric
        * 'gaussian'
        * 't'
        * 'laplace'
        * 'logistic'
        * 'F'
        * 'gamma'
        * 'log-normal'
        * 'inverse-gaussian'
        * 'pareto'
        * 'beta'
        * 'poisson'
    strata : array-like or None
        Stratification labels, ignored when method
        is parametric
    smooth : boolean
        Whether or not to add noise to bootstrap
        samples, ignored when method is parametric
    random_state : int or None
        Random number seed

    Returns
    -------
    y | X : np.array
        Function applied to each bootstrap sample
        or bootstrap samples if f is None

    Raises
    ------
    ValueError
        If a and strata differ in length, if a is not one-dimensional
        for parametric sampling, or if method or family is not one of
        the supported values

    Notes
    -----
    The global NumPy random generator is reseeded with random_state on
    every call, including each recursive per-stratum call.
    """
    np.random.seed(random_state)
    a = np.asarray(a)
    n = len(a)

    # stratification not meaningful for parametric sampling
    if strata is not None and method != "parametric":
        strata = np.asarray(strata)
        if len(strata) != n:
            raise ValueError("a and strata must have the same length")
        # recursively call bootstrap without stratification
        # on the different strata
        boot_strata = [
            bootstrap(a=a[strata == label],
                      f=None,
                      b=b,
                      method=method,
                      strata=None,
                      random_state=random_state)
            for label in np.unique(strata)
        ]
        # concatenate resampled strata along first column axis
        X = np.concatenate(boot_strata, axis=1)
    elif method == "ordinary":
        # i.i.d. sampling from ecdf of a
        picks = np.random.choice(n, n * b)
        X = np.reshape(a[picks], (b, ) + a.shape)
    elif method == "balanced":
        # permute b concatenated copies of a
        r = np.reshape([a] * b, (b * n, ) + a.shape[1:])
        X = np.reshape(r[np.random.permutation(b * n)], (b, ) + a.shape)
    elif method == "parametric":
        if a.ndim > 1:
            raise ValueError("a must be one-dimensional")
        # fit parameters by maximum likelihood and sample
        draws = _parametric_bootstrap_draws(a, family, n * b, random_state)
        X = np.reshape(draws, (b, n))
    else:
        raise ValueError("method must be either 'ordinary'"
                         " , 'balanced', or 'parametric',"
                         " '{method}' was supplied".format(method=method))

    # samples are already smooth in the parametric case
    if smooth and method != "parametric":
        # add out of place: an in-place += cannot store float noise in an
        # integer-valued sample array
        X = X + np.random.normal(size=X.shape, scale=1 / np.sqrt(n))

    if f is None:
        return X
    return np.asarray([f(x) for x in X])
def best_fit_scaled(x,
                    plot=False,
                    dist_to_test=None,
                    number_of_bins=100,
                    log_scaled=False,
                    the_label="something",
                    color="red",
                    symbol="o"):
    """
    Fit candidate distributions to ``x`` and score each fit with a
    Kolmogorov-Smirnov test.

    Only "pareto" and "lognorm" are actually fitted; any other name in
    ``dist_to_test`` (including the remaining default entries) is ignored.

    :param x: sample values
    :param plot: when true, draw each fitted density with pyplot, add a
        legend and show the figure
    :param dist_to_test: names of the distributions to try; defaults to a
        list that contains both supported names
    :param number_of_bins: bin count of the empirical histogram drawn when
        ``plot`` and ``log_scaled`` are both true
    :param log_scaled: when plotting, also draw the empirical density
        histogram as markers and switch both axes to log scale
    :param the_label: legend label of the empirical histogram markers
    :param color: color of the histogram markers and of the lognorm curve
    :param symbol: marker format string of the histogram points
    :return: tuple ``(best_fit, error)``; ``best_fit`` is a list of dicts with
        keys "distribution", "ktest", "pvalue" and "parameters", ``error`` is
        a list of ``("<name>_err", exception)`` tuples for distributions whose
        fitting, testing or plotting raised
    """
    if dist_to_test is None:
        dist_to_test = [
            "norm", "pareto", "lognorm", "gamma", "expon", "weibull_min",
            "weibull_max"
        ]
    x_min = min(x)
    x_max = max(x)

    # 100 evenly spaced points over [x_min, x_max); linspace also copes with
    # constant input, where an arange step of zero would raise
    support = np.linspace(x_min, x_max, 100, endpoint=False)
    best_fit = []
    error = []

    for name in ("pareto", "lognorm"):
        if name not in dist_to_test:
            continue
        try:
            dist = pareto if name == "pareto" else lognorm
            dist_param = dist.fit(x)
            my_dist = dist(*dist_param)
            kt, p_value = kstest(x, name, dist_param)
            best_fit.append({
                "distribution": name,
                "ktest": kt,
                "pvalue": p_value,
                "parameters": dist_param
            })
            if plot:
                # only the lognorm curve is drawn in the caller-supplied
                # color; the pareto curve uses the default color cycle
                curve_style = {"color": color} if name == "lognorm" else {}
                plt.plot(support,
                         my_dist.pdf(support),
                         linewidth=2.0,
                         **curve_style)
                if log_scaled:
                    # `density=True` replaces the `normed` keyword that
                    # NumPy no longer accepts
                    hist, bins = np.histogram(x,
                                              bins=number_of_bins,
                                              density=True)
                    plt.plot(bins[:-1],
                             hist,
                             symbol,
                             color=color,
                             label=the_label)
        except Exception as e:
            # best effort: record the failure and carry on with the next
            # candidate distribution
            error.append((name + "_err", e))

    # FINISH PLOT
    if plot:
        if log_scaled:
            plt.yscale("log")
            plt.xscale("log")

        plt.legend(loc="best")
        plt.show()

    return best_fit, error
def loglog_distribution_plots(data,
                              service_simulated=None,
                              plot_dist=None,
                              x_lim=None,
                              y_lim=None,
                              ax=None,
                              title=None):
    """
    Fit distributions to ``data`` and draw their densities together with the
    empirical density histogram on log-log axes.

    Only "pareto", "norm", "expon", "lognorm" and "gamma" are handled; any
    other name in ``plot_dist`` (including the remaining default entries) is
    ignored. One line per fitted distribution is printed with the statistic
    returned by ``ks_2samp``.

    :param data: sample values to fit and to histogram (100 bins)
    :param service_simulated: optional second sample, histogrammed on the same
        bin edges as ``data`` and drawn as "model" markers
    :param plot_dist: names of the distributions to fit and draw
    :param x_lim: ``(min, max)`` pair used both as the range of the density
        support grid and as the x-axis limits; when omitted the grid spans
        the range of ``data`` and the axis limits are left alone
    :param y_lim: y-axis limits, applied only when given
    :param ax: axes to draw on; when omitted the current pyplot axes are used
        and the lognorm, gamma and pareto curves get legend labels
    :param title: optional title
    :return: the ``ax`` argument exactly as passed (``None`` if omitted)
    """
    if plot_dist is None:
        plot_dist = [
            "norm", "pareto", "lognorm", "gamma", "expon", "weibull_min",
            "weibull_max"
        ]
    if x_lim is None:
        x_min, x_max = min(data), max(data)
    else:
        x_min, x_max = x_lim
    d_x = (x_max - x_min) * 0.001
    support = np.arange(x_min, x_max, d_x)

    # (name, printed tag, distribution) in the order the fits are reported
    fit_order = (
        ("pareto", "Pareto", pareto),
        ("norm", "Norm", norm),
        ("expon", "Exp", expon),
        ("lognorm", "LogNorm", lognorm),
        ("gamma", "Gamma", gamma),
    )
    densities = {}
    for name, tag, dist in fit_order:
        if name not in plot_dist:
            continue
        dist_param = dist.fit(data)
        densities[name] = dist(*dist_param).pdf(support)
        # NOTE(review): this compares the sample with density values rather
        # than with the fitted distribution itself -- confirm it is intended
        ks_stat = ks_2samp(data, densities[name])[0]
        if name == "expon":
            # test against the exponential that was fitted (previously this
            # was run against "lognorm" with the exponential parameters)
            kt, p_value = kstest(data, "expon", dist_param)
            print("Exp: " + str(ks_stat), kt, p_value)
        else:
            print(tag + ": " + str(ks_stat))

    # Every drawing call goes to one target: the caller's axes or, when none
    # was given, the current pyplot axes (what the plt.* functions act on).
    target = plt.gca() if ax is None else ax

    # (name, legend label) in drawing order; labels are attached only when
    # drawing on the pyplot axes
    draw_order = (
        ("lognorm", "ln"),
        ("gamma", "gamma"),
        ("norm", None),
        ("pareto", "pareto"),
        ("expon", None),
    )
    for name, label in draw_order:
        if name not in densities:
            continue
        curve_style = {"linewidth": 6.0}
        if ax is None and label is not None:
            curve_style["label"] = label
        target.plot(support, densities[name], **curve_style)

    if x_lim is not None:
        target.set_xlim(x_lim)
    if y_lim is not None:
        target.set_ylim(y_lim)

    target.set_yscale("log")
    target.set_xscale("log")

    target.set_ylabel("Frequency")
    target.set_xlabel("Service Time (s)")

    # `density=True` replaces the `normed` keyword that NumPy no longer accepts
    count, bins = np.histogram(data, bins=100, density=True)
    target.plot(bins[1:], count, "o", label="empirical", markersize=10)
    if service_simulated is not None:
        # reuse the empirical bin edges so both histograms line up
        count, _ = np.histogram(service_simulated, bins=bins, density=True)
        target.plot(bins[1:], count, "D", label="model", markersize=10)

    target.grid(True)
    target.legend(bbox_to_anchor=(0, 1.02, 1, 0.2),
                  loc="lower left",
                  mode="expand",
                  borderaxespad=0,
                  ncol=2)
    if title is not None:
        target.set_title(title)
    return ax