Exemplo n.º 1
def make_pdf(distribution: st.rv_continuous,
             params: Tuple[float, ...],
             size: int = 25_000) -> pd.Series:
    """
    Generate a pandas Series for the distribution's Probability Density Function. This Series
    will have axis values as index, and PDF values as values.

    Args:
        distribution (st.rv_continuous): a scipy.stats generator object
        params (Tuple[float, ...]): the parameters for this generator given back by the fit,
            laid out as ``(*shape_args, loc, scale)``.
        size (int): the number of points to evaluate.

    Returns:
        A pandas Series object with the PDF as values, corresponding axis values as index.

    Raises:
        ValueError: if ``params`` holds fewer than two values (loc and scale are mandatory).
    """
    # Separate parts of parameters: the last two entries are loc and scale,
    # everything before them is passed on as shape arguments.
    *args, loc, scale = params

    logger.debug("Getting sane start and end points of distribution")
    # Unpacking an empty ``args`` is a no-op, so one call form covers both the
    # "with shape arguments" and the "loc/scale only" case.
    start = distribution.ppf(0.01, *args, loc=loc, scale=scale)
    end = distribution.ppf(0.99, *args, loc=loc, scale=scale)

    logger.debug("Building PDF")
    x = np.linspace(start, end, size)
    y = distribution.pdf(x, *args, loc=loc, scale=scale)
    return pd.Series(y, x)
Exemplo n.º 2
def discrete_distrb(distrb: rv_continuous) -> np.ndarray:
    """
    Returns a discretisation of specified distribution at values
    x = 0, 1, 2, 3, ..., ceiling((1 - 10^-6) quantile).

    The probability assigned to x = 0 is the mass below 0.5; every other x gets the
    mass of the unit-wide bin centred on it. The result is renormalised to sum to 1.

    :param distrb: distribution exposing ``ppf`` and ``cdf`` that can be called without
                   further parameters (the code calls them with a single argument).
    :return: 1-d array of probabilities, entry ``k`` belonging to the value ``x = k``.
    """
    upper_lim = np.ceil(distrb.ppf(1 - 1e-6))
    # Upper bin edges 0.5, 1.5, ..., upper_lim + 0.5 (one edge per value of x).
    bin_lims = np.linspace(0.5, upper_lim + 0.5, int(upper_lim + 1))
    cdf = distrb.cdf(bin_lims)
    # prepend=0 makes the first difference equal to cdf(0.5) itself.
    pmf = np.diff(cdf, prepend=0)

    return pmf / pmf.sum()
Exemplo n.º 3
def generate_type_differentiated_rates(x: int, y: int,
                                       mRNA_distribution: stats.rv_continuous=stats.truncnorm,
                                       mRNA_parameters: np.ndarray=np.array([5, 15]),
                                       miRNA_distribution: stats.rv_continuous=stats.truncnorm,
                                       miRNA_parameters: np.ndarray=np.array([5, 15])) -> np.ndarray:
    """
    Generates a value for each species. Values for mRNA species are drawn from mRNA distribution, likewise for
    miRNAs species.
    Used to get arrival, decay, and burst rates.

    :param x: number of mRNA species (they occupy the first ``x`` entries of the result).
    :param y: number of miRNA species (they occupy the last ``y`` entries of the result).
    :param mRNA_distribution: distribution the mRNA values are drawn from.
    :param mRNA_parameters: positional arguments unpacked into ``mRNA_distribution.rvs``.
    :param miRNA_distribution: distribution the miRNA values are drawn from.
    :param miRNA_parameters: positional arguments unpacked into ``miRNA_distribution.rvs``.
    :return: 1-d array of length ``x + y``.
    """
    rates = np.zeros(x + y)
    rates[:x] = mRNA_distribution.rvs(*mRNA_parameters, size=x)
    rates[x:] = miRNA_distribution.rvs(*miRNA_parameters, size=y)
    return rates
Exemplo n.º 4
    def __init__(self, ds: Tuple, ηs: Tuple, event_size: stats.rv_continuous):
        """Store the inputs and set up empty result tables.

        :param ds: stored unchanged as ``self.ds``.
        :param ηs: stored unchanged as ``self.ηs``.
        :param event_size: distribution object; it is stored, and its class name,
                           ``mean()`` and ``std()`` are recorded as separate attributes.
        """
        self.ds, self.ηs = ds, ηs

        # Keep the distribution itself plus a few summary values derived from it.
        self.event_size = event_size
        self.event_size_distribution_name = str(event_size.__class__)
        self.event_size_mean = float(event_size.mean())
        self.event_size_std = float(event_size.std())

        # Empty tables; ``self.columns`` is defined outside this method.
        self.df_waveforms = pd.DataFrame(columns=self.columns)
        parameter_columns = ["MC_type", "MC_name", "count", "params"]
        self.df_parameters = pd.DataFrame(columns=parameter_columns)

        self.t_max = 420e-9  # sec. See https://arxiv.org/abs/0810.4930v2
        self.tqdm = True
Exemplo n.º 5
def normalize(a: np.ndarray,
              dist: rv_continuous = norm,
              **kwargs) -> np.ndarray:
    """Map the values of ``a`` onto evenly spaced quantiles of ``dist``, keeping their rank order.

    Assumes a is 1d.

    :param a: input values; the array itself is not modified.
    :param dist: distribution whose ``ppf`` supplies the target values.
    :param kwargs: forwarded to ``dist.ppf`` (e.g. ``loc``, ``scale`` or shape parameters).
    :return: float array shaped like ``a`` in which the k-th smallest input is replaced by
             the ``k / (len(a) + 1)`` quantile of ``dist``.
    """
    indices = np.argsort(a)
    # ``astype`` already returns a fresh array; the builtin ``float`` replaces the
    # ``np.float`` alias, which no longer exists in current NumPy releases.
    out = a.astype(float)
    # len(a) + 2 grid points so that the probabilities 0 and 1 (infinite quantiles for
    # unbounded distributions) can be dropped again below.
    disc_dist = dist.ppf(np.linspace(0, 1, len(a) + 2, endpoint=True),
                         **kwargs)
    out[indices] = disc_dist[1:-1]
    return out
Exemplo n.º 6
def generate_iid_species_rates(x: int, y: int,
                               distribution: stats.rv_continuous=stats.truncnorm,
                               parameters: np.ndarray=np.array([5, 15])):
    """
    Generate a value drawn from the given distribution for each species.
    Used to generate arrival, decay, and bursting rates.

    :param x: first species count.
    :param y: second species count; ``x + y`` values are drawn in total.
    :param distribution: distribution the values are drawn from.
    :param parameters: positional arguments unpacked into ``distribution.rvs``.
    :return: array of ``x + y`` draws.
    """
    return distribution.rvs(*parameters, size=(x + y))
Exemplo n.º 7
def find_distribution_area(distribution: stats.rv_continuous, lower, upper,
                           *args, **kwargs):
    """
    Find the area between the lower and upper bounds of a scipy.stats continuous random variable distribution.

    :param distribution: distribution whose ``cdf`` is evaluated.
    :param lower: lower bound, or ``None`` for "everything below ``upper``".
    :param upper: upper bound, or ``None`` for "everything above ``lower``".
    :param args: extra positional arguments forwarded to ``distribution.cdf``.
    :param kwargs: extra keyword arguments forwarded to ``distribution.cdf``.
    :return: the probability mass of the requested interval.
    :raises UserInputError: if both ``lower`` and ``upper`` are ``None``.
    """
    # Verify that either lower or upper is specified
    if lower is None and upper is None:
        print("Lower and upper bounds can't both be None")
        raise UserInputError
    # If lower is not specified we want to find the area below upper
    if lower is None:
        return distribution.cdf(upper, *args, **kwargs)
    # If upper is not specified we want to find the area above lower
    if upper is None:
        return 1 - distribution.cdf(lower, *args, **kwargs)
    # Otherwise we find the area between the lower and upper bounds
    area_below_upper = distribution.cdf(upper, *args, **kwargs)
    area_below_lower = distribution.cdf(lower, *args, **kwargs)
    return area_below_upper - area_below_lower
Exemplo n.º 8
def read_flights(filename: str, basetime: datetime.datetime,
                 rtc_dist: sps.rv_continuous, weight_dist: sps.rv_continuous):
    """Read an Excel sheet of flights and derive per-flight values from each row.

    :param filename: path handed to ``pandas.read_excel``.
    :param basetime: reference time; the row's ``DT`` value (in minutes) is added to it
                     to obtain the departure time.
    :param rtc_dist: distribution from which one value per row is drawn and interpreted
                     as a number of seconds.
    :param weight_dist: distribution from which one weight per row is drawn.
    :return: the ``flights`` set.

    NOTE(review): in the code visible here nothing is ever added to ``flights`` and the
    ``duration`` expression is cut off mid-call, so part of the loop body appears to be
    missing from this listing - restore it from the original source before use.
    """
    # NOTE(review): ``index_row`` does not look like a documented ``read_excel`` parameter
    # (perhaps ``index_col`` was meant) - confirm against the pandas documentation.
    test_data = pandas.read_excel(io=filename, index_row=None)
    flights = set()
    for row in test_data.itertuples():
        # Columns read from each row: fid, Airline, DT and FCA.
        fid = row.fid
        airline = row.Airline
        departtime = basetime + datetime.timedelta(minutes=row.DT)
        # NOTE(review): the next statement is truncated (unclosed parenthesis).
        duration = datetime.timedelta(minutes=row.FCA) - datetime.timedelta(
        rtc = datetime.timedelta(seconds=float(rtc_dist.rvs(size=1)))
        weight = float(weight_dist.rvs(size=1))
    return flights
Exemplo n.º 9
def plot_cts_distribution(distribution: stats.rv_continuous,
                          epsilon: float = 5e-5,
                          epsilon_end: float = 1e-10,
                          from_samples: bool = False,
                          num_samples: int = 10000) -> None:
    """
    Plot the continuous distribution using seaborn and matplotlib.

    See the scipy.stats documentation for the available cts distributions. The
    distribution's methods are called without shape parameters, so pass an already
    parameterised (frozen) distribution such as ``stats.arcsine()``.

    epsilon              ->  Dist between plot points (the PDF is evaluated at int(1 / epsilon) points)
    epsilon_end          ->  Plot on interval [x,y] where P(X<x) = epsilon_end to P(X<y) = 1 - epsilon_end
    from_samples = True  ->  Plot from random sampling
                 = False ->  Plot the PDF
    num_samples          ->  If from_samples is true gives number of samples

    Example:

        plot_cts_distribution(stats.arcsine(), epsilon_end=1e-2)
        plot_cts_distribution(stats.arcsine(), from_samples=True)

    """
    if from_samples:
        rv_samples = distribution.rvs(size=num_samples)
        sns.distplot(rv_samples, color="m")
    else:
        # Without this branch both plots were drawn for from_samples=True and
        # nothing at all for from_samples=False.
        x = np.linspace(distribution.ppf(0 + epsilon_end),
                        distribution.ppf(1 - epsilon_end), int(1 / epsilon))
        df = pd.DataFrame({'Values': x, 'Probability': distribution.pdf(x)})
        sns.lineplot(x='Values', y='Probability', data=df)
Exemplo n.º 10
def exceedance_probability(distribution: rv_continuous,
                           n_samples: Optional[int] = None):
    """ Calculates the exceedance probability of a random variable following a continuous multivariate distribution.
    Exceedance probability: φ_i = p(∀j != i: x_i > x_j | x ~ ``distribution``).

    :param distribution: the continuous multivariate distribution.
    :param n_samples: the number of realization sampled from the distribution to approximate the exceedance probability.
                      Default to ``None`` and numerical integration is used instead of Monte Carlo simulation.
    :return: the exceedance probability of a random variable following the continuous multivariate distribution,
             normalised so that the entries sum to 1.
    :raises NotImplementedError: if ``n_samples`` is ``None`` and ``distribution`` is neither a frozen
                                 multivariate normal nor a frozen Dirichlet distribution.
    """
    if n_samples is None:  # Numerical integration
        from scipy.stats._multivariate import dirichlet_frozen, multivariate_normal_frozen
        if isinstance(distribution, multivariate_normal_frozen):
            # Speekenbrink, M., & Konstantinidis, E. (2015). Uncertainty and exploration in a restless bandit problem.
            # https://onlinelibrary.wiley.com/doi/pdf/10.1111/tops.12145: p. 4.
            μ, Σ = distribution.mean, distribution.cov
            n = len(μ)
            φ = np.zeros(n)
            neg_eye = -np.eye(n - 1)
            for i in range(n):
                # Rows of A compute x_i - x_j for every j != i, so "x_i is the largest"
                # is the event A @ x > 0, whose probability is a normal CDF value.
                A = np.insert(neg_eye, i, 1, axis=1)
                φ[i] = mvn.cdf(A @ μ, cov=A @ Σ @ A.T)
        elif isinstance(distribution, dirichlet_frozen):
            # Soch, J. & Allefeld, C. (2016). Exceedance Probabilities for the Dirichlet Distribution.
            # https://arxiv.org/pdf/1611.01439.pdf: p. 361.
            α = distribution.alpha
            n = len(α)
            γ = [gammaln(α_i) for α_i in α]

            def f(x, i):
                φ_i = 1
                for j in range(n):
                    if i != j:
                        φ_i *= gammainc(α[j], x)
                return φ_i * exp((α[i] - 1) * log(x) - x - γ[i])

            φ = [
                integrate.quad(f, 0, np.inf, args=(i,))[0]
                for i in range(n)
            ]
        else:
            raise NotImplementedError(
                'Numerical integration not implemented for this distribution!')
        φ = np.array(φ)
    else:  # Monte Carlo simulation
        samples = distribution.rvs(size=n_samples)
        # Count, per component, how often it attains the row maximum.
        φ = (samples == np.amax(samples, axis=1, keepdims=True)).sum(axis=0)
    return φ / φ.sum()
Exemplo n.º 11
def _fit_parametric_family(dist: stats.rv_continuous,
                           sample: np.ndarray) -> _tp.Tuple[float, ...]:
    """Fit the parameters of ``dist`` to ``sample``.

    :param dist: a scipy.stats distribution (or ``stats.multivariate_normal``).
    :param sample: observations to fit.
    :return: for ``stats.multivariate_normal`` the pair (sample mean, sample covariance);
             otherwise whatever ``dist.fit`` returns. ``loc`` is pinned to 0 for
             f, beta, gamma, lognorm, invgauss and pareto, and ``scale`` is additionally
             pinned to 1 for f and beta.
    """
    if dist == stats.multivariate_normal:
        # has no fit method...
        return np.mean(sample, axis=0), np.cov(sample.T, ddof=1)

    if dist in {stats.f, stats.beta}:
        fit_kwargs = {"floc": 0, "fscale": 1}
    elif dist in {stats.gamma, stats.lognorm, stats.invgauss, stats.pareto}:
        fit_kwargs = {"floc": 0}
    else:
        # Every other family is fitted without fixed parameters.
        fit_kwargs = {}

    return dist.fit(sample, **fit_kwargs)  # type: ignore
Exemplo n.º 12
def _resample_parametric(
        sample: np.ndarray, size: int, dist: stats.rv_continuous,
        rng: np.random.Generator) -> _tp.Generator[np.ndarray, None, None]:
    """Yield ``size`` synthetic samples drawn from ``dist`` fitted to ``sample``.

    :param sample: observed data; every generated sample has ``len(sample)`` entries.
    :param size: number of samples to generate.
    :param dist: distribution family to fit and draw from.
    :param rng: random generator used for all draws.
    :return: generator over the synthetic samples.
    """
    n = len(sample)

    # fit parameters by maximum likelihood and sample from that
    if dist == stats.poisson:
        # - poisson has no fit method and there is no scale parameter
        # - random number generation for poisson distribution in scipy seems to be buggy
        mu = np.mean(sample)
        for _ in range(size):
            yield rng.poisson(mu, size=n)
    else:
        # Without this branch the Poisson case fell through into the generic fit.
        args = _fit_parametric_family(dist, sample)
        frozen = dist(*args)
        for _ in range(size):
            yield frozen.rvs(size=n, random_state=rng)
Exemplo n.º 13
    def generate_gammas(x: int, y: int,
                        distribution: stats.rv_continuous=stats.truncnorm,
                        parameters: np.ndarray=np.array([5, 15])) -> np.ndarray:
        """Build a symmetric (x+y) by (x+y) matrix of gamma values.

        An x-by-y matrix is sampled row by row from ``distribution``. Candidates are then
        produced by ``Network.new_cull_gammas`` on a deep copy of that matrix until
        ``Network.new_check_network_legality`` accepts one. The accepted x-by-y block is
        written into the upper-right corner of the result and its transpose into the
        lower-left corner; all other entries stay zero.

        :param x: number of rows of the sampled block.
        :param y: number of columns of the sampled block.
        :param distribution: distribution the values are drawn from.
        :param parameters: positional arguments unpacked into ``distribution.rvs``.
        :return: the (x+y) by (x+y) matrix.
        """
        n_species = x + y

        # One draw of y values per row.
        mRNA_to_miRNA = np.zeros([x, y])
        for mrna_idx in range(x):
            mRNA_to_miRNA[mrna_idx] = distribution.rvs(*parameters, size=y)

        #  Remove some gamma values; retry until the result is accepted.
        while True:
            culled_gammas = Network.new_cull_gammas(x, y, deepcopy(mRNA_to_miRNA))
            if Network.new_check_network_legality(x, y, culled_gammas):
                break

        gammas = np.zeros([n_species, n_species])
        gammas[:x, x:] = culled_gammas
        gammas[x:, :x] = culled_gammas.T
        return gammas
Exemplo n.º 14
def generate_iid_gammas(x: int, y: int,
                        distribution: stats.rv_continuous=stats.truncnorm,
                        parameters: np.ndarray=np.array([1, 5])) -> np.ndarray:
    """
    Generates iid gamma rates between each mRNA and miRNA. Rates are drawn from the given distribution with the given
    parameters.

    :param x: number of mRNAs (rows of the sampled block).
    :param y: number of miRNAs (columns of the sampled block).
    :param distribution: distribution the rates are drawn from.
    :param parameters: positional arguments unpacked into ``distribution.rvs``.
    :return: (x+y) by (x+y) matrix holding the block returned by ``cull_gammas`` in its
             upper-right corner and the transpose of that block in its lower-left corner.
    """
    rates = np.zeros([x + y, x + y])
    mRNA_to_miRNA = np.zeros([x, y])
    for row in range(x):
        mRNA_to_miRNA[row] = distribution.rvs(*parameters, size=y)

    #  Remove some gamma values
    culled_gammas = cull_gammas(x, y, mRNA_to_miRNA)

    rates[:x, x:] = culled_gammas
    rates[x:, :x] = culled_gammas.T

    return rates
Exemplo n.º 15
def run_experiment(n: int, k_vals: dict[str, int], dist: rv_continuous,
                   batch_size: int) -> dict[str, int]:
    """
    Run an experiment with the given number of experts, k_vals, distribution, and batch size.

    :param n: Number of experts.
    :param k_vals: Dictionary with keys that are the names of the k values (i.e., "sqrt", "logn") and
                   values that are the actual k values.
    :param dist: Distribution to draw the expert competencies. Its ``rvs`` receives the sample
                 shape as the first positional argument, so this is presumably expected to be
                 a frozen distribution - confirm against the callers.
    :param batch_size: Number of instances to run at once (makes computation faster as one big array).
    :return: Dictionary with keys as k_value names, and values that are the number of instances
             (out of batch_size total), that the top k experts got the correct answer.
    """
    # Sample competencies
    competencies = dist.rvs((n, batch_size))
    # Sort by expert competencies
    sorted_comps = np.sort(competencies, axis=0)
    # Sample expert opinions from their competencies
    expert_opinions = bernoulli(sorted_comps).rvs()
    # Calculate number correct for each k.
    return {
        k_name: best_k_accuracies(expert_opinions, k_val)
        for k_name, k_val in k_vals.items()
    }
Exemplo n.º 16
def initialize(self, rv_p: stats.rv_continuous, rv_p_kwargs: dict):
    """Freeze the given distribution and draw ``self.n_vars`` values from it.

    :param rv_p: distribution to parameterise.
    :param rv_p_kwargs: keyword arguments used to freeze ``rv_p``; when empty or ``None``,
                        ``loc=0`` and ``scale=1`` are used.

    Sets ``self.rv_p`` (the frozen distribution) and ``self.p`` (the drawn values).
    """
    if not rv_p_kwargs:
        rv_p_kwargs = {'loc': 0, 'scale': 1}
    self.rv_p = rv_p(**rv_p_kwargs)
    # Draw from the frozen distribution so that the supplied parameters actually apply;
    # sampling from the bare ``rv_p`` ignored them.
    self.p = self.rv_p.rvs(size=self.n_vars)