Exemplo n.º 1
def sample_trivariate_xyz(size=1000, seed=42):
    """Sample from three dimensional toy dataset.

    The output is a DataFrame containing three columns:

    * ``x``: Beta distribution with a=0.1 and b=0.1
    * ``y``: Beta distribution with a=0.1 and b=0.5
    * ``z``: Normal distribution + 10 times ``y``

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with three columns, ``x``, ``y`` and ``z``.
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        x = stats.beta.rvs(a=0.1, b=0.1, size=size)
        y = stats.beta.rvs(a=0.1, b=0.5, size=size)
        return pd.DataFrame({
            'x': x,
            'y': y,
            # ``z`` is built from ``y`` so the two columns are dependent.
            'z': np.random.normal(size=size) + y * 10
        })
Exemplo n.º 2
def sample_bivariate_age_income(size=1000, seed=42):
    """Sample from a bivariate toy dataset.

    This dataset contains two columns which correspond to the simulated age and
    income which are positively correlated with outliers.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with two columns, ``age`` and ``income``.
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        # Beta(2, 6) shifted by 18 and scaled by 100.
        age = stats.beta.rvs(a=2.0, b=6.0, loc=18, scale=100, size=size)
        # Income grows with log(age); Gaussian noise (scale 10) is added on top.
        income = np.log(age) * 100
        income += np.random.normal(loc=np.log(age) / 100, scale=10, size=size)
        # Outliers: rows whose random digit in [0, 10) equals 0 get their
        # income divided by 1000.
        income[np.random.randint(0, 10, size=size) == 0] /= 1000

    return pd.DataFrame({
        "age": age,
        "income": income
    })
Exemplo n.º 3
def sample_univariate_beta(size=1000, seed=42):
    """Sample from a beta distribution with a=3 and b=1 and loc=4.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        return pd.Series(stats.beta.rvs(a=3, b=1, loc=4, size=size))
Exemplo n.º 4
def sample_univariate_degenerate(size=1000, seed=42):
    """Sample from a degenerate distribution that only takes one random value.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        # A single random draw is repeated ``size`` times, so every entry of
        # the returned Series holds the same value.
        return pd.Series(np.full(size, np.random.random()))
Exemplo n.º 5
def sample_univariate_exponential(size=1000, seed=42):
    """Sample from an exponential distribution at 3.0 with rate 1.0.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        # Default-scale exponential draws shifted right by 3.0.
        return pd.Series(np.random.exponential(size=size) + 3.0)
Exemplo n.º 6
def sample_univariate_normal(size=1000, seed=42):
    """Sample from a normal distribution with mean 1 and stdev 1.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        return pd.Series(np.random.normal(size=size, loc=1.0))
Exemplo n.º 7
def sample_univariate_uniform(size=1000, seed=42):
    """Sample from a uniform distribution in [-1.0, 3.0].

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        # Unit-interval draws are stretched by 4.0 and shifted by -1.0.
        return pd.Series(4.0 * np.random.random(size=size) - 1.0)
Exemplo n.º 8
def load_age_income(seed=42):
    """Build a fixed-size bivariate toy dataset of age and income.

    This dataset contains two columns which correspond to the simulated age and
    income which are positively correlated with outliers.

    Args:
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with 500 rows and two columns, ``age`` and ``income``.
    """
    # Number of rows generated; fixed for this loader.
    N = 500
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        # Beta(2, 6) shifted by 18 and scaled by 100.
        age = stats.beta.rvs(a=2.0, b=6.0, loc=18, scale=100, size=N)
        # Income grows with log(age); Gaussian noise (scale 10) is added on top.
        income = np.log(age) * 100
        income += np.random.normal(loc=np.log(age) / 100, scale=10, size=N)
        # Outliers: rows whose random digit in [0, 10) equals 0 get their
        # income divided by 1000.
        income[np.random.randint(0, 10, size=N) == 0] /= 1000
    return pd.DataFrame({
        "age": age,
        "income": income
    })
Exemplo n.º 9
def sample_univariate_bernoulli(size=1000, seed=42):
    """Sample from a Bernoulli distribution with p=0.3.

    The distribution is built by sampling a uniform random and then setting
    1 when the value is below 0.3 and 0 otherwise.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values, stored as floats (0.0 or 1.0).
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        # The boolean comparison is cast to float so the Series holds 0.0/1.0.
        return pd.Series(np.random.random(size=size) < 0.3).astype(float)
Exemplo n.º 10
def load_three_dimensional(seed=42):
    """Build a fixed-size three dimensional toy dataset.

    This dataset contains 3 columns:

        x - a Beta distribution with a=0.1 and b=0.1
        y - a Beta distribution with a=0.1 and b=0.5
        z - a standard normal sample plus 10 times ``y``

    Args:
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with 1000 rows and three columns, ``x``, ``y`` and ``z``.
    """
    # Pre-allocated (rows, columns) buffer; each column is filled in below.
    data = np.zeros((1000, 3))
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        data[:, 0] = stats.beta.rvs(a=0.1, b=0.1, size=data.shape[0])
        data[:, 1] = stats.beta.rvs(a=0.1, b=0.5, size=data.shape[0])
        # The third column is derived from the second, so ``z`` depends on ``y``.
        data[:, 2] = np.random.normal(size=data.shape[0]) + data[:, 1] * 10
    return pd.DataFrame(data, columns=["x", "y", "z"])
Exemplo n.º 11
def sample_univariate_bimodal(size=1000, seed=42):
    """Sample from a bimodal distribution which mixes two Gaussians at 0.0 and 10.0 with stdev=1.

    The distribution is built by sampling a standard normal and a normal with mean ``10``
    and then selecting one or the other based on a bernoulli distribution.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        # NOTE(review): ``sample_univariate_bernoulli`` is called with the same
        # seed while this ``random_seed`` block is already active; how the two
        # interact depends on ``random_seed``'s implementation -- confirm.
        bernoulli = sample_univariate_bernoulli(size, seed)
        # Rows where the selector is 1 keep the standard normal draw ...
        mode1 = np.random.normal(size=size) * bernoulli
        # ... and rows where it is 0 keep the draw centred at 10.
        mode2 = np.random.normal(size=size, loc=10) * (1.0 - bernoulli)

        return pd.Series(mode1 + mode2)
Exemplo n.º 12
def load_diverse_univariates(seed=42):
    """Build a fixed-size dataset of diverse univariate distributions.

    This dataset contains 6 columns, each of which corresponds to a different
    univariate distribution:

        bernoulli - a Bernoulli distribution with p=0.3
        bimodal - a mixture of two Gaussians at 0.0 and 10.0 with stdev=1
        uniform - a uniform distribution in [-1.0, 3.0]
        normal - a normal distribution at 1.0 with stdev=1
        constant - a constant value
        exponential - an exponential distribution at 3.0 with rate 1.0

    Args:
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with 1000 rows and the six columns listed above.
    """
    # Number of rows generated; fixed for this loader.
    size = 1000
    df = pd.DataFrame()
    # NOTE(review): ``random_seed`` is defined outside this block; presumably it
    # seeds the global random state for the duration of the ``with`` -- confirm.
    with random_seed(seed):
        df["bernoulli"] = (np.random.random(size=size) < 0.3).astype(float)
        # The ``bernoulli`` column selects, per row, between a standard normal
        # draw (selector 1) and a draw centred at 10 (selector 0).
        df["bimodal"] = np.random.normal(size=size) * df["bernoulli"] + \
            np.random.normal(size=size, loc=10) * (1.0 - df["bernoulli"])
        df["uniform"] = 4.0 * np.random.random(size=size) - 1.0
        df["normal"] = np.random.normal(size=size, loc=1.0)
        df["constant"] = np.random.random()  # a single random number
        df["exponential"] = np.random.exponential(size=size) + 3.0
    return df