Пример #1
0
def generate_categorical_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "logit",
    n_categories: int = 3,
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: Optional[int] = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe of categorical samples from a SEM.

    Every node is generated with the type ``categorical:<n_categories>``.
    Without a kernel the call is delegated to ``sem_generator``; with a kernel
    it is delegated to ``nonlinear_sem_generator``.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'probit'/'normal' (alias),
            "logit"/"gumbel" (alias). Logit is default.
        n_categories: Number of categories per variable/node.
        noise_scale: The standard deviation of the noise. The categorical features
            are created using a latent variable approach. The noise standard
            deviation determines how much weight the "mean" estimate has on
            the feature value.
        intercept: Whether to use an intercept for the latent variable of each
            feature. Only forwarded when ``kernel`` is None; the kernel-based
            generator is called without it.
        seed: Random state. ``None`` is passed through to the generator unchanged.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables. If None, the
            ``sem_generator`` path is used instead.
    Returns:
        The dataframe produced by the underlying generator.
    Raises:
        ValueError: if distribution is not 'probit', 'normal', 'logit', 'gumbel'
            (raised by the underlying generator; this wrapper performs no
            validation itself).
    """
    # Arguments that both generators receive.
    shared_kwargs = dict(
        graph=sm,
        default_type=f"categorical:{n_categories}",
        n_samples=n_samples,
        distributions={"categorical": distribution},
        noise_std=noise_scale,
        seed=seed,
    )

    if kernel is None:
        return sem_generator(intercept=intercept, **shared_kwargs)

    # NOTE: ``intercept`` is intentionally not forwarded on this path, matching
    # the call the original implementation made.
    return nonlinear_sem_generator(kernel=kernel, **shared_kwargs)
Пример #2
0
def generate_binary_data(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "logit",
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: Optional[int] = None,
    kernel: Optional[Kernel] = None,
) -> np.ndarray:
    """
    Simulate binary samples from SEM with specified type of noise.
    The order of the columns on the returned array is the one provided by `sm.nodes`

    Without a kernel the call is delegated to ``sem_generator``; with a kernel
    it is delegated to ``nonlinear_sem_generator``.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'probit'/'normal' (alias),
            'logit' (default).
        noise_scale: The standard deviation of the noise. The binary and
            categorical features are created using a latent variable approach.
            The noise standard deviation determines how much weight the "mean"
            estimate has on the feature value.
        intercept: Whether to use an intercept for the latent variable of each
            feature. Only forwarded when ``kernel`` is None; the kernel-based
            generator is called without it.
        seed: Random state. ``None`` is passed through to the generator unchanged.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables. If None, the
            ``sem_generator`` path is used instead.
    Returns:
        x_mat: [n_samples,d_nodes] sample matrix
    Raises:
        ValueError: if distribution isn't 'probit', 'normal', 'logit'
            (raised by the underlying generator; this wrapper performs no
            validation itself).
    """
    # Arguments that both generators receive.
    shared_kwargs = dict(
        graph=sm,
        default_type="binary",
        n_samples=n_samples,
        distributions={"binary": distribution},
        noise_std=noise_scale,
        seed=seed,
    )

    if kernel is None:
        df = sem_generator(intercept=intercept, **shared_kwargs)
    else:
        # NOTE: ``intercept`` is intentionally not forwarded on this path,
        # matching the call the original implementation made.
        df = nonlinear_sem_generator(kernel=kernel, **shared_kwargs)

    # Select the columns in ``sm.nodes`` order so the array layout follows the
    # graph rather than whatever column order the generator produced.
    return df[list(sm.nodes())].values
Пример #3
0
def generate_continuous_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "gaussian",
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: Optional[int] = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe with samples from SEM with specified type of noise.

    Every node is generated with the type ``continuous``. Without a kernel the
    call is delegated to ``sem_generator``; with a kernel it is delegated to
    ``nonlinear_sem_generator``.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'gaussian'/'normal' (alias), 'student-t',
            'exponential', 'gumbel'.
        noise_scale: The standard deviation of the noise.
        intercept: Whether to use an intercept for each feature. Only forwarded
            when ``kernel`` is None; the kernel-based generator is called
            without it.
        seed: Random state. ``None`` is passed through to the generator unchanged.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables. If None, the
            ``sem_generator`` path is used instead.
    Returns:
        Dataframe with the node names as column names
    Raises:
        ValueError: if distribution is not 'gaussian', 'normal', 'student-t',
            'exponential', 'gumbel' (raised by the underlying generator; this
            wrapper performs no validation itself).
    """
    # Arguments that both generators receive.
    shared_kwargs = dict(
        graph=sm,
        default_type="continuous",
        n_samples=n_samples,
        distributions={"continuous": distribution},
        noise_std=noise_scale,
        seed=seed,
    )

    if kernel is None:
        return sem_generator(intercept=intercept, **shared_kwargs)

    # NOTE: ``intercept`` is intentionally not forwarded on this path, matching
    # the call the original implementation made.
    return nonlinear_sem_generator(kernel=kernel, **shared_kwargs)
Пример #4
0
def generate_count_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    zero_inflation_factor: float = 0.1,
    intercept: bool = False,
    seed: Optional[int] = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe of count samples from a SEM.

    Every node is generated with the type ``count``. Without a kernel the call
    is delegated to ``sem_generator``; with a kernel it is delegated to
    ``nonlinear_sem_generator``.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        zero_inflation_factor: The probability of zero inflation for count data.
            It is handed to the generator as the ``"count"`` entry of
            ``distributions``.
        intercept: Whether to use an intercept for the latent variable of each
            feature. Only forwarded when ``kernel`` is None; the kernel-based
            generator is called without it.
        seed: Random state. ``None`` is passed through to the generator unchanged.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables. If None, the
            ``sem_generator`` path is used instead.
    Returns:
        The dataframe produced by the underlying generator.
    Raises:
        ValueError: if ``zero_inflation_factor`` is not a float in [0, 1]
            (raised by the underlying generator; this wrapper performs no
            validation itself).
    """
    # Arguments that both generators receive.
    shared_kwargs = dict(
        graph=sm,
        default_type="count",
        n_samples=n_samples,
        distributions={"count": zero_inflation_factor},
        noise_std=1,  # not used for poisson
        seed=seed,
    )

    if kernel is None:
        return sem_generator(intercept=intercept, **shared_kwargs)

    # NOTE: ``intercept`` is intentionally not forwarded on this path, matching
    # the call the original implementation made.
    return nonlinear_sem_generator(kernel=kernel, **shared_kwargs)