def generate_categorical_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "logit",
    n_categories: int = 3,
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: int = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe with samples from SEM with specified type of noise.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1)
            or Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'probit'/'normal' (alias),
            'logit'/'gumbel' (alias). Logit is default.
        n_categories: Number of categories per variable/node.
        noise_scale: The standard deviation of the noise. The categorical features
            are created using a latent variable approach. The noise standard
            deviation determines how much weight the "mean" estimate has on
            the feature value.
        intercept: Whether to use an intercept for the latent variable of each feature.
        seed: Random state.

    Returns:
        Dataframe of sampled values, shape [n_samples, d_nodes].

    Raises:
        ValueError: if distribution is not 'probit', 'normal', 'logit', 'gumbel'.
    """
    if kernel is None:
        return sem_generator(
            graph=sm,
            default_type=f"categorical:{n_categories}",
            n_samples=n_samples,
            distributions={"categorical": distribution},
            noise_std=noise_scale,
            intercept=intercept,
            seed=seed,
        )
    return nonlinear_sem_generator(
        graph=sm,
        kernel=kernel,
        default_type=f"categorical:{n_categories}",
        n_samples=n_samples,
        distributions={"categorical": distribution},
        noise_std=noise_scale,
        seed=seed,
    )
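
# Usage sketch (illustrative only, not part of the module): sample categorical data from a
# small hand-built DAG. The graph, sample size and seed below are arbitrary choices; any
# weight-free networkx DiGraph or StructureModel can be passed as ``sm``.
#
#     import networkx as nx
#
#     sm = nx.DiGraph([("a", "b"), ("a", "c"), ("b", "c")])
#     df = generate_categorical_dataframe(sm, n_samples=1000, n_categories=3, seed=42)
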
def generate_binary_data(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "logit",
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: int = None,
    kernel: Optional[Kernel] = None,
) -> np.ndarray:
    """
    Simulate samples from SEM with specified type of noise.
    The order of the columns on the returned array is the one provided by `sm.nodes`.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1)
            or Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'probit'/'normal' (alias), 'logit' (default).
        noise_scale: The standard deviation of the noise. The binary and
            categorical features are created using a latent variable approach.
            The noise standard deviation determines how much weight the "mean"
            estimate has on the feature value.
        intercept: Whether to use an intercept for the latent variable of each feature.
        seed: Random state.

    Returns:
        x_mat: [n_samples, d_nodes] sample matrix

    Raises:
        ValueError: if distribution isn't 'probit', 'normal', 'logit'.
    """
    if kernel is None:
        df = sem_generator(
            graph=sm,
            default_type="binary",
            n_samples=n_samples,
            distributions={"binary": distribution},
            noise_std=noise_scale,
            intercept=intercept,
            seed=seed,
        )
    else:
        df = nonlinear_sem_generator(
            graph=sm,
            kernel=kernel,
            default_type="binary",
            n_samples=n_samples,
            distributions={"binary": distribution},
            noise_std=noise_scale,
            seed=seed,
        )
    return df[list(sm.nodes())].values
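
# Usage sketch (illustrative only): binary samples come back as a numpy array whose
# columns follow the order of ``sm.nodes``. The graph, sample size and seed are arbitrary.
#
#     import networkx as nx
#
#     sm = nx.DiGraph([("a", "b"), ("b", "c")])
#     x_mat = generate_binary_data(sm, n_samples=500, distribution="logit", seed=7)
#     x_mat.shape  # (500, 3)
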
def generate_continuous_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "gaussian",
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: int = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe with samples from SEM with specified type of noise.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1)
            or Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'gaussian'/'normal' (alias), 'student-t',
            'exponential', 'gumbel'.
        noise_scale: The standard deviation of the noise.
        intercept: Whether to use an intercept for each feature.
        seed: Random state.

    Returns:
        Dataframe with the node names as column names.

    Raises:
        ValueError: if distribution is not 'gaussian', 'normal', 'student-t',
            'exponential', 'gumbel'.
    """
    if kernel is None:
        return sem_generator(
            graph=sm,
            default_type="continuous",
            n_samples=n_samples,
            distributions={"continuous": distribution},
            noise_std=noise_scale,
            intercept=intercept,
            seed=seed,
        )
    return nonlinear_sem_generator(
        graph=sm,
        kernel=kernel,
        default_type="continuous",
        n_samples=n_samples,
        distributions={"continuous": distribution},
        noise_std=noise_scale,
        seed=seed,
    )
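
# Usage sketch (illustrative only): continuous samples with Gaussian noise, plus a nonlinear
# variant using an RBF kernel as mentioned in the docstring. RBF comes from
# sklearn.gaussian_process.kernels; the graph, sample size and seed are arbitrary choices.
#
#     import networkx as nx
#     from sklearn.gaussian_process.kernels import RBF
#
#     sm = nx.DiGraph([("a", "b"), ("a", "c")])
#     df_linear = generate_continuous_dataframe(sm, n_samples=1000, distribution="gaussian", seed=1)
#     df_nonlinear = generate_continuous_dataframe(sm, n_samples=1000, kernel=RBF(1), seed=1)
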
def generate_count_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    zero_inflation_factor: float = 0.1,
    intercept: bool = False,
    seed: int = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe with samples from SEM with specified type of noise.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1)
            or Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        zero_inflation_factor: The probability of zero inflation for count data.
        intercept: Whether to use an intercept for the latent variable of each feature.
        seed: Random state.

    Returns:
        Dataframe of sampled counts, shape [n_samples, d_nodes].

    Raises:
        ValueError: if ``zero_inflation_factor`` is not a float in [0, 1].
    """
    if kernel is None:
        return sem_generator(
            graph=sm,
            default_type="count",
            n_samples=n_samples,
            distributions={"count": zero_inflation_factor},
            noise_std=1,  # not used for poisson
            intercept=intercept,
            seed=seed,
        )
    return nonlinear_sem_generator(
        graph=sm,
        kernel=kernel,
        default_type="count",
        n_samples=n_samples,
        distributions={"count": zero_inflation_factor},
        noise_std=1,  # not used for poisson
        seed=seed,
    )
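
# Usage sketch (illustrative only): zero-inflated Poisson counts. ``zero_inflation_factor``
# must be a float in [0, 1]; the value below and the graph/seed are arbitrary choices.
#
#     import networkx as nx
#
#     sm = nx.DiGraph([("a", "b"), ("b", "c")])
#     df_counts = generate_count_dataframe(sm, n_samples=1000, zero_inflation_factor=0.2, seed=3)
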