def test_missing_cardinality(self, schema):
    """Check how ``validate_schema`` treats the cardinality suffix of a categorical.

    An empty suffix (``"categorical:"``) and the suffix ``"01"`` must both be
    rejected with a ``ValueError``; the suffix ``"100"`` must be accepted.
    """
    rejected_specs = ("categorical:", "categorical:01")
    for spec in rejected_specs:
        bad_schema = {"new": spec}
        with pytest.raises(ValueError, match="Missing cardinality for categorical"):
            validate_schema(nodes={"new"}, schema=bad_schema)

    # a well-formed cardinality must pass validation without raising
    good_schema = {"new": "categorical:100"}
    validate_schema(nodes={"new"}, schema=good_schema)
def _init_sem_data_gen(
    graph: nx.DiGraph,
    schema: Dict,
    n_samples: int,
    default_type: str,
    distributions: Dict[str, str],
    seed: int,
):
    """Prepare the objects needed before sampling data from a DAG.

    Seeds numpy's global random state, checks that ``graph`` is acyclic,
    fills in default distributions, validates the schema and allocates the
    (uninitialised) output matrix.

    Args:
        graph: Graph to sample from; must be a DAG.
        schema: Mapping of node name to variable type, passed to
            ``validate_schema``.
        n_samples: Number of rows of the pre-allocated matrix.
        default_type: Variable type handed to ``validate_schema`` as the
            default.
        distributions: Distribution settings, completed by
            ``_set_default_distributions``.
        seed: Value passed to ``np.random.seed``.

    Returns:
        Tuple ``(distributions, var_fte_mapper, x_mat)`` where ``x_mat`` is an
        ``np.empty`` array of shape ``[n_samples, var_fte_mapper.n_features]``.

    Raises:
        ValueError: if the graph is not a DAG.
    """
    np.random.seed(seed)

    graph_is_dag = nx.algorithms.is_directed_acyclic_graph(graph)
    if not graph_is_dag:
        raise ValueError("Provided graph is not a DAG.")

    distributions = _set_default_distributions(distributions=distributions)
    feature_mapper = VariableFeatureMapper(
        validate_schema(nodes=graph.nodes(), schema=schema, default_type=default_type)
    )

    # uninitialised buffer; the caller is expected to fill every column
    sample_buffer = np.empty([n_samples, feature_mapper.n_features])
    return distributions, feature_mapper, sample_buffer
def test_correct_schema(self, schema):
    """A schema covering exactly its own nodes must validate to an equal mapping."""
    node_names = list(schema.keys())
    validated = validate_schema(nodes=node_names, schema=schema)
    assert validated == schema
def test_imputation(self):
    """With no schema given, a node must receive the default type."""
    fallback_type = "continuous"
    validated = validate_schema(
        nodes=["new"],
        schema=None,
        default_type=fallback_type,
    )
    assert validated["new"] == fallback_type
def test_unknown_default_schema(self):
    """An unrecognised ``default_type`` must be rejected with a ``ValueError``."""
    expect_failure = pytest.raises(ValueError, match="Unknown default data type")
    with expect_failure:
        validate_schema(nodes=["new"], schema={}, default_type="unknown")
def test_unknown_data_type(self, schema):
    """An unrecognised type in the schema must be rejected with a ``ValueError``."""
    bad_schema = {"new": "unknown"}
    expect_failure = pytest.raises(ValueError, match="Unknown data type")
    with expect_failure:
        validate_schema(nodes={"new"}, schema=bad_schema)
def sem_generator(
    graph: nx.DiGraph,
    schema: Optional[Dict] = None,
    default_type: str = "continuous",
    noise_std: float = 1.0,
    n_samples: int = 1000,
    distributions: Optional[Dict[str, str]] = None,
    intercept: bool = True,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generator for tabular data with mixed variable types from a DAG.

    Supported variable types: `'binary', 'categorical', 'continuous'`. The
    number of categories can be determined using a colon, e.g.
    `'categorical:5'` specifies a categorical feature with 5 categories.

    Notation: For binary and continuous variables, a "variable" refers to a
    "node", a "feature" refers to the one-hot column for categorical
    variables and is equivalent to a binary or continuous variable.

    Args:
        graph: A DAG in form of a networkx or StructureModel.
        schema: Dictionary with schema for a node/variable, if a node is
            missing uses ``default_type``. Format, {node_name: variable type}.
        default_type: The default data type for a node/variable not listed
            in the schema, or when the schema is empty.
        noise_std: The standard deviation of the noise. The binary and
            categorical features are created using a latent variable
            approach. The noise standard deviation determines how much
            weight the "mean" estimate has on the feature value.
        n_samples: The number of rows/observations to sample.
        distributions:
            "continuous": The type of distribution to use for the noise
                of a continuous variable. Options: 'gaussian'/'normal' (alias)
                (default), 'student-t', 'exponential', 'gumbel'.
            "binary": The type of distribution to use for the noise
                of the latent binary variable. Options: 'probit'/'normal'
                (alias), 'logit' (default).
            "categorical": The type of distribution to use for the noise
                of a latent continuous feature. Options: 'probit'/'normal'
                (alias), 'logit'/'gumbel' (alias) (default).
            "weight": The type of distribution to use for the linear
                coefficients. Options: 'gaussian'/'normal' (alias),
                'uniform' (default).
            "intercept": The type of distribution to use for the intercept.
                For binary/categorical: this is the mean in the latent space.
                Options: 'gaussian'/'normal' (alias), 'uniform' (default).
        intercept: Whether to use an intercept for each feature. The intercept
            is sampled once and held constant for all rows. For binary or
            categorical the intercept determines the class imbalance.
        seed: Seed passed to ``np.random.seed``; note that this sets numpy's
            global random state.

    Returns:
        DataFrame with generated features, uses a one-hot coding for
        categorical features.

    Raises:
        ValueError: if the graph is not a DAG.
        ValueError: if schema variable type is not in `'binary',
            'categorical', 'continuous', 'categorical:X'` (for variables
            with X categories).
        ValueError: if distributions['continuous'] is not 'gaussian',
            'normal', 'student-t', 'exponential', 'gumbel'.
        ValueError: if distributions['binary'] is not 'probit', 'normal',
            'logit'.
        ValueError: if distributions['categorical'] is not 'probit',
            'normal', 'logit', 'gumbel'.
        ValueError: if distributions['weight'] is not 'normal' /
            'gaussian' (alias), 'uniform'.
        ValueError: if distributions['intercept'] is not 'normal' /
            'gaussian' (alias), 'uniform'.

    Example:
        sm = StructureModel()

        sm.add_edges_from([('A', 'C'), ('D', 'C'), ('E', 'D')])

        sm.add_nodes_from(['B', 'F'])

        schema = {'B': 'binary', 'C': 'categorical:5',
                  'E': 'binary', 'F': 'continuous'}

        df = sem_generator(sm, schema, noise_std=1,
                           n_samples=10000,
                           intercept=True,
                           )
    """
    np.random.seed(seed)

    if not nx.algorithms.is_directed_acyclic_graph(graph):
        raise ValueError("Provided graph is not a DAG.")

    distributions = _set_default_distributions(distributions=distributions)
    validated_schema = validate_schema(
        nodes=graph.nodes(), schema=schema, default_type=default_type
    )
    var_fte_mapper = VariableFeatureMapper(validated_schema)

    n_columns = var_fte_mapper.n_features

    # get dependence based on edges in graph (not via adjacency matrix)
    w_mat = _create_weight_matrix(
        edges_w_weights=graph.edges(data="weight"),
        variable_to_indices_dict=var_fte_mapper.var_indices_dict,
        weight_distribution=distributions["weight"],
        intercept_distribution=distributions["intercept"],
        intercept=intercept,
    )

    # pre-allocate array; with an intercept one extra trailing column of
    # ones is appended so the intercept is handled like any other parent
    x_mat = np.empty([n_samples, n_columns + 1 if intercept else n_columns])
    if intercept:
        x_mat[:, -1] = 1

    # loop over sorted features according to ancestry (no parents first), so
    # every parent column is filled before it is read
    for j_node in nx.topological_sort(graph):
        # all feature indices corresponding to the node/variable
        j_idx_list = var_fte_mapper.get_indices(j_node)

        # get all parent feature indices for the variable/node
        parents_idx = var_fte_mapper.get_indices(list(graph.predecessors(j_node)))
        if intercept:
            # build a new list instead of extending in place, so the object
            # handed out by the mapper is never modified
            parents_idx = [*parents_idx, n_columns]

        parent_values = x_mat[:, parents_idx]

        # continuous variable
        if var_fte_mapper.is_var_of_type(j_node, "continuous"):
            x_mat[:, j_idx_list[0]] = _add_continuous_noise(
                mean=parent_values.dot(w_mat[parents_idx, j_idx_list[0]]),
                distribution=distributions["continuous"],
                noise_std=noise_std,
            )

        # binary variable
        elif var_fte_mapper.is_var_of_type(j_node, "binary"):
            x_mat[:, j_idx_list[0]] = _sample_binary_from_latent(
                latent_mean=parent_values.dot(w_mat[parents_idx, j_idx_list[0]]),
                distribution=distributions["binary"],
                noise_std=noise_std,
            )

        # categorical variable: one latent mean per one-hot column
        elif var_fte_mapper.is_var_of_type(j_node, "categorical"):
            x_mat[:, j_idx_list] = _sample_categories_from_latent(
                latent_mean=parent_values.dot(
                    w_mat[np.ix_(parents_idx, j_idx_list)]
                ),
                distribution=distributions["categorical"],
                noise_std=noise_std,
            )

    # drop the helper column of ones before handing the data back
    return pd.DataFrame(
        x_mat[:, :-1] if intercept else x_mat, columns=var_fte_mapper.feature_list
    )