def train(self, J, G, verbose=True):
    """Trains the sampler on the training data to resample the features
    in J conditional on G.

    Args:
        J: features of interest
        G: arbitrary set of conditioning variables
        verbose: whether to log progress
    """
    J = Sampler._to_array(list(J))
    G = Sampler._to_array(list(G))
    super().train(J, G, verbose=verbose)

    if not self._train_J_degenerate(J, G, verbose=verbose):
        G_disjoint = set(G).isdisjoint(self.cat_inputs)
        J_disjoint = set(J).isdisjoint(self.cat_inputs)
        if not G_disjoint or not J_disjoint:
            raise NotImplementedError('GaussianConditionalEstimator does '
                                      'not support categorical variables.')

        # variables to be sampled with the Gaussian estimator
        J_R = list(set(J) - set(G))
        # variables already contained in G, to be returned via the identity sampler
        J_G = list(set(J) - set(J_R))

        gaussian_estimator = GaussianConditionalEstimator()
        train_inputs = self.X_train[Sampler._order_fset(J_R)].to_numpy()
        train_context = self.X_train[Sampler._order_fset(G)].to_numpy()

        gaussian_estimator.fit(train_inputs=train_inputs,
                               train_context=train_context)

        J_G_ixs = fset_to_ix(G, J)
        samplef_J_G = sample_id(J_G_ixs)

        ixs_J_G = fset_to_ix(J, J_G)
        ixs_J_R = fset_to_ix(J, J_R)

        def samplefunc(eval_context, **kwargs):
            sample_J_G = samplef_J_G(eval_context, **kwargs)
            sample_J_R = gaussian_estimator.sample(eval_context, **kwargs)
            sample = np.zeros((sample_J_R.shape[0], sample_J_R.shape[1],
                               sample_J_R.shape[2] + sample_J_G.shape[2]))
            sample[:, :, ixs_J_G] = sample_J_G
            sample[:, :, ixs_J_R] = sample_J_R
            return sample

        self._store_samplefunc(J, G, samplefunc, verbose=verbose)

        return gaussian_estimator
    else:
        return None
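# Illustrative sketch (not part of the sampler above): GaussianConditionalEstimator
# presumably fits a joint Gaussian over (J_R, G) and samples J_R | G from the induced
# conditional. The standalone helper below shows that conditioning rule with plain
# NumPy; the names and shapes here are assumptions made for illustration only.
import numpy as np

def conditional_gaussian_params(mu, cov, ix_in, ix_ctx, context):
    """Mean and covariance of the input block given the context block
    under a joint Gaussian with mean mu and covariance cov."""
    cov_ic = cov[np.ix_(ix_in, ix_ctx)]
    cov_cc = cov[np.ix_(ix_ctx, ix_ctx)]
    cond_mean = mu[ix_in] + cov_ic @ np.linalg.solve(cov_cc, context - mu[ix_ctx])
    cond_cov = cov[np.ix_(ix_in, ix_in)] - cov_ic @ np.linalg.solve(cov_cc, cov_ic.T)
    return cond_mean, cond_cov

# Example: distribution of x0 given x1 = 1.0 for a bivariate Gaussian with correlation 0.8
mu = np.array([0.0, 0.0])
cov = np.array([[1.0, 0.8], [0.8, 1.0]])
print(conditional_gaussian_params(mu, cov, [0], [1], np.array([1.0])))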
def __call__(self,
             estimator: ConditionalDistributionEstimator,
             sem: StructuralEquationModel,
             target_var: str,
             context_vars: Tuple[str],
             exp_args: Union[DictConfig, dict],
             conditioning_mode: str = 'all',
             test_df: pd.DataFrame = None,
             sort_estimator_context=True):
    logger.info(f"Calculating {self.name} for {target_var} / {context_vars}")
    assert target_var not in context_vars

    context = {node: torch.tensor(test_df.loc[:, node]) for node in context_vars}
    context_size = len(test_df)

    logger.info("Initializing SEM conditional distributions")
    if conditioning_mode == 'true_parents':
        data_log_prob = sem.parents_conditional_distribution(
            target_var, parents_context=context).log_prob
    elif conditioning_mode in ('true_markov_blanket', 'all'):
        data_log_prob = sem.mb_conditional_log_prob(
            target_var, global_context=context, **exp_args['mb_dist'])
    elif conditioning_mode == 'arbitrary' and isinstance(sem, LinearGaussianNoiseSEM):
        data_log_prob = sem.conditional_distribution(
            target_var, context=context).log_prob
    else:
        raise NotImplementedError('Unknown conditioning type!')

    def log_prob_from_np(value, log_prob_func):
        value = torch.tensor([[value]])
        with torch.no_grad():
            return log_prob_func(value.repeat(context_size, 1)).squeeze()

    logger.info("Initializing estimator's conditional distributions")
    if sort_estimator_context:
        test_context = test_df[Sampler._order_fset(context_vars)].to_numpy()
    else:
        test_context = test_df[context_vars].to_numpy()
    model_log_prob = estimator.conditional_distribution(test_context).log_prob

    logger.info("Calculating integral")
    if self.name in ('conditional_kl_divergence', 'conditional_hellinger_distance'):

        def integrand(value):
            data_log_p = log_prob_from_np(value, data_log_prob)
            model_log_p = log_prob_from_np(value, model_log_prob)
            if self.name == 'conditional_kl_divergence':
                res = (data_log_p - model_log_p) * data_log_p.exp()
            elif self.name == 'conditional_hellinger_distance':
                res = (torch.sqrt(data_log_p.exp()) -
                       torch.sqrt(model_log_p.exp())) ** 2
            res[torch.isnan(res)] = 0.0  # Out-of-support values
            return res.numpy()

        if self.name == 'conditional_kl_divergence':
            result = integrate.quad_vec(integrand, *sem.support_bounds,
                                        epsabs=exp_args['metrics']['epsabs'])[0]
        else:
            result = integrate.quad_vec(integrand, -np.inf, np.inf,
                                        epsabs=exp_args['metrics']['epsabs'])[0]

        if self.name == 'conditional_hellinger_distance':
            result = np.sqrt(0.5 * result)

    elif self.name == 'conditional_js_divergence':
        # Functions to integrate
        def integrand1(value):
            data_log_p = log_prob_from_np(value, data_log_prob)
            model_log_p = log_prob_from_np(value, model_log_prob)
            log_mixture = np.log(0.5) + torch.logsumexp(
                torch.stack([data_log_p, model_log_p]), 0)
            res = (data_log_p - log_mixture) * data_log_p.exp()
            res[torch.isnan(res)] = 0.0  # Out-of-support values
            return res.numpy()

        def integrand2(value):
            data_log_p = log_prob_from_np(value, data_log_prob)
            model_log_p = log_prob_from_np(value, model_log_prob)
            log_mixture = np.log(0.5) + torch.logsumexp(
                torch.stack([data_log_p, model_log_p]), 0)
            res = (model_log_p - log_mixture) * model_log_p.exp()
            res[torch.isnan(res)] = 0.0  # Out-of-support values
            return res.numpy()

        result = 0.5 * (
            integrate.quad_vec(integrand1, -np.inf, np.inf,
                               epsabs=exp_args['metrics']['epsabs'])[0] +
            integrate.quad_vec(integrand2, -np.inf, np.inf,
                               epsabs=exp_args['metrics']['epsabs'])[0])

    else:
        raise NotImplementedError()

    # Bounds check
    if not ((result + exp_args['metrics']['epsabs']) >= 0.0).all():
        logger.warning(
            f'{((result + exp_args["metrics"]["epsabs"]) < 0.0).sum()} contexts have '
            f'negative distances. Please increase the conditional distribution '
            f'estimation accuracy. Negative values will be ignored.')
        result = result[(result + exp_args['metrics']['epsabs']) >= 0.0]

    if self.name == 'conditional_js_divergence' and \
            not ((result - exp_args['metrics']['epsabs']) <= np.log(2)).all():
        logger.warning(
            f'{((result - exp_args["metrics"]["epsabs"]) > np.log(2)).sum()} contexts '
            f'have distances larger than log(2). Please increase the conditional '
            f'distribution estimation accuracy. Larger values will be ignored.')
        result = result[(result - exp_args['metrics']['epsabs']) <= np.log(2)]
    elif self.name == 'conditional_hellinger_distance' and \
            not ((result - exp_args['metrics']['epsabs']) <= 1.0).all():
        logger.warning(
            f'{((result - exp_args["metrics"]["epsabs"]) > 1.0).sum()} contexts '
            f'have distances larger than 1.0. Please increase the conditional '
            f'distribution estimation accuracy. Larger values will be ignored.')
        result = result[(result - exp_args['metrics']['epsabs']) <= 1.0]

    return result.mean()
def _pd_to_np(df):
    return Sampler._pd_to_np(df)
def _order_fset(S):
    return Sampler._order_fset(S)
def _to_array(S):
    """Converts to a numpy array."""
    return Sampler._to_array(S)
def _to_key(S):
    """Converts an array to a key for the trainedCs dict."""
    return Sampler._to_key(S)
def train(self, J, G, verbose=True):
    """Trains a conditional sampler for the features J given the context G."""
    J = Sampler._to_array(J)
    G = Sampler._to_array(G)
    super().train(J, G, verbose=verbose)

    if not self._train_J_degenerate(J, G, verbose=verbose):
        if not set(J).isdisjoint(self.cat_inputs) and len(J) > 1:
            raise NotImplementedError('Sampling multiple categorical or '
                                      'mixed variables is not supported.')

        train_inputs = self.X_train[sorted(set(J))].to_numpy()
        train_context = self.X_train[sorted(set(G))].to_numpy()
        context_size = train_context.shape[1]

        # Categorical variables in the context
        cat_context = None
        cat_ixs = None  # stays None when the context has no categorical features
        if not set(G).isdisjoint(self.cat_inputs):
            G_cat = list(set(G).intersection(self.cat_inputs))
            cat_ixs = utils.fset_to_ix(sorted(G), sorted(G_cat))
            # cat_context = search_nonsorted(G, G_cat)
            logger.info('One-hot encoding the following context '
                        'features: {}'.format(G_cat))

        # Categorical variable as input
        if not set(J).isdisjoint(self.cat_inputs):
            logger.info(f'One-hot encoding the following input features: {J}')
            logger.info(f'Fitting categorical sampler for fset {J}.')
            logger.info(f'Fitting method: {self.fit_method}.')
            logger.info(f'Fitting parameters: {self.fit_params}')
            model = CategoricalEstimator(context_size=context_size,
                                         cat_context=cat_ixs,
                                         **self.fit_params)
        # Continuous variable as input
        else:
            logger.info(f'Fitting continuous sampler for features {J}. '
                        f'Fitting method: {self.fit_method}. '
                        f'Fitting parameters: {self.fit_params}')
            model = MixtureDensityNetworkEstimator(inputs_size=len(J),
                                                   context_size=context_size,
                                                   cat_context=cat_context,
                                                   **self.fit_params)

        # Fitting the sampler
        getattr(model, self.fit_method)(train_inputs=train_inputs,
                                        train_context=train_context,
                                        **self.fit_params)

        def samplefunc(eval_context, **kwargs):
            return model.sample(eval_context, **kwargs)

        self._store_samplefunc(J, G, samplefunc, verbose=verbose)

        return model
    else:
        return None
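# Illustrative note: utils.fset_to_ix(sorted(G), sorted(G_cat)) presumably returns the
# positions of the categorical context features within the ordered context columns, so
# the estimator knows which context dimensions to one-hot encode. A minimal sketch of
# that assumed index lookup (not the project's implementation):
def _fset_to_ix_sketch(ordered_fset, subset):
    return [list(ordered_fset).index(f) for f in subset]

# e.g. _fset_to_ix_sketch(['age', 'income', 'sex'], ['sex']) -> [2]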
def train(self, J, G, verbose=True):
    """Trains a conditional sampler for the features J given the context G."""
    J = Sampler._to_array(J)
    G = Sampler._to_array(G)
    super().train(J, G, verbose=verbose)

    if not self._train_J_degenerate(J, G, verbose=verbose):
        if not set(J).isdisjoint(self.cat_inputs) and len(J) > 1:
            raise NotImplementedError('Sampling multiple categorical or '
                                      'mixed variables is not supported.')

        # TODO(valik): adjust to loading data from the data frame using
        # J, G lists of column names
        train_inputs = self.X_train[Sampler._order_fset(J)].to_numpy()
        train_context = self.X_train[Sampler._order_fset(G)].to_numpy()
        context_size = train_context.shape[1]

        # Categorical variables in the context
        cat_context = None
        cat_ixs = None  # stays None when the context has no categorical features
        if not set(G).isdisjoint(self.cat_inputs):
            G_cat = list(set(G).intersection(self.cat_inputs))
            cat_ixs = utils.fset_to_ix(sorted(G), sorted(G_cat))
            # cat_context = search_nonsorted(G, G_cat)
            logger.info('One-hot encoding the following context '
                        'features: {}'.format(G_cat))

        # Categorical variable as input
        if not set(J).isdisjoint(self.cat_inputs):
            logger.info(f'One-hot encoding the input features: {J}')
            logger.info(f'Fitting categorical sampler for features {J}. '
                        f'Fitting method: {self.fit_method}. '
                        f'Fitting parameters: {self.fit_params}')
            model = CategoricalEstimator(context_size=context_size,
                                         cat_context=cat_ixs,
                                         **self.fit_params)
        # Continuous variable as input
        else:
            logger.info(f'Fitting continuous sampler for features {J}. '
                        f'Fitting method: {self.fit_method}. '
                        f'Fitting parameters: {self.fit_params}')
            model = NormalisingFlowEstimator(inputs_size=len(J),
                                             context_size=context_size,
                                             cat_context=cat_ixs,
                                             **self.fit_params)

        # Fitting the sampler
        getattr(model, self.fit_method)(train_inputs=train_inputs,
                                        train_context=train_context,
                                        **self.fit_params)

        def samplefunc(eval_context, **kwargs):
            # eval_context: numpy array with columns ordered by feature name
            return model.sample(eval_context, **kwargs)

        self._store_samplefunc(J, G, samplefunc, verbose=verbose)

        return model
    else:
        return None
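# Illustrative sketch (not the estimators used above): both train() variants follow
# the same pattern -- fit a conditional model of the inputs J given the context G on
# X_train, then wrap its .sample() in a closure that is stored via _store_samplefunc.
# The toy below mimics that pattern with a simple linear-Gaussian conditional model
# instead of an MDN / normalising flow; every name here is an illustrative assumption,
# not the project's API.
import numpy as np

class LinearGaussianConditional:
    """Fits inputs ~ Normal(W @ context + b, sigma^2) per input dimension."""

    def fit(self, train_inputs, train_context):
        X = np.hstack([train_context, np.ones((len(train_context), 1))])
        self.coef_, *_ = np.linalg.lstsq(X, train_inputs, rcond=None)
        resid = train_inputs - X @ self.coef_
        self.sigma_ = resid.std(axis=0)
        return self

    def sample(self, eval_context, num_samples=1):
        X = np.hstack([eval_context, np.ones((len(eval_context), 1))])
        mean = X @ self.coef_
        noise = np.random.randn(num_samples, *mean.shape) * self.sigma_
        return mean[None, :, :] + noise  # (num_samples, n_obs, n_inputs)

# Usage mirroring train(): fit on training data, then store a sampling closure.
rng = np.random.default_rng(0)
ctx = rng.normal(size=(200, 2))
inp = ctx @ np.array([[1.0], [-0.5]]) + 0.1 * rng.normal(size=(200, 1))
model = LinearGaussianConditional().fit(inp, ctx)
samplefunc = lambda eval_context, **kw: model.sample(eval_context, **kw)
print(samplefunc(ctx[:5], num_samples=3).shape)  # (3, 5, 1)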
def samplefunc(eval_context, **kwargs):
    # eval_context: DataFrame; order columns by feature name and convert
    # to a numpy array before sampling
    eval_context = eval_context[Sampler._order_fset(G)].to_numpy()
    return model.sample(eval_context, **kwargs)
def samplefunc(eval_context, **kwargs):
    # eval_context: DataFrame; columns are reordered by feature name and
    # converted to a numpy array before being passed to the model
    eval_context = eval_context[Sampler._order_fset(G)].to_numpy()
    return model.sample(eval_context, **kwargs)
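# Illustrative note: the closures above re-order the DataFrame columns with
# Sampler._order_fset before handing a plain array to the model, so the column order
# at sampling time matches the order used at fit time. Assuming _order_fset simply
# sorts feature names, the effect is:
import pandas as pd

df = pd.DataFrame({'x2': [1, 2], 'x1': [3, 4]})
print(df[sorted(['x2', 'x1'])].to_numpy())  # columns emitted in the order x1, x2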