def train(self, J, G, verbose=True):
    """Trains sampler using the dataset to resample features J relative to G.

    Features in J that are also in G are returned as-is (identity);
    the remaining features are resampled with a Gaussian conditional
    estimator fitted on the training data. Categorical features are
    not supported by this sampler.

    Args:
        J: features of interest
        G: arbitrary set of variables (conditioning set)
        verbose: printing

    Returns:
        The fitted GaussianConditionalEstimator, or None if the case
        was degenerate (J subset of G) and handled by the base logic.

    Raises:
        NotImplementedError: if J or G contains categorical features.
    """
    J = Sampler._to_array(list(J))
    G = Sampler._to_array(list(G))
    super().train(J, G, verbose=verbose)

    if not self._train_J_degenerate(J, G, verbose=verbose):
        # Gaussian estimation cannot handle categorical features in
        # either the sampled set or the conditioning set.
        G_disjoint = set(G).isdisjoint(self.cat_inputs)
        J_disjoint = set(J).isdisjoint(self.cat_inputs)
        if not G_disjoint or not J_disjoint:
            raise NotImplementedError('GaussianConditionalEstimator does '
                                      'not support categorical variables.')

        # to be sampled using gaussian estimator
        J_R = list(set(J) - set(G))
        # to be ID returned (features of J already present in G)
        J_G = list(set(J) - set(J_R))

        gaussian_estimator = GaussianConditionalEstimator()

        train_inputs = self.X_train[Sampler._order_fset(J_R)].to_numpy()
        train_context = self.X_train[Sampler._order_fset(G)].to_numpy()

        gaussian_estimator.fit(train_inputs=train_inputs,
                               train_context=train_context)

        # Index bookkeeping: positions used to copy the identity part out
        # of the evaluation context and to interleave both parts back into
        # a single sample array ordered like J.
        # NOTE(review): fset_to_ix(G, J) is called with full J here while
        # the identity part is J_G — presumably fset_to_ix ignores members
        # of J that are absent from G; confirm against its implementation.
        J_G_ixs = fset_to_ix(G, J)
        samplef_J_G = sample_id(J_G_ixs)

        ixs_J_G = fset_to_ix(J, J_G)
        ixs_J_R = fset_to_ix(J, J_R)

        def samplefunc(eval_context, **kwargs):
            # Identity part comes straight from the context; the rest is
            # drawn from the fitted Gaussian conditional.
            sample_J_G = samplef_J_G(eval_context, **kwargs)
            sample_J_R = gaussian_estimator.sample(eval_context, **kwargs)
            # Assemble (n_obs, n_samples, |J|) with each part written to
            # its column positions within J's ordering.
            sample = np.zeros((sample_J_R.shape[0],
                               sample_J_R.shape[1],
                               sample_J_R.shape[2] + sample_J_G.shape[2]))
            sample[:, :, ixs_J_G] = sample_J_G
            sample[:, :, ixs_J_R] = sample_J_R
            return sample

        self._store_samplefunc(J, G, samplefunc, verbose=verbose)

        return gaussian_estimator
    else:
        return None
def _train_J_degenerate(self, J, G, verbose=True):
    """Handle the degenerate training case where J is contained in G.

    When every feature of interest is already conditioned upon, no
    estimator is needed: sampling reduces to returning the matching
    columns of the context, so an identity sample function is stored.

    Args:
        J: features of interest
        G: relative feature set

    Returns:
        Whether a degenerate case was present.
    """
    if np.isin(J, G).all():
        # J subseteq G: store an identity mapping from G's column
        # positions onto J's ordering.
        logger.debug('Degenerate Training: J subseteq G')
        ixs = utils.fset_to_ix(Sampler._order_fset(G),
                               Sampler._order_fset(J))
        self._store_samplefunc(J, G, sample_id(ixs))
        return True

    logger.debug('Training not degenerate.')
    return False
def train(self, J, G, verbose=True):
    """Train a conditional sampler for features J given context G.

    Fits a CategoricalEstimator when J is categorical, otherwise a
    MixtureDensityNetworkEstimator, and stores the resulting sample
    function.

    Args:
        J: features of interest
        G: context feature set
        verbose: passed through to the degenerate-case handler

    Returns:
        The fitted estimator, or None if the case was degenerate.

    Raises:
        NotImplementedError: if J mixes multiple categorical/mixed
            variables, which is not supported.
    """
    J = Sampler._to_array(J)
    G = Sampler._to_array(G)
    super().train(J, G, verbose=verbose)

    if self._train_J_degenerate(J, G, verbose=verbose):
        return None

    if not set(J).isdisjoint(self.cat_inputs) and len(J) > 1:
        raise NotImplementedError('Multiple categorical or mixed '
                                  'variables sampling is not '
                                  'supported.')

    train_inputs = self.X_train[sorted(set(J))].to_numpy()
    train_context = self.X_train[sorted(set(G))].to_numpy()
    context_size = train_context.shape[1]

    # Categorical variables in context.
    # BUG FIX: cat_ixs was previously only bound inside the branch below,
    # raising NameError when J is categorical but G has no categorical
    # features; default it to None alongside cat_context.
    cat_context = None
    cat_ixs = None
    if not set(G).isdisjoint(self.cat_inputs):
        G_cat = list(set(G).intersection(self.cat_inputs))
        cat_ixs = utils.fset_to_ix(sorted(G), sorted(G_cat))
        # cat_context = search_nonsorted(G, G_cat)
        logger.info('One hot encoding following context '
                    'features: {}'.format(G_cat))

    if not set(J).isdisjoint(self.cat_inputs):
        # Categorical variable as input
        logger.info(f'One hot encoding following inputs features:{J}')
        logger.info(f'Fitting categorical sampler for fset {J}.')
        logger.info(f'Fitting method: {self.fit_method}.')
        logger.info(f'Fitting parameters: {self.fit_params}')
        model = CategoricalEstimator(context_size=context_size,
                                     cat_context=cat_ixs,
                                     **self.fit_params)
    else:
        # Continuous variable as input.
        # NOTE(review): cat_context is always None here (its assignment
        # above is commented out) — confirm whether cat_ixs was intended.
        logger.info(f'Fitting continuous sampler for '
                    f'features {J}. Fitting method: '
                    f'{self.fit_method}. '
                    f'Fitting parameters: {self.fit_params}')
        model = MixtureDensityNetworkEstimator(inputs_size=len(J),
                                               context_size=context_size,
                                               cat_context=cat_context,
                                               **self.fit_params)

    # Fitting a sampler.
    # NOTE(review): self.fit_params is forwarded to both the constructor
    # and the fit method — confirm the estimators accept the overlap.
    getattr(model, self.fit_method)(train_inputs=train_inputs,
                                    train_context=train_context,
                                    **self.fit_params)

    def samplefunc(eval_context, **kwargs):
        return model.sample(eval_context, **kwargs)

    self._store_samplefunc(J, G, samplefunc, verbose=verbose)
    return model
def train(self, J, G, verbose=True):
    """Train a conditional sampler for features J given context G.

    Fits a CategoricalEstimator when J is categorical, otherwise a
    NormalisingFlowEstimator, and stores the resulting sample function.

    Args:
        J: features of interest
        G: context feature set
        verbose: passed through to the degenerate-case handler

    Returns:
        The fitted estimator, or None if the case was degenerate.

    Raises:
        NotImplementedError: if J mixes multiple categorical/mixed
            variables, which is not supported.
    """
    J = Sampler._to_array(J)
    G = Sampler._to_array(G)
    super().train(J, G, verbose=verbose)

    if self._train_J_degenerate(J, G, verbose=verbose):
        return None

    if not set(J).isdisjoint(self.cat_inputs) and len(J) > 1:
        raise NotImplementedError('Multiple categorical or mixed '
                                  'variables sampling is not '
                                  'supported.')

    # TODO(valik): adjust to loading data from data frame using
    # J, G lists of columnnames
    train_inputs = self.X_train[Sampler._order_fset(J)].to_numpy()
    train_context = self.X_train[Sampler._order_fset(G)].to_numpy()
    # BUG FIX: context_size was previously assigned only inside the
    # categorical branch, raising NameError for continuous J; hoist it.
    context_size = train_context.shape[1]

    # Categorical variables in context.
    # BUG FIX: cat_ixs was previously unbound when G holds no categorical
    # features, raising NameError below; default it to None.
    cat_ixs = None
    if not set(G).isdisjoint(self.cat_inputs):
        G_cat = list(set(G).intersection(self.cat_inputs))
        cat_ixs = utils.fset_to_ix(sorted(G), sorted(G_cat))
        # cat_context = search_nonsorted(G, G_cat)
        logger.info('One hot encoding following '
                    'context features: '
                    '{}'.format(G_cat))

    if not set(J).isdisjoint(self.cat_inputs):
        # Categorical variable as input.
        # BUG FIX: middle segment lacked the f prefix, so the literal
        # text '{self.fit_method}' was logged instead of its value.
        logger.info(f'One hot encoding for inputs features: {J}')
        logger.info(f'Fitting categorical sampler for features {J}.'
                    f'Fitting method: {self.fit_method}.'
                    f'Fitting parameters: {self.fit_params}')
        model = CategoricalEstimator(context_size=context_size,
                                     cat_context=cat_ixs,
                                     **self.fit_params)
    else:
        # Continuous variable as input.
        # BUG FIX: same missing f prefix as above.
        logger.info(f'Fitting continuous sampler for features {J}. '
                    f'Fitting method: {self.fit_method}. '
                    f'Fitting parameters: {self.fit_params}')
        model = NormalisingFlowEstimator(inputs_size=len(J),
                                         context_size=context_size,
                                         cat_context=cat_ixs,
                                         **self.fit_params)

    # Fitting a sampler.
    # NOTE(review): self.fit_params is forwarded to both the constructor
    # and the fit method — confirm the estimators accept the overlap.
    getattr(model, self.fit_method)(train_inputs=train_inputs,
                                    train_context=train_context,
                                    **self.fit_params)

    def samplefunc(eval_context, **kwargs):
        # eval_context: numpy array sorted by columname
        return model.sample(eval_context, **kwargs)

    self._store_samplefunc(J, G, samplefunc, verbose=verbose)
    return model