Example #1
    def train(self, J, G, verbose=True):
        """
        Trains sampler using dataset to resample variable jj relative to G.
        Args:
            J: features of interest
            G: arbitrary set of variables
            verbose: printing
        """

        J = Sampler._to_array(list(J))
        G = Sampler._to_array(list(G))
        super().train(J, G, verbose=verbose)

        if not self._train_J_degenerate(J, G, verbose=verbose):
            G_disjoint = set(G).isdisjoint(self.cat_inputs)
            J_disjoint = set(J).isdisjoint(self.cat_inputs)
            if not G_disjoint or not J_disjoint:
                raise NotImplementedError('GaussianConditionalEstimator does '
                                          'not support categorical variables.')

            # features to be sampled with the Gaussian estimator (J without G)
            J_R = list(set(J) - set(G))
            # features that also appear in G: returned unchanged via the identity sampler
            J_G = list(set(J) - set(J_R))

            gaussian_estimator = GaussianConditionalEstimator()
            train_inputs = self.X_train[Sampler._order_fset(J_R)].to_numpy()
            train_context = self.X_train[Sampler._order_fset(G)].to_numpy()

            gaussian_estimator.fit(train_inputs=train_inputs,
                                   train_context=train_context)

            J_G_ixs = fset_to_ix(G, J)
            samplef_J_G = sample_id(J_G_ixs)

            ixs_J_G = fset_to_ix(J, J_G)
            ixs_J_R = fset_to_ix(J, J_R)

            def samplefunc(eval_context, **kwargs):
                sample_J_G = samplef_J_G(eval_context, **kwargs)
                sample_J_R = gaussian_estimator.sample(eval_context, **kwargs)
                sample = np.zeros((sample_J_R.shape[0], sample_J_R.shape[1],
                                   sample_J_R.shape[2] + sample_J_G.shape[2]))
                sample[:, :, ixs_J_G] = sample_J_G
                sample[:, :, ixs_J_R] = sample_J_R
                return sample

            self._store_samplefunc(J, G, samplefunc, verbose=verbose)

            return gaussian_estimator
        else:
            return None
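For reference, what GaussianConditionalEstimator fits in this example is the standard conditional of a multivariate Gaussian: the conditional mean of X_R given X_G = x is mean_R + S_RG * S_GG^-1 * (x - mean_G) and the conditional covariance is S_RR - S_RG * S_GG^-1 * S_GR. A minimal NumPy sketch of that formula (illustrative only, not code from the package):

import numpy as np

rng = np.random.default_rng(0)

# Joint Gaussian over four features; the first two play the role of J_R,
# the last two the role of the conditioning set G.
A = rng.normal(size=(4, 4))
cov = A @ A.T                      # random symmetric positive-definite covariance
mean = rng.normal(size=4)
R, G = [0, 1], [2, 3]

S_RR = cov[np.ix_(R, R)]
S_RG = cov[np.ix_(R, G)]
S_GG = cov[np.ix_(G, G)]

x_G = np.array([0.5, -1.0])        # observed context values

# Conditional mean and covariance of X_R given X_G = x_G
cond_mean = mean[R] + S_RG @ np.linalg.solve(S_GG, x_G - mean[G])
cond_cov = S_RR - S_RG @ np.linalg.solve(S_GG, S_RG.T)

sample = rng.multivariate_normal(cond_mean, cond_cov)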
Example #2
    def __call__(self,
                 estimator: ConditionalDistributionEstimator,
                 sem: StructuralEquationModel,
                 target_var: str,
                 context_vars: Tuple[str],
                 exp_args: Union[DictConfig, dict],
                 conditioning_mode: str = 'all',
                 test_df: pd.DataFrame = None,
                 sort_estimator_context=True):

        logger.info(
            f"Calculating {self.name} for {target_var} / {context_vars}")
        assert target_var not in context_vars

        context = {
            node: torch.tensor(test_df.loc[:, node])
            for node in context_vars
        }
        context_size = len(test_df)

        logger.info("Initializing SEM conditional distributions")
        if conditioning_mode == 'true_parents':
            data_log_prob = sem.parents_conditional_distribution(
                target_var, parents_context=context).log_prob
        elif conditioning_mode in ('true_markov_blanket', 'all'):
            data_log_prob = sem.mb_conditional_log_prob(target_var,
                                                        global_context=context,
                                                        **exp_args['mb_dist'])
        elif conditioning_mode == 'arbitrary' and isinstance(
                sem, LinearGaussianNoiseSEM):
            data_log_prob = sem.conditional_distribution(
                target_var, context=context).log_prob
        else:
            raise NotImplementedError('Unknown conditioning type!')

        def log_prob_from_np(value, log_prob_func):
            value = torch.tensor([[value]])
            with torch.no_grad():
                return log_prob_func(value.repeat(context_size, 1)).squeeze()

        logger.info("Initializing estimator's conditional distributions")
        if sort_estimator_context:
            test_context = test_df[Sampler._order_fset(
                context_vars)].to_numpy()
        else:
            test_context = test_df[context_vars].to_numpy()
        model_log_prob = estimator.conditional_distribution(
            test_context).log_prob

        logger.info("Calculating integral")
        if self.name in ('conditional_kl_divergence', 'conditional_hellinger_distance'):

            def integrand(value):
                data_log_p = log_prob_from_np(value, data_log_prob)
                model_log_p = log_prob_from_np(value, model_log_prob)

                if self.name == 'conditional_kl_divergence':
                    res = (data_log_p - model_log_p) * data_log_p.exp()
                elif self.name == 'conditional_hellinger_distance':
                    res = (torch.sqrt(data_log_p.exp()) -
                           torch.sqrt(model_log_p.exp()))**2

                res[torch.isnan(res)] = 0.0  # Out of support values
                return res.numpy()

            if self.name == 'conditional_kl_divergence':
                result = integrate.quad_vec(
                    integrand,
                    *sem.support_bounds,
                    epsabs=exp_args['metrics']['epsabs'])[0]
            else:
                result = integrate.quad_vec(
                    integrand,
                    -np.inf,
                    np.inf,
                    epsabs=exp_args['metrics']['epsabs'])[0]

            if self.name == 'conditional_hellinger_distance':
                result = np.sqrt(0.5 * result)

        elif self.name == 'conditional_js_divergence':

            # functions to integrate
            def integrand1(value):
                data_log_p = log_prob_from_np(value, data_log_prob)
                model_log_p = log_prob_from_np(value, model_log_prob)
                log_mixture = np.log(0.5) + torch.logsumexp(
                    torch.stack([data_log_p, model_log_p]), 0)
                res = (data_log_p - log_mixture) * data_log_p.exp()
                res[torch.isnan(res)] = 0.0  # Out of support values
                return res.numpy()

            def integrand2(value):
                data_log_p = log_prob_from_np(value, data_log_prob)
                model_log_p = log_prob_from_np(value, model_log_prob)
                log_mixture = np.log(0.5) + torch.logsumexp(
                    torch.stack([data_log_p, model_log_p]), 0)
                res = (model_log_p - log_mixture) * model_log_p.exp()
                res[torch.isnan(res)] = 0.0  # Out of support values
                return res.numpy()

            result = 0.5 * (
                integrate.quad_vec(integrand1,
                                   -np.inf,
                                   np.inf,
                                   epsabs=exp_args['metrics']['epsabs'])[0] +
                integrate.quad_vec(integrand2,
                                   -np.inf,
                                   np.inf,
                                   epsabs=exp_args['metrics']['epsabs'])[0])

        else:
            raise NotImplementedError()

        # Bounds check
        if not ((result + exp_args['metrics']['epsabs']) >= 0.0).all():
            logger.warning(
                f'{((result + exp_args["metrics"]["epsabs"]) < 0.0).sum()} contexts have negative distances, '
                f'Please increase cond distribution estimation accuracy. Negative values will be ignored.'
            )
            result = result[(result + exp_args['metrics']['epsabs']) >= 0.0]

        if self.name == 'conditional_js_divergence' and not (
                result - exp_args["metrics"]["epsabs"] <= np.log(2)).all():
            logger.warning(
                f'{(result - exp_args["metrics"]["epsabs"] > np.log(2)).sum()} contexts have distances '
                f'larger than log(2); '
                f'please increase the conditional distribution estimation accuracy. Larger values will be ignored.'
            )
            result = result[(result -
                             exp_args["metrics"]["epsabs"]) <= np.log(2)]

        elif self.name == 'conditional_hellinger_distance' and not (
            (result - exp_args["metrics"]["epsabs"]) <= 1.0).all():
            logger.warning(
                f'{((result - exp_args["metrics"]["epsabs"]) > 1.0).sum()} contexts have distances larger than 1.0; '
                f'please increase the conditional distribution estimation accuracy. Larger values will be ignored.'
            )
            result = result[(result - exp_args['metrics']['epsabs']) <= 1.0]

        return result.mean()
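The integration step above evaluates the integrand at scalar points but returns a whole vector (one value per context), which is exactly what scipy's integrate.quad_vec handles. A small self-contained sketch of the same pattern, computing a per-context KL divergence between two batches of 1-D Gaussians (toy distributions, not the SEM or estimator from the example):

import numpy as np
import torch
from scipy import integrate

# "Data" distributions p_i and "model" distributions q_i, one pair per context.
p = torch.distributions.Normal(torch.tensor([0.0, 1.0]), torch.tensor([1.0, 0.5]))
q = torch.distributions.Normal(torch.tensor([0.2, 0.8]), torch.tensor([1.0, 0.7]))

def integrand(value):
    # Evaluate (log p - log q) * p at the scalar `value` for all contexts at once.
    v = torch.full((2,), float(value))
    log_p, log_q = p.log_prob(v), q.log_prob(v)
    res = (log_p - log_q) * log_p.exp()
    res[torch.isnan(res)] = 0.0   # out-of-support values
    return res.numpy()

# quad_vec integrates the vector-valued integrand over the whole real line.
kl_per_context, _ = integrate.quad_vec(integrand, -np.inf, np.inf, epsabs=1e-6)
print(kl_per_context)         # one KL estimate per context
print(kl_per_context.mean())  # the metric above returns this kind of mean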
Example #3
 def _pd_to_np(df):
     return Sampler._pd_to_np(df)
Example #4
 def _order_fset(S):
     return Sampler._order_fset(S)
Example #5
 def _to_array(S):
     """Coverts to numpy array
     """
     return Sampler._to_array(S)
Example #6
 def _to_key(S):
     """
     Converts array to key for trainedCs Dict
     """
     return Sampler._to_key(S)
Example #7
    def train(self, J, G, verbose=True):

        J = Sampler._to_array(J)
        G = Sampler._to_array(G)
        super().train(J, G, verbose=verbose)

        if not self._train_J_degenerate(J, G, verbose=verbose):

            if not set(J).isdisjoint(self.cat_inputs) and len(J) > 1:
                raise NotImplementedError('Multiple categorical or mixed '
                                          'variables sampling is not '
                                          'supported.')

            train_inputs = self.X_train[sorted(set(J))].to_numpy()
            train_context = self.X_train[sorted(set(G))].to_numpy()

            # Indexes of categorical variables in the context (None if there are none)
            cat_ixs = None
            if not set(G).isdisjoint(self.cat_inputs):
                G_cat = list(set(G).intersection(self.cat_inputs))
                cat_ixs = utils.fset_to_ix(sorted(G), sorted(G_cat))
                logger.info('One-hot encoding the following context '
                            'features: {}'.format(G_cat))

            context_size = train_context.shape[1]

            # Categorical variable as input
            if not set(J).isdisjoint(self.cat_inputs):
                logger.info(f'One-hot encoding the following input features: {J}')
                logger.info(f'Fitting categorical sampler for fset {J}.')
                logger.info(f'Fitting method: {self.fit_method}.')
                logger.info(f'Fitting parameters: {self.fit_params}')
                model = CategoricalEstimator(context_size=context_size,
                                             cat_context=cat_ixs,
                                             **self.fit_params)
            # Continuous variable as input
            else:
                logger.info(f'Fitting continuous sampler for '
                            f'features {J}. Fitting method: '
                            f'{self.fit_method}. '
                            f'Fitting parameters: {self.fit_params}')
                model = MixtureDensityNetworkEstimator(inputs_size=len(J),
                                                       context_size=context_size,
                                                       cat_context=cat_ixs,
                                                       **self.fit_params)

            # Fitting a sampler
            getattr(model, self.fit_method)(train_inputs=train_inputs,
                                            train_context=train_context,
                                            **self.fit_params)

            def samplefunc(eval_context, **kwargs):
                return model.sample(eval_context, **kwargs)

            self._store_samplefunc(J, G, samplefunc, verbose=verbose)

            return model

        else:
            return None
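The `getattr(model, self.fit_method)(...)` call dispatches to whichever fitting routine was configured by name. The concrete method names and parameters below are made-up stand-ins, not the package's API; the dispatch pattern is what the sketch shows:

class ToyEstimator:
    def fit(self, train_inputs, train_context, **kwargs):
        print('plain fit with', kwargs)

    def fit_by_cv(self, train_inputs, train_context, **kwargs):
        print('cross-validated fit with', kwargs)

fit_method = 'fit_by_cv'          # e.g. taken from a config file
fit_params = {'n_epochs': 100}    # hypothetical fitting parameters

model = ToyEstimator()
# Look the method up by name on the instance, then call it like a normal method.
getattr(model, fit_method)(train_inputs=None, train_context=None, **fit_params)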
Example #8
    def train(self, J, G, verbose=True):

        J = Sampler._to_array(J)
        G = Sampler._to_array(G)
        super().train(J, G, verbose=verbose)

        if not self._train_J_degenerate(J, G, verbose=verbose):

            if not set(J).isdisjoint(self.cat_inputs) and len(J) > 1:
                raise NotImplementedError('Multiple categorical or mixed '
                                          'variables sampling is not '
                                          'supported.')

            # TODO(valik): adjust to loading data from the data frame using
            # J, G lists of column names
            train_inputs = self.X_train[Sampler._order_fset(J)].to_numpy()
            train_context = self.X_train[Sampler._order_fset(G)].to_numpy()

            # Indexes of categorical variables in the context (None if there are none)
            cat_ixs = None
            if not set(G).isdisjoint(self.cat_inputs):
                G_cat = list(set(G).intersection(self.cat_inputs))
                cat_ixs = utils.fset_to_ix(sorted(G), sorted(G_cat))
                logger.info('One-hot encoding the following '
                            'context features: '
                            '{}'.format(G_cat))

            context_size = train_context.shape[1]

            # Categorical variable as input
            if not set(J).isdisjoint(self.cat_inputs):
                logger.info(f'One-hot encoding the following input features: {J}')
                logger.info(f'Fitting categorical sampler for features {J}. '
                            f'Fitting method: {self.fit_method}. '
                            f'Fitting parameters: {self.fit_params}')
                model = CategoricalEstimator(context_size=context_size,
                                             cat_context=cat_ixs,
                                             **self.fit_params)
            # Continuous variable as input
            else:
                logger.info(f'Fitting continuous sampler for features {J}. '
                            f'Fitting method: {self.fit_method}. '
                            f'Fitting parameters: {self.fit_params}')

                model = NormalisingFlowEstimator(inputs_size=len(J),
                                                 context_size=context_size,
                                                 cat_context=cat_ixs,
                                                 **self.fit_params)

            # Fitting a sampler
            getattr(model, self.fit_method)(train_inputs=train_inputs,
                                            train_context=train_context,
                                            **self.fit_params)

            def samplefunc(eval_context, **kwargs):
                # eval_context: numpy array with columns sorted by column name
                return model.sample(eval_context, **kwargs)

            self._store_samplefunc(J, G, samplefunc, verbose=verbose)

            return model

        else:
            return None
Example #9
 def samplefunc(eval_context, **kwargs):
     eval_context = eval_context[Sampler._order_fset(G)].to_numpy()
     return model.sample(eval_context, **kwargs)
Example #10
 def samplefunc(eval_context, **kwargs):
     # eval_context: DataFrame; select the context columns G in canonical
     # order and convert to a numpy array
     eval_context = eval_context[Sampler._order_fset(G)].to_numpy()
     return model.sample(eval_context, **kwargs)
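Both the training code and these sample functions select DataFrame columns through `Sampler._order_fset` (a canonical, apparently sorted, ordering, judging by the `sorted(set(J))` variant in Example #7) before calling `.to_numpy()`. The point is that a NumPy array carries no column names, so the training and evaluation arrays must agree on column order. A plain-pandas illustration of that concern:

import pandas as pd

df_train = pd.DataFrame({'x2': [1.0, 2.0], 'x1': [3.0, 4.0], 'x3': [5.0, 6.0]})
df_eval = pd.DataFrame({'x1': [0.1], 'x3': [0.2], 'x2': [0.3]})

G = ['x3', 'x1']   # conditioning set, listed in arbitrary order

# Selecting the columns in one canonical (here: sorted) order on both sides
# guarantees that column k means the same feature in both arrays.
train_context = df_train[sorted(G)].to_numpy()
eval_context = df_eval[sorted(G)].to_numpy()

print(train_context)  # columns: x1, x3
print(eval_context)   # columns: x1, x3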