Example #1
    def train(self, J, G, verbose=True):
        """
        Trains sampler using dataset to resample variable jj relative to G.
        Args:
            J: features of interest
            G: arbitrary set of variables
            verbose: printing
        """

        J = Sampler._to_array(list(J))
        G = Sampler._to_array(list(G))
        super().train(J, G, verbose=verbose)

        if not self._train_J_degenerate(J, G, verbose=verbose):
            G_disjoint = set(G).isdisjoint(self.cat_inputs)
            J_disjoint = set(J).isdisjoint(self.cat_inputs)
            if not G_disjoint or not J_disjoint:
                raise NotImplementedError('GaussianConditionalEstimator does '
                                          'not support categorical variables.')

            # to be sampled with the Gaussian estimator
            J_R = list(set(J) - set(G))
            # to be returned via the identity sampler (copied from context)
            J_G = list(set(J) - set(J_R))

            gaussian_estimator = GaussianConditionalEstimator()
            train_inputs = self.X_train[Sampler._order_fset(J_R)].to_numpy()
            train_context = self.X_train[Sampler._order_fset(G)].to_numpy()

            gaussian_estimator.fit(train_inputs=train_inputs,
                                   train_context=train_context)

            # identity sampler returns the J-columns found in the context
            J_G_ixs = fset_to_ix(G, J)
            samplef_J_G = sample_id(J_G_ixs)

            # positions of the two parts within J, used to reassemble below
            ixs_J_G = fset_to_ix(J, J_G)
            ixs_J_R = fset_to_ix(J, J_R)

            def samplefunc(eval_context, **kwargs):
                # draw both parts and scatter them into one array of
                # shape (n_obs, n_samples, |J|)
                sample_J_G = samplef_J_G(eval_context, **kwargs)
                sample_J_R = gaussian_estimator.sample(eval_context, **kwargs)
                sample = np.zeros((sample_J_R.shape[0], sample_J_R.shape[1],
                                   sample_J_R.shape[2] + sample_J_G.shape[2]))
                sample[:, :, ixs_J_G] = sample_J_G
                sample[:, :, ixs_J_R] = sample_J_R
                return sample

            self._store_samplefunc(J, G, samplefunc, verbose=verbose)

            return gaussian_estimator
        else:
            return None
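The index bookkeeping is the subtle part here: J splits into J_R, drawn from the Gaussian estimator, and J_G, copied straight out of the conditioning context, and the ixs_* arrays record where each part lands inside J. Below is a minimal sketch of that scatter, using a hypothetical stand-in for utils.fset_to_ix (the real helper may behave differently):

import numpy as np

# Hypothetical stand-in for utils.fset_to_ix: positions of the elements
# of `subset` within the ordered feature set `fset` (an assumption).
def fset_to_ix(fset, subset):
    return np.array([list(fset).index(f) for f in subset], dtype=int)

J = ['x1', 'x3']                   # features to resample
G = ['x2', 'x3']                   # conditioning set
J_R = sorted(set(J) - set(G))      # ['x1'] -> drawn from the estimator
J_G = sorted(set(J) - set(J_R))    # ['x3'] -> copied from the context

ixs_J_R = fset_to_ix(J, J_R)       # [0]
ixs_J_G = fset_to_ix(J, J_G)       # [1]

# one (n_obs, n_samples, |J|) array, both parts scattered into place
sample = np.zeros((4, 2, len(J)))
sample[:, :, ixs_J_R] = 1.0        # estimator draws would land here
sample[:, :, ixs_J_G] = 2.0        # copied context values land here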
Example #2
    def _train_J_degenerate(self, J, G, verbose=True):
        """Training function that takes care of degenerate cases
        where either j is in G or G is empty.
        Args:
            J: features of interest
            G: relative feature set

        Returns:
            Whether a degenerate case was present.
        """
        degenerate = True

        # are we conditioning on zero elements?
        # if G.size == 0:
        #     logger.debug('Degenerate Training: Empty G')
        #     J_ixs = utils.fset_to_ix(self.X_train.columns, J)
        #     self._store_samplefunc(J, G, sample_perm(J_ixs))
        # are all elements of J already being conditioned upon?
        if np.sum(1 - np.isin(J, G)) == 0:
            logger.debug('Degenerate Training: J subseteq G')
            J_ixs = utils.fset_to_ix(Sampler._order_fset(G),
                                     Sampler._order_fset(J))
            self._store_samplefunc(J, G, sample_id(J_ixs))
        else:
            logger.debug('Training not degenerate.')
            degenerate = False

        return degenerate
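np.sum(1 - np.isin(J, G)) == 0 is just a subset test: np.isin(J, G) flags which features of J already appear in G, and the sum of the complement vanishes exactly when all of them do. A minimal check of the equivalence (numpy only):

import numpy as np

J = np.array(['x1', 'x2'])
G = np.array(['x1', 'x2', 'x3'])

# every element of J occurs in G -> degenerate, sampling is the identity
assert np.sum(1 - np.isin(J, G)) == 0   # the formulation used above
assert np.all(np.isin(J, G))            # equivalent and arguably clearer

G = np.array(['x1'])                    # drop 'x2' from the context
assert np.sum(1 - np.isin(J, G)) == 1   # now a real estimator is needed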
Example #3
    def train(self, J, G, verbose=True):

        J = Sampler._to_array(J)
        G = Sampler._to_array(G)
        super().train(J, G, verbose=verbose)

        if not self._train_J_degenerate(J, G, verbose=verbose):

            if not set(J).isdisjoint(self.cat_inputs) and len(J) > 1:
                raise NotImplementedError('Sampling multiple categorical '
                                          'or mixed variables is not '
                                          'supported.')

            train_inputs = self.X_train[sorted(set(J))].to_numpy()
            train_context = self.X_train[sorted(set(G))].to_numpy()

            # Categorical variables in the context; cat_ixs stays None
            # when the context has no categorical features
            cat_ixs = None
            if not set(G).isdisjoint(self.cat_inputs):
                G_cat = list(set(G).intersection(self.cat_inputs))
                cat_ixs = utils.fset_to_ix(sorted(G), sorted(G_cat))
                logger.info('One hot encoding the following context '
                            'features: {}'.format(G_cat))

            # Categorical variable as input
            if not set(J).isdisjoint(self.cat_inputs):
                logger.info(f'One hot encoding the following input '
                            f'features: {J}')
                logger.info(f'Fitting categorical sampler for fset {J}.')
                logger.info(f'Fitting method: {self.fit_method}.')
                logger.info(f'Fitting parameters: {self.fit_params}')
                context_size = train_context.shape[1]
                model = CategoricalEstimator(context_size=context_size,
                                             cat_context=cat_ixs,
                                             **self.fit_params)
            # Continuous variable as input
            else:
                logger.info(f'Fitting continuous sampler for '
                            f'features {J}. Fitting method: '
                            f'{self.fit_method}. '
                            f'Fitting parameters: {self.fit_params}')
                cntxt_sz = train_context.shape[1]
                model = MixtureDensityNetworkEstimator(inputs_size=len(J),
                                                       context_size=cntxt_sz,
                                                       cat_context=cat_ixs,
                                                       **self.fit_params)

            # Fitting a sampler
            getattr(model, self.fit_method)(train_inputs=train_inputs,
                                            train_context=train_context,
                                            **self.fit_params)

            def samplefunc(eval_context, **kwargs):
                return model.sample(eval_context, **kwargs)

            self._store_samplefunc(J, G, samplefunc, verbose=verbose)

            return model

        else:
            return None
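The getattr(model, self.fit_method)(...) line is plain string-based method dispatch: the fitting routine is chosen by name at runtime, so the same training code can drive estimators with different fitting entry points. A self-contained sketch of the pattern (the Estimator class and its two methods are made up for illustration):

class Estimator:
    # made-up estimator with two alternative fitting entry points
    def fit(self, train_inputs, train_context, n_epochs=10):
        print(f'plain fit, {n_epochs} epochs')

    def fit_by_cv(self, train_inputs, train_context, n_epochs=10):
        print(f'cross-validated fit, {n_epochs} epochs per fold')

model = Estimator()
fit_method, fit_params = 'fit_by_cv', {'n_epochs': 5}
# getattr resolves the bound method by name, as in train() above
getattr(model, fit_method)(train_inputs=None, train_context=None,
                           **fit_params)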
Example #4
    def train(self, J, G, verbose=True):

        J = Sampler._to_array(J)
        G = Sampler._to_array(G)
        super().train(J, G, verbose=verbose)

        if not self._train_J_degenerate(J, G, verbose=verbose):

            if not set(J).isdisjoint(self.cat_inputs) and len(J) > 1:
                raise NotImplementedError('Sampling multiple categorical '
                                          'or mixed variables is not '
                                          'supported.')

            # TODO(valik): adjust to loading data from data frame using
            # J, G lists of columnnames
            train_inputs = self.X_train[Sampler._order_fset(J)].to_numpy()
            train_context = self.X_train[Sampler._order_fset(G)].to_numpy()

            # Categorical variables in the context; cat_ixs stays None
            # when the context has no categorical features
            cat_ixs = None
            if not set(G).isdisjoint(self.cat_inputs):
                G_cat = list(set(G).intersection(self.cat_inputs))
                cat_ixs = utils.fset_to_ix(sorted(G), sorted(G_cat))
                logger.info('One hot encoding the following '
                            'context features: '
                            '{}'.format(G_cat))

            # Categorical variable as input
            if not set(J).isdisjoint(self.cat_inputs):
                logger.info(f'One hot encoding the following input '
                            f'features: {J}')
                logger.info(f'Fitting categorical sampler for features {J}. '
                            f'Fitting method: {self.fit_method}. '
                            f'Fitting parameters: {self.fit_params}')
                context_size = train_context.shape[1]
                model = CategoricalEstimator(context_size=context_size,
                                             cat_context=cat_ixs,
                                             **self.fit_params)
            # Continuous variable as input
            else:
                logger.info(f'Fitting continuous sampler for features {J}. '
                            f'Fitting method: {self.fit_method}. '
                            f'Fitting parameters: {self.fit_params}')
                context_size = train_context.shape[1]
                model = NormalisingFlowEstimator(inputs_size=len(J),
                                                 context_size=context_size,
                                                 cat_context=cat_ixs,
                                                 **self.fit_params)

            # Fitting a sampler
            getattr(model, self.fit_method)(train_inputs=train_inputs,
                                            train_context=train_context,
                                            **self.fit_params)

            def samplefunc(eval_context, **kwargs):
                # eval_context: numpy array with columns sorted by
                # column name
                return model.sample(eval_context, **kwargs)

            self._store_samplefunc(J, G, samplefunc, verbose=verbose)

            return model

        else:
            return None
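All four examples end the same way: samplefunc closes over the fitted model and is registered via _store_samplefunc, so later sampling needs only the evaluation context. A stripped-down sketch of that closure pattern; the store dict, DummyModel, and the num_samples keyword are all assumptions standing in for whatever the real classes provide:

import numpy as np

store = {}   # hypothetical registry; the real _store_samplefunc may differ

class DummyModel:
    # placeholder with a sample() method, for illustration only
    def sample(self, eval_context, num_samples=1):
        return np.zeros((eval_context.shape[0], num_samples, 1))

def make_samplefunc(model):
    # the closure keeps the fitted model alive; callers only pass context
    def samplefunc(eval_context, **kwargs):
        return model.sample(eval_context, **kwargs)
    return samplefunc

J, G = ('x1',), ('x2', 'x3')
store[(J, G)] = make_samplefunc(DummyModel())
draws = store[(J, G)](np.ones((4, 2)), num_samples=3)
print(draws.shape)                       # (4, 3, 1)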