Python SimpleHypergrid.add_dimension 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: mlos.Spaces

클래스/타입: SimpleHypergrid

메소드/함수: add_dimension

hotexamples.com에서의 예제들: 5

Python SimpleHypergrid.add_dimension - 5개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 mlos.Spaces.SimpleHypergrid.add_dimension에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

SimpleHypergrid(30)

random(9)

add_dimension(5)

join(5)

random_dataframe(4)

JoinedSubgrid(2)

add_subgrid_on_external_dimension(1)

contains_point(1)

예제 #1

파일 보기

파일: RegressionEnhancedRandomForestModel.py 프로젝트: microsoft/MLOS

    def _fit_root_regression(
            self,
            x: pd.DataFrame,
            y: pd.DataFrame,
            iteration_number: int
    ):
        """ Fits the root (boosting) regression model on the design matrix.

        A temporary hypergrid mirroring the columns of ``x`` is created and handed to the root model as its
        input space: every column becomes a ContinuousDimension bounded by that column's observed min and max.

        :param x: design matrix; each of its columns becomes one dimension of the temporary input space.
        :param y: target values, passed unchanged to the root model's ``fit``.
        :param iteration_number: passed unchanged to the root model's ``fit``.
        :return: self
        """
        # TODO : Add back RidgeCV option after creating RidgeCrossValidatedRegressionModel
        supported_root_model_names = [LassoCrossValidatedRegressionModel.__name__]
        assert self.model_config.boosting_root_model_name in supported_root_model_names, \
            f'Unrecognized boosting_root_model_name {self.model_config.boosting_root_model_name}'

        # The RERF transform_x already produced the proper design matrix, so the design matrix itself is the
        # input space of the root regression model. The (temporary) hypergrid below reflects that matrix.
        # This is less than ideal, but deriving min and max of polynomial terms (given feature column degrees) is non-trivial.
        # TODO: set bounds on the polynomial terms correctly and eliminate the hack forcing the base_regressor to skip filtering invalid features
        design_matrix_hypergrid = SimpleHypergrid(name='RegressionEnhanceRandomForest_design_matrix', dimensions=None)
        for column_name in x.columns.values:
            column = x[column_name]
            design_matrix_hypergrid.add_dimension(
                ContinuousDimension(name=column_name, min=column.min(), max=column.max())
            )

        # Fit the lasso/ridge model using either the params given to __init__ or a hyper-parameter search.
        if self.model_config.boosting_root_model_name == LassoCrossValidatedRegressionModel.__name__:
            self.base_regressor_ = LassoCrossValidatedRegressionModel(
                model_config=self.model_config.dimension_value_dict['lasso_regression_model_config'],
                input_space=design_matrix_hypergrid,
                output_space=self.output_space
            )
            # The bounds of the design matrix dimensions above are only the observed ones, so the base regressor
            # must not filter inputs against them at predict time.
            self.base_regressor_.skip_input_filtering_on_predict = True

        self.base_regressor_.fit(x, y, iteration_number=iteration_number)
        return self

예제 #2

파일 보기

class CategoricalToOneHotEncodedHypergridAdapter(HypergridAdapter):
    """ Maps values in categorical dimensions into values in OneHotEncoded dimensions using:
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html,
        which will be referred to as sklearn's OHE.

        Parameters
        ----------
        merge_all_categorical_dimensions: bool
            If True, sklearn's OHE will be applied to the cross product of all categorical levels in the adaptee space.
            If False, sklearn's OHE will be applied individually to each categorical dimension in the adaptee space.
            Default=False.
        drop: {None, 'first', 'if_binary'}
            Argument passed to sklearn's OHE with same argument name.
            Default=None.

        The following sklearn OHE arguments are not supported or are restricted:
        categories: Not supported since the levels a categorical can assume are those specified by the adaptee CategoricalDimension categories.
        drop: Not supporting sklearn's OHE drop argument as a array-like of shape (n_features,).
        dtype: This adapter will always use float64 so we're able to accommodate np.NaN values in the projected/unprojected dataframes.
        sparse: Not exposed in the adapter since the adapter maintains the OHE instances.
        handle_unknown: This adapter will always set sklearn's OHE instantiation argument handle_unknown='error'.

    """
    def __init__(self,
                 adaptee: Hypergrid,
                 merge_all_categorical_dimensions: bool = False,
                 drop: str = None):
        """ Wraps ``adaptee`` and builds the one-hot-encoded target hypergrid.

        A hierarchical adaptee is first wrapped in a HierarchicalToFlatHypergridAdapter; if the (possibly flattened)
        adaptee has categorical dimensions, or was flattened, it is additionally wrapped in a
        CategoricalToDiscreteHypergridAdapter before the target is built.

        :param adaptee: hypergrid to adapt; must pass HypergridAdapter.is_like_simple_hypergrid.
        :param merge_all_categorical_dimensions: encode the cross product of all categorical dimensions (True)
            or each categorical dimension individually (False).
        :param drop: forwarded to sklearn's OneHotEncoder as its ``drop`` argument.
        :raises ValueError: if ``adaptee`` is not like a simple hypergrid.
        """
        if not HypergridAdapter.is_like_simple_hypergrid(adaptee):
            raise ValueError("Adaptee must implement a Hypergrid Interface.")

        HypergridAdapter.__init__(self, name=adaptee.name, random_state=adaptee.random_state)

        self._adaptee: Hypergrid = adaptee
        self._target: Hypergrid = None
        self._merge_all_categorical_dimensions = merge_all_categorical_dimensions
        # NOTE(review): recent scikit-learn releases renamed `sparse` to `sparse_output` - confirm the supported version.
        self._one_hot_encoder_kwargs = {
            'drop': drop,
            'dtype': np.float64,
            'sparse': False,
            'handle_unknown': 'error'
        }

        # Bookkeeping filled in below and by _build_simple_hypergrid_target().
        self._all_one_hot_encoded_target_dimension_names = []
        self._adaptee_to_target_data_dict: Dict[str, CategoricalToOneHotEncodingAdapteeTargetMapping] = {}
        self._adaptee_expected_dimension_name_ordering = []
        self._adaptee_dimension_names_to_transform = []
        self._adaptee_contains_categorical_dimensions = False

        # Naming conventions for the columns this adapter creates.
        self._concatenation_delim = '___'
        self._merged_categorical_dimension_column_name = 'ohe_cross_product'
        self.ohe_target_column_suffix = '__ohe'
        self.has_adaptee_been_flattened = False

        # Since CategoricalDimension values may have different types within the same dimension,
        #  the adaptee is passed through the CategoricalToDiscrete adapter to move all value types to ints.

        # The OneHotEncoder needs to remember the dimension names, and flattening changes them,
        #  so the flattening happens here, before the names are recorded.
        if self._adaptee.is_hierarchical():
            self._adaptee = HierarchicalToFlatHypergridAdapter(adaptee=self._adaptee)
            self.has_adaptee_been_flattened = True

        # The CategoricalToDiscrete adapter turns categorical dimensions into discrete ones,
        #  so the names of the categorical dimensions are remembered before it is applied.
        for dimension in self._adaptee.dimensions:
            if isinstance(dimension, CategoricalDimension):
                self._adaptee_dimension_names_to_transform.append(dimension.name)
            self._adaptee_expected_dimension_name_ordering.append(dimension.name)
        self._adaptee_contains_categorical_dimensions = bool(self._adaptee_dimension_names_to_transform)

        # sklearn's OneHotEncoder doesn't accept strings, so categorical dimensions are converted to discrete ones.
        needs_discrete_adapter = any(
            isinstance(dimension, CategoricalDimension) for dimension in self._adaptee.dimensions
        ) or self.has_adaptee_been_flattened
        if needs_discrete_adapter:
            self._adaptee = CategoricalToDiscreteHypergridAdapter(adaptee=self._adaptee)

        self._build_simple_hypergrid_target()

    @property
    def adaptee(self) -> Hypergrid:
        """ Returns the adaptee held by this adapter.

        This is not necessarily the object passed to the constructor: __init__ may have wrapped it in a
        HierarchicalToFlatHypergridAdapter and/or a CategoricalToDiscreteHypergridAdapter.
        """
        return self._adaptee

    @property
    def target(self) -> Hypergrid:
        """ Returns the target hypergrid assigned by _build_simple_hypergrid_target(). """
        return self._target

    @property
    def was_encoding_merge_all_categoricals_requested(self):
        """ Returns the ``merge_all_categorical_dimensions`` value this adapter was constructed with. """
        return self._merge_all_categorical_dimensions

    def get_original_categorical_column_names(self):
        """ Returns the keys (a dict keys view) of the adaptee-to-target encoding map.

        When each categorical dimension is encoded individually the keys are the adaptee's categorical dimension
        names; when all categorical dimensions are merged the single key is the merged cross product column name.
        """
        return self._adaptee_to_target_data_dict.keys()

    def get_one_hot_encoded_column_names(self):
        """ Returns the list of all one-hot-encoded target dimension names (the internal list itself, not a copy). """
        return self._all_one_hot_encoded_target_dimension_names

    def _concatenate_dataframe_columns(self, df: DataFrame,
                                       columns_to_concatenate) -> DataFrame:
        """ Joins the given columns row-wise into a single string per row.

        The selected columns of ``df`` are first cast to float64 in place (``df`` is modified); each row's values
        are then converted with ``str`` and joined using the adapter's concatenation delimiter.

        :param df: dataframe holding the columns; its selected columns are overwritten with their float64 cast.
        :param columns_to_concatenate: labels of the columns to join.
        :return: the result of the row-wise ``apply``, one joined string per row.
        """
        df[columns_to_concatenate] = df[columns_to_concatenate].astype('float64')
        selected = df[columns_to_concatenate]
        delimiter = self._concatenation_delim

        def _join_row(cat_row):
            return delimiter.join(cat_row.map(str))

        return selected.apply(_join_row, axis=1)

    def _project_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """ Projects a dataframe from the adaptee space into the one-hot-encoded target space.

        Expected adaptee columns that are absent from ``df`` are temporarily added as all-NaN float64 columns so the
        encoders see every column, and are dropped again before returning. Each column to transform (or the single
        merged cross product column) is replaced by its one-hot-encoded target columns.

        :param df: dataframe to project.
        :param in_place: if False, a deep copy of ``df`` is projected and ``df`` itself is left untouched.
        :return: the projected dataframe (``df`` itself when ``in_place`` is True).
        """
        if not in_place:
            df = df.copy(deep=True)
        columns_to_drop = []
        potentially_missing_columns = list(
            set(self._adaptee_expected_dimension_name_ordering) - set(df.columns.values))
        for missing_col in potentially_missing_columns:
            # np.nan is used because the capitalised np.NaN alias no longer exists in NumPy 2.0.
            df[missing_col] = np.nan
            df[missing_col] = df[missing_col].astype('float64')
            columns_to_drop.append(missing_col)

        columns_to_transform = self._adaptee_dimension_names_to_transform
        if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions:
            # Encode the cross product: build one merged string column and transform only that column.
            df[self._merged_categorical_dimension_column_name] = self._concatenate_dataframe_columns(
                df, columns_to_transform)
            columns_to_transform = [
                self._merged_categorical_dimension_column_name
            ]
            columns_to_drop.extend(self._adaptee_dimension_names_to_transform)

        for adaptee_column_name in columns_to_transform:
            my_ohe_dict = self._adaptee_to_target_data_dict[
                adaptee_column_name]
            my_ohe = my_ohe_dict.one_hot_encoder
            if not self._merge_all_categorical_dimensions:
                # The encoder categories are str(float(...)) strings, so values go through float64 before str.
                df[adaptee_column_name] = df[adaptee_column_name].astype(
                    'float64')
            ohe_x = df[adaptee_column_name].map(str).to_numpy().reshape(-1, 1)
            my_ohe_target_columns = my_ohe_dict.target_dims
            df[my_ohe_target_columns] = DataFrame(my_ohe.transform(ohe_x),
                                                  index=df.index)
            columns_to_drop.append(adaptee_column_name)

        if columns_to_drop:
            df.drop(columns=columns_to_drop, inplace=True)
        return df

    def _unproject_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """ Unprojects a dataframe from the one-hot-encoded target space back into the adaptee space.

        The one-hot-encoded target columns are inverted with the stored encoders into float64 adaptee columns
        ('nan' category strings become NaN) and the encoded columns are removed.

        :param df: dataframe in the target space.
        :param in_place: if True, ``df`` is modified: the encoded columns and any expected adaptee column that is
            entirely NaN are dropped. If False, a new dataframe restricted to the expected adaptee columns
            (minus the entirely-NaN ones) is returned and ``df`` is left untouched.
        :return: the unprojected dataframe.
        """
        if not in_place:
            df = df.copy(deep=True)

        merged_encoding = self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions

        # Work on a copy so the appends below can never alter the adapter's own ordering list.
        columns_to_return = list(self._adaptee_expected_dimension_name_ordering)
        if merged_encoding:
            for column_to_transform in self._adaptee_dimension_names_to_transform:
                if column_to_transform not in columns_to_return:
                    columns_to_return.append(column_to_transform)

        columns_to_drop = []
        if merged_encoding:
            merged_column_name = self._merged_categorical_dimension_column_name
            categorical_column_names = self._adaptee_dimension_names_to_transform
            my_ohe_dict = self._adaptee_to_target_data_dict[merged_column_name]
            target_columns_to_invert = my_ohe_dict.target_dims
            my_ohe = my_ohe_dict.one_hot_encoder
            df[merged_column_name] = my_ohe.inverse_transform(df[target_columns_to_invert])
            # Split the merged cross product string back into one column per categorical dimension.
            df[categorical_column_names] = df[merged_column_name].str.split(self._concatenation_delim, expand=True)
            # Assign the result back: replace(..., inplace=True) on a .loc selection only changes a temporary.
            df[categorical_column_names] = df[categorical_column_names].replace('nan', np.nan)
            df[categorical_column_names] = df[categorical_column_names].astype('float64')
            columns_to_drop.extend(target_columns_to_invert)
            columns_to_drop.append(merged_column_name)

        else:
            for adaptee_column_name in self._adaptee_dimension_names_to_transform:
                my_ohe_dict = self._adaptee_to_target_data_dict[
                    adaptee_column_name]
                target_columns_to_invert = my_ohe_dict.target_dims
                my_ohe = my_ohe_dict.one_hot_encoder
                df[adaptee_column_name] = my_ohe.inverse_transform(
                    df[target_columns_to_invert])
                # Assign the result back instead of relying on a chained in-place replace.
                df[adaptee_column_name] = df[adaptee_column_name].replace('nan', np.nan)
                df[adaptee_column_name] = df[adaptee_column_name].astype(
                    'float64')
                columns_to_drop.extend(target_columns_to_invert)

        columns_to_retain_present_in_df = [
            column_name for column_name in columns_to_return
            if column_name in df.columns.values
        ]
        if in_place:
            # dropna(inplace=True) on a .loc selection only affects a temporary, so the entirely-NaN
            # columns are identified explicitly and dropped from df together with the encoded columns.
            all_nan_columns = [
                column_name for column_name in columns_to_retain_present_in_df
                if df[column_name].isna().all()
            ]
            df.drop(columns=columns_to_drop + all_nan_columns, inplace=True)
        else:
            df = df[columns_to_retain_present_in_df].dropna(axis=1, how='all')

        return df

    def _build_simple_hypergrid_target(self) -> None:
        """ Builds the SimpleHypergrid target and the OneHotEncoder bookkeeping for the adaptee.

        :return:
        """
        self._target = SimpleHypergrid(name=self._adaptee.name,
                                       dimensions=None,
                                       random_state=self._adaptee.random_state)

        # Details about construction of the target hypergrid:
        #  1) Non-categorical dimensions are copied to the target, while the info needed about the adaptee's
        #     categorical dimensions is collected.
        #  2) sklearn's OHE handles both the project and unproject dataframe transforms, so the OHE instances are
        #     prepared here. This requires the 'categories' argument (per categorical dim, or 1 cross product dim).
        #     The dimension's .linspace() provides the ordered list of values but not possible NaN values, hence the
        #     list is augmented with the string 'nan', which is what mapping str over a NaN value produces.
        #     All values (output of the CategoricalToDiscrete adapter) are converted to strings before the OHE
        #     object is initialized, so missing values in dataframes can be accommodated.
        #  3) If the cross product of all categorical dimensions has been requested, it is constructed at the end.
        merge_into_cross_product = self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions
        encode_each_dimension = not self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions

        categories_list_for_ohe_init = []
        for adaptee_dimension in self._adaptee.dimensions:
            dimension_name = adaptee_dimension.name
            if dimension_name not in self._adaptee_dimension_names_to_transform:
                self._target.add_dimension(adaptee_dimension.copy())
                continue

            # 'nan' is placed first so that the 'nan' x ... x 'nan' cross product value comes first ([0]).
            # That value should never appear in dataframes derived from hierarchical hypergrids, so it is
            # popped from the categories when merge_all_categorical_dimensions==True.
            expanded_categories = ['nan']
            expanded_categories.extend(str(float(x)) for x in adaptee_dimension.linspace())
            categories_list_for_ohe_init.append(expanded_categories)

            if not encode_each_dimension:
                continue

            # No cross product needed: there is enough info here to add this dimension's target dimensions.
            self._adaptee_to_target_data_dict[dimension_name] = CategoricalToOneHotEncodingAdapteeTargetMapping(
                one_hot_encoder=OneHotEncoder(
                    categories=[expanded_categories],
                    **self._one_hot_encoder_kwargs))
            self._add_one_hot_encoded_dimensions(
                dimension_name,
                DataFrame({dimension_name: expanded_categories}))

        if merge_into_cross_product:
            # The categories harvested above for each categorical dimension yield the cross product encoding.
            merged_name = self._merged_categorical_dimension_column_name
            cross_product_categories = self._create_cross_product_categories(categories_list_for_ohe_init)
            self._adaptee_to_target_data_dict[merged_name] = CategoricalToOneHotEncodingAdapteeTargetMapping(
                one_hot_encoder=OneHotEncoder(
                    categories=[cross_product_categories],
                    **self._one_hot_encoder_kwargs))
            self._add_one_hot_encoded_dimensions(
                merged_name,
                DataFrame({merged_name: cross_product_categories}))

    def _add_one_hot_encoded_dimensions(self, adaptee_dimension_name,
                                        temp_df_for_fit: DataFrame) -> None:
        """ Fits the encoder registered for a column and adds one 0/1 DiscreteDimension per encoded column.

        :param adaptee_dimension_name: key into the adaptee-to-target map; also the prefix of the new dimension names.
        :param temp_df_for_fit: dataframe the encoder is fitted on; the number of output columns of its
            fit_transform determines how many target dimensions are added.
        """
        mapping = self._adaptee_to_target_data_dict[adaptee_dimension_name]
        encoded = mapping.one_hot_encoder.fit_transform(temp_df_for_fit)
        mapping.num_dummy_dims = encoded.shape[1]

        for i in range(mapping.num_dummy_dims):
            target_dim_name = f'{adaptee_dimension_name}{self.ohe_target_column_suffix}{i}'
            mapping.target_dims.append(target_dim_name)
            new_dimension = DiscreteDimension(name=target_dim_name, min=0, max=1)
            self._target.add_dimension(new_dimension)
            self._all_one_hot_encoded_target_dimension_names.append(target_dim_name)

    def _create_cross_product_categories(self, categories_per_dimension) -> list:
        """ Builds the concatenated category strings for the cross product of all categorical dimensions.

        :param categories_per_dimension: one list of category strings per categorical dimension.
        :return: list of concatenated cross product levels. When the adaptee was flattened and there is more
            than one categorical dimension, the leading all-'nan' combination is removed.
        :raises ValueError: if the first combination is not the all-'nan' one when it is expected to be.
        """
        num_categorical_dims = len(categories_per_dimension)
        # One row per combination, one column per categorical dimension.
        cross_product = np.array(np.meshgrid(
            *categories_per_dimension)).T.reshape(-1, num_categorical_dims)
        temp_df = DataFrame(cross_product)
        temp_df['concatenated_levels'] = self._concatenate_dataframe_columns(
            temp_df, temp_df.columns.values)
        concatenated_levels = temp_df['concatenated_levels'].values.tolist()

        # The first element is expected to arise from 'nan' x ... x 'nan', which cannot appear in
        #  hierarchical hypergrids, so it is popped before the cross product list is returned.
        if self.has_adaptee_been_flattened and num_categorical_dims > 1:
            all_nans = self._concatenation_delim.join(['nan'] *
                                                      num_categorical_dims)
            should_be_all_nans = concatenated_levels.pop(0)
            if should_be_all_nans != all_nans:
                raise ValueError(
                    'Failed to find cross product of nan values when constructing OneHotEncoding with merge_all_categorical_dimensions==True'
                )
        return concatenated_levels

예제 #3

파일 보기

class CategoricalToDiscreteHypergridAdapter(HypergridAdapter):
    """ Maps values in categorical dimensions into values in discrete dimensions.

    """

    def __init__(self, adaptee: Hypergrid):
        """ Wraps ``adaptee`` and builds the target hypergrid plus the categorical <-> discrete value mappings.

        :param adaptee: a non-hierarchical hypergrid that passes HypergridAdapter.is_like_simple_hypergrid.
        :raises NotImplementedError: if ``adaptee`` is not like a simple hypergrid or is hierarchical.
        """
        HypergridAdapter.__init__(self, name=adaptee.name, random_state=adaptee.random_state)
        self._adaptee: Hypergrid = adaptee
        self._target: Hypergrid = None

        # Forward mapping: dimension name -> {adaptee value: target value}
        self._adaptee_to_target_dimension_mappings = {}

        # Reverse mapping: dimension name -> {target value: adaptee value}
        self._target_to_adaptee_dimension_mappings = {}

        # Only flat, simple-hypergrid-like adaptees are supported; anything else must be flattened first.
        if not HypergridAdapter.is_like_simple_hypergrid(adaptee) or adaptee.is_hierarchical():
            raise NotImplementedError("First apply the HierarchicalToFlatHypergridAdapter and chain it with this one.")

        # Build the target hypergrid and the mappings between adaptee and target.
        self._build_simple_hypergrid_target()

    @property
    def adaptee(self) -> Hypergrid:
        """ Returns the adaptee hypergrid passed to the constructor. """
        return self._adaptee

    @property
    def target(self) -> Hypergrid:
        """ Returns the target hypergrid assigned by _build_simple_hypergrid_target(). """
        return self._target

    def _translate_point(self, point: Point) -> Point:
        """ Maps a point from the adaptee space to the target space.

        Values of dimensions that have a forward mapping are looked up in it (a value missing from the mapping
        raises KeyError); values of all other dimensions are copied unchanged.

        :param point: point in the adaptee space.
        :return: a new Point in the target space.
        """
        result = Point()
        mappings = self._adaptee_to_target_dimension_mappings
        for dim_name, original_dim_value in point:
            forward_mapping = mappings.get(dim_name)
            result[dim_name] = original_dim_value if forward_mapping is None else forward_mapping[original_dim_value]
        return result

    def _untranslate_point(self, point: Point) -> Point:
        """ Maps a point from the target space back to the adaptee space.

        Values of dimensions that have a backward mapping are looked up in it (a value missing from the mapping
        raises KeyError); values of all other dimensions are copied unchanged.

        :param point: point in the target space.
        :return: a new Point in the adaptee space.
        """
        result = Point()
        mappings = self._target_to_adaptee_dimension_mappings
        for dim_name, translated_dim_value in point:
            backward_mapping = mappings.get(dim_name)
            result[dim_name] = translated_dim_value if backward_mapping is None else backward_mapping[translated_dim_value]
        return result

    def _translate_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """ Applies every forward mapping to its column of the dataframe.

        Values absent from a mapping are left as they are.

        :param df: dataframe in the adaptee space; must contain a column for every mapped dimension.
        :param in_place: if False, a deep copy of ``df`` is translated and ``df`` itself is left untouched.
        :return: the translated dataframe (``df`` itself when ``in_place`` is True).
        """
        if not in_place:
            df = df.copy(deep=True)

        for dim_name, forward_mapping in self._adaptee_to_target_dimension_mappings.items():
            # The mapping is bound as a default argument so the helper never depends on the loop variable.
            def _to_target_value(original_value, mapping=forward_mapping):
                return mapping.get(original_value, original_value)

            df[dim_name] = df[dim_name].apply(_to_target_value)
        return df

    def _untranslate_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """ Applies every backward mapping to its column of the dataframe.

        Values absent from a mapping are left as they are.

        :param df: dataframe in the target space; must contain a column for every mapped dimension.
        :param in_place: if False, a deep copy of ``df`` is untranslated and ``df`` itself is left untouched.
        :return: the untranslated dataframe (``df`` itself when ``in_place`` is True).
        """
        if not in_place:
            df = df.copy(deep=True)

        for dim_name, backward_mapping in self._target_to_adaptee_dimension_mappings.items():
            # The mapping is bound as a default argument so the helper never depends on the loop variable.
            def _to_adaptee_value(original_value, mapping=backward_mapping):
                return mapping.get(original_value, original_value)

            df[dim_name] = df[dim_name].apply(_to_adaptee_value)
        return df

    def _build_simple_hypergrid_target(self) -> None:
        """ Builds the SimpleHypergrid target: categorical dimensions become discrete, all others are copied.

        :return:
        """
        assert isinstance(self.adaptee, SimpleHypergrid) or \
               (isinstance(self.adaptee, HypergridAdapter) and isinstance(self.adaptee.target, SimpleHypergrid))

        self._target = SimpleHypergrid(name=self._adaptee.name, dimensions=None, random_state=self._adaptee.random_state)

        # Walk the adaptee's dimensions in order; only CategoricalDimensions need a mapped replacement.
        for adaptee_dimension in self._adaptee.dimensions:
            if isinstance(adaptee_dimension, CategoricalDimension):
                target_dimension = self._map_categorical_dimension(adaptee_dimension)
            else:
                target_dimension = adaptee_dimension.copy()
            self._target.add_dimension(target_dimension)

    def _map_categorical_dimension(self, adaptee_dimension: CategoricalDimension) -> DiscreteDimension:
        """ Translates a categorical dimension into a discrete dimension and stores both value mappings.

        Each value is mapped to its position in the iteration order of ``adaptee_dimension``.

        :param adaptee_dimension: the categorical dimension to translate.
        :return: a DiscreteDimension of the same name spanning 0 .. len(adaptee_dimension) - 1.
        """
        values = list(adaptee_dimension)
        dimension_name = adaptee_dimension.name

        self._adaptee_to_target_dimension_mappings[dimension_name] = {
            value: position for position, value in enumerate(values)
        }
        self._target_to_adaptee_dimension_mappings[dimension_name] = dict(enumerate(values))

        return DiscreteDimension(name=dimension_name, min=0, max=len(adaptee_dimension) - 1)

예제 #4

파일 보기

파일: ContinuousToPolynomialBasisHypergridAdapter.py 프로젝트: sycomix/MLOS

class ContinuousToPolynomialBasisHypergridAdapter(HypergridAdapter):
    """ Adds polynomial basis function features for each continuous dimension in the adaptee hypergrid using
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html.
        All non-continuous adaptee dimensions will be present in the target hypergrid.
        Beware: Because HierarchicalHypergrids may have NaN values for some points, these NaNs will be replaced by zeros.

        Parameters
        ----------
        degree: integer
            The degree of the polynomial features.
            Default = 2.

        interaction_only: boolean
            If true, only interaction features are produced: features that are products of at most degree distinct input features
            (so not x[1] ** 2, x[0] * x[2] ** 3, etc.).
            Default = False

        include_bias: boolean
            If True, then include a bias column, the feature in which all polynomial powers are zero
            (i.e. a column of ones - acts as an intercept term in a linear model).
            Default = True
    """

    def __init__(
            self,
            adaptee: Hypergrid,
            degree: int = 2,
            include_bias: bool = True,
            interaction_only: bool = False
    ):
        """ Wraps ``adaptee`` and builds the target hypergrid holding the polynomial basis dimensions.

        A hierarchical adaptee is first wrapped in a HierarchicalToFlatHypergridAdapter.

        :param adaptee: hypergrid to adapt; must pass HypergridAdapter.is_like_simple_hypergrid.
        :param degree: forwarded to sklearn's PolynomialFeatures.
        :param include_bias: forwarded to sklearn's PolynomialFeatures.
        :param interaction_only: forwarded to sklearn's PolynomialFeatures.
        :raises ValueError: if ``adaptee`` is not like a simple hypergrid.
        """
        if not HypergridAdapter.is_like_simple_hypergrid(adaptee):
            raise ValueError("Adaptee must implement a Hypergrid Interface.")

        HypergridAdapter.__init__(self, name=adaptee.name, random_state=adaptee.random_state)

        self._adaptee: Hypergrid = adaptee
        self._target: Hypergrid = None
        self._polynomial_features_kwargs = {
            'degree': degree,
            'interaction_only': interaction_only,
            'include_bias': include_bias,
            'order': 'C'
        }

        if self._adaptee.is_hierarchical():
            self._adaptee = HierarchicalToFlatHypergridAdapter(adaptee=self._adaptee)

        # Record which adaptee dimensions are continuous
        self._adaptee_contains_dimensions_to_transform = False
        self._adaptee_dimension_names_to_transform = [
            dimension.name
            for dimension in self._adaptee.dimensions
            if isinstance(dimension, ContinuousDimension)
        ]
        self._num_dimensions_to_transform = len(self._adaptee_dimension_names_to_transform)
        self._adaptee_contains_dimensions_to_transform = self._num_dimensions_to_transform > 0

        # see definition of _get_polynomial_feature_names() for usage
        self._internal_feature_name_terminal_char = '_'

        # sklearn's PolynomialFeatures does not accept NaNs, yet they may appear in data frames coming from
        # hierarchical hypergrids; this is the finite value used to replace them.
        self._nan_imputed_finite_value = 0

        # The number of additional dimensions depends on the PolynomialFeatures parameters and is only known
        # after .fit() has been called (which needs data), while the target hypergrid cannot be built without it.
        # Hence the instance is fitted on a trivial single row of ones.
        # NOTE(review): with no continuous dimensions this passes a (1, 0) array to fit_transform - confirm sklearn accepts it.
        self._polynomial_features = PolynomialFeatures(**self._polynomial_features_kwargs)
        single_row_of_ones = np.ones((1, self._num_dimensions_to_transform))
        fitted_features = self._polynomial_features.fit_transform(single_row_of_ones)
        self._polynomial_features_powers = self._polynomial_features.powers_
        self._num_polynomial_basis_dimensions_in_target = fitted_features.shape[1]
        self._target_polynomial_feature_map = {}  # keys are target dimension names, values are index in features
        self._build_simple_hypergrid_target()

    def _build_simple_hypergrid_target(self) -> None:
        """ Builds the SimpleHypergrid target.

        Adaptee dimensions that are not transformed are copied to the target; one unbounded ContinuousDimension is
        then added per polynomial feature, named after the adaptee dimensions it is built from, and its feature
        index is recorded in the target polynomial feature map.
        """
        self._target = SimpleHypergrid(name=self._adaptee.name, dimensions=None, random_state=self._adaptee.random_state)

        # Only the dimensions that are not transformed are copied here; the transformed (continuous) ones
        # re-appear below, since the linear terms are always part of the polynomial basis functions.
        names_to_transform = self._adaptee_dimension_names_to_transform
        for adaptee_dimension in self._adaptee.dimensions:
            if adaptee_dimension.name not in names_to_transform:
                self._target.add_dimension(adaptee_dimension.copy())

        if not self._adaptee_contains_dimensions_to_transform:
            return

        # Target dim names are derived from the adaptee dim names and the polynomial feature powers matrix.
        # This is worked out explicitly so the derived dimension names stay under this adapter's control:
        # the adaptee names are substituted into the internal feature names {x0_, x1_, ...}.
        skip_constant_term = not self._polynomial_features_kwargs['include_bias']
        for i, poly_feature_name in enumerate(self._get_polynomial_feature_names()):
            ith_terms_powers = self._polynomial_features_powers[i]
            if skip_constant_term and ith_terms_powers.sum() == 0:
                # the constant term is skipped
                continue

            target_dim_name = poly_feature_name
            for j, adaptee_dim_name in enumerate(names_to_transform):
                adaptee_dim_power = ith_terms_powers[j]
                if adaptee_dim_power == 0:
                    continue

                # power == 1 is the default; any other non-zero power carries an explicit '^power' suffix
                standin = f'x{j}{self._internal_feature_name_terminal_char}'
                replacement = adaptee_dim_name
                if adaptee_dim_power != 1:
                    standin = f'x{j}{self._internal_feature_name_terminal_char}^{adaptee_dim_power}'
                    replacement = f'{adaptee_dim_name}^{adaptee_dim_power}'
                target_dim_name = target_dim_name.replace(standin, replacement)

            # min and max are placed at -Inf and +Inf since .random() on the target hypergrid is generated on
            # the original hypergrid and passed through the adapters.
            unbounded_dimension = ContinuousDimension(name=target_dim_name, min=-math.inf, max=math.inf)
            self._target.add_dimension(unbounded_dimension)
            self._target_polynomial_feature_map[target_dim_name] = i

    @property
    def adaptee(self) -> Hypergrid:
        """Return the hypergrid stored in ``self._adaptee``."""
        return self._adaptee

    @property
    def target(self) -> Hypergrid:
        """Return the hypergrid stored in ``self._target``."""
        return self._target

    @property
    def polynomial_features_kwargs(self) -> dict:
        """Return the dict stored in ``self._polynomial_features_kwargs``.

        The class reads its ``'include_bias'`` entry when deciding whether the constant term is skipped.
        NOTE(review): presumably these are the keyword arguments given to sklearn's PolynomialFeatures - confirm.
        """
        return self._polynomial_features_kwargs

    @property
    def nan_imputed_finite_value(self):
        """Return the value that ``_project_dataframe`` passes to ``DataFrame.fillna`` to replace NaNs."""
        return self._nan_imputed_finite_value

    def get_column_names_for_polynomial_features(self, degree=None):
        """Return the target column names of the polynomial features, ordered by powers-table index.

        :param degree: When None, every mapped target column name is returned. Otherwise only the names of
            the terms whose powers sum to ``degree`` are returned.
        :return: List of target dimension (column) names.
        """
        # Ordering by powers-table index keeps the names aligned with the columns of the transformed matrix.
        names_by_index = sorted(self._target_polynomial_feature_map.items(), key=lambda item: item[1])
        if degree is None:
            return [name for name, _ in names_by_index]

        # Filter the target names themselves rather than the internal 'x<i><terminal char>' stand-ins, so the
        # result names real target columns and never includes a term that was not added to the target.
        powers_table = self._polynomial_features_powers
        return [name for name, index in names_by_index if powers_table[index].sum() == degree]

    def get_polynomial_feature_powers_table(self):
        """Return the powers table stored in ``self._polynomial_features_powers``.

        Elsewhere in this class the table is indexed by term position, and a row is summed to get that
        term's total degree.
        """
        return self._polynomial_features_powers

    def get_num_polynomial_features(self):
        """Return the number of rows in the powers table (``shape[0]``), i.e. one per polynomial term."""
        return self._polynomial_features_powers.shape[0]

    def _get_polynomial_feature_names(self):
        """Return the polynomial feature names built from terminated stand-in input names.

        By default .get_feature_names() yields names such as ['1', 'x0', 'x1', 'x0^2', 'x0 x1', 'x1^2'].
        Each stand-in input name passed in here is 'x<i>' followed by the internal terminal character, so a
        later string substitution cannot mistake e.g. 'x1' for the leading part of 'x10' or 'x12'.

        :return: Whatever ``self._polynomial_features.get_feature_names`` returns for the stand-in names.
        """
        stand_in_names = [
            f'x{position}{self._internal_feature_name_terminal_char}'
            for position, _ in enumerate(self._adaptee_dimension_names_to_transform)
        ]
        return self._polynomial_features.get_feature_names(stand_in_names)

    def _project_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Add the polynomial feature columns to ``df``.

        :param df: Dataframe to project. It is modified directly unless ``in_place`` is False.
        :param in_place: When False, a deep copy of ``df`` is modified and returned instead.
        :return: The dataframe with NaNs imputed and one column per mapped target polynomial feature.
        """
        if not in_place:
            df = df.copy(deep=True)

        # NaNs are replaced with the configured imputation value before anything is transformed.
        df.fillna(self._nan_imputed_finite_value, inplace=True)

        # Dataframes built from hierarchical hypergrid points may lack some of the dimensions to transform;
        # the input columns for those stay at zero.
        source_names = self._adaptee_dimension_names_to_transform
        transform_input = np.zeros((len(df.index), len(source_names)))
        for column_position, source_name in enumerate(source_names):
            if source_name not in df.columns.values:
                continue
            transform_input[:, column_position] = df[source_name]

        expanded_features = self._polynomial_features.transform(transform_input)
        for target_name, feature_index in self._target_polynomial_feature_map.items():
            df[target_name] = expanded_features[:, feature_index]
        return df

    def _unproject_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Drop every mapped polynomial feature column whose total degree is not 1.

        :param df: Dataframe to unproject. It is modified directly unless ``in_place`` is False.
        :param in_place: When False, a deep copy of ``df`` is modified and returned instead.
        :return: The dataframe without the non-linear monomial columns.
        """
        if not in_place:
            df = df.copy(deep=True)

        powers_table = self.get_polynomial_feature_powers_table()
        # A term is kept only when its powers sum to exactly 1, i.e. it is a single first-degree column.
        non_linear_column_names = [
            column_name
            for column_name, row_index in self._target_polynomial_feature_map.items()
            if powers_table[row_index].sum() != 1
        ]
        df.drop(columns=non_linear_column_names, inplace=True)

        return df

예제 #5

파일 보기

파일: DiscreteToUnitContinuousHypergridAdapter.py 프로젝트: HeatherJia/MLOS

class DiscreteToUnitContinuousHypergridAdapter(HypergridAdapter):
    """ Maps every adaptee dimension onto a continuous target dimension spanning the unit interval.

    Discrete adaptee dimensions become continuous target dimensions on [0, 1) and continuous adaptee
    dimensions are rescaled to run from 0 to 1.

    And more importantly, unmaps the continuous values back to the adaptee's discrete (or continuous) ones.

    If the adaptee contains categorical dimensions it is first wrapped in a CategoricalToDiscreteHypergridAdapter.
    """

    # (k / n) * n is not always exactly k in floating point (e.g. 1 / 49 * 49 == 0.9999999999999999), so a
    # scaled value this many ULPs or fewer away from an integer is treated as that integer before flooring.
    _BOUNDARY_SNAP_ULPS = 4

    def __init__(self, adaptee: Hypergrid):
        """ Wraps the adaptee and builds the unit-continuous target hypergrid.

        :param adaptee: Hypergrid to adapt.
        :raises ValueError: If HypergridAdapter.is_like_simple_hypergrid() rejects the adaptee.
        """
        if not HypergridAdapter.is_like_simple_hypergrid(adaptee):
            raise ValueError("Adaptee must implement a Hypergrid Interface.")
        HypergridAdapter.__init__(self,
                                  name=adaptee.name,
                                  random_state=adaptee.random_state)
        self._adaptee: Hypergrid = adaptee
        self._target: Hypergrid = None

        # Forward mapping:
        #   Key: adaptee dimension name
        #   Value: target dimension
        #
        self._adaptee_to_target_dimension_mappings = {}

        # Reverse mapping:
        #   Key: target dimension name
        #   Value: adaptee dimension
        self._target_to_adaptee_dimension_mappings = {}

        # Categorical dimensions cannot be scaled directly, so they are converted to discrete ones first.
        if any(
                isinstance(dimension, CategoricalDimension)
                for dimension in self._adaptee.dimensions):
            self._adaptee = CategoricalToDiscreteHypergridAdapter(
                adaptee=self._adaptee)

        # Now we need to build the target hypergrid and the mappings between adaptee and target.
        self._build_simple_hypergrid_target()

    @property
    def adaptee(self) -> Hypergrid:
        """ Returns the adaptee (wrapped in a CategoricalToDiscreteHypergridAdapter if it had categorical dimensions). """
        return self._adaptee

    @property
    def target(self) -> Hypergrid:
        """ Returns the unit-continuous target hypergrid. """
        return self._target

    @classmethod
    def _snap_scalar_to_value_boundary(cls, scaled_value, num_values):
        """ Returns the nearest integer when scaled_value is within a few ULPs of an in-range one, else scaled_value. """
        nearest = float(np.round(scaled_value))
        tolerance = cls._BOUNDARY_SNAP_ULPS * np.spacing(abs(nearest))
        # Only integers in [0, num_values) are snapped to, so a value just below 1 is never pushed out of range.
        if 0 <= nearest < num_values and abs(scaled_value - nearest) <= tolerance:
            return nearest
        return scaled_value

    @classmethod
    def _snap_series_to_value_boundary(cls, scaled_values, num_values):
        """ Vectorized version of _snap_scalar_to_value_boundary for a pandas Series; NaNs are left untouched. """
        nearest = np.round(scaled_values)
        tolerance = cls._BOUNDARY_SNAP_ULPS * np.spacing(np.abs(nearest))
        should_snap = (
            (nearest >= 0)
            & (nearest < num_values)
            & (np.abs(scaled_values - nearest) <= tolerance)
        )
        return scaled_values.where(~should_snap, nearest)

    def _project_point(self, point: Point) -> Point:
        """ Scales every dimension value of the point into the unit interval.

        :param point: Point whose dimension names exist in the adaptee.
        :return: A new Point holding the scaled values.
        :raises ValueError: If a dimension is neither Discrete nor Continuous.
        """
        projected_point = Point()
        for dim_name, original_dim_value in point:
            adaptee_dimension = self._adaptee[dim_name]
            if isinstance(adaptee_dimension, DiscreteDimension):
                # simply scale the value
                projected_point[dim_name] = (
                    original_dim_value -
                    adaptee_dimension.min * 1.0) / len(adaptee_dimension)
            elif isinstance(adaptee_dimension, ContinuousDimension):
                if adaptee_dimension.min == adaptee_dimension.max:
                    # A zero-width dimension has nothing to scale by; avoid dividing by zero.
                    projected_point[dim_name] = 0
                else:
                    projected_point[dim_name] = (
                        original_dim_value - adaptee_dimension.min * 1.0) / (
                            adaptee_dimension.max - adaptee_dimension.min)
            else:
                raise ValueError(
                    f"Dimension {adaptee_dimension.name} is neither Discrete nor Continuous."
                )
        return projected_point

    def _unproject_point(self, point: Point) -> Point:
        """ Maps unit-interval values of the point back to adaptee dimension values.

        :param point: Point whose dimension names exist in the adaptee.
        :return: A new Point holding the unscaled values.
        :raises ValueError: If a dimension is neither Discrete nor Continuous.
        """
        unprojected_point = Point()
        for dim_name, projected_dim_value in point:
            adaptee_dimension = self._adaptee[dim_name]
            if isinstance(adaptee_dimension, DiscreteDimension):
                # simply scale the value the other way, repairing round-off so that
                # unprojecting a projected discrete value returns that same value
                num_values = len(adaptee_dimension)
                scaled_value = self._snap_scalar_to_value_boundary(
                    projected_dim_value * num_values, num_values)
                unprojected_point[dim_name] = math.floor(
                    scaled_value + adaptee_dimension.min)
            elif isinstance(adaptee_dimension, ContinuousDimension):
                unprojected_point[dim_name] = projected_dim_value * (
                    adaptee_dimension.max -
                    adaptee_dimension.min) + adaptee_dimension.min
            else:
                raise ValueError(
                    f"Dimension {adaptee_dimension.name} is neither Discrete nor Continuous."
                )
        return unprojected_point

    def _project_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """ Scales the column of every adaptee dimension into the unit interval.

        :param df: Dataframe with one column per adaptee dimension.
        :param in_place: When False, a deep copy of df is modified and returned instead.
        :return: The scaled dataframe.
        :raises ValueError: If a dimension is neither Discrete nor Continuous.
        """
        # Basically apply the scaling for each column.
        #
        if not in_place:
            df = df.copy(deep=True)

        for adaptee_dimension in self._adaptee.dimensions:
            dim_name = adaptee_dimension.name
            if isinstance(adaptee_dimension, DiscreteDimension):
                df[dim_name] = (df[dim_name] -
                                adaptee_dimension.min) / len(adaptee_dimension)
            elif isinstance(adaptee_dimension, ContinuousDimension):
                if adaptee_dimension.min == adaptee_dimension.max:
                    # A zero-width dimension has nothing to scale by; avoid dividing by zero.
                    df[dim_name] = 0
                else:
                    df[dim_name] = (df[dim_name] - adaptee_dimension.min) / (
                        adaptee_dimension.max - adaptee_dimension.min)
            else:
                raise ValueError(
                    f"Dimension {adaptee_dimension.name} is neither Discrete nor Continuous."
                )
        return df

    def _unproject_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """ Maps the unit-interval column of every adaptee dimension back to adaptee values.

        Discrete columns without nulls come back as int64; discrete columns containing nulls stay floating point.

        :param df: Dataframe with one column per adaptee dimension.
        :param in_place: When False, a deep copy of df is modified and returned instead.
        :return: The unscaled dataframe.
        :raises ValueError: If a dimension is neither Discrete nor Continuous.
        """
        if not in_place:
            df = df.copy(deep=True)

        for adaptee_dimension in self._adaptee.dimensions:
            dim_name = adaptee_dimension.name
            if isinstance(adaptee_dimension, DiscreteDimension):
                num_values = len(adaptee_dimension)
                scaled_values = self._snap_series_to_value_boundary(
                    df[dim_name] * num_values, num_values)
                unprojected_values = np.floor(scaled_values +
                                              adaptee_dimension.min)
                if not df[dim_name].isnull().any():
                    # If there are no nulls, we must cast back to int64.
                    unprojected_values = unprojected_values.astype(np.int64)

                # Assign the column itself: df.loc[:, dim_name] = ... writes into the existing float column
                # on recent pandas versions, which would silently undo the int64 cast.
                df[dim_name] = unprojected_values

            elif isinstance(adaptee_dimension, ContinuousDimension):
                df[dim_name] = df[dim_name] * (
                    adaptee_dimension.max -
                    adaptee_dimension.min) + adaptee_dimension.min
            else:
                raise ValueError(
                    f"Dimension {adaptee_dimension.name} is neither Discrete nor Continuous."
                )
        return df

    def _build_simple_hypergrid_target(self) -> None:
        """ Builds a SimpleHypergrid target for a SimpleHypergrid adaptee.

        Also fills the forward and reverse dimension mappings.

        :return:
        """

        self._target = SimpleHypergrid(name=self._adaptee.name,
                                       dimensions=None,
                                       random_state=self._adaptee.random_state)

        # Now we iterate over all dimensions and create a unit continuous target dimension for each of them.
        #
        for adaptee_dimension in self._adaptee.dimensions:
            if isinstance(adaptee_dimension, DiscreteDimension):
                # The upper bound is excluded so that flooring on unproject never lands past the last value.
                target_dimension = ContinuousDimension(
                    name=adaptee_dimension.name,
                    min=0,
                    max=1,
                    include_max=False)
            else:
                target_dimension = ContinuousDimension(
                    name=adaptee_dimension.name,
                    min=0,
                    max=1,
                    include_min=adaptee_dimension.include_min,
                    include_max=adaptee_dimension.include_max)

            self._target.add_dimension(target_dimension)
            self._adaptee_to_target_dimension_mappings[
                adaptee_dimension.name] = target_dimension
            self._target_to_adaptee_dimension_mappings[
                target_dimension.name] = adaptee_dimension