def _fit_root_regression(
        self,
        x: pd.DataFrame,
        y: pd.DataFrame,
        iteration_number: int
):
    """Create the root (base) regression model and fit it on the design matrix.

    A temporary hypergrid with one continuous dimension per column of ``x`` is
    built to serve as the input space of the root model, which is then stored
    on ``self.base_regressor_`` and fitted.

    :param x: design matrix (already transformed by the caller); each column becomes
        a dimension bounded by that column's observed min and max.
    :param y: target values passed straight through to the root model's ``fit``.
    :param iteration_number: forwarded to the root model's ``fit``.
    :return: ``self``.
    :raises AssertionError: if the configured ``boosting_root_model_name`` is not supported.
    """
    # TODO : Add back RidgeCV option after creating RidgeCrossValidatedRegressionModel
    supported_root_model_names = [LassoCrossValidatedRegressionModel.__name__]
    assert self.model_config.boosting_root_model_name in supported_root_model_names, \
        f'Unrecognized boosting_root_model_name {self.model_config.boosting_root_model_name}'

    # Since the RERF transform_x created the proper design_matrix, this serves as the input space for the root regression model.
    # Hence the code below creates a (temporary) hypergrid reflecting the design_matrix.
    # This is less than ideal solution, but deriving min and max of polynomial terms (given feature column degrees) is non-trivial
    # TODO: set bounds on the polynomial terms correctly and eliminate the hack forcing the base_regressor to skip filtering invalid features
    root_input_space = SimpleHypergrid(
        name='RegressionEnhanceRandomForest_design_matrix',
        dimensions=None
    )
    for column_name in x.columns.values:
        column_values = x[column_name]
        root_input_space.add_dimension(
            ContinuousDimension(
                name=column_name,
                min=column_values.min(),
                max=column_values.max()
            )
        )

    # fit lasso/ridge model using either specified params from __init__ or hyper-parameter search
    if self.model_config.boosting_root_model_name == LassoCrossValidatedRegressionModel.__name__:
        lasso_config = self.model_config.dimension_value_dict['lasso_regression_model_config']
        root_model = LassoCrossValidatedRegressionModel(
            model_config=lasso_config,
            input_space=root_input_space,
            output_space=self.output_space
        )
        self.base_regressor_ = root_model
        # skips filtering to valid features in the base_regressor since the valid range of
        # design_matrix column values is incorrect above
        self.base_regressor_.skip_input_filtering_on_predict = True

    self.base_regressor_.fit(x, y, iteration_number=iteration_number)
    return self
class CategoricalToOneHotEncodedHypergridAdapter(HypergridAdapter):
    """ Maps values in categorical dimensions into values in OneHotEncoded dimensions using:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html,
    which will be referred to as sklearn's OHE.

    Parameters
    ----------
    merge_all_categorical_dimensions: bool
        If True, sklearn's OHE will be applied to the cross product of all categorical levels in the adaptee space.
        If False, sklearn's OHE will be applied individually to each categorical dimension in the adaptee space.
        Default=False.
    drop: {None, 'first', 'if_binary'}
        Argument passed to sklearn's OHE with same argument name.
        Default=None.

    The following sklearn OHE arguments are not supported or are restricted:
    categories: Not supported since the levels a categorical can assume are those specified by the adaptee CategoricalDimension categories.
    drop: Not supporting sklearn's OHE drop argument as a array-like of shape (n_features,).
    dtype: This adapter will always use float64 so we're able to accommodate np.nan values in the projected/unprojected dataframes.
    sparse: Not exposed in the adapter since the adapter maintains the OHE instances.
    handle_unknown: This adapter will always set sklearn's OHE instantiation argument handle_unknown='error'.
    """

    def __init__(self, adaptee: Hypergrid, merge_all_categorical_dimensions: bool = False, drop: str = None):
        """Wrap ``adaptee`` and build the one-hot-encoded target hypergrid.

        :param adaptee: hypergrid (or adapter) to wrap; hierarchical adaptees are flattened first.
        :param merge_all_categorical_dimensions: encode the cross product of all categorical
            dimensions as a single one-hot group instead of one group per dimension.
        :param drop: forwarded to sklearn's OneHotEncoder ``drop`` argument.
        :raises ValueError: if ``adaptee`` does not look like a simple hypergrid.
        """
        if not HypergridAdapter.is_like_simple_hypergrid(adaptee):
            raise ValueError("Adaptee must implement a Hypergrid Interface.")

        HypergridAdapter.__init__(self, name=adaptee.name, random_state=adaptee.random_state)

        self._adaptee: Hypergrid = adaptee
        self._merge_all_categorical_dimensions = merge_all_categorical_dimensions
        self._one_hot_encoder_kwargs = {
            'drop': drop,
            'dtype': np.float64,
            'sparse': False,
            'handle_unknown': 'error'
        }
        self._all_one_hot_encoded_target_dimension_names = []
        # keys: adaptee column name (or the merged cross-product column name); values: encoder + target dimension names
        self._adaptee_to_target_data_dict: Dict[str, CategoricalToOneHotEncodingAdapteeTargetMapping] = {}
        self._adaptee_expected_dimension_name_ordering = []
        self._concatenation_delim = '___'
        self._merged_categorical_dimension_column_name = 'ohe_cross_product'
        self.ohe_target_column_suffix = '__ohe'
        self._target: Hypergrid = None
        self.has_adaptee_been_flattened = False

        # Since CategoricalDimension values may have different types within the same dimension,
        #  we pass the adaptee through the CategoricalToDiscrete adapter to move all value types to ints
        # Because the OneHotEncoder needs to remember the dimension names (which change by the flattening in CategoricalToDiscrete),
        #  the flattening is performed here so the OneHotEncoder discovers the correct flattened column names
        if self._adaptee.is_hierarchical():
            self._adaptee = HierarchicalToFlatHypergridAdapter(adaptee=self._adaptee)
            self.has_adaptee_been_flattened = True

        # Since the CategoricalToDiscrete adapter converts categorical dimensions to discrete dimensions,
        # we remember the categorical dim names
        self._adaptee_contains_categorical_dimensions = False
        self._adaptee_dimension_names_to_transform = []
        for adaptee_dimension in self._adaptee.dimensions:
            if isinstance(adaptee_dimension, CategoricalDimension):
                self._adaptee_dimension_names_to_transform.append(adaptee_dimension.name)
            # every adaptee dimension name (categorical or not) is recorded, in adaptee order
            self._adaptee_expected_dimension_name_ordering.append(adaptee_dimension.name)
        self._adaptee_contains_categorical_dimensions = len(self._adaptee_dimension_names_to_transform) > 0

        # since sklearn OneHotEncoder doesn't accept strings, convert any categorical dimensions to discrete
        if any(isinstance(dimension, CategoricalDimension) for dimension in self._adaptee.dimensions) \
                or self.has_adaptee_been_flattened:
            self._adaptee = CategoricalToDiscreteHypergridAdapter(adaptee=self._adaptee)

        self._build_simple_hypergrid_target()

    @property
    def adaptee(self) -> Hypergrid:
        return self._adaptee

    @property
    def target(self) -> Hypergrid:
        return self._target

    @property
    def was_encoding_merge_all_categoricals_requested(self):
        return self._merge_all_categorical_dimensions

    def get_original_categorical_column_names(self):
        """Return the keys of the adaptee-to-target mapping (a dict keys view)."""
        return self._adaptee_to_target_data_dict.keys()

    def get_one_hot_encoded_column_names(self):
        """Return the names of all one-hot-encoded target dimensions, in creation order."""
        return self._all_one_hot_encoded_target_dimension_names

    def _concatenate_dataframe_columns(self, df: DataFrame, columns_to_concatenate) -> DataFrame:
        """Join the string forms of the given columns row-wise with the concatenation delimiter.

        Note: the listed columns of ``df`` are cast to float64 in place as a side effect.
        """
        df[columns_to_concatenate] = df[columns_to_concatenate].astype('float64')
        return df[columns_to_concatenate].apply(
            lambda cat_row: self._concatenation_delim.join(cat_row.map(str)),
            axis=1
        )

    def _project_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Replace categorical columns of ``df`` with their one-hot-encoded columns.

        :param df: dataframe in the adaptee space.
        :param in_place: when False, a deep copy is transformed and returned.
        :return: the projected dataframe.
        """
        if not in_place:
            df = df.copy(deep=True)

        columns_to_drop = []
        # Expected columns absent from df are temporarily added as NaN so the encoders
        # see every column they were fit on; they are dropped again before returning.
        potentially_missing_columns = list(
            set.difference(set(self._adaptee_expected_dimension_name_ordering), set(df.columns.values))
        )
        for missing_col in potentially_missing_columns:
            # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0
            df[missing_col] = np.nan
            df[missing_col] = df[missing_col].astype('float64')
            columns_to_drop.append(missing_col)

        columns_to_transform = self._adaptee_dimension_names_to_transform
        if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions:
            df[self._merged_categorical_dimension_column_name] = \
                self._concatenate_dataframe_columns(df, columns_to_transform)
            columns_to_transform = [self._merged_categorical_dimension_column_name]
            columns_to_drop.extend(self._adaptee_dimension_names_to_transform)

        for adaptee_column_name in columns_to_transform:
            my_ohe_dict = self._adaptee_to_target_data_dict[adaptee_column_name]
            my_ohe = my_ohe_dict.one_hot_encoder
            if not self._merge_all_categorical_dimensions:
                # the encoder categories were built from str(float(value)), so match that form
                df[adaptee_column_name] = df[adaptee_column_name].astype('float64')
            ohe_x = df[adaptee_column_name].map(str).to_numpy().reshape(-1, 1)
            my_ohe_target_columns = my_ohe_dict.target_dims
            df[my_ohe_target_columns] = DataFrame(my_ohe.transform(ohe_x), index=df.index)
            columns_to_drop.append(adaptee_column_name)

        if columns_to_drop:
            df.drop(columns=columns_to_drop, inplace=True)
        return df

    def _unproject_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Invert the one-hot encoding, restoring the (discrete-valued) categorical columns.

        :param df: dataframe in the target space.
        :param in_place: when False, a deep copy is transformed and returned.
        :return: the unprojected dataframe.
        """
        if not in_place:
            df = df.copy(deep=True)

        # Work on a copy so that appending below can never mutate the adapter's own ordering list.
        columns_to_return = list(self._adaptee_expected_dimension_name_ordering)
        if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions:
            for column_to_transform in self._adaptee_dimension_names_to_transform:
                if column_to_transform not in columns_to_return:
                    columns_to_return.append(column_to_transform)

        columns_to_drop = []
        if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions:
            my_ohe_dict = self._adaptee_to_target_data_dict[self._merged_categorical_dimension_column_name]
            target_columns_to_invert = my_ohe_dict.target_dims
            my_ohe = my_ohe_dict.one_hot_encoder
            df[self._merged_categorical_dimension_column_name] = my_ohe.inverse_transform(df[target_columns_to_invert])
            # split the merged 'a___b___c' level back into one column per categorical dimension
            df[self._adaptee_dimension_names_to_transform] = df[self._merged_categorical_dimension_column_name] \
                .str.split(self._concatenation_delim, expand=True)
            # assign the result back: an inplace replace on a .loc selection only touches a temporary
            df[self._adaptee_dimension_names_to_transform] = \
                df[self._adaptee_dimension_names_to_transform].replace('nan', np.nan)
            df[self._adaptee_dimension_names_to_transform] = \
                df[self._adaptee_dimension_names_to_transform].astype('float64')
            columns_to_drop.extend(target_columns_to_invert)
            columns_to_drop.append(self._merged_categorical_dimension_column_name)
        else:
            for adaptee_column_name in self._adaptee_dimension_names_to_transform:
                my_ohe_dict = self._adaptee_to_target_data_dict[adaptee_column_name]
                target_columns_to_invert = my_ohe_dict.target_dims
                my_ohe = my_ohe_dict.one_hot_encoder
                df[adaptee_column_name] = my_ohe.inverse_transform(df[target_columns_to_invert])
                # assign back instead of a chained inplace replace, which is not guaranteed to reach df
                df[adaptee_column_name] = df[adaptee_column_name].replace('nan', np.nan)
                df[adaptee_column_name] = df[adaptee_column_name].astype('float64')
                columns_to_drop.extend(target_columns_to_invert)

        columns_to_retain_present_in_df = [
            column_name for column_name in columns_to_return if column_name in df.columns.values
        ]

        if in_place:
            # NOTE(review): this dropna appears to act on a temporary selection and so likely leaves
            # df untouched; kept as-is to preserve the existing in-place behavior - confirm intent.
            df.loc[:, columns_to_retain_present_in_df].dropna(axis=1, how='all', inplace=in_place)
            df.drop(columns=columns_to_drop, inplace=in_place)
        else:
            df = df[columns_to_retain_present_in_df].dropna(axis=1, how='all', inplace=in_place)
        return df

    def _build_simple_hypergrid_target(self) -> None:
        """ Builds a SimpleHypergrid target for a SimpleHypergrid adaptee.

        :return:
        """
        self._target = SimpleHypergrid(
            name=self._adaptee.name,
            dimensions=None,
            random_state=self._adaptee.random_state
        )

        # Details about construction of the target hypergrid:
        # 1) Moving non-categorical dimensions to target, while collecting needed info about adaptee categorical dimensions
        # 2) Since sklearn's OHE will handle both project and unproject dataframe transforms, prepare the OHE class.
        #    This requires constructing the 'categories' argument for OHE (all categorical dims or 1 cross product dim).
        #    The dimension's .linspace() method provides the order list of values but doesn't include possible np.nan values,
        #    hence that list is augmented to include the string 'nan' which pandas.DataFrame.apply(map(str)) will produce
        #    from a np.nan value.
        #    All values (output from CategoricalToDiscrete adapter) are converted to strings prior to initializing the OHE
        #    object. This will allow the code to accommodate any missing values in the dataframes passed to
        #    .project_dataframe and .unproject_dataframe.
        # 3) If the cross product of all categorical dimensions have been requested, construct the cross product
        categories_list_for_ohe_init = []
        for adaptee_dimension in self._adaptee.dimensions:
            if adaptee_dimension.name in self._adaptee_dimension_names_to_transform:
                # conversion to str allows accommodation of np.nan values in dataframes
                # np.nan values will not appear in the .linspace() list but will be present in dataframes generated
                # from hierarchical hypergrids. So 'nan' is included to allow OHE to map np.nan values in
                # ._project_dataframe() and ._unproject_dataframe().
                # The value 'nan' is placed first in the list so the 'nan' x ... x 'nan' cross product value is first ([0]).
                # Since this value should never appear in hierarchical hypergrid derived dataframes, it is popped from
                # the categories when user specifies merge_all_categorical_dimensions==True.
                expanded_categories = ['nan'] + [str(float(x)) for x in adaptee_dimension.linspace()]
                categories_list_for_ohe_init.append(expanded_categories)

                if not self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions:
                    # do not need to encode the cross product of all categorical dimensions,
                    # sufficient info here to add target dimensions
                    self._adaptee_to_target_data_dict[adaptee_dimension.name] = \
                        CategoricalToOneHotEncodingAdapteeTargetMapping(
                            one_hot_encoder=OneHotEncoder(
                                categories=[expanded_categories],
                                **self._one_hot_encoder_kwargs
                            )
                        )
                    temp_df_for_fit = DataFrame({adaptee_dimension.name: expanded_categories})
                    self._add_one_hot_encoded_dimensions(adaptee_dimension.name, temp_df_for_fit)
            else:
                self._target.add_dimension(adaptee_dimension.copy())

        if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions:
            # harvested categories for each categorical dimension in single pass across all adaptee dimensions
            # used to compute the cross product encoding here
            cross_product_categories = self._create_cross_product_categories(categories_list_for_ohe_init)
            self._adaptee_to_target_data_dict[self._merged_categorical_dimension_column_name] = \
                CategoricalToOneHotEncodingAdapteeTargetMapping(
                    one_hot_encoder=OneHotEncoder(
                        categories=[cross_product_categories],
                        **self._one_hot_encoder_kwargs
                    )
                )
            temp_df_for_fit = DataFrame({self._merged_categorical_dimension_column_name: cross_product_categories})
            self._add_one_hot_encoded_dimensions(self._merged_categorical_dimension_column_name, temp_df_for_fit)

    def _add_one_hot_encoded_dimensions(self, adaptee_dimension_name, temp_df_for_fit: DataFrame) -> None:
        """Fit the encoder registered for ``adaptee_dimension_name`` and add one 0/1 target dimension per output column."""
        my_target_data = self._adaptee_to_target_data_dict[adaptee_dimension_name]
        my_ohe_output = my_target_data.one_hot_encoder.fit_transform(temp_df_for_fit)
        my_target_data.num_dummy_dims = my_ohe_output.shape[1]
        for i in range(my_target_data.num_dummy_dims):
            target_dim_name = f'{adaptee_dimension_name}{self.ohe_target_column_suffix}{i}'
            my_target_data.target_dims.append(target_dim_name)
            self._target.add_dimension(DiscreteDimension(name=target_dim_name, min=0, max=1))
            self._all_one_hot_encoded_target_dimension_names.append(target_dim_name)

    def _create_cross_product_categories(self, categories_per_dimension) -> list:
        """Return the delimiter-joined cross product of the per-dimension category lists.

        :param categories_per_dimension: one list of string levels per categorical dimension.
        :return: list of concatenated level strings.
        :raises ValueError: if the first cross-product entry is not the all-'nan' combination
            when it is expected to be (flattened adaptee with more than one categorical dimension).
        """
        num_categorical_dims = len(categories_per_dimension)
        cross_product = np.array(np.meshgrid(*categories_per_dimension)).T.reshape(-1, num_categorical_dims)
        temp_df = DataFrame(cross_product)
        temp_df['concatenated_levels'] = self._concatenate_dataframe_columns(temp_df, temp_df.columns.values)
        concatenated_levels = temp_df['concatenated_levels'].values.tolist()

        # expect first element arises from 'nan' x ... x 'nan' which cannot appear in hierarchical hypergrids,
        # so popping this before returning the cross product list
        if self.has_adaptee_been_flattened and num_categorical_dims > 1:
            all_nans = self._concatenation_delim.join(['nan'] * num_categorical_dims)
            should_be_all_nans = concatenated_levels.pop(0)
            if should_be_all_nans != all_nans:
                raise ValueError(
                    'Failed to find cross product of nan values when constructing OneHotEncoding with merge_all_categorical_dimensions==True'
                )

        return concatenated_levels
class CategoricalToDiscreteHypergridAdapter(HypergridAdapter):
    """ Re-expresses categorical dimensions as discrete (integer-indexed) dimensions.

    Each categorical value is mapped to its position in the adaptee dimension; all other
    dimensions are copied to the target unchanged.
    """

    def __init__(self, adaptee: Hypergrid):
        """Wrap ``adaptee`` and build the discrete target hypergrid.

        :param adaptee: a non-hierarchical, simple-hypergrid-like space.
        :raises NotImplementedError: if ``adaptee`` is hierarchical or not simple-hypergrid-like.
        """
        HypergridAdapter.__init__(self, name=adaptee.name, random_state=adaptee.random_state)
        self._adaptee: Hypergrid = adaptee
        self._target: Hypergrid = None

        # dimension name -> {adaptee value: target value}
        self._adaptee_to_target_dimension_mappings = dict()

        # dimension name -> {target value: adaptee value}
        self._target_to_adaptee_dimension_mappings = dict()

        # The target (and the mappings above) can only be built for flat, simple hypergrids.
        is_supported = HypergridAdapter.is_like_simple_hypergrid(adaptee) and not adaptee.is_hierarchical()
        if not is_supported:
            raise NotImplementedError("First apply the HierarchicalToFlatHypergridAdapter and chain it with this one.")
        self._build_simple_hypergrid_target()

    @property
    def adaptee(self) -> Hypergrid:
        return self._adaptee

    @property
    def target(self) -> Hypergrid:
        return self._target

    def _translate_point(self, point: Point) -> Point:
        """Map every categorical value in ``point`` to its discrete index; other values pass through."""
        result = Point()
        for name, value in point:
            value_to_index = self._adaptee_to_target_dimension_mappings.get(name, None)
            result[name] = value if value_to_index is None else value_to_index[value]
        return result

    def _untranslate_point(self, point: Point) -> Point:
        """Map every discrete index in ``point`` back to its categorical value; other values pass through."""
        result = Point()
        for name, value in point:
            index_to_value = self._target_to_adaptee_dimension_mappings.get(name, None)
            result[name] = value if index_to_value is None else index_to_value[value]
        return result

    @staticmethod
    def _remap_dataframe_columns(df: DataFrame, mappings_by_column) -> DataFrame:
        """Apply each per-column value mapping to ``df``; values without a mapping entry are left as they are."""
        for column_name, value_mapping in mappings_by_column.items():
            df[column_name] = df[column_name].apply(
                lambda cell, lookup=value_mapping: lookup.get(cell, cell)
            )
        return df

    def _translate_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Apply the forward (adaptee -> target) mapping to every mapped column of ``df``."""
        frame = df if in_place else df.copy(deep=True)
        return self._remap_dataframe_columns(frame, self._adaptee_to_target_dimension_mappings)

    def _untranslate_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Apply the backward (target -> adaptee) mapping to every mapped column of ``df``."""
        frame = df if in_place else df.copy(deep=True)
        return self._remap_dataframe_columns(frame, self._target_to_adaptee_dimension_mappings)

    def _build_simple_hypergrid_target(self) -> None:
        """ Builds a SimpleHypergrid target for a SimpleHypergrid adaptee.

        :return:
        """
        assert isinstance(self.adaptee, SimpleHypergrid) or \
               (isinstance(self.adaptee, HypergridAdapter) and isinstance(self.adaptee.target, SimpleHypergrid))

        self._target = SimpleHypergrid(
            name=self._adaptee.name,
            dimensions=None,
            random_state=self._adaptee.random_state
        )

        # Categorical dimensions are converted (recording their mappings); everything else is copied.
        for dimension in self._adaptee.dimensions:
            if isinstance(dimension, CategoricalDimension):
                self._target.add_dimension(self._map_categorical_dimension(dimension))
            else:
                self._target.add_dimension(dimension.copy())

    def _map_categorical_dimension(self, adaptee_dimension: CategoricalDimension) -> DiscreteDimension:
        """ Translates a categorical dimension into a discrete dimension and persists the mappings.

        :param adaptee_dimension: the categorical dimension to convert.
        :return: a DiscreteDimension with the same name spanning 0 .. len(adaptee_dimension) - 1.
        """
        indexed_values = list(enumerate(adaptee_dimension))
        value_to_index = {value: index for index, value in indexed_values}
        index_to_value = {index: value for index, value in indexed_values}

        dimension_name = adaptee_dimension.name
        self._adaptee_to_target_dimension_mappings[dimension_name] = value_to_index
        self._target_to_adaptee_dimension_mappings[dimension_name] = index_to_value

        return DiscreteDimension(
            name=dimension_name,
            min=0,
            max=len(adaptee_dimension) - 1
        )
class ContinuousToPolynomialBasisHypergridAdapter(HypergridAdapter):
    """ Adds polynomial basis function features for each continuous dimension in the adaptee hypergrid using
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html.
    All non-continuous adaptee dimensions will be present in the target hypergrid.
    Beware: Because HierarchicalHypergrids may have NaN values for some points, these NaNs will be replaced by zeros.

    Parameters
    ----------
    degree: integer
        The degree of the polynomial features.
        Default = 2.

    interaction_only: boolean
        If true, only interaction features are produced: features that are products of at most degree distinct input
        features (so not x[1] ** 2, x[0] * x[2] ** 3, etc.).
        Default = False

    include_bias: boolean
        If True, then include a bias column, the feature in which all polynomial powers are zero
        (i.e. a column of ones - acts as an intercept term in a linear model).
        Default = True
    """

    def __init__(
            self,
            adaptee: Hypergrid,
            degree: int = 2,
            include_bias: bool = True,
            interaction_only: bool = False
    ):
        """Wrap ``adaptee``, fit sklearn's PolynomialFeatures and build the target hypergrid.

        :raises ValueError: if ``adaptee`` does not look like a simple hypergrid.
        """
        if not HypergridAdapter.is_like_simple_hypergrid(adaptee):
            raise ValueError("Adaptee must implement a Hypergrid Interface.")

        HypergridAdapter.__init__(self, name=adaptee.name, random_state=adaptee.random_state)

        self._adaptee: Hypergrid = adaptee
        self._polynomial_features_kwargs = {
            'degree': degree,
            'interaction_only': interaction_only,
            'include_bias': include_bias,
            'order': 'C'
        }
        self._target: Hypergrid = None

        if self._adaptee.is_hierarchical():
            self._adaptee = HierarchicalToFlatHypergridAdapter(adaptee=self._adaptee)

        # Record which adaptee dimensions are continuous
        self._adaptee_contains_dimensions_to_transform = False
        self._adaptee_dimension_names_to_transform = []
        for adaptee_dimension in self._adaptee.dimensions:
            if isinstance(adaptee_dimension, ContinuousDimension):
                self._adaptee_dimension_names_to_transform.append(adaptee_dimension.name)
        self._num_dimensions_to_transform = len(self._adaptee_dimension_names_to_transform)
        self._adaptee_contains_dimensions_to_transform = self._num_dimensions_to_transform > 0

        # see definition of _get_polynomial_feature_names() for usage
        self._internal_feature_name_terminal_char = '_'

        # Since sklearn PolynomialFeatures does not accept NaNs and these may appear in data frames from hierarchical hypergrids,
        # the NaNs will be replaced with an imputed (finite) value.  The following sets the value used.
        self._nan_imputed_finite_value = 0

        # instantiate sklearn's polynomial features instance
        self._polynomial_features = PolynomialFeatures(**self._polynomial_features_kwargs)
        # because the exact number of additional dimensions that will be added depends on the parameters to sklearn's PF,
        # *and* the sklearn PF instance above doesn't determine this information until after the .fit() method is called
        # (requiring a dataframe),
        # *and* the target hypergrid can not be constructed without knowing the resulting number of continuous dimensions,
        # a trivial dataframe is constructed (all 1s) and .fit_transform() of _polynomial_features instance is called.
        trivial_continuous_dim_x = np.ones((1, self._num_dimensions_to_transform))
        trivial_polynomial_features_y = self._polynomial_features.fit_transform(trivial_continuous_dim_x)
        self._polynomial_features_powers = self._polynomial_features.powers_
        self._num_polynomial_basis_dimensions_in_target = trivial_polynomial_features_y.shape[1]
        self._target_polynomial_feature_map = {}  # keys are target dimension names, values are index in features

        self._build_simple_hypergrid_target()

    def _build_simple_hypergrid_target(self) -> None:
        """Build the target hypergrid: untouched non-continuous dimensions plus one unbounded
        continuous dimension per polynomial feature, and record each feature's column index."""
        self._target = SimpleHypergrid(
            name=self._adaptee.name,
            dimensions=None,
            random_state=self._adaptee.random_state
        )

        # Copy the adaptee dimensions that are not transformed. The continuous ones are re-added below
        # under their own names, since the linear terms are always part of the polynomial basis functions.
        for adaptee_dimension in self._adaptee.dimensions:
            if adaptee_dimension.name not in self._adaptee_dimension_names_to_transform:
                self._target.add_dimension(adaptee_dimension.copy())

        if not self._adaptee_contains_dimensions_to_transform:
            return

        # add new dimensions to be created by sklearn PolynomialFeatures

        # construct target dim names using adaptee dim names and polynomial feature powers matrix
        # This logic is worked out explicitly here so we have control over the derived dimension names.
        # Currently, the code only substitutes adaptee feature names into the default feature_names produced by
        # sklearn's PolynomialFeatures .get_feature_names() method.
        poly_feature_dim_names = self._get_polynomial_feature_names()
        for i, poly_feature_name in enumerate(poly_feature_dim_names):
            ith_terms_powers = self._polynomial_features_powers[i]

            if not self._polynomial_features_kwargs['include_bias'] and ith_terms_powers.sum() == 0:
                # the constant term is skipped
                continue

            # replace adaptee dim names for poly feature name {x0_, x1_, ...} representatives
            target_dim_name = poly_feature_name
            for j, adaptee_dim_name in enumerate(self._adaptee_dimension_names_to_transform):
                adaptee_dim_power = ith_terms_powers[j]
                if adaptee_dim_power == 0:
                    continue
                if adaptee_dim_power == 1:
                    poly_feature_adaptee_dim_name_standin = f'x{j}{self._internal_feature_name_terminal_char}'
                    adaptee_dim_replacement_name = adaptee_dim_name
                else:
                    # power > 1 cases
                    poly_feature_adaptee_dim_name_standin = \
                        f'x{j}{self._internal_feature_name_terminal_char}^{adaptee_dim_power}'
                    adaptee_dim_replacement_name = f'{adaptee_dim_name}^{adaptee_dim_power}'

                target_dim_name = target_dim_name.replace(
                    poly_feature_adaptee_dim_name_standin,
                    adaptee_dim_replacement_name
                )

            # add target dimension
            # min and max are placed at -Inf and +Inf since .random() on the target hypergrid is generated on the original
            # hypergrid and passed through the adapters.
            self._target.add_dimension(
                ContinuousDimension(name=target_dim_name, min=-math.inf, max=math.inf)
            )
            self._target_polynomial_feature_map[target_dim_name] = i

    @property
    def adaptee(self) -> Hypergrid:
        return self._adaptee

    @property
    def target(self) -> Hypergrid:
        return self._target

    @property
    def polynomial_features_kwargs(self) -> dict:
        return self._polynomial_features_kwargs

    @property
    def nan_imputed_finite_value(self):
        return self._nan_imputed_finite_value

    def get_column_names_for_polynomial_features(self, degree=None):
        """Return the target column names of the polynomial features, ordered by feature index.

        :param degree: when given, only the columns whose monomial has this total degree
            (sum of its powers) are returned; when None, all polynomial columns are returned.
        :return: list of target dimension (dataframe column) names.
        """
        # column names ordered by target dimension index as this coincides with the polynomial_features.powers_ table
        names_by_column_index = sorted(self._target_polynomial_feature_map.items(), key=lambda item: item[1])
        if degree is None:
            return [target_dim_name for target_dim_name, _ in names_by_column_index]

        # Filter the *target* names (not the internal 'x0_' stand-ins) so the result
        # names real dataframe columns, consistent with the degree=None case.
        return [
            target_dim_name
            for target_dim_name, powers_table_index in names_by_column_index
            if self._polynomial_features_powers[powers_table_index].sum() == degree
        ]

    def get_polynomial_feature_powers_table(self):
        """Return sklearn's ``powers_`` table captured when the PolynomialFeatures instance was fit."""
        return self._polynomial_features_powers

    def get_num_polynomial_features(self):
        """Return the number of rows (features) in the powers table."""
        return self._polynomial_features_powers.shape[0]

    def _get_polynomial_feature_names(self):
        """Return sklearn's feature names built from internal stand-in input names ('x0_', 'x1_', ...)."""
        # The default polynomial feature feature names returned from .get_feature_names() look like:
        # ['1', 'x0', 'x1', 'x0^2', 'x0 x1', 'x1^2']
        # They are altered below by adding a terminal char so string substitutions don't confuse
        # a derived feature named 'x1 x12' with another potentially derived feature named 'x10 x124'
        replaceable_feature_names = [
            f'x{i}{self._internal_feature_name_terminal_char}'
            for i in range(len(self._adaptee_dimension_names_to_transform))
        ]
        return self._polynomial_features.get_feature_names(replaceable_feature_names)

    def _project_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Impute NaNs and add/overwrite the polynomial feature columns in ``df``.

        :param df: dataframe in the adaptee space.
        :param in_place: when False, a deep copy is transformed and returned.
        :return: the projected dataframe.
        """
        if not in_place:
            df = df.copy(deep=True)

        # replace NaNs with zeros
        df.fillna(self._nan_imputed_finite_value, inplace=True)

        # Transform the continuous columns and add the higher order columns to the df
        # Filtering columns to transform b/c dataframes coming from hierarchical hypergrid points
        # may not contain all possible dimensions knowable from hypergrid
        x_to_transform = np.zeros((len(df.index), len(self._adaptee_dimension_names_to_transform)))
        for i, dim_name in enumerate(self._adaptee_dimension_names_to_transform):
            if dim_name in df.columns.values:
                x_to_transform[:, i] = df[dim_name]

        all_poly_features = self._polynomial_features.transform(x_to_transform)
        for target_dim_name, target_dim_index in self._target_polynomial_feature_map.items():
            df[target_dim_name] = all_poly_features[:, target_dim_index]
        return df

    def _unproject_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Drop every polynomial column whose total degree is not 1, leaving the linear terms.

        :param df: dataframe in the target space.
        :param in_place: when False, a deep copy is transformed and returned.
        :return: the unprojected dataframe.
        """
        if not in_place:
            df = df.copy(deep=True)

        # unproject simply drops the monomial columns whose degree is not 1
        polynomial_feature_powers = self.get_polynomial_feature_powers_table()
        column_names_to_drop = [
            target_dim_name
            for target_dim_name, powers_table_index in self._target_polynomial_feature_map.items()
            if polynomial_feature_powers[powers_table_index].sum() != 1
        ]
        df.drop(columns=column_names_to_drop, inplace=True)
        return df
class DiscreteToUnitContinuousHypergridAdapter(HypergridAdapter):
    """ Maps values in discrete dimensions into values in a unit continuous dimensions.

    Unit continuous means all target dimensions are between 0 and 1.
    And more importantly, unmaps the continuous values back to discrete ones.
    Categorical adaptee dimensions are first converted to discrete ones via
    CategoricalToDiscreteHypergridAdapter.
    """

    def __init__(self, adaptee: Hypergrid):
        """Wrap ``adaptee`` and build the unit-continuous target hypergrid.

        :raises ValueError: if ``adaptee`` does not look like a simple hypergrid.
        """
        if not HypergridAdapter.is_like_simple_hypergrid(adaptee):
            raise ValueError("Adaptee must implement a Hypergrid Interface.")

        HypergridAdapter.__init__(self, name=adaptee.name, random_state=adaptee.random_state)

        self._adaptee: Hypergrid = adaptee
        self._target: Hypergrid = None

        # Forward mapping:
        #   Key: adaptee dimension name
        #   Value: target dimension
        #
        self._adaptee_to_target_dimension_mappings = dict()

        # Reverse mapping:
        #   Key: target dimension name
        #   Value: adaptee dimension
        self._target_to_adaptee_dimension_mappings = dict()

        if any(isinstance(dimension, CategoricalDimension) for dimension in self._adaptee.dimensions):
            self._adaptee = CategoricalToDiscreteHypergridAdapter(adaptee=self._adaptee)

        # Now we need to build the target hypergrid and the mappings between adaptee and target.
        self._build_simple_hypergrid_target()

    @property
    def adaptee(self) -> Hypergrid:
        return self._adaptee

    @property
    def target(self) -> Hypergrid:
        return self._target

    def _project_point(self, point: Point) -> Point:
        """Scale each value of ``point`` into the unit interval of its target dimension.

        :raises ValueError: if a dimension is neither Discrete nor Continuous.
        """
        projected_point = Point()
        for dim_name, original_dim_value in point:
            adaptee_dimension = self._adaptee[dim_name]
            if isinstance(adaptee_dimension, DiscreteDimension):
                # simply scale the value
                projected_point[dim_name] = (original_dim_value - adaptee_dimension.min * 1.0) / len(adaptee_dimension)
            elif isinstance(adaptee_dimension, ContinuousDimension):
                if adaptee_dimension.min == adaptee_dimension.max:
                    # degenerate (single-value) dimension: avoid dividing by zero
                    projected_point[dim_name] = 0
                else:
                    projected_point[dim_name] = (original_dim_value - adaptee_dimension.min * 1.0) \
                        / (adaptee_dimension.max - adaptee_dimension.min)
            else:
                raise ValueError(f"Dimension {adaptee_dimension.name} is neither Discrete nor Continuous.")
        return projected_point

    def _unproject_point(self, point: Point) -> Point:
        """Scale each unit-interval value of ``point`` back into its adaptee dimension.

        :raises ValueError: if a dimension is neither Discrete nor Continuous.
        """
        unprojected_point = Point()
        for dim_name, projected_dim_value in point:
            adaptee_dimension = self._adaptee[dim_name]
            if isinstance(adaptee_dimension, DiscreteDimension):
                # simply scale the value the other way
                unprojected_point[dim_name] = math.floor(
                    projected_dim_value * len(adaptee_dimension) + adaptee_dimension.min
                )
            elif isinstance(adaptee_dimension, ContinuousDimension):
                unprojected_point[dim_name] = projected_dim_value \
                    * (adaptee_dimension.max - adaptee_dimension.min) + adaptee_dimension.min
            else:
                raise ValueError(f"Dimension {adaptee_dimension.name} is neither Discrete nor Continuous.")
        return unprojected_point

    def _project_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Apply the unit scaling to every adaptee dimension's column of ``df``.

        :param in_place: when False, a deep copy is transformed and returned.
        :raises ValueError: if a dimension is neither Discrete nor Continuous.
        """
        # Basically apply the scaling for each column.
        #
        if not in_place:
            df = df.copy(deep=True)

        for adaptee_dimension in self._adaptee.dimensions:
            dim_name = adaptee_dimension.name
            if isinstance(adaptee_dimension, DiscreteDimension):
                df[dim_name] = (df[dim_name] - adaptee_dimension.min) / len(adaptee_dimension)
            elif isinstance(adaptee_dimension, ContinuousDimension):
                if adaptee_dimension.min == adaptee_dimension.max:
                    # degenerate (single-value) dimension: avoid dividing by zero
                    df[dim_name] = 0
                else:
                    df[dim_name] = (df[dim_name] - adaptee_dimension.min) \
                        / (adaptee_dimension.max - adaptee_dimension.min)
            else:
                raise ValueError(f"Dimension {adaptee_dimension.name} is neither Discrete nor Continuous.")
        return df

    def _unproject_dataframe(self, df: DataFrame, in_place=True) -> DataFrame:
        """Undo the unit scaling for every adaptee dimension's column of ``df``.

        Discrete columns without nulls are cast back to int64; columns containing nulls stay floating point.

        :param in_place: when False, a deep copy is transformed and returned.
        :raises ValueError: if a dimension is neither Discrete nor Continuous.
        """
        if not in_place:
            df = df.copy(deep=True)

        for adaptee_dimension in self._adaptee.dimensions:
            dim_name = adaptee_dimension.name
            if isinstance(adaptee_dimension, DiscreteDimension):
                unscaled = np.floor(df[dim_name] * len(adaptee_dimension) + adaptee_dimension.min)
                if not df[dim_name].isnull().any():
                    # If there are no nulls, we must cast back to int64.
                    unscaled = unscaled.astype(np.int64)
                # Assign with df[...] (column replacement) rather than df.loc[:, ...]: the latter
                # sets values into the existing float column on newer pandas, losing the int64 cast.
                df[dim_name] = unscaled
            elif isinstance(adaptee_dimension, ContinuousDimension):
                df[dim_name] = df[dim_name] * (adaptee_dimension.max - adaptee_dimension.min) + adaptee_dimension.min
            else:
                raise ValueError(f"Dimension {adaptee_dimension.name} is neither Discrete nor Continuous.")
        return df

    def _build_simple_hypergrid_target(self) -> None:
        """ Builds a SimpleHypergrid target for a SimpleHypergrid adaptee.

        :return:
        """
        self._target = SimpleHypergrid(
            name=self._adaptee.name,
            dimensions=None,
            random_state=self._adaptee.random_state
        )

        # Every adaptee dimension becomes a [0, 1] continuous dimension in the target.
        for adaptee_dimension in self._adaptee.dimensions:
            if isinstance(adaptee_dimension, DiscreteDimension):
                # max is excluded for discrete dimensions (unprojection floors the scaled value)
                target_dimension = ContinuousDimension(
                    name=adaptee_dimension.name,
                    min=0,
                    max=1,
                    include_max=False
                )
            else:
                target_dimension = ContinuousDimension(
                    name=adaptee_dimension.name,
                    min=0,
                    max=1,
                    include_min=adaptee_dimension.include_min,
                    include_max=adaptee_dimension.include_max
                )

            self._target.add_dimension(target_dimension)
            self._adaptee_to_target_dimension_mappings[adaptee_dimension.name] = target_dimension
            self._target_to_adaptee_dimension_mappings[target_dimension.name] = adaptee_dimension