def test_projecting_dataframe_from_categorical_to_discrete_simple_hypergrid(self):
    """Round-trip a random dataframe through the categorical-to-discrete adapter.

    Checks, in order:
      * projecting with ``in_place=False`` returns a new, different dataframe;
      * every projected column is fully numeric;
      * unprojecting with ``in_place=False`` returns a new dataframe equal to the original;
      * projecting / unprojecting with the default arguments returns the very same
        dataframe object that was passed in, with contents equal to the copies above.
    """
    adapter = CategoricalToDiscreteHypergridAdapter(adaptee=self.simple_hypergrid)
    original_df = self.simple_hypergrid.random_dataframe(num_samples=10000)
    projected_df = adapter.project_dataframe(original_df, in_place=False)

    # With in_place=False we must get back a different object...
    #
    self.assertIsNot(original_df, projected_df)

    # ... whose contents differ from the original.
    #
    self.assertFalse(original_df.equals(projected_df))

    # Let's copy the projected_df before testing if all is numeric - the test might change the data.
    #
    copied_df = projected_df.copy(deep=True)
    for column in copied_df.columns.values.tolist():
        # For each column let's validate that it contains only numerics. We'll do this by coercing all values to numerics.
        # If such coercion fails, it produces a null value, so we can validate that there are no nulls in the output.
        #
        self.assertTrue(pd.to_numeric(copied_df[column], errors='coerce').notnull().all())

    # To make sure the check above is capable of failing, let's try the same trick on the input where we know there are
    # non-numeric values.
    #
    copied_original_df = original_df.copy(deep=True)
    self.assertFalse(pd.to_numeric(copied_original_df['categorical_mixed_types'], errors='coerce').notnull().all())

    # Unprojecting a copy must give back the original data in a new object.
    #
    unprojected_df = adapter.unproject_dataframe(projected_df, in_place=False)
    self.assertIsNot(original_df, unprojected_df)
    self.assertTrue(original_df.equals(unprojected_df))

    # Let's make sure that projecting in place works as expected: the same object comes back, and its contents
    # match what the copying variants produced.
    #
    projected_in_place_df = adapter.project_dataframe(original_df)
    self.assertIs(original_df, projected_in_place_df)
    self.assertTrue(projected_in_place_df.equals(projected_df))

    unprojected_in_place_df = adapter.unproject_dataframe(projected_in_place_df)
    self.assertIs(original_df, unprojected_in_place_df)
    self.assertTrue(unprojected_in_place_df.equals(unprojected_df))
def test_projecting_dataframe_from_categorical_hierarchical_to_discrete_flat_hypergrid(self):
    """Round-trip a hierarchical dataframe through a flattening + discretizing adapter stack.

    The stacked adapter must expose no categorical dimensions and no dotted
    dimension names, and unprojecting a projected dataframe must give back
    data equal to the original sample.
    """
    flattening_adapter = HierarchicalToFlatHypergridAdapter(adaptee=self.hierarchical_hypergrid)
    adapter = CategoricalToDiscreteHypergridAdapter(adaptee=flattening_adapter)

    # No dimension of the stacked adapter may remain categorical.
    #
    assert all(not isinstance(dimension, CategoricalDimension) for dimension in adapter.dimensions)

    # No dimension name may contain a dot.
    #
    assert all("." not in dimension.name for dimension in adapter.dimensions)

    sampled_df = self.hierarchical_hypergrid.random_dataframe(num_samples=10000)
    projected = adapter.project_dataframe(df=sampled_df, in_place=False)
    round_tripped = adapter.unproject_dataframe(df=projected, in_place=False)
    assert sampled_df.equals(round_tripped)
class GridPlot:
    """Maintains all data, meta-data and styling information required to produce a grid-plot.

    The grid plot is built based on the OptimizationProblem instance, to find out what objectives and what
    features are to be plotted. We use information contained in the dimensions to compute the ranges for all
    axes/ranges on the plot, as well as to configure the color map.

    If the range is infinite (as can be the case with many objectives) we can use the observed range of values to
    configure the range of values to be plotted.

    Each figure in the grid plot contains:
        * Either a scatter plot of feature vs. feature where the color of each point corresponds to the objective value
        * Or a scatter plot of feature vs. objective (if we are on a diagonal).

    Additionally, we could also plot the predicted values as a background heatmap for the feature vs. feature plots,
    and a predicted value with confidence intervals plot for feature vs. objective plots. This of course introduces
    a complication of needing to query the optimizer for each pixel and so we will add it later.
    """

    def __init__(
        self,
        optimization_problem: OptimizationProblem,
        objective_name: str,
        observations_data_source: ObservationsDataSource,
        logger=None
    ):
        """Stores the plot's inputs and prepares an empty num_features x num_features grid of figures.

        :param optimization_problem: supplies the feature space and the objective space/names to plot.
        :param objective_name: the objective to plot; must be a dimension name of the problem's objective space.
        :param observations_data_source: provides ``observations_df`` and ``data_source`` read by update_plots().
        :param logger: optional logger; one named after this class is created when omitted.
        """
        if logger is None:
            logger = create_logger(self.__class__.__name__)
        self.logger = logger

        # The data source is maintained by the tomograph.
        #
        self._observations_data_source = observations_data_source

        # Metadata - what dimensions are we going to be plotting here?
        #
        self.optimization_problem = optimization_problem
        assert objective_name in self.optimization_problem.objective_space.dimension_names
        self.objective_name = objective_name

        # The adapter is needed if we want to create plots of categorical dimensions. It maps categorical values to
        # integers so that we can consistently place them on the plots.
        #
        self._feature_space_adapter = CategoricalToDiscreteHypergridAdapter(adaptee=self.optimization_problem.feature_space)

        # The "contains_context" dimension is deliberately left out of the plots.
        #
        self.feature_dimension_names: List[str] = [
            feature_name
            for feature_name in self._feature_space_adapter.dimension_names
            if feature_name != "contains_context"
        ]
        self.num_features = len(self.feature_dimension_names)

        # Stores figure ranges by name so that we can synchronize zooming and panning
        #
        self._x_ranges_by_name = {}
        self._y_ranges_by_name = {}

        # Stores an array of all plots for all objectives.
        #
        self._figures = [[None for col in range(self.num_features)] for row in range(self.num_features)]

        self._title = Div(text=f"<h1>{self.objective_name}</h1>")

        # Stores the bokeh gridplot object.
        #
        self._grid_plot = None

    @property
    def formatted_plots(self):
        """Returns the title and the grid plot stacked in a single bokeh column layout."""
        return column([self._title, self._grid_plot])

    def update_plots(self):
        """Updates the plot with observations from data source.

        Every figure in the grid is rebuilt from scratch: cached ranges are dropped, a color mapper is fitted to the
        observed min/max of the objective, and one scatter plot is created per (row, col) pair of features.
        """
        self._x_ranges_by_name = {}
        self._y_ranges_by_name = {}
        self._grid_plot = None

        tooltips = [(feature_name, f"@{feature_name}") for feature_name in self.feature_dimension_names]
        tooltips.extend(
            (objective_name, f"@{objective_name}")
            for objective_name in self.optimization_problem.objective_names
        )
        hover = HoverTool(tooltips=tooltips)

        plot_size = int(2000 / self.num_features)
        tools = ['box_select', 'lasso_select', 'box_zoom', 'wheel_zoom', 'reset', hover]

        plot_options = dict(
            plot_width=plot_size,
            plot_height=plot_size,
            tools=list(tools)
        )

        # Figures in the final column are 75 pixels wider - presumably to leave room for the color bar that is
        # attached to their right-hand side below.
        #
        final_column_plot_options = dict(
            plot_width=plot_size + 75,
            plot_height=plot_size,
            tools=list(tools)
        )

        # The color scale spans the observed range of the plotted objective.
        #
        objective_observations = self._observations_data_source.observations_df[self.objective_name]
        color_mapper = LinearColorMapper(
            palette='Turbo256',
            low=objective_observations.min(),
            high=objective_observations.max()
        )

        for row, row_dimension_name in enumerate(self.feature_dimension_names):
            for col, col_dimension_name in enumerate(self.feature_dimension_names):

                x_axis_name = col_dimension_name
                x_ticks, x_tick_label_mapping = self._get_feature_ticks_and_tick_label_mapping(x_axis_name)

                if row == col:
                    # For plots on the diagonals, we want to plot the row dimension vs. objective
                    #
                    y_axis_name = self.objective_name

                    # Since objectives are always continuous, the default ticks and tick-labels provided by bokeh
                    # work well.
                    #
                    y_ticks, y_tick_label_mapping = None, None
                else:
                    y_axis_name = row_dimension_name
                    y_ticks, y_tick_label_mapping = self._get_feature_ticks_and_tick_label_mapping(y_axis_name)

                if col == (self.num_features - 1):
                    fig = figure(**final_column_plot_options)
                else:
                    fig = figure(**plot_options)

                fig.scatter(
                    x_axis_name,
                    y_axis_name,
                    color={'field': self.objective_name, 'transform': color_mapper},
                    marker='circle',
                    source=self._observations_data_source.data_source,
                )

                fig.xaxis.axis_label = x_axis_name
                fig.yaxis.axis_label = y_axis_name

                # The feature tick labels belong to the x axis only. Setting them on fig.axis (all axes) would
                # leak them onto the objective axis of the diagonal plots, where no y override replaces them.
                #
                fig.xaxis.ticker = x_ticks
                fig.xaxis.major_label_overrides = x_tick_label_mapping

                if y_ticks is not None:
                    fig.yaxis.ticker = y_ticks
                    fig.yaxis.major_label_overrides = y_tick_label_mapping

                self._set_ranges(fig, x_axis_name, y_axis_name)

                self.logger.debug(f"Assigning figure to [{row}][{col}]. {self.objective_name}, {row_dimension_name}, {col_dimension_name}")
                self._figures[row][col] = fig

            # Each row gets its own color bar, attached to the right of the row's last figure.
            #
            color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, location=(0, 0), title=self.objective_name)
            self._figures[row][-1].add_layout(color_bar, 'right')

        self._grid_plot = gridplot(self._figures)

    def _get_feature_ticks_and_tick_label_mapping(self, axis_name):
        """Returns tick positions as well as labels for each tick.

        The complication is that tick labels can be categorical, but ticks must be plotted at locations specified by
        integers. Once again adapters come to the rescue: we simply use an adapter to construct a (persistent) mapping
        between the categorical values (needed to label the ticks) and integer values (needed to position the ticks).
        This mapping is persisted in the adapter and here we dole it out to each plot on an as-needed basis.

        :param axis_name: name of the (projected) feature dimension to produce ticks for.
        :return: a tuple of (tick positions in the projected space, dict mapping each tick position to its label).
            Float values are labelled with two decimal places, anything else with its ``str()``.
        """
        projected_ticks = self._feature_space_adapter[axis_name].linspace(5)
        projected_ticks_df = pd.DataFrame({axis_name: projected_ticks})
        unprojected_ticks_df = self._feature_space_adapter.unproject_dataframe(projected_ticks_df)

        # Read the column by position rather than by name.
        # NOTE(review): presumably because unprojecting may rename the column - confirm.
        #
        unprojected_col_name = unprojected_ticks_df.columns[0]
        tick_mapping = {
            projected_tick: f"{unprojected_tick:.2f}" if isinstance(unprojected_tick, float) else str(unprojected_tick)
            for projected_tick, unprojected_tick
            in zip(projected_ticks, unprojected_ticks_df[unprojected_col_name])
        }
        return projected_ticks, tick_mapping

    def _set_ranges(self, fig, x_axis_name, y_axis_name):
        """Sets the ranges on each axis to enable synchronized panning and zooming.

        Basically, when we see a given range name for the first time we cache the range and set that cached range for
        all figures in the future. This way all plots that share the same range name (so the same dimension) are
        synchronized for panning and zooming.

        :param fig: the figure whose x_range / y_range are to be shared or cached.
        :param x_axis_name: key under which the figure's x range is shared.
        :param y_axis_name: key under which the figure's y range is shared.
        """
        if x_axis_name in self._x_ranges_by_name:
            fig.x_range = self._x_ranges_by_name[x_axis_name]
        else:
            self._x_ranges_by_name[x_axis_name] = fig.x_range

        if y_axis_name in self._y_ranges_by_name:
            fig.y_range = self._y_ranges_by_name[y_axis_name]
        else:
            self._y_ranges_by_name[y_axis_name] = fig.y_range