Example #1
    def test_filtering_out_invalid_rows(self):
        spaces = [
            bayesian_optimizer_config_store.parameter_space,
            glow_worm_swarm_optimizer_config_store.parameter_space
        ]

        # Just to make sure we are testing both hierarchical and flat code paths.
        #
        self.assertTrue(any(space.is_hierarchical() for space in spaces))
        self.assertTrue(any(not space.is_hierarchical() for space in spaces))

        num_samples = 1000
        for space in spaces:
            random_dataframe_with_invalid_rows = space.random_dataframe(num_samples=num_samples)
            for dimension in space.dimensions:
                if isinstance(dimension, (ContinuousDimension, DiscreteDimension)):
                    # This makes about half of the rows invalid.
                    #
                    random_dataframe_with_invalid_rows.loc[:, [dimension.name]] *= 2
                    break

            with traced(scope_name="slow_filtering"):
                # Let's filter out invalid rows the slow way.
                #
                valid_indices = []
                for idx in random_dataframe_with_invalid_rows.index:
                    row_as_df = random_dataframe_with_invalid_rows.loc[[idx]]
                    row_as_point = Point.from_dataframe(row_as_df)
                    if row_as_point in space:
                        valid_indices.append(idx)
                expected_valid_rows_index = pd.Index(valid_indices)

            print(f"{len(expected_valid_rows_index)}/{len(random_dataframe_with_invalid_rows.index)} rows are valid.")
            self.assertTrue(0 < len(expected_valid_rows_index))
            self.assertTrue(len(expected_valid_rows_index) < num_samples)

            # Let's filter out invalid rows the fast way.
            #
            actual_valid_rows_index = space.filter_out_invalid_rows(original_dataframe=random_dataframe_with_invalid_rows, exclude_extra_columns=True).index
            self.assertTrue(expected_valid_rows_index.equals(actual_valid_rows_index))

            if not space.is_hierarchical():
                # For flat spaces we can choose between the column-wise operators and the row-wise validation. This lets us gather tracing data to see the
                # perf difference, and also validates correctness by computing the desired index in yet another way.
                #
                with traced(scope_name="faster_filtering"):
                    expected_valid_rows_index_2 = random_dataframe_with_invalid_rows[random_dataframe_with_invalid_rows.apply(
                        lambda row: Point(**{dim_name: row.iloc[i] for i, dim_name in enumerate(space.dimension_names)}) in space,
                        axis=1
                    )].index
                self.assertTrue(expected_valid_rows_index_2.equals(actual_valid_rows_index))
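
The test above checks the vectorized filter_out_invalid_rows() against a row-by-row membership check. The following is a minimal sketch of that comparison on a plain pandas DataFrame, not the Hypergrid API: the validity rule (x must lie in [0, 1]) is a made-up stand-in for "the point belongs to the space".

# A minimal sketch (not the Hypergrid API) of the two filtering strategies the test compares:
# row-wise validation of each row as a point, versus a single vectorized column-wise mask.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({"x": rng.uniform(0, 2, 1000), "y": rng.integers(0, 10, 1000)})

# Hypothetical validity rule standing in for "the point belongs to the space": x must lie in [0, 1].
def is_valid_point(row) -> bool:
    return 0 <= row["x"] <= 1

# Slow way: validate one row at a time.
slow_index = df[df.apply(is_valid_point, axis=1)].index

# Fast way: one vectorized comparison per column.
fast_index = df[(df["x"] >= 0) & (df["x"] <= 1)].index

assert slow_index.equals(fast_index)
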
Example #2
    def _naive_poi(self, multi_objective_predictions: MultiObjectivePrediction,
                   valid_predictions_index: pd.Index,
                   std_dev_column_name: str):
        """Naively generates a monte carlo data frame for each of the feature rows and sends them to ParetoFrontier individually.

        We should be able to substantially improve on this by batching all those dataframes.

        :return:
        """
        poi_df = pd.DataFrame(index=valid_predictions_index,
                              columns=['utility'],
                              dtype='float')

        with traced(scope_name="poi_monte_carlo"):
            for config_idx in valid_predictions_index:
                monte_carlo_samples_df = self.create_monte_carlo_samples_df(
                    multi_objective_predictions=multi_objective_predictions,
                    config_idx=config_idx,
                    std_dev_column_name=std_dev_column_name)
                num_samples = len(monte_carlo_samples_df.index)
                assert num_samples == self.config.num_monte_carlo_samples

                # At this point we have a dataframe with all the randomly generated points in the objective space. Let's query the pareto
                # frontier to see whether or not they are dominated.
                num_dominated_points = self.pareto_frontier.is_dominated(
                    objectives_df=monte_carlo_samples_df).sum()
                num_non_dominated_points = num_samples - num_dominated_points
                probability_of_improvement = num_non_dominated_points / num_samples
                poi_df.loc[config_idx, 'utility'] = probability_of_improvement

        return poi_df
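
The method above estimates the probability of improvement for each config as the fraction of its Monte Carlo samples that the current Pareto frontier does not dominate. Below is a hedged, standalone sketch of just that counting step against a toy two-objective frontier; the local is_dominated() helper assumes maximization and is not the ParetoFrontier API.

# A standalone sketch of the counting step in _naive_poi: probability of improvement is the
# fraction of Monte Carlo samples in objective space that are NOT dominated by the Pareto frontier.
# The toy frontier and the local is_dominated() helper are illustrative only (maximization assumed).
import numpy as np
import pandas as pd

pareto_df = pd.DataFrame({"obj0": [1.0, 0.5, 0.0], "obj1": [0.0, 0.5, 1.0]})  # toy frontier

def is_dominated(objectives_df: pd.DataFrame, frontier_df: pd.DataFrame) -> pd.Series:
    # A point is dominated if some frontier point is >= on every objective and > on at least one.
    samples = objectives_df.to_numpy()[:, None, :]   # shape: (num_samples, 1, num_objectives)
    frontier = frontier_df.to_numpy()[None, :, :]    # shape: (1, num_frontier_points, num_objectives)
    ge_all = (frontier >= samples).all(axis=2)
    gt_any = (frontier > samples).any(axis=2)
    return pd.Series((ge_all & gt_any).any(axis=1), index=objectives_df.index)

rng = np.random.default_rng(0)
monte_carlo_samples_df = pd.DataFrame(rng.normal(loc=0.5, scale=0.3, size=(1000, 2)), columns=["obj0", "obj1"])

num_samples = len(monte_carlo_samples_df.index)
num_dominated_points = is_dominated(monte_carlo_samples_df, pareto_df).sum()
probability_of_improvement = (num_samples - num_dominated_points) / num_samples
print(f"{probability_of_improvement:.3f}")
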
Example #3
    def test_named_configs(self):
        """Tests named optimizer configurations against named objective functions.

        It is prohibitively expensive to test the entire cross product, so we test only a subset, but in such a way that
        each configuration is tested at least once.
        """
        optimizer_named_configs = bayesian_optimizer_config_store.list_named_configs()
        num_optimizer_configs = len(optimizer_named_configs)
        objective_function_named_configs = objective_function_config_store.list_named_configs()
        num_objective_function_configs = len(objective_function_named_configs)

        num_tests = max(num_optimizer_configs, num_objective_function_configs)

        with traced(scope_name="parallel_tests"), concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            outstanding_futures = set()

            for i in range(num_tests):
                named_optimizer_config = optimizer_named_configs[i % num_optimizer_configs]
                named_objective_function_config = objective_function_named_configs[i % num_objective_function_configs]

                print("#####################################################################################################")
                print(named_optimizer_config)
                print(named_objective_function_config)

                optimizer_evaluator_config = optimizer_evaluator_config_store.get_config_by_name(name="parallel_unit_tests_config")
                optimizer_config = named_optimizer_config.config_point
                objective_function_config = named_objective_function_config.config_point

                optimizer_evaluator = OptimizerEvaluator(
                    optimizer_evaluator_config=optimizer_evaluator_config,
                    objective_function_config=objective_function_config,
                    optimizer_config=optimizer_config
                )

                future = executor.submit(optimizer_evaluator.evaluate_optimizer)
                outstanding_futures.add(future)

            done_futures, outstanding_futures = concurrent.futures.wait(outstanding_futures, return_when=concurrent.futures.ALL_COMPLETED)

            for future in done_futures:
                optimizer_evaluation_report = future.result()
                assert optimizer_evaluation_report.success
                mlos.global_values.tracer.trace_events.extend(optimizer_evaluation_report.execution_trace)

                with pd.option_context('display.max_columns', 100):
                    print(optimizer_evaluation_report.regression_model_goodness_of_fit_state.get_goodness_of_fit_dataframe(DataSetType.TRAIN).tail())
                    for optimum_name, optimum_over_time in optimizer_evaluation_report.optima_over_time.items():
                        print("#####################################################################################################")
                        print(optimum_name)
                        print(optimum_over_time.get_dataframe().tail(10))
                        print("#####################################################################################################")
Example #4
    def evaluate_optimizer(self) -> OptimizerEvaluationReport: # pylint: disable=too-many-statements,too-many-branches
        evaluation_report = OptimizerEvaluationReport(
            optimizer_configuration=self.optimizer_config,
            objective_function_configuration=self.objective_function_config,
            num_optimization_iterations=self.optimizer_evaluator_config.num_iterations,
            evaluation_frequency=self.optimizer_evaluator_config.evaluation_frequency
        )

        if self.optimizer_evaluator_config.include_execution_trace_in_report:
            mlos.global_values.declare_singletons()
            if mlos.global_values.tracer is None:
                mlos.global_values.tracer = Tracer()
            mlos.global_values.tracer.clear_events()


        if self.optimizer_evaluator_config.include_pickled_objective_function_in_report:
            evaluation_report.pickled_objective_function_initial_state = pickle.dumps(self.objective_function)

        if self.optimizer_evaluator_config.include_pickled_optimizer_in_report:
            evaluation_report.pickled_optimizer_initial_state = pickle.dumps(self.optimizer)

        multi_objective_regression_model_fit_state = MultiObjectiveRegressionModelFitState(objective_names=self.optimizer.optimization_problem.objective_names)
        for objective_name in self.optimizer.optimization_problem.objective_names:
            multi_objective_regression_model_fit_state[objective_name] = RegressionModelFitState()

        optima_over_time = {}
        optima_over_time[OptimumDefinition.BEST_OBSERVATION.value] = OptimumOverTime(
            optimization_problem=self.optimizer.optimization_problem,
            optimum_definition=OptimumDefinition.BEST_OBSERVATION
        )

        optima_over_time[OptimumDefinition.PREDICTED_VALUE_FOR_OBSERVED_CONFIG.value] = OptimumOverTime(
            optimization_problem=self.optimizer.optimization_problem,
            optimum_definition=OptimumDefinition.PREDICTED_VALUE_FOR_OBSERVED_CONFIG
        )

        optima_over_time[f"{OptimumDefinition.UPPER_CONFIDENCE_BOUND_FOR_OBSERVED_CONFIG.value}_99"] = OptimumOverTime(
            optimization_problem=self.optimizer.optimization_problem,
            optimum_definition=OptimumDefinition.UPPER_CONFIDENCE_BOUND_FOR_OBSERVED_CONFIG,
            alpha=0.01
        )

        optima_over_time[f"{OptimumDefinition.LOWER_CONFIDENCE_BOUND_FOR_OBSERVED_CONFIG.value}_99"] = OptimumOverTime(
            optimization_problem=self.optimizer.optimization_problem,
            optimum_definition=OptimumDefinition.LOWER_CONFIDENCE_BOUND_FOR_OBSERVED_CONFIG,
            alpha=0.01
        )

        #####################################################################################################
        evaluation_report.start_time = datetime.utcnow()
        i = 0
        try:
            with traced(scope_name="optimization_loop"):
                for i in range(self.optimizer_evaluator_config.num_iterations):
                    parameters = self.optimizer.suggest()
                    objectives = self.objective_function.evaluate_point(parameters)
                    self.optimizer.register(parameters.to_dataframe(), objectives.to_dataframe())

                    if i % self.optimizer_evaluator_config.evaluation_frequency == 0:
                        self.logger.info(f"[{i + 1}/{self.optimizer_evaluator_config.num_iterations}]")
                        with traced(scope_name="evaluating_optimizer"):
                            if self.optimizer_evaluator_config.include_pickled_optimizer_in_report:
                                evaluation_report.add_pickled_optimizer(iteration=i, pickled_optimizer=pickle.dumps(self.optimizer))

                            if self.optimizer.trained:
                                multi_objective_gof_metrics = self.optimizer.compute_surrogate_model_goodness_of_fit()
                                for objective_name, gof_metrics in multi_objective_gof_metrics:
                                    multi_objective_regression_model_fit_state[objective_name].set_gof_metrics(
                                        data_set_type=DataSetType.TRAIN,
                                        gof_metrics=gof_metrics
                                    )

                            for optimum_name, optimum_over_time in optima_over_time.items():
                                try:
                                    config, value = self.optimizer.optimum(
                                        optimum_definition=optimum_over_time.optimum_definition,
                                        alpha=optimum_over_time.alpha
                                    )
                                    optima_over_time[optimum_name].add_optimum_at_iteration(
                                        iteration=i,
                                        optimum_config=config,
                                        optimum_value=value
                                    )
                                except ValueError:
                                    self.logger.info(f"Failed to get {optimum_name} optimum.", exc_info=True)

                            if self.optimizer_evaluator_config.report_pareto_over_time:
                                evaluation_report.pareto_over_time[i] = copy.deepcopy(self.optimizer.pareto_frontier)

                            if self.optimizer_evaluator_config.report_pareto_volume_over_time:
                                volume_estimator = self.optimizer.pareto_frontier.approximate_pareto_volume()
                                ci99_on_volume = volume_estimator.get_two_sided_confidence_interval_on_pareto_volume(alpha=0.01)
                                evaluation_report.pareto_volume_over_time[i] = ci99_on_volume

                evaluation_report.success = True

        except Exception as e:
            evaluation_report.success = False
            evaluation_report.exception = e
            evaluation_report.exception_traceback = traceback.format_exc()

        evaluation_report.end_time = datetime.utcnow()

        with traced(scope_name="evaluating_optimizer"):
            # Once the optimization is done, we perform a final evaluation of the optimizer.

            if self.optimizer.trained:
                multi_objective_gof_metrics = self.optimizer.compute_surrogate_model_goodness_of_fit()
                for objective_name, gof_metrics in multi_objective_gof_metrics:
                    multi_objective_regression_model_fit_state[objective_name].set_gof_metrics(data_set_type=DataSetType.TRAIN, gof_metrics=gof_metrics)

            for optimum_name, optimum_over_time in optima_over_time.items():
                try:
                    config, value = self.optimizer.optimum(optimum_definition=optimum_over_time.optimum_definition, alpha=optimum_over_time.alpha)
                    optima_over_time[optimum_name].add_optimum_at_iteration(
                        iteration=self.optimizer_evaluator_config.num_iterations,
                        optimum_config=config,
                        optimum_value=value
                    )
                except Exception:
                    self.logger.info(f"Failed to get {optimum_name} optimum.", exc_info=True)

        if self.optimizer_evaluator_config.report_pareto_over_time:
            evaluation_report.pareto_over_time[i] = copy.deepcopy(self.optimizer.pareto_frontier)

        if self.optimizer_evaluator_config.report_pareto_volume_over_time:
            volume_estimator = self.optimizer.pareto_frontier.approximate_pareto_volume()
            ci99_on_volume = volume_estimator.get_two_sided_confidence_interval_on_pareto_volume(alpha=0.01)
            evaluation_report.pareto_volume_over_time[i] = ci99_on_volume

        if self.optimizer_evaluator_config.include_execution_trace_in_report:
            evaluation_report.execution_trace = mlos.global_values.tracer.trace_events
            mlos.global_values.tracer.clear_events()

        if self.optimizer_evaluator_config.include_pickled_optimizer_in_report:
            evaluation_report.add_pickled_optimizer(iteration=i, pickled_optimizer=pickle.dumps(self.optimizer))

        if self.optimizer_evaluator_config.include_pickled_objective_function_in_report:
            evaluation_report.pickled_objective_function_final_state = pickle.dumps(self.objective_function)

        if self.optimizer_evaluator_config.report_regression_model_goodness_of_fit:
            evaluation_report.regression_model_fit_state = multi_objective_regression_model_fit_state

        if self.optimizer_evaluator_config.report_optima_over_time:
            evaluation_report.optima_over_time = optima_over_time

        return evaluation_report
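
evaluate_optimizer records several optimum definitions over time; the simplest, BEST_OBSERVATION, only needs the best objective value seen so far at each recorded iteration. Below is a hedged sketch of that bookkeeping using a hypothetical BestObservationTracker helper (not the OptimumOverTime API); minimization of a single objective is assumed.

# A hedged sketch of best-observation-over-time tracking, in the spirit of
# OptimumDefinition.BEST_OBSERVATION above. BestObservationTracker is a hypothetical helper.
import math
import pandas as pd

class BestObservationTracker:
    def __init__(self):
        self.records = []            # (iteration, best value observed so far)
        self._best_value = math.inf  # minimization assumed

    def add_observation_at_iteration(self, iteration: int, value: float) -> None:
        self._best_value = min(self._best_value, value)
        self.records.append((iteration, self._best_value))

    def get_dataframe(self) -> pd.DataFrame:
        return pd.DataFrame(self.records, columns=["iteration", "best_observed_value"])

tracker = BestObservationTracker()
for i, objective_value in enumerate([3.0, 2.5, 2.7, 1.9, 2.1]):
    tracker.add_observation_at_iteration(iteration=i, value=objective_value)
print(tracker.get_dataframe().tail())
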
Example #5
    def run_iteration(self, worms: pd.DataFrame):

        with traced(scope_name="numpy_matrix_operations"):
            positions = worms[self.dimension_names].to_numpy()
            # At this point many glowworms will have NaNs in their position vectors: one for every column that's invalid.
            # Glowworms with the same set of valid columns belong to the same subgrid of the hypergrid, but glowworms
            # with different sets of valid columns belong to - essentially - different search spaces, and the idea of
            # distance between them ceases to make sense. Yet their positions are all cast onto this really high
            # dimensional space. How can we keep them apart?
            #
            # Here is the trick: glowworms should only see other glowworms in their own search space. One way to
            # accomplish that is to fill in the NaNs with a value larger than the max_sensory_radius. Now, glowworms
            # in different subgrids will never see each other (they can't see that far in this space), but glowworms in
            # the same subgrid will share the same large placeholder in their invalid dimensions, so it contributes
            # nothing to the distance between them.
            positions = np.nan_to_num(x=positions, copy=False, nan=2 * self.optimizer_config.max_sensory_radius)

            distances = euclidean_distances(positions, positions)
            decision_radii = worms['decision_radius'].to_numpy().transpose()

            # Subtract each worm's decision radius from its row. Anything in your row that's negative is your
            # neighbor (if they also have a higher luciferin level).
            #
            distances = (distances - decision_radii).transpose()

            # Now let's compute the difference in luciferin. Numpy's broadcasting is hard to read, but fast and convenient.
            # We are basically doing exactly the same thing with luciferin as with distances: whatever is left negative in
            # your row is your neighbor (if they are also close enough).
            #
            luciferin = worms['luciferin'].to_numpy()
            luciferin_diffs = luciferin[:, np.newaxis] - luciferin

            # So worms are neighbors if both signs are negative.
            #
            distances_signs = np.sign(distances)
            luciferin_signs = np.sign(luciferin_diffs)
            summed_signs = distances_signs + luciferin_signs

            # Now let's put together a matrix, such that in each row for each column we have either:
            #  0 - if the worm in that column is not a neighbor (too far or too dim)
            #  or luciferin difference between that neighbor and us.
            #
            unnormalized_probability = np.where(summed_signs == -2, -luciferin_diffs, 0)

        # We have to iterate over all rows anyway to invoke np.random.choice(), since it operates on 1-D arrays.
        # So we might as well iterate over the unnormalized probabilities, check if there is anything non-zero
        # in there, select the target, compute the step, and take it.
        #
        for row, unnormalized_probability_row in enumerate(unnormalized_probability):
            row_sum = unnormalized_probability_row.sum()
            num_neighbors = np.count_nonzero(unnormalized_probability_row)
            if row_sum == 0:
                # nobody is close enough and bright enough
                continue
            normalized_probability = unnormalized_probability_row / row_sum
            col = np.random.choice(len(normalized_probability),
                                   size=1,
                                   p=normalized_probability)[0]
            our_position = positions[row]
            their_position = positions[col]
            distance = distances[row][col]
            step_unit_vector = (their_position - our_position) / distance
            step = self.optimizer_config.step_size * step_unit_vector
            our_new_position = our_position + step

            # We only set the non-nan values in the worms dataframe. Remember the trick of setting nans to big values?
            # This is undoing that trick to hide it from the caller.
            # TODO: this ends up being pretty slow. See if we can improve.
            #
            is_nan = np.isnan(worms.loc[row, self.dimension_names].to_numpy())
            our_new_position[is_nan] = np.nan
            worms.loc[row, self.dimension_names] = our_new_position

            current_decision_radius = decision_radii[row]
            decision_radius_update = self.optimizer_config.decision_radius_adjustment_constant * \
                                     (self.optimizer_config.desired_num_neighbors - num_neighbors)

            new_decision_radius = min(
                self.optimizer_config.max_sensory_radius,
                max(0, current_decision_radius + decision_radius_update))
            worms.loc[row, ['decision_radius']] = new_decision_radius
        return worms
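
The comment in run_iteration above explains the NaN placeholder trick: filling invalid dimensions with a value beyond max_sensory_radius keeps worms from different subgrids out of each other's range while adding nothing to distances within a subgrid. Here is a small numeric check of that effect; the radius value is made up for illustration.

# A small numeric check of the NaN placeholder trick described in run_iteration.
# max_sensory_radius is a made-up value for illustration.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

max_sensory_radius = 10.0
placeholder = 2 * max_sensory_radius

# Worms A and B live in the same subgrid (dimension 2 invalid for both); worm C lives in a different one.
positions = np.array([
    [0.0, 1.0, np.nan],
    [0.5, 1.5, np.nan],
    [np.nan, 1.0, 3.0],
])
positions = np.nan_to_num(positions, nan=placeholder)

distances = euclidean_distances(positions, positions)
print(distances[0, 1])  # A-B: the shared placeholder cancels out, so this is just their in-subgrid distance (~0.707)
print(distances[0, 2])  # A-C: at least `placeholder` apart, far beyond max_sensory_radius, so they never see each other
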
Example #6
    def evaluate_optimizer(self) -> OptimizerEvaluationReport: # pylint: disable=too-many-statements
        evaluation_report = OptimizerEvaluationReport(
            optimizer_configuration=self.optimizer_config,
            objective_function_configuration=self.objective_function_config,
            num_optimization_iterations=self.optimizer_evaluator_config.num_iterations,
            evaluation_frequency=self.optimizer_evaluator_config.evaluation_frequency,
        )

        if self.optimizer_evaluator_config.include_execution_trace_in_report:
            mlos.global_values.declare_singletons()
            if mlos.global_values.tracer is None:
                mlos.global_values.tracer = Tracer()
            mlos.global_values.tracer.clear_events()


        if self.optimizer_evaluator_config.include_pickled_objective_function_in_report:
            evaluation_report.pickled_objective_function_initial_state = pickle.dumps(self.objective_function)

        if self.optimizer_evaluator_config.include_pickled_optimizer_in_report:
            evaluation_report.pickled_optimizer_initial_state = pickle.dumps(self.optimizer)

        regression_model_fit_state = RegressionModelFitState()

        optima_over_time = {}
        optima_over_time[OptimumDefinition.BEST_OBSERVATION.value] = OptimumOverTime(
            optimization_problem=self.optimizer.optimization_problem,
            optimum_definition=OptimumDefinition.BEST_OBSERVATION
        )

        optima_over_time[OptimumDefinition.PREDICTED_VALUE_FOR_OBSERVED_CONFIG.value] = OptimumOverTime(
            optimization_problem=self.optimizer.optimization_problem,
            optimum_definition=OptimumDefinition.PREDICTED_VALUE_FOR_OBSERVED_CONFIG
        )

        optima_over_time[f"{OptimumDefinition.UPPER_CONFIDENCE_BOUND_FOR_OBSERVED_CONFIG.value}_99"] = OptimumOverTime(
            optimization_problem=self.optimizer.optimization_problem,
            optimum_definition=OptimumDefinition.UPPER_CONFIDENCE_BOUND_FOR_OBSERVED_CONFIG,
            alpha=0.01
        )

        optima_over_time[f"{OptimumDefinition.LOWER_CONFIDENCE_BOUND_FOR_OBSERVED_CONFIG.value}_99"] = OptimumOverTime(
            optimization_problem=self.optimizer.optimization_problem,
            optimum_definition=OptimumDefinition.LOWER_CONFIDENCE_BOUND_FOR_OBSERVED_CONFIG,
            alpha=0.01
        )

        #####################################################################################################
        i = 0
        try:
            with traced(scope_name="optimization_loop"):
                for i in range(self.optimizer_evaluator_config.num_iterations):
                    parameters = self.optimizer.suggest()
                    objectives = self.objective_function.evaluate_point(parameters)
                    self.optimizer.register(parameters.to_dataframe(), objectives.to_dataframe())

                    if i % self.optimizer_evaluator_config.evaluation_frequency == 0:
                        print(f"[{i + 1}/{self.optimizer_evaluator_config.num_iterations}]")
                        with traced(scope_name="evaluating_optimizer"):
                            if self.optimizer_evaluator_config.include_pickled_optimizer_in_report:
                                evaluation_report.add_pickled_optimizer(iteration=i, pickled_optimizer=pickle.dumps(self.optimizer))

                            if self.optimizer.trained:
                                gof_metrics = self.optimizer.compute_surrogate_model_goodness_of_fit()
                                regression_model_fit_state.set_gof_metrics(data_set_type=DataSetType.TRAIN, gof_metrics=gof_metrics)

                            for optimum_name, optimum_over_time in optima_over_time.items():
                                try:
                                    config, value = self.optimizer.optimum(
                                        optimum_definition=optimum_over_time.optimum_definition,
                                        alpha=optimum_over_time.alpha
                                    )
                                    optima_over_time[optimum_name].add_optimum_at_iteration(
                                        iteration=i,
                                        optimum_config=config,
                                        optimum_value=value
                                    )
                                except ValueError as e:
                                    print(e)
                evaluation_report.success = True

        except Exception as e:
            evaluation_report.success = False
            evaluation_report.exception = e
            evaluation_report.exception_traceback = traceback.format_exc()

        with traced(scope_name="evaluating_optimizer"):
            """Once the optimization is done, we performa final evaluation of the optimizer."""

            if self.optimizer.trained:
                gof_metrics = self.optimizer.compute_surrogate_model_goodness_of_fit()
                regression_model_fit_state.set_gof_metrics(data_set_type=DataSetType.TRAIN, gof_metrics=gof_metrics)

            for optimum_name, optimum_over_time in optima_over_time.items():
                try:
                    config, value = self.optimizer.optimum(optimum_definition=optimum_over_time.optimum_definition, alpha=optimum_over_time.alpha)
                    optima_over_time[optimum_name].add_optimum_at_iteration(
                        iteration=self.optimizer_evaluator_config.num_iterations,
                        optimum_config=config,
                        optimum_value=value
                    )
                except Exception as e:
                    print(e)

        if self.optimizer_evaluator_config.include_execution_trace_in_report:
            evaluation_report.execution_trace = mlos.global_values.tracer.trace_events
            mlos.global_values.tracer.clear_events()

        if self.optimizer_evaluator_config.include_pickled_optimizer_in_report:
            evaluation_report.add_pickled_optimizer(iteration=i, pickled_optimizer=pickle.dumps(self.optimizer))

        if self.optimizer_evaluator_config.include_pickled_objective_function_in_report:
            evaluation_report.pickled_objective_function_final_state = pickle.dumps(self.objective_function)

        if self.optimizer_evaluator_config.report_regression_model_goodness_of_fit:
            evaluation_report.regression_model_goodness_of_fit_state = regression_model_fit_state

        if self.optimizer_evaluator_config.report_optima_over_time:
            evaluation_report.optima_over_time = optima_over_time

        return evaluation_report
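
Both versions of evaluate_optimizer snapshot the optimizer and objective function with pickle.dumps so the report can be inspected or replayed later. The sketch below shows that snapshot / restore round trip with a hypothetical DummyObjectiveFunction standing in for the real objects; it is not an MLOS class.

# A minimal sketch of the pickle snapshot / restore round trip behind the
# pickled_*_initial_state and pickled_*_final_state fields above.
import pickle

class DummyObjectiveFunction:
    def __init__(self):
        self.num_evaluations = 0

    def evaluate(self, x: float) -> float:
        self.num_evaluations += 1
        return x * x

objective_function = DummyObjectiveFunction()
pickled_initial_state = pickle.dumps(objective_function)   # snapshot before the run

objective_function.evaluate(2.0)
pickled_final_state = pickle.dumps(objective_function)     # snapshot after the run

# Later, restore either snapshot for inspection.
restored = pickle.loads(pickled_final_state)
assert restored.num_evaluations == 1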