Example #1
0
def test_logger_title(capsys, caplog, logger_env_cleanup):
    """Check that log_title and log_subtitle emit their text to the captured log."""
    test_logger = get_logger(TEST_LOGGER_NAME)

    log_title(test_logger, "Log title")
    assert "Log title" in caplog.text
    caplog.clear()

    log_subtitle(test_logger, "Log subtitle")
    assert "Log subtitle" in caplog.text
Example #2
0
    def search(self, data_checks="auto", show_iteration_plot=True):
        """Find the best pipeline for the data set.

        Runs the configured data checks, then iterates batches from the AutoML
        algorithm, evaluating each batch with the engine until the stopping
        criteria (max batches/iterations/time) are met or the algorithm runs
        out of recommendations. Sets `self._searched` when complete.

        Arguments:
            data_checks (DataChecks, list(Datacheck), str, None): A collection of data checks to run before
                automl search. If data checks produce any errors, an exception will be thrown before the
                search begins. If "disabled" or None, no data checks will be done.
                If set to "auto", DefaultDataChecks will be done. Default value is set to "auto".

            show_iteration_plot (boolean, True): Shows an iteration vs. score plot in Jupyter notebook.
                Disabled by default in non-Jupyter environments.

        Raises:
            ValueError: If the data checks produce any errors.
            AutoMLSearchException: If every pipeline in a batch scores np.nan
                on the primary objective.
        """
        # Guard against re-running search on the same instance; state from a
        # prior search is not reset, so a second run is refused.
        if self._searched:
            logger.info(
                "AutoMLSearch.search() has already been run and will not run again on the same instance. Re-initialize AutoMLSearch to search again."
            )
            return

        # don't show iteration plot outside of a jupyter notebook
        # (`get_ipython` only exists inside IPython/Jupyter; elsewhere the bare
        # name raises NameError)
        if show_iteration_plot:
            try:
                get_ipython
            except NameError:
                show_iteration_plot = False

        # Resolve the data_checks argument ("auto"/"disabled"/instances) and
        # validate the training data; warnings are logged, errors abort search.
        data_checks = self._validate_data_checks(data_checks)
        self._data_check_results = data_checks.validate(
            _convert_woodwork_types_wrapper(self.X_train.to_dataframe()),
            _convert_woodwork_types_wrapper(self.y_train.to_series()))
        for result in self._data_check_results["warnings"]:
            logger.warning(result["message"])
        for result in self._data_check_results["errors"]:
            logger.error(result["message"])
        if self._data_check_results["errors"]:
            raise ValueError(
                "Data checks raised some warnings and/or errors. Please see `self.data_check_results` for more information or pass data_checks='disabled' to search() to disable data checking."
            )

        # Log the search configuration before starting.
        log_title(logger, "Beginning pipeline search")
        logger.info("Optimizing for %s. " % self.objective.name)
        logger.info("{} score is better.\n".format(
            'Greater' if self.objective.greater_is_better else 'Lower'))
        logger.info(
            f"Using {self._engine.__class__.__name__} to train and score pipelines."
        )

        if self.max_batches is not None:
            logger.info(
                f"Searching up to {self.max_batches} batches for a total of {self.max_iterations} pipelines. "
            )
        elif self.max_iterations is not None:
            logger.info("Searching up to %s pipelines. " % self.max_iterations)
        if self.max_time is not None:
            logger.info(
                "Will stop searching for new pipelines after %d seconds.\n" %
                self.max_time)
        logger.info(
            "Allowed model families: %s\n" %
            ", ".join([model.value for model in self.allowed_model_families]))
        self.search_iteration_plot = None
        if self.plot:
            self.search_iteration_plot = self.plot.search_iteration_plot(
                interactive_plot=show_iteration_plot)

        self._start = time.time()

        # Baseline pipelines are evaluated first so later results have a
        # reference score; a Ctrl-C here may either stop or continue the search
        # depending on the user's answer in _handle_keyboard_interrupt().
        try:
            self._add_baseline_pipelines()
        except KeyboardInterrupt:
            if self._handle_keyboard_interrupt():
                self._interrupted = True

        current_batch_pipelines = []
        current_batch_pipeline_scores = []
        new_pipeline_ids = []
        # When an evaluation is interrupted but the user chooses to continue,
        # loop_interrupted keeps the current batch so it is retried rather than
        # skipped.
        loop_interrupted = False
        while self._should_continue():
            try:
                if not loop_interrupted:
                    current_batch_pipelines = self._automl_algorithm.next_batch(
                    )
            except StopIteration:
                logger.info('AutoML Algorithm out of recommendations, ending')
                break
            try:
                new_pipeline_ids = self._engine.evaluate_batch(
                    current_batch_pipelines)
                loop_interrupted = False
            except KeyboardInterrupt:
                loop_interrupted = True
                if self._handle_keyboard_interrupt():
                    break
            # Check the scores of just-evaluated pipelines; if an entire batch
            # produced NaN scores, something is systematically wrong.
            full_rankings = self.full_rankings
            current_batch_idx = full_rankings['id'].isin(new_pipeline_ids)
            current_batch_pipeline_scores = full_rankings[current_batch_idx][
                'score']
            if len(current_batch_pipeline_scores
                   ) and current_batch_pipeline_scores.isna().all():
                raise AutoMLSearchException(
                    f"All pipelines in the current AutoML batch produced a score of np.nan on the primary objective {self.objective}."
                )

        self.search_duration = time.time() - self._start
        elapsed_time = time_elapsed(self._start)
        desc = f"\nSearch finished after {elapsed_time}"
        desc = desc.ljust(self._MAX_NAME_LEN)
        logger.info(desc)

        # Select and log the best pipeline found (if any scored successfully).
        self._find_best_pipeline()
        if self._best_pipeline is not None:
            best_pipeline = self.rankings.iloc[0]
            best_pipeline_name = best_pipeline["pipeline_name"]
            logger.info(f"Best pipeline: {best_pipeline_name}")
            logger.info(
                f"Best pipeline {self.objective.name}: {best_pipeline['score']:3f}"
            )
        self._searched = True
Example #3
0
    def search(self, show_iteration_plot=True):
        """Find the best pipeline for the data set.

        Iterates batches from the AutoML algorithm, submitting each pipeline to
        the engine as an asynchronous evaluation job and polling until the batch
        completes or the stopping criteria (max batches/iterations/time) are
        met. Sets `self._searched` when complete.

        Arguments:
            show_iteration_plot (boolean, True): Shows an iteration vs. score plot in Jupyter notebook.
                Disabled by default in non-Jupyter environments.

        Raises:
            AutoMLSearchException: If every pipeline in a batch scores np.nan
                on the primary objective.
        """
        # Guard against re-running search on the same instance; state from a
        # prior search is not reset, so a second run is refused.
        if self._searched:
            logger.info("AutoMLSearch.search() has already been run and will not run again on the same instance. Re-initialize AutoMLSearch to search again.")
            return

        # don't show iteration plot outside of a jupyter notebook
        # (`get_ipython` only exists inside IPython/Jupyter; elsewhere the bare
        # name raises NameError)
        if show_iteration_plot:
            try:
                get_ipython
            except NameError:
                show_iteration_plot = False

        # Log the search configuration before starting.
        log_title(logger, "Beginning pipeline search")
        logger.info("Optimizing for %s. " % self.objective.name)
        logger.info("{} score is better.\n".format('Greater' if self.objective.greater_is_better else 'Lower'))
        logger.info(f"Using {self._engine.__class__.__name__} to train and score pipelines.")

        if self.max_batches is not None:
            logger.info(f"Searching up to {self.max_batches} batches for a total of {self.max_iterations} pipelines. ")
        elif self.max_iterations is not None:
            logger.info("Searching up to %s pipelines. " % self.max_iterations)
        if self.max_time is not None:
            logger.info("Will stop searching for new pipelines after %d seconds.\n" % self.max_time)
        logger.info("Allowed model families: %s\n" % ", ".join([model.value for model in self.allowed_model_families]))
        self.search_iteration_plot = None
        if self.plot:
            self.search_iteration_plot = self.plot.search_iteration_plot(interactive_plot=show_iteration_plot)

        self._start = time.time()

        # Baseline pipelines are evaluated first so later results have a
        # reference score; a Ctrl-C here may either stop or continue the search
        # depending on the user's answer in _handle_keyboard_interrupt().
        try:
            self._add_baseline_pipelines()
        except KeyboardInterrupt:
            if self._handle_keyboard_interrupt():
                self._interrupted = True

        current_batch_pipelines = []
        current_batch_pipeline_scores = []
        new_pipeline_ids = []
        # When a batch is interrupted but the user chooses to continue,
        # loop_interrupted keeps the current batch so it is retried rather than
        # skipped.
        loop_interrupted = False
        while self._should_continue():
            # In-flight engine jobs for the current batch.
            computations = []
            try:
                if not loop_interrupted:
                    current_batch_pipelines = self._automl_algorithm.next_batch()
            except StopIteration:
                logger.info('AutoML Algorithm out of recommendations, ending')
                break
            try:
                new_pipeline_ids = []
                log_title(logger, f"Evaluating Batch Number {self._get_batch_number()}")
                # Submit every pipeline in the batch to the engine up front,
                # then poll the outstanding computations round-robin.
                for pipeline in current_batch_pipelines:
                    self._pre_evaluation_callback(pipeline)
                    computation = self._engine.submit_evaluation_job(self.automl_config, pipeline, self.X_train, self.y_train)
                    computations.append(computation)
                current_computation_index = 0
                while self._should_continue() and len(computations) > 0:
                    computation = computations[current_computation_index]
                    if computation.done():
                        evaluation = computation.get_result()
                        data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger")
                        pipeline_id = self._post_evaluation_callback(pipeline, data, job_log)
                        new_pipeline_ids.append(pipeline_id)
                        computations.pop(current_computation_index)
                    # max(..., 1) avoids a modulo-by-zero once the last
                    # computation has been popped.
                    current_computation_index = (current_computation_index + 1) % max(len(computations), 1)
                    # Small sleep to avoid busy-waiting while jobs run.
                    time.sleep(0.1)
                loop_interrupted = False
            except KeyboardInterrupt:
                loop_interrupted = True
                if self._handle_keyboard_interrupt():
                    # User chose to stop: cancel all outstanding engine jobs.
                    self._interrupted = True
                    for computation in computations:
                        computation.cancel()

            # Check the scores of just-evaluated pipelines; if an entire batch
            # produced NaN scores, something is systematically wrong.
            full_rankings = self.full_rankings
            current_batch_idx = full_rankings['id'].isin(new_pipeline_ids)
            current_batch_pipeline_scores = full_rankings[current_batch_idx]['score']
            if len(current_batch_pipeline_scores) and current_batch_pipeline_scores.isna().all():
                raise AutoMLSearchException(f"All pipelines in the current AutoML batch produced a score of np.nan on the primary objective {self.objective}.")

        self.search_duration = time.time() - self._start
        elapsed_time = time_elapsed(self._start)
        desc = f"\nSearch finished after {elapsed_time}"
        desc = desc.ljust(self._MAX_NAME_LEN)
        logger.info(desc)

        # Select and log the best pipeline found (if any scored successfully).
        self._find_best_pipeline()
        if self._best_pipeline is not None:
            best_pipeline = self.rankings.iloc[0]
            best_pipeline_name = best_pipeline["pipeline_name"]
            logger.info(f"Best pipeline: {best_pipeline_name}")
            logger.info(f"Best pipeline {self.objective.name}: {best_pipeline['score']:3f}")
        self._searched = True