Python pset_from_config 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gama.configuration.parser

메소드/함수: pset_from_config

hotexamples.com에서의 예제들: 4

Python pset_from_config - 예제 4개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 gama.configuration.parser.pset_from_config(Python)의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

    def __init__(
        self,
        scoring: Union[str, Metric, Iterable[str],
                       Iterable[Metric]] = "filled_in_by_child_class",
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Dict = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        max_memory_mb: Optional[int] = None,
        verbosity: int = logging.WARNING,
        search: BaseSearch = AsyncEA(),
        post_processing: BasePostProcessing = BestFitPostProcessing(),
        output_directory: Optional[str] = None,
        store: str = "logs",
    ):
        """Prepare the output directory, logging, resource limits and search operators.

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the/all metric(s) to optimize towards.
            A string will be converted to Metric.
            A tuple must specify each metric with the same type (e.g. all str).
            See :ref:`Metrics` for built-in metrics.

        regularize_length: bool (default=True)
            If True, add pipeline length as an optimization metric.
            Short pipelines should then be preferred over long ones.

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline.
            Encoding and imputation are excluded.

        config: Dict
            Specifies available components and their valid hyperparameter settings.
            For more information, see :ref:`search_space_configuration`.

        random_state:  int, optional (default=None)
            Seed for the random number generators used in the process.
            However, with `n_jobs > 1`,
            there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any one single individual.
            If None, set to 0.1 * max_total_time (rounded, but never below 1).
            Values larger than `max_total_time` are reduced to `max_total_time`.

        n_jobs: int, optional (default=None)
            The amount of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes
            are created, with a minimum of one.

        max_memory_mb: int, optional (default=None)
            Sets the total amount of memory GAMA is allowed to use (in megabytes).
            If not set, GAMA will use as much as it needs.
            GAMA is not guaranteed to respect this limit at all times,
            but it should never violate it for too long.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        search: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.
            The default instance is created once, when the method is defined,
            and is shared by every call that relies on the default.

        post_processing: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase.
            Should be an instantiated subclass of BasePostProcessing.
            The default instance is created once, when the method is defined,
            and is shared by every call that relies on the default.

        output_directory: str, optional (default=None)
            Directory to use to save GAMA output. This includes both intermediate
            results during search and logs. Missing parent directories are created.
            If set to None, generate a unique name ("gama_HEXCODE").

        store: str (default='logs')
            Determines which data is stored after each run:
             - 'nothing': keep nothing from this run
             - 'models': keep only cache with models and predictions
             - 'logs': keep only the logs
             - 'all': keep logs and cache with models and predictions

        Raises
        ------
        ValueError
            If `max_total_time`, `max_eval_time` or `n_jobs` is out of range, or if
            `max_pipeline_length` is greater than 1 while the primitive set built
            from `config` does not contain `DATA_TERMINAL`.
        """
        # NOTE: do not introduce new local variables above the `locals()` call
        # below; every local that exists at that point is written to the log.
        if not output_directory:
            output_directory = f"gama_{str(uuid.uuid4())}"
        self.output_directory = os.path.abspath(
            os.path.expanduser(output_directory))
        if not os.path.exists(self.output_directory):
            # makedirs (rather than mkdir) so a nested output path whose parent
            # directories do not exist yet is still created.
            os.makedirs(self.output_directory, exist_ok=True)

        register_stream_log(verbosity)
        if store in ["logs", "all"]:
            log_file = os.path.join(self.output_directory, "gama.log")
            log_handler = logging.FileHandler(log_file)
            log_handler.setLevel(logging.DEBUG)
            log_format = logging.Formatter(
                "[%(asctime)s - %(name)s] %(message)s")
            log_handler.setFormatter(log_format)
            logging.getLogger("gama").addHandler(log_handler)

        # Log the call arguments; the logging helpers created above and the
        # (potentially large) config are left out.
        arguments = ",".join([
            f"{k}={v}" for (k, v) in locals().items() if k not in
            ["self", "config", "log_file", "log_handler", "log_format"]
        ])
        log.info(f"Using GAMA version {__version__}.")
        log.info(f"INIT:{self.__class__.__name__}({arguments})")

        if n_jobs is None:
            # Floor division gives 0 on a single-core machine, which the
            # validation below would reject; always keep at least one worker.
            n_jobs = max(1, multiprocessing.cpu_count() // 2)
            log.debug("n_jobs defaulted to %d.", n_jobs)
        elif n_jobs == -1:
            n_jobs = multiprocessing.cpu_count()
            log.debug("n_jobs set to use all %d cores.", n_jobs)

        # Only the last failing check is reported.
        err = ""
        if max_total_time is None or max_total_time <= 0:
            err = f"Expect positive int for max_total_time, got {max_total_time}."
        if max_eval_time is not None and max_eval_time <= 0:
            err = f"Expect None or positive int for max_eval_time, got {max_eval_time}."
        if n_jobs < -1 or n_jobs == 0:
            err = f"n_jobs should be -1 or positive int but is {n_jobs}."
        if err:
            self.cleanup("all")
            raise ValueError(err)

        # Rebind AsyncEvaluator.__init__ on the class itself, so evaluators
        # constructed afterwards get these values pre-filled. This is class-wide.
        setattr(
            AsyncEvaluator,
            "__init__",
            partialmethod(
                AsyncEvaluator.__init__,
                n_workers=n_jobs,
                memory_limit_mb=max_memory_mb,
                logfile=os.path.join(self.output_directory, "memory.log"),
            ),
        )

        if max_eval_time is None:
            # round() alone yields 0 for a small max_total_time, which is not a
            # usable evaluation budget (an explicit 0 is rejected above).
            max_eval_time = max(1, round(0.1 * max_total_time))
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) "
                f"is not allowed. max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search
        self._post_processing = post_processing
        self._store = store

        if random_state is not None:
            # Seeds the global `random` and NumPy generators.
            random.seed(random_state)
            np.random.seed(random_state)

        self._x: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop: List[Individual] = []

        self._subscribers: Dict[str, List[Callable]] = defaultdict(list)
        cache_directory = os.path.join(self.output_directory, "cache")
        if isinstance(post_processing, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing.hyperparameters["max_models"],
                n=post_processing.hyperparameters["hillclimb_size"],
                cache=cache_directory,
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0,
                                                         cache=cache_directory)
        # Pass the library and the search's evaluation logger to
        # `evaluation_completed` (presumably registering them as callbacks).
        self.evaluation_completed(self._evaluation_library.save_evaluation)
        e = search.logger(
            os.path.join(self.output_directory, "evaluations.log"))
        self.evaluation_completed(e.log_evaluation)

        self._pset, parameter_checks = pset_from_config(config)

        if DATA_TERMINAL not in self._pset:
            if max_pipeline_length is None:
                log.info(
                    "Setting `max_pipeline_length` to 1 "
                    "because there are no preprocessing steps in the search space."
                )
                max_pipeline_length = 1
            elif max_pipeline_length > 1:
                raise ValueError(
                    f"`max_pipeline_length` can't be {max_pipeline_length} "
                    "because there are no preprocessing steps in the search space."
                )
        # Newly created expressions get at most 3 steps unless a limit was given.
        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(
                random_valid_mutation_in_place,
                primitive_set=self._pset,
                max_length=max_pipeline_length,
            ),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(
                create_random_expression,
                primitive_set=self._pset,
                max_length=max_start_length,
            ),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed,
            completed_evaluations=self._evaluation_library.lookup,
        )

예제 #2

파일 보기

파일: GamaReport.py 프로젝트: prabhant/gama

import os
from datetime import datetime, timedelta
from typing import List, Tuple, Dict

import pandas as pd

from gama.configuration.classification import clf_config
from gama.configuration.parser import pset_from_config, merge_configurations
from gama.configuration.regression import reg_config
from gama.genetic_programming.components import Individual

# Build the primitive set once, at import time, from the merged classification
# and regression configurations; the second return value is not used here.
pset, _ = pset_from_config(merge_configurations(clf_config, reg_config))


class GamaReport:
    """Contains information parsed from a search captured by a GAMA analysis log."""

    def __init__(self, log_directory: str):
        """Remember the log directory and initialise empty report state.

        Parameters
        ----------
        log_directory: str
            The directory with logs:
                - gama.log
                - evaluations.log
                - resources.log
            A leading `~` is expanded for the stored directory path.
        """
        self._log_directory = os.path.expanduser(log_directory)
        # os.path.split returns an empty tail when the path ends with a
        # separator (e.g. "logs/run1/"); fall back to the last component of the
        # head so the report still gets a meaningful name.
        head, tail = os.path.split(log_directory)
        self.name = tail or os.path.basename(head)
        self.phases: List[Tuple[str, str, datetime, float]] = []
        self._last_tell = 0

예제 #3

파일 보기

파일: gama.py 프로젝트: vumichien/gama

    def __init__(
        self,
        scoring: Union[str, Metric, Iterable[str],
                       Iterable[Metric]] = "filled_in_by_child_class",
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Dict = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        verbosity: int = logging.WARNING,
        keep_analysis_log: Optional[str] = "gama.log",
        search_method: BaseSearch = AsyncEA(),
        post_processing_method: BasePostProcessing = BestFitPostProcessing(),
        cache: Optional[str] = None,
    ):
        """Configure logging, validate resource limits and build the search operators.

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the/all metric(s) to optimize towards.
            A string will be converted to Metric.
            A tuple must specify each metric with the same type (e.g. all str).
            See :ref:`Metrics` for built-in metrics.

        regularize_length: bool (default=True)
            If True, add pipeline length as an optimization metric.
            Short pipelines should then be preferred over long ones.

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline.
            Encoding and imputation are excluded.

        config: Dict
            Specifies available components and their valid hyperparameter settings.
            For more information, see :ref:`search_space_configuration`.

        random_state:  int, optional (default=None)
            Seed for the random number generators used in the process.
            However, with `n_jobs > 1`,
            there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any one single individual.
            If None, set to 0.1 * max_total_time (rounded, but never below 1).
            Values larger than `max_total_time` are reduced to `max_total_time`.

        n_jobs: int, optional (default=None)
            The amount of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes
            are created, with a minimum of one.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        keep_analysis_log: str, optional (default='gama.log')
            If non-empty str, specify filepath where the log should be stored.
            If `None`, no log is stored.

        search_method: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.
            The default instance is created once, when the method is defined,
            and is shared by every call that relies on the default.

        post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase.
            Should be an instantiated subclass of BasePostProcessing.
            The default instance is created once, when the method is defined,
            and is shared by every call that relies on the default.

        cache: str, optional (default=None)
            Directory to use to save intermediate results during search.
            If set to None, generate a unique cache name.

        Raises
        ------
        ValueError
            If `max_total_time`, `max_eval_time` or `n_jobs` is out of range.
        """
        # NOTE: do not introduce new local variables above the `locals()` call
        # below; every local that exists at that point is written to the log.
        register_stream_log(verbosity)
        if keep_analysis_log is not None:
            register_file_log(keep_analysis_log)
            # Made absolute only after registering, so the logged arguments
            # show the full path.
            if not os.path.isabs(keep_analysis_log):
                keep_analysis_log = os.path.abspath(keep_analysis_log)

        # Log the call arguments; the (potentially large) config is left out.
        arguments = ",".join([
            f"{k}={v}" for (k, v) in locals().items()
            if k not in ["self", "config"]
        ])
        log.info(f"Using GAMA version {__version__}.")
        log.info(f"{self.__class__.__name__}({arguments})")
        log_event(log, TOKENS.INIT, arguments)

        if n_jobs is None:
            # Floor division gives 0 on a single-core machine, which the
            # validation below would reject; always keep at least one worker.
            n_jobs = max(1, multiprocessing.cpu_count() // 2)
            log.debug("n_jobs defaulted to %d", n_jobs)

        if max_total_time is None or max_total_time <= 0:
            raise ValueError(
                f"Expect positive int for max_total_time, got {max_total_time}."
            )
        if max_eval_time is not None and max_eval_time <= 0:
            raise ValueError(
                f"Expect None or positive int for max_eval_time, got {max_eval_time}."
            )
        if n_jobs < -1 or n_jobs == 0:
            raise ValueError(
                f"n_jobs should be -1 or positive int but is {n_jobs}.")
        # Set on the class itself, so this applies to every AsyncEvaluator.
        AsyncEvaluator.n_jobs = n_jobs

        if max_eval_time is None:
            # round() alone yields 0 for a small max_total_time, which is not a
            # usable evaluation budget (an explicit 0 is rejected above).
            max_eval_time = max(1, round(0.1 * max_total_time))
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) "
                f"is not allowed. max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search_method
        self._post_processing = post_processing_method

        if random_state is not None:
            # Seeds the global `random` and NumPy generators.
            random.seed(random_state)
            np.random.seed(random_state)

        self._x: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop: List[Individual] = []

        self._subscribers: Dict[str, List[Callable]] = defaultdict(list)
        if not cache:
            cache = f"cache_{str(uuid.uuid4())}"
        if isinstance(post_processing_method, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing_method.hyperparameters["max_models"],
                n=post_processing_method.hyperparameters["hillclimb_size"],
                cache_directory=cache,
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0,
                                                         cache_directory=cache)
        # Pass the library's save hook to `evaluation_completed`
        # (presumably registering it as a callback).
        self.evaluation_completed(self._evaluation_library.save_evaluation)

        self._pset, parameter_checks = pset_from_config(config)

        # Newly created expressions get at most 3 steps unless a limit was given.
        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(
                random_valid_mutation_in_place,
                primitive_set=self._pset,
                max_length=max_pipeline_length,
            ),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(
                create_random_expression,
                primitive_set=self._pset,
                max_length=max_start_length,
            ),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed,
            completed_evaluations=self._evaluation_library.lookup,
        )

예제 #4

파일 보기

    def __init__(
        self,
        scoring: Union[str, Metric, Tuple[Union[str, Metric],
                                          ...]] = 'filled_in_by_child_class',
        regularize_length: bool = True,
        max_pipeline_length: Optional[int] = None,
        config: Dict = None,
        random_state: Optional[int] = None,
        max_total_time: int = 3600,
        max_eval_time: Optional[int] = None,
        n_jobs: Optional[int] = None,
        verbosity: int = logging.WARNING,
        keep_analysis_log: Optional[str] = 'gama.log',
        search_method: BaseSearch = AsyncEA(),
        post_processing_method: BasePostProcessing = BestFitPostProcessing()):
        """Configure logging, validate resource limits and build the search operators.

        Parameters
        ----------
        scoring: str, Metric or Tuple
            Specifies the/all metric(s) to optimize towards. A string will be converted to Metric. A tuple must
            specify each metric with the same type (i.e. all str or all Metric). See :ref:`Metrics` for built-in
            metrics.

        regularize_length: bool
            If True, add pipeline length as an optimization metric (preferring short over long).

        max_pipeline_length: int, optional (default=None)
            If set, limit the maximum number of steps in any evaluated pipeline. Encoding and imputation are excluded.

        config: a dictionary which specifies available components and their valid hyperparameter settings
            For more information, see :ref:`search_space_configuration`.

        random_state:  int, optional (default=None)
            If an integer is passed, this will be the seed for the random number generators used in the process.
            However, with `n_jobs > 1`, there will be randomization introduced by multi-processing.
            For reproducible results, set this and use `n_jobs=1`.

        max_total_time: positive int (default=3600)
            Time in seconds that can be used for the `fit` call.

        max_eval_time: positive int, optional (default=None)
            Time in seconds that can be used to evaluate any one single individual.
            If None, set to 0.1 * max_total_time.
            Values larger than `max_total_time` are reduced to `max_total_time`.

        n_jobs: int, optional (default=None)
            The amount of parallel processes that may be created to speed up `fit`.
            Accepted values are positive integers, -1 or None.
            If -1 is specified, multiprocessing.cpu_count() processes are created.
            If None is specified, multiprocessing.cpu_count() // 2 processes are created, with a minimum of one.

        verbosity: int (default=logging.WARNING)
            Sets the level of log messages to be automatically output to terminal.

        keep_analysis_log: str, optional (default='gama.log')
            If non-empty str, specifies the path (and name) where the log should be stored, e.g. /output/gama.log.
            If `None`, no log is stored.

        search_method: BaseSearch (default=AsyncEA())
            Search method to use to find good pipelines. Should be instantiated.
            The default instance is created once, when the method is defined, and is shared by every call that
            relies on the default.

        post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
            Post-processing method to create a model after the search phase. Should be instantiated.
            The default instance is created once, when the method is defined, and is shared by every call that
            relies on the default.

        Raises
        ------
        ValueError
            If `max_total_time`, `max_eval_time` or `n_jobs` is out of range.
        """
        # NOTE: do not introduce new local variables above the `locals()` call
        # below; every local that exists at that point is written to the log.
        register_stream_log(verbosity)
        if keep_analysis_log is not None:
            register_file_log(keep_analysis_log)
            # Made absolute only after registering, so the logged arguments
            # show the full path.
            if not os.path.isabs(keep_analysis_log):
                keep_analysis_log = os.path.abspath(keep_analysis_log)

        # Log the call arguments without `self` and the (potentially large)
        # config. The last three excluded names are not locals of this method,
        # so they never match anything.
        arguments = ','.join([
            f'{k}={v}' for (k, v) in locals().items() if k not in [
                'self', 'config', 'gamalog', 'file_handler',
                'stdout_streamhandler'
            ]
        ])
        log.info(f'Using GAMA version {__version__}.')
        log.info(f'{self.__class__.__name__}({arguments})')
        log_event(log, TOKENS.INIT, arguments)

        if n_jobs is None:
            # Floor division gives 0 on a single-core machine, which the
            # validation below would reject; always keep at least one worker.
            n_jobs = max(1, multiprocessing.cpu_count() // 2)
            log.debug('n_jobs defaulted to %d', n_jobs)

        if max_total_time is None or max_total_time <= 0:
            raise ValueError(
                f"max_total_time should be integer greater than zero but is {max_total_time}."
            )
        if max_eval_time is not None and max_eval_time <= 0:
            raise ValueError(
                f"max_eval_time should be None or integer greater than zero but is {max_eval_time}."
            )
        if n_jobs < -1 or n_jobs == 0:
            raise ValueError(
                f"n_jobs should be -1 or positive integer but is {n_jobs}.")
        elif n_jobs != -1:
            # AsyncExecutor defaults to using multiprocessing.cpu_count(), i.e. n_jobs=-1
            # Set on the class itself, so this applies to every AsyncEvaluator.
            AsyncEvaluator.n_jobs = n_jobs

        if max_eval_time is None:
            # Note: the product is a float, so the default is not an int.
            max_eval_time = 0.1 * max_total_time
        if max_eval_time > max_total_time:
            log.warning(
                f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) is not allowed. "
                f"max_eval_time set to {max_total_time}.")
            max_eval_time = max_total_time

        self._max_eval_time = max_eval_time
        self._time_manager = TimeKeeper(max_total_time)
        self._metrics: Tuple[Metric] = scoring_to_metric(scoring)
        self._regularize_length = regularize_length
        self._search_method: BaseSearch = search_method
        self._post_processing = post_processing_method

        if random_state is not None:
            # Seeds the global `random` and NumPy generators.
            random.seed(random_state)
            np.random.seed(random_state)

        self._X: Optional[pd.DataFrame] = None
        self._y: Optional[pd.DataFrame] = None
        self._basic_encoding_pipeline: Optional[Pipeline] = None
        self._inferred_dtypes: List[Type] = []
        self.model: object = None
        self._final_pop = None

        self._subscribers = defaultdict(list)
        if isinstance(post_processing_method, EnsemblePostProcessing):
            self._evaluation_library = EvaluationLibrary(
                m=post_processing_method.hyperparameters['max_models'],
                n=post_processing_method.hyperparameters['hillclimb_size'],
            )
        else:
            # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
            self._evaluation_library = EvaluationLibrary(m=0)
        # Pass the library's save hook to `evaluation_completed`
        # (presumably registering it as a callback).
        self.evaluation_completed(self._evaluation_library.save_evaluation)

        self._pset, parameter_checks = pset_from_config(config)

        # Newly created expressions get at most 3 steps unless a limit was given.
        max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
        self._operator_set = OperatorSet(
            mutate=partial(random_valid_mutation_in_place,
                           primitive_set=self._pset,
                           max_length=max_pipeline_length),
            mate=partial(random_crossover, max_length=max_pipeline_length),
            create_from_population=partial(create_from_population,
                                           cxpb=0.2,
                                           mutpb=0.8),
            create_new=partial(create_random_expression,
                               primitive_set=self._pset,
                               max_length=max_start_length),
            compile_=compile_individual,
            eliminate=eliminate_from_pareto,
            evaluate_callback=self._on_evaluation_completed)