def __init__(
    self,
    scoring: Union[
        str, Metric, Iterable[str], Iterable[Metric]
    ] = "filled_in_by_child_class",
    regularize_length: bool = True,
    max_pipeline_length: Optional[int] = None,
    config: Optional[Dict] = None,
    random_state: Optional[int] = None,
    max_total_time: int = 3600,
    max_eval_time: Optional[int] = None,
    n_jobs: Optional[int] = None,
    max_memory_mb: Optional[int] = None,
    verbosity: int = logging.WARNING,
    search: BaseSearch = AsyncEA(),
    post_processing: BasePostProcessing = BestFitPostProcessing(),
    output_directory: Optional[str] = None,
    store: str = "logs",
):
    """Set up logging, validate the settings and build the search machinery.

    Parameters
    ----------
    scoring: str, Metric or Tuple
        Specifies the/all metric(s) to optimize towards.
        A string will be converted to Metric.
        A tuple must specify each metric with the same type (e.g. all str).
        See :ref:`Metrics` for built-in metrics.

    regularize_length: bool (default=True)
        If True, add pipeline length as an optimization metric.
        Short pipelines should then be preferred over long ones.

    max_pipeline_length: int, optional (default=None)
        If set, limit the maximum number of steps in any evaluated pipeline.
        Encoding and imputation are excluded.
        If the search space has no preprocessing steps, None is replaced
        by 1 and any value larger than 1 is rejected.

    config: Dict
        Specifies available components and their valid hyperparameter settings.
        For more information, see :ref:`search_space_configuration`.

    random_state: int, optional (default=None)
        Seed for the random number generators used in the process.
        However, with `n_jobs > 1`,
        there will be randomization introduced by multi-processing.
        For reproducible results, set this and use `n_jobs=1`.

    max_total_time: positive int (default=3600)
        Time in seconds that can be used for the `fit` call.

    max_eval_time: positive int, optional (default=None)
        Time in seconds that can be used to evaluate any one single individual.
        If None, set to 0.1 * max_total_time (rounded, but at least 1).
        Values larger than `max_total_time` are lowered to `max_total_time`.

    n_jobs: int, optional (default=None)
        The amount of parallel processes that may be created to speed up `fit`.
        Accepted values are positive integers, -1 or None.
        If -1 is specified, multiprocessing.cpu_count() processes are created.
        If None is specified, half of multiprocessing.cpu_count() processes
        are created, but never fewer than one.

    max_memory_mb: int, optional (default=None)
        Sets the total amount of memory GAMA is allowed to use (in megabytes).
        If not set, GAMA will use as much as it needs.
        GAMA is not guaranteed to respect this limit at all times,
        but it should never violate it for too long.

    verbosity: int (default=logging.WARNING)
        Sets the level of log messages to be automatically output to terminal.

    search: BaseSearch (default=AsyncEA())
        Search method to use to find good pipelines. Should be instantiated.
        The default instance is created once, when this function is defined,
        and is shared by every object constructed without an explicit value.

    post_processing: BasePostProcessing (default=BestFitPostProcessing())
        Post-processing method to create a model after the search phase.
        Should be an instantiated subclass of BasePostProcessing.
        The default instance is shared in the same way as `search`.

    output_directory: str, optional (default=None)
        Directory to use to save GAMA output. This includes both intermediate
        results during search and logs.
        If set to None, generate a unique name ("gama_HEXCODE").
        Missing parent directories are created as well.

    store: str (default='logs')
        Determines which data is stored after each run:
         - 'nothing': keep nothing from this run
         - 'models': keep only cache with models and predictions
         - 'logs': keep only the logs
         - 'all': keep logs and cache with models and predictions

    Raises
    ------
    ValueError
        If `max_total_time`, `max_eval_time` or `n_jobs` is out of range, or
        if `max_pipeline_length` > 1 while the search space has no
        preprocessing steps.
    """
    if not output_directory:
        output_directory = f"gama_{str(uuid.uuid4())}"
    self.output_directory = os.path.abspath(os.path.expanduser(output_directory))
    # makedirs(exist_ok=True) also creates missing parents and does not fail
    # if the directory shows up between an existence check and the creation.
    os.makedirs(self.output_directory, exist_ok=True)

    register_stream_log(verbosity)
    if store in ["logs", "all"]:
        log_file = os.path.join(self.output_directory, "gama.log")
        log_handler = logging.FileHandler(log_file)
        log_handler.setLevel(logging.DEBUG)
        log_format = logging.Formatter("[%(asctime)s - %(name)s] %(message)s")
        log_handler.setFormatter(log_format)
        logging.getLogger("gama").addHandler(log_handler)

    # Logs every local that exists at this point (i.e. the call arguments),
    # so no new local variable may be introduced above this statement.
    arguments = ",".join(
        [
            f"{k}={v}"
            for (k, v) in locals().items()
            if k not in ["self", "config", "log_file", "log_handler", "log_format"]
        ]
    )
    log.info(f"Using GAMA version {__version__}.")
    log.info(f"INIT:{self.__class__.__name__}({arguments})")

    if n_jobs is None:
        # cpu_count() // 2 is 0 on a single-core machine, which would make
        # the default value fail the validation below.
        n_jobs = max(1, multiprocessing.cpu_count() // 2)
        log.debug("n_jobs defaulted to %d.", n_jobs)
    elif n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
        log.debug("n_jobs set to use all %d cores.", n_jobs)

    # Collect every violation so the caller sees all of them at once.
    errors = []
    if max_total_time is None or max_total_time <= 0:
        errors.append(
            f"Expect positive int for max_total_time, got {max_total_time}."
        )
    if max_eval_time is not None and max_eval_time <= 0:
        errors.append(
            f"Expect None or positive int for max_eval_time, got {max_eval_time}."
        )
    if n_jobs < -1 or n_jobs == 0:
        errors.append(f"n_jobs should be -1 or positive int but is {n_jobs}.")

    if errors:
        # The output directory (and possibly the log file) already exist.
        self.cleanup("all")
        raise ValueError(" ".join(errors))

    # Rebinds AsyncEvaluator.__init__ on the class itself, so every
    # AsyncEvaluator created afterwards in this process gets these defaults.
    # NOTE(review): each construction wraps the previous wrapper again.
    setattr(
        AsyncEvaluator,
        "__init__",
        partialmethod(
            AsyncEvaluator.__init__,
            n_workers=n_jobs,
            memory_limit_mb=max_memory_mb,
            logfile=os.path.join(self.output_directory, "memory.log"),
        ),
    )

    if max_eval_time is None:
        # round() can yield 0 for max_total_time < 5; keep the derived value
        # positive, as is required of user-supplied values.
        max_eval_time = max(1, round(0.1 * max_total_time))
    if max_eval_time > max_total_time:
        log.warning(
            f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) "
            f"is not allowed. max_eval_time set to {max_total_time}."
        )
        max_eval_time = max_total_time

    self._max_eval_time = max_eval_time
    self._time_manager = TimeKeeper(max_total_time)
    self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
    self._regularize_length = regularize_length
    self._search_method: BaseSearch = search
    self._post_processing = post_processing
    self._store = store

    if random_state is not None:
        # Seeds the global generators of both `random` and numpy.
        random.seed(random_state)
        np.random.seed(random_state)

    self._x: Optional[pd.DataFrame] = None
    self._y: Optional[pd.DataFrame] = None
    self._basic_encoding_pipeline: Optional[Pipeline] = None
    self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
    self._inferred_dtypes: List[Type] = []
    self.model: object = None
    self._final_pop: List[Individual] = []

    self._subscribers: Dict[str, List[Callable]] = defaultdict(list)
    cache_directory = os.path.join(self.output_directory, "cache")
    if isinstance(post_processing, EnsemblePostProcessing):
        self._evaluation_library = EvaluationLibrary(
            m=post_processing.hyperparameters["max_models"],
            n=post_processing.hyperparameters["hillclimb_size"],
            cache=cache_directory,
        )
    else:
        # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
        self._evaluation_library = EvaluationLibrary(m=0, cache=cache_directory)
    self.evaluation_completed(self._evaluation_library.save_evaluation)

    evaluation_logger = search.logger(
        os.path.join(self.output_directory, "evaluations.log")
    )
    self.evaluation_completed(evaluation_logger.log_evaluation)

    self._pset, _ = pset_from_config(config)

    if DATA_TERMINAL not in self._pset:
        if max_pipeline_length is None:
            log.info(
                "Setting `max_pipeline_length` to 1 "
                "because there are no preprocessing steps in the search space."
            )
            max_pipeline_length = 1
        elif max_pipeline_length > 1:
            raise ValueError(
                f"`max_pipeline_length` can't be {max_pipeline_length} "
                "because there are no preprocessing steps in the search space."
            )

    # Freshly generated pipelines start with at most 3 steps unless the
    # caller set an explicit limit.
    max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
    self._operator_set = OperatorSet(
        mutate=partial(
            random_valid_mutation_in_place,
            primitive_set=self._pset,
            max_length=max_pipeline_length,
        ),
        mate=partial(random_crossover, max_length=max_pipeline_length),
        create_from_population=partial(create_from_population, cxpb=0.2, mutpb=0.8),
        create_new=partial(
            create_random_expression,
            primitive_set=self._pset,
            max_length=max_start_length,
        ),
        compile_=compile_individual,
        eliminate=eliminate_from_pareto,
        evaluate_callback=self._on_evaluation_completed,
        completed_evaluations=self._evaluation_library.lookup,
    )
import os
from datetime import datetime, timedelta
from typing import List, Tuple, Dict

import pandas as pd

from gama.configuration.classification import clf_config
from gama.configuration.parser import pset_from_config, merge_configurations
from gama.configuration.regression import reg_config
from gama.genetic_programming.components import Individual

# Primitive set built from the merged classification and regression
# configurations; created once at import time.
pset, _ = pset_from_config(merge_configurations(clf_config, reg_config))


class GamaReport:
    """ Contains information parsed from a search captured by a GAMA analysis log. """

    def __init__(self, log_directory: str):
        """ Prepare a report for the logs found in `log_directory`.

        Parameters
        ----------
        log_directory: str
            The directory with logs:
                - gama.log
                - evaluations.log
                - resources.log
            A leading `~` is expanded to the user's home directory.
        """
        self._log_directory = os.path.expanduser(log_directory)
        # Normalize first so that a trailing separator ("run/") still yields
        # "run" rather than an empty name.
        self.name = os.path.basename(os.path.normpath(log_directory))
        self.phases: List[Tuple[str, str, datetime, float]] = []
        # NOTE(review): presumably the file offset up to which a log has been
        # read so far - confirm against the code that updates it.
        self._last_tell = 0
def __init__(
    self,
    scoring: Union[
        str, Metric, Iterable[str], Iterable[Metric]
    ] = "filled_in_by_child_class",
    regularize_length: bool = True,
    max_pipeline_length: Optional[int] = None,
    config: Optional[Dict] = None,
    random_state: Optional[int] = None,
    max_total_time: int = 3600,
    max_eval_time: Optional[int] = None,
    n_jobs: Optional[int] = None,
    verbosity: int = logging.WARNING,
    keep_analysis_log: Optional[str] = "gama.log",
    search_method: BaseSearch = AsyncEA(),
    post_processing_method: BasePostProcessing = BestFitPostProcessing(),
    cache: Optional[str] = None,
):
    """Set up logging, validate the settings and build the search machinery.

    Parameters
    ----------
    scoring: str, Metric or Tuple
        Specifies the/all metric(s) to optimize towards.
        A string will be converted to Metric.
        A tuple must specify each metric with the same type (e.g. all str).
        See :ref:`Metrics` for built-in metrics.

    regularize_length: bool (default=True)
        If True, add pipeline length as an optimization metric.
        Short pipelines should then be preferred over long ones.

    max_pipeline_length: int, optional (default=None)
        If set, limit the maximum number of steps in any evaluated pipeline.
        Encoding and imputation are excluded.

    config: Dict
        Specifies available components and their valid hyperparameter settings.
        For more information, see :ref:`search_space_configuration`.

    random_state: int, optional (default=None)
        Seed for the random number generators used in the process.
        However, with `n_jobs > 1`,
        there will be randomization introduced by multi-processing.
        For reproducible results, set this and use `n_jobs=1`.

    max_total_time: positive int (default=3600)
        Time in seconds that can be used for the `fit` call.

    max_eval_time: positive int, optional (default=None)
        Time in seconds that can be used to evaluate any one single individual.
        If None, set to 0.1 * max_total_time (rounded, but at least 1).
        Values larger than `max_total_time` are lowered to `max_total_time`.

    n_jobs: int, optional (default=None)
        The amount of parallel processes that may be created to speed up `fit`.
        Accepted values are positive integers, -1 or None.
        If -1 is specified, multiprocessing.cpu_count() processes are created.
        If None is specified, half of multiprocessing.cpu_count() processes
        are created, but never fewer than one.

    verbosity: int (default=logging.WARNING)
        Sets the level of log messages to be automatically output to terminal.

    keep_analysis_log: str, optional (default='gama.log')
        If non-empty str, specify filepath where the log should be stored.
        If `None`, no log is stored.

    search_method: BaseSearch (default=AsyncEA())
        Search method to use to find good pipelines. Should be instantiated.
        The default instance is created once, when this function is defined,
        and is shared by every object constructed without an explicit value.

    post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
        Post-processing method to create a model after the search phase.
        Should be an instantiated subclass of BasePostProcessing.
        The default instance is shared in the same way as `search_method`.

    cache: str, optional (default=None)
        Directory to use to save intermediate results during search.
        If set to None, generate a unique cache name.

    Raises
    ------
    ValueError
        If `max_total_time`, `max_eval_time` or `n_jobs` is out of range.
    """
    register_stream_log(verbosity)
    if keep_analysis_log is not None:
        register_file_log(keep_analysis_log)
        # Make the path absolute so the arguments logged below are unambiguous.
        if not os.path.isabs(keep_analysis_log):
            keep_analysis_log = os.path.abspath(keep_analysis_log)

    # Logs every local that exists at this point (i.e. the call arguments),
    # so no new local variable may be introduced above this statement.
    arguments = ",".join(
        [
            f"{k}={v}"
            for (k, v) in locals().items()
            if k not in ["self", "config"]
        ]
    )
    log.info(f"Using GAMA version {__version__}.")
    log.info(f"{self.__class__.__name__}({arguments})")
    log_event(log, TOKENS.INIT, arguments)

    if n_jobs is None:
        # cpu_count() // 2 is 0 on a single-core machine, which would make
        # the default value fail the validation below.
        n_jobs = max(1, multiprocessing.cpu_count() // 2)
        log.debug("n_jobs defaulted to %d", n_jobs)

    if max_total_time is None or max_total_time <= 0:
        raise ValueError(
            f"Expect positive int for max_total_time, got {max_total_time}."
        )
    if max_eval_time is not None and max_eval_time <= 0:
        raise ValueError(
            f"Expect None or positive int for max_eval_time, got {max_eval_time}."
        )
    if n_jobs < -1 or n_jobs == 0:
        raise ValueError(f"n_jobs should be -1 or positive int but is {n_jobs}.")

    # -1 is documented as "use multiprocessing.cpu_count() processes"; resolve
    # it here so the evaluator always receives a concrete worker count.
    # This is a class attribute: it applies to every AsyncEvaluator.
    AsyncEvaluator.n_jobs = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs

    if max_eval_time is None:
        # round() can yield 0 for max_total_time < 5; keep the derived value
        # positive, as is required of user-supplied values.
        max_eval_time = max(1, round(0.1 * max_total_time))
    if max_eval_time > max_total_time:
        log.warning(
            f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) "
            f"is not allowed. max_eval_time set to {max_total_time}."
        )
        max_eval_time = max_total_time

    self._max_eval_time = max_eval_time
    self._time_manager = TimeKeeper(max_total_time)
    self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
    self._regularize_length = regularize_length
    self._search_method: BaseSearch = search_method
    self._post_processing = post_processing_method

    if random_state is not None:
        # Seeds the global generators of both `random` and numpy.
        random.seed(random_state)
        np.random.seed(random_state)

    self._x: Optional[pd.DataFrame] = None
    self._y: Optional[pd.DataFrame] = None
    self._basic_encoding_pipeline: Optional[Pipeline] = None
    self._fixed_pipeline_extension: List[Tuple[str, TransformerMixin]] = []
    self._inferred_dtypes: List[Type] = []
    self.model: object = None
    self._final_pop: List[Individual] = []

    self._subscribers: Dict[str, List[Callable]] = defaultdict(list)
    if not cache:
        cache = f"cache_{str(uuid.uuid4())}"
    if isinstance(post_processing_method, EnsemblePostProcessing):
        self._evaluation_library = EvaluationLibrary(
            m=post_processing_method.hyperparameters["max_models"],
            n=post_processing_method.hyperparameters["hillclimb_size"],
            cache_directory=cache,
        )
    else:
        # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
        self._evaluation_library = EvaluationLibrary(m=0, cache_directory=cache)
    self.evaluation_completed(self._evaluation_library.save_evaluation)

    self._pset, _ = pset_from_config(config)

    # Freshly generated pipelines start with at most 3 steps unless the
    # caller set an explicit limit.
    max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
    self._operator_set = OperatorSet(
        mutate=partial(
            random_valid_mutation_in_place,
            primitive_set=self._pset,
            max_length=max_pipeline_length,
        ),
        mate=partial(random_crossover, max_length=max_pipeline_length),
        create_from_population=partial(create_from_population, cxpb=0.2, mutpb=0.8),
        create_new=partial(
            create_random_expression,
            primitive_set=self._pset,
            max_length=max_start_length,
        ),
        compile_=compile_individual,
        eliminate=eliminate_from_pareto,
        evaluate_callback=self._on_evaluation_completed,
        completed_evaluations=self._evaluation_library.lookup,
    )
def __init__(
    self,
    scoring: Union[
        str, Metric, Tuple[Union[str, Metric], ...]
    ] = 'filled_in_by_child_class',
    regularize_length: bool = True,
    max_pipeline_length: Optional[int] = None,
    config: Optional[Dict] = None,
    random_state: Optional[int] = None,
    max_total_time: int = 3600,
    max_eval_time: Optional[int] = None,
    n_jobs: Optional[int] = None,
    verbosity: int = logging.WARNING,
    keep_analysis_log: Optional[str] = 'gama.log',
    search_method: BaseSearch = AsyncEA(),
    post_processing_method: BasePostProcessing = BestFitPostProcessing(),
):
    """Set up logging, validate the settings and build the search machinery.

    Parameters
    ----------
    scoring: str, Metric or Tuple
        Specifies the/all metric(s) to optimize towards.
        A string will be converted to Metric.
        A tuple must specify each metric with the same type
        (i.e. all str or all Metric).
        See :ref:`Metrics` for built-in metrics.

    regularize_length: bool
        If True, add pipeline length as an optimization metric
        (preferring short over long).

    max_pipeline_length: int, optional (default=None)
        If set, limit the maximum number of steps in any evaluated pipeline.
        Encoding and imputation are excluded.

    config: Dict
        Specifies available components and their valid hyperparameter settings.
        For more information, see :ref:`search_space_configuration`.

    random_state: int, optional (default=None)
        If an integer is passed, this will be the seed for the random number
        generators used in the process.
        However, with `n_jobs > 1`, there will be randomization introduced by
        multi-processing.
        For reproducible results, set this and use `n_jobs=1`.

    max_total_time: positive int (default=3600)
        Time in seconds that can be used for the `fit` call.

    max_eval_time: positive int, optional (default=None)
        Time in seconds that can be used to evaluate any one single individual.
        If None, set to 0.1 * max_total_time (rounded, but at least 1).
        Values larger than `max_total_time` are lowered to `max_total_time`.

    n_jobs: int, optional (default=None)
        The amount of parallel processes that may be created to speed up `fit`.
        Accepted values are positive integers, -1 or None.
        If -1 is specified, multiprocessing.cpu_count() processes are created.
        If None is specified, half of multiprocessing.cpu_count() processes
        are created, but never fewer than one.

    verbosity: int (default=logging.WARNING)
        Sets the level of log messages to be automatically output to terminal.

    keep_analysis_log: str, optional (default='gama.log')
        If non-empty str, specifies the path (and name) where the log should
        be stored, e.g. /output/gama.log.
        If `None`, no log is stored.

    search_method: BaseSearch (default=AsyncEA())
        Search method to use to find good pipelines. Should be instantiated.
        The default instance is created once, when this function is defined,
        and is shared by every object constructed without an explicit value.

    post_processing_method: BasePostProcessing (default=BestFitPostProcessing())
        Post-processing method to create a model after the search phase.
        Should be instantiated.
        The default instance is shared in the same way as `search_method`.

    Raises
    ------
    ValueError
        If `max_total_time`, `max_eval_time` or `n_jobs` is out of range.
    """
    register_stream_log(verbosity)
    if keep_analysis_log is not None:
        register_file_log(keep_analysis_log)
        # Make the path absolute so the arguments logged below are unambiguous.
        if not os.path.isabs(keep_analysis_log):
            keep_analysis_log = os.path.abspath(keep_analysis_log)

    # Logs every local that exists at this point (i.e. the call arguments),
    # so no new local variable may be introduced above this statement.
    arguments = ','.join(
        [
            f'{k}={v}'
            for (k, v) in locals().items()
            if k not in ['self', 'config']
        ]
    )
    log.info(f'Using GAMA version {__version__}.')
    log.info(f'{self.__class__.__name__}({arguments})')
    log_event(log, TOKENS.INIT, arguments)

    if n_jobs is None:
        # cpu_count() // 2 is 0 on a single-core machine, which would make
        # the default value fail the validation below.
        n_jobs = max(1, multiprocessing.cpu_count() // 2)
        log.debug('n_jobs defaulted to %d', n_jobs)

    if max_total_time is None or max_total_time <= 0:
        raise ValueError(
            f"max_total_time should be integer greater than zero but is {max_total_time}."
        )
    if max_eval_time is not None and max_eval_time <= 0:
        raise ValueError(
            f"max_eval_time should be None or integer greater than zero but is {max_eval_time}."
        )
    if n_jobs < -1 or n_jobs == 0:
        raise ValueError(
            f"n_jobs should be -1 or positive integer but is {n_jobs}."
        )

    # `n_jobs` lives on the AsyncEvaluator class, so it must be assigned on
    # every construction: skipping the assignment for -1 would silently keep
    # whatever an earlier instance stored there.
    AsyncEvaluator.n_jobs = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs

    if max_eval_time is None:
        # Keep the derived value an int (as documented) and positive, which
        # is also what is required of user-supplied values.
        max_eval_time = max(1, round(0.1 * max_total_time))
    if max_eval_time > max_total_time:
        log.warning(
            f"max_eval_time ({max_eval_time}) > max_total_time ({max_total_time}) is not allowed. "
            f"max_eval_time set to {max_total_time}."
        )
        max_eval_time = max_total_time

    self._max_eval_time = max_eval_time
    self._time_manager = TimeKeeper(max_total_time)
    self._metrics: Tuple[Metric, ...] = scoring_to_metric(scoring)
    self._regularize_length = regularize_length
    self._search_method: BaseSearch = search_method
    self._post_processing = post_processing_method

    if random_state is not None:
        # Seeds the global generators of both `random` and numpy.
        random.seed(random_state)
        np.random.seed(random_state)

    self._X: Optional[pd.DataFrame] = None
    self._y: Optional[pd.DataFrame] = None
    self._basic_encoding_pipeline: Optional[Pipeline] = None
    self._inferred_dtypes: List[Type] = []
    self.model: object = None
    self._final_pop = None

    self._subscribers = defaultdict(list)
    if isinstance(post_processing_method, EnsemblePostProcessing):
        self._evaluation_library = EvaluationLibrary(
            m=post_processing_method.hyperparameters['max_models'],
            n=post_processing_method.hyperparameters['hillclimb_size'],
        )
    else:
        # Don't keep memory-heavy evaluation meta-data (predictions, estimators)
        self._evaluation_library = EvaluationLibrary(m=0)
    self.evaluation_completed(self._evaluation_library.save_evaluation)

    self._pset, _ = pset_from_config(config)

    # Freshly generated pipelines start with at most 3 steps unless the
    # caller set an explicit limit.
    max_start_length = 3 if max_pipeline_length is None else max_pipeline_length
    self._operator_set = OperatorSet(
        mutate=partial(
            random_valid_mutation_in_place,
            primitive_set=self._pset,
            max_length=max_pipeline_length,
        ),
        mate=partial(random_crossover, max_length=max_pipeline_length),
        create_from_population=partial(create_from_population, cxpb=0.2, mutpb=0.8),
        create_new=partial(
            create_random_expression,
            primitive_set=self._pset,
            max_length=max_start_length,
        ),
        compile_=compile_individual,
        eliminate=eliminate_from_pareto,
        evaluate_callback=self._on_evaluation_completed,
    )