def test_check_random_state():
    """Check the check_random_state utility function behavior"""
    # None and the np.random module both resolve to the global singleton RNG.
    assert check_random_state(None) is np.random.mtrand._rand
    assert check_random_state(np.random) is np.random.mtrand._rand
    # An int seed yields a RandomState seeded with that value.
    expected = np.random.RandomState(42)
    assert check_random_state(42).randint(100) == expected.randint(100)
    # An existing RandomState instance is passed through untouched.
    expected = np.random.RandomState(42)
    assert check_random_state(expected) is expected
    # A different seed produces a different stream.
    expected = np.random.RandomState(42)
    assert check_random_state(43).randint(100) != expected.randint(100)
    # Anything that is not None/module/int/RandomState is rejected.
    assert_raises(ValueError, check_random_state, "some invalid seed")
def _parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params):
    # NOTE(review): this definition looks like an abandoned fragment — the
    # loop never builds or appends a program for the `parents is not None`
    # case and the function has no return statement.  It is also shadowed by
    # the complete `_parallel_evolve` defined later in this module, making
    # this one dead code; consider deleting it.
    n_samples, n_features = X.shape
    programs = []
    for i in range(n_programs):
        # Check the random-number seed first: one RNG per program.
        random_state = check_random_state(seeds[i])
        if parents is None:
            program = None
            genome = None
def get_all_indices(self, n_samples=None, max_samples=None, random_state=None):
    """Get the indices on which to evaluate the fitness of a program.

    On the first call the sample counts and the generator's state are
    cached on the instance; later calls replay that cached state so the
    exact same in-sample / out-of-sample split is regenerated.

    Parameters
    ----------
    n_samples : int
        The number of samples.

    max_samples : int
        The maximum number of samples to use.

    random_state : RandomState instance
        The random number generator.

    Returns
    -------
    indices : array-like, shape = [n_samples]
        The in-sample indices.

    not_indices : array-like, shape = [n_samples]
        The out-of-sample indices.
    """
    if self._indices_state is None and random_state is None:
        raise ValueError('The program has not been evaluated for fitness '
                         'yet, indices not available.')

    # Cache whichever pieces of state were supplied on the first call.
    if n_samples is not None and self._n_samples is None:
        self._n_samples = n_samples
    if max_samples is not None and self._max_samples is None:
        self._max_samples = max_samples
    if random_state is not None and self._indices_state is None:
        self._indices_state = random_state.get_state()

    # Replay the cached RNG state so the identical split is reproduced.
    rng = check_random_state(None)
    rng.set_state(self._indices_state)

    oob = sample_without_replacement(self._n_samples,
                                     self._n_samples - self._max_samples,
                                     random_state=rng)
    # In-sample indices are exactly those never drawn out-of-sample.
    hit_counts = np.bincount(oob, minlength=self._n_samples)
    in_sample = np.where(hit_counts == 0)[0]
    return in_sample, oob
def _parallel_evolve(n_programs, parents, X, y, sample_weight, seeds, params):
    """Private function used to build a batch of programs within a job.

    Parameters
    ----------
    n_programs : int
        Number of programs this job must evolve.

    parents : list or None
        The previous generation's programs, or None for generation zero
        (in which case fresh random programs are grown).

    X : array-like, shape = [n_samples, n_features]
        Training vectors.

    y : array-like, shape = [n_samples]
        Target values.

    sample_weight : array-like, shape = [n_samples], or None
        Per-sample weights applied when computing fitness.

    seeds : array-like of int
        One RNG seed per program, so results are reproducible regardless
        of how the population is partitioned across jobs.

    params : dict
        Evolution hyper-parameters, unpacked below.

    Returns
    -------
    programs : list of _Program
        The evolved batch, each with `raw_fitness_` (and `oob_fitness_`
        when subsampling) already computed.
    """
    n_samples, n_features = X.shape
    # Unpack parameters
    tournament_size = params['tournament_size']
    function_set = params['function_set']
    arities = params['arities']
    init_depth = params['init_depth']
    init_method = params['init_method']
    const_range = params['const_range']
    metric = params['_metric']
    parsimony_coefficient = params['parsimony_coefficient']
    method_probs = params['method_probs']
    p_point_replace = params['p_point_replace']
    max_samples = params['max_samples']
    # Convert the max_samples fraction into an absolute sample count
    # (assumes params['max_samples'] is in (0, 1] — see the
    # `max_samples < n_samples` OOB check below).
    max_samples = int(max_samples * n_samples)

    def _tournament():
        """Find the fittest individual from a sub-population."""
        # NOTE: closes over the per-program `random_state` bound in the
        # loop below, so every draw is attributed to the current program.
        contenders = random_state.randint(0, len(parents), tournament_size)
        fitness = [parents[p].fitness_ for p in contenders]
        if metric.greater_is_better:
            parent_index = contenders[np.argmax(fitness)]
        else:
            parent_index = contenders[np.argmin(fitness)]
        return parents[parent_index], parent_index

    # Build programs
    programs = []

    for i in range(n_programs):
        # Dedicated RNG per program keeps evolution deterministic under
        # any job partitioning.
        random_state = check_random_state(seeds[i])

        if parents is None:
            # Generation zero: _Program will grow a random program.
            program = None
            genome = None
        else:
            # One uniform draw selects the genetic operation against the
            # cumulative method probabilities (crossover, subtree, hoist,
            # point mutation, else reproduction).
            method = random_state.uniform()
            parent, parent_index = _tournament()

            if method < method_probs[0]:
                # crossover
                donor, donor_index = _tournament()
                program, removed, remains = parent.crossover(donor.program,
                                                             random_state)
                genome = {'method': 'Crossover',
                          'parent_idx': parent_index,
                          'parent_nodes': removed,
                          'donor_idx': donor_index,
                          'donor_nodes': remains}
            elif method < method_probs[1]:
                # subtree_mutation
                program, removed, _ = parent.subtree_mutation(random_state)
                genome = {'method': 'Subtree Mutation',
                          'parent_idx': parent_index,
                          'parent_nodes': removed}
            elif method < method_probs[2]:
                # hoist_mutation
                program, removed = parent.hoist_mutation(random_state)
                genome = {'method': 'Hoist Mutation',
                          'parent_idx': parent_index,
                          'parent_nodes': removed}
            elif method < method_probs[3]:
                # point_mutation
                program, mutated = parent.point_mutation(random_state)
                genome = {'method': 'Point Mutation',
                          'parent_idx': parent_index,
                          'parent_nodes': mutated}
            else:
                # reproduction
                program = parent.reproduce()
                genome = {'method': 'Reproduction',
                          'parent_idx': parent_index,
                          'parent_nodes': []}

        # Wrap the flat program (or None, to grow a fresh one) in _Program.
        program = _Program(function_set=function_set,
                           arities=arities,
                           init_depth=init_depth,
                           init_method=init_method,
                           n_features=n_features,
                           metric=metric,
                           const_range=const_range,
                           p_point_replace=p_point_replace,
                           parsimony_coefficient=parsimony_coefficient,
                           random_state=random_state,
                           program=program)

        # Record ancestry for the generation-pruning pass in `fit`.
        program.parents = genome

        # Draw samples, using sample weights, and then fit
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,))
        else:
            curr_sample_weight = sample_weight.copy()
        oob_sample_weight = curr_sample_weight.copy()

        # Zero out OOB rows for in-sample fitness and vice versa, so each
        # fitness evaluation sees a disjoint weighted subset.
        indices, not_indices = program.get_all_indices(n_samples,
                                                       max_samples,
                                                       random_state)

        curr_sample_weight[not_indices] = 0
        oob_sample_weight[indices] = 0

        program.raw_fitness_ = program.raw_fitness(X, y, curr_sample_weight)
        if max_samples < n_samples:
            # Calculate OOB fitness
            program.oob_fitness_ = program.raw_fitness(X, y,
                                                       oob_sample_weight)

        programs.append(program)

    return programs
def fit(self, X, y, sample_weight=None):
    """Fit the Genetic Program according to X, y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    sample_weight : array-like, shape = [n_samples], optional
        Weights applied to individual samples.

    Returns
    -------
    self : object
        Returns self.
    """
    random_state = check_random_state(self.random_state)

    # Check arrays
    X, y = check_X_y(X, y, y_numeric=True)
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
    _, self.n_features_ = X.shape

    # Default hall_of_fame / n_components to the whole population, then
    # validate the user-supplied values.
    hall_of_fame = self.hall_of_fame
    if hall_of_fame is None:
        hall_of_fame = self.population_size
    if hall_of_fame > self.population_size or hall_of_fame < 1:
        raise ValueError('hall_of_fame (%d) must be less than or equal to '
                         'population_size (%d).' % (self.hall_of_fame,
                                                    self.population_size))
    n_components = self.n_components
    if n_components is None:
        n_components = hall_of_fame
    if n_components > hall_of_fame or n_components < 1:
        raise ValueError('n_components (%d) must be less than or equal to '
                         'hall_of_fame (%d).' % (self.n_components,
                                                 self.hall_of_fame))

    # Resolve string names in function_set to _Function objects.
    self._function_set = []
    for function in self.function_set:
        if isinstance(function, six.string_types):
            if function not in _function_map:
                raise ValueError('invalid function name %s found in '
                                 '`function_set`.' % function)
            self._function_set.append(_function_map[function])
        elif isinstance(function, _Function):
            self._function_set.append(function)
        else:
            raise ValueError('invalid type %s found in `function_set`.'
                             % type(function))
    if not self._function_set:
        raise ValueError('No valid functions found in `function_set`.')

    # For point-mutation to find a compatible replacement node
    self._arities = {}
    for function in self._function_set:
        arity = function.arity
        self._arities[arity] = self._arities.get(arity, [])
        self._arities[arity].append(function)

    # Resolve the fitness metric.  NOTE(review): if `self` is neither a
    # RegressorMixin nor a TransformerMixin and `metric` is not a _Fitness
    # instance, self._metric is never assigned and later uses raise
    # AttributeError — confirm all concrete subclasses use those mixins.
    if isinstance(self.metric, _Fitness):
        self._metric = self.metric
    elif isinstance(self, RegressorMixin):
        if self.metric not in ('mean absolute error', 'mse', 'rmse'):
            raise ValueError('Unsupported metric: %s' % self.metric)
        else:
            self._metric = _fitness_map[self.metric]
    elif isinstance(self, TransformerMixin):
        if self.metric not in ('pearson', 'spearman'):
            raise ValueError('Unsupported metric: %s' % self.metric)
        else:
            self._metric = _fitness_map[self.metric]

    # Cumulative operation probabilities: _parallel_evolve compares one
    # uniform draw against this running total to pick the operation.
    self._method_probs = np.array([self.p_crossover,
                                   self.p_subtree_mutation,
                                   self.p_hoist_mutation,
                                   self.p_point_mutation])
    self._method_probs = np.cumsum(self._method_probs)
    if self._method_probs[-1] > 1:
        raise ValueError('The sum of p_crossover, p_subtree_mutation, '
                         'p_hoist_mutation and p_point_mutation should '
                         'total to 1.0 or less.')

    if self.init_method not in ('half and half', 'grow', 'full'):
        raise ValueError('Valid program initializations methods include '
                         '"grow", "full" and "half and half". Given %s.'
                         % self.init_method)

    if (not isinstance(self.const_range, tuple) or
            len(self.const_range) != 2):
        raise ValueError('const_range should be a tuple with length two.')

    if (not isinstance(self.init_depth, tuple) or
            len(self.init_depth) != 2):
        raise ValueError('init_depth should be a tuple with length two.')
    if self.init_depth[0] > self.init_depth[1]:
        raise ValueError('init_depth should be in increasing numerical '
                         'order: (min_depth, max_depth).')

    # Bundle everything the parallel workers need into one dict.
    params = self.get_params()
    params['_metric'] = self._metric
    params['function_set'] = self._function_set
    params['arities'] = self._arities
    params['method_probs'] = self._method_probs

    if not self.warm_start or not hasattr(self, "_programs"):
        # Free allocated memory, if any
        self._programs = []

    prior_generations = len(self._programs)
    n_more_generations = self.generations - prior_generations

    if n_more_generations < 0:
        raise ValueError('generations=%d must be larger or equal to '
                         'len(_programs)=%d when warm_start==True'
                         % (self.generations, len(self._programs)))
    elif n_more_generations == 0:
        fitness = [program.raw_fitness_ for program in self._programs[-1]]
        # NOTE(review): message mentions "n_estimators"/"trees", which do
        # not exist here — it presumably should say generations/programs.
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")

    if self.warm_start:
        # Generate and discard seeds that would have been produced on the
        # initial fit call.
        for i in range(len(self._programs)):
            _ = random_state.randint(MAX_INT, size=self.population_size)

    if self.verbose:
        # Print header fields
        self._verbose_reporter()
        # start_time is only consumed by the verbose reporter below.
        start_time = time()

    for gen in range(prior_generations, self.generations):

        if gen == 0:
            parents = None
        else:
            parents = self._programs[gen - 1]

        # Parallel loop
        n_jobs, n_programs, starts = _partition_estimators(
            self.population_size, self.n_jobs)
        # One seed per program so results do not depend on the job split.
        seeds = random_state.randint(MAX_INT, size=self.population_size)

        population = Parallel(n_jobs=n_jobs,
                              verbose=int(self.verbose > 1))(
            delayed(_parallel_evolve)(n_programs[i], parents, X, y,
                                      sample_weight,
                                      seeds[starts[i]:starts[i + 1]],
                                      params)
            for i in range(n_jobs))

        # Reduce, maintaining order across different n_jobs
        population = list(itertools.chain.from_iterable(population))

        fitness = [program.raw_fitness_ for program in population]
        length = [program.length_ for program in population]

        # 'auto' parsimony: slope of the regression of fitness on length
        # (cov(length, fitness) / var(length)).
        parsimony_coefficient = None
        if self.parsimony_coefficient == 'auto':
            parsimony_coefficient = (np.cov(length, fitness)[1, 0] /
                                     np.var(length))
        for program in population:
            program.fitness_ = program.fitness(parsimony_coefficient)

        self._programs.append(population)

        # Remove old programs that didn't make it into the new population.
        for old_gen in np.arange(gen, 0, -1):
            indices = []
            for program in self._programs[old_gen]:
                if program is not None:
                    # Genome keys containing 'idx' hold parent/donor
                    # indices into the previous generation.
                    for idx in program.parents:
                        if 'idx' in idx:
                            indices.append(program.parents[idx])
            indices = set(indices)
            for idx in range(self.population_size):
                if idx not in indices:
                    self._programs[old_gen - 1][idx] = None

        if self.verbose:
            self._verbose_reporter(start_time, gen, population, fitness,
                                   length)

        # Check for early stopping
        if self._metric.greater_is_better:
            best_fitness = fitness[np.argmax(fitness)]
            if best_fitness >= self.stopping_criteria:
                break
        else:
            best_fitness = fitness[np.argmin(fitness)]
            if best_fitness <= self.stopping_criteria:
                break

    if isinstance(self, RegressorMixin):
        # Find the best individual in the final generation
        if self._metric.greater_is_better:
            self._program = self._programs[-1][np.argmax(fitness)]
        else:
            self._program = self._programs[-1][np.argmin(fitness)]

    if isinstance(self, TransformerMixin):
        # Find the best individuals in the final generation
        fitness = np.array(fitness)
        if self._metric.greater_is_better:
            hall_of_fame = fitness.argsort()[::-1][:self.hall_of_fame]
        else:
            hall_of_fame = fitness.argsort()[:self.hall_of_fame]
        evaluation = np.array([gp.execute(X) for gp in
                               [self._programs[-1][i] for i in
                                hall_of_fame]])
        if self.metric == 'spearman':
            # Rank-transform rows so the correlations below are Spearman.
            evaluation = np.apply_along_axis(rankdata, 1, evaluation)
        with np.errstate(divide='ignore', invalid='ignore'):
            correlations = np.abs(np.corrcoef(evaluation))
        # Zero the diagonal so argmax finds cross-program pairs only.
        np.fill_diagonal(correlations, 0.)
        components = list(range(self.hall_of_fame))
        indices = list(range(self.hall_of_fame))
        # Iteratively remove least fit individual of most correlated pair
        while len(components) > self.n_components:
            most_correlated = np.unravel_index(np.argmax(correlations),
                                               correlations.shape)
            # The correlation matrix is sorted by fitness, so identifying
            # the least fit of the pair is simply getting the higher index
            worst = max(most_correlated)
            components.pop(worst)
            indices.remove(worst)
            correlations = correlations[:, indices][indices, :]
            indices = list(range(len(components)))
        self._best_programs = [self._programs[-1][i] for i in
                               hall_of_fame[components]]

    return self
def fit(self, X, y, sample_weight=None):
    """Fit the Genetic Program according to X, y.

    Each generation is evolved in parallel, filtered by formula length and
    de-duplicated, then scored by fitting ``self.tree_estimator`` on the
    programs' outputs and taking its ``feature_importances_`` as each
    program's raw fitness.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    sample_weight : array-like, shape = [n_samples], optional
        Weights applied to individual samples.

    Returns
    -------
    self : object
        Returns self.
    """
    fitting_start_time = time()
    # Best-effort capture of column names when X is a DataFrame.
    # Fix: the original bare `except:` also swallowed KeyboardInterrupt and
    # SystemExit; only AttributeError (no `.columns`) is expected here.
    try:
        self.feature_names = X.columns
    except AttributeError:
        pass
    random_state = check_random_state(self.random_state)

    # Check arrays
    if isinstance(self, ClassifierMixin):
        X, y = check_X_y(X, y, y_numeric=False)
        check_classification_targets(y)
        # Encode labels as 0..n_classes-1 and verify exactly two classes
        # survive sample_weight trimming.
        self.classes_, y = np.unique(y, return_inverse=True)
        n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))
        if n_trim_classes != 2:
            raise ValueError("y contains %d class after sample_weight "
                             "trimmed classes with zero weights, while 2 "
                             "classes are required." % n_trim_classes)
        self.n_classes_ = len(self.classes_)
    else:
        X, y = check_X_y(X, y, y_numeric=True)
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
    _, self.n_features_ = X.shape

    # Default hall_of_fame / n_components to the whole population, then
    # validate the user-supplied values.
    hall_of_fame = self.hall_of_fame
    if hall_of_fame is None:
        hall_of_fame = self.population_size
    if hall_of_fame > self.population_size or hall_of_fame < 1:
        raise ValueError('hall_of_fame (%d) must be less than or equal to '
                         'population_size (%d).' % (self.hall_of_fame,
                                                    self.population_size))
    n_components = self.n_components
    if n_components is None:
        n_components = hall_of_fame
    if n_components > hall_of_fame or n_components < 1:
        raise ValueError('n_components (%d) must be less than or equal to '
                         'hall_of_fame (%d).' % (self.n_components,
                                                 self.hall_of_fame))

    # Resolve string names in function_set to _Function objects.
    self._function_set = []
    for function in self.function_set:
        if isinstance(function, str):
            if function not in _function_map:
                # Fix: error message previously contained the corrupted
                # text "invalid function cc_session_identifier %s".
                raise ValueError('invalid function name %s found in '
                                 '`function_set`.' % function)
            self._function_set.append(_function_map[function])
        elif isinstance(function, _Function):
            self._function_set.append(function)
        else:
            raise ValueError('invalid type %s found in `function_set`.'
                             % type(function))
    if not self._function_set:
        raise ValueError('No valid functions found in `function_set`.')

    # For point-mutation to find a compatible replacement node
    self._arities = {}
    for function in self._function_set:
        arity = function.arity
        self._arities[arity] = self._arities.get(arity, [])
        self._arities[arity].append(function)

    # Resolve the fitness metric for the concrete estimator type.
    if isinstance(self.metric, _Fitness):
        self._metric = self.metric
    elif isinstance(self, RegressorMixin):
        if self.metric not in ('mean absolute error', 'mse', 'rmse',
                               'pearson', 'spearman'):
            raise ValueError('Unsupported metric: %s' % self.metric)
        self._metric = _fitness_map[self.metric]
    elif isinstance(self, ClassifierMixin):
        if self.metric != 'log loss':
            raise ValueError('Unsupported metric: %s' % self.metric)
        self._metric = _fitness_map[self.metric]
    elif isinstance(self, TransformerMixin):
        if self.metric not in ('pearson', 'spearman'):
            raise ValueError('Unsupported metric: %s' % self.metric)
        self._metric = _fitness_map[self.metric]

    # Cumulative operation probabilities: _parallel_evolve compares one
    # uniform draw against this running total to pick the operation.
    self._method_probs = np.array([self.p_crossover,
                                   self.p_subtree_mutation,
                                   self.p_hoist_mutation,
                                   self.p_point_mutation])
    self._method_probs = np.cumsum(self._method_probs)
    if self._method_probs[-1] > 1:
        raise ValueError('The sum of p_crossover, p_subtree_mutation, '
                         'p_hoist_mutation and p_point_mutation should '
                         'total to 1.0 or less.')

    if self.init_method not in ('half and half', 'grow', 'full'):
        raise ValueError('Valid program initializations methods include '
                         '"grow", "full" and "half and half". Given %s.'
                         % self.init_method)

    if not ((isinstance(self.const_range, tuple) and
             len(self.const_range) == 2) or self.const_range is None):
        raise ValueError('const_range should be a tuple with length two, '
                         'or None.')

    if (not isinstance(self.init_depth, tuple) or
            len(self.init_depth) != 2):
        raise ValueError('init_depth should be a tuple with length two.')
    if self.init_depth[0] > self.init_depth[1]:
        raise ValueError('init_depth should be in increasing numerical '
                         'order: (min_depth, max_depth).')

    if self.feature_names is not None:
        if self.n_features_ != len(self.feature_names):
            raise ValueError('The supplied `feature_names` has different '
                             'length to n_features. Expected %d, got %d.'
                             % (self.n_features_,
                                len(self.feature_names)))
        for feature_name in self.feature_names:
            if not isinstance(feature_name, str):
                raise ValueError('invalid type %s found in '
                                 '`feature_names`.' % type(feature_name))

    if self.transformer is not None:
        if isinstance(self.transformer, _Function):
            self._transformer = self.transformer
        elif self.transformer == 'sigmoid':
            self._transformer = sig1
        else:
            raise ValueError('Invalid `transformer`. Expected either '
                             '"sigmoid" or _Function object, got %s'
                             % type(self.transformer))
        if self._transformer.arity != 1:
            raise ValueError('Invalid arity for `transformer`. Expected 1, '
                             'got %d.' % (self._transformer.arity))

    # Bundle everything the parallel workers need into one dict.
    params = self.get_params()
    params['_metric'] = self._metric
    if hasattr(self, '_transformer'):
        params['_transformer'] = self._transformer
    else:
        params['_transformer'] = None
    params['function_set'] = self._function_set
    params['arities'] = self._arities
    params['method_probs'] = self._method_probs

    if not self.warm_start or not hasattr(self, '_programs'):
        # Free allocated memory, if any
        self._programs = []
        self.run_details_ = {'generation': [],
                             'average_length': [],
                             'average_fitness': [],
                             'best_length': [],
                             'best_fitness': [],
                             'best_oob_fitness': [],
                             'generation_time': []}

    prior_generations = len(self._programs)
    n_more_generations = self.generations - prior_generations

    if n_more_generations < 0:
        raise ValueError('generations=%d must be larger or equal to '
                         'len(_programs)=%d when warm_start==True'
                         % (self.generations, len(self._programs)))
    elif n_more_generations == 0:
        fitness = [program.raw_fitness_ for program in self._programs[-1]]
        # Fix: the message referred to "n_estimators", a parameter this
        # estimator does not have; the relevant parameter is `generations`.
        warn('Warm-start fitting without increasing generations does not '
             'fit new programs.')

    if self.warm_start:
        # Generate and discard seeds that would have been produced on the
        # initial fit call.
        for i in range(len(self._programs)):
            _ = random_state.randint(MAX_INT, size=self.population_size)

    if self.verbose:
        # Print header fields
        self._verbose_reporter()

    for gen in range(prior_generations, self.generations):

        start_time = time()

        if gen == 0:
            parents = None
        else:
            # Only programs with strictly positive raw fitness may breed;
            # evolution stops when fewer than two candidates remain.
            parents = self._programs[gen - 1]
            parents = [p for p in parents if p.raw_fitness_ > 0]
            if len(parents) < 2:
                break

        # Parallel loop
        n_jobs, n_programs, starts = _partition_estimators(
            self.population_size, self.n_jobs)
        # One seed per program so results do not depend on the job split.
        seeds = random_state.randint(MAX_INT, size=self.population_size)

        population = Parallel(n_jobs=n_jobs,
                              verbose=int(self.verbose > 1))(
            delayed(_parallel_evolve)(n_programs[i], parents, X, y,
                                      sample_weight,
                                      seeds[starts[i]:starts[i + 1]],
                                      params)
            for i in range(n_jobs))

        # Reduce, maintaining order across different n_jobs
        population = list(itertools.chain.from_iterable(population))

        # Drop over-long formulas to keep results interpretable.
        population = [program for program in population
                      if program.length_ <= self.max_formula_length]

        # De-duplicate by string representation, keeping first occurrence.
        seen_keys = set()
        unique_population = []
        for program in population:
            key = str(program)
            if key not in seen_keys:
                seen_keys.add(key)
                unique_population.append(program)
        population = unique_population
        # NOTE(review): if the length filter/dedup empties `population`,
        # the np.array(...).T below degenerates — confirm upstream
        # parameters make this impossible.

        # ------------------------------------------------------------------
        # Modification -> fitness is now the importance score in the tree:
        # evaluate every program as a candidate feature (n_samples rows,
        # one column per program) ...
        modified_features = np.array([p.execute(X) for p in population]).T
        modified_names = np.array([str(p) for p in population])
        modified_features = pd.DataFrame(modified_features,
                                         columns=modified_names)
        # ... then let the tree estimator assign each one an importance.
        self.tree_estimator.fit(modified_features, y,
                                sample_weight=sample_weight)
        feature_importance_fitness = self.tree_estimator.feature_importances_
        for idx, program in enumerate(population):
            program.raw_fitness_ = feature_importance_fitness[idx]
        # ------------------------------------------------------------------

        fitness = [program.raw_fitness_ for program in population]
        length = [program.length_ for program in population]

        # 'auto' parsimony: slope of the regression of fitness on length
        # (cov(length, fitness) / var(length)).
        parsimony_coefficient = None
        if self.parsimony_coefficient == 'auto':
            parsimony_coefficient = (np.cov(length, fitness)[1, 0] /
                                     np.var(length))
        for program in population:
            program.fitness_ = program.fitness(parsimony_coefficient)

        self._programs.append(population)

        # Remove old programs that didn't make it into the new population.
        if not self.low_memory:
            for old_gen in np.arange(gen, 0, -1):
                indices = []
                for program in self._programs[old_gen]:
                    if program is not None:
                        # Genome keys containing 'idx' hold parent/donor
                        # indices into the previous generation.
                        for idx in program.parents:
                            if 'idx' in idx:
                                indices.append(program.parents[idx])
                indices = set(indices)
                for idx in range(len(self._programs[old_gen - 1])):
                    if idx not in indices:
                        self._programs[old_gen - 1][idx] = None
        elif gen > 0:
            # Remove old generations
            self._programs[gen - 1] = None

        # Record run details
        best_program = population[np.argmax(fitness)]
        self.run_details_['generation'].append(gen)
        self.run_details_['average_length'].append(np.mean(length))
        self.run_details_['average_fitness'].append(np.mean(fitness))
        self.run_details_['best_length'].append(best_program.length_)
        self.run_details_['best_fitness'].append(best_program.raw_fitness_)
        oob_fitness = np.nan
        if self.max_samples < 1.0:
            oob_fitness = best_program.oob_fitness_
        self.run_details_['best_oob_fitness'].append(oob_fitness)
        generation_time = time() - start_time
        self.run_details_['generation_time'].append(generation_time)

        if self.verbose:
            self._verbose_reporter(self.run_details_)

        # Stop on the fitness criterion or when the time budget is spent.
        best_fitness = fitness[np.argmax(fitness)]
        if best_fitness >= self.stopping_criteria or (
                (time() - fitting_start_time) > self.time_budget_s):
            break

    # Find the best individuals in the final generation
    if self.hall_of_fame is not None:
        fitness = np.array(fitness)
        hall_of_fame = fitness.argsort()[::-1][:self.hall_of_fame]
        evaluation = np.array([gp.execute(X) for gp in
                               [self._programs[-1][i] for i in
                                hall_of_fame]])
        if self.metric == 'spearman':
            # Rank-transform rows so the correlations below are Spearman.
            evaluation = np.apply_along_axis(rankdata, 1, evaluation)
        with np.errstate(divide='ignore', invalid='ignore'):
            correlations = np.abs(np.corrcoef(evaluation))
        # Zero the diagonal so argmax finds cross-program pairs only.
        np.fill_diagonal(correlations, 0.)
        components = list(range(self.hall_of_fame))
        indices = list(range(self.hall_of_fame))
        # Iteratively remove least fit individual of most correlated pair
        while len(components) > self.n_components:
            most_correlated = np.unravel_index(np.argmax(correlations),
                                               correlations.shape)
            # The correlation matrix is sorted by fitness, so identifying
            # the least fit of the pair is simply getting the higher index
            worst = max(most_correlated)
            components.pop(worst)
            indices.remove(worst)
            correlations = correlations[:, indices][indices, :]
            indices = list(range(len(components)))
        self._best_programs = [self._programs[-1][i] for i in
                               hall_of_fame[components]]
    else:
        self._best_programs = self._programs[-1]

    # Final refit of the tree on the selected programs' transformed output.
    self.tree_estimator.fit(self.transform(X), y)

    return self