def test_mut_operator_stats_update():
    """Check that ``_random_mutation_operator`` updates offspring statistics.

    A mutated offspring must keep its parent's crossover count, have a
    mutation count one higher than its parent, and name the parent as its
    only predecessor.
    """
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()

    pipeline_string = (
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    parent = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    # Start from arbitrary counters so the checks do not rely on zeros.
    initialize_stats_dict(parent)
    for counter in ("crossover_count", "mutation_count"):
        parent.statistics[counter] = random.randint(0, 10)

    # Register the parent as an already evaluated pipeline.
    tpot_obj.evaluated_individuals_[str(parent)] = tpot_obj._combine_individual_stats(
        2, 0.99, parent.statistics
    )

    for _ in range(10):
        (child,) = tpot_obj._random_mutation_operator(parent)
        parent_stats = parent.statistics
        child_stats = child.statistics

        assert child_stats['crossover_count'] == parent_stats['crossover_count']
        assert child_stats['mutation_count'] == parent_stats['mutation_count'] + 1
        assert child_stats['predecessor'] == (str(parent),)

        # The offspring becomes the parent of the next round.
        parent = child
def test_dict_initialization():
    """Check the default statistics written by ``initialize_stats_dict``."""
    classifier = TPOTClassifier()
    individual = classifier._toolbox.individual()
    initialize_stats_dict(individual)

    # Expected defaults, checked in the same order as listed.
    expected_defaults = (
        ('generation', 0),
        ('crossover_count', 0),
        ('mutation_count', 0),
        ('predecessor', ('ROOT',)),
    )
    for key, value in expected_defaults:
        assert individual.statistics[key] == value
def test_dict_initialization():
    """Check that a fresh individual gets the default statistics entries.

    After ``initialize_stats_dict`` the individual must be at generation 0,
    have zero crossover and mutation counts, and have ``('ROOT',)`` as its
    predecessor tuple.
    """
    fresh_individual = TPOTClassifier()._toolbox.individual()
    initialize_stats_dict(fresh_individual)

    stats = fresh_individual.statistics
    assert stats['generation'] == 0
    assert stats['crossover_count'] == 0
    assert stats['mutation_count'] == 0
    assert stats['predecessor'] == ('ROOT',)
def test_mate_operator_stats_update():
    """Check that ``_mate_operator`` updates offspring statistics.

    The first offspring of a crossover must have the sum of both parents'
    crossover counts plus one, the sum of both parents' mutation counts, and
    both parents (in order) as its predecessors.
    """
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()

    first_pipeline = (
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=False),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    second_pipeline = (
        'KNeighborsClassifier('
        'BernoulliNB(input_matrix, BernoulliNB__alpha=10.0, BernoulliNB__fit_prior=True),'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=2, '
        'KNeighborsClassifier__weights=uniform'
        ')'
    )
    ind1 = creator.Individual.from_string(first_pipeline, tpot_obj._pset)
    ind2 = creator.Individual.from_string(second_pipeline, tpot_obj._pset)

    for parent in (ind1, ind2):
        initialize_stats_dict(parent)

    # Randomise the counters so the checks do not rely on zeros.
    for parent in (ind1, ind2):
        for counter in ("crossover_count", "mutation_count"):
            parent.statistics[counter] = random.randint(0, 10)

    # Register both parents as already evaluated pipelines.
    for parent in (ind1, ind2):
        tpot_obj.evaluated_individuals_[str(parent)] = tpot_obj._combine_individual_stats(
            2, 0.99, parent.statistics
        )

    # Ten rounds of crossover.
    for _ in range(10):
        child, _second_child = tpot_obj._mate_operator(ind1, ind2)
        stats1 = ind1.statistics
        stats2 = ind2.statistics

        assert child.statistics['crossover_count'] == stats1['crossover_count'] + stats2['crossover_count'] + 1
        assert child.statistics['mutation_count'] == stats1['mutation_count'] + stats2['mutation_count']
        assert child.statistics['predecessor'] == (str(ind1), str(ind2))

        # The offspring replaces one of its two predecessors, picked at
        # random; no clone is needed.
        if random.random() < 0.5:
            ind1 = child
        else:
            ind2 = child
def MetaeaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
                       max_pipeline_size, stats=None, halloffame=None, verbose=0,
                       meta_model=None, per_generation_function=None,
                       primitives_to_hash_dic=None, gptree=None, pset=None,
                       df=None, le=None, use_meta_model_flag=True,
                       use_meta_selection_flag=False, meta_selection_size=10,
                       meta_selection_type='offspring'):
    r"""This is the :math:`(\mu + \lambda)` evolutionary algorithm with
    optional meta-model filtering of the offspring.

    :param population: A list of individuals.
    :param toolbox: A :class:`~deap.base.Toolbox` that contains the evolution
                    operators.
    :param mu: The number of individuals to select for the next generation.
    :param lambda_: The number of children to produce at each generation.
    :param cxpb: The probability that an offspring is produced by crossover.
    :param mutpb: The probability that an offspring is produced by mutation.
    :param ngen: The number of generations.
    :param pbar: Progress bar; only its ``disable`` flag and ``write`` method
                 are used here.
    :param max_pipeline_size: Forwarded to ``create_ranking_df_from_pop`` and
                              ``_selMetaTournament``.
    :param stats: A :class:`~deap.tools.Statistics` object that is updated
                  inplace, optional.
    :param halloffame: A :class:`~deap.tools.HallOfFame` object. It is only
                       read here (for progress messages when ``verbose`` is 2
                       or 3); it is not updated by this function.
    :param verbose: 2 prints the best score per generation, 3 prints the
                    whole Pareto front; other values print nothing.
    :param meta_model: Forwarded to ``rank_pop`` and ``_selMetaTournament``.
    :param per_generation_function: If supplied, called as
                                    ``per_generation_function(gen=..., pop=...)``
                                    before each generation (with the previous
                                    generation number) and once more after the
                                    last generation.
    :param primitives_to_hash_dic: Forwarded to the meta-learning helpers.
    :param gptree: Forwarded to the meta-learning helpers.
    :param pset: Forwarded to the meta-learning helpers.
    :param df: Forwarded to the meta-learning helpers.
    :param le: Forwarded to ``rank_pop`` and ``_selMetaTournament``.
    :param use_meta_model_flag: If true, the offspring are ranked with
                                ``rank_pop`` and only the individuals at the
                                returned indices are kept.
    :param use_meta_selection_flag: If true, ``_selMetaTournament`` is applied
                                    as a pre-tournament before evaluation.
    :param meta_selection_size: Forwarded to ``_selMetaTournament``.
    :param meta_selection_type: ``'offspring'`` applies the pre-tournament to
                                the offspring only, ``'pop'`` to the
                                population only, any other value to both.
    :returns: The final population
    :returns: A class:`~deap.tools.Logbook` with the statistics of the
              evolution.

    The algorithm takes in a population and evolves it using the
    :func:`varOr` function. The logbook will contain the generation number,
    the number of evaluations for each generation and the statistics if a
    :class:`~deap.tools.Statistics` is given as argument. The *cxpb* and
    *mutpb* arguments are passed to the :func:`varOr` function. The
    pseudocode goes as follows ::

        evaluate(population)
        for g in range(ngen):
            offspring = varOr(population, toolbox, lambda_, cxpb, mutpb)
            evaluate(offspring)
            population = select(population + offspring, mu)

    This function expects :meth:`toolbox.mate`, :meth:`toolbox.mutate`,
    :meth:`toolbox.select` and :meth:`toolbox.evaluate` aliases to be
    registered in the toolbox. This algorithm uses the :func:`varOr`
    variation.
    """
    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

    # Initialize statistics dict for the individuals in the population, to
    # keep track of mutation/crossover operations and predecessor relations.
    for ind in population:
        initialize_stats_dict(ind)

    population[:] = toolbox.evaluate(population)

    record = stats.compile(population) if stats is not None else {}
    logbook.record(gen=0, nevals=len(population), **record)

    # Keyword arguments shared by every meta-tournament call.
    meta_tournament_kwargs = dict(
        meta_model=meta_model,
        primitives_to_hash_dic=primitives_to_hash_dic,
        gptree=gptree,
        pset=pset,
        df=df,
        le=le,
        max_pipeline_size=max_pipeline_size,
    )

    # Keep `gen` bound even when ngen == 0, so the final callback below
    # reports generation 0 instead of raising NameError.
    gen = 0

    # Begin the generational process
    for gen in range(1, ngen + 1):
        # Save a periodic pipeline for the generation that just finished.
        if per_generation_function is not None:
            per_generation_function(gen=gen - 1, pop=population)

        # Vary the population
        offspring = varOr(population, toolbox, lambda_, cxpb, mutpb)

        # Using meta-learning to reduce offspring
        if use_meta_model_flag:
            test_df = create_ranking_df_from_pop(
                offspring, primitives_to_hash_dic, gptree, pset, df,
                max_pipeline_size)
            top_offspring_index = rank_pop(meta_model, test_df, mu, le)
            offspring = [offspring[i] for i in top_offspring_index]

        # Using meta-learning as pre-tournament: 'offspring' touches only the
        # offspring, 'pop' only the population, anything else both
        # (population first).
        if use_meta_selection_flag:
            if meta_selection_type != 'offspring':
                # Rebinds the local name; later in-place updates act on the
                # list returned here.
                population = _selMetaTournament(
                    population, mu, meta_selection_size,
                    **meta_tournament_kwargs)
            if meta_selection_type != 'pop':
                offspring = _selMetaTournament(
                    offspring, mu, meta_selection_size,
                    **meta_tournament_kwargs)

        # Update generation statistic for all individuals which have invalid
        # 'generation' stats.
        # NOTE(review): this scans `population`, not `offspring`; confirm
        # this is the intended set of individuals to stamp.
        for ind in population:
            if ind.statistics['generation'] == 'INVALID':
                ind.statistics['generation'] = gen

        # Count the individuals with an invalid fitness before evaluation;
        # the count is what the logbook reports as `nevals`.
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]

        offspring = toolbox.evaluate(offspring)

        # Select the next generation population
        population[:] = toolbox.select(population + offspring, mu)

        # pbar process
        if not pbar.disable:
            # Print only the best individual fitness
            if verbose == 2:
                high_score = max(key.wvalues[1] for key in halloffame.keys)
                pbar.write(
                    'Generation {0} - Current best internal CV score: {1}'.
                    format(gen, high_score))

            # Print the entire Pareto front
            elif verbose == 3:
                pbar.write(
                    'Generation {} - Current Pareto front scores:'.format(gen))
                for pipeline, pipeline_scores in zip(
                        halloffame.items, reversed(halloffame.keys)):
                    pbar.write('{}\t{}\t{}'.format(
                        int(pipeline_scores.wvalues[0]),
                        pipeline_scores.wvalues[1], pipeline))
                pbar.write('')

        # Update the statistics with the new population
        record = stats.compile(population) if stats is not None else {}
        logbook.record(gen=gen, nevals=len(invalid_ind), **record)

    # Report the state after the last generation.
    if per_generation_function is not None:
        per_generation_function(gen=gen, pop=population)

    return population, logbook
def _summarize_fitnesses(fitnesses):
    """Print the count of infinite scores and summarise the finite ones.

    The score is element 1 of every entry of *fitnesses*. Returns a dict with
    the keys ``avg``, ``std``, ``min``, ``max`` and ``raw``; the first four are
    NaN when there is no finite score to summarise.
    """
    scores = np.array([fit[1] for fit in fitnesses])
    is_inf = np.isinf(scores)
    print('Number of invalid pipelines: %d' % np.sum(is_inf))
    finite_scores = scores[~is_inf]
    if finite_scores.size == 0:
        # np.min / np.max raise on empty input; report NaN instead.
        nan = float('nan')
        return dict(avg=nan, std=nan, min=nan, max=nan, raw=finite_scores)
    return dict(avg=np.mean(finite_scores), std=np.std(finite_scores),
                min=np.min(finite_scores), max=np.max(finite_scores),
                raw=finite_scores)


def extendedeaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
                           stats=None, halloffame=None, verbose=0,
                           per_generation_function=None, debug=False,
                           random_seed=None, analysis=None,
                           mutation_rate=None, crossover_rate=None):
    r"""This is the :math:`(\mu + \lambda)` evolutionary algorithm, extended
    with per-generation fitness statistics and a pickled logbook.

    :param population: A list of individuals.
    :param toolbox: A :class:`~deap.base.Toolbox` that contains the evolution
                    operators.
    :param mu: The number of individuals to select for the next generation.
    :param lambda_: The number of children to produce at each generation.
    :param cxpb: The probability that an offspring is produced by crossover.
    :param mutpb: The probability that an offspring is produced by mutation.
    :param ngen: The number of generations.
    :param pbar: Progress bar; its ``disable`` flag, ``update`` and ``write``
                 are used.
    :param stats: A :class:`~deap.tools.Statistics` object that is updated
                  inplace, optional.
    :param halloffame: A :class:`~deap.tools.HallOfFame` object that will
                       contain the best individuals, optional.
    :param verbose: 2 prints the best score per generation, 3 prints the
                    whole Pareto front; other values print no progress lines.
    :param per_generation_function: If supplied, called with the generation
                                    number once after the initial evaluation
                                    (``0``) and at the start of every
                                    generation.
    :param debug: Forwarded to ``get_all_random_seed_paths``.
    :param random_seed: Required. Seeds both ``random`` and ``numpy.random``
                        and is part of the logbook file name.
    :param analysis: Forwarded to ``get_all_random_seed_paths``.
    :param mutation_rate: Forwarded to ``get_all_random_seed_paths``.
    :param crossover_rate: Forwarded to ``get_all_random_seed_paths``.
    :returns: The final population
    :returns: A class:`~deap.tools.Logbook` with the statistics of the
              evolution.
    :raises ValueError: If *random_seed* is ``None``.

    The algorithm takes in a population and evolves it in place using the
    :func:`varOr` function. For every generation the logbook records the
    generation number, the number of evaluations, and the mean, standard
    deviation, minimum, maximum and raw values of the finite scores of the
    individuals evaluated in that generation, plus the statistics if a
    :class:`~deap.tools.Statistics` is given as argument. The *cxpb* and
    *mutpb* arguments are passed to the :func:`varOr` function. The
    pseudocode goes as follows ::

        evaluate(population)
        for g in range(ngen):
            offspring = varOr(population, toolbox, lambda_, cxpb, mutpb)
            evaluate(offspring)
            population = select(population + offspring, mu)

    After the last generation the logbook is converted to a pandas DataFrame
    and pickled to ``logbook_rnd_seed<seed>.pkl`` inside the directory
    returned by ``get_all_random_seed_paths``.

    This function expects :meth:`toolbox.mate`, :meth:`toolbox.mutate`,
    :meth:`toolbox.select` and :meth:`toolbox.evaluate` aliases to be
    registered in the toolbox. This algorithm uses the :func:`varOr`
    variation.
    """
    if random_seed is None:
        raise ValueError("No fixed random seed was used!")

    # Imported up front so a missing dependency fails before the run starts.
    import pickle
    import pandas as pd

    random.seed(random_seed)
    np.random.seed(random_seed)

    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals', 'avg', 'std', 'min', 'max', 'raw'] + (stats.fields if stats else [])

    # Initialize statistics dict for the individuals in the population, to
    # keep track of mutation/crossover operations and predecessor relations.
    for ind in population:
        initialize_stats_dict(ind)

    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.evaluate(invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    if halloffame is not None:
        halloffame.update(population)

    # Fitness statistics for the initial generation, ignoring infinite
    # scores. `fitnesses` has one entry per evaluated individual, so it is
    # iterated directly rather than indexed by population size.
    complexity = np.array([fit[0] for fit in fitnesses])
    summary = _summarize_fitnesses(fitnesses)

    record = stats.compile(population) if stats is not None else {}
    logbook.record(gen=0, nevals=len(invalid_ind), complexity=complexity,
                   **summary, **record)

    # save the optimal model for initial pipeline
    gen = 0
    if per_generation_function is not None:
        per_generation_function(gen)

    # Begin the generational process
    for gen in range(1, ngen + 1):
        # after each population save a periodic pipeline
        if per_generation_function is not None:
            per_generation_function(gen)

        # Vary the population
        offspring = varOr(population, toolbox, lambda_, cxpb, mutpb)

        # Update generation statistic for all individuals which have invalid
        # 'generation' stats.
        # NOTE(review): this scans `population`, not `offspring`; confirm
        # this is the intended set of individuals to stamp.
        for ind in population:
            if ind.statistics['generation'] == 'INVALID':
                ind.statistics['generation'] = gen

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]

        # update pbar for valid individuals (with fitness values)
        if not pbar.disable:
            pbar.update(len(offspring) - len(invalid_ind))

        fitnesses = toolbox.evaluate(invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)

        # Select the next generation population
        population[:] = toolbox.select(population + offspring, mu)

        # pbar process
        if not pbar.disable:
            # Print only the best individual fitness
            if verbose == 2:
                high_score = max(key.wvalues[1] for key in halloffame.keys)
                pbar.write('Generation {0} - Current best internal CV score: {1}'.format(gen, high_score))

            # Print the entire Pareto front
            elif verbose == 3:
                pbar.write('Generation {} - Current Pareto front scores:'.format(gen))
                for pipeline, pipeline_scores in zip(halloffame.items, reversed(halloffame.keys)):
                    pbar.write('{}\t{}\t{}'.format(
                        int(pipeline_scores.wvalues[0]),
                        pipeline_scores.wvalues[1],
                        pipeline
                    ))
                pbar.write('')

        # Fitness statistics for this generation, ignoring infinite scores.
        # Only the newly evaluated offspring have an entry in `fitnesses`.
        summary = _summarize_fitnesses(fitnesses)

        # Update the statistics with the new population
        record = stats.compile(population) if stats is not None else {}
        logbook.record(gen=gen, nevals=len(invalid_ind), **summary, **record)

    # Dump logbook
    deap_df = pd.DataFrame(logbook)
    save_path = get_all_random_seed_paths(analysis, ngen, len(population), debug, mutation_rate, crossover_rate)
    save_path_df = os.path.join(save_path, 'logbook_rnd_seed%03d.pkl' % random_seed)
    with open(save_path_df, 'wb') as handle:
        pickle.dump(deap_df, handle)
    print('Saved logbook at %s' % save_path_df)

    return population, logbook