def run_automl(X, y, name, input=None, output=CategoricalVector()):
    """Fit an ``AutoML`` instance on ``(X, y)`` while reporting to three loggers.

    A Telegram logger (token taken from the ``TOKEN`` environment variable,
    channel ``@autogoal_board``), a console logger and a progress logger are
    created, in that order, and handed to ``AutoML.fit``.

    Args:
        X: Inputs, forwarded unchanged to ``AutoML.fit``.
        y: Targets, forwarded unchanged to ``AutoML.fit``.
        name: Label given to the Telegram logger.
        input: Forwarded as ``AutoML(input=...)``.
        output: Forwarded as ``AutoML(output=...)``. The default instance is
            created once, when the function is defined.

    Raises:
        KeyError: If the ``TOKEN`` environment variable is not set.
    """
    loggers = [
        TelegramLogger(
            token=os.environ["TOKEN"],
            channel="@autogoal_board",
            name=name,
        ),
        ConsoleLogger(),
        ProgressLogger(),
    ]

    # 2 * 60 * 60 is presumably a duration in seconds (two hours) -- TODO confirm.
    search_options = {"search_timeout": 2 * 60 * 60, "pop_size": 50}

    automl = AutoML(
        search_iterations=1000,
        metalearning_log=True,
        search_kwargs=search_options,
        errors="ignore",
        input=input,
        output=output,
        cross_validation_steps=1,
    )
    automl.fit(X, y, logger=loggers)
class CustomLogger(Logger):
    """Logger that appends errors and new best solutions to local text files."""

    def error(self, e: Exception, solution):
        """Append the failing solution and its exception to ``haha_errors.log``.

        Nothing is written when either ``e`` or ``solution`` is falsy.
        """
        if not (e and solution):
            return

        entry = f"solution={repr(solution)}\nerror={repr(e)}\n\n"
        with open("haha_errors.log", "a") as fp:
            fp.write(entry)

    def update_best(self, new_best, new_fn, *args):
        """Append the new best solution and its fitness to ``haha.log``.

        Extra positional arguments are accepted and ignored.
        """
        entry = f"solution={repr(new_best)}\nfitness={new_fn}\n\n"
        with open("haha.log", "a") as fp:
            fp.write(entry)


# Basic logging configuration.

logger = MemoryLogger()
loggers = [
    ProgressLogger(),
    ConsoleLogger(),
    logger,
]

if args.token:
    # Imported lazily so the Telegram integration is only needed when a token
    # is supplied.
    from autogoal.contrib.telegram import TelegramLogger

    telegram = TelegramLogger(
        token=args.token,
        name=f"HAHA",
        channel=args.channel,
    )
    loggers.append(telegram)

# Finally, loading the HAHA dataset, running the `AutoML` instance,
# and printing the results.

X_train, y_train, X_test, y_test = haha.load(max_examples=args.examples)
from autogoal.contrib.keras import KerasSequenceClassifier
from autogoal.contrib.torch import BertTokenizeEmbedding
from autogoal.datasets import haha
from autogoal.kb import CategoricalVector, List, Sentence, Tuple
from autogoal.ml import AutoML
from autogoal.search import ConsoleLogger, ProgressLogger

# AutoML search from a list of sentences to a categorical vector, restricted
# to the Keras sequence classifier and the BERT tokenize-embedding algorithm.
#
# The search limits are both set to 0 here; the previously used setting was
# search_kwargs=dict(memory_limit=4 * 1024 ** 3, evaluation_timeout=60).
classifier = AutoML(
    input=List(Sentence()),
    output=CategoricalVector(),
    registry=[KerasSequenceClassifier, BertTokenizeEmbedding],
    search_kwargs={"memory_limit": 0, "evaluation_timeout": 0},
)

# NOTE(review): another snippet in this source unpacks `haha.load(...)` as
# (X_train, y_train, X_test, y_test); confirm which order `haha.load` returns.
Xtrain, Xtest, ytrain, ytest = haha.load(max_examples=10)

# Manual single-pipeline run, kept for reference:
# embedding = BertEmbedding()
# tokens = embedding.run(Xtrain)
# classifier = KerasSequenceClassifier().sample()
# classifier.run((tokens, ytrain))

classifier.fit(
    Xtrain,
    ytrain,
    logger=[ConsoleLogger(), ProgressLogger()],
)
import numpy as np

from autogoal.ml import AutoML
from autogoal.contrib.keras import KerasImageClassifier, KerasImagePreprocessor
from autogoal.datasets import cifar10
from autogoal.kb import CategoricalVector, Tensor4
from autogoal.search import ConsoleLogger, ProgressLogger

# AutoML search over Keras image classifiers for CIFAR-10.
# Alternative registry that also includes the preprocessor:
# registry=[KerasImageClassifier, KerasImagePreprocessor],
automl = AutoML(
    input=Tensor4(),
    output=CategoricalVector(),
    registry=[KerasImageClassifier],
    cross_validation_steps=1,
    search_kwargs={
        "pop_size": 20,
        "search_timeout": 24 * 60 * 60,
        "evaluation_timeout": 0,
        "memory_limit": 0,
        "save": False,
    },
    search_iterations=1000,
    validation_split=1 / 6,
)

Xtrain, ytrain, Xtest, ytest = cifar10.load()

# Both loaded splits are stacked back together before fitting; the AutoML
# instance above is configured with its own `validation_split`.
X = np.vstack([Xtrain, Xtest])
y = np.hstack([ytrain, ytest])

automl.fit(
    X,
    y,
    logger=[ConsoleLogger(), ProgressLogger()],
)
print(solution) except ValueError: continue # To evaluate how good a formula is, we simply feed the expression instance # with a sequence of numbers from 1 to 9. If the expression requires more # than 9 digits, it results in an error. The actual value of performing # corresponding operations is done in the `__call__` method of the expression classes. def evaluate(expr): def stream(): for i in range(1, 10): yield i raise ValueError("Too many values asked") return expr(stream()) # We will run 1000 iterations of each search strategy to compare their long-term performance. search_rand = RandomSearch(grammar, evaluate, errors='ignore') best_rand, best_fn_rand = search_rand.run(1000, logger=[ConsoleLogger(), ProgressLogger()]) search_pe = PESearch(grammar, evaluate, pop_size=10, errors='ignore') best_pe, best_fn_pe = search_pe.run(1000, logger=[ConsoleLogger(), ProgressLogger()]) # And here are the results. print(best_rand, best_fn_rand) print(best_pe, best_fn_pe)
from autogoal.contrib.keras import KerasClassifier
from autogoal.datasets import cars
from autogoal.kb import CategoricalVector, MatrixContinuousDense
from autogoal.ml import AutoML
from autogoal.search import ConsoleLogger, ProgressLogger

# AutoML search over Keras classifiers taking a dense continuous matrix as
# input; both search limits are set to 0.
classifier = AutoML(
    input=MatrixContinuousDense(),
    registry=[KerasClassifier],
    search_kwargs={"memory_limit": 0, "evaluation_timeout": 0},
)

X, y = cars.load()

classifier.fit(
    X,
    y,
    logger=[ConsoleLogger(), ProgressLogger()],
)
# with a sequence of numbers from 1 to 9. If the expression requires more
# than 9 digits, it results in an error. The actual value of performing
# corresponding operations is done in the `__call__` method of the expression classes.


def evaluate(expr):
    """Evaluate ``expr`` against a generator yielding 1, 2, ..., 9.

    A ``ValueError`` is raised from the generator when ``expr`` requests more
    than nine values. The result of calling ``expr`` is returned as-is.
    """

    def digits():
        current = 1
        while current < 10:
            yield current
            current += 1

        raise ValueError("Too many values asked")

    return expr(digits())


# We will run 1000 iterations of each search strategy to compare their long-term performance.

search_rand = RandomSearch(grammar, evaluate, errors="ignore")
best_rand, best_fn_rand = search_rand.run(
    1000,
    logger=[ConsoleLogger(), ProgressLogger()],
)

search_pe = PESearch(grammar, evaluate, pop_size=10, errors="ignore")
best_pe, best_fn_pe = search_pe.run(
    1000,
    logger=[ConsoleLogger(), ProgressLogger()],
)

# And here are the results.

print(best_rand, best_fn_rand)
print(best_pe, best_fn_pe)
try: with open(fR"{path}/binary_X", 'rb') as xfd, open(fR"{path}/binary_Y", 'rb') as yfd: X = pickle.load(xfd) y = pickle.load(yfd) return X, y except Exception as e: #TODO: implement corpus reading from directories print(e) pass if __name__ == "__main__": g = generate_cfg(SklearnNLPClassifier) X, y = load_movie_reviews(100) # X, y = load_corpus("examples/Revolico") def fitness(pipeline): pipeline.fit(X, y) score = pipeline.score(X, y) return score search = RandomSearch(g, fitness, random_state=0, errors='warn', evaluation_timeout=100) result = search.run(50, logger=ProgressLogger())
# care of train/test splitting, fitting a pipeline in the training set and computing
# the accuracy on the test set.

fitness_fn = movie_reviews.make_fn(examples=100)

# ### Random search
#
# The `RandomSearch` strategy simply calls `grammar.sample()` a bunch of times
# and stores the best performing pipeline. It has no intelligence whatsoever,
# but it serves as a good baseline implementation.
#
# We will run it for a total of `1000` fitness evaluations, or equivalently, a total
# of `1000` different random pipelines. To see what's actually going on we will use
# the wonderful `enlighten` library through our implementation `ProgressLogger`.

logger = ProgressLogger()

random_search = RandomSearch(
    grammar,
    fitness_fn,
    random_state=0,
)
best_rand, fn_rand = random_search.run(1000, logger=logger)

# !!! note
#     For reproducibility purposes we can pass a fixed random seed in `random_state`.
#
# ### Evolutionary Search
#
# Random search is fun, but to search with purpose, we need a more intelligent sampling
# strategy. The `PESearch` (short for Probabilistic Evolutionary Search, phew), does just that.
# It starts with a random sampling strategy, but as it evaluates more pipelines, it modifies
# a probabilistic sampling model so that pipelines similar to the best ones found are more
# commonly sampled.
#
from autogoal.contrib.sklearn import SklearnClassifier
from autogoal.grammar import generate_cfg
from autogoal.search import RandomSearch, ProgressLogger

from sklearn.datasets import make_classification

g = generate_cfg(SklearnClassifier)
X, y = make_classification()

print(g)


def fitness(pipeline):
    """Fit ``pipeline`` on the module-level ``(X, y)`` and return its score on it."""
    pipeline.fit(X, y)
    score = pipeline.score(X, y)
    return score


search = RandomSearch(
    g,
    fitness,
    random_state=0,
    errors="warn",
)
search.run(1000, logger=ProgressLogger())
# care of train/test splitting, fitting a pipeline in the training set and computing
# the accuracy on the test set.

fitness_fn = movie_reviews.make_fn(examples=100)

# ### Random search
#
# The `RandomSearch` strategy simply calls `grammar.sample()` a bunch of times
# and stores the best performing pipeline. It has no intelligence whatsoever,
# but it serves as a good baseline implementation.
#
# We will run it for a total of `1000` fitness evaluations, or equivalently, a total
# of `1000` different random pipelines. To see what's actually going on we will use
# the wonderful `enlighten` library through our implementation `ProgressLogger`.

logger = ProgressLogger(log_solutions=True)

random_search = RandomSearch(
    grammar,
    fitness_fn,
    random_state=0,
)
best_rand, fn_rand = random_search.run(1000, logger=logger)

# !!! note
#     For reproducibility purposes we can pass a fixed random seed in `random_state`.
#
# ### Evolutionary Search
#
# Random search is fun, but to search with purpose, we need a more intelligent sampling
# strategy. The `PESearch` (short for Probabilistic Evolutionary Search, phew), does just that.
# It starts with a random sampling strategy, but as it evaluates more pipelines, it modifies
# a probabilistic sampling model so that pipelines similar to the best ones found are more
# commonly sampled.
#