def simple_es():
    # Load BTC OHLCV data and normalize column names.
    df = pd.read_csv('btc_etc.csv').rename(columns={
        'Close': 'close',
        'Date time': 'datetime',
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Volume': 'volume'
    })

    @timeit
    def get_reward(weights, df):
        # Fitness = backtest profit with the candidate weights, minus a fixed baseline.
        ds = DataSeries(df)
        bt = NNBT(ds, balance=1000.0, weights=weights)
        bt.run()
        return bt.get_profit() - 200.0

    model = get_model()
    es = EvolutionStrategy(model.get_weights(), get_reward,
                           population_size=10, sigma=0.1, learning_rate=0.001,
                           get_reward_func_args=[df])  # forward df to get_reward
    es.run(1000, print_step=1)
def main():
    # make the environment
    env = gym.make(ENV_NAME)

    # declare the model and ES object
    model = Model()
    es = EvolutionStrategy(env, model, population_size=POPULATION_SIZE,
                           alpha=ALPHA, sigma=SIGMA, gamma=GAMMA)

    # for each generation
    for generation in range(NUM_GENERATIONS):
        # show how well the model is doing in its current state
        es.run_act(True, True)
        # train for one generation
        es.train(1)
from es import EvolutionStrategy
import numpy as np
from game import Game, play
from win import Window, GAME_SPEED
import gi
from gi.repository import Gtk, GLib, Gdk
from os import path
import os
import time

es = EvolutionStrategy(fn=play, noisep=50, sigma=0.1, alpha=0.001,
                       layer_sizes=[[4, 500], [500, 1]], input_size=4)

load = path.join(path.dirname(__file__), 'load.npy')
# if load.npy exists, load the parameters from it
if path.exists(load):
    es.layers = np.load(load)


def step(game, update):
    win = Window(game)
    GLib.timeout_add(GAME_SPEED, lambda: timeout_kill(win, game))
    GLib.timeout_add(GAME_SPEED, update)
    GLib.timeout_add(GAME_SPEED, win.update)
    win.show_all()
    Gtk.main()
def main(args):
    print("IT'S DANGEROUS TO GO ALONE! TAKE THIS.")
    np.random.seed(0)
    pt.manual_seed(0)

    env = BipedalWalker()
    env.seed(0)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    print(f"Initializing agent (device={device})...")
    rnn = WorldModel(obs_dim, act_dim)
    ctrl = Controller(obs_dim + rnn.hid_dim, act_dim)

    # Adjust population size based on the number of available CPUs.
    num_workers = mp.cpu_count() if args.nproc is None else args.nproc
    num_workers = min(num_workers, mp.cpu_count())
    agents_per_worker = args.popsize // num_workers
    popsize = num_workers * agents_per_worker

    print(f"Initializing population of {popsize} agents across {num_workers} workers...")
    pop = Population(num_workers, agents_per_worker)

    global_mu = np.zeros_like(ctrl.genotype)
    loss_logger = ValueLogger('ha_rnn_loss', bufsize=20)
    best_logger = ValueLogger('ha_ctrl_best', bufsize=100)

    # Train the RNN with random policies.
    print("Training M model with a random policy...")
    optimizer = optim.Adam(rnn.parameters(), lr=args.lr)
    train_rnn(rnn, optimizer, pop, random_policy=True,
              num_rollouts=args.num_rollouts, logger=loss_logger)
    loss_logger.plot('M model training loss', 'step', 'loss')

    # Upload the trained RNN.
    success = pop.upload_rnn(rnn.cpu())
    assert success

    # Iteratively update controller and RNN.
    for i in range(args.niter):
        # Evolve controllers with the trained RNN.
        print(f"Iter. {i}: Evolving C model...")
        es = EvolutionStrategy(global_mu, args.sigma0, popsize)
        evolve_ctrl(ctrl, es, pop, num_gen=args.num_gen, logger=best_logger)
        best_logger.plot('C model evolution', 'gen', 'fitness')

        # Update the global best individual and upload it.
        global_mu = np.copy(ctrl.genotype)
        success = pop.upload_ctrl(global_mu, noisy=True)
        assert success

        # Train the RNN with the current best controller.
        print(f"Iter. {i}: Training M model...")
        train_rnn(rnn, optimizer, pop, random_policy=False,
                  num_rollouts=args.num_rollouts, logger=loss_logger)
        loss_logger.plot('M model training loss', 'step', 'loss')

        # Upload the trained RNN.
        success = pop.upload_rnn(rnn.cpu())
        assert success

    # Test run!
    rollout(env, rnn, ctrl, render=True)

    success = pop.close()
    assert success
def run(start_run, tot_runs, num_iterations, print_steps, output_results, num_workers):
    runs = {}
    hyperparam_search = False
    if start_run > 0 and tot_runs > 1:
        hyperparam_search = True

    for i in range(start_run, tot_runs):
        chosen_before = False
        if hyperparam_search:
            # Sample random hyperparameters for this run.
            npop = np.random.randint(1, 151)
            sample = np.random.rand(np.maximum(0, npop))
            sample_std = np.std(sample)
            sigma = np.round(np.sqrt(np.random.chisquare(sample_std, 1)), 2)[0]
            learning_rate_selection = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
            alpha = np.random.choice(learning_rate_selection)
            for key in runs.keys():
                if runs[key] == [npop, sigma, alpha]:
                    chosen_before = True
                    print('skipping run, as hyperparams [{}] have been chosen before'
                          .format([npop, sigma, alpha]))
        else:
            # default - best hyperparams
            npop = 50
            sigma = 0.1
            alpha = 0.001

        # will only run if hyperparams have not been chosen before
        if not chosen_before:
            runs[i] = [npop, sigma, alpha]
            print('hyperparams chosen -> npop:{} sigma:{} alpha:{}'.format(npop, sigma, alpha))

            es = EvolutionStrategy(model.get_weights(), get_reward,
                                   population_size=npop, sigma=sigma, learning_rate=alpha)

            if num_workers == 1:
                # single-thread version
                metrics = es.run(num_iterations, print_steps)
            else:
                # distributed version (assumes run_dist returns the same metrics table)
                metrics = es.run_dist(num_iterations, print_steps, num_workers)

            if output_results:
                RUN_SUMMARY_LOC = '../run_summaries/'
                print('saving results to loc:', RUN_SUMMARY_LOC)
                results = pd.DataFrame(
                    np.array(metrics).reshape(num_iterations // print_steps, 6),
                    columns=['run_name', 'iteration', 'timestamp',
                             'accuracy_test', 'accuracy_val', 'accuracy_train'])
                filename = os.path.join(RUN_SUMMARY_LOC, results['run_name'][0] + '.csv')
                results.to_csv(filename, sep=',')

    print("Total Time usage: " +
          str(timedelta(seconds=int(round(time.time() - start_time)))))
if new_training:
    training = Training.create(training_name, pop_size, sigma, learning_rate)
else:
    training = Training.load(training_name)

model = training.model.copy()


def get_reward(weights):
    # Fitness is the summed score across two levels.
    model.set_weights(weights)
    agent = SMBAgent("Level1-1")
    fitness1, _ = agent.play(model, render)
    agent.change_env("Level1-2")
    fitness2, _ = agent.play(model, render)
    fitness = fitness1 + fitness2
    return fitness


es = EvolutionStrategy(training.model.get_weights(), get_reward,
                       training.population_size, training.sigma,
                       training.learning_rate)

while True:
    (main_weights, main_reward), (population_weights, population_rewards) = es.run_generation()
    training.save(main_weights, main_reward, population_weights, population_rewards)
    model.compile(optimizer='Adam', loss='mse')
    return model


def get_reward(weights):
    model = get_model()
    model.set_weights(weights)
    total_steps = 0
    # Fitness is the average episode length over 20 episodes, minus a baseline.
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            # env.render()
            action = np.argmax(model.predict(np.expand_dims(observation, 0)))
            observation, reward, done, info = env.step(action)
            total_steps += 1
            if done:
                break
    reward = total_steps / 20.0
    return reward - 100.0


model = get_model()
es = EvolutionStrategy(model.get_weights(), get_reward,
                       population_size=50, sigma=0.1, learning_rate=0.001)
es.run(1000, print_step=1)
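# The snippets above all hand a list of weight arrays and a get_reward function
# to an EvolutionStrategy object. What follows is a minimal sketch of the kind
# of update such a class typically performs (an OpenAI-style evolution strategy:
# perturb the weights with Gaussian noise, score each perturbation, and move the
# weights along the reward-weighted average of the noise). It is NOT the
# library's actual implementation; the function name es_step is illustrative.
import numpy as np


def es_step(weights, get_reward, population_size=50, sigma=0.1, learning_rate=0.001):
    # Sample one Gaussian noise tensor per layer for every population member.
    population = [[np.random.randn(*w.shape) for w in weights]
                  for _ in range(population_size)]

    # Score each perturbed candidate with the user-supplied reward function.
    rewards = np.zeros(population_size)
    for i, noise in enumerate(population):
        candidate = [w + sigma * n for w, n in zip(weights, noise)]
        rewards[i] = get_reward(candidate)

    # Normalize rewards and step along the reward-weighted average noise.
    if rewards.std() == 0:
        return weights
    advantages = (rewards - rewards.mean()) / rewards.std()
    step_scale = learning_rate / (population_size * sigma)
    return [w + step_scale * sum(a * n[j] for a, n in zip(advantages, population))
            for j, w in enumerate(weights)]


# Illustrative usage: repeat the step for a fixed number of generations.
# for _ in range(1000):
#     weights = es_step(weights, get_reward)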