                        action='store_true', help='whether train DQN')
    parser.add_argument('--test_dqn', action='store_true',
                        help='whether test DQN')
    try:
        from argument import add_arguments
        parser = add_arguments(parser)
    except ImportError:  # fall back when no extra arguments module is present
        pass
    args = parser.parse_args()
    return args


args = parse()
env = Environment('BreakoutNoFrameskip-v4', "", atari_wrapper=True, test=False)
n = env.action_space
state = env.reset()
device = torch.device("cpu")
state_tensor = torch.tensor(state, device=device)  # renamed from `input` to avoid shadowing the built-in
agent = Agent_DQN(env, args)
dqn = DQN()
torch.save(dqn.state_dict(), "checkpoint.pth")
state_dict = torch.load("checkpoint.pth")  # loaded but not applied; dqn.load_state_dict(state_dict) would restore it
agent.train()

# Experience = namedtuple(
#     'Experience',
#     ('state', 'action', 'next_state', 'reward')
# )
# e = Experience(state, action, next_state, reward)
# def minibatch(self, size):
#     random_array = np.random.choice(len(self.buffer), size, replace=False)
#     minibatch = np.zeros(size)
#     for i in random_array:
#         minibatch[i] = self.buffer[i]
#     # minibatch = np.array([self.buffer[i] for i in random_array])
#     return minibatch
# (a corrected sketch of this sampler follows this snippet)


# Main entry point
if __name__ == "__main__":
    # Create an environment.
    # If display is True, the environment is rendered after every agent step;
    # set it to False to speed up training. The evaluation in part 2 of the
    # coursework is timed with display=False.
    # Magnification determines how big the window is when the environment is
    # displayed on your monitor. A value of 1000 suits desktop monitors and
    # 500 suits laptops; it only affects the visualisation, not the underlying
    # state space or the learning.
    environment = Environment(display=False, magnification=500)
    # Create an agent
    agent = Agent(environment)
    # Create a DQN (Deep Q-Network)
    dqn = DQN()
    my_buffer = ReplayBuffer()
    losses = []
    iterations = []
    episode_length = 20
    # fig, ax = plt.subplots()
    # ax.set(xlabel='Iteration', ylabel='Loss', title='Loss Curve')
    plt.ion()
    training_iteration = 0
    # Loop over episodes
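# A corrected sketch of the commented-out sampler above (an assumption, not
# the author's final code): the original writes into a length-`size` zeros
# array at buffer indices, which can fall out of range and silently drops
# transitions. Returning the selected transitions directly avoids both
# problems.
def minibatch(self, size):
    random_array = np.random.choice(len(self.buffer), size, replace=False)
    return [self.buffer[i] for i in random_array]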
def process(self, sess, global_t, summary_writer, summary_op,
            summary_placeholders):
    if self.env is None:
        # lazy evaluation
        time.sleep(self.thread_index * 1.0)
        self.env = Environment({
            'scene_name': self.scene_scope,
            'terminal_state_id': int(self.task_scope)
        })

    start_local_t = self.local_t

    # Initialization
    states = []
    actions = []
    rewards = []
    values = []
    targets = []
    terminal_end = False

    # Reset accumulated gradient variables
    sess.run(self.reset_gradients)

    # Obtain shared parameters from global
    sess.run(self.sync)

    # t_max times loop
    for i in range(LOCAL_T_MAX):
        pi_, value_ = self.local_network.run_policy_and_value(
            sess, self.env.s_t, self.env.target)
        pi_ = np.array(pi_) / np.sum(pi_)
        action = self.choose_action(pi_)

        states.append(self.env.s_t)
        actions.append(action)
        values.append(value_)
        targets.append(self.env.target)

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 1000) == 0:
            sys.stdout.write("%s:" % self.scene_scope)
            sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

        # process game
        self.env.step(action)

        # receive game result
        reward = self.env.reward
        terminal = self.env.terminal

        # ad-hoc reward for navigation
        # reward = 10.0 if terminal else -0.01
        if self.episode_length > 5e3:
            terminal = True

        self.episode_reward += reward
        self.episode_length += 1
        self.episode_max_q = max(self.episode_max_q, np.max(value_))

        # clip reward
        rewards.append(np.clip(reward, -1, 1))

        self.local_t += 1

        if terminal:
            terminal_end = True
            sys.stdout.write(
                "#Thread: %d \n time %d | thread #%d | scene %s | target #%s\n"
                "%s %s episode reward = %.3f\n"
                "%s %s episode length = %d\n"
                "%s %s episode max Q = %.3f\n" %
                (self.thread_index, global_t, self.thread_index,
                 self.scene_scope, self.task_scope,
                 self.scene_scope, self.task_scope, self.episode_reward,
                 self.scene_scope, self.task_scope, self.episode_length,
                 self.scene_scope, self.task_scope, self.episode_max_q))

            summary_values = {
                "episode_reward_input": self.episode_reward,
                "episode_length_input": float(self.episode_length),
                "episode_max_q_input": self.episode_max_q,
                "learning_rate_input": self._anneal_learning_rate(global_t)
            }

            self._record_score(sess, summary_writer, summary_op,
                               summary_placeholders, summary_values, global_t)

            self.episode_reward = 0
            self.episode_length = 0
            self.episode_max_q = -np.inf
            self.env.reset()
            break

    R = 0.0
    if not terminal_end:
        R = self.local_network.run_value(sess, self.env.s_t, self.env.target)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()
    targets.reverse()  # keep targets aligned with the other reversed lists (missing in the original)

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []
    batch_t = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values, targets):
        R = ri + GAMMA * R
        td = R - Vi
        a = np.zeros([ACTION_SIZE])
        a[ai] = 1

        batch_si.append(si)
        batch_a.append(a)
        batch_td.append(td)
        batch_R.append(R)
        batch_t.append(ti)

    sess.run(self.accum_gradients,
             feed_dict={
                 self.local_network.s: batch_si,
                 self.local_network.a: batch_a,
                 self.local_network.t: batch_t,
                 self.local_network.td: batch_td,
                 self.local_network.r: batch_R
             })

    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run(self.apply_gradients,
             feed_dict={self.learning_rate_input: cur_learning_rate})

    if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
        sys.stdout.write("#Thread-%d-%s-Local timestep-%d\n" %
                         (self.thread_index, self.scene_scope, self.local_t))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
#
# INITIALIZATION
#
environment_directory = str(args[1])
identifier = str(args[2])
log_directory = str(args[3])
measurement_directory = str(args[4])

# Configure logging parameters so we get output while the program runs
logging.basicConfig(format='%(asctime)s %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    filename=log_directory + identifier + ".log",
                    level=logging.INFO)
logging.info('START logging for run: %s',
             environment_directory + identifier + ".xml")

environment = Environment()
environment.initialize(environment_directory, identifier)
runner = Runner()
measurement = Measurement()

#
# UPDATE STEP
#
for i in range(environment.parameters.numSimulations):
    logging.info('  STARTED with run %s', str(i))
    environment.initialize(environment_directory, identifier)
    # check that environment file has been read correctly
    # environment.write_environment_file(identifier)
    runner.initialize(environment)
    measurement.initialize()  # clear the previous measurement
def train(self):
    self.model.load_weights(self.weight)
    self.model.compile(loss='mse', optimizer=Adam(lr=self.alpha))
    self.epsilon = 0.1
    for trial in range(self.ntrials):
        s = self.env.reset(test=trial + 1)
        s = np.reshape(s, [1, self.nstates])
        treward = 0
        while True:
            a = self.epsilon_greedy(s)
            s2, r, done = self.env.step(a)
            s = np.reshape(s2, [1, self.nstates])
            treward += r
            if done:
                rospy.loginfo('test: ' + str(trial + 1) +
                              ' reward: ' + str(treward))
                break


if __name__ == '__main__':
    if not len(sys.argv) > 2:
        assert False, 'missing model and/or weight'
    rospy.init_node('ddqn_test')
    rospy.loginfo('start testing')
    env = Environment()
    agent = Agent(env, str(sys.argv[1]), str(sys.argv[2]))
    agent.train()
    env.reset()
    rospy.loginfo('COMPLETE TESTING')
    rospy.spin()
def __init__(self, model_code, gamma=0.975, field='champions', crop_style=0,
             gray=False, fps=8, history_length=2700, train_interval=150,
             num_epochs=1, keep_training=False, td=1, batch_size=32,
             epsilon_decay=0.9, epsilon_floor=1/16, decay_interval=10,
             initial_epsilon=None):
    # reward_model = reward_models.get_model_F()
    # reward_func = reward_models.create_reward_func(self.reward_model)
    self.model_code = model_code
    self.gray = gray
    self.fps = fps
    self.field = field
    self.crop_style = crop_style
    self.gamma = gamma
    self.env = Environment(frame_time=1/fps, gray=gray, field=field,
                           crop_style=crop_style)
    self.history_length = history_length
    self.td = td  # Determines how much TD/MC to use
    self.batch_size = batch_size
    self.epsilon_decay = epsilon_decay
    self.epsilon_floor = epsilon_floor
    self.decay_interval = decay_interval
    self.trial_count = 0

    self.model_path = 'model_data/{}/q_net/{}'.format(field, model_code)
    for designator in (crop_style, 'G' if gray else 'C', fps, int(gamma * 1000)):
        self.model_path += '_{}'.format(designator)
    self.model_path += '/'
    if not os.path.exists(self.model_path):
        os.makedirs(self.model_path)
    print('Model path: {}'.format(self.model_path))
    print()
    print('Field: {}'.format(self.field))
    print('Model code: {}'.format(self.model_code))
    print('Crop style: {}'.format(self.crop_style))
    print('Gray: {}'.format(self.gray))
    print('FPS: {}'.format(self.fps))
    print('Gamma: {}'.format(self.gamma))

    try:
        self.acting_model = load_model(self.model_path + 'latest.hdf5')
        self.training_model = load_model(self.model_path + 'latest.hdf5')
        print('Loaded model parameters from disk')
    except OSError:
        # Look up the model constructor by name (getattr instead of the original eval)
        get_model = getattr(models, 'get_model_{}'.format(self.model_code))
        self.acting_model = get_model(self.env.ball_obs_dims,
                                      self.env.car_obs_dims)
        self.training_model = get_model(self.env.ball_obs_dims,
                                        self.env.car_obs_dims)
        self.training_model.set_weights(self.acting_model.get_weights())
        print('Generated new parameters')

    self.trials = []
    if os.path.exists(self.model_path + 'history.pkl'):
        ball_history = np.load(self.model_path + 'ball_history.npy')
        car_history = np.load(self.model_path + 'car_history.npy')
        with open(self.model_path + 'history.pkl', 'rb') as history:
            (action_history, reward_history, discounted_reward_history,
             history_index) = pickle.load(history)
        old_len = len(reward_history)
        next_idx = history_index % old_len
        if old_len == self.history_length:
            self.ball_history = ball_history
            self.car_history = car_history
            self.action_history = action_history
            self.reward_history = reward_history
            self.discounted_reward_history = discounted_reward_history
            self.history_index = history_index
        else:
            self.ball_history = np.zeros((self.history_length,) + self.env.ball_obs_dims)
            self.car_history = np.zeros((self.history_length,) + self.env.car_obs_dims)
            self.action_history = np.zeros(self.history_length, dtype=np.int8)
            self.reward_history = np.zeros(self.history_length)
            self.discounted_reward_history = np.zeros(self.history_length)
            if history_index > old_len:  # Looped
                # TODO Copy end of array first
                if old_len - next_idx > self.history_length:
                    # new history length is shorter than first section of old history
                    self.ball_history[:] = ball_history[next_idx:next_idx + self.history_length]
                    self.car_history[:] = car_history[next_idx:next_idx + self.history_length]
                    self.action_history[:] = action_history[next_idx:next_idx + self.history_length]
                    self.reward_history[:] = reward_history[next_idx:next_idx + self.history_length]
                    self.discounted_reward_history[:] = \
                        discounted_reward_history[next_idx:next_idx + self.history_length]
                    self.history_index = history_index
                else:
                    # new history length is longer than first section of old history
                    self.ball_history[:old_len - next_idx] = ball_history[next_idx:]
                    self.car_history[:old_len - next_idx] = car_history[next_idx:]
                    self.action_history[:old_len - next_idx] = action_history[next_idx:]
                    self.reward_history[:old_len - next_idx] = reward_history[next_idx:]
                    self.discounted_reward_history[:old_len - next_idx] = \
                        discounted_reward_history[next_idx:]
                    # copy second section, when new history is full or we run out of old history
                    stop = min(self.history_length - (old_len - next_idx), next_idx)
                    self.ball_history[old_len - next_idx:old_len] = ball_history[:stop]
                    self.car_history[old_len - next_idx:old_len] = car_history[:stop]
                    self.action_history[old_len - next_idx:old_len] = action_history[:stop]
                    self.reward_history[old_len - next_idx:old_len] = reward_history[:stop]
                    self.discounted_reward_history[old_len - next_idx:old_len] = \
                        discounted_reward_history[:stop]
                    self.history_index = old_len
            else:  # Not looped
                stop = min(self.history_length, history_index)
                self.ball_history[:stop] = ball_history[:stop]
                self.car_history[:stop] = car_history[:stop]
                self.action_history[:stop] = action_history[:stop]
                self.reward_history[:stop] = reward_history[:stop]
                self.discounted_reward_history[:stop] = discounted_reward_history[:stop]
                self.history_index = stop
        if initial_epsilon is None:
            initial_epsilon = max(
                self.epsilon_floor,
                self.epsilon_decay ** (self.history_index / self.decay_interval))
    else:
        self.ball_history = np.zeros((self.history_length,) + self.env.ball_obs_dims)
        self.car_history = np.zeros((self.history_length,) + self.env.car_obs_dims)
        self.action_history = np.zeros(self.history_length, dtype=np.int8)
        self.reward_history = np.zeros(self.history_length)
        self.discounted_reward_history = np.zeros(self.history_length)
        self.history_index = 0
        if initial_epsilon is None:
            initial_epsilon = 1.0
    print("epsilon = {}".format(initial_epsilon))

    self.train_interval = train_interval
    self.num_epochs = num_epochs
    self.keep_training = keep_training
    self.trainer = Trainer(self, num_epochs=self.num_epochs)
    self.trainer.finished = True
    self.agent = Agent(self.acting_model, initial_epsilon,
                       self.env.ball_obs_dims, self.env.car_obs_dims)
    self.frames_in_buffer = 0
    # Initializes reward models
    self.env.reset(read_only=True)
    self.playing = False
    print("Ready")
# imports other libs
import time
import numpy as np
from math import fabs, sqrt
import glob, os

experiment_name = 'individual_demo'
if not os.path.exists(experiment_name):
    os.makedirs(experiment_name)

# initializes simulation in individual evolution mode, for single static enemy.
env = Environment(experiment_name=experiment_name,
                  enemies=[2],
                  playermode="ai",
                  player_controller=player_controller(),
                  enemymode="static",
                  level=2,
                  speed="fastest")

# default environment fitness is assumed for experiment
env.state_to_log()  # checks environment state

# Optimization for controller solution (best genotype = weights for the
# phenotype network): Genetic Algorithm
ini = time.time()  # sets time marker

# genetic algorithm params
run_mode = 'train'  # train or test
def run(args):
    if args.test_dqn:
        env = Environment('BreakoutNoFrameskip-v4', args,
                          atari_wrapper=True, test=True)
        from agent_dqn import Agent_DQN
        agent = Agent_DQN(env, args)
        test(agent, env, total_episodes=100)
def train(self):
    """
    Learn your (final) policy.
    Use the evolution strategy algorithm CMA-ES: https://pypi.org/project/cma/
    Possible actions: [0, 1, 2]
    Observation ranges (tuple):
        - position: [-1.2, 0.6]
        - velocity: [-0.07, 0.07]
    """
    # 1 - Define state features
    # 2 - Define search space (to define a policy)
    # 3 - Define objective function (for policy evaluation)
    # 4 - Use CMA-ES to optimize the objective function
    # 5 - Save optimal policy
    generations = 10000
    for i in range(generations):
        solutions = self.es.ask()
        print("iteration:", i, " ;")
        result = []
        for solution in solutions:
            env = Environment()
            self.w1_flat = np.array(solution[0:len(self.w1_flat)])
            self.b1_flat = np.array(
                solution[len(self.w1_flat):len(self.w1_flat) + len(self.b1_flat)])
            self.w2_flat = np.array(
                solution[len(self.w1_flat) + len(self.b1_flat):
                         len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat)])
            self.b2_flat = np.array(
                solution[len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat):
                         len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat) + len(self.b2_flat)])
            done = False
            accumulated_reward = 0
            while not done:
                observation = env.observe()
                reward, done = env.act(self.act(observation))
                accumulated_reward += reward
            result.append(-accumulated_reward)
        self.es.tell(solutions, result)
        # result averages 200 when the aim could not be achieved; less is better.
        if np.mean(result) < 100:
            print("Good generation found")
            break
    index = np.argmin(result)
    weight = solutions[index]
    np.save("weights.npy", weight)
    self.w1_flat = np.array(weight[0:len(self.w1_flat)])
    self.b1_flat = np.array(
        weight[len(self.w1_flat):len(self.w1_flat) + len(self.b1_flat)])
    self.w2_flat = np.array(
        weight[len(self.w1_flat) + len(self.b1_flat):
               len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat)])
    self.b2_flat = np.array(
        weight[len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat):
               len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat) + len(self.b2_flat)])
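# A hedged refactoring sketch (not part of the original): the repeated slice
# arithmetic above can be centralised by splitting the flat solution vector at
# the cumulative section lengths. `unpack` is a hypothetical helper name.
import numpy as np


def unpack(solution, w1, b1, w2, b2):
    # Split a flat CMA-ES solution vector into the four parameter sections.
    split_points = np.cumsum([len(w1), len(b1), len(w2)])
    return [np.array(part) for part in np.split(np.asarray(solution), split_points)]

# Usage:
# self.w1_flat, self.b1_flat, self.w2_flat, self.b2_flat = unpack(
#     solution, self.w1_flat, self.b1_flat, self.w2_flat, self.b2_flat)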
def __init__(self):
    self.__environment = Environment()
    self.__board = Board()
    self.__drone = Drone(self.__environment)
import math  # needed for math.exp below (missing in the original)
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F

from environment import Environment

USE_CUDA = torch.cuda.is_available()
Variable = lambda *args, **kwargs: (
    autograd.Variable(*args, **kwargs).cuda()
    if USE_CUDA else autograd.Variable(*args, **kwargs))

env_name = 'BreakoutNoFrameskip-v4'
env = Environment(env_name, {}, atari_wrapper=True)

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500

epsilon_by_frame = lambda frame_idx: epsilon_final + \
    (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)


class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
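# A hedged sketch of the methods this buffer usually needs (the snippet ends
# after __init__, so push/sample here are assumptions following the standard
# DQN replay pattern, not this repo's actual code).
import random
from collections import deque

import numpy as np


class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Store one transition; the deque evicts the oldest entry when full.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a minibatch and unzip it into per-field tuples.
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.stack(states), actions, rewards, np.stack(next_states), dones

    def __len__(self):
        return len(self.buffer)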
            return
        if action is None:
            action = 0
        obs_, reward, done, _ = env.step(action)
        print('action:{}, reward:{}, done:{}'.format(action, reward, done))
        env.render()
        if save_or_not:
            s_list.append(obs)
            a_list.append(action)
            r_list.append(reward)
        time.sleep(0.1)


if __name__ == '__main__':
    game_name = 'Breakout-v0'
    env = Environment(game_name, atari_wrapper=True, test=0)
    print('We are playing {}'.format(game_name))
    print('-------- game information --------')
    print('observation space: ', end='')
    print(env.observation_space)
    print('action space: ', end='')
    print(env.action_space)
    try:
        print('action meanings: ', end='')
        print(env.unwrapped.get_action_meanings())
    except Exception:
        print("don't know the action meanings")
    check_input_range(env)
from pando.website import Website


env = Environment(
    ASPEN_CHANGES_RELOAD=is_yesish,
    ASPEN_PROJECT_ROOT=str,
    ASPEN_SHOW_TRACEBACKS=is_yesish,
    ASPEN_WWW_ROOT=str,
    AWS_ACCESS_KEY_ID=str,
    AWS_SECRET_ACCESS_KEY=str,
    DATABASE_URL=str,
    DATABASE_MAXCONN=int,
    CANONICAL_HOST=str,
    CANONICAL_SCHEME=str,
    COMPRESS_ASSETS=is_yesish,
    CSP_EXTRA=str,
    SENTRY_DSN=str,
    SENTRY_RERAISE=is_yesish,
    LOG_DIR=str,
    KEEP_PAYDAY_LOGS=is_yesish,
    LOGGING_LEVEL=str,
    CACHE_STATIC=is_yesish,
    CLEAN_ASSETS=is_yesish,
    RUN_CRON_JOBS=is_yesish,
    OVERRIDE_PAYDAY_CHECKS=is_yesish,
    OVERRIDE_QUERY_CACHE=is_yesish,
    GRATIPAY_BASE_URL=str,
    SECRET_FOR_GRATIPAY=str,
    INSTANCE_TYPE=str,
)

logging.basicConfig(level=getattr(logging, env.logging_level.upper()))
def run(args):
    if args.test:
        env = Environment('Pong-v0', args, test=True)
        from agent_dir.agent_pg import Agent_PG
        agent = Agent_PG(env, args)
        test(agent, env)
def main1(game, enemy, algorithm):
    # Setting up the game
    experiment_name = 'adrian-testing' + "-algorithm-" + str(algorithm)
    print(experiment_name + " game: " + str(game) + " " + "enemy: " + str(enemy))
    if not os.path.exists(experiment_name):
        os.makedirs(experiment_name)

    # Initialize the amount of neurons
    n_hidden_neurons = 10

    # initializes simulation in individual evolution mode, for single static enemy.
    env = Environment(
        experiment_name=experiment_name,
        enemies=[enemy],
        playermode="ai",
        player_controller=player_controller(n_hidden_neurons),
        enemymode="static",
        level=2,  # default environment fitness is assumed for experiment
        speed="fastest")

    env.state_to_log()  # checks environment state

    # Optimization for controller solution (best genotype = weights for the
    # phenotype network): Genetic Algorithm
    ini = time.time()  # sets time marker

    # genetic algorithm params
    run_mode = 'train'  # train or test

    # number of weights for multilayer with 10 hidden neurons
    n_vars = (env.get_num_sensors() + 1) * n_hidden_neurons + (n_hidden_neurons + 1) * 5

    # ------------ Setting up the GA ---------------
    # There are two main areas where change is possible, a) and b).
    # (New things could also be added, e.g. Doomsday...)

    # a) GA constants and parameters
    genome_length = n_vars  # length of the string to be optimized
    pop_size = 50
    p_crossover = 0.8
    p_mutation = 0.2
    # The bit-flip function iterates over every single value in an individual's
    # genome and decides with probability indpb whether to flip it. This value
    # is independent of the mutation probability, which decides IF a given
    # individual in the population is selected for mutation at all.
    mutation_scaler = genome_length
    max_generations = 15  # stopping condition
    tournament_size = 5  # tournament size
    seed = 50  # random.randint(1, 126)
    random.seed(seed)

    # For optimizing continuous functions
    bound_low = -1
    bound_up = 1
    crowding_factor = 20

    # Defining a tool to create a single gene
    toolbox = base.Toolbox()
    # toolbox.register("ZeroOrOne", random.randint, -1, 1)
    # Each gene is a float between -1 and 1
    toolbox.register("ZeroOrOne", random.uniform, -1, 1)

    # Defining the fitness
    # creator.create("FitnessMin", base.Fitness, weights=(-30.0, -30.0))
    # creator.create("FitnessMin", base.Fitness, weights=(-100,))
    creator.create("FitnessMin", base.Fitness, weights=(100, ))

    # Defining an individual creator: an individual is stored as a list with
    # its fitness evaluated at "FitnessMin"
    creator.create("Individual", list, fitness=creator.FitnessMin)
    # An individual consists of a list of n_vars attributes (genes) populated
    # by ZeroOrOne
    toolbox.register("individualCreator", tools.initRepeat, creator.Individual,
                     toolbox.ZeroOrOne, genome_length)

    # Defining the population creator
    toolbox.register("populationCreator", tools.initRepeat, list,
                     toolbox.individualCreator)

    # Defining the fitness function
    def evaluate(x):
        return np.array(list(map(lambda y: simulation(env, y), x))),

    toolbox.register("evaluate", evaluate)

    # ------------- b) Registering the EA operators --------------
    # 1. Standard operators
    toolbox.register("select", tools.selTournament, tournsize=tournament_size)
    # toolbox.register("mate", tools.cxSimulatedBinaryBounded, low=bound_low, up=bound_up, eta=crowding_factor)
    # toolbox.register("mutate", tools.mutPolynomialBounded, low=bound_low, up=bound_up, eta=crowding_factor, indpb=1.0/mutation_scaler)
    # toolbox.register("mate", tools.cxTwoPoint)
    # toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)

    # 2. Multiple operators
    # 2.1 All operators
    # 2.1.1 Crossover
    # Two-point:         toolbox.register("mate", tools.cxTwoPoint)
    # Partially matched: toolbox.register("mate", tools.cxPartialyMatched)
    # Uniform:           toolbox.register("mate", tools.cxUniform, indpb=0.05)
    # 2.1.2 Mutation
    # flip:        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    # shuffle:     toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
    # uniform int: toolbox.register("mutate", tools.mutUniformInt, low=-1, up=1, indpb=0.05)

    if algorithm == 1:
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    elif algorithm == 2:
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
    elif algorithm == 3:
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutUniformInt, low=-1, up=1, indpb=0.05)
    elif algorithm == 4:
        toolbox.register("mate", tools.cxBlend, alpha=0.05)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    elif algorithm == 5:
        toolbox.register("mate", tools.cxBlend, alpha=0.05)
        toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
    elif algorithm == 6:
        toolbox.register("mate", tools.cxBlend, alpha=0.05)
        toolbox.register("mutate", tools.mutUniformInt, low=-1, up=1, indpb=0.05)
    elif algorithm == 7:
        toolbox.register("mate", tools.cxUniform, indpb=0.05)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    elif algorithm == 8:
        toolbox.register("mate", tools.cxUniform, indpb=0.05)
        toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
    elif algorithm == 9:
        toolbox.register("mate", tools.cxUniform, indpb=0.05)
        toolbox.register("mutate", tools.mutUniformInt, low=-1, up=1, indpb=0.05)
    elif algorithm == 10:
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.01)
    elif algorithm == 11:
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.15)
    elif algorithm == 12:
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.3)
    else:
        print("why?")

    # ---------------- Setting up the game environment ----------------
    def simulation(env, x):
        f, p, e, t = env.play(pcont=x)
        return f, p

    # Plotting
    maxFitnessValues = []
    meanFitnessValues = []

    # Running the simulation
    def main(game, enemy):
        file_aux = open(experiment_name + '/results_enemy' +
                        str(enemy) + str(algorithm) + '.txt', 'a')
        file_aux.write(f'\ngame {game} \n')
        file_aux.write('gen, best, mean, std, median, q1, q3, life')
        file_aux.close()

        # Creating the population as a list object
        pop = toolbox.populationCreator(n=pop_size)
        pop_array = np.array(pop)
        generationCounter = 0
        print("Start of evolution")

        # Evaluating the whole population
        # fitnessValues = list(map(toolbox.evaluate, pop_array)) -> Won't work. Used Kamiel's
        fitnessValue = evaluate(pop_array)
        fitnessValue = fitnessValue[0].tolist()
        fitnesses = []
        lifes = []
        for value in fitnessValue:
            fitnesses.append(value[0])
            lifes.append(value[1])
        # Rewrites the fitness values in a way the DEAP algorithm can understand
        for count, individual in enumerate(fitnesses):
            fitnesses[count] = (-individual, )

        # Assigning the fitness value to each individual
        for individual, fitnessValue in zip(pop, fitnesses):
            individual.fitness.values = fitnessValue

        # Extract each fitness value
        fitnessValues = [individual.fitness.values[0] for individual in pop]

        # Saves the first generation
        fits = fitnessValues
        g = generationCounter
        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        std = abs(sum2 / length - abs(mean)**2)**0.5
        q1 = np.percentile(fits, 25)
        median = np.percentile(fits, 50)
        q3 = np.percentile(fits, 75)
        max_life = max(lifes)

        file_aux = open(experiment_name + '/results_enemy' + str(enemy) + 'Tournement.txt', 'a')
        file_aux.write(
            f'\n{str(g)}, {str(round(max(fits)))}, {str(round(mean,6))}, {str(round(std,6))}, '
            f'{str(round(median,6))}, {str(round(q1,6))}, {str(round(q3,6))}, {str(round(max_life,6))}')
        file_aux.close()

        # Begin the genetic loop; first, the stopping condition.
        # (Uses fits, which is refreshed every generation; the original tested
        # the stale fitnessValues list, which never changed inside the loop.)
        while max(fits) < 100 and generationCounter < max_generations:
            begin_time = datetime.datetime.now()
            print("Begin evolution time:", begin_time, "!!!")
            # Update generation counter
            generationCounter = generationCounter + 1
            print("-- Generation %i --" % generationCounter)

            # Begin genetic operators
            # 1. Selection: the tournament is already registered above, so we
            #    only need to select from the population. Selected individuals
            #    are returned in a list.
            print("selection...")
            offspring = toolbox.select(pop, len(pop))
            for i in offspring:
                print(i.fitness.values[0])
            # Clone the selected individuals so the next genetic operators can
            # be applied without affecting the original population
            offspring = list(map(toolbox.clone, offspring))
            print("done")

            # 2. Crossover. Note that the mate function takes two individuals
            #    as arguments and modifies them in place, so they don't need
            #    to be reassigned.
            print("Crossover...")
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < p_crossover:
                    toolbox.mate(child1, child2)
                    del child1.fitness.values
                    del child2.fitness.values
            print("done")

            # 3. Mutation
            print("Mutation...")
            for mutant in offspring:
                # if random.random() < p_mutation:
                if random.random() < (1 - (generationCounter / max_generations)):
                    toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Individuals that weren't mutated remain intact; their fitness
            # values don't need to be recalculated. The rest have an EMPTY
            # fitness; we now find those individuals and calculate their new
            # fitness.
            print("...re-evaluating fitness...")
            freshIndividuals = [ind for ind in offspring if not ind.fitness.valid]
            # Eval did not work :(( used Kamiel's
            # freshFitnessValues = list(map(toolbox.evaluate, freshIndividuals))
            # for individual, fitnessValue in zip(freshIndividuals, freshFitnessValues):
            #     individual.fitness.values = fitnessValue
            pop_array = np.array(freshIndividuals)
            values = evaluate(pop_array)
            values = values[0].tolist()
            fitnesses = []
            for value in values:
                fitnesses.append(value[0])
                lifes.append(value[1])
            for count, individual in enumerate(fitnesses):
                # note: unlike the initial evaluation above, the value is not
                # negated here -- kept as in the original
                fitnesses[count] = (individual, )
            for ind, fit in zip(freshIndividuals, fitnesses):
                ind.fitness.values = fit
            print("done")

            # Swaps the best individuals of the population in for the worst
            # individuals of the offspring
            amount_switched_individuals = int(len(pop) / 10)
            worst_offspring = deap.tools.selWorst(offspring, amount_switched_individuals,
                                                  fit_attr='fitness')
            best_gen = deap.tools.selBest(pop, amount_switched_individuals,
                                          fit_attr='fitness')
            for count, individual in enumerate(worst_offspring):
                index = offspring.index(individual)
                offspring[index] = best_gen[count]

            # End of the process -> replace the old population with the new one
            pop[:] = offspring
            print(f"There are {len(pop)} individuals in the population ")

            # Gather all the fitnesses in one list and print the stats
            fits = [ind.fitness.values[0] for ind in pop]
            length = len(pop)
            mean = sum(fits) / length
            sum2 = sum(x * x for x in fits)
            std = abs(sum2 / length - abs(mean)**2)**0.5
            q1 = np.percentile(fits, 25)
            median = np.percentile(fits, 50)
            q3 = np.percentile(fits, 75)
            max_life = max(lifes)

            # For plotting
            maxFitness = max(fits)
            meanFitness = sum(fits) / len(pop)
            maxFitnessValues.append(maxFitness)
            meanFitnessValues.append(meanFitness)

            print("  Min %s" % min(fits))
            print("  Max %s" % max(fits))
            print("  Avg %s" % mean)
            print("  Std %s" % std)

            # Plot
            plt.plot(maxFitnessValues)
            plt.plot(meanFitnessValues)
            plt.ylabel("Values")
            plt.xlabel("Generations")
            plt.title(experiment_name + " game: " + str(game) + " " + "enemy: " + str(enemy))
            plt.savefig("adrian-testing" + "-algorithm-" + str(algorithm) +
                        "-enemy-" + str(enemy) + ".png")
            plt.show()

            # DataFrame
            df.loc[int(algorithm)] = [min(fits), max(fits), mean, std]
            df.to_csv("adrian-testing" + "-algorithm-" + str(algorithm) +
                      "-enemy-" + str(enemy) + ".csv", index=False)

            print("Saving...")
            # saves results for this generation
            file_aux = open(experiment_name + '/results_enemy' +
                            str(enemy) + 'Tournement.txt', 'a')
            file_aux.write(
                f'\n{str(g)}, {str(round(max(fits),6))}, {str(round(mean,6))}, {str(round(std,6))}, '
                f'{str(round(median,6))}, {str(round(q1,6))}, {str(round(q3,6))}, {str(round(max_life,6))}')
            file_aux.close()
            print("Evolution ended in:", datetime.datetime.now() - begin_time)

        print("-- End of (successful) evolution --")
        best_ind = tools.selBest(pop, 1)[0]
        print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))
        np.savetxt(experiment_name + '/best_game_' + str(game) +
                   ',enemy_' + str(enemy) + 'Tournement.txt', best_ind)
        print("Done. New generation...")

    main(game, enemy)
    plt.show()
    print("Run ended in:", datetime.datetime.now() - begin_game)
def bind(self, instance):
    environment = Environment(self.closure)
    environment.define('this', instance)
    return LoxFunction(self.declaration, environment, self.is_initializer)
            # # print(real_speed)
            # if position < self.goal < (position + real_speed):
            #     self.final_action = np.abs(self.goal - position) / real_speed
            # elif position > self.goal > (position + self.speed):
            #     self.final_action = np.abs(self.goal - position) / real_speed
            # else:
            #     print("Stable position")
            # self.position = position
            # print(f"Position: {self.position}, Goal: {self.goal}, delta: {self.delta}, "
            #       f"speed: {real_speed}, direction: {direction}")
        else:
            self.position = position

        if self.final_action is None:
            self.env.step(direction, False)
        else:
            print("FINAL")
            self.env.step(self.final_action, True)
            self.finished = True


if __name__ == "__main__":
    solver = Solver(Environment(4))
    solver.env.reset()
    for i in range(100):
        solver.zero(time.time(), solver.env.state[0])
        if solver.finished:
            break
    print(solver.env.state)
def main():
    trial_len = 1030
    env = Environment(100000, 1, trial_len, stock1, stock2)
    trials = 100
    action_info = {
        's1_buys_per_trial': [],
        's1_sells_per_trial': [],
        's2_buys_per_trial': [],
        's2_sells_per_trial': [],
        'holds_per_trial': [],
        'illegal_action_trial': [],
        'profits_per_trial': [],
        'ranges_per_trial': [],
        'good_profits_and_range': []
    }
    dqn_agent = DQNAgent(env, stock1.name, stock2.name)
    menu_option = input(
        "Press 1 to load a model from a filepath. Press any other key to start a new model: ")
    if menu_option == "1":
        dqn_agent.load_model()
    steps = []
    for trial in range(trials):
        print('Trial ', trial)
        cur_state = env.state
        step_count = 0
        start_funds = env.get_funds()
        action = ''
        stock1_buys = 0
        stock1_sells = 0
        stock2_buys = 0
        stock2_sells = 0
        holds = 0
        illegal_action = False
        returns = []
        for step in range(trial_len):
            action_num = dqn_agent.act(cur_state)
            action, stock = None, None
            # Get action from Deep Q Net output
            if action_num == 0:
                action, stock = 'BUY', stock1.name
                stock1_buys += 1
            elif action_num == 1:
                action, stock = 'SELL', stock1.name
                stock1_sells += 1
            elif action_num == 2:
                action, stock = 'BUY', stock2.name
                stock2_buys += 1
            elif action_num == 3:
                action, stock = 'SELL', stock2.name
                stock2_sells += 1
            elif action_num == 4:
                action, stock = 'HOLD', ''
                holds += 1
            else:
                action, stock = None, None
            prev_funds = env.get_funds()
            print('Step {}:'.format(step))
            print('  Action: ', action)
            print('  Stock: ', stock)
            new_state, reward, illegal_action = env.step(action, stock, 1)
            reward = reward if not illegal_action else -10000
            new_funds = env.get_funds()
            returns.append(new_funds - prev_funds)
            print('  Reward: ', reward)
            dqn_agent.remember(cur_state, action_num, reward, new_state, illegal_action)
            dqn_agent.replay()
            dqn_agent.target_train()
            cur_state = new_state
            step_count += 1
            if illegal_action:
                print('Illegal action taken, starting new trial')
                break
        profit = start_funds - env.get_funds()
        df_range = (env.init_day_index, env.init_day_index + trial_len)
        print('Profit: ', start_funds - env.get_funds())
        if profit >= 5000.00:
            action_info['good_profits_and_range'].append((df_range, returns))
            print(action_info['good_profits_and_range'])
        action_info['profits_per_trial'].append(profit)
        action_info['s1_buys_per_trial'].append(stock1_buys)
        action_info['s1_sells_per_trial'].append(stock1_sells)
        action_info['s2_buys_per_trial'].append(stock2_buys)
        action_info['s2_sells_per_trial'].append(stock2_sells)
        action_info['holds_per_trial'].append(holds)
        action_info['illegal_action_trial'].append(illegal_action)
        action_info['ranges_per_trial'].append(
            (env.init_day_index, env.init_day_index + trial_len))
        n = random.randint(0, len(stock1) - trial_len)
        env = Environment(100000, 1, trial_len, stock1, stock2)
    print("Average Profit: ",
          sum(action_info['profits_per_trial']) / len(action_info['profits_per_trial']))
    data_file_name = input(
        'Please type the name of the file you would like to save the action info to: ')
    menu_option2 = input("Press 0 to quit, press 1 to save the model to a location: ")
    if menu_option2 == "1":
        fp = input("Enter the filepath to save this model to: ")
        dqn_agent.custom_save_model(fp)
    action_info_df = pd.DataFrame(action_info)
    action_info_df.to_csv(data_file_name)
def deap_generalist_twopoint(experiment_name, enemies_in_group, iteration_number):
    if not os.path.exists(experiment_name):
        os.makedirs(experiment_name)
    if os.path.exists(experiment_name + '/results.csv'):
        os.remove(experiment_name + '/results.csv')
    if os.path.exists(experiment_name + '/best.txt'):
        os.remove(experiment_name + '/best.txt')

    n_hidden_neurons = 10

    # initializes simulation in multiple-enemy mode, for a group of static enemies.
    env = Environment(
        experiment_name=experiment_name,
        enemies=enemies_in_group,
        multiplemode="yes",
        playermode="ai",
        player_controller=player_controller(n_hidden_neurons),
        enemymode="static",
        level=2,
        speed="fastest",
    )

    # GLOBAL VARIABLES
    POP_SIZE = 50  # Population size
    GENS = 10      # Number of generations
    MUTPB = 0.2    # Probability of mutating an individual
    CXPB = 0.5     # Probability of mating two individuals (assumed value; the
                   # original snippet uses CXPB below without defining it)

    toolbox = base.Toolbox()
    n_vars = (env.get_num_sensors() + 1) * n_hidden_neurons + (n_hidden_neurons + 1) * 5

    # DATA
    genlist = []
    bestlist = []
    meanlist = []
    stdlist = []

    def evaluate(pop):
        """
        Start a game with each individual from the population and assign the
        resulting fitness.

        Args:
            pop (list of np.ndarray of floats between -1 and 1): the population
        """
        for ind in pop:
            # env.play returns fitness, self.player.life, self.enemy.life, self.time
            f, p, e, t = env.play(pcont=ind)
            fitness = p - e
            ind.fitness.values = [fitness]

    def setup_DEAP():
        """
        Set up the DEAP environment to our liking.

        creator.create is used to create a class under a certain name.
        toolbox.register is used to register a function under a certain name
        which can be called later.

        For the examples used and the DEAP documentation, see:
        # https://deap.readthedocs.io/en/master/
        # https://deap.readthedocs.io/en/master/examples/ga_onemax.html
        """
        # this tells DEAP that the fitness should be as high as possible (therefore Max)
        creator.create("FitnessMax", base.Fitness, weights=(1.0, ))
        # an individual is a np.ndarray filled with random floats which are the inputs of the game
        creator.create("Individual", np.ndarray, fitness=creator.FitnessMax)
        toolbox.register("attr_float", random.uniform, -1, 1)
        # registers function to create an individual;
        # n_vars is the number of floats in the individual
        toolbox.register("individual", tools.initRepeat, creator.Individual,
                         toolbox.attr_float, n_vars)
        # registers function to create the population of individuals
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        # registers function which links to our evaluate function
        toolbox.register("evaluate", evaluate)
        # registers crossover function: we use two-point crossover
        toolbox.register("mate", tools.cxTwoPoint)
        # registers mutation function: we use shuffle indexes
        # (because it already changes 2 values)
        toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.1)
        # registers selection function: tournament selection of size 2
        toolbox.register("select", tools.selTournament, tournsize=2)
        # registers survival selection function
        toolbox.register("survive", tools.selRandom)

    def mutation(offspring):
        """
        'Mutation is applied to the offspring delivered by crossover.'

        Args:
            offspring (list of individuals): Selected offspring from the population
        """
        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)

    def crossover_and_mutation(offspring):
        """
        'In evolutionary computing, the combination of features from two
        individuals in offspring is often called crossover (or recombination).'
        We currently use two-point crossover.

        Args:
            offspring (list of individuals): Selected offspring from the population
        """
        children = []
        for parent1, parent2 in zip(offspring[::2], offspring[1::2]):  # NOT USED.
            if random.random() < CXPB:
                child1 = toolbox.clone(parent1)
                child2 = toolbox.clone(parent2)
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values
                children.extend((child1, child2))
        # apply mutation to children
        mutation(children)
        # add children to population
        offspring.extend(child for child in children)

    def configure_results(pop, generation, ultimate_best):
        fits = [ind.fitness.values[0] for ind in pop]
        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        std = abs(sum2 / length - mean**2)**0.5
        max_fitness = max(fits)

        print("  Min %s" % min(fits))
        print("  Max %s" % max_fitness)
        print("  Avg %s" % mean)
        print("  Std %s" % std)

        # 7.
        best = fits.index(max_fitness)
        if max_fitness > ultimate_best:
            print("ultimate best")
            ultimate_best = max_fitness  # fixed: the original stored the index here
            np.savetxt(experiment_name + "/best.txt", pop[best])
        if max_fitness > winner["fitness"]:
            print("WINNER")
            winner["solution"] = pop[best]
            winner["fitness"] = max_fitness

        genlist.append(generation)
        bestlist.append(round(max_fitness, 6))
        meanlist.append(round(mean, 6))
        stdlist.append(round(std, 6))

        # save result of each generation
        # file_aux = open(experiment_name+'/results.txt','a')
        # file_aux.write('\n\ngen best mean std')
        # file_aux.write('\n'+str(generation)+' '+str(round(max_fitness,6))+' '+str(round(mean,6))+' '+str(round(std,6)))
        # file_aux.close()
        return fits, ultimate_best

    def evolution(pop, ultimate_best):
        """
        Evolution steps:
        1. Select the next generation of individuals from the population
        2. Clone the selection so DEAP treats it as a new generation
        3. Apply crossover on the offspring
        4. Apply mutation on the offspring
        5. Evaluate individuals that were changed by crossover or mutation
        6. Apply survivor selection by picking from the group
        7. Show statistics of the population's fitness and save the best
           individual of that run
        8. Update environment solutions

        Args:
            pop (list): A list containing individuals
        """
        current_g = 0
        while current_g < GENS:
            current_g = current_g + 1
            print("-- Generation %i --" % current_g)
            # 1.
            selected = toolbox.select(pop, len(pop))
            # 2.
            offspring = list(map(toolbox.clone, selected))
            shuffle(offspring)
            # 3. and 4.
            crossover_and_mutation(offspring)
            # 5.
            changed_individuals = [ind for ind in offspring if not ind.fitness.valid]
            toolbox.evaluate(changed_individuals)
            # 6.
            survivors = toolbox.survive(offspring, POP_SIZE)
            # Replace old population by offspring
            pop[:] = survivors
            # 7.
            fits, ultimate_best = configure_results(pop, current_g, ultimate_best)
            print(fits)
            # 8.
            solutions = [pop, fits]
            env.update_solutions(solutions)
            env.save_state()

    def main(iteration_number):
        """
        This is the start of the program.

        Program steps:
        1. Set up the DEAP environment
        2. Initialize the population of individuals
        3. Evaluate the population by playing the game and assigning fitness levels
        4. Show and save results for that population
        5. Start evolution
        """
        # 1.
        setup_DEAP()
        # 2.
        print("-- Form Population --")
        random.seed(2)  # starts with the same population
        pop = toolbox.population(n=POP_SIZE)
        random.seed(iteration_number)
        print(iteration_number)
        # 3.
        toolbox.evaluate(pop)
        ultimate_best = -200
        # 4.
        fits, ultimate_best = configure_results(pop, 0, ultimate_best)
        # 5.
        evolution(pop, ultimate_best)

        # Print results to csv
        print("PRINT TO CSV")
        with open(experiment_name + '/results.csv', 'w+', newline='') as csvfile:
            filewriter = csv.writer(csvfile, delimiter=',')
            # header extended with "std" to match the four columns written below
            filewriter.writerow(["generation", "best", "mean", "std"])
            for i in range(len(bestlist)):
                filewriter.writerow([genlist[i], bestlist[i], meanlist[i], stdlist[i]])

    main(iteration_number)
from environment import Environment
from robot import searchAStar
import random
import queue as q

Env1 = Environment((10, 10), .5, 1)
print("World Map:\n", Env1.envMatrix)
Env1.robots[0].updateMap(Env1.robotsLocation[0], Env1.envMatrix)
# print("Robot World Location: ", Env1.robotsLocation[0])
# print("Robot Relative Location: ", Env1.robots[0].location)
# print("Robot Local Map:\n", Env1.robots[0].localMap)

stuck = False
while not stuck:
    Env1.robots[0].updateMap(Env1.robotsLocation[0], Env1.envMatrix)
    if Env1.robots[0].currentPath.empty():
        # input("IN NEW PATH SEARCH...")
        solution = Env1.robots[0].getClosestUnknown()
        if solution is None:
            stuck = True
        else:
            for i in solution:
                Env1.robots[0].currentPath.put(i)
    # iBump = 0
    # jBump = 0
    while (Env1.robots[0].localMap[Env1.robots[0].goal[0]][Env1.robots[0].goal[1]] == 3) \
            and (not Env1.robots[0].currentPath.empty()) and not stuck:
        print("Robot World Location: ", Env1.robotsLocation[0])
        print("Robot Relative Location: ", Env1.robots[0].location)
        print("Robot Local Map:\n", Env1.robots[0].localMap)
def run():
    """
    Driving function for running the simulation.
    Press ESC to close the simulation, or [SPACE] to pause the simulation.
    """
    # constant = 0.9957
    # alpha = 0.2
    tolerance = 0.01
    for constant in [0.0078, 0.0052, 0.0039, 0.0031, 0.0026, 0.0022, 0.0019, 0.0017]:
        for alpha in [0.2, 0.5, 0.8]:
            good_counter = 0
            for n in range(20):
                ##############
                # Create the environment
                # Flags:
                #   verbose     - set to True to display additional output from the simulation
                #   num_dummies - discrete number of dummy agents in the environment, default is 100
                #   grid_size   - discrete number of intersections (columns, rows), default is (8, 6)
                env = Environment(verbose=True)

                ##############
                # Create the driving agent
                # Flags:
                #   learning  - set to True to force the driving agent to use Q-learning
                #   * epsilon - continuous value for the exploration factor, default is 1
                #   * alpha   - continuous value for the learning rate, default is 0.5
                agent = env.create_agent(LearningAgent, learning=True,
                                         alpha=alpha, constant=constant)

                ##############
                # Follow the driving agent
                # Flags:
                #   enforce_deadline - set to True to enforce a deadline metric
                env.set_primary_agent(agent, enforce_deadline=True)

                ##############
                # Create the simulation
                # Flags:
                #   update_delay - continuous time (in seconds) between actions, default is 2.0 seconds
                #   display      - set to False to disable the GUI if PyGame is enabled
                #   log_metrics  - set to True to log trial and simulation results to /logs
                #   optimized    - set to True to change the default log file name
                sim = Simulator(env, update_delay=0, log_metrics=True,
                                display=False, optimized=True)

                ##############
                # Run the simulator
                # Flags:
                #   tolerance - epsilon tolerance before beginning testing, default is 0.05
                #   n_test    - discrete number of testing trials to perform, default is 0
                sim.run(n_test=100, tolerance=tolerance)

                safety_rating, reliability_rating = plot_trials('sim_improved-learning.csv')
                if safety_rating in ['A+', 'A'] and reliability_rating in ['A', 'A+']:
                    good_counter += 1
                else:
                    break
            f = open('result.txt', 'a+')
            f.write('{}, {}, {}, {}\n'.format(constant, alpha, agent.counter, good_counter))
            f.close()
def main():
    """Main function."""
    log_levels = {
        u"NOTSET": logging.NOTSET,
        u"DEBUG": logging.DEBUG,
        u"INFO": logging.INFO,
        u"WARNING": logging.WARNING,
        u"ERROR": logging.ERROR,
        u"CRITICAL": logging.CRITICAL
    }

    args = parse_args()
    logging.basicConfig(format=u"%(asctime)s: %(levelname)s: %(message)s",
                        datefmt=u"%Y/%m/%d %H:%M:%S",
                        level=log_levels[args.logging])

    logging.info(u"Application started.")

    try:
        spec = Specification(args.specification)
        spec.read_specification()
    except PresentationError as err:
        logging.critical(u"Finished with error.")
        logging.critical(repr(err))
        return 1

    if spec.output[u"output"] not in OUTPUTS:
        logging.critical(f"The output {spec.output[u'output']} is not supported.")
        return 1

    return_code = 1
    try:
        env = Environment(spec.environment, args.force)
        env.set_environment()

        prepare_static_content(spec)

        data = InputData(spec, spec.output[u"output"])
        if args.input_file:
            data.process_local_file(args.input_file)
        elif args.input_directory:
            data.process_local_directory(args.input_directory)
        else:
            data.download_and_parse_data(repeat=1)

        if args.print_all_oper_data:
            data.print_all_oper_data()

        generate_tables(spec, data)
        generate_plots(spec, data)
        generate_files(spec, data)

        if spec.output[u"output"] == u"report":
            generate_report(args.release, spec, args.week)
        elif spec.output[u"output"] == u"trending":
            sys.stdout.write(generate_cpta(spec, data))
            try:
                alert = Alerting(spec)
                alert.generate_alerts()
            except AlertingError as err:
                logging.warning(repr(err))
        elif spec.output[u"output"] == u"convert-xml-to-json":
            convert_xml_to_json(spec, data)
        else:
            logging.info("No output will be generated.")

        logging.info(u"Successfully finished.")
        return_code = 0
    except AlertingError as err:
        logging.critical(f"Finished with an alerting error.\n{repr(err)}")
    except PresentationError as err:
        logging.critical(f"Finished with a PAL error.\n{str(err)}")
    except (KeyError, ValueError) as err:
        logging.critical(f"Finished with an error.\n{repr(err)}")
    finally:
        if spec is not None:
            clean_environment(spec.environment)
    return return_code
def run(restore, q_manual_init=False, LfD=False):
    env = Environment()
    agt = env.create_agent(LearningAgent, test=True)  # create agent
    env.set_agent(agt, enforce_deadline=False)  # specify agent to track
    # NOTE: You can set enforce_deadline=False while debugging to allow longer trials
    n_trials = 10000000
    quit = False
    parent_path = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(parent_path, 'q_table')
    lfd_path = os.path.join(parent_path, 'LfD')
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    files_lst = os.listdir(data_path)
    max_index = 0
    filepath = ''
    for filename in files_lst:
        fileindex_list = re.findall(r'\d+', filename)
        if not fileindex_list:
            continue
        fileindex = int(fileindex_list[0])
        if fileindex >= max_index:
            max_index = fileindex
            filepath = os.path.join(data_path, filename)
    if restore:
        if os.path.exists(filepath):
            print 'restoring Q_values from {} ...'.format(filepath)
            agt.set_q_tables(filepath)
            print 'restoring done...'
    if LfD:
        print 'initializing Q_values from LfD (Learning from Demonstration)...'
        agt.q_table_LfD(lfd_path)
    for trial in xrange(max_index + 1, n_trials):
        print "Simulator.run(): Trial {}".format(trial)  # [debug]
        if not agt.test:
            if 10000 < trial < 30000:
                agt.epsilon = 0.3
            elif 30000 < trial < 50000:
                agt.epsilon = 0.2
            elif 50000 < trial < 70000:
                agt.epsilon = 0.1
            elif trial > 70000:
                agt.epsilon = 0.05
        env.reset()
        print 'epsilon:', agt.epsilon
        while True:
            try:
                env.step()
            except KeyboardInterrupt:
                quit = True
            finally:
                if quit or env.done:
                    break
        env.set_agent_velocity(np.zeros(2))
        if not agt.test:
            if trial % 50 == 0:
                print "Trial {} done, saving Q table...".format(trial)
                q_table_file = os.path.join(
                    data_path, 'trial' + str('{:07d}'.format(trial)) + '.cpickle')
                with open(q_table_file, 'wb') as f:
                    cPickle.dump(agt.Q_values, f, protocol=cPickle.HIGHEST_PROTOCOL)
        if quit:
            break
    print 'successful trials: ', env.succ_times
    print 'number of trials that hit the hard time limit: ', env.num_hit_time_limit
    print 'number of trials that ran out of time: ', env.num_out_of_time
    print 'number of trials that hit cars: ', env.hit_car_times
    print 'number of trials that hit walls: ', env.hit_wall_times
from pybrain3.rl.agents.learning import LearningAgent
from pybrain3.rl.learners.valuebased import ActionValueNetwork  # assumed import; missing in the original
from pybrain3.rl.learners.valuebased.nfq import NFQ
import pandas as pd

from environment import Environment  # assumed local module, as in the other snippets

path = '/Users/arammoghaddassi/Google Drive/Projects/RL-Automated-Trading/data/'
aapl = pd.read_csv(path + 'AAPL.csv')
amzn = pd.read_csv(path + 'AMZN.csv')

# Model switches
n_episodes = 10
episode_length = 30

controller = ActionValueNetwork(dimState=1, numActions=3)  # Maps states to actions.
learner = NFQ()  # Does the actual learning; updates values in the action value network.
env, agent = Environment(aapl), LearningAgent(controller, learner=learner)
agent.reset()
env.reset()

for ep in range(n_episodes):
    agent.newEpisode()
    for i in range(episode_length):
        state = env.state()
        agent.integrateObservation(state)
        action = agent.getAction()  # Causing an error right now.
        state, reward = env.step(action)
        agent.giveReward(reward)
        print("Episode: {}, Trial: {}, Balance: {}".format(ep, i, env.account_value()))
    agent.learn()  # When/how should I actually call this method?
# Deep Convolutional Neural Network - CNN
# Reinforcement Learning - trial and error
from environment import Environment
from train import Trainer
from dqn import DQN

# initialize gym environment and dqn
env = Environment(args)
agent = DQN(env, args)

# train agent
Trainer(agent).run()

# play the game
env.gym.monitor.start(args.out, force=True)
agent.play()
env.gym.monitor.close()
        if args.decay_method == "adaptive":
            if iteration % 10 == 0:
                if recent_total_reward < last_reward:
                    print "Policy is not improving. Decrease KL and increase steps."
                    if args.max_kl > 0.001:
                        args.max_kl -= args.kl_adapt
                else:
                    print "Policy is improving. Increase KL and decrease steps."
                    if args.max_kl < 0.01:
                        args.max_kl += args.kl_adapt
                last_reward = recent_total_reward
                recent_total_reward = 0

        if args.decay_method == "linear":
            if args.max_kl > 0.001:
                args.max_kl -= args.kl_adapt

        if args.decay_method == "exponential":
            if args.max_kl > 0.001:
                args.max_kl *= args.kl_adapt

        rollouts.set_policy_weights(theta)
else:
    from agent.agent_continous import TRPOAgent
    from environment import Environment

    env = Environment(gym.make(pms.environment_name))
    agent = TRPOAgent(env)
    agent.test(pms.checkpoint_file)
rollouts.end()
import q_learning as q
import value_iteration as vi

# generate sample environment
from common_utils import *
from environment import Environment

environment = Environment(n_states=5, n_actions=2, n_episodes=10000)

print('Q-learning: ')
policy, value = q.q_learning(environment)
print(value)
print(policy)

print('VI: ')
policy, value = vi.value_iteration(environment)
print(value)
print(policy)
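# A minimal sketch of the tabular update that a q_learning implementation like
# the one imported above typically performs (an assumption -- the module's
# internals are not shown here): Q[s, a] += alpha * (r + gamma * max Q[s'] - Q[s, a]).
import numpy as np


def q_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99):
    # One temporal-difference backup on a Q-table of shape (n_states, n_actions).
    Q[s, a] += alpha * (r + gamma * np.max(Q[s_next]) - Q[s, a])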
# this is configured for multi-enemy mode, since the intent of training this
# neural network is for it to learn to defeat all 8 enemy types
#
# enemies - the list indicating which enemies will be faced; there are 8
# different enemy types, and each phenotype (neural network) is evaluated
# against the 8 enemies, once against each enemy
#
# playermode - indicates the control mode for the player; here "ai" means the
# player is controlled by an artificial intelligence
#
# the player_controller class is passed in to tell the environment how the
# player will be controlled
#
# speed - sets the game speed; "fastest" runs the game without a frame-rate
# cap, speeding up the training process. It can also be set to "normal"
#
# randomini - the attribute used to configure the enemy's spawn position on
# the map; here it is set so the enemy spawns at random parts of the scenarios
env = Environment(multiplemode='no',
                  enemies=[3],
                  playermode="ai",
                  player_controller=player_controller(),
                  speed='fastest',
                  randomini='yes')


# Runs the simulation and returns the fitness value of the phenotype
# (neural network) being evaluated
def simula(env, x):
    # f = fitness result
    # p = player life result
    # e = enemy life result
    # t = time result
    f, p, e, t = env.play(pcont=x)
    return f


def eval_genomes(genomes, config):
    for genome_id, genome in genomes:
        net = neat.nn.RecurrentNetwork.create(genome, config)
        genome.fitness = simula(env, net)
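# A hedged sketch (not part of the original snippet) of how eval_genomes is
# usually wired into neat-python; 'neat_config.txt' is a placeholder path.
config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation,
                     'neat_config.txt')
pop = neat.Population(config)
winner = pop.run(eval_genomes, 50)  # evolve for up to 50 generations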
start_epsilon = 1
end_epsilon = 0.05
discount = 0.99
learning_rate = 0.0001
BATCH_SIZE = 256
TARGET_UPDATE = 100

Transition = namedtuple('Transition',
                        ('state', 'next_state', 'action', 'reward', 'done'))

resize = T.Compose([
    T.ToPILImage(),
    T.Resize((120, 160), interpolation=Image.CUBIC),
    T.ToTensor()
])

env = Environment(map_name='loop_empty').create_env()
env = DiscreteWrapper(env)
env = DtRewardWrapper(env)
env.reset()

init_screen = get_screen()
_, screen_height, screen_width = init_screen.shape
n_actions = env.action_space.n

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
policy_net.apply(weights_init)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.apply(weights_init)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
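# A hedged sketch (not in the original) of the epsilon-greedy action selection
# that typically accompanies this policy_net/target_net setup; it reuses
# start_epsilon, end_epsilon, n_actions, and device from above, while
# `steps_done` and the linear decay horizon EPS_DECAY_STEPS are assumptions.
import random

import torch

steps_done = 0
EPS_DECAY_STEPS = 10000  # assumed decay horizon


def select_action(state):
    """Pick a random action with probability epsilon, else the greedy action."""
    global steps_done
    frac = min(1.0, steps_done / EPS_DECAY_STEPS)
    eps = start_epsilon + frac * (end_epsilon - start_epsilon)
    steps_done += 1
    if random.random() < eps:
        return torch.tensor([[random.randrange(n_actions)]],
                            device=device, dtype=torch.long)
    with torch.no_grad():
        # policy_net outputs Q-values; act greedily on them.
        return policy_net(state).max(1)[1].view(1, 1)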
import numpy as np
import matplotlib.pyplot as plt
import sys

sys.path.insert(0, 'evoman')
from environment import Environment
from demo_controller import player_controller, enemy_controller
from random import sample

MUTATION_RATE = 0.3

ENV = Environment(experiment_name="test",
                  enemies=[7],
                  playermode="ai",
                  player_controller=player_controller(),
                  enemy_controller=enemy_controller(),
                  level=2,
                  speed="fastest",
                  contacthurt='player',
                  logs='off')


class Individual:
    dom_u = 1
    dom_l = -1
    n_hidden = 10
    # multilayer network with 10 hidden neurons
    n_vars = (ENV.get_num_sensors() + 1) * n_hidden + (n_hidden + 1) * 5

    def __init__(self):
        self.age = 0
        self.weights = list()