class AI:
    def __init__(self, fname):
        lr = 0.0005
        self.agent = Agent(gamma=0.99, epsilon=0.0, alpha=lr, input_dims=6,
                           n_actions=2, mem_size=60000, batch_size=64,
                           epsilon_end=0.0, fname=fname)
        self.observation = []
        self.action = 0
        self.n_step = 0
        self.fname = fname.split("/")[-1]

    def episode_start(self, observation):
        self.observation = observation

    def choose_action(self):
        self.action = self.agent.choose_action(self.observation)
        return self.action

    def step(self, observation_, reward, done):
        self.agent.remember(self.observation, self.action, reward,
                            observation_, int(done))
        self.observation = observation_
        # train only every third step to cut down on learning cost
        if self.n_step % 3 == 0:
            self.agent.learn()
        self.n_step += 1

    def episode_end(self):
        self.agent.save_model()
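# A minimal driver sketch for the AI wrapper above. The `env` here stands in
# for a hypothetical Gym-style environment whose step() returns
# (observation, reward, done); the path and env are assumptions for
# illustration, not part of the project.
#
# ai = AI('saved_models/dq_model.h5')   # illustrative path
# observation = env.reset()
# ai.episode_start(observation)
# done = False
# while not done:
#     action = ai.choose_action()
#     observation_, reward, done = env.step(action)  # assumed env API
#     ai.step(observation_, reward, done)
# ai.episode_end()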
def main():
    gym_env = gym.make('custom_gym:Xplane-v0')
    lr = 0.001
    gam = 0.01
    n_games = 1
    # nn_input = obs()
    agent = Agent(learning_rate=lr, gamma=gam, epsilon=1.0, input_dims=(6,),
                  n_actions=15, batch_size=32,
                  file_name='AI_takeoff/saved_models/dq_model_2.h5')
    scores = []
    total_steps = []
    eps_hist = []
    agent.load_model()

    for i in range(n_games):
        try:
            done = False
            score = 0
            observation = gym_env.reset()
            time.sleep(2)
            observation_checkpoints = np.array([observation[0:2]])
            step_counter = 0
            print("GAME ITERATION ", i)
            while not done:
                action = agent.choose_action(observation)
                new_observation, reward, done = gym_env.step(action)
                step_counter += 1
                score += reward
                agent.store_transition(observation, action, reward,
                                       new_observation, done)
                observation = new_observation
                # agent.learn()

                # Check whether the airplane is stuck: if its position has
                # not changed over the last 30 steps, end the episode.
                observation_checkpoints = np.append(
                    observation_checkpoints, [new_observation[0:2]], axis=0)
                print(observation_checkpoints)
                print("stepcounter is", step_counter)
                if step_counter % 30 == 0:
                    if np.array_equal(
                            observation_checkpoints[step_counter - 30],
                            observation_checkpoints[step_counter - 1]):
                        done = True

            eps_hist.append(agent.epsilon)
            scores.append(score)
            total_steps.append(step_counter)
        except Exception as e:
            print(str(e))
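# The stall check above compares position checkpoints taken 30 steps apart.
# A minimal standalone sketch of that heuristic (function and argument names
# are hypothetical, not from the project):
import numpy as np

def is_stuck(checkpoints, window=30):
    """Return True when the recorded (x, y) position has not changed over
    the last `window` steps."""
    if len(checkpoints) < window:
        return False
    return np.array_equal(checkpoints[-window], checkpoints[-1])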
def main():
    # make env and agent
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4,
                  eps_end=0.01, input_dims=[8], lr=0.0001)
    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:  # in-game loop
            # get action from the current view of the game (observation)
            action = agent.choose_action(observation)
            # advance to the next frame
            observation_, reward, done, info = env.step(action)
            score += reward
            # store the transition in replay memory
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            agent.learn()
            # the next state becomes the current state
            observation = observation_

        # record score and epsilon
        scores.append(score)
        eps_history.append(agent.epsilon)

        # print progress
        avg_score = np.mean(scores[-100:])
        print(f'Episode: {i} Score: {score} '
              f'Average Score: {avg_score} Epsilon: {agent.epsilon}')
    brain.store_transition(observation, action, reward, observation_,
                           int(done))
    observation = observation_
print('done initializing memory')

# uncomment the line below to record every episode.
# env = wrappers.Monitor(env, "tmp/space-invaders-1",
#                        video_callable=lambda episode_id: True, force=True)

for i in range(numGames):
    print('starting game ', i + 1, 'epsilon: %.4f' % brain.epsilon)
    epsHistory.append(brain.epsilon)
    done = False
    observation = env.reset()
    observation = preprocess(observation, crop_start, crop_end)
    score = 0
    while not done:
        action = brain.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        observation_ = preprocess(observation_, crop_start, crop_end)
        if done and info['ale.lives'] == 0:
            reward = -100
        brain.store_transition(observation, action, reward, observation_,
                               int(done))
        observation = observation_
        brain.learn()
        env.render()
    scores.append(score)
    print('score:', score)

x = [i + 1 for i in range(numGames)]
fileName = str(numGames) + 'Games' + 'Gamma' + str(brain.gamma) + \
    'Alpha' + str(brain.lr) + 'Memory' + str(brain.mem_size) + '.png'
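# The loop above relies on a `preprocess` helper that is not shown in this
# excerpt. A plausible minimal reconstruction (an assumption, not the
# project's code) converts the RGB frame to grayscale and crops the rows
# between crop_start and crop_end:
import numpy as np

def preprocess(frame, crop_start, crop_end):
    """Hypothetical sketch: grayscale by channel average, then row crop."""
    gray = np.mean(frame, axis=2)        # (H, W, 3) -> (H, W)
    return gray[crop_start:crop_end, :]  # keep only the playing field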
n_games = 300
show = False
agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0005, input_dims=8,
              n_actions=4, batch_size=64)
scores = []
eps_history = []

for i in range(1, n_games + 1):
    done = False
    score = 0
    observation = env.reset()
    while not done:
        if show:
            env.render()
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        agent.remember(observation, action, reward, observation_, done)
        observation = observation_
        agent.learn()

    eps_history.append(agent.epsilon)
    scores.append(score)
    avg_score = np.mean(scores[max(0, i - 100):i + 1])
    print('episode', i, 'score', score, 'avg score', avg_score)

    # checkpoint the model every 10 games
    if i % 10 == 0:
        agent.save_model()

plt.plot(scores)
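# The dangling plt.plot(scores) above could be rounded out with the same
# 100-episode running average the loop prints. A sketch (helper name and
# output path are illustrative only):
import numpy as np
import matplotlib.pyplot as plt

def plot_scores(scores, window=100, path='scores.png'):
    """Plot raw episode scores against their running average."""
    running_avg = [np.mean(scores[max(0, i - window):i + 1])
                   for i in range(len(scores))]
    plt.plot(scores, alpha=0.4, label='score')
    plt.plot(running_avg, label=f'{window}-episode average')
    plt.xlabel('episode')
    plt.ylabel('score')
    plt.legend()
    plt.savefig(path)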
def main():
    scores = []
    eps_history = []
    info_history = []

    # Random starting-points:
    env = sky.make(random=True, xi=(301, 650 - 25), yi=(100, 300 - 25),
                   width=15, height=15, v_initial=14)
    # Fixed starting-point:
    # env = sky.make(xi=550)

    agent = Agent(gamma=gamma, epsilon=epsilon, lr=lr,
                  input_dims=[imput_dimensions], n_actions=n_actions,
                  mem_size=mem_size, batch_size=batch_size,
                  epsilon_dec=epsilon_dec)
    if load_checkpoint:
        agent.load_models()

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            # one game: ends when done is True
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward,
                                   observation_, int(done))
            observation = observation_
            agent.learn()

        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print(i, 'episode', info, '|| score:', score,
                  '| average score: %.3f' % avg_score,
                  '| epsilon: %.3f' % agent.epsilon,
                  '| training done:', round(i / n_games, 2))
        else:
            print(i, 'episode', info, '|| score:', score)

        scores.append(score)
        eps_history.append(agent.epsilon)
        info_history.append(info)

    print('training ended with:',
          [[el, info_history.count(el)] for el in ('crashed', 'goal')])

    if save_checkpoint:
        agent.save_models()
        print('[+] model saved')

    # -------------------
    # Plotting and output
    # -------------------
    x = [i + 1 for i in range(n_games)]

    # First axis: scores
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Score per episode', color=color)
    ax1.scatter(x, scores, color=color, s=2)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second axis: epsilon
    ax2 = ax1.twinx()  # second axes sharing the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('epsilon', color=color)  # x-label already handled by ax1
    ax2.plot(x, eps_history, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # Output
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig(filename)

    return env
def selfplay():
    """Legacy function: an attempt at self-play reinforcement learning
    in the style of AlphaZero for Go."""
    agent2 = Agent(0.99, 0.1, 0.003, 42, train_games, 7, eps_dec)
    agent2.load_checkpoint()
    global win_cntr
    global done

    g = Game()
    turn = random.choice([PLAYER, AI])
    done = False
    transitions_agent = []
    transitions_agent2 = []

    while not done:
        g.printBoard()
        if turn == PLAYER:
            # row = input('{}\'s turn: '.format('Red'))
            # g.insert(int(row), turn)
            observation = np.asarray(
                [i for sublist in g.board for i in sublist])
            action = agent2.choose_action(observation)
            if g.check_if_action_valid(action):
                print('{}\'s turn: %d'.format('Red') % action)
                g.insert(action, PLAYER_PIECE)
            else:
                # penalize invalid moves, then fall back to a random column
                while not g.check_if_action_valid(action):
                    agent.store_transition(observation, action, -100,
                                           observation, done)
                    action = np.random.randint(7)
                print('{}\'s turn: %d'.format('Red') % action)
                g.insert(action, PLAYER_PIECE)
            observation_ = np.asarray(
                [i for sublist in g.board for i in sublist])
            transitions_agent2 += [(observation, action, observation_, done)]
        else:
            observation = np.asarray(
                [i for sublist in g.board for i in sublist])
            action = agent.choose_action(observation)
            if g.check_if_action_valid(action):
                print('{}\'s turn: %d'.format('Yellow') % action)
                g.insert(action, AI_PIECE)
            else:
                while not g.check_if_action_valid(action):
                    agent.store_transition(observation, action, -100,
                                           observation, done)
                    action = np.random.randint(7)
                print('{}\'s turn: %d'.format('Yellow') % action)
                g.insert(action, AI_PIECE)
            observation_ = np.asarray(
                [i for sublist in g.board for i in sublist])
            transitions_agent += [(observation, action, observation_, done)]
        turn = AI if turn == PLAYER else PLAYER

    if g.getWinner() == Tie:
        reward_agent = 0
    else:
        winner = AI if turn == PLAYER else PLAYER
        if winner == AI:
            win_cntr += 1
            reward_agent = 5 if vertical_win else 20
        else:
            reward_agent = -20

    # backfill the terminal reward into every transition of this game
    for obs, action, obs_, done_flag in transitions_agent:
        agent.store_transition(obs, action, reward_agent, obs_, done_flag)
    agent.learn()
    return
                break
            elif e.type == pygame.KEYDOWN:
                if e.key == pygame.K_RETURN:
                    done = False
                    p1_state, p2_state = env.reset()
                    break

        env.render()

        # Get actions based on player type
        if p1_type == 'Human':
            p1_action = pygame.mouse.get_pos()[1], pygame.mouse.get_rel()[1]
        elif p1_type == 'Agent':
            p1_action = agent_1.choose_action(p1_state)
        if p2_type == 'Human':
            p2_action = pygame.mouse.get_pos()[1], pygame.mouse.get_rel()[1]
        elif p2_type == 'Agent':
            p2_action = agent_2.choose_action(p2_state)

        # Environment takes a step; returns observations, rewards, and status
        p1_state_, p2_state_, p1_reward, p2_reward, done = env.step(
            p1_action, p2_action)

        # Update memory objects with states for each player
        if train_networks:
            memory_1.store_transition(p1_state, p1_action, p1_reward,
            scores['avg'].append(avg_score)
            scores['max'].append(max_score)
            scores['min'].append(min_score)
        else:
            print("episode " + str(episode) + " score " + str(score))

        state = env.reset()
        state = preprocess_state(state)
        stacked_states = stack_states(None, state, stack_size)
        score = 0
        done = False
        while not done:
            action = agent.choose_action(stacked_states)
            # the agent's action indices are offset by one from the env's
            new_state, reward, done, _ = env.step(action + 1)
            new_state = preprocess_state(new_state)
            new_stacked_states = stack_states(stacked_states, new_state,
                                              stack_size)
            agent.store_transition(stacked_states, action, reward,
                                   new_stacked_states, int(done))
            score += reward
            agent.learn()
            stacked_states = new_stacked_states

        history['eps'].append(agent.epsilon)
        history['score'].append(score)
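# `stack_states` is referenced but not defined in this excerpt. A common
# frame-stacking implementation, given here only as an assumption about what
# the project intends, keeps the most recent `stack_size` frames in one array:
import numpy as np

def stack_states(stacked, state, stack_size):
    """On the first call (stacked is None) repeat the frame stack_size
    times; afterwards drop the oldest frame and append the new one."""
    state = np.asarray(state)
    if stacked is None:
        return np.stack([state] * stack_size, axis=0)
    return np.concatenate([stacked[1:], state[None]], axis=0)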