def trainer():
    env = GameEnv(N_agents=8, enemies_stationary=True)
    # env.seed(0)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    st.write("training using device {}".format(device))
    agentArgs = {
        "device": device,  # use the detected device instead of hard-coding "cuda:0"
        "state_size": env.getStateDimension(),
        "action_size": 2,
        "seed": 0,
    }
    agent = Agent(**agentArgs)
    runner(env, agent, "ddqn")
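# The trainer above (and the loops later in this file) call agent.act(state, eps) and load
# weights into agent.qnetwork_local. The helper below is only a sketch of what such an
# epsilon-greedy action selection could look like; the function name and argument list are
# hypothetical and not taken from the project's agent implementation.
import random
import numpy as np
import torch

def epsilon_greedy_act(qnetwork, state, eps, action_size, device):
    """Pick an action from a Q-network's outputs with epsilon-greedy exploration."""
    state_t = torch.from_numpy(state).float().unsqueeze(0).to(device)
    qnetwork.eval()
    with torch.no_grad():
        action_values = qnetwork(state_t)
    qnetwork.train()
    # with probability (1 - eps) act greedily, otherwise explore uniformly at random
    if random.random() > eps:
        return int(np.argmax(action_values.cpu().numpy()))
    return random.randrange(action_size)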
env_info = env.reset(train_mode=True)[brain_name]

# number of agents in the environment
print('Number of agents:', len(env_info.agents))

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
print('States have length:', state_size)

agent = Agent(state_size=state_size, action_size=action_size)
# agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

def dqn(n_episodes=2000, max_t=1000, eps_start=0.9, eps_end=0.01, eps_decay=0.95):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
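# Quick sanity check on the decay schedule above (arithmetic added here, not in the original):
# with eps_start=0.9 and eps_decay=0.95, epsilon hits the eps_end floor of 0.01 after roughly
# 88 episodes, much faster than the 0.995 decay used for LunarLander below (~920 episodes from 1.0).
import math
episodes_to_floor = math.log(0.01 / 0.9) / math.log(0.95)
print(f'epsilon reaches its floor after ~{episodes_to_floor:.0f} episodes')  # ~88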
env_info = env.reset(train_mode=False)[brain_name]

# number of agents in the environment
print('Number of agents:', len(env_info.agents))

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
print('States have length:', state_size)

agent = Agent(state_size=state_size, action_size=action_size)
agent.qnetwork_local.load_state_dict(torch.load(dirpath + "/checkpoint.pth"))

max_t = 1000
for i in range(10):
    env_info = env.reset(train_mode=False)[brain_name]   # reset the environment
    state = env_info.vector_observations[0]
    score = 0
    for t in range(max_t):
        action = agent.act(state, 0.01)
        env_info = env.step(action)[brain_name]           # send the action to the environment
        next_state = env_info.vector_observations[0]      # get the next state
        reward = env_info.rewards[0]                      # get the reward
        done = env_info.local_done[0]                     # see if episode has finished
        score += reward
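        # The excerpt is cut off here; the loop presumably finishes along these lines
        # (a sketch mirroring the training loop at the end of this file, not the original code):
        state = next_state
        if done:
            break
    print('Play {} finished with score: {}'.format(i + 1, score))
env.close()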
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

env = gym.make('LunarLander-v2')
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)

from ddqn_agent import Agent

agent = Agent(state_size=8, action_size=4, seed=0)

# watch an untrained agent
state = env.reset()
score = 0
for j in range(200):
    action = agent.act(state)
    env.render()
    state, reward, done, _ = env.step(action)
    score += reward
    if done:
        break
print('Score: %d' % score)
env.close()
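# Note: the snippet above uses the classic gym API (env.seed(), 4-tuple step()). If your local
# setup is gymnasium / gym >= 0.26, the same rollout would look roughly like this instead;
# this is an assumption about the reader's environment, not part of the original code.
import gymnasium as gym

env = gym.make('LunarLander-v2', render_mode='human')
state, _ = env.reset(seed=0)
score = 0
for j in range(200):
    action = agent.act(state)
    state, reward, terminated, truncated, _ = env.step(action)
    score += reward
    if terminated or truncated:
        break
print('Score: %d' % score)
env.close()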
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

env = gym.make('LunarLander-v2')
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)

from ddqn_agent import Agent

agent = Agent(state_size=8, action_size=4, seed=0)

# watch an untrained agent
state = env.reset()
score = 0

def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
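    """
    # The original excerpt stops at the docstring above. The body below is only a sketch
    # consistent with this signature and with the Unity training loop at the end of this file;
    # it assumes the gym 4-tuple step() API and an agent.step(state, action, reward, next_state,
    # done) method, and is not the project's original implementation.
    scores = []
    scores_window = deque(maxlen=100)            # last 100 scores
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)      # decay epsilon once per episode
        if i_episode % 100 == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
    return scores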
env = UnityEnvironment(file_name="./Banana.app")

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# get the state and action size from the env
env_info = env.reset(train_mode=False)[brain_name]
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# initialize agent from trained model
algo = 'ddqn'
agent = Agent(state_size, action_size, seed=10, algo=algo)
print(f'Create agent from trained {algo} model')
agent.qnetwork_local.load_state_dict(torch.load(f'{algo}_model.pt'))

# track game plays
scores = []
total_tries = 0
while total_tries < TOTAL_PLAYS:
    score = 0
    done = False
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    while not done:
        action = agent.act(state)
        env_info = env.step(action)[brain_name]
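        # The excerpt is cut off here; the inner loop presumably finishes much like the
        # evaluation snippet earlier in this file (a sketch, not the original code):
        next_state = env_info.vector_observations[0]   # get the next state
        reward = env_info.rewards[0]                   # get the reward
        done = env_info.local_done[0]                  # see if the episode has finished
        state = next_state
        score += reward
    scores.append(score)
    total_tries += 1
print('Average score over {} plays: {:.2f}'.format(total_tries, sum(scores) / len(scores)))
env.close()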
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# agent = MultiAgent(state_size=33, action_size=4, num_agents=20, random_seed=2)
# agent = MultiAgent(state_size=33, action_size=4, num_agents=1, random_seed=2)
agent = Agent(state_size=33, action_size=4, random_seed=args.seed,
              num_agents=num_agents, buffer_size=args.buffer_size,
              batch_size=args.batch_size, lr_actor=args.lr_actor,
              lr_critic=args.lr_critic, weight_decay=args.weight_decay,
              update_every=args.update_every, update_times=args.update_times,
              gamma=args.gamma)

import matplotlib.pyplot as plt
# %matplotlib inline

def ddpg(n_episodes=1000, max_t=1200, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores = []
    times = []
    for i_episode in range(1, n_episodes + 1):
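        # The excerpt stops at the start of the episode loop. A typical DDPG episode over the
        # parallel Unity agents would proceed roughly as below; the agent.reset()/act()/step()
        # calls are assumptions about the Agent's interface (and numpy as np is assumed to be
        # imported), so treat this as a sketch rather than the project's original body.
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()                                   # assumed: reset exploration noise
        episode_scores = np.zeros(num_agents)
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            episode_scores += rewards
            if np.any(dones):
                break
        scores_deque.append(np.mean(episode_scores))
        scores.append(np.mean(episode_scores))
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
    return scores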
def run_training_loop(checkpoint_file: str, tp: TrainingParams, algo: str):
    """Set up the environment for the agent to interact with and learn from.

    Params
    ======
        checkpoint_file: file name for checkpointing
        tp: a set of parameters for training
        algo: algorithm to use: ddqn or dqn
    """
    # set up the environment
    env = UnityEnvironment(file_name="./Banana.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # get the state_size and action_size of the env
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    state_size = len(state)

    # create the agent
    agent = Agent(state_size, action_size, seed=10, algo=algo)

    scores = []
    scores_window = deque(maxlen=100)
    best_avg_score = 0.0
    eps = tp.eps_start
    print(f'Training parameters:\n{tp}')

    for i_episode in range(1, tp.n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0.0
        for t in range(tp.max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        eps = max(tp.eps_end, eps * tp.eps_decay)          # decay epsilon once per episode
        print(f'\rEpisode {i_episode}\tScore: {score}')
        if i_episode % 100 == 0:
            print('\rEpisode {},\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if i_episode > 100 and np.mean(scores_window) > best_avg_score:
            print(f'\nBest avg score improved in last 100 of {i_episode} episodes! '
                  f'Average Score: {np.mean(scores_window):.2f}. Checkpointing.')
            torch.save(agent.qnetwork_local.state_dict(), checkpoint_file)
            best_avg_score = np.mean(scores_window)

    env.close()
    return scores
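# Hypothetical usage of run_training_loop. Only the TrainingParams field names used above are
# known from the code; the constructor call, checkpoint file name, and the matplotlib/numpy
# imports assumed here are illustrative, not part of the original script.
if __name__ == "__main__":
    tp = TrainingParams(n_episodes=1000, max_t=1000,
                        eps_start=1.0, eps_end=0.01, eps_decay=0.995)
    scores = run_training_loop("ddqn_checkpoint.pth", tp, algo="ddqn")

    # plot the per-episode scores, as the LunarLander snippets do with matplotlib
    plt.plot(np.arange(len(scores)), scores)
    plt.xlabel('Episode #')
    plt.ylabel('Score')
    plt.show()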