class MADDPG:
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Create the agents in the environment
        self.agents = [Agent(state_size, action_size, random_seed, num_agents)
                       for i in range(num_agents)]

        # Create the shared replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        return [agent.act(state, noise)
                for agent, state in zip(self.agents, states)]

    def step(self, states, actions, rewards, next_states, dones, num_current_episode):
        '''Experience replay: save experiences in replay memory and use them to learn.'''
        self.memory.add(encode(states), encode(actions), rewards,
                        encode(next_states), dones)

        if (len(self.memory) > BATCH_SIZE) and \
           (num_current_episode % UPDATE_EVERY_NB_EPISODE == 0):
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experiences from memory.
                # As of now maddpg_learn only works with 2 agents;
                # modify it to accept n agents.
                experiences = self.memory.sample()
                # Update agent 0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                experiences = self.memory.sample()
                # Update agent 1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)

    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        states, actions, rewards, next_states, dones = experiences

        # Filter out the agent's OWN states, actions and next_states batch
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states)

        # Filter out the OTHER agent's states, actions and next_states batch
        other_states = decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)

        # Concatenate both agents' information (own agent first, other agent second)
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states), dim=1).to(device)

        agent = self.agents[own_idx]

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        all_next_actions = torch.cat((agent.actor_target(own_states),
                                      agent.actor_target(other_states)), dim=1).to(device)
        # print("all states, all actions" + str(all_next_states.shape) + " " + str(all_next_actions.shape))
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        all_actions_pred = torch.cat((agent.actor_local(own_states),
                                      agent.actor_local(other_states).detach()), dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)

    def maddpg_learn_old(self, experiences, own_idx, other_idx, gamma=GAMMA):
        '''Only works for 2-agent systems; modify it for any number of agents.'''
        states, actions, rewards, next_states, dones = experiences

        # Filter out own states
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states)

        # Filter out the other agent's states
        other_states = decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)

        # Concatenate both agents' info
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states), dim=1).to(device)

        agent = self.agents[own_idx]

        # Update the critic
        # Get predicted next-state actions and Q values from target models
        all_next_actions = torch.cat((agent.actor_target(own_states),
                                      agent.actor_target(other_states)), dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))  # Q target for current state
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # Update the actor
        all_actions_pred = torch.cat((agent.actor_local(own_states),
                                      agent.actor_local(other_states).detach()), dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # Update target networks
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)

    def checkpoints(self):
        """Save checkpoints for all Agents"""
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'model_dir/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'model_dir/checkpoint_critic_local_' + str(idx) + '.pth'
            actor_target_filename = 'model_dir/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'model_dir/checkpoint_critic_target_' + str(idx) + '.pth'
            torch.save(agent.actor_local.state_dict(), actor_local_filename)
            torch.save(agent.critic_local.state_dict(), critic_local_filename)
            torch.save(agent.actor_target.state_dict(), actor_target_filename)
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
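The MADDPG class above stores one flattened row per multi-agent transition via encode, and slices a single agent's columns back out via decode, but neither helper is shown. A minimal sketch of what they are assumed to do (the reshaping convention is an assumption, not the original implementation):

import numpy as np

def encode(sa):
    # Assumed helper: flatten a (num_agents, size) array of per-agent
    # states or actions into one 1-D row for the shared replay buffer.
    return np.array(sa).reshape(-1)

def decode(size, num_agents, idx, sa):
    # Assumed helper: slice agent idx's columns out of a batch of
    # flattened rows, i.e. (batch, num_agents * size) -> (batch, size).
    return sa[:, idx * size:(idx + 1) * size]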
critic = Critic(sess, state_size=state_space, lr=LR_C)  # we need a good teacher, so the teacher should learn faster than the actor

timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H%M")
writer = tf.summary.FileWriter("./logs/LabyrinthZone_Act1/%s" % timestamp, sess.graph)

sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

max_t_interval = 100
scores = []                                    # list containing scores from each episode
scores_window = deque(maxlen=max_t_interval)   # last 100 scores
memory = ReplayBuffer(action_space, BUFFER_SIZE, BATCH_SIZE, 714)
state = None


def store_img(state, epoch, step):
    name = '../state_img/state_epoch_%i_%i.png' % (epoch, step)
    cv2.imwrite(name, state)


# RENDER = False
# eplison = 0.7
# decay = 0.95
# min_eplison = 0.05
max_mean_score = 2000
total_timestep = 0
class Maddpg():
    '''MADDPG Agent: interacts with and learns from the environment.'''

    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Instantiate multiple agents
        self.agents = [Agent(state_size, action_size, random_seed, num_agents)
                       for i in range(num_agents)]

        # Instantiate the replay buffer (shared between agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def reset(self):
        '''Reset all agents'''
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        '''Return the action to perform for each agent (per policy)'''
        return [agent.act(state, noise)
                for agent, state in zip(self.agents, states)]

    def step(self, states, actions, rewards, next_states, dones, num_current_episode):
        '''Save experience in replay memory, and use random samples from the buffer to learn'''
        self.memory.add(encode(states), encode(actions), rewards,
                        encode(next_states), dones)

        # If enough samples are in the replay memory and it is time to update
        if (len(self.memory) > BATCH_SIZE) and \
           (num_current_episode % UPDATE_EVERY_NB_EPISODE == 0):
            # Note: this code only expects 2 agents
            assert len(self.agents) == 2

            # Allow learning several times in a row in the same episode
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update Agent #0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                # Sample another batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update Agent #1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)

    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        states, actions, rewards, next_states, dones = experiences

        # Filter out the agent's OWN states, actions and next_states batch
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states)

        # Filter out the OTHER agent's states, actions and next_states batch
        other_states = decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)

        # Concatenate both agents' information (own agent first, other agent second)
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states), dim=1).to(device)

        agent = self.agents[own_idx]

        # Update critic:
        # get predicted next-state actions and Q values from target models
        all_next_actions = torch.cat((agent.actor_target(own_states),
                                      agent.actor_target(other_states)), dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # Update actor:
        # compute the actor loss
        all_actions_pred = torch.cat((agent.actor_local(own_states),
                                      agent.actor_local(other_states).detach()), dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # Update target networks
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)

    def checkpoints(self):
        '''Save checkpoints for all Agents'''
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'model_dir/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'model_dir/checkpoint_critic_local_' + str(idx) + '.pth'
            actor_target_filename = 'model_dir/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'model_dir/checkpoint_critic_target_' + str(idx) + '.pth'
            torch.save(agent.actor_local.state_dict(), actor_local_filename)
            torch.save(agent.critic_local.state_dict(), critic_local_filename)
            torch.save(agent.actor_target.state_dict(), actor_target_filename)
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
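A minimal sketch of how this Maddpg wrapper might be driven from a training loop, assuming a two-agent Tennis-style environment whose step() returns per-agent arrays (the env interface and sizes here are assumptions):

import numpy as np

maddpg = Maddpg(state_size=24, action_size=2, num_agents=2, random_seed=0)
for episode in range(1, 2001):
    states = env.reset()          # assumed shape: (num_agents, state_size)
    maddpg.reset()                # reset each agent's exploration noise
    scores = np.zeros(2)
    while True:
        actions = maddpg.act(states, noise=1.0)
        next_states, rewards, dones = env.step(actions)   # assumed env API
        maddpg.step(states, actions, rewards, next_states, dones, episode)
        states = next_states
        scores += rewards
        if np.any(dones):
            break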
def ddpg(agent_name, multiple_agents=False, PER=False, n_episodes=300, max_t=1000):
    """Deep Deterministic Policy Gradients

    Params
    ======
        agent_name (string): agent name
        multiple_agents (boolean): whether to train multiple agents
        PER (boolean): whether to use prioritized experience replay
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    env, env_info, states, state_size, action_size, brain_name, num_agents = \
        initialize_env(multiple_agents)
    device = get_device()
    scores_window = deque(maxlen=100)
    scores = np.zeros(num_agents)
    scores_episode = []

    agents = []
    shared_memory = ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE, RANDOM_SEED)
    for agent_id in range(num_agents):
        agents.append(Actor_Crtic_Agent(agent_name, agent_id, device, state_size, action_size))

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        for agent in agents:
            agent.reset()
        scores = np.zeros(num_agents)

        for t in range(max_t):
            actions = np.array([agents[i].act(states[i]) for i in range(num_agents)])
            env_info = env.step(actions)[brain_name]    # send the actions to the environment
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards                  # get the rewards
            dones = env_info.local_done

            for i in range(num_agents):
                agents[i].step(states[i], actions[i], rewards[i],
                               next_states[i], dones[i], shared_memory)

            if shared_memory.batch_passed():
                # exit()
                experiences = shared_memory.sample()
                agents[0].learn(experiences, shared_memory)
                agents = share_learning(agents[0].actor_local, agents)

            states = next_states
            scores += rewards

            if t % 20 == 0:
                print('\rTimestep {}\tScore: {:.2f}\tmin: {:.2f}\tmax: {:.2f}'
                      .format(t, np.mean(scores), np.min(scores), np.max(scores)), end="")
            if np.any(dones):
                break

        score = np.mean(scores)
        scores_window.append(score)   # save the most recent score
        scores_episode.append(score)

        print('\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}\tMax Score: {:.2f}'
              .format(i_episode, score, np.mean(scores_window), np.max(scores)), end="\n")
        update_csv(agent_name, i_episode, np.mean(scores_window), np.max(scores))
        agents[0].save_agent(agent_name)

        # Early stop
        if i_episode == 100:
            return scores_episode

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 30.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, np.mean(scores_window)))
            agents[0].save_agent(agent_name + "Complete")
            break

    return scores_episode
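The loop above calls a share_learning helper that is not defined in this snippet. A sketch of what it is assumed to do, given that it is called with the learner's actor_local and the agent list (name and semantics inferred from the call site):

def share_learning(shared_actor, agents):
    # Assumed helper: copy the learning agent's actor weights
    # into every agent's local actor so all agents share one policy.
    for agent in agents:
        agent.actor_local.load_state_dict(shared_actor.state_dict())
    return agents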
class DQN:
    def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9,
                 epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000,
                 policy_lr=0.01, batch_size=128, device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device        # device: cpu, gpu, etc.
        self.gamma = gamma
        # Parameters of the epsilon-greedy policy
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # target_net starts as an exact copy of policy_net's parameters
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # disable BatchNormalization and Dropout
        # Note the difference between parameters() and state_dict():
        # the former carries requires_grad=True
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def choose_action(self, state, train=True):
        '''Choose an action (epsilon-greedy during training, greedy otherwise).'''
        if train:
            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                math.exp(-1. * self.actions_count / self.epsilon_decay)
            self.actions_count += 1
            if random.random() > self.epsilon:
                with torch.no_grad():
                    # Convert to a tensor before feeding the network;
                    # the state elements are originally float64.
                    # Note: torch.tensor(state).unsqueeze(0) is equivalent
                    # to torch.tensor([state]).
                    state = torch.tensor([state], device=self.device, dtype=torch.float32)
                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                    q_value = self.policy_net(state)
                    # tensor.max(1) returns the max of each row and its index, e.g.
                    # torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
                    # so tensor.max(1)[1] is the index of the max value, i.e. the action.
                    action = q_value.max(1)[1].item()
            else:
                action = random.randrange(self.n_actions)
            return action
        else:
            with torch.no_grad():
                # Convert to a tensor before feeding the network (see above).
                state = torch.tensor([state], device='cpu', dtype=torch.float32)
                q_value = self.target_net(state)
                # tensor.max(1)[1] is the index of the max Q value, i.e. the action.
                action = q_value.max(1)[1].item()
            return action

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        # Randomly sample transitions from memory
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.memory.sample(self.batch_size)
        # Convert to tensors, e.g.
        # tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01], ...])
        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  # e.g. tensor([[1], ..., [0]])
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)  # e.g. tensor([1., 1., ..., 1])
        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)  # bool -> float -> tensor

        # Compute Q(s_t, a) for the current (s_t, a) pairs.
        # About torch.gather: for a = torch.Tensor([[1, 2], [3, 4]]),
        # a.gather(1, torch.Tensor([[0], [1]])) = torch.Tensor([[1], [3]])
        q_predict = self.policy_net(state_batch).gather(dim=1, index=action_batch)  # equivalent to self.forward
        # Compute max Q'(s_{t+1}) over all next states, where Q' is the target network
        next_state_values = self.target_net(next_state_batch).max(1)[0].detach()  # e.g. tensor([0.0060, -0.0171, ...])
        # Compute the Q target.
        # For terminal states done = 1, so the expected Q value equals the reward.
        q_target = reward_batch + self.gamma * next_state_values * (1 - done_batch)

        self.loss = nn.MSELoss()(q_predict, q_target.unsqueeze(1))  # mean squared error loss
        # Optimize the model
        self.optimizer.zero_grad()  # clear the old gradients from the last step
        # loss.backward() computes the derivative of the loss w.r.t. all
        # parameters that require gradients, via backpropagation
        self.loss.backward()
        for param in self.policy_net.parameters():  # clip to prevent exploding gradients
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # update the model

    def save_model(self, path):
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        self.target_net.load_state_dict(torch.load(path))
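A minimal sketch of a training loop for the DQN class above on a classic-control task; the buffer's push method and the target-sync interval are assumptions, not part of the snippet:

import gym

env = gym.make('CartPole-v0')
agent = DQN(n_states=env.observation_space.shape[0],
            n_actions=env.action_space.n, device="cpu")
TARGET_UPDATE = 4  # assumed target-network sync interval (episodes)
for episode in range(300):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, next_state, done)  # assumed buffer API
        state = next_state
        agent.update()
    if episode % TARGET_UPDATE == 0:
        agent.target_net.load_state_dict(agent.policy_net.state_dict())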
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, config):
        """Initialize an Agent object.

        Params
        ======
            config: configuration object carrying state_size (int), action_size (int),
                random_seed (int), and the remaining hyperparameters
        """
        self.state_size = config.state_size
        self.action_size = config.action_size
        self.seed = random.seed(config.random_seed)
        self.config = config
        self.t_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size, config.random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, config.random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_size, self.action_size, config.random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, config.random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic,
                                           weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(self.action_size, config.random_seed)

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, config.buffer_size,
                                   config.batch_size, config.random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards,
                                                           next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.config.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.config.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.config.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for a given state as per the current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using a given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
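The Agent above reads every hyperparameter from a single config object. A sketch of the fields it actually touches, collected into a namedtuple (the concrete values are assumptions for illustration):

from collections import namedtuple

Config = namedtuple('Config', [
    'state_size', 'action_size', 'random_seed',
    'lr_actor', 'lr_critic', 'weight_decay',
    'buffer_size', 'batch_size', 'gamma', 'tau', 'update_every',
])

config = Config(state_size=33, action_size=4, random_seed=0,
                lr_actor=1e-4, lr_critic=1e-3, weight_decay=0.0,
                buffer_size=int(1e6), batch_size=128,
                gamma=0.99, tau=1e-3, update_every=20)
agent = Agent(config)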
def train(args, param):
    """Train a policy; a CNN converts the [1, 3, 84, 84] observation to [1, 200]."""
    torch.cuda.set_device(1)
    use_gym = False
    # In case of seed experiments
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    # args.repeat_opt = repeat_opt
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.env_name) + '-agent-' + str(args.policy)
    pathname += "_states_image_"
    pathname += '_update_freq: ' + str(args.target_update_freq) + \
        "num_q_target_" + str(args.num_q_target) + "_seed_" + str(args.seed)
    text = "Start training target_update_freq: {} num_q_target: {} use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = 'runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )
        state_dim = 200
        print("State dim, ", state_dim)
        action_dim = env.dof
        max_action = 1
        args.max_episode_steps = 200

    if args.policy == "TD3_ad":
        policy = TD31v1(state_dim, action_dim, max_action, args)
    elif args.policy == "DDPG":
        policy = DDPG(state_dim, action_dim, max_action, args)

    file_name = "./pytorch_models/{}".format(args.env_name)
    replay_buffer = ReplayBuffer()
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0

    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            # env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean, total_timesteps)
                writer.flush()
            # If we are not at the very beginning, log the training progress
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f} Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window), time_format(time.time() - t0))
                print(text)
                write_into_file('search-' + pathname, text)
                # policy.train(replay_buffer, writer, episode_timesteps)
            # We evaluate the episode and save the policy
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the episode is done, reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)
            # Set done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0

        # Before 10000 timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:
            # After 10000 timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, add noise to the action and clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise, size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) + np.random.normal(
                    0, max_action * args.expl_noise, size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()

        # The agent performs the action in the environment, then reaches
        # the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args, state_buffer, policy)

        # Check whether the episode is done
        # done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True

        # Accumulate the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward

        # Store the new transition in the experience replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer next_obs ", new_obs.shape)
        replay_buffer.add((obs, new_obs, action, reward, done_bool))

        # Update the state, the episode timestep, the total timesteps,
        # and the timesteps since the last policy evaluation
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 1)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Add the last policy evaluation to the list of evaluations and save the model
    evaluations.append(evaluate_policy(policy, writer, total_timesteps, args, episode_num))
class MADDPGAgentGroup:
    """Group the MADDPG agents as a single entity"""

    def __init__(
            self,
            # env,
            state_size,
            action_size,
            num_agents,
            writer,
            hparams,
            print_every=1000,
            result_dir='results'):
        self.num_agents = num_agents
        # self.env = env
        # self.brain_name = self.env.brain_names[0]
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = hparams.batch_size
        self.buffer_size = hparams.buffer_size
        self.seed = hparams.seed
        self.update_every = hparams.update_every
        random.seed(self.seed)
        self.writer = writer
        self.result_dir = result_dir
        self.hparams = hparams

        self.agents = [
            ma.MADDPGAgent(self.num_agents,
                           self.state_size,
                           self.action_size,
                           i,
                           self.writer,
                           self.hparams,
                           result_dir=self.result_dir)
            for i in range(self.num_agents)
        ]
        self.gamma = hparams.gamma
        self.memory = ReplayBuffer(
            self.buffer_size,
            self.batch_size,
            self.hparams.seed,
        )
        self.print_every = print_every
        self.learn_step = 0
        self.critic_loss = 0.0
        self.actor_loss = 0.0

    def act(self, states, add_noise=True):
        """Executes act on all the agents

        Parameters:
            states (list): list of states, one for each agent
            add_noise (bool): whether to apply noise to the actions
        """
        actions = []
        for i, agent in enumerate(self.agents):
            action = agent.act(states[i], add_noise)
            actions.append(action)
        return actions

    def reshape(self, states, actions, rewards, next_states, dones):
        """Reshape the inputs"""
        # Add axis=0 to states, actions, and next_states
        states = np.expand_dims(states, axis=0)
        next_states = np.expand_dims(next_states, axis=0)
        assert (states.shape[0] == 1 and states.shape[1] == self.num_agents
                and states.shape[2] == self.state_size)
        actions = np.expand_dims(actions, axis=0)
        assert (actions.shape[0] == 1 and actions.shape[1] == self.num_agents
                and actions.shape[2] == self.action_size)
        # For rewards and dones, reshape then add axis=0
        rewards = np.expand_dims(np.array(rewards).reshape(self.num_agents, -1), axis=0)
        assert (rewards.shape[0] == 1 and rewards.shape[1] == self.num_agents
                and rewards.shape[2] == 1)
        dones = np.expand_dims(np.array(dones).reshape(self.num_agents, -1), axis=0)
        return states, actions, rewards, next_states, dones

    def step(self, states, actions, rewards, next_states, dones):
        """Performs the learning step."""
        # Store a single entry for results from all agents by adding axis=0
        states, actions, rewards, next_states, dones = self.reshape(
            states, actions, rewards, next_states, dones)
        self.memory.add(states, actions, rewards, next_states, dones)

        # Get the agents to learn from experience if we have enough data/experiences in memory
        if len(self.memory) > self.batch_size and self.learn_step % self.update_every == 0:
            experiences = self.memory.sample()
            actor_losses = []
            critic_losses = []
            for agent in self.agents:
                actor_loss, critic_loss = agent.learn(self.agents, experiences, self.gamma)
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            # Plot real-time graphs and store losses
            if self.learn_step % self.print_every == 0:
                # Save critic loss
                utils.save_to_txt(critic_losses,
                                  '{}/critic_losses.txt'.format(self.result_dir))
                self.writer.text('critic loss: {}'.format(critic_losses), 'Critic')
                self.writer.push(critic_losses, 'Loss(critic)')
                # Save actor loss
                utils.save_to_txt(actor_losses,
                                  '{}/actor_losses.txt'.format(self.result_dir))
                self.writer.text('actor loss: {}'.format(actor_losses), 'Actor')
                self.writer.push(actor_losses, 'Loss(actor)')

            self.critic_loss = np.array(critic_losses).mean()
            self.actor_loss = np.array(actor_losses).mean()

        self.learn_step += 1
        return self.critic_loss, self.actor_loss

    def reset(self):
        """Resets the noise for each agent"""
        for agent in self.agents:
            agent.reset()

    def save(self):
        """Checkpoint actor and critic models"""
        for agent in self.agents:
            agent.check_point()
class DDPGAgent(object):
    """Class of the DDPG Agent"""

    def __init__(self, config):
        """Initialize an Agent object.

        Args:
            param1: (config)
        """
        self.state_size = config.state_dim
        self.action_size = config.action_dim
        self.seed = np.random.seed(config.seed)
        self.n_agents = config.n_agents
        self.batch_size = config.batch_size
        self.tau = config.tau
        self.gamma = config.gamma
        self.device = config.device

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(config).to(config.device)
        self.actor_target = Actor(config).to(config.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(config).to(config.device)
        self.critic_target = Critic(config).to(config.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.lr_critic)

        # Noise process
        self.noise = OUNoise(config)

        # Replay memory
        self.memory = ReplayBuffer(config)
        # self.timesteps = 0

    def act(self, states, epsilon, add_noise=True):
        """Given a list of states for each agent, return the actions to be taken by
        each agent based on the current policy.

        Returns a numpy array of shape [n_agents, n_actions].
        NOTE: clips actions to be between -1 and 1.

        Args:
            states: (torch) states
            epsilon: (float)
            add_noise: (bool) add noise to the actions
        """
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise and epsilon > np.random.random():
            actions += [self.noise.sample() for _ in range(self.n_agents)]
        return np.clip(actions, -1, 1)

    def reset_noise(self):
        """Reset noise"""
        self.noise.reset()

    def learn(self):
        """Update policy and value parameters using a batch of experience tuples.

        actor_target(state) -> action
        critic_target(state, action) -> Q-value
        """
        if self.batch_size > self.memory.size():
            return
        states, actions, rewards, next_states, dones = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        Args:
            param1: (torch network) local_model
            param2: (torch network) target_model
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
class Maddpg():
    """MADDPG Agent : Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize a MADDPG Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        super(Maddpg, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Instantiate multiple agents
        self.agents = [Agent(state_size, action_size, random_seed, num_agents)
                       for i in range(num_agents)]

        # Instantiate the replay buffer (shared between agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def reset(self):
        """Reset all the agents"""
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        """Return the action to perform for each agent (per policy)"""
        return [agent.act(state, noise)
                for agent, state in zip(self.agents, states)]

    def step(self, states, actions, rewards, next_states, dones, num_current_episode):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        # self.memory.add(states, actions, rewards, next_states, dones)
        self.memory.add(encode(states), encode(actions), rewards,
                        encode(next_states), dones)

        # If enough samples are in the replay memory and it is time to update
        if (len(self.memory) > BATCH_SIZE) and \
           (num_current_episode % UPDATE_EVERY_NB_EPISODE == 0):
            # Note: this code only expects 2 agents
            assert len(self.agents) == 2

            # Allow learning several times in a row in the same episode
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update Agent #0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                # Sample another batch of experience from the replay buffer
                experiences = self.memory.sample()
                # Update Agent #1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)

    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        """Update the policy of the MADDPG "own" agent. The actors only have access
        to their agent's own information, whereas the critics have access to all
        agents' information.

        Update policy and value parameters using a given batch of experience tuples:

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(states) -> action
            critic_target(all_states, all_actions) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            own_idx (int): index of the own agent to update in self.agents
            other_idx (int): index of the other agent in self.agents
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Filter out the agent's OWN states, actions and next_states batch
        own_states = decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states)

        # Filter out the OTHER agent's states, actions and next_states batch
        other_states = decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)

        # Concatenate both agents' information (own agent first, other agent second)
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states), dim=1).to(device)

        agent = self.agents[own_idx]

        # ---------------------------- Update Critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        all_next_actions = torch.cat((agent.actor_target(own_states),
                                      agent.actor_target(other_states)), dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- Update Actor ---------------------------- #
        # Compute actor loss
        all_actions_pred = torch.cat((agent.actor_local(own_states),
                                      agent.actor_local(other_states).detach()), dim=1).to(device)
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- Update Target Networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)

    def checkpoints(self):
        """Save checkpoints for all Agents"""
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'model_dir/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'model_dir/checkpoint_critic_local_' + str(idx) + '.pth'
            actor_target_filename = 'model_dir/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'model_dir/checkpoint_critic_target_' + str(idx) + '.pth'
            torch.save(agent.actor_local.state_dict(), actor_local_filename)
            torch.save(agent.critic_local.state_dict(), critic_local_filename)
            torch.save(agent.actor_target.state_dict(), actor_target_filename)
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
class DDPG:
    def __init__(self, n_states, n_actions, hidden_dim=30, device="cpu",
                 critic_lr=1e-3, actor_lr=1e-4, gamma=0.99, soft_tau=1e-2,
                 memory_capacity=100000, batch_size=128):
        self.device = device
        self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
        self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.memory = ReplayBuffer(memory_capacity)
        self.batch_size = batch_size
        self.soft_tau = soft_tau
        self.gamma = gamma

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        # torch.detach() cuts the tensor out of the autograd graph
        return action.detach().cpu().numpy()[0, 0]

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
        # Convert all variables to tensors
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)

        # Note: the critic takes (s_t, a) as input
        policy_loss = self.critic(state, self.actor(state))
        policy_loss = -policy_loss.mean()

        next_action = self.target_actor(next_state)
        target_value = self.target_critic(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * self.gamma * target_value
        expected_value = torch.clamp(expected_value, -np.inf, np.inf)

        value = self.critic(state, action)
        value_loss = nn.MSELoss()(value, expected_value.detach())

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # Soft-update the target networks
        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)

    def save_model(self, path):
        torch.save(self.target_actor.state_dict(), path)

    def load_model(self, path):
        self.actor.load_state_dict(torch.load(path))
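A minimal sketch of driving this DDPG class on a continuous-control Gym task; the buffer's push method name is an assumption:

import gym
import numpy as np

env = gym.make('Pendulum-v0')
agent = DDPG(n_states=env.observation_space.shape[0],
             n_actions=env.action_space.shape[0], device="cpu")
for episode in range(100):
    state = env.reset()
    for step in range(200):
        action = agent.select_action(state)               # scalar action
        next_state, reward, done, _ = env.step([action])  # Pendulum expects a 1-D action
        agent.memory.push(state, action, reward, next_state, done)  # assumed buffer API
        agent.update()
        state = next_state
        if done:
            break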
def train(config_file_path: str, save_dir: str, use_vime: bool,
          random_policy: bool, device: str, visualize_interval: int):
    conf_d = toml.load(open(config_file_path))
    conf = namedtuple('Config', conf_d.keys())(*conf_d.values())

    # Check that the saving directory is valid
    if "test" in save_dir and os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    if os.path.exists(save_dir):
        raise ValueError("Directory {} already exists.".format(save_dir))
    # Create save dir
    os.makedirs(save_dir)
    ckpt_dir = os.path.join(save_dir, 'checkpoints')
    os.makedirs(ckpt_dir)
    log_dir = os.path.join(save_dir, 'logs')
    os.makedirs(log_dir)
    # Save config file
    shutil.copyfile(config_file_path, os.path.join(save_dir, os.path.basename(config_file_path)))

    # Seed the random number generators
    np.random.seed(int(time.time()))
    torch.manual_seed(int(time.time()))
    device = torch.device(device)
    if device.type == 'cuda':
        torch.cuda.manual_seed(int(time.time()))

    # Set up log metrics
    metrics = {
        'episode': [],
        'collected_samples': [],
        'reward': [],            # cumulative reward
        'curiosity_reward': [],  # cumulative reward including information gain
        'likelihood': [],        # likelihood of the learned dynamics model
        'D_KL_median': [],
        'D_KL_mean': [],
        'q1_loss': [],
        'policy_loss': [],
        'alpha_loss': [],
        'alpha': [],
        'ELBO': [],
        'step': [],
        'step_reward': [],
        'test_episode': [],
        'test_reward': [],
    }

    # Set up environment
    print("----------------------------------------\nTrain in {}\n----------------------------------------".format(conf.environment))
    env = gym.make(conf.environment)
    if use_vime:
        print("Use VIME")
    if random_policy:
        print("Keep using random policy.")

    # Training set up
    agent = SAC(env.observation_space, env.action_space, device, **conf.agent)
    memory = ReplayBuffer(conf.replay_buffer_capacity, env.observation_space.shape, env.action_space.shape)
    vime = VIME(env.observation_space.shape[0], env.action_space.shape[0], device, **conf.vime) if use_vime else None

    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        if use_vime:
            vime.load_state_dict(ckpt['vime'])

    def save_checkpoint():
        # Save checkpoint
        ckpt = {'metrics': metrics, 'agent': agent.state_dict(), 'memory': memory.state_dict()}
        if use_vime:
            ckpt['vime'] = vime.state_dict()
        path = os.path.join(ckpt_dir, 'checkpoint.pth')
        torch.save(ckpt, path)

        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(ckpt_dir, 'model.pth')
        torch.save(model_ckpt, model_path)

        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(ckpt_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    moving_avg_coef = 0.1
    agent_update_count = 0
    total_steps = 0

    for episode in pbar:
        o = env.reset()
        rewards, curiosity_rewards = [], []
        info_gains = []
        log_likelihoods = []
        q1_losses, q2_losses, policy_losses, alpha_losses, alphas = [], [], [], [], []

        for t in range(conf.horizon):
            if len(memory) < conf.random_sample_num or random_policy:
                a = env.action_space.sample()
            else:
                a = agent.select_action(o, eval=False)
            o_next, r, done, _ = env.step(a)
            total_steps += 1
            metrics['step'].append(total_steps)
            metrics['step_reward'].append(r)
            # done should be False if the episode was terminated forcefully
            done = False if t == env._max_episode_steps - 1 else bool(done)
            rewards.append(r)

            if use_vime and len(memory) >= conf.random_sample_num:
                # Calculate the curiosity reward in VIME
                info_gain, log_likelihood = vime.calc_info_gain(o, a, o_next)
                assert not np.isnan(info_gain).any() and not np.isinf(info_gain).any(), \
                    "invalid information gain, {}".format(info_gain)
                info_gains.append(info_gain)
                log_likelihoods.append(log_likelihood)
                vime.memorize_episodic_info_gains(info_gain)
                r = vime.calc_curiosity_reward(r, info_gain)
            curiosity_rewards.append(r)

            memory.append(o, a, r, o_next, done)
            o = o_next

            # Update agent
            if len(memory) >= conf.random_sample_num and not random_policy:
                for _ in range(conf.agent_update_per_step):
                    batch_data = memory.sample(conf.agent_update_batch_size)
                    q1_loss, q2_loss, policy_loss, alpha_loss, alpha = \
                        agent.update_parameters(batch_data, agent_update_count)
                    q1_losses.append(q1_loss)
                    q2_losses.append(q2_loss)
                    policy_losses.append(policy_loss)
                    alpha_losses.append(alpha_loss)
                    alphas.append(alpha)
                    agent_update_count += 1

            if done:
                break

        if len(log_likelihoods) == 0:
            log_likelihoods.append(-np.inf)

        # Display performance
        episodic_reward = np.sum(rewards)
        reward_moving_avg = episodic_reward if reward_moving_avg is None else \
            (1 - moving_avg_coef) * reward_moving_avg + moving_avg_coef * episodic_reward
        if use_vime:
            pbar.set_description("EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Curiosity {:.1f}, Rwd {:.1f} (m.avg {:.1f}), Likelihood {:.2E}".format(
                episode, memory.step, len(memory), len(rewards),
                np.sum(curiosity_rewards), episodic_reward, reward_moving_avg,
                np.mean(np.exp(log_likelihoods))))
        else:
            pbar.set_description("EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Rwd {:.1f} (mov avg {:.1f})".format(
                episode, memory.step, len(memory), len(rewards), episodic_reward, reward_moving_avg))

        # Save episodic metrics
        metrics['episode'].append(episode)
        metrics['collected_samples'].append(total_steps)
        metrics['reward'].append(episodic_reward)
        metrics['curiosity_reward'].append(np.sum(curiosity_rewards))
        metrics['likelihood'].append(np.mean(np.exp(log_likelihoods)))
        if episode % visualize_interval == 0:
            lineplot(metrics['step'][-len(metrics['step_reward']):], metrics['step_reward'], 'stepwise_reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['reward']):], metrics['reward'], 'reward', log_dir)
            lineplot(metrics['collected_samples'][-len(metrics['reward']):], metrics['reward'], 'sample-reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['curiosity_reward']):], metrics['curiosity_reward'], 'curiosity_reward', log_dir)
            lineplot(metrics['episode'][-len(metrics['likelihood']):], metrics['likelihood'], 'likelihood', log_dir)

        # Agent-update-related metrics
        if len(policy_losses) > 0 and not random_policy:
            metrics['q1_loss'].append(np.mean(q1_losses))
            metrics['policy_loss'].append(np.mean(policy_losses))
            metrics['alpha_loss'].append(np.mean(alpha_losses))
            metrics['alpha'].append(np.mean(alphas))
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q1_loss']):], metrics['q1_loss'], 'q1_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):], metrics['policy_loss'], 'policy_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):], metrics['alpha_loss'], 'alpha_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):], metrics['alpha'], 'alpha', log_dir)

        # Update VIME
        if use_vime and len(memory) >= conf.random_sample_num:
            for _ in range(conf.vime_update_per_episode):
                batch_s, batch_a, _, batch_s_next, _ = memory.sample(conf.vime_update_batch_size)
                elbo = vime.update_posterior(batch_s, batch_a, batch_s_next)
            metrics['ELBO'].append(elbo)
            lineplot(metrics['episode'][-len(metrics['ELBO']):], metrics['ELBO'], 'ELBO', log_dir)
            if len(info_gains) > 0:
                metrics['D_KL_median'].append(np.median(info_gains))
                metrics['D_KL_mean'].append(np.mean(info_gains))
                multiple_lineplot(metrics['episode'][-len(metrics['D_KL_median']):], np.array([metrics['D_KL_median'], metrics['D_KL_mean']]).T, 'D_KL', ['median', 'mean'], log_dir)

        # Test current policy
        if episode % conf.test_interval == 0:
            rewards = []
            for _ in range(conf.test_times):
                o = env.reset()
                done = False
                episode_reward = 0
                while not done:
                    a = agent.select_action(o, eval=True)
                    o_next, r, done, _ = env.step(a)
                    episode_reward += r
                    o = o_next
                rewards.append(episode_reward)
            mean, std = np.mean(rewards), np.std(rewards)
            print("\nTEST AT EPISODE {} ({} episodes) --- Avg. Reward {:.2f} (+- {:.2f})".format(episode, conf.test_times, mean, std))
            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):], metrics['test_reward'], 'test_reward', log_dir)

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    save_checkpoint()
    # Save the final model
    torch.save({'agent': agent.state_dict()}, os.path.join(ckpt_dir, 'final_model.pth'))
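The training script logs through lineplot and multiple_lineplot helpers that are not shown. A minimal matplotlib sketch matching the lineplot call sites (the .png naming is an assumption):

import os
import matplotlib
matplotlib.use('Agg')  # headless backend for training servers
import matplotlib.pyplot as plt

def lineplot(xs, ys, title, log_dir, xaxis='episode'):
    # Plot one metric curve and save it as <log_dir>/<title>.png.
    fig, ax = plt.subplots()
    ax.plot(xs, ys)
    ax.set_xlabel(xaxis)
    ax.set_ylabel(title)
    ax.set_title(title)
    fig.savefig(os.path.join(log_dir, title + '.png'))
    plt.close(fig)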
class Agent:
    def __init__(self) -> None:
        self.network = NetWork().to(device)
        print("Number of parameters in network:", count_parameters(self.network))
        self.criterion = MSELoss()
        self.optimizer = Adam(self.network.parameters(), lr=0.001, weight_decay=0.001)
        self.memory = ReplayBuffer(100000)
        self.remember = self.memory.remember()
        self.exploration = Exploration()
        self.explore = self.exploration.epsilonGreedy
        self.target_network = NetWork().to(device)
        self.placeholder_network = NetWork().to(device)

    def choose(self, pixels, hn, cn):
        self.network.hn, self.network.cn = hn, cn
        vals = self.network(pixels).reshape(15)
        return self.explore(vals), pixels, hn, cn, self.network.hn, self.network.cn

    def learn(self, double=False):
        gamma = 0.96
        obs, action, obs_next, reward, h0, c0, hn, sn, done = \
            self.memory.sample_distribution(20)
        # self.network.hn, self.network.cn = hn, sn
        # if double:
        #     v_s_next = torch.gather(self.target_network(obs_next), 1,
        #                             torch.argmax(self.network(obs_next), 1).view(-1, 1)).squeeze(1)
        # else:
        #     v_s_next, input_indexes = torch.max(self.target_network(obs_next), 1)
        # self.network.hn, self.network.cn = h0, c0
        # v_s = torch.gather(self.network(obs), 1, action)
        # # v_s, _ = torch.max(self.network(obs), 1)
        # td = (reward + gamma * v_s_next * done.type(torch.float)).detach().view(-1, 1)
        # loss = self.criterion(v_s, td)
        # loss.backward()
        # self.optimizer.step()
        # self.optimizer.zero_grad()
        # torch.cuda.empty_cache()
        self.autoEncode(obs)

    def autoEncode(self, obs):
        enc = self.network.color(obs)
        obs_Guess = self.network.colorReverse(enc)
        # print(enc.prod(1).sum())
        # print(f"[{str(float(enc_stand.max()))[:8]}]", end=" ")
        entro = (enc + 1).prod(1) - (1 + enc).max(1)[0]
        img = self.criterion(obs_Guess.view(20, -1), obs.view(20, -1) / 256)
        # print(enc.max(1)[0].max(1)[0].max(1)[0].shape)
        # print(enc.max(1, keepdim=True)[0].shape)
        maxi = enc.max(1, keepdim=False)[0]
        loss = img * 100 + (entro * entro).mean()
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        print(f"[{str(float(loss))[:8]}]", end=" ")
        print(f"[{str(float(img * 100))[:8]}]", end=" ")
        print(f"[{str(float((entro * entro).mean()))[:8]}]", end=" ")
        print(f"[{str(float(enc.min()))[:8]}]", end=" ")
        print(f"[{str(float(enc.max()))[:8]}]")
        # print(f"[{str(float(enc.mean()))[:8]}]")
        # print(f"[{str(float(entro.min()))[:8]}]", end=" ")
        # print(f"[{str(float(entro.max()))[:8]}]", end=" ")
        # print(f"[{str(float(enc.max()))[:8]}]")
        # print(*[[float(str(f)[:5]) for f in list(p.detach().cpu().numpy().reshape(-1))] for p in self.network.color.parameters()],
        #       *[[float(str(f)[:5]) for f in list(p.detach().cpu().numpy().reshape(-1))] for p in self.network.colorReverse.parameters()])
        torch.cuda.empty_cache()

    def update_target_network(self):
        self.target_network = copy.deepcopy(self.placeholder_network)
        self.placeholder_network = copy.deepcopy(self.network)
        self.memory.update_distribution()
class DQNAgent:
    def __init__(self, settings):
        self.check_settings(settings)

        # Constants
        self.batch_size = settings["batch_size"]
        self.checkpoint_frequency = settings["checkpoint_frequency"]
        self.device = settings["device"]
        self.dtype = (torch.cuda.FloatTensor
                      if self.device.type == "cuda" else torch.FloatTensor)
        self.env_name = settings["env"]
        self.env = get_env(settings["env"], 6)
        self.eps_cliff = settings["eps_cliff"]
        self.eps_start = settings["eps_start"]
        self.eps_end = settings["eps_end"]
        self.frame_history_len = settings["frame_history_len"]
        self.gamma = settings["gamma"]
        self.learning_freq = settings["learning_freq"]
        self.learning_start = settings["learning_start"]
        self.logs_dir = settings["logs_dir"]
        self.log_freq = settings["log_freq"]
        self.memory_size = settings["memory_size"]
        self.model_name = settings["model_name"]
        self.num_actions = self.env.action_space.n
        settings["num_actions"] = self.num_actions
        settings["num_channels"] = self.frame_history_len
        self.out_dir = settings["out_dir"]
        self.target_update_freq = settings["target_update_freq"]
        self.total_timesteps = settings["total_timesteps"]

        # Init models
        self.Q = DQN(settings).to(self.device)
        self.target_Q = DQN(settings).to(self.device)
        self.target_Q.load_state_dict(self.Q.state_dict())
        self.target_Q.eval()

        # Init model supporting objects
        self.memory = ReplayBuffer(self.memory_size, self.frame_history_len)
        self.optimizer = optim.RMSprop(self.Q.parameters(), lr=settings["lr"], alpha=0.95, eps=0.01)
        self.loss = F.smooth_l1_loss

        # Logging
        self.writer = SummaryWriter(self.logs_dir)

    def check_settings(self, settings):
        required_settings = [
            "batch_size",
            "checkpoint_frequency",
            "device",
            "env",
            "eps_start",
            "eps_end",
            "eps_cliff",
            "frame_history_len",
            "gamma",
            "learning_start",
            "log_freq",
            "logs_dir",
            "lr",
            "memory_size",
            "model_name",
            "out_dir",
            "target_update_freq",
            "total_timesteps",
        ]
        if not settings_is_valid(settings, required_settings):
            raise Exception(
                f"Settings object {settings} missing some required settings.")

    def _get_epsilon(self, steps_done):
        if steps_done < self.eps_cliff:
            epsilon = (-(self.eps_start - self.eps_end) / self.eps_cliff * steps_done
                       + self.eps_start)
        else:
            epsilon = self.eps_end
        return epsilon

    def select_epsilon_greedy_action(self, state, steps_done, epsilon=None):
        if epsilon is None:
            threshold = self._get_epsilon(steps_done)
        else:
            threshold = epsilon
        if random.random() < threshold:
            return torch.IntTensor([random.randrange(self.num_actions)])
        obs = torch.from_numpy(state).type(self.dtype).unsqueeze(0) / 255.0
        with torch.no_grad():
            return self.Q(obs).argmax(dim=1).cpu()  # returns action

    def should_stop(self):
        return (get_wrapper_by_name(self.env, "Monitor").get_total_steps()
                >= self.max_steps)

    def eval_model(self, epoch, n=100):
        self.Q.eval()
        env = get_env(self.env_name, 6, monitor=False)
        rewards = []
        durations = []
        for _e in tqdm(range(n)):
            memory = ReplayBuffer(10000, self.frame_history_len)
            state = env.reset()[..., np.newaxis]
            reward_acc = 0.0
            for t in range(10000):
                if state is None:
                    break
                memory.store_frame(state)
                recent_observations = memory.encode_recent_observation()
                action = self.select_epsilon_greedy_action(recent_observations, None, 0.05).item()
                state, reward, done, _ = env.step(action)
                if done:
                    state = env.reset()
                state = state[..., np.newaxis]
                reward_acc += reward
            rewards.append(reward_acc)
            durations.append(t)
        self.Q.train()
        sum_rewards = sum(rewards)
        sum_durations = sum(durations)
        self.writer.add_scalar(
            f"Mean Reward ({n} episodes)",
            round(sum_rewards / len(rewards), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Duration ({n} episodes)",
            round(sum_durations / len(durations), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Reward per Timestep ({n} episodes)",
            round(sum_rewards / sum_durations, 2),
            epoch,
        )

    def train(self):
        num_param_updates = 0
        loss_acc_since_last_log = 0.0
        param_updates_since_last_log = 0
        num_episodes = 0
        state = self.env.reset()[..., np.newaxis]

        for t in tqdm(range(self.total_timesteps)):
            last_idx = self.memory.store_frame(state)
            recent_observations = self.memory.encode_recent_observation()

            # Choose an epsilon-greedy action once learning has started,
            # otherwise a random action
            if t > self.learning_start:
                action = self.select_epsilon_greedy_action(recent_observations, t).item()
            else:
                action = random.randrange(self.num_actions)

            # Advance a step
            next_state, reward, done, _ = self.env.step(action)
            next_state = next_state[..., np.newaxis]

            # Store result in memory
            self.memory.store_effect(last_idx, action, reward, done)

            # Reset if done (life lost, due to atari wrapper)
            if done:
                next_state = self.env.reset()
                next_state = next_state[..., np.newaxis]
            state = next_state

            # Train network using experience replay when
            # memory is sufficiently large.
            if (t > self.learning_start and t % self.learning_freq == 0
                    and self.memory.can_sample(self.batch_size)):
                # Sample from replay buffer
                (state_batch, act_batch, r_batch, next_state_batch, done_mask) = \
                    self.memory.sample(self.batch_size)
                state_batch = torch.from_numpy(state_batch).type(self.dtype) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(self.device)
                r_batch = torch.from_numpy(r_batch).to(self.device)
                next_state_batch = torch.from_numpy(next_state_batch).type(self.dtype) / 255.0
                not_done_mask = torch.from_numpy(1 - done_mask).type(self.dtype)

                # Calculate current Q values
                current_Q_vals = self.Q(state_batch).gather(1, act_batch.unsqueeze(1))

                # Calculate next Q values based on the action that gives max Q values
                next_max_Q = self.target_Q(next_state_batch).detach().max(dim=1)[0]
                next_Q_vals = not_done_mask * next_max_Q

                # Calculate target of current Q values
                target_Q_vals = r_batch + (self.gamma * next_Q_vals)

                # Calculate loss and backprop
                loss = F.smooth_l1_loss(current_Q_vals.squeeze(), target_Q_vals)
                self.optimizer.zero_grad()
                loss.backward()
                for param in self.Q.parameters():
                    param.grad.data.clamp_(-1, 1)

                # Update weights
                self.optimizer.step()
                num_param_updates += 1

                # Store stats
                loss_acc_since_last_log += loss.item()
                param_updates_since_last_log += 1

                # Update target network periodically
                if num_param_updates % self.target_update_freq == 0:
                    self.target_Q.load_state_dict(self.Q.state_dict())

                # Save model checkpoint
                if num_param_updates % self.checkpoint_frequency == 0:
                    save_model_checkpoint(
                        self.Q,
                        self.optimizer,
                        t,
                        f"{self.out_dir}/checkpoints/{self.model_name}_{num_param_updates}",
                    )

                # Log progress
                if (num_param_updates % (self.log_freq // 2) == 0
                        and param_updates_since_last_log > 0):
                    self.writer.add_scalar(
                        "Mean Loss per Update (Updates)",
                        loss_acc_since_last_log / param_updates_since_last_log,
                        num_param_updates,
                    )
                    loss_acc_since_last_log = 0.0
                    param_updates_since_last_log = 0

                if num_param_updates % self.log_freq == 0:
                    wrapper = get_wrapper_by_name(self.env, "Monitor")
                    episode_rewards = wrapper.get_episode_rewards()
                    mean_reward = round(np.mean(episode_rewards[-101:-1]), 2)
                    sum_reward = np.sum(episode_rewards[-101:-1])
                    episode_lengths = wrapper.get_episode_lengths()
                    mean_duration = round(np.mean(episode_lengths[-101:-1]), 2)
                    sum_duration = np.sum(episode_lengths[-101:-1])
                    self.writer.add_scalar(
                        f"Mean Reward (epoch = {self.log_freq} updates)",
                        mean_reward,
                        num_param_updates)
// self.log_freq, ) self.writer.add_scalar( f"Mean Duration (epoch = {self.log_freq} updates)", mean_duration, num_param_updates // self.log_freq, ) self.writer.add_scalar( f"Mean Reward per Timestep (epoch = {self.log_freq} updates)", round(sum_reward / sum_duration, 2), num_param_updates // self.log_freq, ) if done: num_episodes += 1 # Save model save_model(self.Q, f"{self.out_dir}/{self.model_name}.model") self.env.close() print(f"Number of Episodes: {num_episodes}") return self.Q
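The epsilon schedule in _get_epsilon above is a plain linear anneal that is held constant after eps_cliff steps. A standalone sketch with illustrative default values:

def linear_epsilon(step, eps_start=1.0, eps_end=0.1, eps_cliff=1_000_000):
    # Linear interpolation from eps_start down to eps_end over eps_cliff steps
    if step < eps_cliff:
        return eps_start - (eps_start - eps_end) * step / eps_cliff
    return eps_end

assert linear_epsilon(0) == 1.0
assert abs(linear_epsilon(500_000) - 0.55) < 1e-9
assert linear_epsilon(2_000_000) == 0.1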
def train(args, param): """ Args: param1(args): hyperparameter """ # in case seed experements args.seed = param now = datetime.now() dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") torch.manual_seed(args.seed) np.random.seed(args.seed) pathname = str(args.env_name) if args.agent == "TD3_ad": pathname += '_update_freq_' + str(args.target_update_freq) pathname += "_num_q_target_" + str(args.num_q_target) pathname += "_seed_" + str(args.seed) + "_agent_" + args.agent tensorboard_name = args.locexp + '/runs/' + pathname writer = SummaryWriter(tensorboard_name) env = gym.make(args.env_name) env.seed(args.seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) print(state_dim) if args.agent == "TD3_ad": policy = TD31v1(state_dim, action_dim, max_action, args) elif args.agent == "TD3": policy = TD3(state_dim, action_dim, max_action, args) replay_buffer = ReplayBuffer() total_timesteps = 0 timesteps_since_eval = 0 episode_num = 0 done = True t0 = time.time() scores_window = deque(maxlen=100) episode_reward = 0 evaluations = [] file_name = "%s_%s_%s" % (args.agent, args.env_name, str(args.seed)) print("---------------------------------------") print("Settings: %s" % (file_name)) print("---------------------------------------") # We start the main loop over 500,000 timesteps tb_update_counter = 0 while total_timesteps < args.max_timesteps: tb_update_counter += 1 # If the episode is done if done: episode_num += 1 #env.seed(random.randint(0, 100)) scores_window.append(episode_reward) average_mean = np.mean(scores_window) if tb_update_counter > args.tensorboard_freq: tb_update_counter = 0 writer.add_scalar('Reward', episode_reward, total_timesteps) writer.add_scalar('Reward mean ', average_mean, total_timesteps) # If we are not at the very beginning, we start the training process of the model if total_timesteps != 0: text = "Total Timesteps: {} Episode Num: {} Reward: {} Average Re: {:.2f} Time: {}".format( total_timesteps, episode_num, episode_reward, np.mean(scores_window), time_format(time.time() - t0)) print(text) write_into_file('search-' + pathname, text) # We evaluate the episode and we save the policy if timesteps_since_eval >= args.eval_freq: timesteps_since_eval %= args.eval_freq evaluations.append( evaluate_policy(policy, writer, total_timesteps, args, episode_num)) # When the training step is done, we reset the state of the environment obs = env.reset() # Set the Done to False done = False # Set rewards and episode timesteps to zero episode_reward = 0 episode_timesteps = 0 # Before 10000 timesteps, we play random actions if total_timesteps < args.start_timesteps: action = env.action_space.sample() else: # After 10000 timesteps, we switch to the model action = policy.select_action(np.array(obs)) # If the explore_noise parameter is not 0, we add noise to the action and we clip it if args.expl_noise != 0: action = (action + np.random.normal( 0, args.expl_noise, size=env.action_space.shape[0])).clip( env.action_space.low, env.action_space.high) if args.agent == "TD3_ad": if total_timesteps % args.target_update_freq == 0: policy.hardupdate() # The agent performs the action in the environment, then reaches the next state and receives the reward new_obs, reward, done, _ = env.step(action) # We check if the episode is done done_bool = 0 if episode_timesteps + 1 == 1000 else float(done) # We increase the total reward episode_reward += reward # We store the new transition into the Experience Replay memory (ReplayBuffer) 
replay_buffer.add((obs, new_obs, action, reward, done_bool)) # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy obs = new_obs episode_timesteps += 1 total_timesteps += 1 timesteps_since_eval += 1 if total_timesteps > args.start_timesteps: policy.train(replay_buffer, writer, 1) # We add the last policy evaluation to our list of evaluations and we save our model evaluations.append( evaluate_policy(policy, writer, total_timesteps, args, episode_num)) if args.save_model: policy.save("%s" % (file_name), directory="./pytorch_models") np.save("./results/%s" % (file_name), evaluations)
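The exploration step in the loop above adds Gaussian noise to the deterministic action and clips to the action bounds. The same operation in isolation (low and high stand in for env.action_space.low/high):

import numpy as np

def noisy_action(action, expl_noise, low, high):
    # Gaussian exploration noise, clipped to the valid action range
    noise = np.random.normal(0.0, expl_noise, size=action.shape)
    return np.clip(action + noise, low, high)

noisy_action(np.zeros(3), expl_noise=0.1, low=-1.0, high=1.0)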
class DDQN_Agent(object): def __init__(self,env,input_dim,n_actions,alpha,gamma,epsilon,batch_size,lr=5e-4, epsilon_dec=0.995,epsilon_end=0.05,memory_size=10000000,replace_target=5, filename='ddqn.h5'): self.env = env self.action_space= np.arange(n_actions) self.input_dim = input_dim self.n_actions = n_actions self.alpha = alpha #learning rate self.gamma=gamma #discount factor self.epsilon = epsilon #eps-greedy self.batch_size=batch_size self.epsilon_dec = epsilon_dec self.epsilon_end = epsilon_end self.filename = filename self.memory = ReplayBuffer(memory_size,input_dim) self.scores = [] # to keep track of scores self.avg_scores=[] self.replace_target = replace_target self.online_network=Neural_Network(lr,n_actions,input_dim) #network for evaluation self.target_network=Neural_Network(lr,n_actions,input_dim) #network for computing target # online and target network are the same except that parameters of target network # are copied each "replace target" steps from online network's parameters and kept # fixed on all other steps # to interface with memory def remember(self, state, action, reward, new_state, done): self.memory.store_transition(state, action, reward, new_state, done) # choose epsilon greedy action (to keep exploration) def choose_action(self, state): state = state.reshape(1,-1) rand=np.random.random() if rand<self.epsilon: action=np.random.choice(self.action_space) else: actions=self.online_network.predict(state) action= np.argmax(actions) return action def update_online(self):#update parameters of the online network #we start learning after at least batch_size sample in memory if self.memory.memory_count< self.batch_size: return states, actions, rewards, new_states, done =self.memory.sample_buffer(self.batch_size) q_target = self.online_network.predict(states) q_intermediate = self.online_network.predict(new_states) # to estimate the action in the argmax q_next = self.target_network.predict(new_states) # to estimate the q value of the estimated action argmax_actions = np.argmax(q_intermediate,axis=1) # actions that maximize q value batch_index= np.arange(self.batch_size,dtype=np.int32) q_target[batch_index,actions] = rewards + self.gamma * q_next[batch_index,argmax_actions]*(1-done) # if episode over, 1-done = 0 , Q(terminal,)=0 self.online_network.fit(states,q_target,verbose=0) self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon>self.epsilon_end else self.epsilon_end if self.memory.memory_count % self.replace_target ==0: self.update_target() def update_target(self): #update the parameters of target network from online network self.target_network.model.set_weights(self.online_network.model.get_weights()) def train(self,n_games,path): # path : path where to save the model for i in range(n_games): score=0 done = False state = self.env.reset() while not done: action = self.choose_action(state) new_state,reward,done,info= self.env.step(action) score+= reward self.remember(state, action, reward, new_state, done) state = new_state self.update_online() self.scores.append(score) avg_score = np.mean(self.scores[max(0,i-50):i+1]) # rolling score : mean self.avg_scores.append(avg_score) print('episode ',i,'score = %.2f'%score,' Rolling-score = %.2f'%avg_score) # save the model after 100 games if i%100 ==0 and i>0: self.save_model(path) def save_model(self,path): self.online_network.save(path+'/'+ self.filename) def load_model(self,path): self.online_network= load_model(path)
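update_online() above builds its Double-DQN target with fancy indexing; here is the same computation factored out, with illustrative names:

import numpy as np

def ddqn_targets(q_eval, q_online_next, q_target_next, actions, rewards, done, gamma):
    # Online net chooses the next action, target net supplies its value
    argmax_actions = np.argmax(q_online_next, axis=1)
    idx = np.arange(len(actions))
    targets = q_eval.copy()
    targets[idx, actions] = rewards + gamma * q_target_next[idx, argmax_actions] * (1 - done)
    return targets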
class MADDPG: def __init__( self, state_size, action_size, num_agents=2, actor_network_units=(64, 64), critic_network_units=(64, 64), optimizer_learning_rate_actor=1e-3, optimizer_learning_rate_critic=1e-3, optimizer_weight_decay_actor=0, optimizer_weight_decay_critic=0, noise_scale=0.1, noise_theta=0.2, noise_sigma=0.2, gamma=0.99, tau=1e-3, gradient_clip_actor=1.0, gradient_clip_critic=1.0, buffer_size=int(1e5), batch_size=128, update_every=1, device=None ): """Initializes a multi-agent training instance. :param state_size: (int) Space size for state observations per agent :param action_size: (int) Space size for actions per agent :param num_agents: (int) Number of agents used in problem :param actor_network_units: (list of ints) Network topology for actor networks :param critic_network_units: (list of ints) Network topology for critic networks :param optimizer_learning_rate_actor: (float) Learning rate for actor loss optimizer :param optimizer_learning_rate_critic: (float) Learning rate for critic loss optimizer :param optimizer_weight_decay_actor: (float) Weight decay for actor loss optimizer :param optimizer_weight_decay_critic: (float) Weight decay for critic loss optimizer :param noise_scale: (float) Scale for noise process :param noise_theta: (float) Theta parameter for noise process :param noise_sigma: (float) Sigma parameter for noise process :param gamma: (float) Discount rate for rewards :param tau: (float) Update parameter for network soft updates :param gradient_clip_actor: (float) Gradient clipping parameter for actor loss optimizer :param gradient_clip_critic: (float) Gradient clipping parameter for critic loss optimizer :param buffer_size: (int) Size of replay memory buffer :param batch_size: (int) Size of training minibatches :param update_every: (int) Number of steps between training :param device: (torch.device) Object representing the device where to allocate tensors """ if device is None: device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.device = device self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.gamma = gamma self.tau = tau self.gradient_clip_actor = gradient_clip_actor self.gradient_clip_critic = gradient_clip_critic self.update_every = update_every self.batch_size = batch_size self.t_step = 0 self.episode = 0 self.agents = [] for i in range(num_agents): self.agents.append(DDPGAgent( state_size=state_size, action_size=action_size, actor_network_units=actor_network_units, critic_network_units=critic_network_units, num_agents=num_agents, optimizer_learning_rate_actor=optimizer_learning_rate_actor, optimizer_learning_rate_critic=optimizer_learning_rate_critic, actor_weight_decay=optimizer_weight_decay_actor, critic_weight_decay=optimizer_weight_decay_critic, noise_scale=noise_scale, noise_theta=noise_theta, noise_sigma=noise_sigma, device=device )) # Replay memory self.memory = ReplayBuffer( buffer_size=buffer_size, device=device ) def step(self, state, action, reward, next_state, done): """ Store a single agent step, learning every N steps :param state: (array-like) Initial states on the visit :param action: (array-like) Actions on the visit :param reward: (array-like) Rewards received on the visit :param next_state: (array-like) States reached after the visit :param done: (array-like) Flag whether the next states are terminal states """ self.memory.add(state, action, reward, next_state, done) # Learn every self.update_every time steps self.t_step = (self.t_step + 1) % self.update_every if 
self.t_step == 0: # If enough samples are available in memory, get random batch and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample(self.batch_size) self.learn(experiences) # Keep track of episode number if np.any(done): self.episode += 1 def act(self, states, target=False, noise=1.0): """ Returns the selected actions for the given states according to the current policy :param states: (array-like) Current states :param target: (boolean, default False) Whether to use local networks or target networks :param noise: (float, default 1) Scaling parameter for noise process :return: action (array-like) List of selected actions """ if type(states) == np.ndarray: states = torch.from_numpy(states).float().to(self.device) actions = [] with torch.no_grad(): for i in range(self.num_agents): agent = self.agents[i] action = agent.act(states[i, :].view(1, -1), target=target, noise=noise) actions.append(action.squeeze()) actions = torch.stack(actions) return actions.cpu().data.numpy() def learn(self, experiences): """ Performs training for each agent based on the selected set of experiencecs :param experiences: Batch of experience tuples (s, a, r, s', d) collected from the replay buffer """ state, action, rewards, next_state, done = experiences state = state.view(-1, self.num_agents, self.state_size) action = action.view(-1, self.num_agents, self.action_size) rewards = rewards.view(-1, self.num_agents) next_state = next_state.view(-1, self.num_agents, self.state_size) done = done.view(-1, self.num_agents) # Select agent being updated based on ensemble at time of samples for agent_number in range(self.num_agents): agent = self.agents[agent_number] # Compute the critic loss target_actions = [] for i in range(self.num_agents): i_agent = self.agents[i] i_action = i_agent.act(next_state[:, i, :], target=True, noise=0.0, train=True) target_actions.append(i_action.squeeze()) target_actions = torch.stack(target_actions) target_actions = target_actions.permute(1, 0, 2).contiguous() with torch.no_grad(): flat_next_state = next_state.view(-1, self.num_agents * self.state_size) flat_target_actions = target_actions.view(-1, self.num_agents * self.action_size) Q_targets_next = agent.target_critic(flat_next_state, flat_target_actions).squeeze() Q_targets = rewards[:, agent_number] + self.gamma * Q_targets_next * (1 - done[:, agent_number]) flat_state = state.view(-1, self.num_agents * self.state_size) flat_action = action.view(-1, self.num_agents * self.action_size) Q_expected = agent.critic(flat_state, flat_action).squeeze() critic_loss = F.mse_loss(Q_targets, Q_expected) # Minimize the critic loss agent.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), self.gradient_clip_critic) agent.critic_optimizer.step() # Compute the actor loss Q_input = [] for i in range(self.num_agents): i_agent = self.agents[i] Q_input.append(i_agent.actor(state[:, i, :])) Q_input = torch.stack(Q_input) Q_input = Q_input.permute(1, 0, 2).contiguous() flat_Q_input = Q_input.view(-1, self.num_agents * self.action_size) actor_loss = -agent.critic(flat_state, flat_Q_input).mean() # Minimize the actor loss agent.actor_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), self.gradient_clip_actor) agent.actor_optimizer.step() # soft update target agent.soft_update(self.tau) def save(self, filename): """Saves the model networks to a file. 
:param filename: Filename where to save the networks """ checkpoint = {} for index, agent in enumerate(self.agents): checkpoint['actor_' + str(index)] = agent.actor.state_dict() checkpoint['target_actor_' + str(index)] = agent.target_actor.state_dict() checkpoint['critic_' + str(index)] = agent.critic.state_dict() checkpoint['target_critic_' + str(index)] = agent.target_critic.state_dict() torch.save(checkpoint, filename) def load(self, filename): """Loads the model networks from a file. :param filename: Filename from where to load the networks """ checkpoint = torch.load(filename) for i in range(self.num_agents): agent = self.agents[i] agent.actor.load_state_dict(checkpoint['actor_' + str(i)]) agent.target_actor.load_state_dict(checkpoint['target_actor_' + str(i)]) agent.critic.load_state_dict(checkpoint['critic_' + str(i)]) agent.target_critic.load_state_dict(checkpoint['target_critic_' + str(i)])
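A possible round trip for the save()/load() pair above (the sizes and file name are placeholders):

maddpg = MADDPG(state_size=24, action_size=2, num_agents=2)
maddpg.save('maddpg_checkpoint.pth')   # one dict holding all agents' networks
maddpg.load('maddpg_checkpoint.pth')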
class DQNAgent(): def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size=32, eps_min=0.1, eps_dec=1e-5, tau=1000, env_name='Doom', chkpt_dir='models/'): self.action_to_game = [ list(a) for a in itertools.product([0, 1], repeat=3) ] self.gamma = gamma self.epsilon = epsilon self.lr = lr self.n_actions = n_actions self.input_dims = input_dims self.batch_size = batch_size self.eps_min = eps_min self.eps_dec = eps_dec self.tau = tau self.env_name = env_name self.chkpt_dir = chkpt_dir self.action_space = [i for i in range(n_actions)] self.learn_step_counter = 0 self.memory = ReplayBuffer(mem_size, input_dims, n_actions) self.q_eval = DeepQNetwork(lr, n_actions, f'{env_name}_q_eval.pth', input_dims, chkpt_dir) self.q_next = DeepQNetwork(lr, n_actions, f'{env_name}_q_next.pth', input_dims, chkpt_dir).eval() def choose_action(self, observation): if np.random.random() > self.epsilon: obs = observation.unsqueeze(0).to(self.q_eval.device) action = self.q_eval.forward(obs).argmax().item() else: action = np.random.choice(self.action_space) return action def store_transition(self, state, action, reward, state_, done): self.memory.store_transition(state, action, reward, state_, done) def sample_memory(self): state, action, reward, states_, done = self.memory.sample_buffer( self.batch_size) states = torch.tensor(state).to(self.q_eval.device) rewards = torch.tensor(reward).to(self.q_eval.device) dones = torch.tensor(done).to(self.q_eval.device) actions = torch.tensor(action).to(self.q_eval.device) states_new = torch.tensor(states_).to(self.q_eval.device) return states, actions, rewards, states_new, dones def update_target_network(self): if self.learn_step_counter % self.tau == 0: self.q_next.load_state_dict(self.q_eval.state_dict()) def decrement_eps(self): self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min def save_models(self): self.q_eval.save_checkpoint() self.q_next.save_checkpoint() def load_models(self): self.q_eval.load_checkpoint() self.q_next.load_checkpoint() def learn(self): if self.batch_size > self.memory.mem_cntr: return states, actions, rewards, states_, dones = self.sample_memory() indices = np.arange(self.batch_size) q_pred = self.q_eval.forward( states )[indices, actions] # select the values only for actions the agent have taken actions 0 or 1 # q_next = self.q_eval.forward(states_).detach().max(dim=1)[0] # predict the max value of the with torch.no_grad(): q_next = self.q_next.forward(states_).detach().max( dim=1)[0] # predict the max value of the q_next[ dones] = 0.0 # for terminal states, there's no other state ahead, so reward = 0. q_target = rewards + self.gamma * q_next loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) self.q_eval.optimizer.zero_grad() loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.update_target_network( ) # decide to update or not the weights of q_next self.decrement_eps()
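action_to_game above enumerates all 3-button combinations, so a discrete action index maps to a button vector:

import itertools

action_to_game = [list(a) for a in itertools.product([0, 1], repeat=3)]
assert len(action_to_game) == 8
assert action_to_game[0] == [0, 0, 0]
assert action_to_game[5] == [1, 0, 1]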
class DQN: def __init__(self, screen_height=0, screen_width=0, n_actions=0, gamma=0.999, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, batch_size=128, device="cpu"): self.actions_count = 0 self.n_actions = n_actions # 总的动作个数 self.device = device # 设备,cpu或gpu等 self.gamma = gamma # e-greedy策略相关参数 self.epsilon = 0 self.epsilon_start = epsilon_start self.epsilon_end = epsilon_end self.epsilon_decay = epsilon_decay self.batch_size = batch_size self.policy_net = CNN(screen_height, screen_width, n_actions).to(self.device) self.target_net = CNN(screen_height, screen_width, n_actions).to(self.device) self.target_net.load_state_dict( self.policy_net.state_dict()) # target_net的初始模型参数完全复制policy_net self.target_net.eval() # 不启用 BatchNormalization 和 Dropout self.optimizer = optim.RMSprop(self.policy_net.parameters( )) # 可查parameters()与state_dict()的区别,前者require_grad=True self.loss = 0 self.memory = ReplayBuffer(memory_capacity) def select_action(self, state): '''选择动作 Args: state [array]: [description] Returns: action [array]: [description] ''' self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ math.exp(-1. * self.actions_count / self.epsilon_decay) self.actions_count += 1 if random.random() > self.epsilon: with torch.no_grad(): q_value = self.policy_net( state) # q_value比如tensor([[-0.2522, 0.3887]]) # tensor.max(1)返回每行的最大值以及对应的下标, # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].view( 1, 1) # 注意这里action是个张量,如tensor([1]) return action else: return torch.tensor([[random.randrange(self.n_actions)]], device=self.device, dtype=torch.long) def update(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = self.memory.Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=self.device, dtype=torch.bool) non_final_next_states = torch.cat( [s for s in batch.next_state if s is not None]) state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) # tensor([1., 1.,...,]) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = self.policy_net(state_batch).gather( 1, action_batch) #tensor([[ 1.1217],...,[ 0.8314]]) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. 
next_state_values = torch.zeros(self.batch_size, device=self.device) next_state_values[non_final_mask] = self.target_net( non_final_next_states).max(1)[0].detach() # Compute the expected Q values expected_state_action_values = ( next_state_values * self.gamma) + reward_batch # tensor([0.9685, 0.9683,...,]) # Compute Huber loss self.loss = F.smooth_l1_loss( state_action_values, expected_state_action_values.unsqueeze(1)) # .unsqueeze增加一个维度 # Optimize the model self.optimizer.zero_grad( ) # zero_grad clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls). self.loss.backward( ) # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation. for param in self.policy_net.parameters(): # clip防止梯度爆炸 param.grad.data.clamp_(-1, 1) self.optimizer.step( ) # causes the optimizer to take a step based on the gradients of the parameters.
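The non_final_mask trick above in isolation: next-state values default to zero, and only non-final states receive the target network's maxima (illustrative tensors):

import torch

q_next_nonfinal = torch.tensor([1.5, 0.7])            # target-net maxima for non-final states
non_final_mask = torch.tensor([True, False, True])
next_state_values = torch.zeros(3)
next_state_values[non_final_mask] = q_next_nonfinal   # -> tensor([1.5, 0.0, 0.7])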
def batch_ddpg(agent_name, multiple_agents = False, PER = False, n_episodes = 300, max_t = 1000): """ Batch-processes the states in a single forward pass with a single neural network Params ====== multiple_agents (boolean): boolean for multiple agents PER (boolean): whether to use prioritized experience replay n_episodes (int): maximum number of training episodes max_t (int): maximum number of timesteps per episode """ env, env_info, states, state_size, action_size, brain_name, num_agents = initialize_env(multiple_agents) device = get_device() scores_window = deque(maxlen=100) scores = np.zeros(num_agents) scores_episode = [] shared_memory = ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE, RANDOM_SEED) agent = AC_Agent(brain_name, agent_name, device, state_size, action_size) for i_episode in range(1, n_episodes + 1): env_info = env.reset(train_mode = True)[brain_name] states = env_info.vector_observations agent.reset() scores = np.zeros(num_agents) for t in range(max_t): actions = agent.act(states) env_info = env.step(actions)[brain_name] # send the action to the environment next_states = env_info.vector_observations # get the next state rewards = env_info.rewards # get the reward dones = env_info.local_done if multiple_agents: agent.step(states, actions, rewards, next_states, dones, shared_memory) else: agent.step(states, np.expand_dims(actions, axis=0), rewards, next_states, dones, shared_memory) if shared_memory.batch_passed(): experiences = shared_memory.sample() agent.learn(experiences, shared_memory) states = next_states scores += rewards if t % 20 == 0: # print progress every 20 timesteps print('\rTimestep {}\tScore: {:.2f}\tmin: {:.2f}\tmax: {:.2f}' .format(t, np.mean(scores), np.min(scores), np.max(scores)), end="") if np.any(dones): break score = np.mean(scores) scores_window.append(score) # save most recent score scores_episode.append(score) print('\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}\tMax Score: {:.2f}'.format(i_episode, score, np.mean(scores_window), np.max(scores)), end="\n") update_csv(agent_name, i_episode, np.mean(scores_window), np.max(scores)) agent.save_agent(agent_name) # Early stop if i_episode == 100: return scores_episode if i_episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window))) if np.mean(scores_window)>=30.0: print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window))) agent.save_agent(agent_name + "Complete") break return scores_episode
def main(): # define arguments parser = argparse.ArgumentParser() parser.add_argument("--render", action="store_true", help="Render the state") parser.add_argument("--render_interval", type=int, default=10, help="Number of rollouts to skip before rendering") parser.add_argument("--num_rollouts", type=int, default=-1, help="Number of max rollouts") parser.add_argument("--logfile", type=str, help="Indicate where to save rollout data") parser.add_argument("--load_params", type=str, help="Load previously learned parameters from [LOAD_PARAMS]") parser.add_argument("--save_params", type=str, help="Save learned parameters to [SAVE_PARAMS]") args = parser.parse_args() signal.signal(signal.SIGINT, stopsigCallback) global stopsig # create the basketball environment env = BasketballVelocityEnv(fps=60.0, timeInterval=0.1, goal=[0, 5, 0], initialLengths=np.array([0, 0, 1, 1, 0, 1, 1]), initialAngles=np.array([-5, 45, -10, -10, 0, -10, 0])) # create space stateSpace = ContinuousSpace(ranges=env.state_range()) actionRange = env.action_range() actionSpace = DiscreteSpace(intervals=[15 for i in range(5)] + [1], ranges=[actionRange[0], actionRange[1], actionRange[2], actionRange[3], actionRange[5], actionRange[7]]) processor = JointProcessor(actionSpace) # create the model and policy functions modelFn = MxFullyConnected(sizes=[stateSpace.n + actionSpace.n, 512, 256, 1], alpha=0.001, use_gpu=True) if args.load_params: print("loading params...") modelFn.load_params(args.load_params) softmax = lambda s: np.exp(s) / np.sum(np.exp(s)) policyFn = EpsilonGreedyPolicy(epsilon=0.5, getActionsFn=lambda state: actionSpace.sample(1024), distributionFn=lambda qstate: softmax(modelFn(qstate))) dataset = ReplayBuffer() if args.logfile: log = open(args.logfile, "a") rollout = 0 while args.num_rollouts == -1 or rollout < args.num_rollouts: print("Iteration:", rollout) state = env.reset() reward = 0 done = False steps = 0 while not done: if stopsig: break action = policyFn(state) nextState, reward, done, info = env.step( createAction(processor.process_env_action(action))) dataset.append(state, action, reward, nextState) state = nextState steps += 1 if args.render and rollout % args.render_interval == 0: env.render() if stopsig: break dataset.reset() # push trajectory into the dataset buffer modelFn.fit(processor.process_Q(dataset.sample(1024)), num_epochs=10) print("Reward:", reward if (reward >= 0.00001) else 0, "with Error:", modelFn.score(), "with steps:", steps) if args.logfile: log.write("[" + str(rollout) + ", " + str(reward) + ", " + str(modelFn.score()) + "]\n") rollout += 1 if rollout % 100 == 0: policyFn.epsilon *= 0.95 print("Epsilon is now:", policyFn.epsilon) if args.logfile: log.close() if args.save_params: print("saving params...") modelFn.save_params(args.save_params)
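The inline softmax above overflows for large scores, since np.exp(1000) is inf; the standard numerically stable form subtracts the max first and returns the same distribution:

import numpy as np

def softmax(s):
    z = np.exp(s - np.max(s))   # shift by max so exp never overflows
    return z / np.sum(z)

softmax(np.array([1000.0, 1001.0]))   # finite, unlike np.exp(1000)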
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters # Score tracker and learning parameters self.total_reward = 0 self.count = 0 self.score = 0 self.best_score = -np.inf self.last_state = None def reset_episode(self): #initialize the parameters self.total_reward = 0 self.count = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): self.total_reward += reward self.count += 1 # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" states = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(states)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) self.score = self.total_reward / float( self.count) if self.count else 0.0 if self.best_score < self.score: self.best_score = self.score def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
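OUNoise is constructed above but not defined in this snippet; a common Ornstein-Uhlenbeck implementation looks like the following sketch (not necessarily the author's exact version):

import numpy as np

class OUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start each episode from the long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean-reverting noise
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state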
def test_ReplayBuffer(self): mem = ReplayBuffer(2) mem.push(1) mem.push(2) [sample] = mem.sample(2) self.assertEqual(sorted(sample), [1, 2]) mem.push(3) [sample] = mem.sample(2) self.assertEqual(sorted(sample), [2, 3]) mem.push(4) [sample] = mem.sample(2) self.assertEqual(sorted(sample), [3, 4])
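One minimal buffer consistent with this test, assuming push evicts the oldest item at capacity and sample(n) wraps its result in a list (hence the [sample] unpacking):

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)   # old items fall off the left

    def push(self, item):
        self.buffer.append(item)

    def sample(self, n):
        return [random.sample(list(self.buffer), n)]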
class DQN_agent(object): def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)): self.env = env self.max_episode_steps = env._max_episode_steps self.beta = hyper_params['beta'] self.initial_epsilon = 1 self.final_epsilon = hyper_params['final_epsilon'] self.epsilon_decay_steps = hyper_params['epsilon_decay_steps'] self.episode = 0 self.steps = 0 self.best_reward = 0 self.learning = True self.action_space = action_space state = env.reset() input_len = len(state) output_len = action_space self.eval_model = DQNModel(input_len, output_len, learning_rate=hyper_params['learning_rate']) self.use_target_model = hyper_params['use_target_model'] if self.use_target_model: self.target_model = DQNModel(input_len, output_len) self.memory = ReplayBuffer(hyper_params['memory_size']) self.batch_size = hyper_params['batch_size'] self.update_steps = hyper_params['update_steps'] self.model_replace_freq = hyper_params['model_replace_freq'] # Linear decrease function for epsilon def linear_decrease(self, initial_value, final_value, curr_steps, final_decay_steps): decay_rate = curr_steps / final_decay_steps if decay_rate > 1: decay_rate = 1 return initial_value - (initial_value - final_value) * decay_rate def explore_or_exploit_policy(self, state): p = uniform(0, 1) # Get decreased epsilon epsilon = self.linear_decrease(self.initial_epsilon, self.final_epsilon, self.steps, self.epsilon_decay_steps) if p < epsilon: #return action return randint(0, self.action_space - 1) else: #return action return self.greedy_policy(state) def greedy_policy(self, state): return self.eval_model.predict(state) def update_batch(self): if len(self.memory ) < self.batch_size or self.steps % self.update_steps != 0: return # 1) Sample a 'batch_size' batch of experiences from the memory. 
batch = self.memory.sample(self.batch_size) (states, actions, reward, next_states, is_terminal) = batch states = states next_states = next_states terminal = FloatTensor([1 if t else 0 for t in is_terminal]) reward = FloatTensor(reward) batch_index = torch.arange(self.batch_size, dtype=torch.long) # Current Q Values --- 2) Predict the Q-value from the 'eval_model' based on (states, actions) _, q_values = self.eval_model.predict_batch(states) q_values = q_values[batch_index, actions] # Calculate target --- 3) Predict the Q-value from the 'target model' based on (next_states), and take max of each Q-value vector, Q_max if self.use_target_model: actions, q_next = self.target_model.predict_batch(next_states) else: actions, q_next = self.eval_model.predict_batch(next_states) q_next = q_next[batch_index, actions] q_target = FloatTensor([ reward[index] if is_terminal[index] else reward[index] + self.beta * q_next[index] for index in range(self.batch_size) ]) # update model self.eval_model.fit(q_values, q_target) def learn_and_evaluate(self, training_episodes, test_interval): test_number = training_episodes // test_interval all_results = [] for i in range(test_number): # learn self.learn(test_interval) # evaluate avg_reward = self.evaluate() all_results.append(avg_reward) return all_results def learn(self, test_interval): for episode in tqdm(range(test_interval), desc="Training"): state = self.env.reset() done = False steps = 0 while steps < self.max_episode_steps and not done: action = self.explore_or_exploit_policy(state) next_state, reward, done, _ = self.env.step(action) # Store history self.memory.add(state, action, reward, next_state, done) # Update the model if self.steps % self.update_steps == 0: self.update_batch() # Update the target network if DQN uses it if self.use_target_model: if self.steps % self.model_replace_freq == 0: self.target_model.replace(self.eval_model) # Update information for the next loop state = next_state steps += 1 self.steps += 1 def evaluate(self, trials=30): total_reward = 0 for _ in tqdm(range(trials), desc="Evaluating"): state = self.env.reset() done = False steps = 0 while steps < self.max_episode_steps and not done: steps += 1 action = self.greedy_policy(state) state, reward, done, _ = self.env.step(action) total_reward += reward avg_reward = total_reward / trials print(avg_reward) f = open(result_file, "a+") f.write(str(avg_reward) + "\n") f.close() if avg_reward >= self.best_reward: self.best_reward = avg_reward self.save_model() return avg_reward # save model def save_model(self): self.eval_model.save(result_floder + '/best_model.pt') # load model def load_model(self): self.eval_model.load(result_floder + '/best_model.pt')
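The per-index list comprehension building q_target above can be written as a single tensor expression, using the terminal mask already computed in update_batch():

def td_targets(reward, q_next, terminal, beta):
    # reward where terminal == 1, reward + beta * q_next otherwise
    return reward + beta * q_next * (1.0 - terminal)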
def main(): seeding() # number of parallel agents env = UnityEnvironment(file_name="Tennis.x86_64") env_name = 'Tennis' # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=True)[brain_name] # number of agents num_agents = len(env_info.agents) # size of each action action_size = brain.vector_action_space_size # examine the state space states = env_info.vector_observations state_size = states.shape[-1] # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 10000 episode_length = 10000 batchsize = 128 # amplitude of OU noise # this slowly decreases to 0 noise = 1 noise_reduction = 0.9999 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) # initialize memory buffer buffer = ReplayBuffer(int(500000), batchsize, 0) # initialize policy and critic maddpg = MADDPG(state_size, action_size, num_agents, seed=12345, discount_factor=0.95, tau=0.02) #how often to update the MADDPG model episode_per_update = 2 # training loop PRINT_EVERY = 5 scores_deque = deque(maxlen=100) # holds raw scores scores = [] # holds avg scores of last 100 epsiodes avg_last_100 = [] threshold = 0.5 # use keep_awake to keep workspace from disconnecting for episode in range(number_of_episodes): env_info = env.reset( train_mode=True)[brain_name] # reset the environment state = env_info.vector_observations # get the current state (for each agent) episode_reward_agent0 = 0 episode_reward_agent1 = 0 for agent in maddpg.maddpg_agent: agent.noise.reset() for episode_t in range(episode_length): actions = maddpg.act(torch.tensor(state, dtype=torch.float), noise=noise) noise *= noise_reduction actions_array = torch.stack(actions).detach().numpy() env_info = env.step(actions_array)[brain_name] next_state = env_info.vector_observations reward = env_info.rewards done = env_info.local_done episode_reward_agent0 += reward[0] episode_reward_agent1 += reward[1] # add data to buffer ''' I can either hstack or concat two states here or do it in the update function in MADDPG However I think it's easier to do it here, since in the update function I have batch_size to deal with Although the replay buffer would have to hold more data by preprocessing and creating 2 new variables that hold essentially the same info as state, and next_state, but just concatenated. ''' full_state = np.concatenate((state[0], state[1])) full_next_state = np.concatenate((next_state[0], next_state[1])) buffer.add(state, full_state, actions_array, reward, next_state, full_next_state, done) state = next_state # update once after every episode_per_update if len(buffer) > batchsize and episode % episode_per_update == 0: for i in range(num_agents): samples = buffer.sample() maddpg.update(samples, i) maddpg.update_targets( ) # soft update the target network towards the actual networks if np.any(done): #if any of the agents are done break break episode_reward = max(episode_reward_agent0, episode_reward_agent1) scores.append(episode_reward) scores_deque.append(episode_reward) avg_last_100.append(np.mean(scores_deque)) # scores.append(episode_reward) print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format( episode, avg_last_100[-1], episode_reward), end="") if episode % PRINT_EVERY == 0: print('\rEpisode {}\tAverage Score: {:.4f}'.format( episode, avg_last_100[-1])) # saving successful model #training ends when the threshold value is reached. 
if avg_last_100[-1] >= threshold: save_dict_list = [] for i in range(num_agents): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # plots graphs raw_score_plotter(scores) plotter(env_name, len(scores), avg_last_100, threshold) break
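Restoring the checkpoint written above is the mirror image of the save (a sketch; the file name is a placeholder and maddpg is assumed to be constructed as in main()):

import torch

save_dict_list = torch.load('model_dir/episode-1000.pt')
for i, agent in enumerate(maddpg.maddpg_agent):
    agent.actor.load_state_dict(save_dict_list[i]['actor_params'])
    agent.critic.load_state_dict(save_dict_list[i]['critic_params'])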
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, config, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.config = config self.device = config['device'] self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.noise_epsilon = config['NOISE_EPSILON'] # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(self.device) self.actor_target = Actor(state_size, action_size, random_seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config['LR_ACTOR']) self.hard_update(self.actor_local, self.actor_target) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(self.device) self.critic_target = Critic(state_size, action_size, random_seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config['LR_CRITIC'], weight_decay=config['WEIGHT_DECAY']) self.hard_update(self.critic_local, self.critic_target) # Noise process self.noise = OUNoise((1, action_size), random_seed, 0.0, config['OU_THETA'], config['OU_SIGMA']) self.noise_epsilon = config['NOISE_EPSILON'] # Replay memory self.memory = ReplayBuffer(action_size, self.config, random_seed) def step(self, t, state, action, reward, next_state, done, agent_index): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add(state, action, reward, next_state, done) if t % self.config['DDPG_UPDATE_EVERY'] == 0 and len( self.memory) > self.config['BATCH_SIZE']: for _ in range(self.config['DDPG_LEARN_TIMES']): experiences = self.memory.sample() self.learn(experiences, agent_index) def act(self, states): states = torch.from_numpy(states).float().to(self.device) actions = np.zeros((1, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() actions += self.noise_epsilon * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, agent_index): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ gamma = self.config['GAMMA'] states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) if agent_index == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) if agent_index == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) # ---------------------------- update noise ---------------------------- # self.noise_epsilon = max( self.noise_epsilon - self.config['NOISE_EPSILON_DECAY'], self.config['NOISE_EPSILON_MIN']) self.noise.reset() def hard_update(self, local_model, target_model): """Hard update model parameters. θ_target = θ_local Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(local_param.data) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ tau = self.config['TAU'] for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
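The agent_index branches above splice this agent's two action dimensions back into the joint action vector. The same idea for an arbitrary index, assuming each agent owns a contiguous slice of size action_size:

import torch

def replace_own_actions(joint_actions, own_actions, agent_index, action_size=2):
    # Keep the other agents' slices, substitute this agent's predicted actions
    before = joint_actions[:, :agent_index * action_size]
    after = joint_actions[:, (agent_index + 1) * action_size:]
    return torch.cat((before, own_actions, after), dim=1)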
class MADDPG: def __init__(self, num_agents, state_size, action_size, random_seed): self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.agents = [ Agent(state_size, action_size, random_seed, i) for i in range(num_agents) ] self.memory = ReplayBuffer(state_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def reset(self): for agent in self.agents: agent.reset() def act(self, states, noise_counter): actions = [] for agent, state in zip(self.agents, states): action = agent.act(state, noise_counter) actions.append(action) out = np.array(actions).reshape(1, -1) return out def step(self, states, actions, rewards, next_states, dones, t): states = states.reshape(1, -1) next_states = next_states.reshape(1, -1) # add to shared replay memory self.memory.add(states, actions, rewards, next_states, dones) if t % LEARN_EVERY == 0: if len(self.memory) >= BATCH_SIZE: # use the same for all agents e = self.memory.sample() experiences = [e for _ in range(self.num_agents)] # each agent learns (loops over each agent in self.learn()) self.learn(experiences, GAMMA) def learn(self, sample, gamma): next_actions = [] actions = [] # loop over each agent for i, agent in enumerate(self.agents): states, _, _, next_states, _ = sample[i] # get agent_id agent_id = torch.tensor([i]).to(device) # extract agent i state and get action via actor network state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1) action = agent.actor_local(state) # predict action actions.append(action) # extract agent i next state and get action via target actor network next_state = next_states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1) next_action = agent.actor_target(next_state) next_actions.append(next_action) # let each agent learn from his experiences for i, agent in enumerate(self.agents): agent.learn(sample[i], GAMMA, actions, next_actions, i)
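How learn() above pulls agent i's observation out of the flattened joint state: reshape (batch, 2*24) to (batch, agents, obs) and select along the agent axis:

import torch

states = torch.randn(5, 48)                 # batch of flattened joint states
agent_id = torch.tensor([1])
state_i = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
assert state_i.shape == (5, 24)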
class DDPG(): def __init__(self, env, action_dim, state_dim, device, critic_lr=3e-4, actor_lr=3e-4, gamma=0.99, batch_size=100, validate_steps=100, max_episode_length=150): """ param: env: A gym environment param: action_dim: Size of action space param: state_dim: Size of state space param: critic_lr: Learning rate of the critic param: actor_lr: Learning rate of the actor param: gamma: The discount factor param: batch_size: The batch size for training param: device: The device used for training param: validate_steps: Number of iterations after which we evaluate the trained policy param: max_episode_length: Maximum number of steps per episode """ self.gamma = gamma self.batch_size = batch_size self.env = env self.device = device self.eval_env = deepcopy(env) self.validate_steps = validate_steps self.max_episode_length = max_episode_length # actor and actor_target, where both networks have the same initial weights self.actor = Actor(state_dim=state_dim, action_dim=action_dim).to(self.device) self.actor_target = deepcopy(self.actor) # critic and critic_target, where both networks have the same initial weights self.critic = Critic(state_dim=state_dim, action_dim=action_dim).to(self.device) self.critic_target = deepcopy(self.critic) # Optimizers for the actor and critic self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=actor_lr) self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=critic_lr) # Replay buffer self.ReplayBuffer = ReplayBuffer(buffer_size=10000, init_length=1000, state_dim=state_dim, action_dim=action_dim, env=env, device=device) def update_target_networks(self): """ A function to update the target networks """ weighSync(self.actor_target, self.actor) weighSync(self.critic_target, self.critic) def update_network(self, batch): """ A function to update the networks just once """ # Sample and parse batch state, action, reward, state_next, done = self.ReplayBuffer.batch_sample( batch) # Predict the next action and q_value action_next = self.actor_target(state_next) q_next = self.critic_target(state_next, action_next) # (1 - done) masks out the bootstrap term at terminal states; the target is detached so no gradient flows into the target networks target_q = (reward + self.gamma * (1 - done) * q_next).detach() q = self.critic(state, action) # Critic update self.critic.zero_grad() value_loss = F.mse_loss(q, target_q) value_loss.backward() self.optimizer_critic.step() # Actor update self.actor.zero_grad() policy_loss = -self.critic(state, self.actor(state)).mean() policy_loss.backward() self.optimizer_actor.step() # Target update self.update_target_networks() return value_loss.item(), policy_loss.item() def select_action(self, state, isEval): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) action = self.actor(state).squeeze(0).detach() if isEval: return action.cpu().numpy() action += torch.normal(0, 0.1, size=action.shape).to(self.device) action = torch.clamp(action, -1., 1.).cpu().numpy() return action def train(self, num_steps): """ Train the policy for the given number of iterations :param num_steps: The number of steps to train the policy for """ value_losses, policy_losses, validation_reward, validation_steps = [],[],[],[] step, episode, episode_steps, episode_reward, state = 0, 0, 0, 0., None while step < num_steps: # reset if it is the start of an episode if state is None: state = deepcopy(self.env.reset()) action = self.select_action(state, False) # env responds with next_state, reward, done, info state_next, reward, done, _ = self.env.step(action) state_next = deepcopy(state_next) if self.max_episode_length and episode_steps >= self.max_episode_length - 1: done = True # observe and store in replay buffer self.ReplayBuffer.buffer_add(
                Exp(state=state, action=action, reward=reward,
                    state_next=state_next, done=done))

            # Update the policy based on a sampled batch
            batch = self.ReplayBuffer.buffer_sample(self.batch_size)
            value_loss, policy_loss = self.update_network(batch)
            value_losses.append(value_loss)
            policy_losses.append(policy_loss)

            # Evaluate periodically
            if step % self.validate_steps == 0:
                validate_reward, steps = self.evaluate()
                validation_reward.append(validate_reward)
                validation_steps.append(steps)
                print("[Eval {:06d}/{:06d}] Steps: {:06d}, Episode Reward: {:04f}"
                      .format(step, int(num_steps), steps, validate_reward))

            # Bookkeeping
            step += 1
            episode_steps += 1
            episode_reward += reward
            state = deepcopy(state_next)

            if done:
                # Reset at the end of the episode
                episode_steps, episode_reward, state = 0, 0., None
                episode += 1

        return value_losses, policy_losses, validation_reward, validation_steps

    def evaluate(self):
        """Evaluate the policy trained so far in a separate evaluation environment."""
        state, done, total_reward, steps = self.eval_env.reset(), False, 0., 0
        while not done:
            action = self.select_action(state, True)
            state_next, reward, done, _ = self.eval_env.step(action)
            total_reward += reward
            steps += 1
            state = state_next
        # Return the average per-step reward and the episode length
        return total_reward / steps, steps
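# update_target_networks() above calls a weighSync helper that is not defined
# in this snippet. A plausible soft-update (Polyak averaging) sketch follows;
# the signature matches the calls above, but the body and the tau default are
# assumptions, not the original implementation.
def weighSync(target_model, source_model, tau=0.001):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    for target_param, source_param in zip(target_model.parameters(),
                                          source_model.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)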
def train(sess, env, actor, critic, RESTORE):
    sess.run(tf.global_variables_initializer())

    # Initialize the random noise generator
    exploration_noise = OUNoise(env.action_space.shape[0])

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize the replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    totSteps = 0
    # Store Q-values for illustration purposes
    q_max_array = []

    actor.learning_rate = MAX_ACTOR_LEARNING_RATE
    critic.learning_rate = MAX_CRITIC_LEARNING_RATE

    for i in range(MAX_EPISODES):
        s = env.reset()
        s = normalize(s)
        ep_reward = 0
        ep_ave_max_q = 0

        # Update the learning rates with cosine annealing
        T_cur = i % LR_CYCLE
        actor.learning_rate = MIN_ACTOR_LEARNING_RATE + \
            0.5 * (MAX_ACTOR_LEARNING_RATE - MIN_ACTOR_LEARNING_RATE) * \
            (1 + np.cos(np.pi * T_cur / LR_CYCLE))
        critic.learning_rate = MIN_CRITIC_LEARNING_RATE + \
            0.5 * (MAX_CRITIC_LEARNING_RATE - MIN_CRITIC_LEARNING_RATE) * \
            (1 + np.cos(np.pi * T_cur / LR_CYCLE))

        for j in range(MAX_EP_STEPS):
            totSteps += 1

            # Begin "Experimentation and Evaluation Phase": select the next
            # experimental action by adding noise to the action prescribed by
            # the policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim, 1)))

            # Episodes 49 and 99 of every 100 are testing episodes; do not add
            # noise during them
            is_test_episode = (i % 100 == 49) or (i % 100 == 99)
            if i < EXPLORATION_SIZE and not is_test_episode:
                noise = exploration_noise.noise()
                a = a + noise

            # Constrain the action
            a = np.clip(a, -15, 15)

            # Take a step with the experimental action
            s2, r, terminal, info = env.step(
                np.reshape(a.T, newshape=(env.action_space.shape[0],)),
                CONST_THROTTLE)
            s2 = normalize(s2)

            # Add the transition to the replay buffer unless this is a testing episode
            if not is_test_episode:
                replay_buffer.add(np.reshape(s, (actor.s_dim, 1)),
                                  np.reshape(a, (actor.a_dim,)), r, terminal,
                                  np.reshape(s2, (actor.s_dim, 1)))

            # Keep adding experience to the memory until there are at least
            # minibatch-size samples
            if replay_buffer.size() > MEMORY_WARMUP:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Find the target estimate used to update the Q-function;
                # predict_target determines the Q-value of the next state
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Complete the target estimate: y_i = r(t+1) + GAMMA * Q(s(t+1), a(t+1))
                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Perform gradient descent to update the critic
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                # Perform the "Learning" phase by moving the policy parameters
                # in the direction of the deterministic policy gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update the target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            # If the episode is finished, print the results
            if terminal:
                if is_test_episode:
                    print("Testing")
                    # Export the current (non-target) actor network as a Keras model
                    kmodel = Sequential()
                    actVars = []
                    for var in tf.trainable_variables():
                        if 'non-target' in str(var):
                            actVars.append(var)
                    kmodel.add(Dense(units=l1size, activation='tanh',
                                     weights=[sess.run(actVars[0]),
                                              sess.run(actVars[1])],
                                     input_dim=actor.s_dim))
                    kmodel.add(Dense(units=l2size, activation='tanh',
                                     weights=[sess.run(actVars[2]),
                                              sess.run(actVars[3])]))
                    kmodel.add(Dense(units=1, activation='tanh',
                                     weights=[
                                         sess.run(actVars[4]),
                                         sess.run(actVars[5])]))
                    optimizer = optimizers.RMSprop(lr=0.00025, rho=0.9, epsilon=1e-06)
                    kmodel.compile(loss="mse", optimizer=optimizer)
                    kmodel.save(modelfile)
                else:
                    print("Training")

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))
                print('Finished in ' + str(j) + ' steps')
                break

    plt.plot(q_max_array)
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.show()

    # Export the final actor network as a Keras model
    kmodel = Sequential()
    actVars = []
    for var in tf.trainable_variables():
        if 'non-target' in str(var):
            actVars.append(var)
    kmodel.add(Dense(units=l1size, activation='tanh',
                     weights=[sess.run(actVars[0]), sess.run(actVars[1])],
                     input_dim=actor.s_dim))
    kmodel.add(Dense(units=l2size, activation='tanh',
                     weights=[sess.run(actVars[2]), sess.run(actVars[3])]))
    kmodel.add(Dense(units=1, activation='tanh',
                     weights=[sess.run(actVars[4]), sess.run(actVars[5])]))
    optimizer = optimizers.RMSprop(lr=0.00025, rho=0.9, epsilon=1e-06)
    kmodel.compile(loss="mse", optimizer=optimizer)
    kmodel.summary()
    kmodel.save(modelfile)
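# The train() function above depends on an OUNoise class that is not shown in
# this snippet. This is a common Ornstein-Uhlenbeck process sketch whose
# noise() method matches the call above; the mu, theta, and sigma defaults are
# assumptions, not values taken from the original code.
import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); temporally correlated noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state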