class DDPG(object): """ Deep Deterministic Policy Gradient Algorithm """ def __init__(self, env, writer=None): self.env = env self.writer = writer state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] self.max_action = env.action_space.high[0] # Randomly initialize network parameter self.actor = Actor(state_dim, action_dim).to('cuda') self.critic = Critic(state_dim, action_dim).to('cuda') # Initialize target network parameter self.target_actor = Actor(state_dim, action_dim).to('cuda') self.target_actor.load_state_dict(self.actor.state_dict()) self.target_critic = Critic(state_dim, action_dim).to('cuda') self.target_critic.load_state_dict(self.critic.state_dict()) # Replay memory self.memory = ReplayMemory(state_dim, action_dim) self.gamma = gamma self.criterion = nn.MSELoss() self.tau = tau # network parameter optimizer self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=weight_decay) def get_action(self, state, ou_noise=None, timestep=None): # When test if ou_noise is None: return self.actor(torch.from_numpy(state).to( 'cuda', torch.float)).to('cpu').detach().numpy().copy() # When train action = self.actor(torch.from_numpy(state).to('cuda', torch.float)) noise = ou_noise(timestep) return np.clip(action.to('cpu').detach().numpy().copy() + noise, -1, 1) def store_transition(self, state, action, state_, reward, done): self.memory.store_transition(state, action, state_, reward, done) def soft_update(self, target_net, net): """Target parameters soft update""" for target_param, param in zip(target_net.parameters(), net.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def update(self, time_step, batch_size=64): """Network parameter update""" if len(self.memory) < batch_size: return states, actions, states_, rewards, terminals = self.memory.sample( batch_size) # Calculate expected value with torch.no_grad(): y = rewards.unsqueeze(1) + terminals.unsqueeze(1) * self.gamma * \ self.target_critic(states_, self.target_actor(states_)) # Update Critic q = self.critic(states, actions) critic_loss = self.criterion(q, y) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() if self.writer: self.writer.add_scalar("loss/critic", critic_loss.item(), time_step) # Update Actor (Policy Gradient) actor_loss = -self.critic(states, self.actor(states)).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() if self.writer: self.writer.add_scalar("loss/actor", actor_loss.item(), time_step) # target parameter soft update self.soft_update(self.target_actor, self.actor) # update target actor network self.soft_update(self.target_critic, self.critic) # update target critic network def save_model(self, path='models/'): torch.save(self.actor.state_dict(), path + 'actor') torch.save(self.critic.state_dict(), path + 'critic') torch.save(self.target_actor.state_dict(), path + 'target_actor') torch.save(self.target_critic.state_dict(), path + 'target_critic') def load_model(self, path='models/'): self.actor.load_state_dict(torch.load(path + 'actor')) self.critic.load_state_dict(torch.load(path + 'critic')) self.target_actor.load_state_dict(torch.load(path + 'target_actor')) self.target_critic.load_state_dict(torch.load(path + 'target_critic'))
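# Illustrative sketch (not from the source repo): a minimal ReplayMemory matching the
# interface the DDPG class above assumes -- ReplayMemory(state_dim, action_dim),
# store_transition(), sample() returning CUDA tensors, and __len__(). The capacity,
# device handling and the convention that the last returned field holds 1 - done
# (so it can be multiplied straight into the bootstrap term) are assumptions.
import numpy as np
import torch

class ReplayMemory:
    def __init__(self, state_dim, action_dim, capacity=1_000_000, device='cuda'):
        self.capacity, self.device, self.ptr, self.size = capacity, device, 0, 0
        self.states = np.zeros((capacity, state_dim), dtype=np.float32)
        self.actions = np.zeros((capacity, action_dim), dtype=np.float32)
        self.next_states = np.zeros((capacity, state_dim), dtype=np.float32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.not_dones = np.zeros(capacity, dtype=np.float32)

    def store_transition(self, state, action, state_, reward, done):
        i = self.ptr
        self.states[i], self.actions[i], self.next_states[i] = state, action, state_
        self.rewards[i], self.not_dones[i] = reward, 1.0 - float(done)
        self.ptr = (self.ptr + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        to = lambda x: torch.as_tensor(x[idx], device=self.device)
        return (to(self.states), to(self.actions), to(self.next_states),
                to(self.rewards), to(self.not_dones))

    def __len__(self):
        return self.size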
class DQN_agent:
    def __init__(self, env, policy, target, n_action=18, capacity=100000, batch_size=32,
                 lr=2.5e-4, gamma=0.99, burn_in=50000, C=1000, eps_decay=1000000):
        self.env = env
        self.n_action = n_action
        self.memory = ReplayMemory(capacity)
        self.device = "cuda"
        self.policy = policy
        self.target = target
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.opt = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.burn_in = burn_in
        self.C = C
        self.eps_decay = eps_decay
        self.loss = nn.MSELoss()

    def get_state(self, obs):
        # HWC frame stack -> CHW float tensor with a batch dimension
        state = torch.FloatTensor(np.array(obs).transpose(2, 0, 1)).unsqueeze(0)
        return state

    def get_action(self, state, eps):
        # epsilon-greedy action selection
        if random.random() < eps:
            return torch.tensor([[random.randrange(self.n_action)]], dtype=torch.long)
        with torch.no_grad():
            return self.policy(state.to("cuda")).max(1)[1].view(1, 1)

    def update_policy(self):
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
        state = state.to("cuda")
        action = action.to("cuda")
        next_state = next_state.to("cuda")
        reward = reward.to("cuda")
        done = done.to("cuda")
        q = self.policy(state).gather(1, action.unsqueeze(1)).squeeze(1)
        q_max = self.target(next_state).max(1)[0].detach()
        # TD target: r + gamma * max_a' Q_target(s', a') for non-terminal transitions, r otherwise
        y = (reward + self.gamma * q_max) * (1 - done) + reward * done
        loss = self.loss(q, y)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())

    def train(self, episodes):
        steps = 0
        reward_list = []
        for episode in range(episodes):
            obs = self.env.reset()
            state = self.get_state(obs)
            reward_episode = 0
            done = False
            while not done:
                steps += 1
                # linear epsilon decay from 1.0 to 0.1 over eps_decay steps, then constant 0.1
                test_eps = int(steps > self.eps_decay)
                eps = (1 - steps * (1 - 0.1) / self.eps_decay) * (1 - test_eps) + 0.1 * test_eps
                action = self.get_action(state, eps)
                obs, reward, done, info = self.env.step(action.item())
                reward_episode += reward
                next_state = self.get_state(obs)
                reward = torch.tensor([reward], device="cpu", dtype=torch.float)
                action = torch.tensor([action.item()], device="cpu", dtype=torch.long)
                done_t = torch.tensor([int(done)], device="cpu", dtype=torch.float)
                self.memory.push(state, action, reward, next_state, done_t)
                if steps > self.burn_in:
                    self.update_policy()
                if steps > self.burn_in and steps % self.C == 0:
                    self.update_target()
                state = next_state
            reward_list.append(reward_episode)
            if episode % 100 == 0:
                print('Total steps: {} \t Episode: {}/{} \t Total reward: {}'.format(
                    steps, episode, episodes, np.mean(reward_list[-100:])))
            if episode % 500 == 0:
                print(reward_list)
        self.env.close()
        print(reward_list)
        return reward_list

    def save_model(self, name):
        torch.save(self.policy, name)

    def load_model(self, name):
        self.policy = torch.load(name)

    def test(self, n_episodes):
        test_reward = []
        for episode in range(n_episodes):
            obs = self.env.reset()
            state = self.get_state(obs)
            reward_episode = 0.0
            done = False
            while not done:
                with torch.no_grad():
                    action = self.policy(state.to("cuda")).max(1)[1].view(1, 1)
                obs, reward, done, info = self.env.step(action.item())
                reward_episode += reward
                state = self.get_state(obs)
                if done:
                    print("Finished Episode {} with reward {}".format(episode, reward_episode))
            test_reward.append(reward_episode)
        self.env.close()
        return test_reward
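# Worked example of the epsilon schedule used in DQN_agent.train() above: epsilon decays
# linearly from 1.0 to 0.1 over `eps_decay` steps and then stays at 0.1.
def epsilon_at(steps, eps_decay=1_000_000):
    test_eps = int(steps > eps_decay)
    return (1 - steps * (1 - 0.1) / eps_decay) * (1 - test_eps) + 0.1 * test_eps

assert abs(epsilon_at(0) - 1.0) < 1e-9
assert abs(epsilon_at(500_000) - 0.55) < 1e-9   # halfway through the decay
assert abs(epsilon_at(2_000_000) - 0.1) < 1e-9  # after the schedule ends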
args.device = torch.device('cpu') # Simple ISO 8601 timestamped logger def log(s): print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s) # Environment env = Env(args) env.train() action_space = env.action_space() # Agent dqn = Agent(args, env) mem = ReplayMemory(args, args.memory_capacity) priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start) # Construct validation memory val_mem = ReplayMemory(args, args.evaluation_size) T, done = 0, True while T < args.evaluation_size: if done: state, done = env.reset(), False next_state, _, done = env.step(random.randint(0, action_space - 1)) val_mem.append(state, None, None, done) state = next_state T += 1
class MADDPGAgent(Agent): def __init__(self, index, name, env, actor, critic, params): self.index = index self.name = name self.env = env self.actor = actor.to(DEVICE) self.critic = critic.to(DEVICE) self.actor_target = actor.clone().to(DEVICE) self.critic_target = critic.clone().to(DEVICE) self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=params.lr_actor) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=params.lr_critic) self.memory = ReplayMemory(params.memory_size, params.max_episode_len, self.actor.n_outputs, self.actor.n_inputs) self.mse = torch.nn.MSELoss() # params self.batch_size = params.batch_size self.tau = params.tau self.gamma = params.gamma self.clip_grads = True # flags # local obs/actions means only the obs/actions of this agent are available # if obs and actions are local this is equivalent to DDPG self.local_obs = params.local_obs self.local_actions = params.local_actions or params.local_obs # agent modeling self.use_agent_models = params.use_agent_models self.agent_models = {} self.model_optims = {} self.model_lr = params.modeling_lr self.entropy_weight = 1e-3 self.max_past = params.max_past self.modeling_train_steps = params.modeling_train_steps self.modeling_batch_size = params.modeling_batch_size self.model_class = Actor # action and observation noise self.obfuscate_others = (params.sigma_noise is not None) or (params.temp_noise is not None) self.sigma_noise = params.sigma_noise self.temp_noise = params.temp_noise def init_agent_models(self, agents): for agent in agents: if agent is self: continue agent_model = self.model_class.from_actor(agent.actor).to(DEVICE) self.agent_models[agent.index] = agent_model optim = torch.optim.Adam(agent_model.parameters(), lr=self.model_lr) self.model_optims[agent.index] = optim def update_params(self, target, source): zipped = zip(target.parameters(), source.parameters()) for target_param, source_param in zipped: updated_param = target_param.data * (1.0 - self.tau) + \ source_param.data * self.tau target_param.data.copy_(updated_param) def act(self, obs, explore=True): obs = torch.tensor(obs, dtype=torch.float, requires_grad=False).to(DEVICE) actions = self.actor.select_action(obs, explore=explore).detach() return actions.to('cpu').numpy() def experience(self, episode_count, obs, action, reward, new_obs, done): self.memory.add(episode_count, obs, action, reward, new_obs, float(done)) def train_actor(self, batch): ### forward pass ### pred_actions = self.actor.select_action(batch.observations[self.index]) actions = list(batch.actions) actions[self.index] = pred_actions q_obs = [batch.observations[self.index] ] if self.local_obs else batch.observations q_actions = [actions[self.index]] if self.local_actions else actions pred_q = self.critic(q_obs, q_actions) ### backward pass ### p_reg = torch.mean( self.actor.forward(batch.observations[self.index])**2) loss = -pred_q.mean() + 1e-3 * p_reg self.actor_optim.zero_grad() loss.backward() if self.clip_grads: torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5) self.actor_optim.step() return loss def train_critic(self, batch, agents): """Train critic with TD-target.""" ### forward pass ### # (a_1', ..., a_n') = (mu'_1(o_1'), ..., mu'_n(o_n')) self_obs = batch.next_observations[self.index] self_action = self.actor_target.select_action(self_obs).detach() if self.local_actions: pred_next_actions = [self_action] elif self.use_agent_models: pred_next_actions = [ m.select_action(batch.next_observations[idx]).detach() for idx, m in self.agent_models.items() ] 
pred_next_actions.insert(self.index, self_action) else: pred_next_actions = [ a.actor_target.select_action(o).detach() for o, a in zip(batch.next_observations, agents) ] q_next_obs = [batch.next_observations[self.index] ] if self.local_obs else batch.next_observations q_next = self.critic_target(q_next_obs, pred_next_actions) reward = batch.rewards[self.index] done = batch.dones[self.index] # if not done: y = r + gamma * Q(o_1, ..., o_n, a_1', ..., a_n') # if done: y = r q_target = reward + (1.0 - done) * self.gamma * q_next ### backward pass ### # loss(params) = mse(y, Q(o_1, ..., o_n, a_1, ..., a_n)) q_obs = [batch.observations[self.index] ] if self.local_obs else batch.observations q_actions = [batch.actions[self.index] ] if self.local_actions else batch.actions loss = self.mse(self.critic(q_obs, q_actions), q_target.detach()) self.critic_optim.zero_grad() loss.backward() if self.clip_grads: torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) self.critic_optim.step() return loss def train_models(self, batch, agents): for idx, model in self.agent_models.items(): obs = batch.observations[idx] actions = batch.actions[idx] distributions = model.prob_dists(obs) split_actions = torch.split(actions, agents[idx].actor.action_split, dim=-1) self.model_optims[idx].zero_grad() losses = torch.zeros(len(distributions)) for i, (actions, dist) in enumerate(zip(split_actions, distributions)): entropy = dist.base_dist._categorical.entropy() loss = (dist.log_prob(actions).mean() + self.entropy_weight * entropy).mean() losses[i] = loss loss = -torch.mean(losses) loss.backward() self.model_optims[idx].step() return loss def compare_models(self, agents, batch): kls = [] for idx, model in self.agent_models.items(): kls.append([]) obs = batch.observations[idx] modelled_distributions = model.prob_dists(obs) agent_distributions = agents[idx].actor.prob_dists(obs) for model_dist, agent_dist in zip(modelled_distributions, agent_distributions): kl_div = torch.distributions.kl.kl_divergence( agent_dist, model_dist).data kls[-1].append(kl_div.mean()) return zip(self.agent_models.keys(), kls) def add_noise_(self, batch): for i in range(len(batch.actions)): if i == self.index: continue # get observations and actions for agent i obs = batch.observations[i] actions = batch.actions[i] # create noise tensors, same shape and on same device if self.sigma_noise is not None: obs = obs + torch.randn_like(obs) * self.sigma_noise if self.temp_noise is not None: temp = torch.tensor(self.temp_noise, dtype=torch.float, device=actions.device) # avoid zero probs which lead to nan samples probs = actions + 1e-45 actions = RelaxedOneHotCategorical(temp, probs=probs).sample() # add noise batch.observations[i] = obs batch.actions[i] = actions def update(self, agents): # collect transistion memories form all agents memories = [a.memory for a in agents] # train model networks if self.use_agent_models: model_losses = [] for _ in range(self.modeling_train_steps): batch = self.memory.sample_transitions_from( memories, self.modeling_batch_size, max_past=self.max_past) if self.obfuscate_others: self.add_noise_(batch) model_losses.append(self.train_models(batch, agents).data) model_loss = np.mean(model_losses) model_kls = self.compare_models(agents, batch) else: model_loss = None model_kls = None # sample minibatch batch = self.memory.sample_transitions_from(memories, self.batch_size) if self.obfuscate_others: self.add_noise_(batch) # train actor and critic network actor_loss = self.train_actor(batch) critic_loss = 
self.train_critic(batch, agents) # update target network params self.update_params(self.actor_target, self.actor) self.update_params(self.critic_target, self.critic) return actor_loss, critic_loss, model_loss, model_kls def get_state(self): if self.agent_models: models = {i: m.state_dict() for i, m in self.agent_models.items()} optims = {i: o.state_dict() for i, o in self.model_optims.items()} model_pair = (models, optims) else: model_pair = None return { 'actor': self.actor.state_dict(), 'actor_target': self.actor_target.state_dict(), 'actor_optim': self.actor_optim.state_dict(), 'critic': self.critic.state_dict(), 'critic_target': self.critic_target.state_dict(), 'critic_optim': self.critic_optim.state_dict(), }, model_pair def load_state(self, state): for key, value in state['state_dicts'].items(): getattr(self, key).load_state_dict(value) if 'models' in state: models, optims = state['models'] for i, m in models.items(): self.agent_models[i].load_state_dict(m) for i, o in optims.items(): self.model_optims[i].load_state_dict(o)
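# Small standalone example of the Gumbel-softmax style perturbation used by
# MADDPGAgent.add_noise_ above: other agents' soft one-hot actions are resampled from a
# RelaxedOneHotCategorical at temperature `temp_noise`. The action values below are
# illustrative only.
import torch
from torch.distributions import RelaxedOneHotCategorical

actions = torch.tensor([[0.9, 0.05, 0.05],
                        [0.1, 0.8, 0.1]])   # pretend batch of soft actions
temp = torch.tensor(0.5)
probs = actions + 1e-45                     # avoid exactly-zero probabilities
noisy = RelaxedOneHotCategorical(temp, probs=probs).sample()
print(noisy.shape)                          # torch.Size([2, 3]); each row lies on the simplex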
args.device = torch.device('cuda') torch.cuda.manual_seed(np.random.randint(1, 10000)) torch.backends.cudnn.enabled = False else: args.device = torch.device('cpu') def log(s): print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s) env = Env(args) env.train() action_space = env.action_space() dqn = Agent(args, env) mem = ReplayMemory(args, args.memory_capacity) priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start) val_mem = ReplayMemory(args, args.evaluation_size) T, done = 0, True while T < args.evaluation_size: if done: state, done = env.reset(), False next_state, _, done = env.step(np.random.randint(0, action_space)) val_mem.append(state, None, None, done) state = next_state T += 1 if args.evaluate: dqn.eval() avg_reward, avg_Q, env = test(args, 0, dqn, val_mem, env, evaluate=True)
if np.random.random() < epsilon: action = np.random.randint(4) for i in range(self.action_repeat): reward = self.environment.act(action) total_score += reward self.environment.update_screen() return total_score sess = tf.InteractiveSession() counter = Counter(7000000) replay_memory = ReplayMemory(1000000) dqn_agent = DQNAgent((84,84,4), NATURE, 4, replay_memory, counter, tf_session=sess) agent = EpsilonAgent(dqn_agent, 4, counter) agi = AtariGameInterface('Breakout.bin', agent, replay_memory, counter) # Create a Tensorboard monitor and populate with the desired summaries tensorboard_monitor = TensorboardMonitor('./log', sess, counter) tensorboard_monitor.add_scalar_summary('score', 'per_game_summary') tensorboard_monitor.add_scalar_summary('training_loss', 'training_summary') for i in range(4): tensorboard_monitor.add_histogram_summary('Q%d_training' % i, 'training_summary') checkpoint_monitor = CheckpointRecorder(dqn_agent.dqn, replay_memory, counter, './checkpoints', sess) agi.add_listener(checkpoint_monitor) agi.add_listener(tensorboard_monitor) dqn_agent.add_listener(tensorboard_monitor)
torch.backends.cudnn.benchmark = True # Simple timestamped logger def log(s): print('[' + str(datetime.now().time()) + '] ' + s) # Environment env = Env(args) env.train() action_space = env.action_space() # Agent dqn = Agent(args, env) mem = ReplayMemory(args, args.memory_capacity) priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start) # Construct validation memory val_mem = ReplayMemory(args, args.evaluation_size) T, done = 0, True while T < args.evaluation_size - args.history_length + 1: if done: state, done = env.reset(), False val_mem.preappend() # Set up memory for beginning of episode val_mem.append(state, None, None) state, _, done = env.step(random.randint(0, action_space - 1)) T += 1 # No need to postappend on done in validation memory
class DQNAgent: def __init__(self, environment): self.env = environment self.memory = ReplayMemory(MEMORY_CAPACITY) self.dim_actions = self.env.action_space.n self.dim_states = self.env.observation_space.shape self.NN = NN(self.env.observation_space.shape, self.env.action_space.n, BATCH_SIZE, SIZE_HIDDEN, LEARNING_RATE, ACTIVATION) self.observers = [] self.episode_count = 0 self.step_count_total = 1 self.step_count_episode = 1 self.epsilon_min = EPSILON_MIN self.epsilon_max = EPSILON_MAX self.epsilon_decay = EPSILON_DECAY self.target_update = TARGET_UPDATE self.max_steps = MAX_STEPS self.n_episodes = N_EPISODES self.epsilon = EPSILON_MAX self.batch_size = BATCH_SIZE self.usetarget = False self.gamma = GAMMA self.loss = 0 self.done = False self.reward = 0 self.reward_episode = 0 self.learning_switch = False self.learning_start = LEARNING_START def notify(self, event): for observer in self.observers: observer(event) pass def act(self, state): self.step_count_total += 1 action = self.choose_action(state) return action def learn(self, obs): self.memory.store(obs) if self.learning_switch: self.backup() self.notify('step_done') pass def backup(self): self.flashback() if self.step_count_total % self.target_update == 0: print('update') print(self.epsilon) self.NN.update_target() self.usetarget = True pass def flashback(self): X, y = self._make_batch() self.loss = self.NN.train(X, y) if np.isnan(self.loss.history['loss']).any(): print('Warning, loss is {}'.format(self.loss)) pass def choose_action(self, state): if np.random.rand() <= self.epsilon: choice = self.random_choice() else: choice = self.greedy_choice(state) return choice def greedy_choice(self, state): greedy_choice = self.NN.best_action(state, usetarget=False) return greedy_choice def random_choice(self): random_choice = np.random.randint(0, self.dim_actions) return random_choice def _make_batch(self): X = [] y = [] batch = self.memory.get_batch(self.batch_size) for state, action, newstate, reward, done in batch: X.append(state) target = self.NN.predict(state, False) q_vals_new_t = self.NN.predict(newstate, self.usetarget) a_select = self.NN.best_action(newstate, False) if done: target[action] = reward else: target[action] = reward + self.gamma * q_vals_new_t[a_select] y.append(target) return X, y def add_observer(self, observer): self.observers.append(observer) pass
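# Worked example of the target construction in DQNAgent._make_batch above (Double-DQN
# style once `usetarget` is True): the online network picks the argmax action for the
# next state and the target network supplies its value. Numbers are made up.
import numpy as np

gamma = 0.99
reward = 1.0
q_online_next = np.array([0.2, 1.5, 0.7])   # online-net Q(s', .) -> argmax is action 1
q_target_next = np.array([0.3, 1.2, 0.9])   # target-net Q(s', .)
a_select = int(np.argmax(q_online_next))
td_target = reward + gamma * q_target_next[a_select]
print(a_select, round(td_target, 3))        # 1 2.188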
class Agent(): def __init__(self, action_size): self.action_size = action_size # These are hyper parameters for the DQN self.discount_factor = 0.99 self.epsilon = 1.0 self.epsilon_min = 0.01 self.explore_step = 500000 self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step self.train_start = 100000 self.update_target = 1000 # Generate the memory self.memory = ReplayMemory() # Create the policy net self.policy_net = DQN(action_size) self.policy_net.to(device) self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate) self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma) def load_policy_net(self, path): self.policy_net = torch.load(path) """Get action using policy net using epsilon-greedy policy""" def get_action(self, state): if np.random.rand() <= self.epsilon: ### CODE #### # Choose a random action return torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long) else: ### CODE #### # Choose the best action with torch.no_grad(): state = torch.FloatTensor(state).unsqueeze(0).cuda() return self.policy_net(state).max(1)[1].view(1, 1) # pick samples randomly from replay memory (with batch_size) def train_policy_net(self, frame): if self.epsilon > self.epsilon_min: self.epsilon -= self.epsilon_decay mini_batch = self.memory.sample_mini_batch(frame) mini_batch = np.array(mini_batch).transpose() history = np.stack(mini_batch[0], axis=0) states = np.float32(history[:, :4, :, :]) / 255. states = torch.from_numpy(states).cuda() actions = list(mini_batch[1]) actions = torch.LongTensor(actions).cuda() rewards = list(mini_batch[2]) rewards = torch.FloatTensor(rewards).cuda() next_states = np.float32(history[:, 1:, :, :]) / 255. next_states = torch.tensor(next_states).cuda() dones = mini_batch[3] # checks if the game is over musk = torch.tensor(list(map(int, dones==False)),dtype=torch.bool) # Compute Q(s_t, a), the Q-value of the current state ### CODE #### state_action_values = self.policy_net(states).gather(1, actions.view(batch_size,-1)) # Compute Q function of next state ### CODE #### next_state_values = torch.zeros(batch_size,device=device).cuda() non_final_mask=torch.tensor(tuple(map(lambda s: s is not None, next_states)), device=device, dtype=torch.uint8) non_final_next_states = torch.cat([i for i in next_states if i is not None]).view(states.size()).cuda() # Compute the expected Q values next_state_values[non_final_mask] = self.policy_net(non_final_next_states).max(1)[0].detach() expected_state_action_values = (next_state_values * self.discount_factor) + rewards # Compute the Huber Loss ### CODE #### loss = F.smooth_l1_loss(state_action_values.view(32), expected_state_action_values) # Optimize the model, .step() both the optimizer and the scheduler! ### CODE #### self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step()
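# Illustrative example of the frame-stack slicing used in train_policy_net above: each
# replay entry stores 5 consecutive frames; frames 0-3 form the current state and
# frames 1-4 the next state. The 84x84 frame size is only an assumed placeholder.
import numpy as np

history = np.random.randint(0, 255, size=(2, 5, 84, 84)).astype(np.uint8)  # batch of 2
states = np.float32(history[:, :4, :, :]) / 255.
next_states = np.float32(history[:, 1:, :, :]) / 255.
print(states.shape, next_states.shape)   # (2, 4, 84, 84) (2, 4, 84, 84)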
class DQN(object):
    def __init__(self, config, env, doubleDQN=False, duelingDQN=False,
                 NoisyDQN=False, N_stepDQN=False, Prioritized=False):
        self.device = config.device
        self.doubleDQN = doubleDQN
        self.duelingDQN = duelingDQN
        self.NoisyDQN = NoisyDQN
        self.N_stepDQN = N_stepDQN
        self.Prioritized = Prioritized
        self.gamma = config.gamma  # discount factor
        self.learning_rate = config.learning_rate  # learning rate
        self.replace_target_iter = config.replace_target_iter  # target network update frequency
        self.replay_size = config.replay_size  # replay buffer size
        self.batch_size = config.batch_size  # batch size
        self.priority_alpha = config.priority_alpha
        self.priority_beta_start = config.priority_beta_start
        self.priority_beta_frames = config.priority_beta_frames
        self.epsilon = config.epsilon  # initial epsilon for epsilon-greedy action selection
        self.epsilon_final = config.epsilon_final  # minimum epsilon
        self.epsilon_decay = config.epsilon_decay  # epsilon decay rate
        self.num_states = env.observation_space.shape[0]  # state space dimension
        self.num_actions = env.action_space.n  # action space dimension
        self.learn_start = self.batch_size * 3  # number of transitions collected before learning starts
        self.learn_step_counter = 0  # total number of learning steps
        self.N_step = config.N_step  # number of steps for multi-step (N-step) learning
        self.N_step_buffer = []
        if self.Prioritized:
            # initialize the prioritized replay buffer
            self.memory = PrioritizedReplayMemory(self.replay_size, self.priority_alpha,
                                                  self.priority_beta_start,
                                                  self.priority_beta_frames)
        else:
            self.memory = ReplayMemory(self.replay_size)  # initialize the replay buffer
        if self.duelingDQN:
            # initialize the evaluation (online) network
            self.eval_net = DuelingDQNNet(self.num_states, self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = DuelingDQNNet(self.num_states, self.num_actions).to(self.device)
        elif self.NoisyDQN:
            self.eval_net = NoisyNet(self.num_states, self.num_actions).to(self.device)
            self.target_net = NoisyNet(self.num_states, self.num_actions).to(self.device)
        else:
            self.eval_net = DQNNet(self.num_states, self.num_actions).to(self.device)
            self.target_net = DQNNet(self.num_states, self.num_actions).to(self.device)
        # target and evaluation networks start with identical parameters
        self.target_net.load_state_dict(self.eval_net.state_dict())
        # optimizer used for training
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.learning_rate)
        # mean squared error loss
        self.loss_func = nn.MSELoss()

    # store a transition
    def store_transition(self, state, action, reward, next_state, done):
        if self.N_stepDQN:
            # push the current experience into the N-step buffer
            self.N_step_buffer.append((state, action, reward, next_state, done))
            # if the buffer has not collected N steps yet, return
            if len(self.N_step_buffer) < self.N_step:
                return
            # compute the N-step return
            R = sum([self.N_step_buffer[i][2] * (self.gamma ** i) for i in range(self.N_step)])
            state, action, _, _, _ = self.N_step_buffer.pop(0)
            self.memory.push((state, action, R, next_state, done))
        else:
            self.memory.push((state, action, reward, next_state, done))

    # choose an action
    def choose_action(self, s):
        with torch.no_grad():
            if np.random.random(1) >= self.epsilon:
                # if the sample is >= epsilon, take the action with the largest Q value
                X = torch.tensor([s], device=self.device, dtype=torch.float)
                a = self.eval_net(X).max(1)[1].view(1, 1)  # Q values from the eval network
                return a.item()
            else:
                # otherwise take a random action
                return np.random.randint(0, self.num_actions)

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, indices, weights = self.memory.sample(self.batch_size)  # batch of transitions
        # unzip the batch, e.g. if zipped is [(1, 4), (2, 5), (3, 6)],
        # zip(*zipped) unzips it to [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)
        # convert the samples to tensors
        batch_state = torch.tensor(batch_state, device=self.device, dtype=torch.float)
        batch_action = torch.tensor(batch_action, device=self.device,
                                    dtype=torch.long).squeeze().view(-1, 1)  # reshape to a column tensor
        batch_reward = torch.tensor(batch_reward, device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state, device=self.device, dtype=torch.float)
        batch_done = torch.tensor(batch_done, device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("state:", batch_state.shape)   # 128, 4
        # print("action:", batch_action.shape)
        # print("reward:", batch_reward.shape)
        # print("done:", batch_done.shape)
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights

    # learn
    def learn(self):
        # update the target network
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        # get a batch of samples
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights = self.get_batch()
        # compute q(s, a; θ)
        if self.NoisyDQN:
            self.eval_net.sample_noise()
        q_s_a = self.eval_net(batch_state).gather(1, batch_action)
        # compute the target y_j = r_j + (1 - done) * gamma * max_a q(s', a; θ')
        with torch.no_grad():
            if self.NoisyDQN:
                self.target_net.sample_noise()
            if self.doubleDQN:
                next_max_action = self.eval_net(batch_next_state).max(dim=1)[1].view(-1, 1)
                q_target = batch_reward + (1. - batch_done) * self.gamma * \
                    self.target_net(batch_next_state).gather(1, next_max_action)
            else:
                next_q = self.target_net(batch_next_state)
                max_next_q_a = next_q.max(1)[0].view(-1, 1)
                q_target = batch_reward + (1. - batch_done) * self.gamma * max_next_q_a
        # loss and parameter update
        if self.Prioritized:
            diff = (q_target - q_s_a)
            self.memory.update_priorities(
                indices, diff.detach().squeeze().abs().cpu().numpy().tolist())
        loss = self.loss_func(q_target, q_s_a)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # increment the learning step counter
        self.learn_step_counter += 1

    # save the model
    def save(self):
        if self.duelingDQN:
            torch.save(self.eval_net, 'duelingDQN.pkl')
        elif self.NoisyDQN:
            torch.save(self.eval_net, 'NoisyDQN.pkl')
        elif self.N_stepDQN:
            torch.save(self.eval_net, 'N_stepDQN.pkl')
        elif self.Prioritized:
            torch.save(self.eval_net, 'PriorityReplayDQN.pkl')
        else:
            torch.save(self.eval_net, 'DQN.pkl')

    # load the model
    def load(self):
        if self.duelingDQN:
            self.eval_net = torch.load('duelingDQN.pkl')
        elif self.NoisyDQN:
            self.eval_net = torch.load('NoisyDQN.pkl')
        elif self.N_stepDQN:
            self.eval_net = torch.load('N_stepDQN.pkl')
        elif self.Prioritized:
            self.eval_net = torch.load('PriorityReplayDQN.pkl')
        else:
            self.eval_net = torch.load('DQN.pkl')
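# Worked example of the N-step return computed in DQN.store_transition above:
# R = r_0 + gamma * r_1 + ... + gamma^(N-1) * r_{N-1}. Reward values are illustrative.
gamma, rewards = 0.99, [1.0, 0.0, 2.0]                  # N = 3
R = sum(rewards[i] * (gamma ** i) for i in range(len(rewards)))
print(round(R, 4))                                      # 1.0 + 0.0 + 0.9801 * 2 = 2.9602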
class Agent: def __init__(self, environment, optimizer, memory_length, dueling=True, loss='mse', noisy_net=False, egreedy=False, save_memory=None, save_weights=None, verbose_action=False, ): self.environment = environment self._optimizer = optimizer self._loss = loss self.dueling = dueling self.egreedy = egreedy self.noisy_net = noisy_net # Initialize discount and exploration rate, etc self.total_steps = 0 self.gamma = 0.99 self.epsilon = 1 self.epsilon_min = 0.01 self.epsilon_decay = 0.00005 self.tau = 0.05 self.pretraining_steps = 0 # Build networks self.q_network = self._build_compile_model() self.target_network = self._build_compile_model() self.align_target_model(how='hard') self.memory = ReplayMemory(memory_length) self.save_weights_fp = save_weights self.save_memory_fp = save_memory self.start_time = datetime.datetime.now() self.verbose_action = verbose_action def load_memory(self, fp): with open(fp, 'rb') as f: self.memory.load_memory(pickle.load(f)) print(f'loading {self.memory.length} memories...') def save_memory(self, fp): if fp: with open(fp, 'wb') as f: print('saving replay memory...') pickle.dump(self.memory.get_memory(), f) def load_weights(self, weights_fp): if weights_fp: print('loading weights...') self.q_network.load_weights(weights_fp) self.align_target_model(how='hard') def save_weights(self, weights_fp): if weights_fp: self.q_network.save_weights(weights_fp) def set_epsilon_decay_schedule(self, epsilon, epsilon_min, annealed_steps): self.epsilon = epsilon self.epsilon_min = epsilon_min self.epsilon_decay = math.log(self.epsilon / self.epsilon_min) / annealed_steps def set_beta_schedule(self, beta_start, beta_max, annealed_samplings): self.memory.beta = beta_start self.memory.beta_max = beta_max self.memory.beta_increment_per_sampling = (self.memory.beta_max - self.memory.beta) / annealed_samplings def predict(self, state, use_target=False): if use_target: return self.target_network.predict(state) else: return self.q_network.predict(state) def _decay_epsilon(self): self.epsilon = self.epsilon * np.exp(-self.epsilon_decay) def store(self, state, action, reward, next_state, terminated): self.memory.add((state, action, reward, next_state, terminated)) self.total_steps += 1 if not self.egreedy: if (self.epsilon > self.epsilon_min) and (self.memory.length > self.pretraining_steps): self._decay_epsilon() def batch_store(self, batch_load): batch_load[-2][2] = -0.1 # custom reward altering for row in batch_load: self.store(*row) def _build_compile_model(self): inputs = tf.keras.layers.Input(shape=(32, 290, 4)) conv1 = tf.keras.layers.Conv2D(32, (8, 8), strides=4, padding='same', activation='relu')(inputs) conv2 = tf.keras.layers.Conv2D(64, (4, 4), strides=2, padding='same', activation='relu')(conv1) conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu')(conv2) conv3 = tf.keras.layers.Flatten()(conv3) if self.noisy_net: advt = NoisyNetDense(256, activation='relu')(conv3) final = NoisyNetDense(2)(advt) else: advt = tf.keras.layers.Dense(256, activation='relu')(conv3) final = tf.keras.layers.Dense(2)(advt) if self.dueling: if self.noisy_net: value = NoisyNetDense(256, activation='relu')(conv3) value = NoisyNetDense(1)(value) else: value = tf.keras.layers.Dense(256, activation='relu')(conv3) value = tf.keras.layers.Dense(1)(value) advt = tf.keras.layers.Lambda(lambda x: x - tf.reduce_mean(x, axis=1, keepdims=True))(final) final = tf.keras.layers.Add()([value, advt]) model = tf.keras.models.Model(inputs=inputs, outputs=final) 
model.compile(optimizer=self._optimizer, loss=self._loss, metrics=['accuracy']) return model def align_target_model(self, how): assert how in ('hard', 'soft'), '"how" must be either "hard" or "soft"' if how == 'hard': self.target_network.set_weights(self.q_network.get_weights()) elif how == 'soft': for t, e in zip(self.target_network.trainable_variables, self.q_network.trainable_variables): t.assign(t * (1 - self.tau) + (e * self.tau)) def choose_action(self, state): if not self.egreedy: if np.random.rand() <= self.epsilon: action = self.environment.action_space.sample() if self.verbose_action: print(f'action: {action}, q: random') return action q_values = self.predict(state, use_target=False) action = np.argmax(q_values[0]) if self.verbose_action: print(f'action: {action}, q: {q_values}') return action def train(self, batch, is_weights): td_errors = np.zeros(len(batch)) states = np.zeros((len(batch), 32, 290, 4)) targets = np.zeros((len(batch), 2)) for i, (state, action, reward, next_state, terminated) in enumerate(batch): target, td_error = self._get_target(state, action, reward, next_state, terminated) states[i] = state.reshape(32, 290, 4) targets[i] = target td_errors[i] = td_error self.q_network.fit(states, targets, sample_weight=is_weights, batch_size=32, epochs=1, verbose=0) self.align_target_model(how='soft') return td_errors def replay(self, batch_size, epoch_steps=None): num_batches = 1 if epoch_steps: num_batches = int(np.max([np.floor(epoch_steps / 4), 1])) bar = progressbar.ProgressBar(maxval=num_batches, widgets=[f'training - ', progressbar.widgets.Counter(), f'/{num_batches} ', progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()]) bar.start() for i in range(num_batches): leaf_idx, batch, is_weights = self.memory.get_batch(batch_size) # prioritized experience replay td_errors = self.train(batch, is_weights) self.memory.update_sum_tree(leaf_idx, td_errors) bar.update(i + 1) bar.finish() self.save_weights(self.save_weights_fp) def _get_target(self, state, action, reward, next_state, terminated): target = self.predict(state, use_target=False) prev_target = target[0][action] if terminated: target[0][action] = reward else: a = np.argmax(self.predict(next_state, use_target=False)[0]) target[0][action] = reward + (self.gamma * self.predict(next_state, use_target=True)[0][a]) # double Q Network td_error = abs(prev_target - target[0][action]) return target, td_error
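# Worked example of the dueling aggregation used in Agent._build_compile_model above:
# Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)). Numbers are illustrative.
import numpy as np

value = np.array([[0.5]])                  # state value, shape (batch, 1)
advantage = np.array([[1.0, -1.0]])        # per-action advantages, shape (batch, n_actions)
q = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q)                                   # [[ 1.5 -0.5]]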
class DDPG: def __init__(self, env, actor_model, critic_model, memory=10000, batch_size=64, gamma=0.99, tau=0.001, actor_lr=1e-4, critic_lr=1e-3, critic_decay=1e-2, ou_theta=0.15, ou_sigma=0.2, render=None, evaluate=None, save_path=None, save_every=10, render_every=10, train_per_step=True): self.env = env self.actor = actor_model self.actor_target = actor_model.clone() self.critic = critic_model self.critic_target = critic_model.clone() if use_cuda: for net in [ self.actor, self.actor_target, self.critic, self.critic_target ]: net.cuda() self.memory = ReplayMemory(memory) self.batch_size = batch_size self.gamma = gamma self.tau = tau self.random_process = OrnsteinUhlenbeckProcess( env.action_space.shape[0], theta=ou_theta, sigma=ou_sigma) self.optim_critic = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=critic_decay) self.optim_actor = optim.Adam(self.actor.parameters(), lr=actor_lr) self.render = render self.render_every = render_every self.evaluate = evaluate self.save_path = save_path self.save_every = save_every self.train_per_step = train_per_step def update(self, target, source): zipped = zip(target.parameters(), source.parameters()) for target_param, source_param in zipped: updated_param = target_param.data * (1 - self.tau) + \ source_param.data * self.tau target_param.data.copy_(updated_param) def train_models(self): if len(self.memory) < self.batch_size: return None, None mini_batch = self.memory.sample_batch(self.batch_size) critic_loss = self.train_critic(mini_batch) actor_loss = self.train_actor(mini_batch) self.update(self.actor_target, self.actor) self.update(self.critic_target, self.critic) return critic_loss.data[0], actor_loss.data[0] def mse(self, inputs, targets): return torch.mean((inputs - targets)**2) def train_critic(self, batch): # forward pass pred_actions = self.actor_target(batch.next_states) target_q = batch.rewards + batch.done * self.critic_target( [batch.next_states, pred_actions]) * self.gamma pred_q = self.critic([batch.states, batch.actions]) # backward pass loss = self.mse(pred_q, target_q) self.optim_critic.zero_grad() loss.backward(retain_graph=True) for param in self.critic.parameters(): param.grad.data.clamp_(-1, 1) self.optim_critic.step() return loss def train_actor(self, batch): # forward pass pred_mu = self.actor(batch.states) pred_q = self.critic([batch.states, pred_mu]) # backward pass loss = -pred_q.mean() self.optim_actor.zero_grad() loss.backward() # for param in self.actor.parameters(): # param.grad.data.clamp_(-1, 1) self.optim_actor.step() return loss def prep_state(self, s): return Variable(torch.from_numpy(s).float().unsqueeze(0)) def select_action(self, state, exploration=True): if use_cuda: state = state.cuda() self.actor.eval() action = self.actor(state) self.actor.train() if exploration: noise = Variable( torch.from_numpy(self.random_process.sample()).float()) if use_cuda: noise = noise.cuda() action = action + noise return action def step(self, action): next_state, reward, done, _ = self.env.step( action.data.cpu().numpy()[0]) next_state = self.prep_state(next_state) reward = FloatTensor([reward]) return next_state, reward, done def warmup(self, num_steps): overall_step = 0 while overall_step <= num_steps: done = False state = self.prep_state(self.env.reset()) self.random_process.reset() while not done: overall_step += 1 action = self.select_action(state) next_state, reward, done = self.step(action) self.memory.add(state, action, reward, next_state, done) state = next_state def train(self, num_steps): 
running_reward = None reward_sums = [] losses = [] overall_step = 0 episode_number = 0 while overall_step <= num_steps: episode_number += 1 done = False state = self.prep_state(self.env.reset()) reward_sum = 0 self.random_process.reset() while not done: overall_step += 1 action = self.select_action(state) next_state, reward, done = self.step(action) self.memory.add(state, action, reward, next_state, done) state = next_state reward_sum += reward[0] if self.train_per_step: losses.append(self.train_models()) if not self.train_per_step: losses.append(self.train_models()) render_this_episode = self.render and (episode_number % self.render_every == 0) evaluation_reward = self.run(render=render_this_episode) reward_sums.append((reward_sum, evaluation_reward)) if self.save_path is not None and (episode_number % self.save_every == 0): self.save_models(self.save_path) self.save_results(self.save_path, losses, reward_sums) running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01 print( 'episode: {} steps: {} running train reward: {:.4f} eval reward: {:.4f}' .format(episode_number, overall_step, running_reward, evaluation_reward)) if self.save_path is not None: self.save_models(self.save_path) self.save_results(self.save_path, losses, reward_sums) return reward_sums, losses def run(self, render=True): state = self.env.reset() done = False reward_sum = 0 while not done: if render: self.env.render() action = self.select_action(self.prep_state(state), exploration=False) state, reward, done, _ = self.env.step( action.data.cpu().numpy()[0]) reward_sum += reward return reward_sum def save_models(self, path): self.actor.save(path) self.critic.save(path) def save_results(self, path, losses, rewards): losses = np.array([l for l in losses if l[0] is not None]) rewards = np.array(rewards) np.savetxt(os.path.join(path, 'losses.csv'), losses, delimiter=',', header='critic,actor', comments='') np.savetxt(os.path.join(path, 'rewards.csv'), rewards, delimiter=',', header='train,evaluation', comments='')
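# Illustrative sketch (an assumption, not the repo's implementation) of the
# OrnsteinUhlenbeckProcess used for exploration noise in the DDPG class above:
# dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1).
import numpy as np

class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, sigma=0.2, mu=0.0, dt=1.0):
        self.size, self.theta, self.sigma, self.mu, self.dt = size, theta, sigma, mu, dt
        self.reset()

    def reset(self):
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x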
def __init__(self, state_size, action_size, random_seed=399, memory_size=int(1e6), batch_size=128, gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-4, weight_decay=0.0, actor_units=(256, 128), critic_units=(256, 128), action_range=None): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed memory_size (int): The total amount of memory to save experiences batch_size (int): subset size for each training step gamma (float): discount factor tau (float): interpolation parameter lr_actor (float): learning rate for actor model lr_critic (float): learning rate for critic model weight_decay (float): L2 weight decay actor_units (tuple): A tuple with numbers of nodes in 1st and 2nd hidden layer for actor network critic_units (tuple): A tuple with numbers of nodes in 1st and 2nd hidden layer for critic network action_min (int or float): The min value in the action range action_max (int or float): The max value in the action range """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.random_seed = random_seed self.lr_actor = lr_actor self.lr_critic = lr_critic self.memory_size = memory_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.weight_decay = weight_decay self.actor_units = actor_units self.critic_units = critic_units # action range if isinstance(action_range, tuple) or action_range == None: self.action_range = action_range else: raise ValueError( "action_range needs to be a tuple with two elements or None.") # Actor Network (w/ Target Network) if Agent.actor_local is None: Agent.actor_local = Actor(self.state_size, self.action_size, self.random_seed, fc1_units=self.actor_units[0], fc2_units=self.actor_units[1]).to(device) if Agent.actor_target is None: Agent.actor_target = Actor( self.state_size, self.action_size, self.random_seed, fc1_units=self.actor_units[0], fc2_units=self.actor_units[1]).to(device) if Agent.actor_optimizer is None: Agent.actor_optimizer = optim.Adam(Agent.actor_local.parameters(), lr=self.lr_actor) self.actor_local = Agent.actor_local self.actor_target = Agent.actor_target self.actor_optimizer = Agent.actor_optimizer # Critic Network (w/ Target Network) if Agent.critic_local is None: Agent.critic_local = Critic( self.state_size, self.action_size, self.random_seed, fc1_units=self.critic_units[0], fc2_units=self.critic_units[1]).to(device) if Agent.critic_target is None: Agent.critic_target = Critic( self.state_size, self.action_size, self.random_seed, fc1_units=self.critic_units[0], fc2_units=self.critic_units[1]).to(device) if Agent.critic_optimizer is None: Agent.critic_optimizer = optim.Adam( Agent.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay) self.critic_local = Agent.critic_local self.critic_target = Agent.critic_target self.critic_optimizer = Agent.critic_optimizer # Noise process self.noise = OUNoise(self.action_size, self.random_seed) # Define memory if Agent.memory is None: Agent.memory = ReplayMemory(self.memory_size, self.batch_size, self.random_seed) # Initialize time step (for updating every UPDATE_EVERY steps) self._time_step = 0
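# Usage sketch (assumption about intent): because the actor/critic networks, their
# optimizers and the replay memory live on the Agent class rather than on each instance,
# several agents constructed like this share one set of parameters and one buffer.
# The state_size/action_size values are placeholders.
agents = [Agent(state_size=33, action_size=4, random_seed=399) for _ in range(2)]
assert agents[0].actor_local is agents[1].actor_local   # same shared actor network
assert agents[0].memory is agents[1].memory             # same shared replay memory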
def worker(gpu, ngpus_per_node, args): args.gpu = gpu if args.distributed: args.seed += args.gpu torch.cuda.set_device(args.gpu) args.rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0 if args.multiprocessing_distributed: args.rank = args.rank * ngpus_per_node + args.gpu torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:8632', world_size=args.world_size, rank=args.rank) else: args.rank = 0 args.use_cuda_env = args.use_cuda_env and torch.cuda.is_available() args.no_cuda_train = not torch.cuda.is_available() args.verbose = args.verbose and (args.rank == 0) env_device = torch.device('cuda', args.gpu) if args.use_cuda_env else torch.device('cpu') train_device = torch.device('cuda', args.gpu) if (args.no_cuda_train == False) else torch.device('cpu') # Setup np.random.seed(args.seed) torch.manual_seed(np.random.randint(1, 10000)) if args.use_cuda_env or (args.no_cuda_train == False): torch.cuda.manual_seed(random.randint(1, 10000)) if train_device.type == 'cuda': print('Train:\n' + cuda_device_str(train_device.index), flush=True) if args.use_openai: test_env = create_vectorize_atari_env(args.env_name, args.seed, args.evaluation_episodes, episode_life=False, clip_rewards=False) test_env.reset() else: test_env = AtariEnv(args.env_name, args.evaluation_episodes, color_mode='gray', device='cpu', rescale=True, clip_rewards=False, episodic_life=False, repeat_prob=0.0, frameskip=4) # Agent dqn = Agent(args, test_env.action_space) # Construct validation memory if args.rank == 0: print(dqn) print('Initializing evaluation memory with {} entries...'.format(args.evaluation_size), end='', flush=True) start_time = time.time() val_mem = initialize_validation(args, train_device) if args.rank == 0: print('complete ({})'.format(format_time(time.time() - start_time)), flush=True) if args.evaluate: if args.rank == 0: eval_start_time = time.time() dqn.eval() # Set DQN (online network) to evaluation mode rewards, lengths, avg_Q = test(args, 0, dqn, val_mem, test_env, train_device) dqn.train() # Set DQN (online network) back to training mode eval_total_time = time.time() - eval_start_time rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards) lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths) print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | ' 'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | ' 'Avg. 
Q: {:4.4f} | {}' .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax, lstd, avg_Q, format_time(eval_total_time)), flush=True) else: if args.rank == 0: print('Entering main training loop', flush=True) if args.output_filename: csv_file = open(args.output_filename, 'w', newline='') csv_file.write(json.dumps(vars(args))) csv_file.write('\n') csv_writer = csv.writer(csv_file, delimiter=',') csv_writer.writerow(['frames', 'total_time', 'rmean', 'rmedian', 'rstd', 'rmin', 'rmax', 'lmean', 'lmedian', 'lstd', 'lmin', 'lmax']) else: csv_writer, csv_file = None, None if args.plot: from tensorboardX import SummaryWriter current_time = datetime.now().strftime('%b%d_%H-%M-%S') log_dir = os.path.join(args.log_dir, current_time + '_' + socket.gethostname()) writer = SummaryWriter(log_dir=log_dir) for k, v in vars(args).items(): writer.add_text(k, str(v)) # Environment print('Initializing environments...', end='', flush=True) start_time = time.time() if args.use_openai: train_env = create_vectorize_atari_env(args.env_name, args.seed, args.num_ales, episode_life=True, clip_rewards=args.reward_clip, max_frames=args.max_episode_length) observation = torch.from_numpy(train_env.reset()).squeeze(1) else: train_env = AtariEnv(args.env_name, args.num_ales, color_mode='gray', device=env_device, rescale=True, clip_rewards=args.reward_clip, episodic_life=True, repeat_prob=0.0) train_env.train() observation = train_env.reset(initial_steps=args.ale_start_steps, verbose=args.verbose).clone().squeeze(-1) if args.rank == 0: print('complete ({})'.format(format_time(time.time() - start_time)), flush=True) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) episode_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) final_rewards = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) final_lengths = torch.zeros(args.num_ales, device=train_device, dtype=torch.float32) has_completed = torch.zeros(args.num_ales, device=train_device, dtype=torch.bool) mem = ReplayMemory(args, args.memory_capacity, train_device) mem.reset(observation) priority_weight_increase = (1 - args.priority_weight) / (args.t_max - args.learn_start) state = torch.zeros((args.num_ales, args.history_length, 84, 84), device=mem.device, dtype=torch.float32) state[:, -1] = observation.to(device=mem.device, dtype=torch.float32).div(255.0) num_frames_per_iter = args.num_ales total_steps = math.ceil(args.t_max / (args.world_size * num_frames_per_iter)) epsilons = np.linspace(args.epsilon_start, args.epsilon_final, math.ceil(args.epsilon_frames / num_frames_per_iter)) epsilon_offset = math.ceil(args.learn_start / num_frames_per_iter) prefetcher = data_prefetcher(args.batch_size, train_device, mem) avg_loss = 'N/A' eval_offset = 0 target_update_offset = 0 total_time = 0 # main loop iterator = range(total_steps) if args.rank == 0: iterator = tqdm(iterator) env_stream = torch.cuda.Stream() train_stream = torch.cuda.Stream() for update in iterator: T = args.world_size * update * num_frames_per_iter epsilon = epsilons[min(update - epsilon_offset, len(epsilons) - 1)] if T >= args.learn_start else epsilons[0] start_time = time.time() if update % args.replay_frequency == 0: dqn.reset_noise() # Draw a new set of noisy weights dqn.eval() nvtx.range_push('train:select action') if args.noisy_linear: action = dqn.act(state) # Choose an action greedily (with noisy weights) else: action = dqn.act_e_greedy(state, epsilon=epsilon) 
nvtx.range_pop() dqn.train() if args.use_openai: action = action.cpu().numpy() torch.cuda.synchronize() with torch.cuda.stream(env_stream): nvtx.range_push('train:env step') observation, reward, done, info = train_env.step(action) # Step if args.use_openai: # convert back to pytorch tensors observation = torch.from_numpy(observation).squeeze(1) reward = torch.from_numpy(reward.astype(np.float32)) done = torch.from_numpy(done.astype(np.bool)) action = torch.from_numpy(action) else: observation = observation.clone().squeeze(-1) nvtx.range_pop() observation = observation.to(device=train_device) reward = reward.to(device=train_device) done = done.to(device=train_device, dtype=torch.bool) action = action.to(device=train_device) observation = observation.float().div_(255.0) not_done = 1.0 - done.float() state[:, :-1].copy_(state[:, 1:].clone()) state *= not_done.view(-1, 1, 1, 1) state[:, -1].copy_(observation) # update episodic reward counters has_completed |= done episode_rewards += reward.float() final_rewards[done] = episode_rewards[done] episode_rewards *= not_done episode_lengths += not_done final_lengths[done] = episode_lengths[done] episode_lengths *= not_done # Train and test if T >= args.learn_start: mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1) # Anneal importance sampling weight β to 1 prefetcher.preload() avg_loss = 0.0 num_minibatches = min(int(args.num_ales / args.replay_frequency), 8) for _ in range(num_minibatches): # Sample transitions nvtx.range_push('train:sample states') idxs, states, actions, returns, next_states, nonterminals, weights = prefetcher.next() nvtx.range_pop() nvtx.range_push('train:network update') loss = dqn.learn(states, actions, returns, next_states, nonterminals, weights) nvtx.range_pop() nvtx.range_push('train:update priorities') mem.update_priorities(idxs, loss) # Update priorities of sampled transitions nvtx.range_pop() avg_loss += loss.mean().item() avg_loss /= num_minibatches # Update target network if T >= target_update_offset: dqn.update_target_net() target_update_offset += args.target_update torch.cuda.current_stream().wait_stream(env_stream) torch.cuda.current_stream().wait_stream(train_stream) nvtx.range_push('train:append memory') mem.append(observation, action, reward, done) # Append transition to memory nvtx.range_pop() total_time += time.time() - start_time if args.rank == 0: if args.plot and ((update % args.replay_frequency) == 0): writer.add_scalar('train/epsilon', epsilon, T) writer.add_scalar('train/rewards', final_rewards.mean(), T) writer.add_scalar('train/lengths', final_lengths.mean(), T) if T >= eval_offset: eval_start_time = time.time() dqn.eval() # Set DQN (online network) to evaluation mode rewards, lengths, avg_Q = test(args, T, dqn, val_mem, test_env, train_device) dqn.train() # Set DQN (online network) back to training mode eval_total_time = time.time() - eval_start_time eval_offset += args.evaluation_interval rmean, rmedian, rstd, rmin, rmax = vec_stats(rewards) lmean, lmedian, lstd, lmin, lmax = vec_stats(lengths) print('reward: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | ' 'length: {:4.2f}, {:4.0f}, {:4.0f}, {:4.4f} | ' 'Avg. 
Q: {:4.4f} | {}' .format(rmean, rmin, rmax, rstd, lmean, lmin, lmax, lstd, avg_Q, format_time(eval_total_time)), flush=True) if args.output_filename and csv_writer and csv_file: csv_writer.writerow([T, total_time, rmean, rmedian, rstd, rmin, rmax, lmean, lmedian, lstd, lmin, lmax]) csv_file.flush() if args.plot: writer.add_scalar('eval/rewards', rmean, T) writer.add_scalar('eval/lengths', lmean, T) writer.add_scalar('eval/avg_Q', avg_Q, T) loss_str = '{:4.4f}'.format(avg_loss) if isinstance(avg_loss, float) else avg_loss progress_data = 'T = {:,} epsilon = {:4.2f} avg reward = {:4.2f} loss: {}' \ .format(T, epsilon, final_rewards.mean().item(), loss_str) iterator.set_postfix_str(progress_data) if args.plot and (args.rank == 0): writer.close() if args.use_openai: train_env.close() test_env.close()
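# Illustrative sketch (an assumption) of the vec_stats helper used in the evaluation code
# above; the training loop unpacks its result as (mean, median, std, min, max).
import numpy as np

def vec_stats(x):
    x = np.asarray([v.item() if hasattr(v, 'item') else v for v in x], dtype=np.float64)
    return x.mean(), np.median(x), x.std(), x.min(), x.max()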
returns_np = np.asarray(returns)
fsv = np.asarray(first_state_values)
lsv = np.asarray(last_state_values)
plt.title('Training...Returns')
plt.xlabel('Frames')
plt.ylabel('Return')
plt.plot(ep_durations, returns)
plt.plot(ep_durations, fsv, 'C1')
plt.plot(ep_durations, lsv, 'C2')
plt.pause(0.002)

# init agent, memory and environment
agent = Agent(N_ACTIONS, EPS_START, EPS_END, EPS_STEPS, GAMMA, TRAIN, use_cuda, BATCH_SIZE, 'CP')
memory = ReplayMemory(RM_CAPACITY)
env = gym.make(ENV)
ep_durations = [0]  # used for plotting
returns = [0]
last_state_values = [0]
first_state_values = [0]

# fill the replay memory with random-action transitions before training
for i_episode in range(INIT_RM):
    if not TRAIN:
        break
    cur_state = env.reset()
    while True:
        action = agent.take_action(FloatTensor([cur_state]))
        next_state, reward, done, _ = env.step(env.action_space.sample())
class MADDPG: def __init__(self, n_agents, dim_obs, dim_act, batch_size, capacity, episodes_before_train, load_models=None): # self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)] # self.critics = [Critic(n_agents, dim_obs, dim_act) for i in range(n_agents)] if load_models is None: self.models = Models(n_agents, dim_obs, dim_act) self.actors_target = deepcopy(self.models.actors) self.critics_target = deepcopy(self.models.critics) self.critic_optimizer = [Adam(x.parameters(), lr=0.0001) for x in self.models.critics] # 0.001 self.actor_optimizer = [Adam(x.parameters(), lr=0.00001) for x in self.models.actors] # 0.0001 self.memory = ReplayMemory(capacity) self.var = [1.0 for i in range(n_agents)] else: print('Start loading models!') states = th.load(load_models) self.models = states['models'] self.critic_optimizer = states['critic_optimizer'] self.actor_optimizer = states['actor_optimizer'] self.critics_target = states['critics_target'] self.actors_target = states['actors_target'] self.memory = states['memory'] self.var = states['var'] print('Models loaded!') self.n_agents = n_agents self.n_states = dim_obs self.n_actions = dim_act self.batch_size = batch_size self.use_cuda = th.cuda.is_available() self.episodes_before_train = episodes_before_train self.GAMMA = 0.95 self.tau = 0.01 if self.use_cuda: for x in self.models.actors: x.cuda() for x in self.models.critics: x.cuda() for x in self.actors_target: x.cuda() for x in self.critics_target: x.cuda() self.steps_done = 0 self.episode_done = 0 def update_policy(self): # do not train until exploration is enough if self.episode_done <= self.episodes_before_train: return None, None ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor c_loss = [] a_loss = [] critics_grad = [] actors_grad = [] for agent in range(self.n_agents): transitions = self.memory.sample(self.batch_size) batch = Experience(*zip(*transitions)) non_final_mask = ByteTensor(list(map(lambda s: s is not None, batch.next_states))) # state_batch: batch_size x n_agents x dim_obs state_batch = Variable(th.stack(batch.states).type(FloatTensor)) action_batch = Variable(th.stack(batch.actions).type(FloatTensor)) reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor)) # : (batch_size_non_final) x n_agents x dim_obs non_final_next_states = Variable(th.stack( [s for s in batch.next_states if s is not None]).type(FloatTensor)) # for current agent whole_state = state_batch.view(self.batch_size, -1) whole_action = action_batch.view(self.batch_size, -1) # critic network self.critic_optimizer[agent].zero_grad() current_Q = self.models.critics[agent](whole_state, whole_action) # forward? 
non_final_next_actions = [ self.actors_target[i](non_final_next_states[:, i, :]) for i in range(self.n_agents)] non_final_next_actions = th.stack(non_final_next_actions) # non_final_next_actions = Variable(non_final_next_actions) non_final_next_actions = ( non_final_next_actions.transpose(0, 1).contiguous()) target_Q = Variable(th.zeros(self.batch_size).type(FloatTensor)) target_Q[non_final_mask] = self.critics_target[agent]( non_final_next_states.view(-1, self.n_agents * self.n_states), non_final_next_actions.view(-1, self.n_agents * self.n_actions)) # scale_reward: to scale reward in Q functions target_Q = (target_Q * self.GAMMA) + (reward_batch[:, agent] * scale_reward) loss_Q = nn.MSELoss()(current_Q, target_Q.detach()) loss_Q.backward() self.critic_optimizer[agent].step() # actor network self.actor_optimizer[agent].zero_grad() state_i = state_batch[:, agent, :] action_i = self.models.actors[agent](state_i) # forward ac = action_batch.clone() ac[:, agent, :] = action_i whole_action = ac.view(self.batch_size, -1) actor_loss = -self.models.critics[agent](whole_state, whole_action) # forward actor_loss = actor_loss.mean() actor_loss.backward() self.actor_optimizer[agent].step() c_loss.append(loss_Q) a_loss.append(actor_loss) # for test ''' s = 0 for x in self.models.critics[agent].parameters(): s += 1 print('s: ', s) print(type(x)) print('x.grad.shape: ', x.grad.size()) print('x.data.shape: ', x.data.size()) ''' critics_agent_grad = [] actors_agent_grad = [] for x in self.models.critics[agent].parameters(): critics_agent_grad.append(x.grad.data.norm(2)) # critics_agent_grad.append(th.mean(x.grad).data[0]) for x in self.models.actors[agent].parameters(): actors_agent_grad.append(x.grad.data.norm(2)) # actors_agent_grad.append(th.mean(x.grad).data[0]) critics_grad.append(critics_agent_grad) actors_grad.append(actors_agent_grad) if self.steps_done % 100 == 0 and self.steps_done > 0: for i in range(self.n_agents): soft_update(self.critics_target[i], self.models.critics[i], self.tau) soft_update(self.actors_target[i], self.models.actors[i], self.tau) ''' # gradient clipping if self.clip is not None: nn.utils.clip_grad_norm(self.model.parameters(), self.clip) ''' # return c_loss, a_loss #, critics_grad, actors_grad return critics_grad, actors_grad def select_action(self, state_batch): # state_batch: n_agents x state_dim actions = Variable(th.zeros( self.n_agents, self.n_actions)) FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor for i in range(self.n_agents): sb = state_batch[i, :].detach() act = self.models.actors[i](sb.unsqueeze(0)).squeeze() act += Variable(th.from_numpy(np.random.randn(2) * self.var[i]).type(FloatTensor)) if self.episode_done > self.episodes_before_train and self.var[i] > 0.05: # and self.episode_done % 100 == 0 self.var[i] *= 0.999998 act = th.clamp(act, -1.0, 1.0) actions[i, :] = act self.steps_done += 1 # print('steps_done: ', self.steps_done) # print('episode_done: ', self.episode_done) return actions
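# Usage sketch for the MADDPG class above. The multi-agent environment `world`,
# its reset/step API, and the ReplayMemory.push signature are placeholders and
# assumptions for illustration only; the real training script is not part of
# this excerpt.
import torch as th

maddpg = MADDPG(n_agents=2, dim_obs=16, dim_act=2, batch_size=256,
                capacity=100000, episodes_before_train=100)
for episode in range(1000):
    obs = th.from_numpy(world.reset()).float()            # n_agents x dim_obs (assumed API)
    for t in range(200):                                   # episode length: illustrative
        actions = maddpg.select_action(obs)
        next_obs, rewards, done, _ = world.step(actions.detach().numpy())
        next_obs = th.from_numpy(next_obs).float()
        # push signature assumed to mirror the Experience fields used in update_policy
        maddpg.memory.push(obs, actions,
                           None if done else next_obs,
                           th.tensor(rewards, dtype=th.float))
        obs = next_obs
        maddpg.update_policy()
        if done:
            break
    maddpg.episode_done += 1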
class Agent: def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, combined=False, max_mem_size=100000, eps_end=0.05, eps_dec=5e-4): self.gamma = gamma self.epsilon = epsilon self.eps_min = eps_end self.eps_dec = eps_dec self.lr = lr self.action_space = [i for i in range(n_actions)] self.batch_size = batch_size self.memory = ReplayMemory(input_dims, max_mem_size, batch_size, combined) self.iter_cntr = 0 self.replace_target = 100 self.Q_eval = DeepQNetwork(lr, n_actions=n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256) self.Q_next = DeepQNetwork(lr, n_actions=n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256) def choose_action(self, observation): if np.random.random() > self.epsilon: state = T.tensor([observation]).to(self.Q_eval.device) actions = self.Q_eval.forward(state) action = T.argmax(actions).item() else: action = np.random.choice(self.action_space) return action def learn(self): if not self.memory.is_sufficient(): return self.Q_eval.optimizer.zero_grad() batch_index = np.arange(self.batch_size, dtype=np.int32) states, actions, rewards, new_states, dones = \ self.memory.sample_memory() states = T.tensor(states).to(self.Q_eval.device) new_states = T.tensor(new_states).to(self.Q_eval.device) rewards = T.tensor(rewards).to(self.Q_eval.device) dones = T.tensor(dones).to(self.Q_eval.device) q_eval = self.Q_eval.forward(states)[batch_index, actions] q_next = self.Q_eval.forward(new_states) q_next[dones] = 0.0 q_target = rewards + self.gamma * T.max(q_next, dim=1)[0] loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device) loss.backward() self.Q_eval.optimizer.step() self.iter_cntr += 1 self.epsilon = self.epsilon - self.eps_dec \ if self.epsilon > self.eps_min else self.eps_min if self.iter_cntr % self.replace_target == 0: self.Q_next.load_state_dict(self.Q_eval.state_dict())
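# Usage sketch for the Agent above, assuming a gym environment and that the
# ReplayMemory used here exposes a store_transition(state, action, reward,
# next_state, done) method (that name is an assumption; adapt it to the actual
# ReplayMemory API defined alongside this class). Hyperparameters are
# illustrative only.
import gym

env = gym.make('CartPole-v1')
agent = Agent(gamma=0.99, epsilon=1.0, lr=1e-3,
              input_dims=[env.observation_space.shape[0]],  # shape convention assumed
              batch_size=64, n_actions=env.action_space.n, combined=True)
for episode in range(500):
    observation, done, score = env.reset(), False, 0.0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, _ = env.step(action)
        agent.memory.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_
        score += reward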
def main(): args = get_args() args.critic_layers = literal_eval(args.critic_layers) args.actor_layers = literal_eval(args.actor_layers) # create save directory save_dir = os.path.join('weights', args.exp_name) if not os.path.exists(save_dir): os.makedirs(save_dir) else: shutil.move(save_dir, save_dir + '.backup') os.makedirs(save_dir) state_transform = NormState(args.prosthetic) # state_transform = StateVelCentr(obstacles_mode='standard', # exclude_centr=True, # vel_states=[]) env = RunEnv2(state_transform, integrator_accuracy=args.accuracy, model=args.modeldim, prosthetic=args.prosthetic, difficulty=args.difficulty, skip_frame=1) env.change_model(args.modeldim, args.prosthetic, args.difficulty) num_actions = env.get_action_space_size() del env print('building model') # build model model_params = { 'state_size': state_transform.state_size, 'num_act': num_actions, 'gamma': args.gamma, 'actor_layers': args.actor_layers, 'critic_layers': args.critic_layers, 'actor_lr': args.actor_lr, 'critic_lr': args.critic_lr, 'layer_norm': args.layer_norm } train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \ build_model(**model_params) actor = Agent(actor_fn, params_actor, params_crit) if args.weights is not None: actor.load(args.weights) actor_lr_step = (args.actor_lr - args.actor_lr_end) / args.max_steps critic_lr_step = (args.critic_lr - args.critic_lr_end) / args.max_steps # build actor weights = [p.get_value() for p in params_actor] # build replay memory memory = ReplayMemory(state_transform.state_size, num_actions, 5000000) # init shared variables global_step = Value('i', 0) updates = Value('i', 0) best_reward = Value('f', -1e8) testing = Value('i', 0) # init agents data_queue = Queue() workers = [] weights_queues = [] if not args.test: num_agents = args.n_threads - 2 print('starting {} agents'.format(num_agents)) else: num_agents = 1 print('starting testing agent') for i in range(num_agents): w_queue = Queue() worker = Process(target=run_agent, args=(args, model_params, weights, state_transform, data_queue, w_queue, i, global_step, updates, best_reward, args.param_noise_prob, save_dir, args.max_steps)) worker.daemon = True worker.start() sleep(args.sleep) workers.append(worker) weights_queues.append(w_queue) if not args.test: print('starting training') else: print('starting testing') prev_steps = 0 start_save = time() start_test = time() weights_rew_to_check = [] while global_step.value < args.max_steps: # get all data try: i, batch, weights_check, reward = data_queue.get_nowait() if weights_check is not None: weights_rew_to_check.append((weights_check, reward)) weights_queues[i].put(weights) # add data to memory memory.add_samples(*batch) except queue.Empty: pass # training step # TODO: consider not training during testing model if not args.test: if len(memory) > args.start_train_steps: batch = memory.random_batch(args.batch_size) if np.random.rand() < args.flip_prob: states, actions, rewards, terminals, next_states = batch states_flip = state_transform.flip_states(states) next_states_flip = state_transform.flip_states(next_states) actions_flip = np.zeros_like(actions) actions_flip[:, :num_actions // 2] = actions[:, num_actions // 2:] actions_flip[:, num_actions // 2:] = actions[:, :num_actions // 2] states_all = np.concatenate((states, states_flip)) actions_all = np.concatenate((actions, actions_flip)) rewards_all = np.tile(rewards.ravel(), 2).reshape(-1, 1) terminals_all = np.tile(terminals.ravel(), 2).reshape(-1, 1) next_states_all = np.concatenate( 
(next_states, next_states_flip)) batch = (states_all, actions_all, rewards_all, terminals_all, next_states_all) actor_loss, critic_loss = train_fn(*batch) updates.value += 1 if np.isnan(actor_loss): raise ValueError('actor loss is nan') if np.isnan(critic_loss): raise ValueError('critic loss is nan') target_update_fn() weights = actor.get_actor_weights() delta_steps = global_step.value - prev_steps prev_steps += delta_steps actor_lr.set_value( lasagne.utils.floatX( max(actor_lr.get_value() - delta_steps * actor_lr_step, args.actor_lr_end))) critic_lr.set_value( lasagne.utils.floatX( max(critic_lr.get_value() - delta_steps * critic_lr_step, args.critic_lr_end))) # check whether we need to save and test if (time() - start_save) / 60. > args.save_period_min: fname = os.path.join( save_dir, 'weights_updates_{}.pkl'.format(updates.value)) actor.save(fname) start_save = time() # start new test process weights_rew_to_check = [(w, r) for w, r in weights_rew_to_check if r > best_reward.value and r > 0] weights_rew_to_check = sorted(weights_rew_to_check, key=lambda x: x[1]) if ((time() - start_test) / 60. > args.test_period_min or len(weights_rew_to_check) > 0) and testing.value == 0: testing.value = 1 print('start test') if len(weights_rew_to_check) > 0: _weights, _ = weights_rew_to_check.pop() else: _weights = weights worker = Process(target=test_agent, args=(args, testing, state_transform, args.num_test_episodes, model_params, _weights, best_reward, updates, global_step, save_dir)) worker.daemon = True worker.start() start_test = time() print('training finished') # end all processes for w in workers: w.join()
class Agent: def __init__(self, env, exploration_rate=1, exploration_decay=0.9999, explore=True): self.action_space = env.action_space.n self.memory = ReplayMemory(MEMORY_SIZE) self.memory.fill_memory(env) self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") print(self.device) self.dqn = DQN(4, self.action_space).float().to(self.device) self.env = env self.episode_rewards = [] self.exploration_rate = exploration_rate self.exploration_decay = exploration_decay self.explore = explore self.model_optim = optim.Adam(self.dqn.parameters(), lr=1e-4) self.episodes = 0 def get_action(self, obs): if self.exploration_rate > random.random() and self.explore: action = random.randint(0, self.action_space - 1) else: obs = torch.tensor(obs, device=self.device).reshape(1, 4, 80, 80).float() action = self.dqn(obs).argmax().tolist() return action def train(self, num_episodes): num_steps = 0 running_loss = 0 loss = nn.MSELoss() episode_rewards = [] for episode in tqdm(range(num_episodes)): obs = rgb2gray(self.env.reset()).reshape(1, 80, 80) for i in range(3): obs = np.append(obs, rgb2gray(self.env.step(0)[0]), 0) terminal = False episode_reward = 0 while not terminal: action = self.get_action(obs) result = self.env.step(action) terminal = result[2] new_obs = np.append(obs[1:], rgb2gray(result[0]), 0) reward = result[1] if reward > 0: print(episode, reward) episode_reward += reward self.memory.push(obs, action, new_obs, reward, terminal) batch = self.memory.sample(BATCH_SIZE) observations, y = self.process_batch(batch) num_steps += 1 outputs = self.dqn(observations) episode_loss = loss(outputs, y) self.model_optim.zero_grad() episode_loss.backward() self.model_optim.step() running_loss += episode_loss.item() if num_steps % 1000 == 0: # print every 2000 mini-batches print(num_steps) episode_rewards.append(episode_reward) if self.exploration_rate > 0.1: self.exploration_rate *= self.exploration_decay self.episodes += num_episodes self.save(str(self.episodes) + '_model') self.episode_rewards += episode_rewards np.save(str(self.episodes) + '_rewards', self.episode_rewards) return episode_rewards def process_batch(self, batch): observations = [batch[i][0] for i in range(len(batch))] observations = torch.tensor(np.array(observations)).reshape( (BATCH_SIZE, 4, 80, 80)).float().to(self.device) next_observations = [batch[i][2] for i in range(len(batch))] next_observations = torch.tensor(np.array(next_observations)).reshape( (BATCH_SIZE, 4, 80, 80)).float().to(self.device) maxs = self.dqn(next_observations) maxs = maxs.max(1).values.float().to(self.device) rewards = [batch[i][3] for i in range(len(batch))] rewards = torch.tensor(rewards).float().to(self.device) terminals = [~batch[i][4] for i in range(len(batch))] terminals = torch.tensor(terminals).float().to(self.device) maxs = -maxs * terminals y = self.dqn(observations) Qs = rewards + GAMMA * maxs for i in range(len(batch)): y[i, batch[i][1]] = Qs[i] return observations, y def load_dqn(self, path): self.dqn = torch.load(path) def save(self, path): torch.save(self.dqn, path)
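# Note on process_batch above: `terminals = [~batch[i][4] ...]` bitwise-negates
# the stored terminal flag and the bootstrap term is then formed as
# `maxs = -maxs * terminals`, so the sign of the next-state value depends on how
# the flag happens to be stored (Python bool vs. numpy bool). A sketch of the
# more conventional target, reusing the tensors defined in that method (this is
# an assumption about the intended behaviour, not the author's code):
#
#   not_done = torch.tensor([0.0 if batch[i][4] else 1.0 for i in range(len(batch))],
#                           device=self.device)
#   Qs = rewards + GAMMA * maxs * not_done   # maxs: raw max Q over next observations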
############################ Memory ###################### Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward')) ########################################################## ################## Declare networks ###################### screen_height, screen_width = 84, 96 policy_net = DQN(screen_height, screen_width, nb_actions).to(device) # policy_net.load_state_dict(torch.load('./models/model')) target_net = DQN(screen_height, screen_width, nb_actions).to(device) target_net.load_state_dict(policy_net.state_dict()) target_net.eval() optimizer = optim.RMSprop(policy_net.parameters()) memory = ReplayMemory(size_memory) ########################################################## ################### Select Action ######################### def select_action(state): global steps_done sample = random.random() # Between 0 and 1 eps_threshold = EPS_END + (EPS_START - EPS_END) * \ math.exp(-1. * steps_done / EPS_DECAY) steps_done += 1 if sample > eps_threshold: # Action determined by the NN with torch.no_grad(): # t.max(1) will return largest column value of each row. # second column on max result is index of where max element was found, so we pick the action with the larger expected reward. return policy_net(state).max(1)[1].view(1, 1) else: # Explore: sample a random action return torch.tensor([[random.randrange(nb_actions)]], device=device, dtype=torch.long)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") dqn_online = DQN(N_ACTIONS, STATE_SHAPE) dqn_target = DQN(N_ACTIONS, STATE_SHAPE) dqn_online.to(device) dqn_target.to(device) # optimizer = torch.optim.RMSprop(dqn_online.parameters(), lr=LR, momentum=0.95, eps=0.01) # paper used rmsprop optimizer = torch.optim.Adam(dqn_online.parameters(), lr=LR) if CKPT_ENABLED and os.path.exists(CKPT_FILENAME): progress = load_checkpoint(dqn_online, dqn_target, optimizer, CKPT_FILENAME) else: progress = [] dqn_target.eval() mem_buffer = ReplayMemory(MEMORY_SIZE, STATE_SHAPE) loss_fn = torch.nn.SmoothL1Loss() # huber loss function agent = DQNAgent(device, mem_buffer, dqn_online, dqn_target, optimizer, loss_fn, GAMMA, BATCH_SIZE, UPDATE_ONLINE_INTERVAL, UPDATE_TARGET_INTERVAL) # training phase # adjust these hyperparameters as necessary num_episodes = 5000 # number of episodes to train for explore_phase_length = 50000 # number of steps without any exploitation (paper used 50k) epsilon = 1.0 # initial epsilon value (paper used 1.0) epsilon_decrement_steps = 1000000 # how many steps to decrement epsilon to min value (paper used 1 million) intermediate_epsilon = 0.1 # can be used to decay epsilon in two phases as recommended by openai (set equal to min_epsilon to disable) min_epsilon = 0.01 # smallest possible value of epsilon (paper used 0.1 for dqn, 0.01 for ddqn)
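# The comments above describe a two-phase epsilon schedule (pure exploration,
# then a linear anneal to intermediate_epsilon, then a slower anneal to
# min_epsilon). The training loop that applies it is not shown in this excerpt,
# so the helper below is only a sketch of how those knobs could be combined;
# the length of the second phase is an arbitrary assumption.
def epsilon_at(step):
    if step < explore_phase_length:
        return epsilon  # hold at the initial value (1.0 above) during the explore phase
    step -= explore_phase_length
    if step < epsilon_decrement_steps:
        # first phase: linear anneal from 1.0 down to intermediate_epsilon
        frac = step / epsilon_decrement_steps
        return 1.0 + frac * (intermediate_epsilon - 1.0)
    # second, slower phase: anneal from intermediate_epsilon down to min_epsilon
    frac = min((step - epsilon_decrement_steps) / epsilon_decrement_steps, 1.0)
    return intermediate_epsilon + frac * (min_epsilon - intermediate_epsilon)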
HEIGHT = 84 WIDTH = 84 TEST_EPISODES = 10 MODEL_PATH = 'dqn_model_scale.pt' # create environment # See wrappers.py env = create_atari_env("Breakout-v0", episode_life=False, frame_stack=True, scale=True, clip_rewards=False) epsilon = EPS_START steps_done = 0 # initialize replay memory memory = ReplayMemory(MEMORY_SIZE) # create networks action_num = env.action_space.n policy_net = DQN(HEIGHT, WIDTH, action_num).to(device) target_net = DQN(HEIGHT, WIDTH, action_num).to(device) target_net.load_state_dict(policy_net.state_dict()) target_net.eval() print(policy_net) # setup optimizer # optimizer = optim.RMSprop(policy_net.parameters()) optimizer = optim.Adam(policy_net.parameters(), lr=lr) # train model train(env, NUM_EPISODES)
def main(args): ''' Compares 3 different agents. Args: args: command line arguments ''' env = UnityEnvironment(file_name="Banana_Linux/Banana.x86_64", no_graphics=True) brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset()[brain_name] # number of agents in the environment print('Number of agents:', len(env_info.agents)) # number of actions action_size = brain.vector_action_space_size print('Number of actions:', action_size) # examine the state space state = env_info.vector_observations[0] print('States look like:', state) state_size = len(state) print('States have length:', state_size) agent = Duelling_DDQNAgent(args, state_size=state_size, action_size=action_size) mem = ReplayMemory(args, args.evaluation_size) scores = dqn(agent, env, brain_name, mem, args, n_episodes=args.n_episodes, eps_decay=args.eps_decay) save_and_plot(scores, args, 1) mem = ReplayMemory(args, args.evaluation_size) agent = Double_DQNAgent(args, state_size=state_size, action_size=action_size) scores2 = dqn(agent, env, brain_name, mem, args, n_episodes=args.n_episodes, eps_decay=args.eps_decay) save_and_plot(scores2, args, 2) args.priority_exponent = 0.8 args.multi_step = 7 args.update_every = 4 args.noise = True mem = ReplayMemory(args, args.evaluation_size) agent = Agent(args, state_size=state_size, action_size=action_size) scores3 = dqn(agent, env, brain_name, mem, args, n_episodes=args.n_episodes, eps_decay=args.eps_decay) save_and_plot(scores3, args, 3) # plot the scores fig = plt.figure() fig.add_subplot(111) plt.plot(np.arange(len(scores)), scores, label="Duelling Double DQN") plt.plot(np.arange(len(scores2)), scores2, label="Double DQN") plt.plot(np.arange(len(scores3)), scores3, label="Rainbow") plt.legend() plt.ylabel('Score') plt.xlabel('Episode #') plt.show()
class StatusHandler(tornado.websocket.WebSocketHandler): agent = Agent(args) mem = ReplayMemory(args, args.memory_capacity, agent_count) agent_initialized = False cycle_counter = 1 rgb_image_count = 1 depth_image_count = 0 depth_image_dim = 0 ir_count = 1 ground_count = 0 compass_count = 1 target_count = 1 priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start) if args.mode_distribute: thread_event = threading.Event() state_cnn = torch.zeros(4, agent_count, 3, 128, 128) state_oth = torch.zeros(4, agent_count, 11) T = 0 def open(self): print("open") def on_close(self): print("close") def on_message(self, message): print("received message") self.received_message(message) def callback(self, count): self.write_message('{"inventoryCount":"%d"}' % count) def send_action(self, action): dat = msgpack.packb({"command": "".join(map(str, action))}) self.write_message(dat, binary=True) def received_message(self, m): payload = m dat = msgpack.unpackb(payload, encoding='utf-8') image = [] depth = [] agent_count = len(dat['image']) for i in range(agent_count): image.append(Image.open(io.BytesIO(bytearray(dat['image'][i])))) if (self.depth_image_count == 1): depth_dim = len(dat['depth'][0]) temp = (Image.open(io.BytesIO(bytearray(dat['depth'][i])))) depth.append( np.array(ImageOps.grayscale(temp)).reshape( self.depth_image_dim)) if (self.ir_count == 1): ir = dat['ir'] ir_dim = len(ir[0]) else: ir = [] ir_dim = 0 if (self.ground_count == 1): ground = dat['ground'] ground_dim = len(ground[0]) else: ground = [] ground_dim = 0 if (self.compass_count == 1): compass = dat['compass'] compass_dim = len(compass[0]) else: compass = [] compass_dim = 0 if (self.target_count == 1): target = dat['target'] target_dim = len(target[0]) else: target = [] target_dim = 0 self.agent.agent_count = agent_count observation = { "image": image, "depth": depth, "ir": ir, "ground": ground, "compass": compass, "target": target } reward = np.array(dat['reward'], dtype=np.float32) reward = torch.tensor(reward) end_episode = np.array(dat['endEpisode'], dtype=np.bool) print("get daze!") s_cnn = self.agent._observation_to_state_cnn(observation) self.state_cnn = torch.stack( (self.state_cnn[1], self.state_cnn[2], self.state_cnn[3], s_cnn)) s_cnn_ = torch.cat([self.state_cnn[n] for n in range(4)], dim=1) s_oth = self.agent._observation_to_state_other(observation) self.state_oth = torch.stack( (self.state_oth[1], self.state_oth[2], self.state_oth[3], s_oth)) s_oth_ = torch.cat([self.state_oth[n] for n in range(4)], dim=1) state = {'cnn': s_cnn_, 'oth': s_oth_} action = self.agent.act(state) action_ = action.numpy() self.send_action(action_) print(action) # for i in range(1000): self.mem.append({ 'cnn': s_cnn, 'oth': s_oth }, action, reward, end_episode) if self.T > 1000: self.agent.learn(self.mem, self.T) self.T += 1 if self.T % args.replay_frequency == 0: # self.agent.reset_noise() # Draw a new set of noisy weights pass # Update target network if self.T % args.target_update == 0: self.agent.update_target_net()
class MADDPG: def __init__(self, n_agents, dim_obs, dim_act, batch_size, capacity, episodes_before_train): self.actors = [Actor(dim_obs, dim_act) for i in range(n_agents)] self.critics = [ Critic(n_agents, dim_obs, dim_act) for i in range(n_agents) ] ifload = False if ifload: for i in range(2): name1 = "parameter/actor_v3" + str(i) + ".pth" name2 = "parameter/critic_v3" + str(i) + ".pth" #print(name1) self.actors[i].load_state_dict( th.load(name1, map_location=th.device('cpu'))) self.critics[i].load_state_dict( th.load(name2, map_location=th.device('cpu'))) self.actors_target = deepcopy(self.actors) self.critics_target = deepcopy(self.critics) ## Constrain........ self.constrain = Constrain(dim_obs, 2) self.n_agents = n_agents self.n_states = dim_obs self.n_actions = dim_act self.memory = ReplayMemory(capacity) self.batch_size = batch_size self.use_cuda = th.cuda.is_available() self.episodes_before_train = episodes_before_train self.GAMMA = 0.95 self.tau = 0.01 self.var = [1.0 for i in range(n_agents)] self.critic_optimizer = [ Adam(x.parameters(), lr=0.0008) for x in self.critics ] self.actor_optimizer = [ Adam(x.parameters(), lr=0.0002) for x in self.actors ] self.constrain_optimizer = Adam(self.constrain.parameters(), lr=0.0006) if self.use_cuda: for x in self.actors: x.cuda() for x in self.critics: x.cuda() for x in self.actors_target: x.cuda() for x in self.critics_target: x.cuda() self.constrain.cuda() self.steps_done = 0 self.episode_done = 0 def update_policy(self): # do not train until exploration is enough if self.episode_done <= self.episodes_before_train: return None, None ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor c_loss = [] a_loss = [] for agent in range(self.n_agents): transitions = self.memory.sample(self.batch_size) batch = Experience(*zip(*transitions)) non_final_mask = ByteTensor( list(map(lambda s: s is not None, batch.next_states))) # state_batch: batch_size x n_agents x dim_obs state_batch = th.stack(batch.states).type(FloatTensor) action_batch = th.stack(batch.actions).type(FloatTensor) reward_batch = th.stack(batch.rewards).type(FloatTensor) # : (batch_size_non_final) x n_agents x dim_obs non_final_next_states = th.stack([ s for s in batch.next_states if s is not None ]).type(FloatTensor) # for current agent whole_state = state_batch.view(self.batch_size, -1) whole_action = action_batch.view(self.batch_size, -1) self.critic_optimizer[agent].zero_grad() #print("whole_action",whole_action) current_Q = self.critics[agent](whole_state, whole_action) non_final_next_actions = [ self.actors_target[i](non_final_next_states[:, i, :]) for i in range(self.n_agents) ] non_final_next_actions = th.stack(non_final_next_actions) non_final_next_actions = (non_final_next_actions.transpose( 0, 1).contiguous()) target_Q = th.zeros(self.batch_size).type(FloatTensor) target_Q[non_final_mask] = self.critics_target[agent]( non_final_next_states.view(-1, self.n_agents * self.n_states), non_final_next_actions.view(-1, self.n_agents * self.n_actions)).squeeze() # scale_reward: to scale reward in Q functions target_Q = (target_Q.unsqueeze(1) * self.GAMMA) + ( reward_batch[:, agent].unsqueeze(1) * scale_reward) loss_Q = nn.MSELoss()(current_Q, target_Q.detach()) loss_Q.backward() self.critic_optimizer[agent].step() self.actor_optimizer[agent].zero_grad() state_i = state_batch[:, agent, :] action_i = self.actors[agent](state_i) ac = action_batch.clone() ac[:, agent, :] = action_i whole_action = 
ac.view(self.batch_size, -1) actor_loss = -self.critics[agent](whole_state, whole_action) actor_loss = actor_loss.mean() actor_loss.backward() self.actor_optimizer[agent].step() c_loss.append(loss_Q) a_loss.append(actor_loss) if self.steps_done % 30 == 0 and self.steps_done > 0: for i in range(self.n_agents): soft_update(self.critics_target[i], self.critics[i], self.tau) soft_update(self.actors_target[i], self.actors[i], self.tau) if self.steps_done % 300 == 0: th.save(self.critics[i].state_dict(), "parameter/critic_v3" + str(i) + ".pth") th.save(self.actors[i].state_dict(), "parameter/actor_v3" + str(i) + ".pth") return c_loss, a_loss def update_rule(self): if self.episode_done <= self.episodes_before_train: return None FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor transitions = self.memory.sample(self.batch_size) batch = Experience(*zip(*transitions)) state_batch = th.stack(batch.states).type(FloatTensor) action_batch = th.stack(batch.actions).type(FloatTensor) whole_state = state_batch.view(self.batch_size, -1) whole_action = action_batch.view(self.batch_size, -1) #for ag in range(self.n_agents): true_act, rules = self.select_rule_action(state_batch) if self.steps_done % 2 == 0: id = 0 else: id = 1 Q = [] for ag in range(self.n_agents): Q.append(self.critics[ag](whole_state, Variable(th.Tensor(true_act)))) Qsum = sum(Q) if self.steps_done % 600 == 0: print("true_act..", true_act[15]) print("rule..", rules[id][15]) print("Qsum..", Qsum[15]) loss_r = -rules[id] * Qsum loss_r = loss_r.mean() loss_r.backward() self.constrain_optimizer.step() return loss_r def rule_act(self, state_batch): #true_act = [] rules = [] #obs = state_batch[:,0,:] #rule = self.constrain(obs) rules.append(self.constrain(state_batch[:, 0, :])) rules.append(self.constrain(state_batch[:, 1, :])) rule = rules[1].detach().numpy() #print(rule) action = [np.random.choice(2, 1, p=softmax(x)) for x in rule] #[ [0] if x[0]>x[1] else [1] for x in rule ] #true_act.append(list(action)) #true_act.append(list(action)) true_act = [[x[0], x[0]] for x in action ] #action#np.array(true_act).reshape(self.batch_size,2) #print(true_act) #print(true_act[1]) return true_act, rules # def select_rule_action(self, state_batch): true_act = [] rules = [] for id in range(2): obs = state_batch[:, id, :] act = self.actors[id](obs) act = th.clamp(act, 0.0, 1.0) ## ?? 
act = act.detach().numpy() act_prob = [[1 - x[0], x[0]] for x in act] #[ 1-act[0], act[0]] #act_prob = Variable(th.Tensor( act_prob)) self.constrain_optimizer.zero_grad() rule = self.constrain(obs) rules.append(rule) rule0 = rule.detach().numpy() scale_act = [ softmax(np.array(rule0[i]) * np.array(act_prob[i])) for i in range(self.batch_size) ] #scale_act = softmax(scale_act) action = [np.random.choice(2, 1, p=x) for x in scale_act] true_act.append(action) true_act = np.array(true_act).reshape(self.batch_size, 2) return true_act, rules def select_rule_action2(self, state_batch): true_act = [] rules = [] obs = state_batch[:, 0, :] rule = self.constrain(obs) rules.append(rule) rules.append(self.constrain(state_batch[:, 1, :])) rule = rule.detach().numpy() action = [np.random.choice(2, 1, p=x) for x in rule] #scale_act = [ softmax(np.array(rule[i])*np.array(act_prob[i])) for i in range(self.batch_size) ] #scale_act = softmax(scale_act) #action = [ np.random.choice(2,1,p = x) for x in scale_act ] true_act.append(action) true_act.append(action) true_act = np.array(true_act).reshape(self.batch_size, 2) return true_act, rules def getLaw(self, rule_prob, action_prob): forbidden_prob = [rule_prob[1], rule_prob[0]] for k in range(len(action_prob)): if action_prob[k] < forbidden_prob[k]: action_prob[k] = 0 return action_prob def select_action(self, state_batch, rule_prob): # state_batch: n_agents x state_dim actions = th.zeros(self.n_agents, self.n_actions) FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor for i in range(self.n_agents): sb = state_batch[i, :].detach() act = self.actors[i](sb.unsqueeze(0)) #.squeeze() act += th.from_numpy( np.random.randn(self.n_actions) * self.var[i]).type(FloatTensor) if self.episode_done > self.episodes_before_train and\ self.var[i] > 0.05: self.var[i] *= 0.999998 act = th.clamp(act, 0.0, 1.0) #print("act...",act) actProb = [1 - act[0][0], act[0][0]] action_prob = self.getLaw(rule_prob, actProb) at = np.argmax(np.array(action_prob)) #print("at...",at) act = Variable(th.Tensor([[at]])) actions[i, :] = act self.steps_done += 1 return actions def select_action2(self, state_batch): # state_batch: n_agents x state_dim actions = th.zeros(self.n_agents, self.n_actions) FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor for i in range(self.n_agents): sb = state_batch[i, :].detach() act = self.actors[i](sb.unsqueeze(0)) #.squeeze() act += th.from_numpy( np.random.randn(self.n_actions) * self.var[i]).type(FloatTensor) if self.episode_done > self.episodes_before_train and\ self.var[i] > 0.05: self.var[i] *= 0.999998 act = th.clamp(act, 0.0, 1.0) actions[i, :] = act return actions def select_eval_action(self, state_batch, rule_prob, rule): # state_batch: n_agents x state_dim actions = th.zeros(self.n_agents, self.n_actions) #FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor for i in range(self.n_agents): sta = Variable(th.Tensor([[0]])) act = self.actors[i](sta) #.squeeze() act = th.clamp(act, 0.0, 1.0) if rule: actProb = [1 - act[0][0], act[0][0]] action_prob = self.getLaw(rule_prob, actProb) #if law:#act[0][0]>0.88: at = np.argmax(np.array(action_prob)) #print("at... ",at) act = Variable(th.Tensor([[at]])) actions[i, :] = act self.steps_done += 1 return actions
# Agent dqn = Agent(args, env) # If a model is provided, and evaluate is false, presumably we want to resume, so try to load memory if args.model is not None and not args.evaluate: if not args.memory: raise ValueError('Cannot resume training without memory save path. Aborting...') elif not os.path.exists(args.memory): raise ValueError( 'Could not find memory file at {path}. Aborting...'.format(path=args.memory)) mem = load_memory(args.memory, args.disable_bzip_memory) else: mem = ReplayMemory(args, args.memory_capacity, env) priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start) # # Construct validation memory # val_mem = ReplayMemory(args, args.evaluation_size, test_env) # T, done = 0, True # while T < args.evaluation_size: # if done: # state, done = env.reset(), False # next_state, _, done, _ = env.step(np.random.randint(0, n_actions)) # val_mem.append(state, -1, 0.0, done) # state = next_state # T += 1
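# load_memory (and its save counterpart) are referenced above but not defined
# in this excerpt. A common pattern is to pickle the ReplayMemory object,
# optionally through bz2 compression; the sketch below is an assumption of that
# pattern, not necessarily this project's actual helper.
import bz2
import pickle

def load_memory(memory_path, disable_bzip):
    if disable_bzip:
        with open(memory_path, 'rb') as pickle_file:
            return pickle.load(pickle_file)
    with bz2.open(memory_path, 'rb') as zipped_pickle_file:
        return pickle.load(zipped_pickle_file)

def save_memory(memory, memory_path, disable_bzip):
    if disable_bzip:
        with open(memory_path, 'wb') as pickle_file:
            pickle.dump(memory, pickle_file)
    else:
        with bz2.open(memory_path, 'wb') as zipped_pickle_file:
            pickle.dump(memory, zipped_pickle_file)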
class Agent(): def __init__(self, action_size): self.action_size = action_size # These are hyper parameters for the DQN self.discount_factor = 0.99 self.epsilon = 1.0 self.epsilon_min = 0.01 self.explore_step = 1000000 self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step self.train_start = 100000 self.update_target = 1000 # Generate the memory self.memory = ReplayMemory() # Create the policy net and the target net self.policy_net = DQN(action_size) self.policy_net.to(device) self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate) self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma) # Initialize a target network and initialize the target network to the policy net ### CODE ### self.target_net = DQN(action_size).to(device) self.update_target_net() self.target_net.eval() def load_policy_net(self, path): self.policy_net = torch.load(path) # after some time interval update the target net to be same with policy net def update_target_net(self): ### CODE ### self.target_net.load_state_dict(self.policy_net.state_dict()) """Get action using policy net using epsilon-greedy policy""" def get_action(self, state): if np.random.rand() <= self.epsilon: ### CODE #### (copy over from agent.py!) a = torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long) else: ### CODE #### (copy over from agent.py!) with torch.no_grad(): state = torch.from_numpy(state).reshape(1,4,84,84).to(device) a = self.policy_net(state).max(1)[1].view(1, 1) return a # pick samples randomly from replay memory (with batch_size) def train_policy_net(self, frame): if self.epsilon > self.epsilon_min: self.epsilon -= self.epsilon_decay mini_batch = self.memory.sample_mini_batch(frame) mini_batch = np.array(mini_batch).transpose() history = np.stack(mini_batch[0], axis=0) states = np.float32(history[:, :4, :, :]) / 255. states = torch.from_numpy(states).cuda() actions = list(mini_batch[1]) actions = torch.LongTensor(actions).cuda() rewards = list(mini_batch[2]) rewards = torch.FloatTensor(rewards).cuda() next_states = np.float32(history[:, 1:, :, :]) / 255. dones = mini_batch[3] # checks if the game is over musk = torch.tensor(list(map(int, dones==False)),dtype=torch.uint8) # Your agent.py code here with double DQN modifications ### CODE ### curr_Q = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1) next_state_values = torch.zeros(32, device=device) next_states = torch.from_numpy(next_states).to(device) next_state_values[musk==1] = self.target_net(next_states[musk==1]).max(1)[0].detach() #next_state_values[musk] = self.target_net(next_states[musk]).detach().gather(1, self.policy_net(next_states[musk]).argmax(1).unsqueeze(1)).squeeze(1) target_Q = next_state_values * self.discount_factor + rewards loss = F.smooth_l1_loss(curr_Q, target_Q) self.optimizer.zero_grad() loss.backward() #torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 10) for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() self.scheduler.step()
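# Note on train_policy_net above: the active target uses the vanilla DQN
# maximum, next_state_values[musk==1] = target_net(next_states[musk==1]).max(1)[0],
# while the commented-out line is the double-DQN variant the surrounding comment
# asks for (the online net selects the argmax action, the target net evaluates
# it). A sketch of that double-DQN target, reusing the tensors from the method:
#
#   best_actions = self.policy_net(next_states[musk == 1]).argmax(1, keepdim=True)
#   next_state_values[musk == 1] = self.target_net(next_states[musk == 1]) \
#       .gather(1, best_actions).squeeze(1).detach()
#   target_Q = next_state_values * self.discount_factor + rewards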
class Agent(object): """ The learner and decision maker. Based on the DQN algorithm - ref Mnih et. al 2015 i.e. Q-Learning with experience replay & a target network All calls to tensorflow are wrapped into methods. Support for environments is currently manually configured. """ def __init__(self, env, discount, tau, sess, total_steps, batch_size, layers, learning_rate, epsilon_decay_fraction=0.5, memory_fraction=0.25, process_observation=False, process_target=False, **kwargs): self.env = env self.discount = discount self.tau = tau self.sess = sess self.batch_size = batch_size # number of steps where epsilon is decayed from 1.0 to 0.1 decay_steps = total_steps * epsilon_decay_fraction self.epsilon_getter = EpsilonDecayer(decay_steps) # the counter is stepped up every time we act or learn self.counter = 0 if (repr(env) == '<TimeLimit<CartPoleEnv<CartPole-v0>>>') or \ (repr(env) == '<TimeLimit<CartPoleEnv<CartPole-v1>>>'): obs_space_shape = env.observation_space.shape # the shape of the gym Discrete space is the number of actions # not the shape of a single action array # create a tuple to specify the action space self.action_space_shape = (1, ) # a list of all possible actions self.actions = [act for act in range(env.action_space.n)] elif repr(env) == '<TimeLimit<PendulumEnv<Pendulum-v0>>>': raise ValueError('Build in progress') obs_space_shape = env.observation_space.shape self.action_space_shape = env.action_space.shape self.actions = np.linspace(env.action_space.low, env.action_space.high, num=20, endpoint=True).tolist() elif repr(env) == '<TimeLimit<MountainCarEnv<MountainCar-v0>>>': obs_space_shape = env.observation_space.shape self.action_space_shape = (1, ) self.actions = [act for act in range(env.action_space.n)] else: raise ValueError('Environment not supported') self.memory = ReplayMemory(obs_space_shape, self.action_space_shape, size=int(total_steps * memory_fraction)) model_config = { 'input_shape': obs_space_shape, 'output_shape': (len(self.actions), ), 'layers': layers, 'learning_rate': learning_rate } # the two approximations of Q(s,a) # use the same config dictionary for both self.online = Qfunc(model_config, scope='online') self.target = Qfunc(model_config, scope='target') # set up the operations to copy the online network parameters to # the target network self.update_ops = self.make_target_net_update_ops() if process_observation: self.observation_processor = Normalizer(obs_space_shape[0]) if process_target: self.target_processor = Normalizer(1) self.acting_writer = tf.summary.FileWriter('./results/acting', graph=self.sess.graph) self.learning_writer = tf.summary.FileWriter('./results/learning', graph=self.sess.graph) self.sess.run(tf.global_variables_initializer()) self.update_target_network(tau=1.0) def __repr__(self): return '<class DQN Agent>' def make_target_net_update_ops(self): """ Creates the Tensorflow operations to update the target network. 
The two lists of Tensorflow Variables (one for the online net, one for the target net) are iterated over together and new weights are assigned to the target network """ with tf.variable_scope('update_target_network'): self.tf_tau = tf.placeholder(tf.float32, shape=(), name='tau') update_ops = [] for online, target in zip(self.online.params, self.target.params): o_name, t_name = online.name.split('/')[1:], target.name.split( '/')[1:] print('copying {} to {}'.format(o_name, t_name)) assert o_name == t_name val = tf.add(tf.multiply(online, self.tf_tau), tf.multiply(target, 1 - self.tf_tau)) operation = target.assign(val) update_ops.append(operation) return update_ops def update_target_network(self, tau=None): """ Updates the target network weights using the parameter tau Relies on the sorted lists of tf.Variables kept in each Qfunc object """ if tau is None: tau = self.tau logging.debug('updating target net at count {}'.format(self.counter)) self.sess.run(self.update_ops, {self.tf_tau: tau}) def remember(self, observation, action, reward, next_observation, done): """ Store experience in the agent's memory. args observation (np.array) action (np.array) reward (np.array) next_observation (np.array) done (np.array) """ if hasattr(self, 'observation_processor'): observation = self.observation_processor(observation) next_observation = self.observation_processor(next_observation) return self.memory.remember(observation, action, reward, next_observation, done) def predict_target(self, observations): """ Target network is used to predict the maximum discounted expected return for the next_observation as experienced by the agent args observations (np.array) returns max_q (np.array) shape=(batch_size, 1) """ fetches = [ self.target.q_values, self.target.max_q, self.target.acting_summary ] feed_dict = {self.target.observation: observations} q_vals, max_q, summary = self.sess.run(fetches, feed_dict) self.learning_writer.add_summary(summary, self.counter) logging.debug('predict_target - next_obs {}'.format(observations)) logging.debug('predict_target - q_vals {}'.format(q_vals)) logging.debug('predict_target - max_q {}'.format(max_q)) return max_q.reshape(observations.shape[0], 1) def predict_online(self, observation): """ We use our online network to choose actions. args observation (np.array) a single observation returns action """ obs = observation.reshape((1, *self.env.observation_space.shape)) fetches = [ self.online.q_values, self.online.max_q, self.online.optimal_action_idx, self.online.acting_summary ] feed_dict = {self.online.observation: obs} q_values, max_q, action_idx, summary = self.sess.run( fetches, feed_dict) self.acting_writer.add_summary(summary, self.counter) max_q = max_q.flatten()[0] max_q_sum = tf.Summary( value=[tf.Summary.Value(tag='max_q_acting', simple_value=max_q)]) self.acting_writer.add_summary(max_q_sum, self.counter) self.acting_writer.flush() # index at zero because TF returns an array action = self.actions[action_idx[0]] logging.debug('predict_online - observation {}'.format(obs)) logging.debug('predict_online - pred_q_values {}'.format(q_values)) logging.debug('predict_online - max_q {}'.format(max_q)) logging.debug('predict_online - action_index {}'.format(action_idx)) logging.debug('predict_online - action {}'.format(action)) return action def act(self, observation): """ Our agent attempts to manipulate the world. Acting according to epsilon greedy policy. 
args observation (np.array) returns action (np.array) """ self.counter += 1 epsilon = self.epsilon_getter.epsilon logging.debug('epsilon is {}'.format(epsilon)) if epsilon > random_uniform(): action = self.env.action_space.sample() logging.debug('acting randomly - action is {}'.format(action)) else: action = self.predict_online(observation) logging.debug('acting optimally action is {}'.format(action)) epsilon_sum = tf.Summary( value=[tf.Summary.Value(tag='epsilon', simple_value=epsilon)]) self.acting_writer.add_summary(epsilon_sum, self.counter) self.acting_writer.flush() # return np.array(action).reshape(1, *self.action_space_shape) return action def learn(self): """ Our agent attempts to make sense of the world. A batch sampled using experience replay is used to train the online network using targets from the target network. returns train_info (dict) """ batch = self.memory.get_batch(self.batch_size) observations = batch['observations'] actions = batch['actions'] rewards = batch['rewards'] terminals = batch['terminal'] next_observations = batch['next_observations'] next_obs_q = self.predict_target(next_observations) # if next state is terminal, set the value to zero next_obs_q[terminals] = 0 # creating a target for Q(s,a) using the Bellman equation rewards = rewards.reshape(rewards.shape[0], 1) target = rewards + self.discount * next_obs_q if hasattr(self, 'target_processor'): target = self.target_processor(target) indicies = np.zeros((actions.shape[0], 1), dtype=int) for arr, action in zip(indicies, actions): idx = self.actions.index(action) arr[0] = idx rng = np.arange(actions.shape[0]).reshape(actions.shape[0], 1) indicies = np.concatenate([rng, indicies], axis=1) fetches = [ self.online.q_values, self.online.q_value, self.online.loss, self.online.train_op, self.online.learning_summary ] feed_dict = { self.online.observation: observations, self.online.action: indicies, self.online.target: target } q_vals, q_val, loss, train_op, train_sum = self.sess.run( fetches, feed_dict) logging.debug('learning - observations {}'.format(observations)) logging.debug('learning - rewards {}'.format(rewards)) logging.debug('learning - terminals {}'.format(terminals)) logging.debug('learning - next_obs_q {}'.format(next_obs_q)) logging.debug('learning - actions {}'.format(actions)) logging.debug('learning - indicies {}'.format(indicies)) logging.debug('learning - q_values {}'.format(q_vals)) logging.debug('learning - q_value {}'.format(q_val)) logging.debug('learning - target {}'.format(target)) logging.debug('learning - loss {}'.format(loss)) self.learning_writer.add_summary(train_sum, self.counter) self.update_target_network() return {'loss': loss}
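# Usage sketch for the TensorFlow Agent above (CartPole-v0 is one of the
# environments it explicitly supports). Hyperparameters are illustrative, and a
# short warm-up is added because learn() samples a full batch from memory.
import gym
import tensorflow as tf

env = gym.make('CartPole-v0')
with tf.Session() as sess:
    agent = Agent(env=env, discount=0.99, tau=0.001, sess=sess,
                  total_steps=50000, batch_size=64,
                  layers=(64, 64), learning_rate=1e-3)
    step = 0
    for episode in range(200):
        obs, done = env.reset(), False
        while not done:
            action = agent.act(obs)
            next_obs, reward, done, _ = env.step(action)
            agent.remember(obs, action, reward, next_obs, done)
            if step > agent.batch_size:   # simple warm-up before learning
                agent.learn()
            obs = next_obs
            step += 1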
class Agent(): def __init__(self, action_size): self.load_model = True self.action_size = action_size # These are hyper parameters for the DQN self.discount_factor = 0.99 self.epsilon = 1.0 self.epsilon_min = 0.01 self.explore_step = 100000 self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step self.train_start = 100000 self.update_target = 1000 # Generate the memory self.memory = ReplayMemory() # Create the policy net and the target net self.policy_net = DQN(action_size) self.policy_net.to(device) self.target_net = DQN(action_size) self.target_net.to(device) self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate) # initialize target net self.update_target_net() if self.load_model: self.policy_net = torch.load('./save_model/ec1_breakout_dqn') # after some time interval update the target net to be same with policy net def update_target_net(self): self.target_net.load_state_dict(self.policy_net.state_dict()) """Get action using policy net using epsilon-greedy policy""" def get_action(self, state): if np.random.rand() <= self.epsilon: ### CODE #### # Choose a random action a = torch.tensor([[random.randrange(3)]]) if torch.cuda.is_available(): a = a.cuda() else: ### CODE #### state = torch.tensor(state).unsqueeze(0) if torch.cuda.is_available(): state = state.cuda() a = self.policy_net(state).max(1)[1] a = a.view(1, 1) return a # pick samples randomly from replay memory (with batch_size) def train_policy_net(self, frame): if self.epsilon > self.epsilon_min: self.epsilon -= self.epsilon_decay mini_batch = self.memory.sample_mini_batch(frame) mini_batch = np.array(mini_batch).transpose() history = np.stack(mini_batch[0], axis=0) states = np.float32(history[:, :4, :, :]) / 255. actions = list(mini_batch[1]) rewards = list(mini_batch[2]) next_states = np.float32(history[:, 1:, :, :]) / 255. dones = mini_batch[3] # checks if the game is over # Compute Q(s_t, a) - Q of the current state ### CODE #### states = torch.tensor(states, device=device) actions = torch.tensor(actions, device=device, dtype=torch.long).view(-1, 1) next_states = torch.tensor(next_states, device=device) rewards = torch.tensor(rewards, device=device) a = self.policy_net(states) Q = a.gather(1, actions).view(-1) # Compute Q function of next state ### CODE #### Q_next = self.target_net(next_states) # Find maximum Q-value of action at next state from target net ### CODE #### Q_next = Q_next.max(1)[0].detach() # Compute the Huber Loss ### CODE #### Huber_loss = F.smooth_l1_loss(Q, (Q_next * self.discount_factor + rewards)) # Optimize the model ### CODE #### self.optimizer.zero_grad() Huber_loss.backward() for parameter in self.policy_net.parameters(): parameter.grad.data.clamp_(-1, 1) self.optimizer.step()
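# Note on train_policy_net above: `dones` is extracted from the mini-batch but
# never used, so the bootstrap term Q_next * discount_factor is added even for
# terminal transitions. A sketch of the usual masking, reusing the tensors
# defined in the method (an assumption about the intended behaviour):
#
#   mask = torch.tensor([0.0 if d else 1.0 for d in dones], device=device)
#   target = rewards + self.discount_factor * Q_next * mask
#   Huber_loss = F.smooth_l1_loss(Q, target)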