class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = Memory(capacity=buffer_size,
                             replay_beta=REPLAY_BETA,
                             replay_alpha=REPLAY_ALPHA,
                             replay_beta_increment=REPLAY_BETA_INCREMENT)
        self.batch_size = batch_size
        self.seed = random.seed(seed)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        if len(self.memory) <= self.batch_size:
            error = random.random()
        else:
            error = self.memory.max_prio
        e = self.experience(state, action, reward, next_state, done)
        self.memory.add(error, e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences, idxs, ws = self.memory.sample(n=self.batch_size)

        states = torch.from_numpy(
            np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones), idxs, ws

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
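# ---------------------------------------------------------------------------
# Every agent in this section depends on a prioritized-replay `Memory` object
# backed by a SumTree, which is not shown here.  The sketch below only
# illustrates the interface those classes call -- add(error, sample),
# sample(n) -> (batch, idxs, is_weights), update(idx, error), tree.n_entries,
# len(), max_prio -- under assumed constants (alpha, beta, eps); constructor
# keywords such as replay_alpha/replay_beta used by ReplayBuffer above are
# omitted.  It is an illustrative sketch, not the original implementation.
# ---------------------------------------------------------------------------
import numpy as np


class SumTree:
    """Binary sum-tree: leaves hold priorities, internal nodes hold partial sums."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.write = 0
        self.n_entries = 0

    def add(self, priority, sample):
        idx = self.write + self.capacity - 1
        self.data[self.write] = sample
        self.update(idx, priority)
        self.write = (self.write + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, idx, priority):
        change = priority - self.tree[idx]
        self.tree[idx] = priority
        while idx != 0:                       # propagate the change up to the root
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def get(self, s):
        idx = 0
        while 2 * idx + 1 < len(self.tree):   # descend until a leaf is reached
            left = 2 * idx + 1
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = left + 1
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]

    def total(self):
        return self.tree[0]


class Memory:
    """Proportional prioritization: p_i = (|error_i| + eps)^alpha."""

    eps = 0.01
    alpha = 0.6
    beta = 0.4
    beta_increment = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.max_prio = 1.0

    def _priority(self, error):
        return (np.abs(error) + self.eps) ** self.alpha

    def add(self, error, sample):
        self.max_prio = max(self.max_prio, float(np.abs(error)))
        self.tree.add(self._priority(error), sample)

    def sample(self, n):
        batch, idxs, priorities = [], [], []
        segment = self.tree.total() / n
        self.beta = min(1.0, self.beta + self.beta_increment)
        for i in range(n):                    # one draw per segment of the cumulative mass
            s = np.random.uniform(segment * i, segment * (i + 1))
            idx, p, data = self.tree.get(s)
            batch.append(data)
            idxs.append(idx)
            priorities.append(p)
        probs = np.array(priorities) / self.tree.total()
        probs = np.maximum(probs, 1e-8)       # guard against empty leaves
        is_weights = (self.tree.n_entries * probs) ** (-self.beta)
        is_weights /= is_weights.max()        # normalize so the largest weight is 1
        return batch, idxs, is_weights

    def update(self, idx, error):
        self.tree.update(idx, self._priority(error))

    def __len__(self):
        return self.tree.n_entries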
class Agent(object):
    def __init__(self):
        self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(ACTIONS_SIZE)
        self.memory = Memory(MEMORY_SIZE)
        self.learning_count = 0
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def action(self, state, israndom):
        if israndom and random.random() < EPSILON:
            return np.random.randint(0, ACTIONS_SIZE)
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        actions_value = self.network.forward(state)
        return torch.max(actions_value, 1)[1].data.numpy()[0]

    def learn(self, state, action, reward, next_state, done):
        # Initial priority of the new transition: TD error bootstrapped from the next state
        old_val = self.network.forward(torch.FloatTensor([state])).gather(
            1, torch.LongTensor([[action]]))[0]
        target_val = self.network.forward(torch.FloatTensor([next_state]))
        if done:
            done = 0  # reused below as a "not terminal" mask
            target = reward
        else:
            done = 1
            target = reward + GAMMA * torch.max(target_val)
        error = abs(old_val[0] - target)
        self.memory.add(error.data, (state, action, reward, next_state, done))

        if self.memory.tree.n_entries < MEMORY_THRESHOLD:
            return

        if self.learning_count % UPDATE_TIME == 0:
            self.target_network.load_state_dict(self.network.state_dict())
        self.learning_count += 1

        batch, idxs, is_weights = self.memory.sample(BATCH_SIZE)
        state = torch.FloatTensor([x[0] for x in batch])
        action = torch.LongTensor([[x[1]] for x in batch])
        reward = torch.FloatTensor([[x[2]] for x in batch])
        next_state = torch.FloatTensor([x[3] for x in batch])
        done = torch.FloatTensor([[x[4]] for x in batch])

        eval_q = self.network.forward(state).gather(1, action)
        next_q = self.target_network(next_state).detach()
        target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done

        errors = torch.abs(eval_q - target_q).data.numpy().flatten()
        loss = self.loss_func(eval_q, target_q)

        # update priorities of the sampled transitions
        for i in range(BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
class PERAgent(OffPolicyAgent): # construct agent's model separately, so it can be sized according to problem def __init__(self, n_replay, env, target_policy, behavior_policy, lr, discount, type = 'BC'): super().__init__(n_replay, env, target_policy, behavior_policy, lr, discount, type) # reseed numpy, reset weights of network # Reset must be performed before every episode def reset(self,seed): # Reset time self.t=0 # Set seed value np.random.seed(seed) # Reset replay buffer self.replay_buffer = Memory(self.n_replay) # Rebuild model self.build_model(self.n_features,self.env.nA) def generate_action(self, s, target_policy_sel = True): pval = self.target_policy[s] if target_policy_sel else self.behavior_policy[s] return np.random.choice(a=self.actions, p=pval) def generate_all_actions(self,target_policy_sel = True): return np.array([self.generate_action(item, target_policy_sel) for item in range(self.target_policy.shape[0])]) # Generate steps of experience def generate_experience(self, k=16): # Initialize environment s = self.env.reset() done = False steps = 0 # For each step while steps < k: # choose action according to behavior policy a = self.generate_action(s,False) # Take a step in environment based on chosen action (s2,r,done,_) = self.env.step(a) # Compute importance ratios ratio = self.target_policy[s,a] / self.behavior_policy[s,a] # states and target action for Computing TD Error current_state = self.construct_features([s]) next_state = self.construct_features([s2]) target_policy_action = self.generate_action(s,True) # Get bootstrap estimate of next state action values value_s = self.model.predict([current_state,np.zeros(current_state.shape[0])]) value_next_s = self.model.predict([next_state,np.zeros(next_state.shape[0])]) updated_val = r if done else (r + self.discount*value_next_s[0][target_policy_action]) # Compute TD error td_error = np.abs(updated_val - value_s[0][a]) # Stop execution if weights blow up - not converged if td_error > 10**5: return 1 # Add experience to IR replay buffer self.replay_buffer.add_per(td_error, (s,a,r,s2)) # Set for next step s=s2 self.t += 1 steps += 1 # If episode ends, reset environment if done: done = False s = self.env.reset() return 0 # do batch of training using replay buffer def train_batch(self, n_samples, batch_size): # Sample a minibatch from replay buffer data_samples, idxs, ratios, buffer_total = self.replay_buffer.sample(n_samples) # Extract rewards, states, next states, actions from samples rewards = extract_transition_components(data_samples, TransitionComponent.reward) next_states = extract_transition_components(data_samples, TransitionComponent.next_state) next_state_features = self.construct_features(next_states) states = extract_transition_components(data_samples, TransitionComponent.state) state_features = self.construct_features(states) actions = extract_transition_components(data_samples, TransitionComponent.action) # Calculate Target policy actions target_policy_actions = np.array([self.generate_action(state, True) for state in states]) # Calculate state values for TD error next_values_sa = self.model.predict([next_state_features, np.zeros(next_state_features.shape[0])]) next_values = np.choose(target_policy_actions,next_values_sa.T) # v(s') is zero for terminal state, so need to fix model prediction for i in range(n_samples): # if experience ends in terminal state, value function returns 0 if next_states[i] == -1 or next_states[i] == 10: #TODO this only works for randomwalk of size 10 next_values[i] = 0.0 # Compute targets by 
# bootstrap estimates
        targets = (rewards + self.discount*next_values)

        # Compute error for updating priorities
        pred_values = self.model.predict([state_features, np.zeros(state_features.shape[0])])
        final_targets = np.copy(pred_values)
        np.put_along_axis(final_targets, np.expand_dims(actions, axis=1),
                          targets[:, np.newaxis], axis=1)
        pred = np.choose(actions, pred_values.T)
        error = np.abs(pred - targets)

        # Priority update
        for i in range(batch_size):
            self.replay_buffer.update(idxs[i], error[i])

        # train on samples
        self.model.fit([state_features, ratios], final_targets,
                       batch_size=batch_size, verbose=0)
class MAD4PG: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, buffer_size=int(1e6), batch_size=64, gamma=0.99, tau=1e-3, update_every=3, num_mc_steps=5, num_agents=2): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.BATCH_SIZE = batch_size self.GAMMA = gamma self.TAU = tau self.UPDATE_EVERY = update_every self.num_mc_steps = num_mc_steps self.experiences = [ ExperienceQueue(num_mc_steps) for _ in range(num_agents) ] self.memory = Memory(buffer_size) self.t_step = 0 self.train_start = batch_size self.mad4pg_agent = [ D4PG(state_size, action_size, seed, device, num_atoms=N_ATOMS, q_min=Vmin, q_max=Vmax), D4PG(state_size, action_size, seed, device, num_atoms=N_ATOMS, q_min=Vmin, q_max=Vmax) ] def acts(self, states, add_noise=0.0): acts = [] for s, a in zip(states, self.mad4pg_agent): acts.append(a.act(np.expand_dims(s, 0), add_noise)) return np.vstack(acts) # borrow from https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter14 def distr_projection(self, next_distr_v, rewards_v, dones_mask_t, gamma): next_distr = next_distr_v.data.cpu().numpy() rewards = rewards_v.data.cpu().numpy() dones_mask = dones_mask_t.cpu().numpy().astype(np.bool) batch_size = len(rewards) proj_distr = np.zeros((batch_size, N_ATOMS), dtype=np.float32) dones_mask = np.squeeze(dones_mask) rewards = rewards.reshape(-1) for atom in range(N_ATOMS): tz_j = np.minimum( Vmax, np.maximum(Vmin, rewards + (Vmin + atom * DELTA_Z) * gamma)) b_j = (tz_j - Vmin) / DELTA_Z l = np.floor(b_j).astype(np.int64) u = np.ceil(b_j).astype(np.int64) eq_mask = u == l proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] ne_mask = u != l proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] if dones_mask.any(): proj_distr[dones_mask] = 0.0 tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones_mask])) b_j = (tz_j - Vmin) / DELTA_Z l = np.floor(b_j).astype(np.int64) u = np.ceil(b_j).astype(np.int64) eq_mask = u == l if dones_mask.shape == (): if dones_mask: proj_distr[0, l] = 1.0 else: ne_mask = u != l proj_distr[0, l] = (u - b_j)[ne_mask] proj_distr[0, u] = (b_j - l)[ne_mask] else: eq_dones = dones_mask.copy() eq_dones[dones_mask] = eq_mask if eq_dones.any(): proj_distr[eq_dones, l[eq_mask]] = 1.0 ne_mask = u != l ne_dones = dones_mask.copy() ne_dones[dones_mask] = ne_mask if ne_dones.any(): proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] return torch.FloatTensor(proj_distr).to(device) def step(self, states, actions, rewards, next_states, dones): for agent_index in range(len(self.mad4pg_agent)): agent_experiences = self.experiences[agent_index] agent_experiences.states.appendleft(states[agent_index]) agent_experiences.rewards.appendleft(rewards[agent_index] * self.GAMMA**self.num_mc_steps) agent_experiences.actions.appendleft(actions[agent_index]) if len(agent_experiences.rewards) == self.num_mc_steps or dones[ agent_index]: # N-steps return: r= r1+gamma*r2+..+gamma^(t-1)*rt done_tensor = torch.tensor( dones[agent_index]).float().to(device) condition = True while condition: for i in range(len(agent_experiences.rewards)): agent_experiences.rewards[i] /= self.GAMMA state = torch.tensor( agent_experiences.states[-1]).float().unsqueeze(0).to( device) next_state = 
torch.tensor( next_states[agent_index]).float().unsqueeze(0).to( device) action = torch.tensor( agent_experiences.actions[-1]).float().unsqueeze(0).to( device) sum_reward = torch.tensor(sum( agent_experiences.rewards)).float().unsqueeze(0).to( device) with evaluating( self.mad4pg_agent[agent_index]) as cur_agent: q_logits_expected = cur_agent.critic_local( state, action) action_next = cur_agent.actor_target(next_state) q_target_logits_next = cur_agent.critic_target( next_state, action_next) q_target_distr_next = F.softmax(q_target_logits_next, dim=1) q_target_distr_next_projected = self.distr_projection( q_target_distr_next, sum_reward, done_tensor, self.GAMMA**self.num_mc_steps) cross_entropy = -F.log_softmax( q_logits_expected, dim=1) * q_target_distr_next_projected error = cross_entropy.sum(dim=1).mean().cpu().data self.memory.add( error, (states[agent_index], actions[agent_index], sum_reward, next_states[agent_index], dones[agent_index])) agent_experiences.states.pop() agent_experiences.rewards.pop() agent_experiences.actions.pop() condition = False and dones[agent_index] and len( agent_experiences.states) > 0 if dones[agent_index]: agent_experiences.states.clear() agent_experiences.rewards.clear() agent_experiences.actions.clear() self.t_step = (self.t_step + 1) % self.UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn # print(self.memory.tree.n_entries) if self.memory.tree.n_entries > self.train_start: for agent_index in range(len(self.mad4pg_agent)): sampled_experiences, idxs = self.sample() self.learn(self.mad4pg_agent[agent_index], sampled_experiences, idxs) def sample(self): # prioritized experience replay mini_batch, idxs, is_weights = self.memory.sample(self.BATCH_SIZE) mini_batch = np.array(mini_batch).transpose() statess = np.vstack([m for m in mini_batch[0] if m is not None]) actionss = np.vstack([m for m in mini_batch[1] if m is not None]) rewardss = np.vstack([m for m in mini_batch[2] if m is not None]) next_statess = np.vstack([m for m in mini_batch[3] if m is not None]) doness = np.vstack([m for m in mini_batch[4] if m is not None]) # bool to binary doness = doness.astype(int) statess = torch.from_numpy(statess).float().to(device) actionss = torch.from_numpy(actionss).float().to(device) rewardss = torch.from_numpy(rewardss).float().to(device) next_statess = torch.from_numpy(next_statess).float().to(device) doness = torch.from_numpy(doness).float().to(device) return (statess, actionss, rewardss, next_statess, doness), idxs def learn(self, agent, experiences, idxs): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models # Compute critic loss q_logits_expected = agent.critic_local(states, actions) actions_next = agent.actor_target(next_states) q_targets_logits_next = agent.critic_target(next_states, actions_next) q_targets_distr_next = F.softmax(q_targets_logits_next, dim=1) q_targets_distr_projected_next = self.distr_projection( q_targets_distr_next, rewards, dones, self.GAMMA**self.num_mc_steps) cross_entropy = -F.log_softmax(q_logits_expected, dim=1) * q_targets_distr_projected_next critic_loss = cross_entropy.sum(dim=1).mean() with torch.no_grad(): errors = cross_entropy.sum(dim=1).cpu().data.numpy() # update priority for i in range(self.BATCH_SIZE): idx = idxs[i] self.memory.update(idx, errors[i]) # Minimize the loss agent.critic_optimizer.zero_grad() critic_loss.backward() agent.critic_optimizer.step() # Compute actor loss actions_pred = agent.actor_local(states) crt_distr_v = agent.critic_local(states, actions_pred) actor_loss = -agent.critic_local.distr_to_q(crt_distr_v) actor_loss = actor_loss.mean() # Minimize the loss agent.actor_optimizer.zero_grad() actor_loss.backward() agent.actor_optimizer.step() # ----------------------- update target networks ----------------------- # agent.soft_update(agent.critic_local, agent.critic_target, self.TAU) agent.soft_update(agent.actor_local, agent.actor_target, self.TAU)
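# Illustrative check of the n-step return that the deque bookkeeping in the
# step() method above accumulates incrementally
# (R = r1 + gamma*r2 + ... + gamma^(n-1)*rn).  Stand-alone sketch, not part of
# the MAD4PG class.
def n_step_return(rewards, gamma):
    ret = 0.0
    for r in reversed(rewards):   # fold from the last reward backwards
        ret = r + gamma * ret
    return ret

assert abs(n_step_return([1.0, 1.0, 1.0], 0.99) - (1.0 + 0.99 + 0.99 ** 2)) < 1e-9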
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action buffer_size (int): maximum size of buffer batch_size (int): size of each training batch """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.buffer_size = buffer_size self.memory = Memory( capacity=self.buffer_size) # internal memory using SumTree self.batch_size = batch_size # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done, batch_size=BATCH_SIZE, update_every=UPDATE_EVERY): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % update_every if self.t_step == 0: # Learn, if enough samples are available in memory if self.memory.tree.n_entries >= batch_size: experiences, idxs, is_weights = self.sample() self.learn(experiences, idxs, is_weights) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: #action = [act + self.noise.sample() for act in action] action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, idxs, is_weights, batch_size=BATCH_SIZE, gamma=GAMMA): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) #Loss calculation critic_loss = (torch.from_numpy(is_weights).float().to(device) * F.mse_loss(Q_expected, Q_targets)).mean() # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() #Introducing gradient clipping torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) #.......................update priorities in prioritized replay buffer.......# #Calculate errors used in prioritized replay buffer errors = (Q_expected - Q_targets).squeeze().cpu().data.numpy() # update priority for i in range(batch_size): idx = idxs[i] self.memory.update(idx, errors[i]) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def add(self, state, action, reward, next_state, done, gamma=GAMMA): """Add a new experience to memory.""" next_state_torch = torch.from_numpy(next_state).float().to(device) reward_torch = torch.unsqueeze( torch.from_numpy(np.array(reward)).float().to(device), 1) done_torch = torch.unsqueeze( torch.from_numpy(np.array(done).astype( np.uint8)).float().to(device), 1) state_torch = torch.from_numpy(state).float().to(device) action_torch = torch.from_numpy(action).float().to(device) self.actor_target.eval() self.critic_target.eval() self.critic_local.eval() with torch.no_grad(): action_next = self.actor_target(next_state_torch) Q_target_next = self.critic_target(next_state_torch, action_next) Q_target = reward_torch + (gamma * Q_target_next * (1 - done_torch)) Q_expected = self.critic_local(state_torch, action_torch) self.actor_local.train() self.critic_target.train() self.critic_local.train() #Error used in prioritized replay buffer error = (Q_expected - Q_target).squeeze().cpu().data.numpy() #Adding experiences to prioritized replay buffer for i in np.arange(len(reward)): self.memory.add( error[i], (state[i], action[i], reward[i], next_state[i], done[i])) def sample(self): """Randomly sample a batch of experiences from memory.""" experiences, idxs, is_weights = self.memory.sample(self.batch_size) states = np.vstack([e[0] for e in experiences]) states = torch.from_numpy(states).float().to(device) actions = np.vstack([e[1] for e in experiences]) actions = torch.from_numpy(actions).float().to(device) rewards = np.vstack([e[2] for e in experiences]) rewards = torch.from_numpy(rewards).float().to(device) next_states = np.vstack([e[3] for e in experiences]) next_states = torch.from_numpy(next_states).float().to(device) dones = np.vstack([e[4] for e in experiences]).astype(np.uint8) dones = torch.from_numpy(dones).float().to(device) return (states, actions, rewards, next_states, dones), idxs, is_weights
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = Memory(BUFFER_SIZE) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): state = torch.from_numpy(state).float().unsqueeze(0).to(device) next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() self.qnetwork_target.eval() with torch.no_grad(): target_action_values = self.qnetwork_target(next_state) expected_action_values = self.qnetwork_local(state) self.qnetwork_local.train() self.qnetwork_target.train() old_val = expected_action_values[0][action] new_val = reward if not done: new_val += GAMMA * torch.max(target_action_values) error = abs(old_val - new_val) # Save experience in replay memory self.memory.add(error, (state, action, reward, next_state, done)) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if self.memory.tree.n_entries > BATCH_SIZE: experiences = self.memory.sample(BATCH_SIZE) self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()).astype(int) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ mini_batches, idxs, is_weights = experiences states = torch.from_numpy(np.vstack([mini_batch[0] for mini_batch in mini_batches])).float().to(device) actions = torch.from_numpy(np.vstack([mini_batch[1] for mini_batch in mini_batches])).long().to(device) rewards = torch.from_numpy(np.vstack([mini_batch[2] for mini_batch in mini_batches])).float().to(device) next_states = torch.from_numpy(np.vstack([mini_batch[3] for mini_batch in mini_batches])).float().to(device) dones = torch.from_numpy(np.vstack([int(mini_batch[4]) for mini_batch in mini_batches])).float().to(device) ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" Q_source_next = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1) Q_target = self.qnetwork_target(next_states) Q_double_target = torch.tensor([Q_target[i][max_index] for i, max_index in enumerate(Q_source_next)]).detach().unsqueeze(1) Q_observed = rewards + (gamma * Q_double_target * (1 - dones)) Q_expected = self.qnetwork_local(states).gather(1, actions) errors = torch.abs(Q_expected - Q_observed).data.numpy() # update priority for i in range(BATCH_SIZE): idx = idxs[i] self.memory.update(idx, errors[i]) loss = (torch.FloatTensor(is_weights) * F.mse_loss(Q_expected, Q_observed)).mean() self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
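# The Double-DQN target above is assembled with a Python list comprehension over
# Q_source_next; an equivalent vectorized form (an illustrative sketch, not this
# agent's code) picks the argmax action with the online network and evaluates it
# with the target network via gather:
import torch

def double_dqn_targets(q_local_next, q_target_next, rewards, dones, gamma):
    """q_local_next/q_target_next: [B, A] tensors; rewards/dones: [B, 1] tensors."""
    next_actions = q_local_next.argmax(dim=1, keepdim=True)   # [B, 1], chosen by online net
    q_next = q_target_next.gather(1, next_actions)            # [B, 1], valued by target net
    return rewards + gamma * q_next * (1 - dones)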
class DQNAgent(): def __init__(self, state_size, action_size): self.render = False self.load_model = False # get size of state and action self.state_size = state_size self.action_size = action_size self.discount_factor = 0.99 self.learning_rate = 0.001 self.lr_step_size = 10 self.lr_gamma = 0.9 self.memory_size = 2**15 self.epsilon = 1.0 self.epsilon_min = 0.05 self.explore_step = 1000 self.epsilon_decay = 0.99995 self.batch_size = 64 self.train_start = 10000 # create prioritized replay memory using SumTree self.memory = Memory(self.memory_size) # create main model and target model self.model = DQN(state_size, action_size) self.model.apply(self.weights_init) self.target_model = DQN(state_size, action_size) self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate) self.scheduler = StepLR(self.optimizer, step_size=self.lr_step_size, gamma=self.lr_gamma) # initialize target model self.update_target_model() if self.load_model: self.model = torch.load('save_model/per_dqn') self.model.train() # weight xavier initialize def weights_init(self, m): classname = m.__class__.__name__ if classname.find('Linear') != -1: torch.nn.init.xavier_uniform_(m.weight) # after some time interval update the target model to be same with model def update_target_model(self): self.target_model.load_state_dict(self.model.state_dict()) # get action from model using epsilon-greedy policy def get_action(self, state): if np.random.rand() <= self.epsilon: return random.randrange(self.action_size) else: state = torch.from_numpy(state).float() q_value = self.model(state) _, action = torch.max(q_value, 1) return int(action) # save sample (error,<s,a,r,s'>) to the replay memory def append_sample(self, state, action, reward, next_state, done): target = self.model(torch.tensor(state).float()).data old_val = target[0][action] target_val = self.target_model(torch.tensor(next_state).float()).data if done: target[0][action] = reward else: target[0][action] = reward + \ self.discount_factor * torch.max(target_val) error = abs(old_val - target[0][action]) self.memory.add(error, (state, action, reward, next_state, done)) # pick samples from prioritized replay memory (with batch_size) def train_model(self): if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay self.epsilon = max(self.epsilon, self.epsilon_min) mini_batch, idxs, is_weights = self.memory.sample(self.batch_size) mini_batch = np.array(mini_batch).transpose() states = np.vstack(mini_batch[0]) actions = list(mini_batch[1]) rewards = list(mini_batch[2]) next_states = np.vstack(mini_batch[3]) dones = mini_batch[4] # bool to binary dones = dones.astype(int) # Q function of current state states = torch.tensor(states).float() pred = self.model(states) # one-hot encoding a = torch.tensor(actions, dtype=torch.long).view(-1, 1) one_hot_action = torch.zeros(self.batch_size, self.action_size) one_hot_action.scatter_(1, a, 1) pred = torch.sum(pred.mul(one_hot_action), dim=1) # Q function of next state next_states = torch.tensor(next_states, dtype=torch.float) next_pred = self.target_model(next_states.float()).data rewards = torch.tensor(rewards, dtype=torch.float) dones = torch.tensor(dones, dtype=torch.float) # Q Learning: get maximum Q value at s' from target model target = rewards + (1 - dones) * \ self.discount_factor * next_pred.max(1)[0] errors = torch.abs(pred - target).data.numpy() # update priority for i in range(self.batch_size): idx = idxs[i] self.memory.update(idx, errors[i]) self.optimizer.zero_grad() # MSE Loss function loss = 
(torch.tensor(is_weights).float() * F.mse_loss(pred, target)).mean()
        loss.backward()

        # and train
        self.optimizer.step()

        return loss.item()
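# Hypothetical usage sketch for the DQNAgent above.  It assumes the classic Gym
# API (reset() returns an observation, step() returns a 4-tuple) and a DQN that
# accepts a [1, state_size] float input; the episode count is arbitrary.
import gym
import numpy as np

env = gym.make('CartPole-v1')
agent = DQNAgent(state_size=4, action_size=2)

for episode in range(300):
    state = np.reshape(env.reset(), [1, 4])
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, 4])
        agent.append_sample(state, action, reward, next_state, done)  # store with initial priority
        if agent.memory.tree.n_entries >= agent.train_start:
            agent.train_model()                                       # PER minibatch update
        state = next_state
    agent.update_target_model()                                       # sync target network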
class IRAgent_FourRooms(OffPolicyAgent_FourRooms): # construct agent's model separately, so it can be sized according to problem def __init__(self, n_replay, env, target_policy, behavior_policy, lr, discount, type='BC'): super().__init__(n_replay, env, target_policy, behavior_policy, lr, discount, type) # reseed numpy, reset weights of network # Reset must be performed before every episode def reset(self, seed=0): # Reset time self.t = 0 # Set seed value np.random.seed(seed) # Reset replay buffer self.replay_buffer = Memory(self.n_replay) # Rebuild model self.build_model(self.env.size[0] * self.env.size[1], 1) # instead of generating one episode of experience, take 16 steps of experience def generate_experience(self, k=16): # Initialize environment s = self.env.reset() done = False steps = 0 # counting to k steps while steps < k: # choose action according to policy a = np.random.choice(a=self.actions, p=self.behavior_policy[s[0], s[1]]) # Take a step in environment based on chosen action (s2, r, done, _) = self.env.step(a) # Compute importance ratios ratio = self.target_policy[s[0], s[1], a] / self.behavior_policy[s[0], s[1], a] # Add experience to IR replay buffer self.replay_buffer.add(ratio, (s, a, r, s2)) # Set for next step s = s2 self.t += 1 steps += 1 # If episode ends, reset environment if done: s = self.env.reset() done = False # do batch of training using replay buffer def train_batch(self, batch_size): # Sample a minibatch from replay buffer data_samples, _, _, buffer_total = self.replay_buffer.sample( batch_size) # Extract rewards, states, next states from samples rewards = extract_transition_components(data_samples, TransitionComponent.reward) next_states = extract_transition_components( data_samples, TransitionComponent.next_state) next_state_features = self.construct_features(next_states) states = extract_transition_components(data_samples, TransitionComponent.state) state_features = self.construct_features(states) # Importance ratios for update equation - IR does not use this ratios = np.ones(len(states)) # In case of Bias Correction, pre-multiply bias corrector to update if self.name == "BC": ratios = ratios * (buffer_total / self.replay_buffer.tree.n_entries) # Get value estimate for next state next_values = self.model.predict( [next_state_features, np.zeros(next_state_features.shape[0])]).flatten() # v(s') is zero for terminal state, so need to fix model prediction for i in range(batch_size): # if experience ends in terminal state then s==s2 if (states[i] == next_states[i]).all(): next_values[i] = 0.0 # Compute targets by bootstrap estimates targets = (rewards + self.discount * next_values) # Train on samples self.model.fit([state_features, ratios], targets, batch_size=batch_size, verbose=0)
class Dqn(): def __init__(self): self.eval_net, self.target_net = Net(), Net() self.eval_net.cuda() self.target_net.cuda() # create prioritized replay memory using SumTree self.memory = Memory(Train_Configs.MEMORY_CAPACITY) self.learn_counter = 0 self.optimizer = optim.Adam(self.eval_net.parameters(), lr=Train_Configs.LR,betas=(0.9, 0.99), eps=1e-08, weight_decay=2e-5) self.loss = nn.MSELoss(reduce=False, size_average=False) self.fig, self.ax = plt.subplots() self.discount_factor = Train_Configs.GAMMA def store_trans(self, state_path, action, reward, next_state_path,done): ## action type: id x, y, c = my_utils.translate_actionID_to_XY_and_channel(action) trans = state_path+'#'+str(action)+'#'+str(reward)+'#'+next_state_path#np.hstack((state, [action], [reward], next_state)) #------ calculate TD errors from (s,a,r,s'), #--only from the first depth image, without considering other 9 rotated depth images state_d = state_path next_state_d = next_state_path if c > 0: state_d = my_utils.get_rotate_depth(c,state_d) next_state_d = my_utils.get_rotate_depth(c, next_state_d) state_depth = my_utils.copy_depth_to_3_channel(state_d).reshape(1, 3, DIM_STATES[0], DIM_STATES[1]) next_state_depth = my_utils.copy_depth_to_3_channel(next_state_d).reshape(1, 3, DIM_STATES[0], DIM_STATES[1]) if c == 0: state_rgb = my_utils.trans_HWC_to_CHW(cv2.imread(state_path.replace('npy','png').replace('state_depth','state_image'))).reshape(1, 3, DIM_STATES[0], DIM_STATES[1]) next_state_rgb = my_utils.trans_HWC_to_CHW(cv2.imread(next_state_path.replace('npy','png').replace('state_depth', 'state_image'))).reshape(1, 3, DIM_STATES[0], DIM_STATES[1]) else: state_rgb = my_utils.get_rotate_rgb(c,state_path.replace('npy','png').replace('state_depth','state_image')).reshape(1, 3, DIM_STATES[0], DIM_STATES[1]) next_state_rgb = my_utils.get_rotate_rgb(c,next_state_path.replace('npy','png').replace('state_depth','state_image')).reshape(1, 3, DIM_STATES[0], DIM_STATES[1]) # # normlize # state_depth = (state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR) # next_state_depth = (next_state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR) # numpy to tensor state_depth = torch.cuda.FloatTensor(state_depth) next_state_depth = torch.cuda.FloatTensor(next_state_depth) state_rgb = torch.cuda.FloatTensor(state_rgb) next_state_rgb = torch.cuda.FloatTensor(next_state_rgb) target_singleChannel_q_map = self.eval_net.forward(state_rgb,state_depth)#dim:[1,1,224,224],CHANNEL=1 # x,y,c = my_utils.translate_actionID_to_XY_and_channel(action) old_val = target_singleChannel_q_map[0][0][x][y] # old_val = target[0][action] target_val_singleChannel_q_map = self.target_net.forward(next_state_rgb,next_state_depth)#dim:[1,1,224,224] if done == 1: target_q = reward # target[0][action] = reward else: target_q = reward + self.discount_factor * torch.max(target_val_singleChannel_q_map) # target[0][action] = reward + self.discount_factor * torch.max(target_val) error = abs(old_val - target_q) self.memory.add(float(error), trans) def choose_action(self, state_path,EPSILON): state_rgb = [] state_depth = [] state_rgb.append(my_utils.trans_HWC_to_CHW(cv2.imread(state_path.replace('npy','png').replace('state_depth','state_image')))) state_depth.append(my_utils.copy_depth_to_3_channel(state_path))#dim:[3, DIM_STATES[0], DIM_STATES[1]]#.reshape(1, 3, DIM_STATES[0], DIM_STATES[1])) for i in range(1,Train_Configs.ROTATION_BINS): state_rotate_rgb = 
my_utils.get_rotate_rgb(i,state_path.replace('npy','png').replace('state_depth','state_image')) state_rgb.append(state_rotate_rgb) #------------------------ state_rotate_depth = my_utils.get_rotate_depth(i,state_path) state_rotate_3_depth = my_utils.copy_depth_to_3_channel(state_rotate_depth) state_depth.append(state_rotate_3_depth) state_rgb = np.array(state_rgb) state_depth = np.array(state_depth) # # normlize # state_depth = (state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR) # numpy to tensor state_rgb = torch.cuda.FloatTensor(state_rgb) # dim:[INPUT_IMAGE,3,224,224] state_depth = torch.cuda.FloatTensor(state_depth) #dim:[INPUT_IMAGE,3,224,224] # random exploration prob = np.min((EPSILON,1)) p_select = np.array([prob, 1 - prob]) selected_ac_type = np.random.choice([0, 1], p=p_select.ravel()) if selected_ac_type == 0:#origin predicted action target_multiChannel_q_map = self.eval_net.forward(state_rgb,state_depth) # dim:[INPUT_IMAGES,1,224,224] action = my_utils.find_maxQ_in_qmap(target_multiChannel_q_map.cpu().detach().numpy()) ac_ty = '0' else: if np.random.randn() <= 0.5:#sample action according to depth image action = my_utils.select_randpID_from_mask(state_path) ac_ty = '1' else:# random sample action = np.random.randint(0,DIM_ACTIONS) ac_ty = '2' return ac_ty,action # the id of action def plot(self, ax, x): ax.cla() ax.set_xlabel("episode") ax.set_ylabel("total reward") ax.plot(x, 'b-') plt.pause(0.000000000000001) def load_batch_data(self,batch_list):#batch_list.dim:[batch_size] # print(batch_list) batch_state_rgb = [] batch_state_depth = [] batch_action = [] batch_reward = [] batch_next_state_rgb = [] batch_next_state_depth = [] for item in batch_list: data = item.split('#')#state+'#'+str(action)+'#'+str(reward)+'#'+next_state action_id = int(data[1]) batch_state_rgb.append(my_utils.get_rotate_rgb(action_id,data[0].replace('npy','png').replace('state_depth','state_image'))) batch_state_depth.append(my_utils.copy_depth_to_3_channel(my_utils.get_rotate_depth(action_id,data[0])).reshape((3,DIM_STATES[0],DIM_STATES[1]))) batch_action.append([int(data[1])]) batch_reward.append([float(data[2])]) batch_next_state_rgb.append(my_utils.get_rotate_rgb(action_id, data[3].replace('npy','png').replace('state_depth', 'state_image'))) batch_next_state_depth.append(my_utils.copy_depth_to_3_channel(my_utils.get_rotate_depth(action_id,data[3])).reshape((3,DIM_STATES[0],DIM_STATES[1]))) batch_state_depth = np.array(batch_state_depth) batch_next_state_depth = np.array(batch_next_state_depth) # # normlize # batch_state_depth = (batch_state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR) # batch_next_state_depth = (batch_next_state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR) return torch.cuda.FloatTensor(batch_state_rgb),torch.cuda.FloatTensor(batch_state_depth),torch.cuda.LongTensor(batch_action),torch.cuda.FloatTensor(batch_reward),torch.cuda.FloatTensor(batch_next_state_rgb),torch.cuda.FloatTensor(batch_next_state_depth) def learn(self): # learn 100 times then the target network update if self.learn_counter % Train_Configs.Q_NETWORK_ITERATION ==0: self.target_net.load_state_dict(self.eval_net.state_dict()) self.learn_counter+=1 mini_batch, idxs, is_weights = self.memory.sample(Train_Configs.BATCH_SIZE)# batch_state_rgb,batch_state_depth,batch_action,batch_reward,batch_next_state_rgb,batch_next_state_depth = 
self.load_batch_data(mini_batch)#dim:[1] eval_singleChannel_q_map = self.eval_net(batch_state_rgb,batch_state_depth) # dim:[BATCH_SIZE,1,224,224] x_y_c_list = my_utils.translate_actionID_to_XY_and_channel_batch(batch_action) # old_val = target_multiChannel_q_map[0][c][x][y] batch_q = [] # for xyc in x_y_c_list: for i in range(len(x_y_c_list)): xyc = x_y_c_list[i] batch_q.append([eval_singleChannel_q_map[i][0][xyc[0]][xyc[1]]]) q_eval = torch.cuda.FloatTensor(batch_q)#self.eval_net(batch_state).gather(1, batch_action)#action: a value in range [0,DIM_ACTIONS-1] q_eval = Variable(q_eval.cuda(), requires_grad=True) target_singleChannel_q_map = self.target_net(batch_next_state_rgb,batch_next_state_depth).cpu().detach().numpy()#q_next,dim:[BATCH_SIZE,1,224,224] batch_q_next = [] for b_item in target_singleChannel_q_map:#dim:[1,224,224] batch_q_next.append([np.max(b_item)]) q_next = torch.cuda.FloatTensor(batch_q_next) # q_next = Variable(q_next.cuda(), requires_grad=True) q_target = batch_reward + Train_Configs.GAMMA*q_next q_target = Variable(q_target.cuda(), requires_grad=True) # self.average_q = q_eval.mean() weight_tensor = torch.cuda.FloatTensor(is_weights)# weight_tensor = weight_tensor.reshape((Train_Configs.BATCH_SIZE,1)) weight_tensor = Variable(weight_tensor.cuda(), requires_grad=False) loss = (weight_tensor * self.loss(q_eval, q_target)).mean()##(torch.FloatTensor(is_weights) * F.mse_loss(pred, target)).mean() self.optimizer.zero_grad() loss.backward() self.optimizer.step() return float(loss),float(q_eval.mean())
class Agent: """ Interacts with and learns from the environment. Learns using a Deep Q-Network with prioritised experience replay. Two models are instantiated, one for use during evaluation and updating (qnetwork_local) and one to be used for the target values in the learning algorithm (qnetwork_target) """ BUFFER_SIZE = int(1e5) # prioritised experience replay buffer size BATCH_SIZE = 64 # minibatch size TAU = 1e-3 # for soft update of target parameters LR = 5e-4 # learning rate UPDATE_EVERY = 4 # how often to update the network device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") def __init__(self, state_size: int = 37, action_size: int = 4, seed: int = 44, gamma: float = 0.99, tau: float = 1e-3): """ Initialize an Agent object. :param state_size: dimension of each state :param action_size: dimension of each action :param seed: random seed for network initialisation :param gamma: discount factor :param tau: lag for soft update of target network parameters """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.gamma = gamma self.tau = tau self.max_w = 0 # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR) # Prioritised Experience Replay memory self.memory = Memory(self.BUFFER_SIZE) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state: np.ndarray, action: int, reward: float, next_state: np.ndarray, done: bool, gamma: Optional[float] = None, tau: Optional[float] = None): """ An agent step takes the current experience and stores it in the replay memory, then samples from the memory and calls the learning algorithm. :param state: the state vector :param action: the action performed on the state :param reward: the reward given upon performing the action :param next_state: the next state after doing the action :param done: True if the episode has ended :param gamma: discount factor :param tau: lag for soft update of target network parameters """ gamma_value = gamma if gamma is not None else self.gamma tau_value = tau if tau is not None else self.tau self.memory.add((state, action, reward, next_state, done)) # Save experience in replay memory # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if self.memory.tree.n_entries > self.BATCH_SIZE: experiences, idxs, importance_weights = self.memory.sample( self.BATCH_SIZE) self.learn(experiences, idxs, importance_weights, gamma_value, tau_value) def act(self, state: np.ndarray, eps: float = 0.0): """ Returns actions for given state as per current policy. Uses the local copy of the model. 
:param state: current state :param eps: epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.int32(np.argmax(action_values.cpu().data.numpy())) else: return np.int32(random.choice(np.arange(self.action_size))) def learn(self, experiences: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], indices: np.ndarray, importance_weights: torch.Tensor, gamma: float, tau: float): """ Update value parameters using given batch of experience tuples. :param experiences: tuple of (s, a, r, s', done) tuples :param indices: indices of the SumTree that contain the priority values for these experiences. Used for updating the priority values after error has been found :param importance_weights: the weighting that each experience carries when used in updating the network :param gamma: discount factor :param tau: lag for soft update of target network parameters """ states, actions, rewards, next_states, dones = experiences # For Double-DQN, get action with the highest q-value (for next_states) from the local model next_action = self.qnetwork_local(next_states).detach().max( 1)[1].unsqueeze(1) # Get max predicted Q values (for next states) from target model q_targets_next = self.qnetwork_target(next_states).gather( 1, next_action) # Compute Q targets for current states q_targets = rewards + (gamma * q_targets_next * (1 - dones)) # Get expected Q values from local model q_expected = self.qnetwork_local(states).gather(1, actions) error = torch.abs(q_targets - q_expected).detach().numpy() # update priorities self.memory.batch_update(indices, error) # Compute mse and loss with importance weights t_mse = F.mse_loss(q_expected, q_targets) loss = (importance_weights * t_mse).mean() # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # update target network with model parameters approaching those of the local network. self.soft_update(self.qnetwork_local, self.qnetwork_target, tau) @staticmethod def soft_update(local_model: torch.nn.Module, target_model: torch.nn.Module, tau: float): """ Soft update model parameters. Every learning step the target network is updated to bring its parameters nearer by a factor TAU to those of the improving local network. If TAU = 1 the target network becomes a copy of the local network. If TAU = 0 the target network is not updated. θ_target = τ*θ_local + (1 - τ)*θ_target :param local_model: weights will be copied from :param target_model: weights will be copied to :param tau: interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
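# Note on the loss above: F.mse_loss reduces to a scalar mean by default, so
# multiplying it by importance_weights rescales the whole batch rather than
# weighting individual samples.  A per-sample weighting would use
# reduction='none'; this is an illustrative sketch (is_weights assumed to be a
# [B] or [B, 1] tensor), not the agent's code.
import torch
import torch.nn.functional as F

def weighted_mse(q_expected, q_targets, is_weights):
    """Importance-sampling-weighted MSE over a [B, 1] batch."""
    per_sample = F.mse_loss(q_expected, q_targets, reduction='none')  # [B, 1] squared errors
    return (is_weights.view(-1, 1) * per_sample).mean()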
class DDQN_Agent: def __init__(self, useDepth=False): self.useDepth = useDepth self.eps_start = 0.9 self.eps_end = 0.05 self.eps_decay = 30000 self.gamma = 0.8 self.learning_rate = 0.001 self.batch_size = 512 self.memory = Memory(10000) self.max_episodes = 10000 self.save_interval = 2 self.test_interval = 10 self.network_update_interval = 10 self.episode = -1 self.steps_done = 0 self.max_steps = 34 self.policy = DQN() self.target = DQN() self.test_network = DQN() self.target.eval() self.test_network.eval() self.updateNetworks() self.env = DroneEnv(useDepth) self.optimizer = optim.Adam(self.policy.parameters(), self.learning_rate) if torch.cuda.is_available(): print('Using device:', device) print(torch.cuda.get_device_name(0)) else: print("Using CPU") # LOGGING cwd = os.getcwd() self.save_dir = os.path.join(cwd, "saved models") if not os.path.exists(self.save_dir): os.mkdir("saved models") if not os.path.exists(os.path.join(cwd, "videos")): os.mkdir("videos") if torch.cuda.is_available(): self.policy = self.policy.to(device) # to use GPU self.target = self.target.to(device) # to use GPU self.test_network = self.test_network.to(device) # to use GPU # model backup files = glob.glob(self.save_dir + '\\*.pt') if len(files) > 0: files.sort(key=os.path.getmtime) file = files[-1] checkpoint = torch.load(file) self.policy.load_state_dict(checkpoint['state_dict']) self.episode = checkpoint['episode'] self.steps_done = checkpoint['steps_done'] self.updateNetworks() print("Saved parameters loaded" "\nModel: ", file, "\nSteps done: ", self.steps_done, "\nEpisode: ", self.episode) else: if os.path.exists("log.txt"): open('log.txt', 'w').close() if os.path.exists("last_episode.txt"): open('last_episode.txt', 'w').close() if os.path.exists("last_episode.txt"): open('saved_model_params.txt', 'w').close() self.optimizer = optim.Adam(self.policy.parameters(), self.learning_rate) obs, _ = self.env.reset() tensor = self.transformToTensor(obs) writer.add_graph(self.policy, tensor) def updateNetworks(self): self.target.load_state_dict(self.policy.state_dict()) def transformToTensor(self, img): tensor = torch.FloatTensor(img).to(device) tensor = tensor.unsqueeze(0) tensor = tensor.unsqueeze(0) tensor = tensor.float() return tensor def convert_size(self, size_bytes): if size_bytes == 0: return "0B" size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) s = round(size_bytes / p, 2) return "%s %s" % (s, size_name[i]) def act(self, state): self.eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * math.exp( -1.0 * self.steps_done / self.eps_decay ) self.steps_done += 1 if random.random() > self.eps_threshold: # print("greedy") if torch.cuda.is_available(): action = np.argmax(self.policy(state).cpu().data.squeeze().numpy()) else: action = np.argmax(self.policy(state).data.squeeze().numpy()) else: action = random.randrange(0, 4) return int(action) def append_sample(self, state, action, reward, next_state): next_state = self.transformToTensor(next_state) current_q = self.policy(state).squeeze().cpu().detach().numpy()[action] next_q = self.target(next_state).squeeze().cpu().detach().numpy()[action] expected_q = reward + (self.gamma * next_q) error = abs(current_q - expected_q), self.memory.add(error, state, action, reward, next_state) def learn(self): if self.memory.tree.n_entries < self.batch_size: return states, actions, rewards, next_states, idxs, is_weights = self.memory.sample(self.batch_size) states = tuple(states) 
next_states = tuple(next_states) states = torch.cat(states) actions = np.asarray(actions) rewards = np.asarray(rewards) next_states = torch.cat(next_states) current_q = self.policy(states)[[range(0, self.batch_size)], [actions]] next_q =self.target(next_states).cpu().detach().numpy()[[range(0, self.batch_size)], [actions]] expected_q = torch.FloatTensor(rewards + (self.gamma * next_q)).to(device) errors = torch.abs(current_q.squeeze() - expected_q.squeeze()).cpu().detach().numpy() # update priority for i in range(self.batch_size): idx = idxs[i] self.memory.update(idx, errors[i]) loss = F.smooth_l1_loss(current_q.squeeze(), expected_q.squeeze()) self.optimizer.zero_grad() loss.backward() self.optimizer.step() def train(self): print("Starting...") score_history = [] reward_history = [] if self.episode == -1: self.episode = 1 for e in range(1, self.max_episodes + 1): start = time.time() state, _ = self.env.reset() steps = 0 score = 0 while True: state = self.transformToTensor(state) action = self.act(state) next_state, reward, done, _ = self.env.step(action) if steps == self.max_steps: done = 1 #self.memorize(state, action, reward, next_state) self.append_sample(state, action, reward, next_state) self.learn() state = next_state steps += 1 score += reward if done: print("----------------------------------------------------------------------------------------") if self.memory.tree.n_entries < self.batch_size: print("Training will start after ", self.batch_size - self.memory.tree.n_entries, " steps.") break print( "episode:{0}, reward: {1}, mean reward: {2}, score: {3}, epsilon: {4}, total steps: {5}".format( self.episode, reward, round(score / steps, 2), score, self.eps_threshold, self.steps_done)) score_history.append(score) reward_history.append(reward) with open('log.txt', 'a') as file: file.write( "episode:{0}, reward: {1}, mean reward: {2}, score: {3}, epsilon: {4}, total steps: {5}\n".format( self.episode, reward, round(score / steps, 2), score, self.eps_threshold, self.steps_done)) if torch.cuda.is_available(): print('Total Memory:', self.convert_size(torch.cuda.get_device_properties(0).total_memory)) print('Allocated Memory:', self.convert_size(torch.cuda.memory_allocated(0))) print('Cached Memory:', self.convert_size(torch.cuda.memory_reserved(0))) print('Free Memory:', self.convert_size(torch.cuda.get_device_properties(0).total_memory - ( torch.cuda.max_memory_allocated() + torch.cuda.max_memory_reserved()))) # tensorboard --logdir=runs memory_usage_allocated = np.float64(round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1)) memory_usage_cached = np.float64(round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1)) writer.add_scalar("memory_usage_allocated", memory_usage_allocated, self.episode) writer.add_scalar("memory_usage_cached", memory_usage_cached, self.episode) writer.add_scalar('epsilon_value', self.eps_threshold, self.episode) writer.add_scalar('score_history', score, self.episode) writer.add_scalar('reward_history', reward, self.episode) writer.add_scalar('Total steps', self.steps_done, self.episode) writer.add_scalars('General Look', {'score_history': score, 'reward_history': reward}, self.episode) # save checkpoint if self.episode % self.save_interval == 0: checkpoint = { 'episode': self.episode, 'steps_done': self.steps_done, 'state_dict': self.policy.state_dict() } torch.save(checkpoint, self.save_dir + '//EPISODE{}.pt'.format(self.episode)) if self.episode % self.network_update_interval == 0: self.updateNetworks() self.episode += 1 end = time.time() stopWatch = end - start 
print("Episode is done, episode time: ", stopWatch) if self.episode % self.test_interval == 0: self.test() break writer.close() def test(self): self.test_network.load_state_dict(self.target.state_dict()) start = time.time() steps = 0 score = 0 image_array = [] state, next_state_image = self.env.reset() image_array.append(next_state_image) while True: state = self.transformToTensor(state) action = int(np.argmax(self.test_network(state).cpu().data.squeeze().numpy())) next_state, reward, done, next_state_image = self.env.step(action) image_array.append(next_state_image) if steps == self.max_steps: done = 1 state = next_state steps += 1 score += reward if done: print("----------------------------------------------------------------------------------------") print("TEST, reward: {}, score: {}, total steps: {}".format( reward, score, self.steps_done)) with open('tests.txt', 'a') as file: file.write("TEST, reward: {}, score: {}, total steps: {}\n".format( reward, score, self.steps_done)) writer.add_scalars('Test', {'score': score, 'reward': reward}, self.episode) end = time.time() stopWatch = end - start print("Test is done, test time: ", stopWatch) # Convert images to video frameSize = (256, 144) import cv2 video = cv2.VideoWriter("videos\\test_video_episode_{}_score_{}.avi".format(self.episode, score), cv2.VideoWriter_fourcc(*'DIVX'), 7, frameSize) for img in image_array: video.write(img) video.release() break
class A2CAgent: def __init__(self, replay_size, memory_size=10000, prioritized=False, load_models=False, actor_model_file='', critic_model_file='', is_eval=False): self.state_size = 2 self.action_size = 3 self.step = 0 self.replay_size = replay_size self.replay_queue = deque(maxlen=self.replay_size) self.memory_size = memory_size self.prioritized = prioritized if self.prioritized: self.memory = Memory(capacity=memory_size) # Hyper parameters for learning self.value_size = 1 self.layer_size = 16 self.discount_factor = 0.99 self.actor_learning_rate = 0.0005 self.critic_learning_rate = 0.005 self.is_eval = is_eval # Create actor and critic neural networks self.actor = self.build_actor() self.critic = self.build_critic() #self.actor.summary() if load_models: if actor_model_file: self.actor.load_weights(actor_model_file) if critic_model_file: self.critic.load_weights(critic_model_file) # The actor takes a state and outputs probabilities of each possible action def build_actor(self): layer1 = Dense(self.layer_size, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform') layer2 = Dense(self.layer_size, input_dim=self.layer_size, activation='relu', kernel_initializer='he_uniform') # Use softmax activation so that the sum of probabilities of the actions becomes 1 layer3 = Dense(self.action_size, activation='softmax', kernel_initializer='he_uniform') # self.action_size = 2 actor = Sequential(layers=[layer1, layer2, layer3]) # Print a summary of the network actor.summary() # We use categorical crossentropy loss since we have a probability distribution actor.compile(loss='categorical_crossentropy', optimizer=Adam(lr=self.actor_learning_rate)) return actor # The critic takes a state and outputs the predicted value of the state def build_critic(self): layer1 = Dense(self.layer_size, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform') layer2 = Dense(self.layer_size, input_dim=self.layer_size, activation='relu', kernel_initializer='he_uniform') layer3 = Dense(self.value_size, activation='linear', kernel_initializer='he_uniform') # self.value_size = 1 critic = Sequential(layers=[layer1, layer2, layer3]) # Print a summary of the network critic.summary() critic.compile(loss='mean_squared_error', optimizer=Adam(lr=self.critic_learning_rate)) return critic def act(self, state): # Get probabilities for each action policy = self.actor.predict(np.array([state]), batch_size=1).flatten() # Randomly choose an action if not self.is_eval: return np.random.choice(self.action_size, 1, p=policy).take(0) else: return np.argmax(policy) # 20191117- for evaluation def store_transition(self, s, a, r, s_, dd): if self.prioritized: # prioritized replay transition = np.hstack((s, [a, r], s_, dd)) self.memory.store( transition) # have high priority for newly arrived transition else: #self.replay_queue.append((s, [a, r], s_, dd)) transition = np.hstack((s, [a, r], s_, dd)) self.replay_queue.append(transition) def expReplay(self, batch_size=64, lr=1, factor=0.95): if self.prioritized: tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size) else: batch_memory = random.sample(self.replay_queue, batch_size) s_prevBatch = np.array([replay[[0, 1]] for replay in batch_memory]) a = np.array([replay[[2]] for replay in batch_memory]) r = np.array([replay[[3]] for replay in batch_memory]) s_currBatch = np.array([replay[[4, 5]] for replay in batch_memory]) d = np.array([replay[[6]] for replay in batch_memory]) td_error = np.zeros((d.shape[0], ), dtype=float) for i in 
range(d.shape[0]): q_prev = float(self.critic.predict(np.array([s_prevBatch[i, :]]))) q_curr = float(self.critic.predict(np.array([s_currBatch[i, :]]))) if int(d[i]) == 1: q_realP = float(r[i]) # terminal step: no bootstrapping else: q_realP = float(r[i]) + factor * q_curr advantages = np.zeros((1, self.action_size)) advantages[0, int(a[i])] = q_realP - q_prev if self.prioritized: td_error[i] = abs(advantages[0, int(a[i])]) self.actor.fit(np.array([s_prevBatch[i, :]]), advantages, epochs=1, verbose=0) self.critic.fit(np.array([s_prevBatch[i, :]]), np.array([[q_realP]]), epochs=1, verbose=0) if self.prioritized: self.memory.batch_update(tree_idx, td_error)
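For each sampled transition the loop above forms the TD target r + factor * V(s') (or just r at a terminal step) and feeds its difference to V(s) into the actor as a one-hot advantage vector. The sketch below computes the same quantities for a whole batch with two critic.predict calls instead of per-sample calls; the function name batch_advantages is illustrative and the arrays are assumed to have the shapes produced by expReplay above.

import numpy as np


def batch_advantages(critic, s_prev, s_curr, a, r, d, action_size, factor=0.95):
    """Vectorized TD targets and one-hot advantages (a sketch; the original loops per sample)."""
    v_prev = critic.predict(s_prev).flatten()                       # V(s)
    v_curr = critic.predict(s_curr).flatten()                       # V(s')
    targets = r.flatten() + factor * v_curr * (1.0 - d.flatten())   # no bootstrap past terminals
    advantages = np.zeros((len(targets), action_size))
    advantages[np.arange(len(targets)), a.flatten().astype(int)] = targets - v_prev
    return targets, advantages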
class DoubleDQN(object): def __init__(self, replay_size, memory_size=10000, prioritized=False): self.step = 0 self.replay_size = replay_size self.replay_queue = deque(maxlen=self.replay_size) self.memory_size = memory_size self.tau = 1e-2 #MountainCar-v0 self.model = self.create_model() self.prioritized = prioritized self.target_model = self.create_model() self.target_model.set_weights(self.model.get_weights()) if self.prioritized: self.memory = Memory(capacity=memory_size) def create_model(self): STATE_DIM, ACTION_DIM = 2, 3 model = models.Sequential([ layers.Dense(100, input_dim=STATE_DIM, activation='relu'), layers.Dense(ACTION_DIM, activation="linear") ]) model.compile(loss='mean_squared_error', optimizer=optimizers.Adam(0.001)) return model def act(self, s, epsilon=0.1): # if np.random.uniform() < epsilon - self.step * 0.0002: return np.random.choice([0, 1, 2]) return np.argmax(self.model.predict(np.array([s]))[0]) def save_model(self, file_path='MountainCar-v0-Ddqn.h5'): print('model saved') self.model.save(file_path) def store_transition(self, s, a, r, s_, dd): if self.prioritized: # prioritized replay transition = np.hstack((s, [a, r], s_, dd)) # transition -> 7x1 self.memory.store( transition) # have high priority for newly arrived transition else: #self.replay_queue.append((s, [a, r], s_, dd)) transition = np.hstack((s, [a, r], s_, dd)) # transition -> 7x1 self.replay_queue.append(transition) def expReplay(self, batch_size=64, lr=1, factor=0.95): if self.prioritized: tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size) else: batch_memory = random.sample(self.replay_queue, batch_size) s_batch = np.array([replay[[0, 1]] for replay in batch_memory]) a = np.array([replay[[2]] for replay in batch_memory]) r = np.array([replay[[3]] for replay in batch_memory]) next_s_batch = np.array([replay[[4, 5]] for replay in batch_memory]) d = np.array([replay[[6]] for replay in batch_memory]) Q = self.model.predict(s_batch) Q_next = self.model.predict(next_s_batch) Q_targ = self.target_model.predict(next_s_batch) #update Q value td_error = np.zeros((d.shape[0], ), dtype=float) for i in range(d.shape[0]): old_q = Q[i, int(a[i])] if int(d[i]) == 1: Q[i, int(a[i])] = r[i] else: next_best_action = np.argmax(Q_next[i, :]) Q[i, int(a[i])] = r[i] + factor * Q_targ[i, next_best_action] if self.prioritized: td_error[i] = abs(old_q - Q[i, int(a[i])]) if self.prioritized: self.memory.batch_update(tree_idx, td_error) self.model.fit(s_batch, Q, verbose=0) def transfer_weights(self): """ Transfer Weights from Model to Target at rate Tau """ W = self.model.get_weights() tgt_W = self.target_model.get_weights() for i in range(len(W)): tgt_W[i] = self.tau * W[i] + (1 - self.tau) * tgt_W[i] self.target_model.set_weights(tgt_W)
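Both the DoubleDQN above and the A2CAgent earlier assume a Memory object with store(transition), sample(n) returning (tree_idx, batch, ISWeights), and batch_update(tree_idx, abs_errors); that class is not shown in this section. The sketch below is one conventional SumTree-based implementation of that interface, with e, alpha, and beta as illustrative defaults rather than values taken from the original code.

import numpy as np


class SumTree:
    """Binary sum tree: leaves hold priorities, internal nodes hold partial sums."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.empty(capacity, dtype=object)
        self.write = 0          # next leaf to overwrite
        self.n_entries = 0

    def add(self, priority, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, priority)
        self.write = (self.write + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, idx, priority):
        change = priority - self.tree[idx]
        self.tree[idx] = priority
        while idx != 0:                      # propagate the change up to the root
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def get(self, s):
        idx = 0
        while 2 * idx + 1 < len(self.tree):  # descend until a leaf is reached
            left, right = 2 * idx + 1, 2 * idx + 2
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = right
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]

    @property
    def total_p(self):
        return self.tree[0]


class Memory:
    """Proportional prioritized replay exposing the store/sample/batch_update interface used above."""

    e = 0.01                 # small constant so no priority is exactly zero
    alpha = 0.6              # how strongly prioritization is applied
    beta = 0.4               # initial importance-sampling correction
    beta_increment = 0.001
    abs_err_upper = 1.0

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, transition):
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        if max_p == 0:
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)     # new transitions get maximal priority

    def sample(self, n):
        idxs, batch, priorities = [], [], []
        segment = self.tree.total_p / n
        self.beta = min(1.0, self.beta + self.beta_increment)
        for i in range(n):
            s = np.random.uniform(segment * i, segment * (i + 1))
            idx, p, data = self.tree.get(s)
            idxs.append(idx)
            priorities.append(p)
            batch.append(data)
        probs = np.maximum(np.array(priorities) / self.tree.total_p, 1e-8)
        is_weights = np.power(self.tree.n_entries * probs, -self.beta)
        is_weights /= is_weights.max()       # normalize for stable loss scaling
        return idxs, np.array(batch), is_weights

    def batch_update(self, tree_idxs, abs_errors):
        ps = np.power(np.minimum(abs_errors + self.e, self.abs_err_upper), self.alpha)
        for idx, p in zip(tree_idxs, ps):
            self.tree.update(idx, p)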
class Agent: # todo change name to Agent def __init__(self, eps, lr, gamma, batch_size, tau, max_memory, lambda_1, lambda_2, lambda_3, n_steps, l_margin): # Input Parameters self.eps = eps # eps-greedy self.gamma = gamma # discount factor self.batch_size = batch_size self.tau = tau # frequency of target replacement self.ed = 0.005 # bonus for demonstration # todo they aren't used self.ea = 0.001 # todo they aren't used self.l_margin = l_margin self.n_steps = n_steps self.lambda1 = lambda_1 # n-step return self.lambda2 = lambda_2 # supervised loss self.lambda3 = lambda_3 # L2 self.counter = 0 # target replacement counter # todo change to iter_counter self.replay = Memory(capacity=max_memory) self.loss = nn.MSELoss() self.policy = Policy() # todo change not have to pass architecture self.opt = optim.Adam(self.policy.predictNet.parameters(), lr=lr, weight_decay=lambda_3) self.replay.e = 0 self.demoReplay = ddict(list) self.noisy = hasattr(self.policy.predictNet, "sample") def choose_action(self, state): state = torch.Tensor(state) A = self.policy.sortedA(state) if self.noisy: self.policy.predictNet.sample() return A[0] if np.random.random() < self.eps: return random.sample(A, 1)[0] return A[0] def sample(self): return self.replay.sample(self.batch_size) def store_demonstration(self, s, a, r, s_, done, episode): s = torch.Tensor(s) s_ = torch.Tensor(s_) episodeReplay = self.demoReplay[ episode] # replay of certain demo episode index = len(episodeReplay) data = (s, a, r, s_, done, (episode, index)) episodeReplay.append(data) self.replay.add(transition=data, demonstration=True) def store_transition(self, s, a, r, s_, done): s = torch.Tensor(s) s_ = torch.Tensor(s_) data = (s, a, r, s_, done, None) self.replay.add(transition=data, demonstration=False) def calculate_td_errors(self, samples): if self.noisy: self.policy.predictNet.sample() # for choosing action alls, alla, allr, alls_, alldone, *_ = zip(*samples) maxA = [self.policy.sortedA(s_)[0] for s_ in alls_] if self.noisy: self.policy.predictNet.sample() # for prediction self.policy.targetNet.sample() # for target Qtarget = torch.Tensor(allr) Qtarget[torch.tensor(alldone) != 1] += self.gamma * self.policy.calcQ( self.policy.targetNet, alls_, maxA)[torch.tensor(alldone) != 1] Qpredict = self.policy.calcQ(self.policy.predictNet, alls, alla) return Qpredict, Qtarget def JE(self, samples): loss = torch.tensor(0.0) count = 0 # number of demo for s, aE, *_, isdemo in samples: if isdemo is None: continue A = self.policy.sortedA(s) if len(A) == 1: continue QE = self.policy.calcQ(self.policy.predictNet, s, aE) A1, A2 = np.array(A)[: 2] # action with largest and second largest Q maxA = A2 if (A1 == aE).all() else A1 Q = self.policy.calcQ(self.policy.predictNet, s, maxA) if (Q + self.l_margin) < QE: continue else: loss += (Q - QE) count += 1 return loss / count if count != 0 else loss def Jn(self, samples, Qpredict): # wait for refactoring, can't use with noisy layer loss = torch.tensor(0.0) count = 0 for i, (s, a, r, s_, done, isdemo) in enumerate(samples): if isdemo is None: continue episode, idx = isdemo nidx = idx + self.n_steps lepoch = len(self.demoReplay[episode]) if nidx > lepoch: continue count += 1 ns, na, nr, ns_, ndone, _ = zip( *self.demoReplay[episode][idx:nidx]) ns, na, ns_, ndone = ns[-1], na[-1], ns_[-1], ndone[-1] discountedR = reduce( lambda x, y: (x[0] + self.gamma**x[1] * y, x[1] + 1), nr, (0, 0))[0] maxA = self.policy.sortedA(ns_)[0] target = discountedR if ndone else discountedR + self.gamma**self.n_steps * self.policy.calcQ( 
self.policy.targetNet, ns_, maxA) predict = Qpredict[i] loss += (target - predict)**2 return loss / count if count != 0 else loss def L2(self, parameters): loss = 0 for p in parameters: loss += (p**2).sum() return loss def learn(self): self.opt.zero_grad() samples, idxs = self.sample() Qpredict, Qtarget = self.calculate_td_errors(samples) for i in range(self.batch_size): error = math.fabs(float(Qpredict[i] - Qtarget[i])) self.replay.update(idxs[i], error) JDQ = self.loss(Qpredict, Qtarget) JE = self.JE(samples) Jn = self.Jn(samples, Qpredict) L2 = self.L2(self.policy.predictNet.parameters()) J = JDQ + self.lambda2 * JE + self.lambda1 * Jn + self.lambda3 * L2 J.backward() self.opt.step() self.counter += 1 if self.counter % self.tau == 0: self.policy.updateTargetNet()
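JE above is the DQfD large-margin supervised term: for a demonstrated pair (s, aE) it penalizes max_a [Q(s, a) + l(aE, a)] - Q(s, aE), where the margin l is zero for the expert action and l_margin otherwise, so the expert action must beat every other action by at least the margin. A minimal standalone sketch of that term (q_values and large_margin_loss are hypothetical names, not objects from the class above):

import torch


def large_margin_loss(q_values, expert_action, margin=0.8):
    """DQfD supervised term for one state: max_a [Q(s,a) + l(aE,a)] - Q(s,aE)."""
    margins = torch.full_like(q_values, margin)
    margins[expert_action] = 0.0                      # no margin on the expert action
    return (q_values + margins).max() - q_values[expert_action]


# Example: the expert picked action 2, but action 0 currently scores highest.
q = torch.tensor([1.5, 0.3, 1.2])
print(large_margin_loss(q, expert_action=2))          # tensor(1.1000) = (1.5 + 0.8) - 1.2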
class QLearning: def __init__( self, k, d, env_name, env_dir, env_fixed_xo, n_hidden, save_and_load_path, load, tensorboard_path, logger_path, learn_wall_time_limit, prioritized, trial_size, learning_rate=0.005, # we have finite horizon, so we don't worry about reward explosion # see: https://goo.gl/Ew4629 (Other Prediction Problems and Update Rules) reward_decay=1.0, e_greedy=0.8, save_model_iter=5000, memory_capacity=300000, memory_capacity_start_learning=10000, batch_size=64, e_greedy_increment=0.0005, replace_target_iter=500, planning=False, random_seed=None, ): self.env_name = env_name self.env, self.n_features, self.n_actions = self.get_env( env_name, env_dir, env_fixed_xo, k, d) self.save_and_load_path = save_and_load_path self.load = load self.path_check(load) # create a graph for model variables and session self.graph = tf.Graph() self.sess = tf.Session(graph=self.graph) if not load: self.random_seed = random_seed numpy.random.seed(self.random_seed) tf.set_random_seed(self.random_seed) self.tensorboard_path = tensorboard_path self.logger_path = logger_path self.tb_writer = TensorboardWriter( folder_name=self.tensorboard_path, session=self.sess) self.logger = Logger(self.logger_path) self.n_hidden = n_hidden self.lr = learning_rate self.gamma = reward_decay self.epsilon_max = e_greedy self.save_model_iter = save_model_iter self.memory_capacity = memory_capacity self.memory_capacity_start_learning = memory_capacity_start_learning self.learn_wall_time_limit = learn_wall_time_limit self.batch_size = batch_size self.epsilon_increment = e_greedy_increment self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max self.prioritized = prioritized # decide to use prioritized experience replay or not self.trial_size = trial_size self.replace_target_iter = replace_target_iter self.planning = planning # decide to use planning for additional learning with self.graph.as_default(): self._build_net() self.sess.run(tf.global_variables_initializer()) self.memory = Memory(prioritized=self.prioritized, capacity=self.memory_capacity, n_features=self.n_features, n_actions=self.n_actions, batch_size=self.batch_size, planning=self.planning, qsa_feature_extractor=self.env.step_state, qsa_feature_extractor_for_all_acts=self.env. all_possible_next_states) self.learn_iterations = 0 self.learn_wall_time = 0. self.sample_iterations = 0 self.sample_wall_time = 0. self.last_cpu_time = 0. self.last_wall_time = 0. 
self.last_save_time = time.time() self.last_test_learn_iterations = 0 else: self.load_model() self.memory_lock = multiprocessing.Lock( ) # lock for memory modification def get_env(self, env_name, env_dir, env_fixed_xo, k, d): # n_actions: # of one-card modification + 1 for not changing any card # n_features: input dimension to qlearning network (x_o and x_p plus time step as a feature) if env_name == 'env_nn': from environment.env_nn import Environment if env_dir: env = Environment.load(env_dir) else: raise NotImplementedError( 'we enforce environment has been created') n_features, n_actions = 2 * env.k + 1, env.d * (env.k - env.d) + 1 elif env_name == 'env_nn_noisy': from environment.env_nn_noisy import Environment if env_dir: env = Environment.load(env_dir) else: raise NotImplementedError( 'we enforce environment has been created') n_features, n_actions = 2 * env.k + 1, env.d * (env.k - env.d) + 1 elif env_name == 'env_greedymove': from environment.env_greedymove import Environment if env_dir: env = Environment.load(env_dir) else: raise NotImplementedError( 'we enforce environment has been created') n_features, n_actions = 2 * env.k + 1, env.d * (env.k - env.d) + 1 elif env_name == 'env_gamestate': from environment.env_gamestate import Environment if env_dir: env = Environment.load(env_dir) else: raise NotImplementedError( 'we enforce environment has been created') n_features, n_actions = 2 * env.k + 1, env.d * (env.k - env.d) + 1 return env, n_features, n_actions def path_check(self, load): save_and_load_path_dir = os.path.dirname(self.save_and_load_path) if load: assert os.path.exists( save_and_load_path_dir ), "model path not exist:" + save_and_load_path_dir else: os.makedirs(save_and_load_path_dir, exist_ok=True) # remove old existing models if any files = glob.glob(save_and_load_path_dir + '/*') for file in files: os.remove(file) def save_model(self): # save tensorflow with self.graph.as_default(): saver = tf.train.Saver() path = saver.save(self.sess, self.save_and_load_path) # save memory self.memory_lock.acquire() with open(self.save_and_load_path + '_memory.pickle', 'wb') as f: pickle.dump(self.memory, f, protocol=-1) # -1: highest protocol self.memory_lock.release() # save variables with open(self.save_and_load_path + '_variables.pickle', 'wb') as f: pickle.dump( (self.random_seed, self.tensorboard_path, self.logger_path, self.n_hidden, self.lr, self.gamma, self.epsilon_max, self.save_model_iter, self.memory_capacity, self.memory_capacity_start_learning, self.learn_wall_time_limit, self.batch_size, self.epsilon_increment, self.epsilon, self.prioritized, self.trial_size, self.replace_target_iter, self.planning, self.learn_iterations, self.sample_iterations, self.learn_wall_time, self.sample_wall_time, self.cpu_time, self.wall_time, self.last_test_learn_iterations), f, protocol=-1) self.last_save_time = time.time() print('save model to', path) def load_model(self): # load tensorflow with self.graph.as_default(): saver = tf.train.import_meta_graph(self.save_and_load_path + '.meta') saver.restore( self.sess, tf.train.latest_checkpoint( os.path.dirname(self.save_and_load_path))) # placeholders self.s = self.graph.get_tensor_by_name('s:0') # Q(s,a) feature self.s_ = self.graph.get_tensor_by_name('s_:0') # Q(s',a') feature self.rewards = self.graph.get_tensor_by_name('reward:0') # reward self.terminal_weights = self.graph.get_tensor_by_name( 'terminal:0') # terminal # variables self.q_eval = self.graph.get_tensor_by_name('eval_net/q_eval:0') self.eval_w1 = 
self.graph.get_tensor_by_name('eval_net/l1/w1:0') self.eval_b1 = self.graph.get_tensor_by_name('eval_net/l1/b1:0') self.eval_w2 = self.graph.get_tensor_by_name('eval_net/l2/w2:0') self.eval_b2 = self.graph.get_tensor_by_name('eval_net/l2/b2:0') self.q_next = self.graph.get_tensor_by_name('eval_net/q_next:0') self.q_target = self.graph.get_tensor_by_name("q_target:0") self.is_weights = self.graph.get_tensor_by_name("is_weights:0") self.loss = self.graph.get_tensor_by_name("loss:0") self.abs_errors = self.graph.get_tensor_by_name("abs_errors:0") # operations self.train_op = self.graph.get_operation_by_name('train_op') # load memory with open(self.save_and_load_path + '_memory.pickle', 'rb') as f: self.memory = pickle.load(f) # -1: highest protocol # load variables with open(self.save_and_load_path + '_variables.pickle', 'rb') as f: self.random_seed, \ self.tensorboard_path, self.logger_path, \ self.n_hidden, \ self.lr, self.gamma, \ self.epsilon_max, self.save_model_iter, \ self.memory_capacity, self.memory_capacity_start_learning, \ self.learn_wall_time_limit, self.batch_size, \ self.epsilon_increment, self.epsilon, \ self.prioritized, self.trial_size, \ self.replace_target_iter, self.planning, \ self.learn_iterations, \ self.sample_iterations, \ self.learn_wall_time, \ self.sample_wall_time, \ self.last_cpu_time, \ self.last_wall_time, \ self.last_test_learn_iterations = pickle.load(f) numpy.random.seed(self.random_seed) tf.set_random_seed(self.random_seed) self.tb_writer = TensorboardWriter(folder_name=self.tensorboard_path, session=self.sess) self.logger = Logger(self.logger_path) self.last_save_time = time.time() def _build_net(self): self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # Q(s,a) feature self.s_ = tf.placeholder(tf.float32, [None, self.n_actions, self.n_features], name='s_') # Q(s',a') feature self.rewards = tf.placeholder(tf.float32, [None], name='reward') # reward self.terminal_weights = tf.placeholder(tf.float32, [None], name='terminal') # terminal w_initializer, b_initializer = \ tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers # ------------------ build evaluate_net ------------------ with tf.variable_scope('eval_net'): # s is Q(s,a) feature, shape: (n_sample, n_features) # s_ Q(s',a') for all a' feature, shape: (n_sample, n_actions, n_features) with tf.variable_scope('l1'): self.eval_w1 = tf.get_variable( 'w1', [self.n_features, self.n_hidden], initializer=w_initializer) self.eval_b1 = tf.get_variable('b1', [self.n_hidden], initializer=b_initializer) # l1 shape: (n_sample, n_hidden) l1 = tf.nn.relu(tf.matmul(self.s, self.eval_w1) + self.eval_b1) # l1_ shape: shape: (n_sample, n_actions, n_hidden) l1_ = tf.nn.relu( tf.einsum('ijk,kh->ijh', self.s_, self.eval_w1) + self.eval_b1) with tf.variable_scope('l2'): self.eval_w2 = tf.get_variable('w2', [self.n_hidden, 1], initializer=w_initializer) self.eval_b2 = tf.get_variable('b2', [1], initializer=b_initializer) # out shape: (n_sample, 1) out = tf.matmul(l1, self.eval_w2) + self.eval_b2 # out_ shape: (n_sample, n_actions, 1), Q(s',a') for all a' feature out_ = tf.einsum('ijh,ho->ijo', l1_, self.eval_w2) + self.eval_b2 self.q_eval = tf.squeeze(out, name='q_eval') self.q_next = tf.squeeze(out_, name='q_next') # ------------------ loss function ---------------------- self.q_target = tf.add( self.rewards, self.terminal_weights * (self.gamma * tf.reduce_max(self.q_next, axis=1)), name='q_target') # We do not want the target to be used for computing the gradient 
self.q_target = tf.stop_gradient(self.q_target) # importance sampling weight self.is_weights = tf.placeholder(tf.float32, [None], name='is_weights') self.loss = tf.reduce_mean( self.is_weights * tf.squared_difference(self.q_target, self.q_eval), name='loss') self.abs_errors = tf.abs(self.q_target - self.q_eval, name='abs_errors') self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize( self.loss, name='train_op') def store_transition(self, s, a, r, s_, terminal): self.memory_lock.acquire() # transition is a tuple (current_state, action, reward, next_state, whether_terminal) self.memory.store((s, a, r, s_, terminal)) self.memory_lock.release() def update_memory_priority(self, exp_ids, abs_errors): """ update memory priority """ self.memory_lock.acquire() self.memory.update_priority(exp_ids, abs_errors) self.memory_lock.release() def choose_action(self, state, next_possible_states, next_possbile_actions, epsilon_greedy=True): pred_q_values = self.sess.run(self.q_eval, feed_dict={ self.s: next_possible_states }).flatten() if not epsilon_greedy or np.random.uniform() < self.epsilon: action_idx = np.argmax(pred_q_values) else: action_idx = np.random.choice( numpy.arange(len(next_possbile_actions))) action = next_possbile_actions[action_idx] pred_q_value = pred_q_values[action_idx] return action, pred_q_value # def _replace_target_params(self): # with self.graph.as_default(): # t_params = tf.get_collection('target_net_params') # e_params = tf.get_collection('eval_net_params') # self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)]) # print('target_params_replaced') def planning_learn(self, qsa_next_features, qsa_features): """ additional learning from planning """ raise NotImplementedError @property def cpu_time(self): cpu_time = psutil.Process().cpu_times() return cpu_time.user + cpu_time.system + cpu_time.children_system + cpu_time.children_user + self.last_cpu_time @property def wall_time(self): return time.time() - psutil.Process().create_time( ) + self.last_wall_time def learn(self): while True: if self.wall_time > self.learn_wall_time_limit: break if self.memory_size() < self.memory_capacity_start_learning: print('LEARN:{}:wait for more samples:wall time:{}'.format( self.learn_iterations, self.wall_time)) time.sleep(2) continue # don't learn too fast if self.learn_iterations > self.sample_iterations > 0: time.sleep(0.2) continue learn_time = time.time() qsa_feature, qsa_next_features, rewards, terminal_weights, is_weights, exp_ids \ = self.memory.sample() _, loss, abs_errors = self.sess.run( [self.train_op, self.loss, self.abs_errors], feed_dict={ self.s: qsa_feature, self.s_: qsa_next_features, self.rewards: rewards, self.terminal_weights: terminal_weights, self.is_weights: is_weights }) if self.prioritized: self.update_memory_priority(exp_ids, abs_errors) mem_total_p = self.memory.memory.tree.total_p else: mem_total_p = -1 if self.planning: self.planning_learn() self.epsilon = self.cur_epsilon() learn_time = time.time() - learn_time self.learn_iterations += 1 self.learn_wall_time += learn_time print( 'LEARN:{}:mem_size:{}:virtual:{}:wall_t:{:.2f}:total:{:.2f}:cpu_time:{:.2f}:pid:{}:wall_t:{:.2f}:mem_p:{:.2f}' .format(self.learn_iterations, self.memory_size(), self.memory_virtual_size(), learn_time, self.learn_wall_time, self.cpu_time, os.getpid(), self.wall_time, mem_total_p)) def cur_epsilon(self): return self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max def tb_write(self, tags, values, step): """ write to tensorboard """ if 
self.tb_writer: self.tb_writer.write(tags, values, step) def get_logger(self): return self.logger def memory_size(self): return self.memory.size def memory_virtual_size(self): return self.memory.virtual_size def function_call_counts_training(self): """ number of function calls during training, which equals to memory virtual size """ return self.memory.virtual_size def collect_samples(self, EPISODE_SIZE, TEST_PERIOD): """ collect samples in a process """ for i_episode in range(self.sample_iterations, EPISODE_SIZE): if self.wall_time > self.learn_wall_time_limit: self.save_model() break # don't sample too fast while 0 < self.learn_iterations < self.sample_iterations - 3: time.sleep(0.2) sample_wall_time = time.time() cur_state = self.env.reset() for i_episode_step in range(self.trial_size): # prevent wall time over limit during sampling if self.wall_time > self.learn_wall_time_limit: self.save_model() break # save every 6 min if time.time() - self.last_save_time > 6 * 60: self.save_model() next_possible_states, next_possible_actions = self.env.all_possible_next_state_action( cur_state) action, _ = self.choose_action(cur_state, next_possible_states, next_possible_actions, epsilon_greedy=True) cur_state_, reward = self.env.step(action) terminal = True if i_episode_step == self.trial_size - 1 else False self.store_transition(cur_state, action, reward, cur_state_, terminal) cur_state = cur_state_ sample_wall_time = time.time() - sample_wall_time self.sample_iterations += 1 self.sample_wall_time += sample_wall_time # end_state distilled output = reward (might be noisy) end_output = self.env.still(reward) mem_total_p = -1 if not self.prioritized else self.memory.memory.tree.total_p print( 'SAMPLE:{}:finished output:{:.5f}:cur_epsilon:{:.5f}:mem_size:{}:virtual:{}:wall_t:{:.2f}:total:{:.2f}:pid:{}:wall_t:{:.2f}:mem_p:{:.2f}' .format(self.sample_iterations, end_output, self.cur_epsilon(), self.memory_size(), self.memory_virtual_size(), sample_wall_time, self.sample_wall_time, os.getpid(), self.wall_time, mem_total_p)) # test every once a while if self.memory_virtual_size() >= self.memory_capacity_start_learning \ and self.learn_iterations % TEST_PERIOD == 0 \ and self.learn_iterations > self.last_test_learn_iterations \ and self.wall_time < self.learn_wall_time_limit: #self.env.test(TRIAL_SIZE, RANDOM_SEED, self.learn_step_counter, self.wall_time, self.env_name, # rl_model=self) max_val_rl, max_state_rl, end_val_rl, end_state_rl, duration_rl, _, _ = self.exp_test( ) max_val_mc, max_state_mc, _, _, duration_mc, _ = self.env.monte_carlo( ) self.logger.log_test(output_mc=max_val_mc, state_mc=max_state_mc, duration_mc=duration_mc, output_rl=max_val_rl, state_rl=max_state_rl, duration_rl=duration_rl, learn_step_counter=self.learn_iterations, wall_time=self.wall_time) self.tb_write( tags=[ 'Prioritized={0}, gamma={1}, seed={2}, env={3}, fixed_xo={4}/(Max_RL-MC)' .format(self.prioritized, self.gamma, self.random_seed, self.env_name, self.env.if_set_fixed_xo()), 'Prioritized={0}, gamma={1}, seed={2}, env={3}, fixed_xo={4}/Ending Output (RL)' .format(self.prioritized, self.gamma, self.random_seed, self.env_name, self.env.if_set_fixed_xo()), ], values=[max_val_rl - max_val_mc, end_val_rl], # note we record end value for RL step=self.learn_iterations) self.last_test_learn_iterations = self.learn_iterations def exp_test(self, debug=True): """ If debug is true, find the max output along the search. 
If debug is false, only return the end output """ cur_state = self.env.reset() duration = time.time() start_state = cur_state.copy() end_output = max_output = -99999. max_state = None for i in range(self.trial_size): next_possible_states, next_possible_actions = self.env.all_possible_next_state_action( cur_state) action, q_val = self.choose_action(cur_state, next_possible_states, next_possible_actions, epsilon_greedy=False) if debug: # reward is noisy output cur_state, reward = self.env.step(action) # noiseless, stilled end output end_output = self.env.still( self.env.output_noiseless(cur_state)) print( 'TEST :{}:output: {:.5f}, qval: {:.5f}, reward {:.5f}, at {}' .format(i, end_output, q_val, reward, cur_state)) if end_output > max_output: max_output = end_output max_state = cur_state.copy() else: cur_state = self.env.step_without_reward(action) print('TEST :{}:qval: {:.5f}, at {}'.format( i, q_val, cur_state)) duration = time.time() - duration end_state = cur_state if not debug: end_output = self.env.still(self.env.output_noiseless(cur_state)) if_set_fixed_xo = self.env.if_set_fixed_xo() return max_output, max_state, end_output, end_state, duration, if_set_fixed_xo, start_state # very adhoc methods to query environment's information def set_env_fixed_xo(self, x_o): self.env.set_fixed_xo(x_o) def get_env_if_set_fixed_xo(self): return self.env.if_set_fixed_xo() def get_learn_iteration(self): return self.learn_iterations def get_wall_time(self): return self.wall_time
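collect_samples() and learn() above are written as independent loops that coordinate only through the shared replay memory (guarded by memory_lock) and the learn/sample iteration counters, so they are intended to run concurrently. Below is a minimal driver sketch, assuming the constructor arguments are already collected in a kwargs dict and using threads so the TensorFlow session is shared; this driver is an assumption, not code from the original repository.

import threading

agent = QLearning(**kwargs)       # kwargs: the constructor arguments listed above (assumed prepared)

learner = threading.Thread(target=agent.learn, daemon=True)
sampler = threading.Thread(target=agent.collect_samples,
                           kwargs={"EPISODE_SIZE": 100000, "TEST_PERIOD": 100},
                           daemon=True)
learner.start()
sampler.start()
sampler.join()                    # both loops stop once learn_wall_time_limit is exceeded
learner.join()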
class Agent(): """ The Agent interacts with the environment and learns from the interactions and the environment """ def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr_actor, lr_critic, \ weight_decay, epsilon, epsilon_decay, update_every, update_times, start_learning, random_seed, \ mu ,theta, sigma): """ Parameters ========== state_size (in): dimension of each state action_size (int): dimension of each action random_seed (int): random seed epsilon (int): start value of epsilon, default 1.0 buffer_size (int): size of the memorybuffer batch_size (int): size of the batch gamma (float): discounted value, must be between 0 and 1 tau (float): blend parameter for the soft update, has to be between 0 and 1 lr_actor (float): learning rate for the actor dnn lr_critic (float): learning rate for the critic dnn weight_decay (float): L2 penalty epsilon_decay (float): factor to reduce epsilon frequently update_every (int): update frequence update_times (int): how many times the weights should be updated at one update step """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.random_seed = random_seed self.epsilon = epsilon self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay = weight_decay self.epsilon_decay = epsilon_decay self.update_every = update_every self.update_times = update_times self.start_learning = start_learning self.theta = theta self.sigma = sigma self.mu = mu self.update_every_x = 0 self.noise_getter_mean = 0.0 # The Actor ########### self.actor_local = Actor(self.state_size, self.action_size, self.random_seed).to(device) self.actor_target = Actor(self.state_size, self.action_size, self.random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.lr_actor) # The Critic ############ self.critic_local = Critic(self.state_size, self.action_size, self.random_seed).to(device) self.critic_target = Critic(self.state_size, self.action_size, self.random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.lr_critic, weight_decay = self.weight_decay) # for target_param, param in zip(self.actor_target.parameters(), self.actor_local.parameters()): # target_param.data.copy_(param.data) # for target_param, param in zip(self.critic_target.parameters(), self.critic_local.parameters()): # target_param.data.copy_(param.data) # The Replay Buffer ################### self.PER = Memory(self.buffer_size) # The "Ornstein-Uhlenbeck-Noise" ################################ self.noise = OUNoise(self.action_size, self.random_seed, 0., self.theta, self.sigma) def step(self, state, action, reward, next_state, done, learn_reset=True): """ add info to the ringbuffer, if enough entries are available --> learn """ #calculate the TD error state_calc = torch.from_numpy(state).float().to(device) self.actor_local.eval() self.critic_target_eval() ## necessary??? 
self.actor_target.eval() with torch.no_grad(): action_calc = torch.from_numpy(np.array(action)).float().to(device) next_state_calc = torch.from_numpy(next_state).float().to(device) old_val = self.critic_local(state_calc, action_calc).cpu().data.numpy() actions_next = self.actor_target(next_state_calc) target_val = self.critic_target(next_state_calc, actions_next).cpu().data.numpy() if done: target = reward else: target = reward + self.gamma * target_val error = abs(old_val - target) self.actor_local.train() self.critic_target.train() self.actor_target.train() self.PER.add(error, (state, action, reward, next_state, done)) self.update_every_x = (self.update_every_x+1) % self.update_every if self.update_every_x == 0: self.reset() # start learning once enough entries are buffered if len(self.PER) > int(self.batch_size * self.start_learning): for _ in range(int(self.update_times)): sample_batch, idxs, is_weights = self.PER.sample(self.batch_size) self.learn(sample_batch, idxs, is_weights, learn_reset) def act(self, state, add_noise = True): """ choose an action for the given state under the current policy Parameters: =========== state: the current state add_noise (bool): True: adds noise for exploration return: the estimated action """ ### adapt the state and send it to the device state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: noise_ = self.noise.sample() self.noise_getter_mean = np.mean(noise_) action += self.epsilon * noise_ ### make sure the action stays between -1 and 1 even with the added noise return np.clip(action, -1.0, 1.0) def get_epsilon(self): """ getter function: used to monitor epsilon """ return self.epsilon def get_noise_mean(self): """ getter function: used to monitor the noise """ return self.noise_getter_mean def learn(self, sample_batch, idxs, is_weights, noise_reset=True): """ update the DNNs Parameters: ========== sample_batch: batch sample from the prioritized replay buffer """ sample_batch = np.array(sample_batch).transpose() states = torch.from_numpy(np.vstack(sample_batch[0])).float().to(device) actions = torch.from_numpy(np.vstack(sample_batch[1])).float().to(device) rewards = torch.from_numpy(np.vstack(sample_batch[2])).float().to(device) next_states = torch.from_numpy(np.vstack(sample_batch[3])).float().to(device) dones = torch.from_numpy(np.vstack(sample_batch[4]).astype(np.uint8)).float().to(device) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute the importance-sampling weighted critic loss Q_expected = self.critic_local(states, actions) critic_loss = (torch.from_numpy(np.array(is_weights)).float().to(device) * F.mse_loss(Q_expected, Q_targets, reduction='none').squeeze()).mean() # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) self.epsilon = max(0.001, self.epsilon * self.epsilon_decay) if noise_reset: self.noise.reset() def soft_update(self, local_model, target_model, tau): """ θ_target = τ*θ_local + (1 - τ)*θ_target Parameters ========== local_model (Actor or Critic object): model whose parameters are copied from target_model (Actor or Critic object): model whose parameters are updated tau (float): blend parameter, has to be between 0 and 1 """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def reset(self): self.noise.reset()
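The agent builds its exploration noise from an OUNoise object whose class is not shown in this section. The sketch below is one common Ornstein-Uhlenbeck implementation matching the constructor signature used above (size, seed, mu, theta, sigma); treat it as an assumption about that class rather than the original definition.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise that decays towards mu."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx
        return self.state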