def load_checkpoints(filepath):
    """Rebuild a QNetwork from a checkpoint written with ``torch.save``.

    The checkpoint is read as a mapping that stores the constructor
    arguments (``state_size``, ``action_size``, ``hidden_layers``) next to
    the trained weights (``state_dict``).

    Params
    ======
        filepath: location of the checkpoint, handed to ``torch.load``

    Returns
    =======
        QNetwork instance with the stored weights loaded
    """
    saved = torch.load(filepath)
    network = QNetwork(
        saved['state_size'],
        saved['action_size'],
        saved['hidden_layers'],
    )
    network.load_state_dict(saved['state_dict'])
    return network
def load_model_into_agent(agent):
    """Load the pretrained weights stored in MODEL_TO_LOAD into ``agent``.

    The local and the target network each receive their *own* QNetwork
    instance initialised with the same weights. Assigning one shared
    object to both attributes would make every update of the local
    network change the target network as well.

    Params
    ======
        agent: object whose ``qnetwork_local`` and ``qnetwork_target``
            attributes are replaced
    """
    # map_location lets a checkpoint saved on another device be restored.
    state_dict = torch.load(MODEL_TO_LOAD, map_location=device)

    def build_network():
        network = QNetwork(PARAM.STATE_SIZE, PARAM.ACTION_SIZE, 0).to(device)
        network.load_state_dict(state_dict)
        return network

    agent.qnetwork_target = build_network()
    agent.qnetwork_local = build_network()
class DDQN(nn.Module):
    """Q-learning agent with an online network and a periodically synced target.

    NOTE(review): despite the class name, the bootstrap target below takes
    the max over the *target* network's outputs (the plain DQN rule), not
    the Double-DQN "select with online net, evaluate with target net" rule.
    Confirm which one is intended.
    """

    # Discount factor used when ``params`` carries no ``gamma`` attribute.
    DEFAULT_GAMMA = 0.99
    # Gradients are clamped element-wise to [-GRAD_CLIP, GRAD_CLIP].
    GRAD_CLIP = 1.0

    def __init__(self, obs, ac, config):
        """Create the online/target networks.

        Params
        ======
            obs: first argument forwarded to QNetwork
            ac: second argument forwarded to QNetwork
            config: object providing ``target_net_update_freq``
        """
        super().__init__()
        self.q = QNetwork(obs, ac)
        self.target = QNetwork(obs, ac)
        # Both networks start from identical weights.
        self.target.load_state_dict(self.q.state_dict())
        self.target_net_update_freq = config.target_net_update_freq
        self.update_counter = 0

    def get_action(self, x):
        """Return the greedy action (as a Python int) for a single-row batch ``x``."""
        with torch.no_grad():
            a = self.q(x).max(1)[1]
        return a.item()

    def update_policy(self, adam, memory, params):
        """Run one optimisation step on a batch sampled from ``memory``.

        Params
        ======
            adam: optimizer holding the parameters of ``self.q``
            memory: object whose ``sample(batch_size)`` returns
                (states, actions, rewards, next_states, masks)
            params: object providing ``batch_size`` and, optionally,
                ``gamma`` (defaults to ``DEFAULT_GAMMA``)
        """
        b_states, b_actions, b_rewards, b_next_states, b_masks = memory.sample(
            params.batch_size)
        states = torch.tensor(b_states).float()
        actions = torch.tensor(b_actions).long().reshape(-1, 1)
        rewards = torch.tensor(b_rewards).float().reshape(-1, 1)
        next_states = torch.tensor(b_next_states).float()
        masks = torch.tensor(b_masks).float().reshape(-1, 1)
        gamma = getattr(params, "gamma", self.DEFAULT_GAMMA)

        current_q_values = self.q(states).gather(1, actions)
        with torch.no_grad():
            max_next_q_vals = self.target(next_states).max(1)[0].reshape(-1, 1)
            # masks multiplies the bootstrap term, so a zero mask removes it.
            expected_q_vals = rewards + max_next_q_vals * gamma * masks

        loss = F.mse_loss(current_q_values, expected_q_vals)
        adam.zero_grad()
        loss.backward()
        # Same element-wise clamp as before, but parameters that received no
        # gradient (grad is None) are skipped instead of raising.
        nn.utils.clip_grad_value_(self.q.parameters(), self.GRAD_CLIP)
        adam.step()

        self.update_counter += 1
        if self.update_counter % self.target_net_update_freq == 0:
            self.update_counter = 0
            self.target.load_state_dict(self.q.state_dict())
class DQNAgent():
    """DQN agent with epsilon-greedy exploration and a hard-synced target network."""

    def __init__(self, state_size, action_size):
        """Build the networks, optimizer and replay memory.

        Params
        ======
            state_size: first argument forwarded to QNetwork
            action_size: number of discrete actions
        """
        self.state_size = state_size
        self.action_size = action_size
        self.policy_network = QNetwork(state_size, action_size).to(device)
        self.target_network = QNetwork(state_size, action_size).to(device)
        # Start the target from the policy weights; otherwise the first
        # SYNC_TARGET_EVERY updates bootstrap from an unrelated random net.
        self.target_network.load_state_dict(self.policy_network.state_dict())
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=LR)
        self.eps = EPS_START
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0
        self.learn_count = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn once every UPDATE_EVERY calls."""
        self.memory.store_transition(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample(BATCH_SIZE, device)
            self.learn(experiences)

    def act(self, state):
        """Return an epsilon-greedy action for ``state`` (a numpy array)."""
        if np.random.rand() < self.eps:
            return np.random.randint(self.action_size)
        state = torch.from_numpy(state).unsqueeze(0).to(device)
        # Action selection never needs gradients.
        with torch.no_grad():
            action_values = self.policy_network(state)
        return torch.argmax(action_values).item()

    def update_eps(self):
        """Decay epsilon multiplicatively, never going below EPS_END."""
        self.eps = max(EPS_END, EPS_DECAY * self.eps)

    def learn(self, experiences):
        """Run one gradient step on a batch of transitions.

        Params
        ======
            experiences: tuple (states, actions, rewards, next_states, dones)
        """
        states, actions, rewards, next_states, dones = experiences
        Q_current = self.policy_network(states).gather(1, actions)
        # The target is a constant for this step: computing it without
        # autograd avoids building a graph through the target network.
        with torch.no_grad():
            Q_targets_next = self.target_network(next_states).max(1)[0].unsqueeze(1)
            Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        loss = F.mse_loss(Q_current, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.learn_count += 1
        if self.learn_count % SYNC_TARGET_EVERY == 0:
            self.target_network.load_state_dict(
                self.policy_network.state_dict())
def load_qnet(self, model_name):
    """Restore the local Q-Network from ``<model_name>.pth``.

    Params
    ======
        model_name (str): name of the Q-Network
    """
    # The saved QNetwork is always the CPU version, so it is rebuilt first
    # and only moved to `device` afterwards.
    weights_path = model_name + '.pth'
    restored = QNetwork(self.aug_state_size,
                        self.action_size,
                        self.hsize1,
                        self.hsize2,
                        seed=None)
    restored.load_state_dict(torch.load(weights_path))
    restored_on_device = restored.to(device)
    # Copy the loaded network weights into the local network.
    self.qnetwork_local.update_weights(restored_on_device)
def evaluate(model_path, history_num, max_episode_steps, episode_num, result_save_path):
    """Run a trained Q-network on MountainCar-v0 and pickle the results.

    Params
    ======
        model_path: checkpoint with keys 'model_hyper' (QNetwork constructor
            arguments) and 'model' (state dict)
        history_num: argument forwarded to State
        max_episode_steps (int): upper bound on steps per episode
        episode_num (int): number of episodes to run
        result_save_path: file receiving the pickled tuple
            (test_success_history, test_reward_history)

    An episode is recorded only when the environment reports ``done``
    within ``max_episode_steps``; it counts as a success when the first
    observation component is >= 0.5.
    """
    checkpoint = torch.load(model_path)
    qnetwork = QNetwork(*checkpoint['model_hyper'])
    qnetwork.load_state_dict(checkpoint['model'])
    env = gym.make('MountainCar-v0')
    test_success_history = []
    test_reward_history = []
    try:
        for episode in range(episode_num):
            print('episode %d' % (episode))
            observation = env.reset()
            # initialize state
            state = State(history_num)
            state.init_state(observation)
            done = False
            reward_sum = 0
            for t in range(max_episode_steps):
                env.render()
                state.display()
                # Select the action with the max Q value; evaluation needs
                # no gradients.
                with torch.no_grad():
                    action = qnetwork.decide_action(state.toTensor().view(1, -1))
                action = action.sum().item()
                observation, reward, done, info = env.step(action)
                reward_sum = reward_sum + reward
                if done:
                    print('done')
                    print(reward_sum)
                    test_success_history.append(bool(observation[0] >= 0.5))
                    test_reward_history.append(reward_sum)
                    break
                state.update_state_by_observation(observation, action)
            print('- ' * 100)
    finally:
        # Release the render window / simulator even if an episode fails.
        env.close()
    print('save to %s' % (result_save_path))
    with open(result_save_path, 'wb') as f:
        pkl.dump((test_success_history, test_reward_history), f)
class AgentPriority():
    """Interacts with and learns from the environment using prioritized replay."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers,
                 lr=5e-4,
                 alpha=0.5,
                 beta=0.4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list[int, int, ...]): size of hidden layers
            lr (float): learning rate
            alpha (float (0<=alpha<=1)): parameter alpha for priority
            beta (float (0<=beta<=1)): parameter for importance sampling weight
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.lr = lr
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed,
                                       hidden_layers).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed,
                                        hidden_layers).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.alpha = alpha
        self.beta = beta
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   self.alpha, self.beta)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        # discount
        self.gamma = GAMMA

        self.checkpoint = self._build_checkpoint()
        self.checkpointfile = 'priority_ddqn.pth'

    def _build_checkpoint(self):
        """Describe the current local network as a dict suitable for torch.save."""
        return {
            "input_size": self.state_size,
            "output_size": self.action_size,
            "hidden_layers": [
                each.out_features
                for each in self.qnetwork_local.hidden_layers
            ],
            "state_dict": self.qnetwork_local.state_dict()
        }

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn NUM_LEARNS times."""
        # Save experience in replay memory
        delta = self.comp_delta(state, action, reward, next_state, done)
        self.memory.add(state, action, reward, next_state, done, delta)

        # Learn NUM_LEARNS times per every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) >= MIN_BUF_SIZE:
            self.memory.set_priority_params(self.alpha, self.beta)
            for i in range(NUM_LEARNS):
                # NOTE(review): re-sorting, the target sync and the annealing
                # are all tied to the SORT_EVERY cadence here — confirm that
                # this is the intended nesting.
                if i % SORT_EVERY == 0:
                    # Sort memory based on delta every SORT_EVERY learnings
                    self.memory.argsort_deltas()
                    # Update q_target with q_local
                    self.update_qtarget()
                    # If PARAMETER_ANNEALING is set to True, anneal alpha & beta.
                    if PARAMETER_ANNEALING:
                        self.parameter_anneal()
                experiences, weights, mem_idxs = self.memory.sample()
                self.learn(experiences, weights, mem_idxs)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(np.int32)
        else:
            return random.choice(np.arange(self.action_size)).astype(np.int32)

    def learn(self, experiences, weights, mem_idxs):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            weights: per-sample factors multiplied into both sides of the loss
            mem_idxs (list of ints): indices in the replay buffer corresponding
                to the given experiences (used to update delta)
        """
        states, actions, rewards, next_states, dones = experiences

        # Get argmax of Q values (for next states) from Q_local model
        Q_local_actions = self.qnetwork_local(next_states).detach().max(
            1)[1].unsqueeze(1)
        # Evaluate those actions with the Q_target model
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, Q_local_actions).detach()
        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # update deltas in self.memory
        deltas = (Q_targets - Q_expected).detach().cpu().numpy()
        self.memory.update_deltas(deltas, mem_idxs)

        # Compute loss
        loss = F.mse_loss(weights * Q_expected, weights * Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_qtarget(self):
        """Hard update: copy every local parameter into the target network."""
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(local_param.data)

    def comp_delta(self, state, action, reward, next_state, done):
        """Compute the TD error (delta) of a single experience.

        delta = reward + gamma * max_a(Q_target(next_state, a)) * (1 - done)
                - Q_local(state, action)
        """
        state_ts = torch.from_numpy(np.expand_dims(state, 0)).float().to(device)
        action_ts = torch.from_numpy(np.array([[action]])).long().to(device)
        reward_ts = torch.from_numpy(np.array([[reward]])).float().to(device)
        next_state_ts = torch.from_numpy(np.expand_dims(next_state,
                                                        0)).float().to(device)
        done_ts = torch.from_numpy(np.array([[int(done)]])).float().to(device)

        Q_targets_next = self.qnetwork_target(next_state_ts).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = reward_ts + (self.gamma * Q_targets_next * (1 - done_ts))
        Q_expected = self.qnetwork_local(state_ts).gather(1, action_ts)
        delta = (Q_targets - Q_expected).detach().cpu().numpy()[0, 0]
        return delta

    def get_gamma(self):
        """Return the discount factor."""
        return self.gamma

    def save_model(self):
        """Write the current local network to ``self.checkpointfile``."""
        # Rebuild the snapshot so that it describes the network in use now,
        # not the one that existed when the agent was constructed.
        self.checkpoint = self._build_checkpoint()
        torch.save(self.checkpoint, self.checkpointfile)

    def set_lr(self, lr):
        """Set the learning rate on the agent and on its optimizer."""
        self.lr = lr
        # Without this the optimizer would keep stepping with the old rate.
        for group in self.optimizer.param_groups:
            group["lr"] = lr

    def load_model(self, filepath):
        """Replace the networks with the ones described by a saved checkpoint.

        The local network is rebuilt from the checkpoint, moved to `device`,
        and the target network and optimizer are rebuilt to match it.
        """
        checkpoint = torch.load(filepath, map_location=device)
        net_args = (checkpoint["input_size"], checkpoint["output_size"],
                    self.seed, checkpoint["hidden_layers"])
        self.qnetwork_local = QNetwork(*net_args).to(device)
        self.qnetwork_local.load_state_dict(checkpoint["state_dict"])
        # The target must have the same architecture and start in sync.
        self.qnetwork_target = QNetwork(*net_args).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        # The previous optimizer still references the discarded parameters.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        self.checkpoint = self._build_checkpoint()

    def set_uniform_sampling(self):
        """Set alpha to 0.0 and beta to 1.0 so that the agent becomes
        equivalent to uniform sampling.
        """
        self.alpha = 0.0
        self.beta = 1.0
        self.memory.set_priority_params(self.alpha, self.beta)

    def parameter_anneal(self):
        """Move alpha towards 0.0 and beta towards 1.0 by one annealing step."""
        self.alpha = max(0.0, self.alpha - ALPHA_ANNEALING)
        self.beta = min(1.0, self.beta + BETA_ANNEALING)
        self.memory.set_priority_params(self.alpha, self.beta)
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, double_dqn=True):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            double_dqn (bool): use the Double DQN target instead of the DQN one
        """
        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def save(self, filename):
        """Write both networks to ``filename + ".local"`` / ``".target"``."""
        torch.save(self.qnetwork_local.state_dict(), filename + ".local")
        torch.save(self.qnetwork_target.state_dict(), filename + ".target")

    def load(self, filename):
        """Load whichever of the ``.local`` / ``.target`` files exist."""
        # map_location lets weights saved on another device be restored here.
        if os.path.exists(filename + ".local"):
            self.qnetwork_local.load_state_dict(
                torch.load(filename + ".local", map_location=device))
        if os.path.exists(filename + ".target"):
            self.qnetwork_target.load_state_dict(
                torch.load(filename + ".target", map_location=device))

    def step(self, state, action, reward, next_state, done, train=True):
        """Store a transition and, every UPDATE_EVERY calls, learn from a batch.

        With ``train=False`` the transition is still stored but no batch is
        sampled and no learning happens.
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return
        # If enough samples are available in memory, get random subset and learn
        if train and len(self.memory) > BATCH_SIZE:
            self.learn(self.memory.sample(), GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # The bootstrap target is a constant for this step in both variants,
        # so it is computed without autograd (the DQN branch already
        # detached it; the Double DQN branch did not).
        with torch.no_grad():
            if self.double_dqn:
                # Double DQN: choose with the local net, evaluate with the target
                q_best_action = self.qnetwork_local(next_states).max(1)[1]
                Q_targets_next = self.qnetwork_target(next_states).gather(
                    1, q_best_action.unsqueeze(-1))
            else:
                # DQN
                Q_targets_next = self.qnetwork_target(next_states).max(
                    1)[0].unsqueeze(-1)
            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class SAC(object):
    """Soft Actor-Critic agent.

    Holds a critic that returns two Q estimates, a target copy of that
    critic, a Gaussian policy and, optionally, a learned entropy
    coefficient (``log_alpha``).
    """

    def __init__(self, num_inputs, action_space, args):
        """Build networks and optimizers.

        Params
        ======
            num_inputs: first argument forwarded to the networks
            action_space: object with a ``shape`` attribute; also forwarded
                to GaussianPolicy
            args: object providing gamma, tau, alpha, target_update_interval,
                automatic_entropy_tuning, cuda, hidden_size and lr
        """
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning
        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, evaluate=False):
        """Return an action for ``state`` as a numpy array.

        With ``evaluate=False`` the first value returned by
        ``policy.sample`` is used, otherwise the third one.
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        """Run one critic, policy and (optionally) alpha update.

        Params
        ======
            memory: object whose ``sample(batch_size=...)`` returns
                (state, action, reward, next_state, mask) batches
            batch_size (int): number of transitions to sample
            updates (int): update counter, used for the target update interval

        Returns
        =======
            (qf1_loss, qf2_loss, policy_loss, alpha_loss, alpha) as floats
        """
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        # JQ = 𝔼(st,at)~D[0.5(Q2(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(qf2, next_q_value)
        critic_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return (qf1_loss.item(), qf2_loss.item(), policy_loss.item(),
                alpha_loss.item(), alpha_tlogs.item())

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None):
        """Save policy and critic weights.

        The ``models/`` directory is always created; paths default to
        ``models/sac_actor_<env>_<suffix>`` and
        ``models/sac_critic_<env>_<suffix>``.
        """
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs('models/', exist_ok=True)

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, device='cpu'):
        """Load policy and/or critic weights; a path of None is skipped.

        After the critic is loaded, the target critic is reset to the same
        weights so that the next update does not bootstrap from an
        unrelated target.
        """
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(
                torch.load(actor_path, map_location=torch.device(device)))
        if critic_path is not None:
            self.critic.load_state_dict(
                torch.load(critic_path, map_location=torch.device(device)))
            hard_update(self.critic_target, self.critic)
class SAC(object):
    """Soft Actor-Critic agent with either a Gaussian or a Deterministic policy.

    The Gaussian variant trains a separate value network (with a target
    copy); the Deterministic variant uses a target critic instead.
    """

    def __init__(self, num_inputs, action_space, args):
        """Build networks and optimizers.

        Params
        ======
            num_inputs: first argument forwarded to the networks
            action_space: object with a ``shape`` attribute
            args: object providing gamma, tau, alpha, policy,
                target_update_interval, automatic_entropy_tuning,
                hidden_size and lr
        """
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        # Read by update_parameters() for both policy types, so it must be
        # set regardless of the branch taken below.
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:  # noqa: E712
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs,
                                             args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        """Return an action for ``state`` as a numpy array.

        In training mode the first value of ``policy.sample`` is used. In
        eval mode the fourth value is used and, for the Gaussian policy,
        passed through tanh.
        """
        state = torch.FloatTensor(state).unsqueeze(0)
        # `== False` is kept so that non-bool arguments behave as before.
        if eval == False:  # noqa: E712
            self.policy.train()
            action, _, _, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.sample(state)
            if self.policy_type == "Gaussian":
                action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        """Run one update of the critic, the policy and (Gaussian) the value net.

        Returns
        =======
            (value_loss, q1_loss, q2_loss, policy_loss, alpha_loss) as floats
            followed by the alpha value used for logging
        """
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch).unsqueeze(1)
        mask_batch = torch.FloatTensor(np.float32(mask_batch)).unsqueeze(1)

        # Use two Q-functions to mitigate positive bias in the policy
        # improvement step that is known to degrade performance of value
        # based methods. Two Q-functions also significantly speed up
        # training, especially on harder tasks.
        expected_q1_value, expected_q2_value = self.critic(
            state_batch, action_batch)
        new_action, log_prob, _, mean, log_std = self.policy.sample(
            state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                # Alpha loss
                alpha_loss = -(
                    self.log_alpha *
                    (log_prob + self.target_entropy).detach()).mean()

                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()

                self.alpha = self.log_alpha.exp()
                alpha_logs = self.alpha.clone()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.)
                alpha_logs = self.alpha  # For TensorboardX logs

            # Including a separate function approximator for the soft value
            # can stabilize training.
            expected_value = self.value(state_batch)
            target_value = self.value_target(next_state_batch)
            next_q_value = reward_batch + mask_batch * self.gamma * (
                target_value).detach()
        else:
            # There is no need in principle to include a separate function
            # approximator for the state value. A target critic network is
            # used for the deterministic policy and the value network is
            # dropped completely.
            alpha_loss = torch.tensor(0.)
            alpha_logs = self.alpha  # For TensorboardX logs
            next_state_action, _, _, _, _, = self.policy.sample(
                next_state_batch)
            target_critic_1, target_critic_2 = self.critic_target(
                next_state_batch, next_state_action)
            target_critic = torch.min(target_critic_1, target_critic_2)
            next_q_value = reward_batch + mask_batch * self.gamma * (
                target_critic).detach()

        # Soft Q-function parameters can be trained to minimize the soft
        # Bellman residual
        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        # ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        q1_value_loss = F.mse_loss(expected_q1_value, next_q_value)
        q2_value_loss = F.mse_loss(expected_q2_value, next_q_value)
        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        if self.policy_type == "Gaussian":
            # Update V towards the min of the two Q-functions in order to
            # reduce overestimation bias from function approximation error.
            # JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            # ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st)))
            next_value = expected_new_q_value - (self.alpha * log_prob)
            value_loss = F.mse_loss(expected_value, next_value.detach())
        else:
            value_loss = torch.tensor(0.)

        # Reparameterization trick is used to get a low variance estimator
        # f(εt;st) = action sampled from the policy
        # εt is an input noise vector, sampled from some fixed distribution
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        # ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st)
        policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()
        policy_loss = policy_loss + mean_loss + std_loss

        # Every gradient is computed BEFORE any optimizer step. Stepping the
        # critic first would modify, in place, weights that the still
        # pending policy_loss graph has saved for its backward pass.
        self.policy_optim.zero_grad()
        policy_loss.backward()

        # policy_loss.backward() also left gradients on the critic (through
        # Q(s, π(s))); they are cleared here so that the critic is trained
        # on the Bellman residual only.
        self.critic_optim.zero_grad()
        (q1_value_loss + q2_value_loss).backward()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()

        self.critic_optim.step()
        if self.policy_type == "Gaussian":
            self.value_optim.step()
        self.policy_optim.step()

        # The target weights are moved towards the current weights every
        # n (args.target_update_interval) updates.
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)
        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return (value_loss.item(), q1_value_loss.item(), q2_value_loss.item(),
                policy_loss.item(), alpha_loss.item(), alpha_logs)

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        """Save policy, critic and (when the agent has one) value weights.

        Paths default to ``models/sac_{actor,critic,value}_<env>_<suffix>``.
        """
        os.makedirs('models/', exist_ok=True)

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        # Only the Gaussian variant creates a value network.
        value_net = getattr(self, "value", None)
        if value_net is not None:
            torch.save(value_net.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        """Load the given weights; a path of None is skipped.

        Target networks that exist on this agent are reset to the freshly
        loaded weights. ``value_path`` is ignored when the agent has no
        value network.
        """
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
            if getattr(self, "critic_target", None) is not None:
                hard_update(self.critic_target, self.critic)
        if value_path is not None and getattr(self, "value", None) is not None:
            self.value.load_state_dict(torch.load(value_path))
            hard_update(self.value_target, self.value)
class Agent():
    """DQN-style agent for one player of a multi-agent environment."""

    # NOTE(review): the defaults learning_rate=1e4 and target_tau=1e3 look
    # like they were meant to be 1e-4 and 1e-3 (with tau > 1 the soft update
    # below extrapolates instead of interpolating). They are left unchanged
    # here to keep the constructor interface stable — confirm and fix at the
    # call sites or in a dedicated change.
    def __init__(self,
                 state_size,
                 action_size,
                 behavior_name,
                 index_player,
                 replay_memory_size=1e4,
                 batch_size=512,
                 gamma=0.99,
                 learning_rate=1e4,
                 target_tau=1e3,
                 update_rate=100,
                 seed=0):
        """Create the networks, optimizer and replay memory.

        Params
        ======
            state_size: first argument forwarded to QNetwork
            action_size (int): number of discrete actions
            behavior_name: key passed to ``env.get_steps``
            index_player (int): row of this player in the observations
            replay_memory_size: replay buffer capacity (cast to int)
            batch_size (int): minibatch size
            gamma (float): discount factor
            learning_rate (float): Adam learning rate
            target_tau (float): interpolation factor of the soft update
            update_rate (int): learn every ``update_rate`` calls of model_step
            seed (int): random seed
        """
        self.state_size = state_size
        self.current_state = []
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        # random.seed() returns None, so the value is stored separately.
        random.seed(seed)
        self.seed = seed
        self.behavior_name = behavior_name
        self.index_player = index_player
        self.close_ball_reward = 0
        self.touch_ball_reward = 0

        # Two models are defined:
        # (a) a network that is updated every (step % update_rate == 0),
        # (b) a target network whose weights are moved towards network (a)
        #     at a slower (target_tau) rate.
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)
        # Initialize time step (for updating every update_rate steps)
        self.t_step = 0

    def load_model(self, path_model, path_target=None):
        """Load weights for the network and, optionally, the target network."""
        params = torch.load(path_model)
        self.network.set_params(params)
        self.network.load_state_dict(torch.load(path_model))
        if path_target is not None:
            self.target_network.load_state_dict(torch.load(path_target))

    def model_step(self, state, action, reward, next_state):
        """Store a transition and learn every ``update_rate`` calls."""
        # save experience in replay memory
        self.memory.add(state, action, reward, next_state)

        # learn every update_rate time steps
        self.t_step = self.t_step + 1
        if self.t_step % self.update_rate == 0:
            # if enough samples are available in memory, get random subset and learn
            self._replay(1)

    def choose_action(self, state, eps=0.0):
        """Return an epsilon-greedy action index in [0, action_size)."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, stp):
        """Run one gradient step; soft-update the target when ``stp % 100 == 0``.

        Params
        ======
            experiences: tuple (states, actions, rewards, next_states)
            gamma (float): discount factor
            stp (int): step counter deciding whether the target is updated
        """
        states, actions, rewards, next_states = experiences

        # Get Q values from current observations (s, a) using model network
        self.network.train()
        Q_sa = self.network(states).gather(1, actions)
        # Get max Q values for (s', a') from target model
        Q_sa_prime_target_values = self.target_network(next_states).max(
            1)[0].to(device).float().detach()
        # Compute Q targets for current states
        Q_sa_targets = rewards + gamma * Q_sa_prime_target_values.unsqueeze(1)

        # Compute loss (summed squared error)
        criterion = torch.nn.MSELoss(reduction='sum')
        loss = criterion(Q_sa.to(device), Q_sa_targets.to(device))

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        if stp % 100 == 0:
            print('Updating Model')
            self.soft_update(self.network, self.target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """target = tau * local + (1 - tau) * target, parameter by parameter."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def Read(self):
        """Read this player's sensors, update the shaping rewards, return the state.

        On any error while processing the observation, the touch/close
        rewards are reset to 0 and the previously stored state is returned.
        """
        decision_steps, terminal_steps = env.get_steps(self.behavior_name)
        try:
            signal_front = np.array(
                sensor_front_sig(
                    decision_steps.obs[0][self.index_player, :]))  # 3 x 11 x 8
            signal_back = np.array(
                sensor_back_sig(
                    decision_steps.obs[1][self.index_player, :]))  # 3 x 3 x 8
            self.current_state = np.concatenate((signal_front, signal_back),
                                                axis=1)

            count_close_to_ball = 0
            count_touch_ball = 0
            count_back_touch = 0
            count_back_close = 0
            self.rew_d_to_our_post = 0
            self.rew_for_ball_dist = -0.1

            # NOTE(review): the nesting of the checks inside both loops is
            # the most plausible reading of the original — confirm.
            # Front observation
            for ray in signal_front[0]:
                if ray[0] == 1.0:
                    count_close_to_ball += 1
                    self.rew_for_ball_dist = max(0.3 * (1 - ray[7]),
                                                 self.rew_for_ball_dist)
                    # Kicked the ball at the front
                    if ray[7] <= 0.03:
                        count_touch_ball += 1
                if ray[1] == 1.0:
                    self.rew_d_to_our_post = -0.1
                if ray[2] == 1.0:
                    self.rew_d_to_our_post = 0.1

            # Back observation
            for ray in signal_back[0]:
                if ray[0] == 1.0:
                    count_back_close += 0.2
                    # Touches the ball at the back
                    if ray[7] <= 0.03:
                        count_back_touch += 0.3

            self.back_touch = 1 if count_back_touch > 0 else 0.2
            self.back_close = 1 if count_back_close > 0 else 0.1

            # add reward if kick the ball
            self.touch_ball_reward = 1 if count_touch_ball > 0 else -0.15
            # Penalize for back touching the ball
            if count_back_touch > 0:
                self.touch_ball_reward = -0.25

            # Penalize if the ball is not in view
            self.close_ball_reward = 0.25 if count_close_to_ball > 0 else -0.05
            # Penalize if the ball is behind the agent
            if count_back_close > 0:
                self.close_ball_reward = -0.1

            return self.current_state
        except Exception:
            # Best effort: keep the last known state, but no longer swallow
            # KeyboardInterrupt / SystemExit as the bare `except:` did.
            self.touch_ball_reward = 0
            self.close_ball_reward = 0
            return self.current_state

    def _replay(self, n_batches):
        """Learn from ``n_batches`` sampled batches if the memory is large enough."""
        if len(self.memory) > self.batch_size:
            for _ in range(n_batches):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, self.t_step)

    def upd_after_goal(self, n_upds):
        """Forward ``n_upds`` to ``memory.upd_goal`` and learn from one batch."""
        self.memory.upd_goal(n_upds)
        self._replay(1)

    def we_goll(self):
        """Call ``memory.we_goll()`` and learn from two batches."""
        self.memory.we_goll()
        self._replay(2)

    def us_goll(self):
        """Call ``memory.us_goll()`` and learn from two batches."""
        self.memory.us_goll()
        self._replay(2)
class SAC(object):
    """Soft Actor-Critic agent with a twin-output critic and a target critic.

    Configuration is read from the ``variant`` mapping; see ``__init__``.
    """

    def __init__(self, num_inputs, action_space, variant):
        """Create critic, target critic, policy and their optimizers.

        Params
        ======
            num_inputs: observation size passed to the networks
            action_space: object exposing ``shape``; ``shape[0]`` is used as
                the action dimension
            variant (dict): requires ``gamma``, ``tau``, ``alpha``,
                ``policy_type``, ``target_update_interval``,
                ``automatic_entropy_tuning`` and ``cuda``; optional ``lr``
                (default 1e-3) and ``hidden_size`` (default [128, 128])
        """
        self.gamma = variant['gamma']
        self.tau = variant['tau']
        self.alpha = variant['alpha']
        self.policy_type = variant['policy_type']
        self.target_update_interval = variant['target_update_interval']
        self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
        self.lr = variant.get("lr", 1e-3)

        self.device = torch.device("cuda" if variant['cuda'] else "cpu")
        self.hidden_size = variant.get('hidden_size', [128, 128])

        action_dim = action_space.shape[0]

        self.critic = QNetwork(num_inputs, action_dim,
                               self.hidden_size).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_dim,
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == 'Gaussian':
            if self.automatic_entropy_tuning:
                # Target entropy = -prod(action_space.shape).
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(num_inputs, action_dim,
                                         self.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
        else:
            # Any other policy type: no entropy term at all.
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_dim,
                                              self.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

    def select_action(self, state, evaluate=False):
        """Return an action for a single state as a numpy array.

        With ``evaluate`` exactly ``False`` the first value returned by
        ``policy.sample`` is used, otherwise the third one (presumably the
        deterministic / mean action — confirm against the policy class).
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        """Run one SAC update step on a batch sampled from ``memory``.

        Params
        ======
            memory: replay buffer whose ``sample(batch_size=...)`` returns
                (state, action, reward, next_state, mask) batches
            batch_size (int): number of transitions to sample
            updates (int): running update counter; the target critic is
                soft-updated when it is a multiple of
                ``target_update_interval``

        Returns
        =======
            tuple of floats: (qf1_loss, qf2_loss, policy_loss, alpha_loss)
        """
        # sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device)

        # Bootstrapped target: r + mask * gamma * (min(Q1', Q2') - alpha * log pi').
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = E(st,at)~D[0.5(Q(st,at) - r(st,at) - gamma * E[V(st+1)])^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        # Single critic step on the summed loss.  Stepping the optimizer
        # between the two backward passes (as before) modifies critic weights
        # in place while later backward passes still need them, which recent
        # PyTorch versions reject.
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Policy loss is built only after the critic step so its graph sees
        # the current critic weights.
        # Jpi = E[alpha * log pi(f(eps;st)|st) - Q(st, f(eps;st))]
        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.tensor(0.0).to(self.device)

        # Was `update % ...`, an undefined name; the parameter is `updates`.
        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return (qf1_loss.item(), qf2_loss.item(), policy_loss.item(),
                alpha_loss.item())

    def save_model(self,
                   env_nam,
                   suffix=".pkl",
                   actor_path=None,
                   critic_path=None):
        """Save policy and critic state dicts.

        Default paths are ``models/sac_actor_<env_nam>_<suffix>`` and
        ``models/sac_critic_<env_nam>_<suffix>``; the ``models/`` directory
        is created if missing.
        """
        # The parameter keeps its historical spelling (`env_nam`) for
        # backward compatibility; the body used to read an undefined
        # `env_name`.
        env_name = env_nam
        os.makedirs('models/', exist_ok=True)

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print("Saving models to {} and {}".format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    def load_model(self, actor_path, critic_path):
        """Load policy and/or critic weights; a ``None`` path is skipped."""
        print('loading models from {} and {}'.format(actor_path, critic_path))
        # map_location lets checkpoints saved on GPU load on a CPU-only host.
        if actor_path is not None:
            self.policy.load_state_dict(
                torch.load(actor_path, map_location=self.device))
        if critic_path is not None:
            self.critic.load_state_dict(
                torch.load(critic_path, map_location=self.device))
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed=SEED,
                 batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE,
                 start_since=START_SINCE,
                 gamma=GAMMA,
                 target_update_every=T_UPDATE,
                 tau=TAU,
                 lr=LR,
                 weight_decay=WEIGHT_DECAY,
                 update_every=UPDATE_EVERY,
                 priority_eps=P_EPS,
                 a=A,
                 initial_beta=INIT_BETA,
                 n_multisteps=N_STEPS,
                 v_min=V_MIN,
                 v_max=V_MAX,
                 clip=CLIP,
                 n_atoms=N_ATOMS,
                 initial_sigma=INIT_SIGMA,
                 linear_type=LINEAR,
                 factorized=FACTORIZED,
                 **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            n_multisteps (int): number of steps to consider for each experience
            v_min (float): minimum reward support value
            v_max (float): maximum reward support value
            clip (float): gradient norm clipping (`None` to disable)
            n_atoms (int): number of atoms in the discrete support distribution
            initial_sigma (float): initial noise parameter weights
            linear_type (str): one of ('linear', 'noisy'); type of linear layer to use
            factorized (bool): whether to use factorized gaussian noise in noisy layers
        """
        if kwds:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        assert isinstance(n_multisteps, int) and n_multisteps > 0
        assert isinstance(v_min, (int, float)) and isinstance(v_max, (int, float)) and v_min < v_max
        if clip:
            assert isinstance(clip, (int, float)) and clip >= 0
        assert isinstance(n_atoms, int) and n_atoms > 0
        assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0
        assert isinstance(linear_type, str) and linear_type.strip().lower() in ('linear', 'noisy')
        assert isinstance(factorized, bool)

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.start_since = start_since
        self.gamma = gamma
        self.target_update_every = target_update_every
        self.tau = tau
        self.lr = lr
        self.weight_decay = weight_decay
        self.update_every = update_every
        self.priority_eps = priority_eps
        self.a = a
        self.beta = initial_beta
        self.n_multisteps = n_multisteps
        self.v_min = v_min
        self.v_max = v_max
        self.clip = clip
        self.n_atoms = n_atoms
        self.initial_sigma = initial_sigma
        self.linear_type = linear_type.strip().lower()
        self.factorized = factorized

        # Distribution: n_atoms evenly spaced support values in [v_min, v_max].
        # NOTE(review): n_atoms == 1 passes the assertion above but divides by
        # zero here.
        self.supports = torch.linspace(v_min, v_max, n_atoms, device=device)
        self.delta_z = (v_max - v_min) / (n_atoms - 1)

        # Q-Network (target starts as an exact copy of the local network)
        self.qnetwork_local = QNetwork(state_size, action_size, n_atoms, linear_type, initial_sigma, factorized).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, n_atoms, linear_type, initial_sigma, factorized).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, n_multisteps, gamma, a)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition; every ``update_every`` calls learn and tick the target schedule."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, target_discount, is_weights, indices = self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, target_discount)
                self.memory.update_priorities(indices, new_priorities)

            # update the target network every TARGET_UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % self.target_update_every
            if self.t_step == 0:
                self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        with torch.no_grad():
            # Expected value per action = sum over atoms of support * probability.
            z_probs = F.softmax(self.qnetwork_local(state), dim=-1)
            action_values = self.supports.mul(z_probs).sum(dim=-1, keepdim=False)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor for the target max-Q value

        Returns
        =======
            new_priorities (List[float]): list of new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            rows = tuple(range(next_states.size(0)))
            # Greedy next action chosen by the local network (expected value
            # over atoms), evaluated on the target network.
            a_argmax = F.softmax(self.qnetwork_local(next_states), dim=2)\
                .mul(self.supports)\
                .sum(dim=2, keepdim=False)\
                .argmax(dim=1, keepdim=False)
            p = F.softmax(self.qnetwork_target(next_states)[rows, a_argmax], dim=1)

            # Shifted/shrunk support, clamped to [v_min, v_max].
            tz_projected = torch.clamp(rewards + (1 - dones) * gamma * self.supports, min=self.v_min, max=self.v_max)

            # Fractional atom index of every projected support value.  Clamped
            # so float rounding cannot push ceil(b) past the last atom and
            # spill the flattened index into a neighbouring row.
            b = ((tz_projected - self.v_min) / self.delta_z).clamp(min=0, max=self.n_atoms - 1)
            u = b.ceil()
            l = b.floor()
            u_updates = b - l + u.eq(l).type(u.dtype)  # fixes the problem when having b == u == l
            l_updates = u - b

            # Scatter the probability mass onto the neighbouring atoms using
            # one flat index_add_; the per-row offset maps (row, atom) pairs
            # to positions in the flattened target.
            indices_flat = torch.cat((u.long(), l.long()), dim=1)
            indices_flat = indices_flat.add(
                torch.arange(start=0,
                             end=b.size(0) * b.size(1),
                             step=b.size(1),
                             dtype=indices_flat.dtype,
                             layout=indices_flat.layout,
                             device=indices_flat.device).unsqueeze(1)
            ).view(-1)
            updates_flat = torch.cat((u_updates.mul(p), l_updates.mul(p)), dim=1).view(-1)
            target_distributions = torch.zeros_like(p)
            target_distributions.view(-1).index_add_(0, indices_flat, updates_flat)

        # Logits of the actions actually taken.
        pred_distributions = self.qnetwork_local(states)
        pred_distributions = pred_distributions.gather(dim=1, index=actions.unsqueeze(1).expand(-1, -1, pred_distributions.size(2))).squeeze(1)

        # Per-sample KL(target || predicted); `reduction='none'` replaces the
        # deprecated `reduce=False`.
        kl_divergence = F.kl_div(F.log_softmax(pred_distributions, dim=-1), target_distributions, reduction='none').sum(dim=-1, keepdim=False)
        new_priorities = kl_divergence.detach().add(self.priority_eps).cpu().numpy()
        loss = kl_divergence.mul(is_weights.view(-1)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    """Interacts with and learns from the environment.

    Attributes:
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[128, 64],
                 filename=None):
        """Initialize an Agent object.

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers: accepted for interface compatibility; it is not
                read or modified anywhere in this class
            filename: path of .pth file with trained weights
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the real value.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        if filename:
            # map_location lets weights saved on GPU load on a CPU-only host.
            weights = torch.load(filename, map_location=device)
            self.qnetwork_local.load_state_dict(weights)
            self.qnetwork_target.load_state_dict(weights)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn every ``UPDATE_EVERY`` calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return
        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Args:
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # TD target from the target network; (1 - dones) zeroes the bootstrap
        # term on terminal transitions.
        Q_t_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_t = rewards + (gamma * Q_t_next * (1 - dones))
        # Current estimate for the actions actually taken.
        Q_e = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_e, Q_t)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, layer_spec, seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            layer_spec: hidden-layer specification forwarded to QNetwork
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.layer_spec = layer_spec
        # random.seed() returns None, so seed first and keep the real value.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       layer_spec).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        layer_spec).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # (Prioritized) experience replay setup
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE
        self.min_prio = MIN_PRIO
        self.alpha = ALPHA
        self.beta = INIT_BETA
        self.beta_increment = BETA_INC
        if USE_PER:
            self.memory = PrioritizedReplayBuffer(size=self.buffer_size,
                                                  alpha=self.alpha)
        else:
            # NOTE(review): the buffer seed is hard-coded to 42 rather than
            # taken from `seed`; kept as-is to preserve sampling behaviour.
            self.memory = DequeReplayBuffer(action_size=self.action_size,
                                            buffer_size=self.buffer_size,
                                            batch_size=self.batch_size,
                                            seed=42)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # print info about Agent
        print('Units in the hidden layers are {}.'.format(str(layer_spec)))
        print('Using Double-DQN is \"{}\".'.format(str(USE_DDQN)))
        print('Using prioritized experience replay is \"{}\".'.format(
            str(USE_PER)))

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn every ``UPDATE_EVERY`` calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get subset and learn
            if len(self.memory) > BATCH_SIZE:
                # Step beta towards 1, capped at 1.
                self.beta = min(1., self.beta + self.beta_increment)
                experiences = self.memory.sample(self.batch_size,
                                                 beta=self.beta)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of
                (s, a, r, s', done, weights, idxes)
            gamma (float): discount factor
        """
        # Get TD step from experiences
        states, actions, rewards, next_states, dones, weights, idxes = experiences

        if USE_DDQN:
            # DOUBLE DQN: select action with _local, evaluate it with _target
            Q_action_select = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, Q_action_select)
        else:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute (PER-weighted) MSE loss
        if USE_PER:
            TD_error = Q_targets - Q_expected
            weighted_TD_error = weights * (TD_error**2)
            loss = torch.mean(weighted_TD_error)
            # Update priorities in Replay Buffer: |TD error| + min_prio
            prio_updates = np.abs(
                TD_error.detach().squeeze(1).cpu().numpy()) + self.min_prio
            self.memory.update_priorities(idxes, prio_updates.tolist())
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # soft-update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def save_checkpoint(self):
        """Write network dimensions, layer spec and local weights to 'checkpoint.pth'."""
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'layer_spec': self.layer_spec,
            'state_dict': self.qnetwork_local.state_dict()
        }
        torch.save(checkpoint, 'checkpoint.pth')
        print('Checkpoint succesfully saved.')

    def load_checkpoint(self, filepath='checkpoint.pth'):
        """Rebuild the networks from a checkpoint written by ``save_checkpoint``.

        The local network is recreated with the stored dimensions, the target
        network is rebuilt with the same weights, and the optimizer is
        re-bound to the new local parameters.  Without that re-binding the
        optimizer would keep updating the discarded network.
        """
        # map_location lets checkpoints saved on GPU load on a CPU-only host.
        checkpoint = torch.load(filepath, map_location=device)
        self.state_size = checkpoint['input_size']
        self.action_size = checkpoint['output_size']
        self.layer_spec = checkpoint['layer_spec']

        self.qnetwork_local = QNetwork(self.state_size, self.action_size,
                                       self.layer_spec).to(device)
        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])

        self.qnetwork_target = QNetwork(self.state_size, self.action_size,
                                        self.layer_spec).to(device)
        self.qnetwork_target.load_state_dict(checkpoint['state_dict'])

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        print('Checkpoint successfully loaded.')

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 double_dqn=False,
                 dueling_network=False,
                 prioritized_replay=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): use Double DQN method
            dueling_network (bool): use Dueling Network
            prioritized_replay (bool): use Prioritized Replay Buffer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.dueling_network = dueling_network
        self.double_dqn = double_dqn
        self.prioritized_replay = prioritized_replay

        random.seed(seed)

        # Q-Network
        self.hidden_layers = [128, 32]

        if self.dueling_network:
            self.hidden_state_value_layers = [64, 32]

            self.qnetwork_local = DuelingQNetwork(
                state_size, action_size, seed, self.hidden_layers,
                self.hidden_state_value_layers).to(device)
            self.qnetwork_target = DuelingQNetwork(
                state_size, action_size, seed, self.hidden_layers,
                self.hidden_state_value_layers).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           self.hidden_layers).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            self.hidden_layers).to(device)
        # The target network is only ever evaluated.
        self.qnetwork_target.eval()

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, LR_DECAY)

        # Replay memory
        if prioritized_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed, device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load(self, filepath):
        """Load local-network weights from file and switch it to eval mode."""
        # map_location lets weights saved on GPU load on a CPU-only host.
        state_dict = torch.load(filepath, map_location=device)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_local.eval()

    def save(self, filepath):
        """Save local-network weights to file."""
        torch.save(self.qnetwork_local.state_dict(), filepath)

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn every ``UPDATE_EVERY`` calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection; the network is only evaluated on
        # the greedy branch.
        if random.random() >= eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w = experiences

        with torch.no_grad():
            if self.double_dqn:
                # Double DQN: greedy action from the local model ...
                greedy_actions = self.qnetwork_local(next_states).max(
                    dim=1, keepdim=True)[1]
                # ... evaluated by the target model.
                q_targets_next = self.qnetwork_target(next_states).gather(
                    1, greedy_actions)
            else:
                # Fixed Q-target: max predicted Q value from the target model
                q_targets_next = self.qnetwork_target(next_states).max(
                    dim=1, keepdim=True)[0]

            # Compute Q targets for current states
            q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(
            1, actions)  # shape: [batch_size, 1]

        # Compute loss
        if self.prioritized_replay:
            # Out-of-place arithmetic on purpose.  The previous in-place chain
            # aliased `td_error` with `q_targets`, so taking the square root
            # for the priorities overwrote the squared error used for the
            # loss, and the tensor handed to the buffer was later scaled by
            # `w` in place.
            td_error = (q_targets - q_expected).squeeze()
            # Priorities are |TD error|, detached from the graph.
            self.memory.update_priorities(td_error.detach().abs())
            # Importance-weighted mean squared TD error.
            loss = td_error.pow(2).mul(w).mean()
        else:
            loss = F.mse_loss(q_expected, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn from a sampled batch every UPDATE_EVERY calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps, once enough samples are stored.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Draw first: an exploratory step does not need the forward pass at all.
        if random.random() <= eps:
            return random.choice(np.arange(self.action_size))

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)

    def dqn(self, env, brain_name, n_episodes=2000, max_t=1000, eps_start=1.0,
            eps_end=0.01, eps_decay=0.995, target_score=13.0,
            checkpoint_path='checkpoint.pth', train_mode=False):
        """Deep Q-Learning.

        Params
        ======
            env: environment exposing ``reset(train_mode=...)`` and ``step(action)``,
                both indexed by ``brain_name``
            brain_name: key selecting the brain info in the environment's replies
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
            target_score (float): 100-episode average score at which training stops
            checkpoint_path (str): where the local network weights are saved once solved
            train_mode (bool): forwarded to ``env.reset``; the default keeps the
                previous hard-coded value

        Returns
        =======
            scores (list): score of every episode played
        """
        scores = []                        # list containing scores from each episode
        scores_window = deque(maxlen=100)  # last 100 scores
        eps = eps_start                    # initialize epsilon
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=train_mode)[brain_name]  # reset the environment
            state = env_info.vector_observations[0]                  # get the current state
            score = 0                                                # reset the score
            for t in range(max_t):
                action = self.act(state, eps).astype(int)       # epsilon-greedy action
                env_info = env.step(action)[brain_name]         # send the action to the environment
                next_state = env_info.vector_observations[0]    # get the next state
                reward = env_info.rewards[0]                    # get the reward
                done = env_info.local_done[0]                   # see if episode has finished
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)          # save most recent score
            scores.append(score)                 # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon

            average = np.mean(scores_window)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average))
            if average >= target_score:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                      .format(i_episode - 100, average))
                torch.save(self.qnetwork_local.state_dict(), checkpoint_path)
                break
        return scores

    def test(self, env, brain_name, checkpoint_path='checkpoint.pth'):
        """Load saved weights and play one greedy episode.

        Params
        ======
            env: environment exposing ``reset(train_mode=...)`` and ``step(action)``
            brain_name: key selecting the brain info in the environment's replies
            checkpoint_path (str): file holding the local network's state dict

        Returns
        =======
            score: sum of the rewards collected during the episode
        """
        # map_location lets a checkpoint written on another device load here.
        self.qnetwork_local.load_state_dict(
            torch.load(checkpoint_path, map_location=device))

        env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]             # get the current state
        score = 0                                           # initialize the score
        while True:
            action = self.act(state).astype(int)            # select an action (eps=0)
            env_info = env.step(action)[brain_name]         # send the action to the environment
            next_state = env_info.vector_observations[0]    # get the next state
            reward = env_info.rewards[0]                    # get the reward
            done = env_info.local_done[0]                   # see if episode has finished
            score += reward                                 # update the score
            state = next_state                              # roll over the state
            if done:                                        # exit loop if episode finished
                break
        return score
class Agent:
    """Base DQN agent; subclasses supply the training target via ``_q_target``."""

    def __init__(self, state_size, action_size, seed, fc1_size=64, fc2_size=64,
                 checkpoint_filename=''):
        """
        Initializes an agent object

        TODO make the structure of the qfunction approximator more flexible

        :param state_size: dimension of each state
        :param action_size: dimension of each action
        :param seed: random seed
        :param fc1_size: number of units of the first fully connected layer of the
            q function approximator
        :param fc2_size: number of units of the second fully connected layer of the
            q function approximator
        :param checkpoint_filename: name of the checkpoint file which contains the
            state-dict weights of the q function approximator; empty to skip loading
        :return agent: initialized agent
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the value itself.
        random.seed(seed)
        self.seed = seed
        self.fc1_size = fc1_size
        self.fc2_size = fc2_size

        self.BUFFER_SIZE = int(1e5)  # replay buffer size
        self.BATCH_SIZE = 64         # minibatch size
        self.GAMMA = 0.99            # discount factor
        self.TAU = 1e-3              # for soft update of target parameters
        self.LR = 5e-4               # learning rate
        self.UPDATE_EVERY = 4        # how often to update the network

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                       fc1_size, fc2_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        fc1_size, fc2_size).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.LR)
        self.criterion = torch.nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        if checkpoint_filename:
            # map_location lets a checkpoint written on another device load here.
            weights = torch.load(checkpoint_filename, map_location=device)
            self.qnetwork_local.load_state_dict(weights)
            # Start the target net from the same weights; with a soft update of
            # TAU it would otherwise lag at its fresh initialisation for a long time.
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn from a sampled batch every UPDATE_EVERY calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps, once enough samples are stored.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, self.GAMMA)

    def act(self, state, eps=0.):
        """
        Returns actions for given state as per current policy

        :param state: (array_like) current state
        :param eps: (float) epsilon, for epsilon-greedy action selection
        """
        # Draw first: an exploratory step does not need the forward pass at all.
        if random.random() <= eps:
            return random.choice(np.arange(self.action_size))

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """
        Update value parameters using given batch of experience tuples.

        :param experiences: (Tuple[torch.Variable]) tuple of (s, a, r, s', done) tuples
        :param gamma: (float) discount factor
        """
        y = self._q_target(experiences, gamma)
        y_pred = self._q_estimated(experiences, gamma)

        loss = self.criterion(y_pred, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: (PyTorch model) weights will be copied from
        :param target_model: (PyTorch model) weights will be copied to
        :param tau: interpolation parameter
        :return:
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)

    def _q_target(self, experiences, gamma):
        """Method that calculates the target q value used for training"""
        raise NotImplementedError

    def _q_estimated(self, experiences, gamma):
        """Method that calculates the estimated q value used for training"""
        states, actions, rewards, next_states, dones = experiences
        # feedforward the local network and keep the value of the action taken
        return self.qnetwork_local(states).gather(1, actions)
def main(args):
    """Train a DQN on ``args.env`` and then record four monitored episodes.

    Reads from ``args``: ``env``, ``base_path``, ``lr``, ``memory``,
    ``max_steps``, ``gamma``, ``train_every``, ``update_every`` and
    ``batch_size``. Monitor output is written to ``args.base_path + args.env``.
    """
    env = gym.make(args.env)
    if 'MiniGrid' in args.env:
        env = ImgObsWrapper(env)
    path = args.base_path + args.env
    os.makedirs(path, exist_ok=True)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.n
    q = QNetwork(obs_shape, act_shape)
    q_target = QNetwork(obs_shape, act_shape)
    opt = optim.Adam(lr=args.lr, params=q.parameters())
    memory = Memory(capacity=args.memory)
    scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1),
                               final_p=0.01)
    avg_rw = deque(maxlen=40)
    avg_len = deque(maxlen=40)

    def get_action(s, t):
        """Epsilon-greedy action for observation ``s``; advances the schedule to ``t``."""
        s = torch.Tensor(s[None, :])
        # Action selection never needs gradients.
        with torch.no_grad():
            _q = q(s)
        if np.random.sample() > scheduler.value:
            best_action = _q.argmax(dim=-1).item()
        else:
            best_action = np.random.randint(0, act_shape)
        scheduler.update(t)
        return best_action

    def train(batch):
        """Run one gradient step on ``q`` from a list of transitions."""
        batch = Transition(*zip(*batch))
        s = torch.Tensor(batch.state)
        a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape))
        r = torch.Tensor(batch.reward)
        d = torch.Tensor(batch.done)
        s1 = torch.Tensor(batch.next_state)

        value = (q(s) * a).sum(dim=-1)
        # The bootstrap target is a constant for this step: building it without
        # autograd stops gradients from piling up in q_target, whose .grad is
        # never zeroed by the optimizer.
        with torch.no_grad():
            next_value = r + args.gamma * (1. - d) * torch.max(q_target(s1), dim=-1)[0]
        loss = (.5 * (next_value - value) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

    def window_mean(window):
        """Mean of a score window; nan (without a NumPy warning) while it is empty."""
        return np.mean(window) if window else float('nan')

    state = env.reset()
    q_target.load_state_dict(q.state_dict())
    ep_rw = 0
    ep_len = 0
    ep = 0
    for t in range(args.max_steps):
        action = get_action(state, t)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, next_state, reward, done)
        ep_rw += reward
        ep_len += 1
        state = next_state.copy()
        if done:
            ep += 1
            avg_rw.append(ep_rw)
            avg_len.append(ep_len)
            ep_rw = 0
            ep_len = 0
            state = env.reset()

        if t % args.train_every == 0 and len(memory) > args.batch_size:
            batch = memory.sample(batch_size=args.batch_size)
            train(batch)

        # NOTE(review): the flattened source does not show whether this block was
        # nested under the training branch above; it is kept at loop level here.
        if t % args.update_every == 0:
            q_target.load_state_dict(q.state_dict())
            print(f't:{t}\tep:{ep}\tavg_rw:{window_mean(avg_rw)}\tavg_len:{window_mean(avg_len)}\teps:{scheduler.value}')

    env = Monitor(env, directory=path)
    for ep in range(4):
        s = env.reset()
        while True:
            # NOTE(review): get_action calls scheduler.update(0) here; confirm this
            # does not reset epsilon to its initial value during evaluation.
            a = get_action(s, t=0)
            s1, r, d, _ = env.step(a)
            s = s1.copy()
            if d:
                break
    # Close the monitored env so its recordings are finalised.
    env.close()
# Draw every obstacle as a large red square marker.
for obstacle_idx in range(len(env.OBSTACLE_X)):
    plt.plot(
        env.OBSTACLE_X[obstacle_idx],
        env.OBSTACLE_Y[obstacle_idx],
        marker="s",
        color="red",
        markersize=22,
    )

index = 0
eps = 0
# Replay a saved policy twice; `index` is not changed by this loop.
for i in range(3, 5):
    model = QNetwork(state_size=(len(OBSTACLE_X) + 1) * 2, action_size=81, seed=0)
    saved_weights = torch.load('dqn_models{}checkpoint{}.pth'.format('/', index))
    model.load_state_dict(saved_weights)

    state = env.reset()
    env.render()
    time.sleep(5)
    for t in range(max_t):
        state = torch.from_numpy(state).float().unsqueeze(0)
        action_values = model(state)
        # Epsilon-greedy action selection
        act_greedily = random.random() > eps
        if act_greedily:
            action = np.argmax(action_values.cpu().data.numpy())
        else:
            action = random.choice(np.arange(model.action_size))
        # Only the next observation is used; reward and done flag are ignored.
        step_result = env.step(action)
        next_state, _, done, _ = step_result
        state = next_state
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed=SEED,
                 batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE,
                 start_since=START_SINCE,
                 gamma=GAMMA,
                 target_update_every=T_UPDATE,
                 tau=TAU,
                 lr=LR,
                 weight_decay=WEIGHT_DECAY,
                 update_every=UPDATE_EVERY,
                 priority_eps=P_EPS,
                 a=A,
                 initial_beta=INIT_BETA,
                 n_multisteps=N_STEPS,
                 clip=CLIP,
                 initial_sigma=INIT_SIGMA,
                 linear_type=LINEAR,
                 **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            n_multisteps (int): number of steps to consider for each experience
            clip (float): gradient norm clipping (`None` to disable)
            initial_sigma (float): initial noise parameter weights
            linear_type (str): one of ('linear', 'noisy'); type of linear layer to use
        """
        if kwds:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        assert isinstance(n_multisteps, int) and n_multisteps > 0
        if clip:
            assert isinstance(clip, (int, float)) and clip >= 0
        assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0
        assert isinstance(linear_type, str) and linear_type.strip().lower() in ('linear', 'noisy')

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the value itself.
        random.seed(seed)
        self.seed = seed

        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.start_since = start_since
        self.gamma = gamma
        self.target_update_every = target_update_every
        self.tau = tau
        self.lr = lr
        self.weight_decay = weight_decay
        self.update_every = update_every
        self.priority_eps = priority_eps
        self.a = a
        self.beta = initial_beta
        self.n_multisteps = n_multisteps
        self.clip = clip
        self.initial_sigma = initial_sigma
        self.linear_type = linear_type.strip().lower()

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, linear_type,
                                       initial_sigma, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, linear_type,
                                        initial_sigma, seed).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr,
                                    weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   n_multisteps, gamma, a, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps and
        # TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition, learn every ``update_every`` calls, and soft-update
        the target network every ``target_update_every`` calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0 and len(self.memory) >= self.start_since:
            # Enough samples are available: draw a batch and learn from it.
            experiences, target_discount, is_weights, indices = self.memory.sample(self.beta)
            new_priorities = self.learn(experiences, is_weights, target_discount)
            self.memory.update_priorities(indices, new_priorities)

        # update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor for the target max-Q value

        Returns
        =======
            new_priorities (List[float]): list of new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # Double DQN: the local net picks the next action, the target net scores it.
            next_actions = self.qnetwork_local(next_states).argmax(dim=1, keepdim=True)
            next_values = self.qnetwork_target(next_states).gather(dim=1, index=next_actions)
            target = rewards + gamma * (1 - dones) * next_values

        pred = self.qnetwork_local(states)
        diff = target.sub(pred.gather(dim=1, index=actions))
        # Use the value given to the constructor, not the module-level default.
        new_priorities = (diff.detach().abs().add(self.priority_eps)
                          .cpu().numpy().reshape((-1,)))
        loss = diff.pow(2).mul(is_weights).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            # Clip to the configured norm, not the module-level default.
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)
class Agent:
    """DQN agent (optionally double DQN) sharing one network across several handles."""

    def __init__(self, state_size, action_size, num_agents, double_dqn=False):
        """Build the local/target networks, optimizer, LR schedule and replay memory.

        :param state_size: size of the observation passed to QNetwork
        :param action_size: number of discrete actions
        :param num_agents: number of handles tracked in ``self.finished``
        :param double_dqn: select next actions with the local network when True
        """
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer, step_size=4000, gamma=0.98, last_epoch=-1)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0
        # Create the per-handle flags up front so step() works even when
        # reset() has not been called yet.
        self.finished = [False] * num_agents

    def reset(self):
        """Mark every handle as not finished."""
        self.finished = [False] * self.num_agents

    # Decide on an action to take in the environment
    def act(self, state, eps=0.):
        """Return an epsilon-greedy action index for ``state`` (a NumPy array)."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # The network stays in eval mode until learn() switches it back.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return torch.argmax(action_values).item()
        return torch.randint(self.action_size, ()).item()

    # Record the results of the agent's action and update the model
    def step(self, handle, state, action, reward, next_state, agent_done):
        """Store the transition of an unfinished handle and learn every UPDATE_EVERY calls."""
        if not self.finished[handle]:
            # Save experience in replay memory
            self.memory.push(state, action, reward, next_state, agent_done)
            self.finished[handle] = agent_done

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 1:  # 320
            self.learn(*self.memory.sample(BATCH_SIZE, device))

    def learn(self, states, actions, rewards, next_states, dones):
        """Run one gradient step on the local network, then soft-update the target."""
        self.qnetwork_local.train()

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # The TD target is a constant for this step. Building it without autograd
        # keeps both branches consistent and stops gradients from accumulating in
        # the target network, which no optimizer ever zeroes.
        with torch.no_grad():
            if self.double_dqn:
                Q_best_action = self.qnetwork_local(next_states).argmax(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(
                    1, Q_best_action.unsqueeze(-1))
            else:
                Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(-1)

            # Compute Q targets for current states
            Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # Update the target network parameters to
        # `tau * local.parameters() + (1 - tau) * target.parameters()`
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU * local_param.data
                                    + (1.0 - TAU) * target_param.data)

    # Checkpointing methods
    def save(self, path, *data):
        """Write both networks, the optimizer state and pickled ``data`` under ``path``.

        ``path`` must support the ``/`` operator (e.g. ``pathlib.Path``).
        """
        torch.save(self.qnetwork_local.state_dict(), path / 'model_checkpoint.local')
        torch.save(self.qnetwork_target.state_dict(), path / 'model_checkpoint.target')
        torch.save(self.optimizer.state_dict(), path / 'model_checkpoint.optimizer')
        with open(path / 'model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        """Restore a checkpoint written by ``save``.

        Returns the pickled metadata tuple on success, or ``defaults`` when the
        checkpoint is missing or cannot be read. Interrupts such as
        ``KeyboardInterrupt`` are not swallowed.
        """
        try:
            print("Loading model from checkpoint...")
            # Load onto whatever device the model currently lives on.
            target_device = next(self.qnetwork_local.parameters()).device
            self.qnetwork_local.load_state_dict(
                torch.load(path / 'model_checkpoint.local', map_location=target_device))
            self.qnetwork_target.load_state_dict(
                torch.load(path / 'model_checkpoint.target', map_location=target_device))
            self.optimizer.load_state_dict(
                torch.load(path / 'model_checkpoint.optimizer', map_location=target_device))
            with open(path / 'model_checkpoint.meta', 'rb') as file:
                # NOTE: pickle executes code on load; only open checkpoints you wrote.
                return pickle.load(file)
        except FileNotFoundError:
            print("No checkpoint file was found")
            return defaults
        except Exception as err:
            # Still best-effort, but report what actually went wrong.
            print("Could not load checkpoint: {!r}".format(err))
            return defaults
class SAC(object):
    """Soft Actor-Critic style agent: owns a twin-output critic, a policy, and
    either a value network (policy type "Gaussian") or a target critic (any
    other policy type), and updates them from sampled batches."""

    def __init__(self, num_inputs, action_space, args):
        """Build networks and optimizers.

        Reads from ``args``: gamma, tau, policy, target_update_interval,
        automatic_entropy_tuning, cuda, hidden_size, lr and (Gaussian only) alpha.
        ``action_space`` must expose ``.shape``.
        """
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                # alpha is learned through its log, optimised by alpha_optim.
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            # NOTE(review): self.alpha is not assigned on this branch, yet
            # update_parameters reads it - confirm it is set elsewhere.
            self.policy = DeterministicPolicy(self.num_inputs, self.action_space, args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        """Return one action (NumPy array) for ``state``.

        With ``eval`` false the first output of ``policy.sample`` is used and the
        policy is put in train mode; otherwise the third output is used and the
        policy is put in eval mode.
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if eval == False:
            self.policy.train()
            action, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()
        # Drop the batch dimension added by unsqueeze(0) above.
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch, updates):
        """Run one update of critic, value (Gaussian only), policy and alpha.

        ``updates`` is the running update counter used to decide when the target
        network is soft-updated. Returns the scalars
        ``(value_loss, qf1_loss, qf2_loss, policy_loss, alpha_loss, alpha_logs)``.
        """
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        # Rewards and masks get a trailing dimension of size 1.
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )
        pi, log_pi, _ = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                alpha_loss = -(self.log_alpha *
                               (log_pi + self.target_entropy).detach()).mean()

                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()

                self.alpha = self.log_alpha.exp()
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.).to(self.device)
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs

            # separate function approximator for the soft value can stabilize training.
            vf = self.value(
                state_batch
            )
            with torch.no_grad():
                vf_next_target = self.value_target(next_state_batch)
                next_q_value = reward_batch + mask_batch * self.gamma * (
                    vf_next_target)
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            # NOTE(review): the return statement calls alpha_logs.item(); confirm
            # self.alpha is a tensor on this path.
            alpha_logs = self.alpha  # For TensorboardX logs
            with torch.no_grad():
                # NOTE(review): five values are unpacked here but three everywhere
                # else policy.sample is called - confirm DeterministicPolicy.sample.
                next_state_action, _, _, _, _, = self.policy.sample(
                    next_state_batch)
                # Use a target critic network for deterministic policy and eradicate the value value network completely.
                qf1_next_target, qf2_next_target = self.critic_target(
                    next_state_batch, next_state_action)
                min_qf_next_target = torch.min(qf1_next_target,
                                               qf2_next_target)
                next_q_value = reward_batch + mask_batch * self.gamma * (
                    min_qf_next_target)

        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )
        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )
        # Critic evaluated at the freshly sampled policy actions.
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        if self.policy_type == "Gaussian":
            vf_target = min_qf_pi - (self.alpha * log_pi)
            # JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            value_loss = F.mse_loss(
                vf, vf_target.detach()
            )

        # NOTE(review): the flattened source does not show whether policy_loss sat
        # inside the Gaussian branch above; it is placed at method level here
        # because policy_loss.backward() below runs for every policy type.
        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )

        # Regularization Loss
        # mean_loss = 0.001 * mean.pow(2).mean()
        # std_loss = 0.001 * log_std.pow(2).mean()

        # policy_loss += mean_loss + std_loss

        # The two critic losses are applied as two separate optimizer steps.
        # NOTE(review): critic parameters are stepped here before value_loss /
        # policy_loss (which depend on the critic via min_qf_pi) are
        # back-propagated - confirm this ordering works on the PyTorch version in use.
        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            # No value network on this path; report a zero loss instead.
            value_loss = torch.tensor(0.).to(self.device)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        """ We update the target weights to match the current value function weights periodically Update target parameter after every n(args.target_update_interval) updates """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)

        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return value_loss.item(), qf1_loss.item(), qf2_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None, value_path=None):
        """Save value, policy and critic state dicts.

        Paths left as ``None`` default to ``models/sac_<kind>_<env_name>_<suffix>``;
        the ``models/`` directory is created when missing.
        NOTE(review): self.value is only created for the "Gaussian" policy type.
        """
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        """Load each state dict whose path is not ``None``."""
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
class DQN_Agent:
    """DQN agent with an evaluation network, a periodically synced target
    network and a replay buffer; epsilon decays with the step count."""

    def __init__(self, state_size, action_size, seed=42):
        """Create both Q-networks, the RMSprop optimizer and the replay buffer."""
        self.action_size = action_size

        # Q-Network
        eval_net = QNetwork(state_size, action_size, seed).to(device)
        target_net = QNetwork(state_size, action_size, seed).to(device)
        self.q_eval = eval_net
        self.q_target = target_net
        self.optimizer = optim.RMSprop(eval_net.parameters(), lr=LR)

        # Replay Buffer
        self.memory = ReplayBuffer(seed=seed)

        self.step_count = 0
        self.seed = random.seed(seed)

    def act(self, state):
        """Return an epsilon-greedy action; epsilon decays exponentially in step_count."""
        state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.q_eval.eval()
        with torch.no_grad():
            q_values = self.q_eval(state_tensor)
        self.q_eval.train()

        decay = math.exp(-1. * self.step_count / EPS_DECAY)
        epsilon = EPS_END + (EPS_START - EPS_END) * decay

        explore = not (random.random() > epsilon)
        if explore:
            return random.choice(np.arange(self.action_size))
        # greedy
        return np.argmax(q_values.cpu().data.numpy())

    def step(self, state, action, reward, next_state, done):
        """Store a transition and, once BATCH_SIZE samples exist, take one
        gradient step. Returns the loss value, or None when nothing was learned."""
        self.memory.push(state, action, reward, next_state, done)

        loss_value = None
        enough_samples = len(self.memory) >= BATCH_SIZE
        if enough_samples:
            # sample transitions from replay buffer
            batch = self.memory.sample()
            states, actions, rewards, next_states, dones = batch

            # target: r if done, else r + \gamma max_a Q(s', a; \theta')
            target_out = self.q_target(next_states).detach()
            q_next_values = target_out.max(1)[0].unsqueeze(1)
            q_learning_targets = rewards + GAMMA * q_next_values * (1 - dones)

            # Q(s, a; \theta)
            q_values = self.q_eval(states).gather(1, actions)

            # perform gradient descent on the loss
            loss = F.mse_loss(q_values, q_learning_targets)
            loss_value = loss.data.item()
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # update target Q-Network
        # (the flattened source does not show this call's nesting; kept at method level)
        self.update_target()
        self.step_count += 1
        return loss_value

    def update_target(self):
        """Copy q_eval's weights into q_target every UPDATE_TARGET_STEPS steps."""
        if self.step_count % UPDATE_TARGET_STEPS != 0:
            return
        self.q_target.load_state_dict(self.q_eval.state_dict())
class Agent():
    """
    Creates an agent that interacts with a Unity-ML Environment
    using a Deep Q-learning model (in pytorch).
    """

    def __init__(self, n_state, n_actions, n_hidden=32, n_layers=2, seed=333,
                 snapshotfile="snapshot.pth"):
        """ Initialize the agent.

        Args:
            n_state (int):      Number of features that represent the state
            n_actions (int):    Number of actions available to agent
            n_hidden (int):     Number of units in hidden neural net layers
                                (currently not forwarded, see note below)
            n_layers (int):     Number of layers for neural network
                                (currently not forwarded, see note below)
            seed (int):         Set the random seed (for reproducibility)
            snapshotfile (str): Filepath to use for saving weights
        """
        self.n_state = n_state
        self.n_actions = n_actions
        # random.seed() returns None; seed the RNG and keep the value itself.
        random.seed(seed)
        self.seed = seed
        self.snapshotfile = snapshotfile

        # Deep Q-Network
        # NOTE(review): n_hidden and n_layers are accepted but never used;
        # both networks are built with a hard-coded n_hidden=64. Kept as is
        # because honouring the arguments would change the architecture of
        # already-saved snapshots -- confirm the intended behaviour.
        self.qnetwork_local = QNetwork(n_state, n_actions, seed, n_hidden=64).to(device)
        self.qnetwork_target = QNetwork(n_state, n_actions, seed, n_hidden=64).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # Default MSELoss averages over all elements, which is what the
        # deprecated ``reduce=True`` argument selected.
        self.loss_func = torch.nn.MSELoss()

        # Experience Replay Memory
        self.memory = ReplayBuffer(n_actions, EXPERIENCE_MEMORY_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # TODO: have the is_training attribute control eval and train
        #       mode in pytorch network
        self.is_training = True

    def memorize_and_learn_step(self, state, action, reward, next_state, done):
        """ Given S,A,R',S' and if it is finished, it saves the experience
            to memory, and occasionally samples from memorized experiences
            and learns from those memories.
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Once every UPDATE_EVERY steps, randomly sample memories to learn from
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def choose_action(self, state, epsilon=0.0):
        """ Given an environment state, it returns an action using
            epsilon greedy policy.

        Args:
            state (array_like): current state
            epsilon (float)   : probability of choosing a random action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Forward pass in eval mode without gradient tracking, then switch
        # the network back to training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().numpy())
        return random.choice(np.arange(self.n_actions))

    def learn(self, experiences, gamma):
        """ Update the weights of the neural network representing the Q values,
            given a batch of experience tuples.

        Args:
            experiences (tuple of torch tensors): tuple with the following
                         torch tensors (states, actions, rewards, next_states, dones)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Q_TARGET: no gradients are needed through the target network
        next_logits = self.qnetwork_target(next_states).detach()
        q_next = torch.max(next_logits, dim=1, keepdim=True)[0]
        # where dones=1, it will ignore q_next, and just use current reward
        q_target = rewards + ((1 - dones) * (gamma * q_next))

        # Q_CURRENT - based on action taken in experience
        current_logits = self.qnetwork_local(states)
        q_pred = torch.gather(current_logits, 1, actions)

        # LOSS
        loss = self.loss_func(q_pred, q_target)

        # OPTIMIZE WEIGHTS
        self.optimizer.zero_grad()  # zero the parameter gradients
        loss.backward()
        self.optimizer.step()

        # UPDATE TARGET NETWORK
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Performs a soft update on the target Q network weights, by
            shifting them slightly towards the local Q network by a
            factor of `tau`.

            θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)

    def snapshot(self, file=None):
        """ Takes a snapshot file of the local neural network weights.

        Args:
            file: destination; defaults to ``self.snapshotfile`` when None.
        """
        file = self.snapshotfile if file is None else file
        torch.save(self.qnetwork_local.state_dict(), file)

    def load_snapshot(self, file=None):
        """ Loads the neural network weights from a file into both the
            local and the target network.

        Args:
            file: source; defaults to ``self.snapshotfile`` when None.
        """
        file = self.snapshotfile if file is None else file
        # Deserialise once and reuse the state dict for both networks.
        state_dict = torch.load(file)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_target.load_state_dict(state_dict)
class Agent():
    """
    Initialize Agent, including:
        DQN Hyperparameters
        Local and Target State-Action Policy Networks
        Replay Memory Buffer from Replay Buffer Class
    """

    def __init__(self, state_size, action_size, dqn_type='DQN', replay_memory_size=1e5,
                 batch_size=64, gamma=0.99, learning_rate=1e-3, target_tau=2e-3,
                 update_rate=4, seed=0):
        """
        DQN Agent Parameters
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): can be either 'DQN' for vanilla dqn learning (default) or 'DDQN' for double-DQN.
            replay_memory_size (int): size of the replay memory buffer (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128)
            gamma (float): parameter for setting the discounted value of future rewards (typically .95 to .995)
            learning_rate (float): specifies the rate of model learning (typically 1e-4 to 1e-3)
            target_tau (float): interpolation factor used for the soft target update
            update_rate (int): a learning step is attempted every ``update_rate`` calls to ``step``
            seed (int): random seed for initializing training point.
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        # random.seed() returns None; seed the RNG and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed:
        # (a) a network that is updated every (step % update_rate == 0)
        # (b) a target network, with weights moved towards (a) at a slower
        #     (target_tau) rate.
        # The slower modulation of the target network weights operates to
        # stabilize learning.
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate,
                                    betas=BETAS)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

        # Initialize time step (for updating every update_rate steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, update=True):
        """Store one transition and learn every ``update_rate`` calls.

        Params
        ======
            update (bool): when False the batch is still sampled but no
                learning step is performed
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_rate time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            if update:
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Forward pass in eval mode without gradient tracking, then switch
        # the network back to training mode.
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, DQN=True):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
            DQN (bool): unused; kept for backward compatibility. The
                algorithm is selected by ``self.dqn_type``.
        """
        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using model network
        Qsa = self.network(states).gather(1, actions)

        if self.dqn_type == 'DDQN':
            # Double DQN: the online network picks the next action, the
            # target network evaluates it. gather(1, ...) selects one value
            # per row; plain indexing would index the batch axis instead.
            Qsa_prime_actions = self.network(next_states).detach().max(1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(next_states).detach().gather(
                1, Qsa_prime_actions)
        else:
            # Regular (Vanilla) DQN: max Q value for (s', a') from target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states (no bootstrap on terminal states)
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)

    def save_the_model(self, iteration, f_name):
        """Save the online network's state dict under ``./save/dqn/``."""
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs('./save/dqn/', exist_ok=True)
        f_name = 'dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        torch.save(self.network.state_dict(), './save/dqn/' + f_name)
        print('DQN Model Saved')

    def load_the_model(self, iteration, f_name):
        """Load the online network's state dict saved by ``save_the_model``."""
        f_path = './save/dqn/dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        self.network.load_state_dict(torch.load(f_path))
        print('DQN Model Loaded')
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, duel=True, qnetwork_weights=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            duel (bool): forwarded to ``QNetwork`` as its ``duel`` argument
            qnetwork_weights: optional state dict loaded into both the local
                and the target network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.duel = duel
        # 0 - walk forward
        # 1 - walk backward
        # 2 - turn left
        # 3 - turn right

        # random.seed() returns None; seed the RNG and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # Q-Network: honour the caller's ``duel`` choice instead of always
        # building the networks with duel=True.
        self.qnetwork_local = QNetwork(state_size, action_size, seed, duel=duel).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, duel=duel).to(device)

        # load with trained weights if needed
        if qnetwork_weights is not None:
            self.qnetwork_local.load_state_dict(qnetwork_weights)
            self.qnetwork_target.load_state_dict(qnetwork_weights)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every ``UPDATE_EVERY`` calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Forward pass in eval mode without gradient tracking, then switch
        # the network back to training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().numpy()).astype(int)
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        # The tensors are used as provided; presumably the replay buffer has
        # already placed them on the right device -- verify against its sample().
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # calculate MSE
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, filepath):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            filepath: checkpoint to restore from; skipped when falsy
        """
        self.state_size = state_size
        self.action_size = action_size
        self.avarage_score = 0
        self.start_epoch = 0

        # Seed the RNG *before* drawing the derived seed so that self.seed is
        # reproducible, then re-seed so the RNG state after construction is
        # the freshly seeded one.
        random.seed(seed)
        self.seed = random.randint(0, seed)
        random.seed(seed)
        print("seed ", seed, " self.seed ", self.seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        if filepath:
            self.load_model(filepath)

        # Replay memory
        print("buffer size ", BUFFER_SIZE)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, self.seed)
        print("memory ", self.memory)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition, learn on every call once the buffer is large
        enough, and hard-copy local -> target every ``UPDATE_EVERY`` calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn_DDQN(experiences, GAMMA)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            self.update_network(self.qnetwork_local, self.qnetwork_target)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Forward pass in eval mode without gradient tracking, then switch
        # the network back to training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().numpy())
        return random.choice(np.arange(self.action_size))

    def learn_DDQN(self, experiences, gamma):
        """Double-DQN update using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # The local network selects the next action ...
        Q_targets_next_argmax = self.qnetwork_local(next_states).squeeze(
            0).detach().max(1)[1].unsqueeze(1)
        # ... and the target network evaluates it. Detached so that backward()
        # does not accumulate gradients in the target network.
        Q_targets_next = self.qnetwork_target(next_states).squeeze(0).detach().gather(
            1, Q_targets_next_argmax)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # The target network is synchronised in step(), not here.

    def learn(self, experiences, gamma):
        """Vanilla DQN update using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next0 = self.qnetwork_target(next_states).squeeze(0).detach()
        Q_targets_next = Q_targets_next0.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # The target network is synchronised in step(), not here.

    def save_model(self, filepath, epoch, score, last=False):
        """Write a full checkpoint (sizes, weights, optimizer, progress).

        Params
        ======
            filepath: destination of the checkpoint
            epoch: stored under 'epoch' and used in the extra file name
            score: stored under 'avarage_score'
            last: when truthy, the bare state dict is additionally saved to
                '<last>_state_dict_<epoch>.pt'
        """
        layers = self.qnetwork_local.hidden_layers
        # Layer widths: every layer's input size followed by the last
        # layer's output size.
        hidden_sizes = [each.in_features for each in layers]
        hidden_sizes.append(layers[-1].out_features)

        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'hidden_layers': hidden_sizes,
            'state_dict': self.qnetwork_local.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epoch': epoch,
            'avarage_score': score
        }
        torch.save(checkpoint, filepath)

        if last:
            # NOTE(review): ``last`` is interpolated into the file name, so a
            # plain True yields 'True_state_dict_<epoch>.pt' -- confirm
            # whether callers pass a prefix string here.
            torch.save(self.qnetwork_local.state_dict(),
                       '{}_state_dict_{}.pt'.format(last, epoch))

    def load_model(self, filepath):
        """Restore networks, optimizer and progress from a checkpoint.

        Prints a message and leaves the agent untouched when ``filepath``
        is not an existing file.
        """
        print("seed ", self.seed)
        if not os.path.isfile(filepath):
            print("=> no checkpoint found at '{}'".format(filepath))
            return

        print("=> loading checkpoint '{}'".format(filepath))
        # map_location lets a checkpoint saved on another device be restored.
        checkpoint = torch.load(filepath, map_location=device)
        print("checkpoint['hidden_layers'] ", checkpoint['hidden_layers'])

        self.qnetwork_local = QNetwork(checkpoint['input_size'],
                                       checkpoint['output_size'], self.seed,
                                       checkpoint['hidden_layers']).to(device)
        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
        self.qnetwork_local.to(device)

        self.qnetwork_target = QNetwork(checkpoint['input_size'],
                                        checkpoint['output_size'], self.seed,
                                        checkpoint['hidden_layers']).to(device)
        self.qnetwork_target.load_state_dict(checkpoint['state_dict'])
        self.qnetwork_target.to(device)

        # The local network object was just replaced, so the optimizer must
        # be rebuilt on the new parameters; otherwise it would keep updating
        # the discarded network and training would have no effect.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        if 'optimizer_state_dict' in checkpoint:
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            # Make sure the optimizer's internal tensors live on ``device``.
            for state in self.optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(device)
            print(self.optimizer)

        if 'epoch' in checkpoint:
            self.start_epoch = checkpoint['epoch']

        if 'avarage_score' in checkpoint:
            self.avarage_score = checkpoint['avarage_score']

        print(self.qnetwork_target)
        print(self.optimizer)

    def update_network(self, local_model, target_model):
        """Hard update: copy every local parameter into the target model."""
        for target, local in zip(target_model.parameters(), local_model.parameters()):
            target.data.copy_(local.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=0, gamma=0.99, learning_rate=5e-4,
                 use_RB=True, RB_size=int(1e5), RB_batch_size=64,
                 use_TM=True, TM_update_every=4, use_DDQN=True,
                 use_PER=False, PER_epsilon=0.01, PER_alpha=0.5, PER_beta=0.4,
                 PER_beta_increment=0.001, use_DUELING=True):
        """Initialize an Agent object.

        Params
        ======
            state_size (int)            : dimension of each state
            action_size (int)           : dimension of each action
            seed (int)                  : random seed
            gamma (float)               : discount factor
            learning_rate (float)       : learning rate of the model
            use_RB (bool)               : Use a replay buffer
            RB_size (int)               : replay buffer size
            RB_batch_size (int)         : minibatch size of the learning
            use_TM (bool)               : Use a target model
            TM_update_every (int)       : update target model every t steps
            use_DDQN (bool)             : Use Double DQN, only valid if use target model
            use_PER (bool)              : Use a prioritized replay buffer
            PER_epsilon (float)         : Small value added to priorities to avoid zero probabilities
            PER_alpha (float)           : Power used to compute the sampling probabilities
                                          [0-1] : 0 => Uniform sampling, 1 => Fully prioritized
            PER_beta (float)            : Used in importance-sampling - Initial value increased to 1
            PER_beta_increment (float)  : To increment beta at each sampling
            use_DUELING (bool)          : Use DUELING network
        """
        # Control some parameters
        # To make sure we remember to update RB params
        assert not use_PER or use_RB, "Use replay buffer if use PER"
        assert not use_DDQN or use_TM, "Use target model if use DDQN"

        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma

        # Q-Network
        self.qnetwork_policy = QNetwork(state_size, action_size, seed,
                                        use_DUELING=use_DUELING).to(device)
        self.optimizer = optim.Adam(self.qnetwork_policy.parameters(), lr=learning_rate)

        self.use_DDQN = use_DDQN
        self.use_TM = use_TM
        if use_TM:
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            use_DUELING=use_DUELING).to(device)
        self.TM_update_every = TM_update_every

        # Initialize time step
        self.t_step = 0

        # Replay memory
        self.use_RB = use_RB
        self.RB_batch_size = RB_batch_size
        self.use_PER = use_PER
        if use_PER:
            self.memory = ReplayBufferPER(RB_size, RB_batch_size, seed,
                                          epsilon=PER_epsilon, alpha=PER_alpha,
                                          beta=PER_beta,
                                          beta_increment=PER_beta_increment)
        elif use_RB:
            self.memory = ReplayBuffer(RB_size, RB_batch_size, seed)

        # Init the seed
        random.seed(seed)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection: explore first so that no forward
        # pass is made for a random action.
        if random.random() <= eps:
            return random.choice(np.arange(self.action_size))

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # The network is put back into train mode by _QValues().
        self.qnetwork_policy.eval()
        with torch.no_grad():
            action_values = self.qnetwork_policy(state)
        return np.argmax(action_values.cpu().numpy())

    def step(self, state, action, reward, next_state, done):
        """Record one transition and trigger a learning step when possible."""
        experience = (state, action, reward, next_state, done)

        # Save experience in replay memory if any
        if self.use_PER:
            # Need to compute the error of this experience
            Q_target, Q_expected = self._QValues([experience])
            error = (Q_target - Q_expected).detach().cpu().squeeze().item()
            self.memory.add(error, experience)
        elif self.use_RB:
            self.memory.add(experience)
        else:
            self.experiences = [experience]

        # One more step.
        self.t_step += 1

        # If no replay buffer or enough samples available in memory, learn
        if not self.use_RB or len(self.memory) > self.RB_batch_size:
            self._learn()

    def _QValues(self, batch):
        """Execute a forward pass of the Q-networks to get the Q values
           (expected and target), so the TD error can be computed or used
           to learn.

        Params
        ======
            batch : iterable of tuples <state, action, reward, next_state, done>

        Returns
        ======
            (Q_targets, Q_expected): tensors with one row per experience
        """
        # Split the batch into columns. zip() is used instead of
        # np.array(batch).transpose() because the tuples mix arrays and
        # scalars, which NumPy >= 1.24 refuses to turn into an array.
        state_col, action_col, reward_col, next_state_col, done_col = zip(*batch)

        states = torch.from_numpy(np.vstack(state_col)).float().to(device)
        actions = torch.from_numpy(np.vstack(action_col)).long().to(device)
        rewards = torch.from_numpy(np.vstack(reward_col)).float().to(device)
        next_states = torch.from_numpy(np.vstack(next_state_col)).float().to(device)
        dones = torch.from_numpy(np.vstack(done_col).astype(int)).float().to(device)

        # Next-state action values from whichever networks are needed
        if not self.use_TM or self.use_DDQN:
            self.qnetwork_policy.eval()
            with torch.no_grad():
                action_values_policy = self.qnetwork_policy(next_states)

        if self.use_TM:
            self.qnetwork_target.eval()
            with torch.no_grad():
                action_values_target = self.qnetwork_target(next_states)

        if not self.use_TM:
            # No target model: bootstrap from the policy network itself
            Q_targets_next = action_values_policy.max(dim=1, keepdim=True)[0]
        elif self.use_DDQN:
            # Double DQN: policy network selects, target network evaluates
            best_actions = action_values_policy.max(dim=1, keepdim=True)[1]
            Q_targets_next = action_values_target.gather(dim=1, index=best_actions)
        else:
            Q_targets_next = action_values_target.max(dim=1, keepdim=True)[0]

        # Need to be at zero if we were done
        Q_targets_next *= torch.ones_like(dones) - dones

        # Compute the Q targets for current states
        Q_targets = rewards + self.gamma * Q_targets_next

        # Get the Q values from policy model
        self.qnetwork_policy.train()
        Q_expected = self.qnetwork_policy(states).gather(dim=1, index=actions)

        return Q_targets, Q_expected

    def _learn(self):
        """Update value parameters using a batch of experience tuples."""
        if self.use_PER:
            experiences, indexes, IS_weights = self.memory.sample()
            IS_weights = torch.from_numpy(np.vstack(IS_weights)).float().to(device)
        elif self.use_RB:
            experiences = self.memory.sample()
        else:
            experiences = self.experiences

        # Get the Qvalues for those experiences
        Q_targets, Q_expected = self._QValues(experiences)

        if self.use_PER:
            # Update priorities of the replay buffer
            errors = (Q_targets - Q_expected).detach().cpu().squeeze().numpy()
            self.memory.update_priorities(indexes, errors)

            # Scaling both sides by sqrt(w) weights each squared error by w,
            # i.e. applies the importance-sampling correction to the MSE.
            sqrt_weights = IS_weights ** 0.5
            Q_expected = Q_expected * sqrt_weights
            Q_targets = Q_targets * sqrt_weights

        # Loss computation
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.use_TM:
            self.t_step %= self.TM_update_every
            if self.t_step == 0:
                self.qnetwork_target.load_state_dict(self.qnetwork_policy.state_dict())

    def save_weights(self, file='checkpoint.pth'):
        """Save the agent network weights in a checkpoint file"""
        torch.save(self.qnetwork_policy.state_dict(), file)

    def load_weights(self, file='checkpoint.pth'):
        """Load the agent network weights from a checkpoint file.

        When a target model is used it receives the same weights, so the
        first updates after loading do not bootstrap from a stale target.
        """
        state_dict = torch.load(file)
        self.qnetwork_policy.load_state_dict(state_dict)
        if self.use_TM:
            self.qnetwork_target.load_state_dict(state_dict)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed the RNG and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every ``UPDATE_EVERY`` calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Forward pass in eval mode without gradient tracking, then switch
        # the network back to training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1.0 - tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace

        Params
        ======
            filename (string): where to save the weights
        """
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'hidden_layers': [each.out_features
                              for each in self.qnetwork_local.hidden_layers],
            'state_dict': self.qnetwork_local.state_dict()
        }
        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """ Load weights to update agent's Q-Network.

        Expected is a format like the one produced by self.save(). On any
        size mismatch an error is printed and nothing is loaded.

        Params
        ======
            filename (string): where to load data from.
        """
        checkpoint = torch.load(filename)

        if checkpoint['input_size'] != self.state_size:
            print(
                f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None

        if checkpoint['output_size'] != self.action_size:
            print(
                f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None

        my_hidden_layers = [
            each.out_features for each in self.qnetwork_local.hidden_layers
        ]
        if checkpoint['hidden_layers'] != my_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: hidden layers {checkpoint['hidden_layers']} don't match agent's hidden layers {my_hidden_layers}"
            )
            return None

        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
        # Copy the weights into the existing target network. Rebinding the
        # attribute to the local network would make both names refer to one
        # object, turning soft_update() into a no-op and removing the
        # stabilising effect of a separate target.
        self.qnetwork_target.load_state_dict(checkpoint['state_dict'])