class TD3(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def to_cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.critic.cuda()

    def __init__(self, args):
        """Build the actor/critic pairs, their optimizers and the stat trackers.

             Parameters:
                   args (object): Parameter class. ``gamma`` and ``tau`` are
                       read here; the whole object is also handed to
                       ``Actor`` and ``Critic``.
         """
        self.args = args

        # Online / target policy pair; only the online network is optimised.
        self.actor = Actor(args)
        self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        # Online / target critic pair.
        self.critic = Critic(args)
        self.critic.apply(utils.init_weights)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=1e-3)

        self.gamma = args.gamma
        self.tau = args.tau
        self.loss = nn.MSELoss()

        # Make sure each target starts with the same weights as its online net.
        for target_net, online_net in ((self.actor_target, self.actor),
                                       (self.critic_target, self.critic)):
            self.hard_update(target_net, online_net)

        for network in (self.actor_target, self.critic_target, self.actor,
                        self.critic):
            network.cuda()
        self.num_critic_updates = 0

        # Statistics trackers: one list per summary statistic.
        stat_keys = ('min', 'max', 'mean', 'std')
        self.action_loss = {key: [] for key in stat_keys}
        self.policy_loss = {key: [] for key in stat_keys}
        self.critic_loss = {'mean': []}
        self.q = {key: [] for key in stat_keys}
        self.val = {key: [] for key in stat_keys}

    def compute_stats(self, tensor, tracker):
        """Append min/max/mean/std of ``tensor`` to the lists in ``tracker``.

             Parameters:
                   tensor (tensor): tensor to summarise
                   tracker (dict): logger holding 'min', 'max', 'mean' and
                       'std' lists; one value is appended to each

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        # Fix: the mean used to be appended twice while 'std' stayed empty.
        # torch.std of a single element is nan, so log 0.0 in that case.
        spread = torch.std(tensor).item() if tensor.numel() > 1 else 0.0
        tracker['std'].append(spread)

    def update_parameters(self,
                          state_batch,
                          next_state_batch,
                          action_batch,
                          reward_batch,
                          done_batch,
                          dpp,
                          num_epoch=1):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            reward_batch = torch.cat(reward_batch).done_batch = torch.cat(
                done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val, next_v and target
            with torch.no_grad():
                #Policy Noise
                policy_noise = np.random.normal(
                    0, self.args.policy_noise,
                    (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -self.args.policy_noise_clip,
                                           self.args.policy_noise_clip)

                #Compute next action_bacth
                next_action_batch = self.actor_target.forward(
                    next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                #Compute Q-val and value of next state masking by done
                q1, q2, next_val = self.critic_target.forward(
                    next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2
                next_val = (1 - done_batch) * next_val
                next_q = torch.min(q1, q2)

                #Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                target_val = reward_batch + (self.gamma * next_val)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(
                (state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            dt = dt + self.loss(current_val, target_val)
            self.compute_stats(current_val, self.val)

            dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            #Delayed Actor Update
            if self.num_critic_updates % self.args.policy_ups_freq == 0:

                actor_actions = self.actor.forward(state_batch)

                if dpp:
                    policy_loss = -self.shape_dpp(self.critic, self.actor,
                                                  state_batch,
                                                  self.args.sensor_model)

                else:
                    Q1, Q2, val = self.critic.forward(state_batch,
                                                      actor_actions)
                    policy_loss = -(Q1 - val)

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()
                self.actor_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                if self.args.action_loss:
                    action_loss = torch.abs(actor_actions - 0.5)
                    self.compute_stats(action_loss, self.action_loss)
                    action_loss = action_loss.mean() * self.args.action_loss_w
                    action_loss.backward()
                    #if self.action_loss[-1] > self.policy_loss[-1]: self.args.action_loss_w *= 0.9 #Decay action_w loss if action loss is larger than policy gradient loss
                self.actor_optim.step()

                if self.num_critic_updates % self.args.policy_ups_freq == 0:
                    self.soft_update(self.actor_target, self.actor, self.tau)
                self.soft_update(self.critic_target, self.critic, self.tau)

    def soft_update(self, target, source, tau):
        """Blend the source network's weights into the target network.

            Each target parameter becomes
            ``target * (1 - tau) + source * tau``; the source is not modified.

            Parameters:
                  target (object): A pytorch model, overwritten in place
                  source (object): A pytorch model supplying the new weights
                  tau (float): Tau parameter (weight given to the source)

            Returns:
                None

        """
        keep = 1.0 - tau
        for dst, src in zip(target.parameters(), source.parameters()):
            blended = dst.data * keep + src.data * tau
            dst.data.copy_(blended)

    def hard_update(self, target, source):
        """Copy every parameter of the source network into the target network.

            Parameters:
                  target (object): A pytorch model, overwritten in place
                  source (object): A pytorch model whose weights are cloned

            Returns:
                None
        """
        paired = zip(target.parameters(), source.parameters())
        for dst, src in paired:
            dst.data.copy_(src.data)

    def shape_dpp(self, critic, actor, state, sensor_model):
        """Return a shaped ``Q1 - val`` term built from perturbed copies of ``state``.

            The unperturbed ``Q1 - val`` is computed first. Then, for each of
            ``self.args.coupling`` rounds, one more column per row of the
            state is overwritten with 1.0 and ``(Q1 - val) / (round + 1)`` is
            recomputed. The row-wise maximum over all of these is returned,
            expressed as ``original * (max / original)`` with the ratio
            computed under ``no_grad`` so gradients flow only through the
            unperturbed term.

            Parameters:
                  critic (object): callable returning ``(Q1, _, val)`` for (state, action)
                  actor (object): callable mapping a state batch to actions
                  state (tensor): state batch; a numpy copy is perturbed in place
                  sensor_model (str): the state is only overwritten when this
                      is 'density' or 'closets'

            Returns:
                tensor: shaped term, concatenated/maximised along dim 1
        """

        Q1, _, val = critic((state), actor((state)))
        original_T = Q1 - val

        all_adv = [original_T]

        # From here on `state` is a numpy array that is modified in place.
        state = utils.to_numpy(state.cpu())
        #mid_index = int(180 / self.args.angle_res)
        coupling = self.args.coupling

        max_ind = int(360 / self.args.angle_res)

        # Per row: indices among the first max_ind columns whose value is not -1.
        perturb_index = [
            np.argwhere(state[i, 0:max_ind] != -1).flatten()
            for i in range(len(state))
        ]
        for i, entry in enumerate(perturb_index):
            np.random.shuffle(entry)
            # Rows with fewer candidates than `coupling` repeat their indices.
            # NOTE(review): a row with no candidate at all stays empty and
            # would raise IndexError below — confirm this cannot happen.
            if len(entry) < coupling:
                perturb_index[i] = np.tile(entry, (coupling, 1)).flatten()

        for coupling_mag in range(coupling):

            empty_ind = [int(entry[coupling_mag]) for entry in perturb_index]

            # Both branches perform the same write; perturbations accumulate
            # across rounds because `state` is never reset.
            if sensor_model == 'density':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0
            elif sensor_model == 'closets':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0

            shaped_state = utils.to_tensor(state).cuda()

            Q1, _, val = critic((shaped_state), actor((shaped_state)))
            adv = (Q1 - val) / (coupling_mag + 1)
            all_adv.append(adv)

        all_adv = torch.cat(all_adv, 1)
        dpp_max = torch.max(all_adv, 1)[0].unsqueeze(1)

        # NOTE(review): divides by original_T with no zero guard — a zero
        # entry would give inf/nan here.
        with torch.no_grad():
            normalizer = dpp_max / original_T

        return original_T * normalizer
Пример #2
0
class TD3(object):
	"""Classes implementing TD3 and DDPG off-policy learners

		 Parameters:
			   args (object): Parameter class


	 """
	def __init__(self, id, algo_name, state_dim, action_dim, hidden_size, actor_lr, critic_lr, gamma, tau, savetag, foldername, actualize, use_gpu, init_w = True):
		"""Build the policy/critic pairs, optional actualization network, optimizers and trackers.

			 Parameters:
				   id: stored as ``agent_id``
				   algo_name (str): stored as ``algo_name``
				   state_dim, action_dim, hidden_size: forwarded to the network constructors
				   actor_lr, critic_lr: passed to the Adam optimizers (critic_lr is also used for the actualization network)
				   gamma, tau: stored on the instance
				   savetag (str): suffix appended to every tracked metric name
				   foldername: forwarded to ``utils.Tracker``
				   actualize (bool): if truthy, also build ``ANetwork`` and its optimizer
				   use_gpu (bool): if truthy, call ``.cuda()`` on every network
				   init_w (bool): if truthy, apply ``utils.init_weights`` to the online networks
		 """
		self.algo_name = algo_name
		self.gamma = gamma
		self.tau = tau
		self.total_update = 0
		self.agent_id = id
		self.actualize = actualize
		self.use_gpu = use_gpu

		tracked_names = ['q_'+savetag, 'qloss_'+savetag, 'policy_loss_'+savetag, 'alz_score'+savetag, 'alz_policy'+savetag]
		self.tracker = utils.Tracker(foldername, tracked_names, '.csv', save_iteration=1000, conv_size=1000)

		# Online / target policy pair; the target starts as a copy of the online net.
		self.policy = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
		if init_w:
			self.policy.apply(utils.init_weights)
		self.policy_target = Actor(state_dim, action_dim, hidden_size, policy_type='DeterministicPolicy')
		utils.hard_update(self.policy_target, self.policy)
		self.policy_optim = Adam(self.policy.parameters(), actor_lr)

		# Online / target critic pair.
		self.critic = QNetwork(state_dim, action_dim,hidden_size)
		if init_w:
			self.critic.apply(utils.init_weights)
		self.critic_target = QNetwork(state_dim, action_dim, hidden_size)
		utils.hard_update(self.critic_target, self.critic)
		self.critic_optim = Adam(self.critic.parameters(), critic_lr)

		if actualize:
			self.ANetwork = ActualizationNetwork(state_dim, action_dim, hidden_size)
			if init_w:
				self.ANetwork.apply(utils.init_weights)
			self.actualize_optim = Adam(self.ANetwork.parameters(), critic_lr)
			self.actualize_lr = 0.2
			if use_gpu:
				self.ANetwork.cuda()

		self.loss = nn.MSELoss()

		if use_gpu:
			for network in (self.policy_target, self.critic_target, self.policy, self.critic):
				network.cuda()
		self.num_critic_updates = 0

		# Statistics trackers: latest value per summary statistic, None until filled.
		stat_keys = ('min', 'max', 'mean', 'std')
		self.policy_loss = dict.fromkeys(stat_keys)
		self.q_loss = dict.fromkeys(stat_keys)
		self.q = dict.fromkeys(stat_keys)
		self.alz_score = dict.fromkeys(stat_keys)
		self.alz_policy = dict.fromkeys(stat_keys)


	def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, global_reward, num_epoch=1, **kwargs):
		"""Runs a step of Bellman update and policy gradient using a batch of experiences

			 Parameters:
				  state_batch (tensor): Current States
				  next_state_batch (tensor): Next States
				  action_batch (tensor): Actions
				  reward_batch (tensor): Rewards
				  done_batch (tensor): Done batch
				  global_reward (tensor): Reward used as the actualization-network target (only read when ``self.actualize`` is set)
				  num_epoch (int): Number of learning iteration to run with the same data
				  **kwargs: must provide 'policy_noise', 'policy_noise_clip' and 'policy_ups_freq'

			 Each batch argument may also be a list of tensors, in which case
			 the pieces are concatenated first.

			 Returns:
				   None

			 Raises:
				   ValueError: if ``self.algo_name`` is not one of 'TD3', 'TD3_actor_min', 'DDPG', 'TD3_max'

		 """

		if isinstance(state_batch, list):
			state_batch = torch.cat(state_batch)
			next_state_batch = torch.cat(next_state_batch)
			action_batch = torch.cat(action_batch)
			# Fix: a stray '.' fused these two statements into one chained
			# assignment, binding reward_batch to the done tensor.
			reward_batch = torch.cat(reward_batch)
			done_batch = torch.cat(done_batch)
			global_reward = torch.cat(global_reward)

		for _ in range(num_epoch):
			########### CRITIC UPDATE ####################

			#Compute next q-val and target
			with torch.no_grad():
				#Policy Noise
				policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1]))
				policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])
				if self.use_gpu:
					policy_noise = policy_noise.cuda()

				#Compute next action batch.
				#Fix: the original `a + n.cuda() if use_gpu else n` parsed as
				#`(a + n.cuda()) if use_gpu else n`, so without a GPU the next
				#action was the noise alone.
				next_action_batch = self.policy_target.clean_action(next_state_batch, return_only_action=True) + policy_noise
				next_action_batch = torch.clamp(next_action_batch, -1, 1)

				#Compute Q-val of next state masking by done
				q1, q2 = self.critic_target.forward(next_state_batch, next_action_batch)
				q1 = (1 - done_batch) * q1
				q2 = (1 - done_batch) * q2

				#Select which q to use as next-q (depends on algo)
				if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min':
					next_q = torch.min(q1, q2)
				elif self.algo_name == 'DDPG':
					next_q = q1
				elif self.algo_name == 'TD3_max':
					next_q = torch.max(q1, q2)
				else:
					#Previously this fell through and failed later on an unbound next_q.
					raise ValueError('Unsupported algo_name: %r' % (self.algo_name,))

				#Compute target q
				target_q = reward_batch + (self.gamma * next_q)

			if self.actualize:
				##########Actualization Network Update
				#The target moves the current score a fraction actualize_lr towards 10x the global reward.
				current_Ascore = self.ANetwork.forward(state_batch, action_batch)
				utils.compute_stats(current_Ascore, self.alz_score)
				target_Ascore = (self.actualize_lr) * (global_reward * 10.0) + (1 - self.actualize_lr) * current_Ascore.detach()
				actualize_loss = self.loss(target_Ascore, current_Ascore).mean()

			self.critic_optim.zero_grad()
			current_q1, current_q2 = self.critic.forward((state_batch), (action_batch))
			utils.compute_stats(current_q1, self.q)

			dt = self.loss(current_q1, target_q)

			#The second Q head only contributes to the loss for these two variants.
			if self.algo_name == 'TD3' or self.algo_name == 'TD3_max':
				dt = dt + self.loss(current_q2, target_q)
			utils.compute_stats(dt, self.q_loss)

			dt.backward()

			self.critic_optim.step()
			self.num_critic_updates += 1

			if self.actualize:
				self.actualize_optim.zero_grad()
				actualize_loss.backward()
				self.actualize_optim.step()

			#Delayed Actor Update
			if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

				#NOTE(review): return_only_action=False here but True for the target above - confirm the return type matches what the critic expects.
				actor_actions = self.policy.clean_action(state_batch, return_only_action=False)

				Q1, Q2 = self.critic.forward(state_batch, actor_actions)

				policy_loss = -Q1

				utils.compute_stats(-policy_loss,self.policy_loss)
				policy_loss = policy_loss.mean()

				###Actualize Policy Update
				if self.actualize:
					A1 = self.ANetwork.forward(state_batch, actor_actions)
					utils.compute_stats(A1, self.alz_policy)
					policy_loss += -A1.mean()*0.1

				self.policy_optim.zero_grad()
				policy_loss.backward(retain_graph=True)
				self.policy_optim.step()

			#Policy target follows the delayed schedule; critic target is updated every iteration.
			if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
				utils.soft_update(self.policy_target, self.policy, self.tau)
			utils.soft_update(self.critic_target, self.critic, self.tau)

			self.total_update += 1
			if self.agent_id == 0:
				self.tracker.update([self.q['mean'], self.q_loss['mean'], self.policy_loss['mean'],self.alz_score['mean'], self.alz_policy['mean']] ,self.total_update)
Пример #3
0
class PPO(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def __init__(self, args):
        """Build the actor, value function, shared optimizer and stat trackers.

             Parameters:
                   args (object): Parameter class. ``init_w`` and ``gamma``
                       are read here; the whole object is also handed to
                       ``Actor`` and ``ValueFunc``.
         """

        self.args = args

        self.actor = Actor(args)
        if args.init_w: self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)

        self.vfunc = ValueFunc(args)
        if args.init_w: self.vfunc.apply(utils.init_weights)

        # Fix: the optimizer used to hold only the actor's parameters, yet
        # update_parameters backpropagates a combined actor + critic loss
        # through it, so the value function was never trained (and its
        # gradients were never zeroed). Optimise both networks together.
        trainable = list(self.actor.parameters()) + list(
            self.vfunc.parameters())
        self.optim = Adam(trainable, lr=5e-4)

        self.gamma = args.gamma
        self.loss = nn.SmoothL1Loss()  #nn.MSELoss()

        # Networks are not moved to the GPU here.
        self.num_critic_updates = 0

        #Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_gae(self, trajectory, gamma=0.99, tau=0.95):
        with torch.no_grad():
            values = []
            next_values = []
            rewards = []
            masks = []
            states = []
            actions = []

            for entry in trajectory:
                states.append(torch.tensor(entry[0]))
                actions.append(torch.tensor(entry[1]))
                values.append(self.vfunc(torch.Tensor(entry[0])))
                rewards.append(torch.Tensor(entry[3]))
                masks.append(torch.Tensor(entry[5]))
            values.append(self.vfunc(torch.Tensor(entry[2])))

            gae = 0.0
            returns = []
            for step in reversed(range(len(rewards))):
                delta = rewards[step] + gamma * values[
                    step + 1] * masks[step] - values[step]
                gae = delta + gamma * tau * masks[step] * gae
                returns.insert(0, gae + values[step])

        return states, actions, values, returns

    def compute_stats(self, tensor, tracker):
        """Append min/max/mean/std of ``tensor`` to the lists in ``tracker``.

             Parameters:
                   tensor (tensor): tensor to summarise
                   tracker (dict): logger holding 'min', 'max', 'mean' and
                       'std' lists; one value is appended to each

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        # Fix: the mean used to be appended twice while 'std' stayed empty.
        # torch.std of a single element is nan, so log 0.0 in that case.
        spread = torch.std(tensor).item() if tensor.numel() > 1 else 0.0
        tracker['std'].append(spread)

    def update_parameters(self,
                          states,
                          actions,
                          log_probs,
                          returns,
                          advantages,
                          ppo_epochs=8,
                          mini_batch_size=128,
                          clip_param=0.2):
        """Runs a step of Bellman upodate and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Returns:
                   None

         """

        for _ in range(ppo_epochs):
            ind = random.sample(range(len(states)), mini_batch_size)
            mini_s = states[ind]
            mini_a = actions[ind]
            mini_ret = returns[ind]
            mini_adv = advantages[ind]

            #PPO Update
            new_action, value = self.actor(mini_s), self.vfunc(mini_s)

            ratio = mini_a - new_action
            surr1 = ratio * mini_adv
            surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                1.0 + clip_param) * mini_adv

            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = (mini_ret - value).pow(2).mean()

            loss = 0.5 * critic_loss + actor_loss

            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

    def soft_update(self, target, source, tau):
        """Move the target network's weights a fraction ``tau`` towards the source.

            Each target parameter becomes
            ``target * (1 - tau) + source * tau``; the source is not modified.

            Parameters:
                  target (object): A pytorch model, overwritten in place
                  source (object): A pytorch model supplying the new weights
                  tau (float): Tau parameter (weight given to the source)

            Returns:
                None

        """
        retain = 1.0 - tau
        target_params = target.parameters()
        source_params = source.parameters()
        for old, new in zip(target_params, source_params):
            mixed = old.data * retain + new.data * tau
            old.data.copy_(mixed)

    def hard_update(self, target, source):
        """Overwrite the target network's parameters with the source's values.

            Parameters:
                  target (object): A pytorch model, overwritten in place
                  source (object): A pytorch model whose weights are cloned

            Returns:
                None
        """
        target_params = target.parameters()
        source_params = source.parameters()
        for old, new in zip(target_params, source_params):
            old.data.copy_(new.data)
Пример #4
0
class Off_Policy_Algo(object):
    """Classes implementing TD3 and DDPG off-policy learners

         Parameters:
               args (object): Parameter class


     """
    def __init__(self,
                 wwid,
                 algo_name,
                 state_dim,
                 action_dim,
                 actor_lr,
                 critic_lr,
                 gamma,
                 tau,
                 init_w=True):
        """Build the actor/critic pairs, their optimizers and the stat trackers.

             Parameters:
                   wwid: forwarded to the ``Actor`` constructor
                   algo_name (str): stored and forwarded to the ``Actor`` constructor
                   state_dim, action_dim: forwarded to the network constructors
                   actor_lr, critic_lr: passed to the Adam optimizers
                   gamma, tau: stored on the instance
                   init_w (bool): if truthy, apply ``utils.init_weights`` to
                       the online networks
         """
        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau

        self.HLoss = HLoss()

        # Online / target actor pair; the target starts as a copy.
        self.actor = Actor(state_dim, action_dim, wwid, self.algo_name)
        if init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid, self.algo_name)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)

        # Online / target critic pair.
        self.critic = Critic(state_dim, action_dim)
        if init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        # Networks go to the GPU only when one is available.
        if torch.cuda.is_available():
            for network in (self.actor_target, self.critic_target,
                            self.actor, self.critic):
                network.cuda()
        self.num_critic_updates = 0

        # Statistics trackers: one list per summary statistic.
        stat_keys = ('min', 'max', 'mean', 'std')
        self.action_loss = {key: [] for key in stat_keys}
        self.policy_loss = {key: [] for key in stat_keys}
        self.critic_loss = {'mean': []}
        self.q = {key: [] for key in stat_keys}
        self.val = {key: [] for key in stat_keys}

    def save_net(self, path):
        torch.save(self.actor.state_dict(), path)

    def act(self, state):
        return self.actor(state)

    def share_memory(self):
        self.actor.share_memory()
        self.actor_target.share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

    def compute_stats(self, tensor, tracker):
        """Append min/max/mean/std of ``tensor`` to the lists in ``tracker``.

             Parameters:
                   tensor (tensor): tensor to summarise
                   tracker (dict): logger holding 'min', 'max', 'mean' and
                       'std' lists; one value is appended to each

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        # Fix: the mean used to be appended twice while 'std' stayed empty.
        # torch.std of a single element is nan, so log 0.0 in that case.
        spread = torch.std(tensor).item() if tensor.numel() > 1 else 0.0
        tracker['std'].append(spread)

    def update_parameters(self,
                          state_batch,
                          next_state_batch,
                          action_batch,
                          reward_batch,
                          done_batch,
                          num_epoch=1,
                          **kwargs):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data
                  **kwargs: must provide 'policy_noise', 'policy_noise_clip'
                      and 'policy_ups_freq'

             Each batch argument may also be a list of tensors, in which case
             the pieces are concatenated first.

             Returns:
                   None

             Raises:
                   ValueError: if ``self.algo_name`` is not one of 'TD3',
                       'TD3_actor_min', 'dis', 'DDPG', 'TD3_max'

         """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            # Fix: a stray '.' fused these two statements into one chained
            # assignment, binding reward_batch to the done tensor.
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            #Compute next q-val and target
            with torch.no_grad():
                #Policy Noise
                policy_noise = np.random.normal(
                    0, kwargs['policy_noise'],
                    (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -kwargs['policy_noise_clip'],
                                           kwargs['policy_noise_clip'])

                #Compute next action batch: one-hot of the target output for
                #'dis', otherwise target output plus clipped noise. The noise
                #follows the output's device instead of a hard-coded .cuda(),
                #matching the availability check done in the constructor.
                target_out = self.actor_target.forward(next_state_batch)
                if self.algo_name == 'dis':
                    next_action_batch = self.actor_target.turn_max_into_onehot(
                        target_out)
                else:
                    next_action_batch = target_out + policy_noise.to(
                        target_out.device)
                if random.random() < 0.0001:
                    print('off_policy line 114, changed next action batch')
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                #Compute Q-val of next state masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch,
                                                       next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                #Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min' or self.algo_name == 'dis':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)
                else:
                    # Previously this fell through and failed later on an
                    # unbound next_q.
                    raise ValueError('Unsupported algo_name: %r' %
                                     (self.algo_name, ))

                #Compute target q
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(
                (state_batch),
                (action_batch
                 ))  #here the action batch should be the soft version
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)

            # The second Q head only contributes for these variants.
            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max' or self.algo_name == 'dis':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            #Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                # NOTE(review): use_cuda=True is hard-coded for the 'dis'
                # sampler - confirm it is safe on CPU-only runs.
                actor_actions = self.actor.Gumbel_softmax_sample_distribution(state_batch, use_cuda=True)\
                    if self.algo_name == 'dis' else self.actor.forward(state_batch)
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                policy_loss = -Q1 + 0.1 * self.HLoss(
                    actor_actions
                )  # HLoss is a single scalar, directly regularized logits?

                if random.random() < 0.0005:
                    print('added entropy regularization, off_policy_algo 161')

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()

            # Actor target follows the delayed schedule; the critic target is
            # soft-updated every iteration.
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.actor_target, self.actor, self.tau)
            utils.soft_update(self.critic_target, self.critic, self.tau)

    def test_actor_gradient_descent(self, state_batch):
        """Debug helper: run 10 actor gradient steps on one batch and print diagnostics.

             Each iteration prints the first row of the actor's output-layer
             logits and actions, the mean ``-Q1`` policy loss, the first
             gradient entry of ``actor.f1.bias`` and, after the optimizer
             step, the first bias entry. This really updates the actor through
             ``self.actor_optim``; it is not a side-effect-free check.

             Parameters:
                   state_batch (tensor): states fed to the actor and critic

             Returns:
                   None
         """
        #this method test if running gradient descent on the actor actually decrease the loss
        print("test_actor_gradient_descent, off_policy_algo line 179")
        for i in range(10):
            actor_actions = self.actor.forward(state_batch)
            print("logits_",
                  self.actor.w_out(self.actor.logits(state_batch))[0])
            print("action_batch", actor_actions[0])
            Q1, Q2, val = self.critic.forward(state_batch, actor_actions)
            # Plain -Q1 objective; no entropy term is added here.
            policy_loss = -Q1
            policy_loss = policy_loss.mean()
            print("policy_loss at i = ", i, " is ", policy_loss)
            self.actor_optim.zero_grad()
            policy_loss.backward(retain_graph=True)
            print("gradient_", self.actor.f1.bias.grad[0])
            self.actor_optim.step()
            print("bias_", self.actor.f1.bias[0])
Пример #5
0
class Off_Policy_Algo(object):
    """Off-policy actor-critic learner implementing TD3 and DDPG style updates.

         Parameters:
               wwid: Passed through unchanged as the third argument of Actor
               algo_name (str): 'TD3', 'TD3_actor_min', 'TD3_max' or 'DDPG'
               state_dim: Passed to the Actor and Critic constructors
               action_dim: Passed to the Actor and Critic constructors
               actor_lr (float): Learning rate of the actor's Adam optimizer
               critic_lr (float): Learning rate of the critic's Adam optimizer
               gamma (float): Discount factor used in the Bellman target
               tau (float): Interpolation factor for the soft target updates
               init_w (bool): Apply utils.init_weights to actor and critic


     """
    def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w=True):

        self.algo_name = algo_name
        self.gamma = gamma
        self.tau = tau

        # Actor and its target; the target starts as an exact copy.
        self.actor = Actor(state_dim, action_dim, wwid)
        if init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(state_dim, action_dim, wwid)
        utils.hard_update(self.actor_target, self.actor)
        self.actor_optim = Adam(self.actor.parameters(), actor_lr)

        # Critic and its target; the target starts as an exact copy.
        self.critic = Critic(state_dim, action_dim)
        if init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = Critic(state_dim, action_dim)
        utils.hard_update(self.critic_target, self.critic)
        self.critic_optim = Adam(self.critic.parameters(), critic_lr)

        self.loss = nn.MSELoss()

        # All four networks are moved to the GPU unconditionally.
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()

        # Counts critic steps; drives the delayed actor / actor-target updates.
        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Appends min, max, mean and std of a tensor to a tracker

             Parameters:
                   tensor (tensor): tensor to summarize
                   tracker (dict): dict with 'min', 'max', 'mean' and 'std' lists

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        # Previously 'mean' was appended a second time and 'std' stayed empty.
        # torch.std yields nan for a single-element tensor.
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data
                  **kwargs: must provide 'policy_noise' (std of the target policy
                      noise), 'policy_noise_clip' (absolute clip applied to that
                      noise) and 'policy_ups_freq' (critic updates per actor update)

             Each argument may also be a list of tensors, in which case the
             list is concatenated with torch.cat first.

             Returns:
                   None

             Raises:
                   ValueError: if self.algo_name is not a supported algorithm

         """

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            # These were fused into one chained assignment by a '.' typo, which
            # overwrote the rewards with the done flags and left done_batch a list.
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val and target without tracking gradients
            with torch.no_grad():
                # Policy Noise: gaussian noise with the action batch's shape, clipped
                policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip'])

                # Compute next action batch; noisy target actions are clamped to [0, 1]
                next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val of next state, masking by done
                q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch)
                q1 = (1 - done_batch) * q1
                q2 = (1 - done_batch) * q2

                # Select which q to use as next-q (depends on algo)
                if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min':
                    next_q = torch.min(q1, q2)
                elif self.algo_name == 'DDPG':
                    next_q = q1
                elif self.algo_name == 'TD3_max':
                    next_q = torch.max(q1, q2)
                else:
                    # Fail with a clear message instead of an unbound-local error.
                    raise ValueError('Unknown algo_name: ' + repr(self.algo_name))

                # Compute target q
                target_q = reward_batch + (self.gamma * next_q)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)

            # The second critic head is only trained for these two variants.
            if self.algo_name == 'TD3' or self.algo_name == 'TD3_max':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:

                actor_actions = self.actor.forward(state_batch)
                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                # if self.args.use_advantage: policy_loss = -(Q1 - val)
                policy_loss = -Q1

                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()

                policy_loss.backward(retain_graph=True)
                self.actor_optim.step()

            # The actor target follows the delayed schedule; the critic target
            # is soft-updated after every critic step.
            if self.num_critic_updates % kwargs['policy_ups_freq'] == 0:
                utils.soft_update(self.actor_target, self.actor, self.tau)
            utils.soft_update(self.critic_target, self.critic, self.tau)
class TD3_DDPG(object):
    """Off-policy actor-critic learner implementing TD3 and DDPG variants.

         Parameters:
               args (object): Parameter class. Attributes read here: algo,
                   init_w, gamma, tau, policy_noise, policy_noise_clip,
                   use_done_mask, q_clamp, use_advantage, critic_constraint,
                   critic_constraint_w, policy_ups_freq, trust_region_actor,
                   action_loss, action_loss_w, hard_update, hard_update_freq


     """
    def __init__(self, args):

        self.args = args
        self.algo = args.algo

        self.actor = Actor(args)
        if args.init_w:
            self.actor.apply(utils.init_weights)
        self.actor_target = Actor(args)
        self.actor_optim = Adam(self.actor.parameters(), lr=5e-5)

        self.critic = Critic(args)
        if args.init_w:
            self.critic.apply(utils.init_weights)
        self.critic_target = Critic(args)
        self.critic_optim = Adam(self.critic.parameters(), lr=5e-4)

        self.gamma = args.gamma
        self.tau = self.args.tau
        self.loss = nn.MSELoss()

        # Make sure each target starts with the same weights as its source
        self.hard_update(self.actor_target, self.actor)
        self.hard_update(self.critic_target, self.critic)

        # All four networks are moved to the GPU unconditionally.
        self.actor_target.cuda()
        self.critic_target.cuda()
        self.actor.cuda()
        self.critic.cuda()

        # Counts critic steps; drives the delayed actor and target updates.
        self.num_critic_updates = 0

        # Statistics Tracker
        self.action_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.policy_loss = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.critic_loss = {'mean': []}
        self.q = {'min': [], 'max': [], 'mean': [], 'std': []}
        self.val = {'min': [], 'max': [], 'mean': [], 'std': []}

    def compute_stats(self, tensor, tracker):
        """Appends min, max, mean and std of a tensor to a tracker

             Parameters:
                   tensor (tensor): tensor to summarize
                   tracker (dict): dict with 'min', 'max', 'mean' and 'std' lists

             Returns:
                   None


         """
        tracker['min'].append(torch.min(tensor).item())
        tracker['max'].append(torch.max(tensor).item())
        tracker['mean'].append(torch.mean(tensor).item())
        # Previously 'mean' was appended a second time and 'std' stayed empty.
        # torch.std yields nan for a single-element tensor.
        tracker['std'].append(torch.std(tensor).item())

    def update_parameters(self,
                          state_batch,
                          next_state_batch,
                          action_batch,
                          reward_batch,
                          done_batch,
                          num_epoch=1):
        """Runs a step of Bellman update and policy gradient using a batch of experiences

             Parameters:
                  state_batch (tensor): Current States
                  next_state_batch (tensor): Next States
                  action_batch (tensor): Actions
                  reward_batch (tensor): Rewards
                  done_batch (tensor): Done batch
                  num_epoch (int): Number of learning iteration to run with the same data

             Each batch may also be a list of tensors, in which case the list
             is concatenated with torch.cat first.

             Returns:
                   None

             Raises:
                   ValueError: if self.algo is not a supported algorithm

         """
        args = self.args

        if isinstance(state_batch, list):
            state_batch = torch.cat(state_batch)
            next_state_batch = torch.cat(next_state_batch)
            action_batch = torch.cat(action_batch)
            # These were fused into one chained assignment by a '.' typo, which
            # overwrote the rewards with the done flags and left done_batch a list.
            reward_batch = torch.cat(reward_batch)
            done_batch = torch.cat(done_batch)

        for _ in range(num_epoch):
            ########### CRITIC UPDATE ####################

            # Compute next q-val, next_v and target without tracking gradients
            with torch.no_grad():
                # Policy Noise: gaussian noise with the action batch's shape, clipped
                policy_noise = np.random.normal(
                    0, args.policy_noise,
                    (action_batch.size()[0], action_batch.size()[1]))
                policy_noise = torch.clamp(torch.Tensor(policy_noise),
                                           -args.policy_noise_clip,
                                           args.policy_noise_clip)

                # Compute next action batch; noisy target actions are clamped to [0, 1]
                next_action_batch = self.actor_target.forward(
                    next_state_batch) + policy_noise.cuda()
                next_action_batch = torch.clamp(next_action_batch, 0, 1)

                # Compute Q-val and value of next state masking by done
                q1, q2, next_val = self.critic_target.forward(
                    next_state_batch, next_action_batch)
                if args.use_done_mask:
                    q1 = (1 - done_batch) * q1
                    q2 = (1 - done_batch) * q2
                    next_val = (1 - done_batch) * next_val

                # Clamp Q-vals. The second line used to assign the clamped q2
                # to q1, discarding q1 and leaving q2 unclamped.
                if args.q_clamp is not None:
                    q1 = torch.clamp(q1, -args.q_clamp, args.q_clamp)
                    q2 = torch.clamp(q2, -args.q_clamp, args.q_clamp)

                # Select which q to use as next-q (depends on algo)
                if self.algo == 'TD3' or self.algo == 'TD3_actor_min':
                    next_q = torch.min(q1, q2)
                elif self.algo == 'DDPG':
                    next_q = q1
                elif self.algo == 'TD3_max':
                    next_q = torch.max(q1, q2)
                else:
                    # Fail with a clear message instead of an unbound-local error.
                    raise ValueError('Unknown algo: ' + repr(self.algo))

                # Compute target q and target val
                target_q = reward_batch + (self.gamma * next_q)
                if args.use_advantage:
                    target_val = reward_batch + (self.gamma * next_val)

            self.critic_optim.zero_grad()
            current_q1, current_q2, current_val = self.critic.forward(
                (state_batch), (action_batch))
            self.compute_stats(current_q1, self.q)

            dt = self.loss(current_q1, target_q)
            if args.use_advantage:
                dt = dt + self.loss(current_val, target_val)
                self.compute_stats(current_val, self.val)

            # The second critic head is only trained for these two variants.
            if self.algo == 'TD3' or self.algo == 'TD3_max':
                dt = dt + self.loss(current_q2, target_q)
            self.critic_loss['mean'].append(dt.item())

            # Rescale the loss so its value equals |critic_constraint_w| when
            # it exceeds critic_constraint_w.
            if args.critic_constraint:
                if dt.item() > args.critic_constraint_w:
                    dt = dt * (abs(args.critic_constraint_w / dt.item()))
            dt.backward()

            self.critic_optim.step()
            self.num_critic_updates += 1

            # Delayed Actor Update
            if self.num_critic_updates % args.policy_ups_freq == 0:

                actor_actions = self.actor.forward(state_batch)

                # Trust Region constraint
                if args.trust_region_actor:
                    with torch.no_grad():
                        old_actor_actions = self.actor_target.forward(
                            state_batch)
                    # NOTE(review): this replaces the actor's output with a
                    # quantity that does not depend on self.actor, so no
                    # gradient reaches the actor in this branch - confirm
                    # the intended formula.
                    actor_actions = action_batch - old_actor_actions

                Q1, Q2, val = self.critic.forward(state_batch, actor_actions)

                if args.use_advantage:
                    policy_loss = -(Q1 - val)
                else:
                    policy_loss = -Q1
                self.compute_stats(policy_loss, self.policy_loss)
                policy_loss = policy_loss.mean()

                self.actor_optim.zero_grad()

                # retain_graph keeps the actor graph alive for the optional
                # action-loss backward pass below.
                policy_loss.backward(retain_graph=True)
                if args.action_loss:
                    # Penalize the distance of the actions from 0.5
                    action_loss = torch.abs(actor_actions - 0.5)
                    self.compute_stats(action_loss, self.action_loss)
                    action_loss = action_loss.mean() * args.action_loss_w
                    action_loss.backward()
                self.actor_optim.step()

            if args.hard_update:
                # Hard-copy the targets every hard_update_freq critic steps;
                # the actor target additionally follows the delayed schedule.
                if self.num_critic_updates % args.hard_update_freq == 0:
                    if self.num_critic_updates % args.policy_ups_freq == 0:
                        self.hard_update(self.actor_target, self.actor)
                    self.hard_update(self.critic_target, self.critic)

            else:
                # Soft-update the critic target every step and the actor
                # target on the delayed schedule.
                if self.num_critic_updates % args.policy_ups_freq == 0:
                    self.soft_update(self.actor_target, self.actor, self.tau)
                self.soft_update(self.critic_target, self.critic, self.tau)

    def soft_update(self, target, source, tau):
        """Soft update: blend the source parameters into the target in place

            Each target parameter becomes target * (1 - tau) + source * tau.

            Parameters:
                  target (object): A pytorch model (modified in place)
                  source (object): A pytorch model (read only)
                  tau (float): Tau parameter

            Returns:
                None

        """

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        """Hard update: copy the source parameters into the target in place

            Parameters:
                  target (object): A pytorch model (modified in place)
                  source (object): A pytorch model (read only)

            Returns:
                None
        """

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)