Example No. 1
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000,
                 resume=False):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.epsilon = 1
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.0005
        self.losses = []

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.path = 'saved-models/qwop_cnn.game.model'
        self.model = CNN()
        self.date = datetime.now().strftime("%b-%d-%Y-%H-%M-%S")
        self.save_path = 'saved-models/' + self.date + '-qwop_cnn.game' + '.model'

        if resume:
            self.model.load_state_dict(torch.load(self.path))
            self.model.eval()
            with open('states/epsilon_decay.txt') as f:
                self.epsilon = float(f.readline())

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()
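Example No. 2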
    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.env = env
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        self.noise = OUNoise(self.env.action_space)
Example No. 3
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 tau=0.01,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model1 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
            self.model2 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
        else:
            self.model1 = DQN(env.observation_space.shape,
                              len(env.action_space)).to(self.device)
            self.model2 = DQN(env.observation_space.shape,
                              len(env.action_space)).to(self.device)

        self.optimizer1 = torch.optim.Adam(self.model1.parameters())
        self.optimizer2 = torch.optim.Adam(self.model2.parameters())
Example No. 4
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000):
        self.env = env
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(buffer_size)
        self.model = DistributionalDQN(self.env.observation_space.shape,
                                       self.env.action_space.n, use_conv)
        self.optimizer = torch.optim.Adam(self.model.parameters())
Example No. 5
    def __init__(self,
                 thisRegiment,
                 enemyRegiment,
                 nbatcommand,
                 gamma=0.95,
                 buffer_size=256):
        super().__init__(thisRegiment, enemyRegiment, nbatcommand)
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = None
        self.model = None
        self.optimizer = None
        self.MSE_loss = None
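Example No. 6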
    def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr,
                 buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        # entropy temperature
        self.alpha = alpha
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
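Example No. 7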
class DuelingAgent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model = ConvDuelingDQN(env.observation_space.shape,
                                        env.action_space.n).to(self.device)
        else:
            self.model = DuelingDQN(env.observation_space.shape,
                                    env.action_space.n).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.20):
        state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        # epsilon-greedy: explore with probability eps, otherwise act greedily
        if np.random.random() < eps:
            return self.env.action_space.sample()
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q)

        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
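The DQN-style agents in this listing all follow the same interaction pattern: pick an action with get_action, push the transition into the replay buffer, and call update once a batch can be sampled. A minimal driver sketch for the DuelingAgent above, assuming a classic Gym environment with discrete actions and the BasicBuffer push/sample API used in the other examples (environment name, episode count and batch size are placeholders):

import gym

env = gym.make("CartPole-v1")                  # placeholder discrete-action environment
agent = DuelingAgent(env, use_conv=False)

batch_size = 32
total_steps = 0
for episode in range(200):
    state = env.reset()                        # classic Gym API: reset() returns only the observation
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        # push(state, action, reward, next_state, done), as used by the QWOP agent further down
        agent.replay_buffer.push(state, action, reward, next_state, done)
        total_steps += 1
        if total_steps > batch_size:           # wait until a full batch can be sampled
            agent.update(batch_size)
        state = next_state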
Example No. 8
    def __init__(self, env: object, gamma: float, tau: float, buffer_maxlen: int,
     noise_std: float, noise_bound: float, critic_lr: float, actor_lr:float):

        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Creating the Gym environments for training and evaluation
        self.env = env
        # Get max and min values of the action of this environment
        self.action_range = [self.env.action_space.low, self.env.action_space.high]
        # Get the dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.noise_std = noise_std
        self.noise_bound = noise_bound

        # Scale and bias for the actions: each environment has its own min and max action values, so the [-1, 1] network output must be rescaled
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.obs_dim, self.action_dim).to(self.device)

        # copy weight parameters to the target Critic network and actor network
        for target_param, param in zip(self.target_critic1.parameters(), self.critic1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(param)


        # initialize optimizers 
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)
Example No. 9
    def __init__(self, env, gamma, tau, buffer_maxlen, delay_step, noise_std,
                 noise_bound, critic_lr, actor_lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.update_step = 0
        self.delay_step = delay_step

        # initialize actor and critic networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic1_target = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.critic2_target = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic1_target.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.critic2_target.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param.data)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(),
                                            lr=critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(),
                                            lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
Example No. 10
class NoisyDQNAgent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=1e-4,
                 gamma=0.99,
                 buffer_maxlen=100000):
        self.env = env
        self.use_conv = use_conv
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(buffer_maxlen)

        if self.use_conv:
            self.model = ConvNoisyDQN(env.observation_space.shape,
                                      env.action_space.n)
        else:
            self.model = NoisyDQN(self.env.observation_space.shape,
                                  self.env.action_space.n)

        self.MSE_loss = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def get_action(self, state):
        state = autograd.Variable(torch.FloatTensor(state).unsqueeze(0))
        qvals = self.model.forward(state)
        action = np.argmax(qvals.detach().numpy())

        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q
        loss = self.MSE_loss(curr_Q, expected_Q)

        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
Example No. 11
class C51Agent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000):
        self.env = env
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(buffer_size)
        self.model = DistributionalDQN(self.env.observation_space.shape,
                                       self.env.action_space.n, use_conv)
        self.optimizer = torch.optim.Adam(self.model.parameters())

    def get_action(self, state):
        state = autograd.Variable(torch.from_numpy(state).float().unsqueeze(0))
        dist, qvals = self.model.forward(state)
        action = np.argmax(qvals.detach().numpy())

        return action

    def compute_error(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        curr_dist, _ = self.model.forward(states)
        curr_action_dist = curr_dist[range(batch_size), actions]

        next_dist, next_qvals = self.model.forward(next_states)
        next_actions = torch.max(next_qvals, 1)[1]
        next_dist = self.model.softmax(next_dist)
        optimal_dist = next_dist[range(batch_size), next_actions]

        projection = dist_projection(optimal_dist, rewards, dones, self.gamma,
                                     self.model.n_atoms, self.model.Vmin,
                                     self.model.Vmax, self.model.support)

        loss = -KL_divergence_two_dist(optimal_dist, projection)

        return loss

    def update(self, batch_size):

        loss = self.compute_error(batch_size)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
Example No. 12
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: as a simple demo, I changed here; for the implementation, we should pass this as parameters
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
Example No. 13
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=1e-4,
                 gamma=0.99,
                 buffer_maxlen=100000):
        self.env = env
        self.use_conv = use_conv
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(buffer_maxlen)

        if self.use_conv:
            self.model = ConvNoisyDQN(env.observation_space.shape,
                                      env.action_space.n)
        else:
            self.model = NoisyDQN(self.env.observation_space.shape,
                                  self.env.action_space.n)

        self.MSE_loss = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
Example No. 14
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000,
                 resume=False):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.0005

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.path = 'saved-models/qwop_cnn.game.model'
            self.model = ConvDQN(env.observation_space.shape,
                                 env.action_space.n).to(self.device)

        else:
            self.path = 'saved-models/qwop_nn.game.model'
            self.model = DQN(env.observation_space.shape,
                             env.action_space.n).to(self.device)

        if resume:
            self.model.load_state_dict(torch.load(self.path))
            self.epsilon = 0.001
            self.model.eval()

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()
Example No. 15
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model = ConvDuelingDQN(env.observation_space.shape,
                                        env.action_space.n).to(self.device)
        else:
            self.model = DuelingDQN(env.observation_space.shape,
                                    env.action_space.n).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()
Example No. 16
class DoubleDQNAgent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 tau=0.01,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model1 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
            self.model2 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
        else:
            self.model1 = DQN(env.observation_space.shape,
                              len(env.action_space)).to(self.device)
            self.model2 = DQN(env.observation_space.shape,
                              len(env.action_space)).to(self.device)

        self.optimizer1 = torch.optim.Adam(self.model1.parameters())
        self.optimizer2 = torch.optim.Adam(self.model2.parameters())

    def get_action(self, state, eps=0.20):
        # epsilon-greedy exploration
        if np.random.random() < eps:
            return np.random.choice(self.env.action_space)

        state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        qvals = self.model1.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # resize tensors
        actions = actions.view(actions.size(0), 1)
        dones = dones.view(dones.size(0), 1)

        # compute loss
        curr_Q1 = self.model1.forward(states).gather(1, actions)
        curr_Q2 = self.model2.forward(states).gather(1, actions)

        next_Q1 = self.model1.forward(next_states)
        next_Q2 = self.model2.forward(next_states)
        next_Q = torch.min(
            torch.max(self.model1.forward(next_states), 1)[0],
            torch.max(self.model2.forward(next_states), 1)[0])
        next_Q = next_Q.view(next_Q.size(0), 1)
        expected_Q = rewards + (1 - dones) * self.gamma * next_Q

        loss1 = F.mse_loss(curr_Q1, expected_Q.detach())
        loss2 = F.mse_loss(curr_Q2, expected_Q.detach())

        return loss1, loss2

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss1, loss2 = self.compute_loss(batch)

        self.optimizer1.zero_grad()
        loss1.backward()
        self.optimizer1.step()

        self.optimizer2.zero_grad()
        loss2.backward()
        self.optimizer2.step()
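The tau hyperparameter stored by this DoubleDQNAgent is not used in the methods shown here; in the DDPG, TD3 and SAC examples further down it drives a Polyak (soft) target update. A sketch of such an update for the two models above, under the assumption that model2 is meant to track model1 slowly (this example does not state that):

    def soft_update(self):
        # Polyak averaging: target <- tau * online + (1 - tau) * target,
        # mirroring the target-network updates in the DDPG/TD3/SAC examples below
        for target_param, param in zip(self.model2.parameters(),
                                       self.model1.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1.0 - self.tau) * target_param.data)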
Example No. 17
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value Loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        #delayed update for policy net and target value nets
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
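Example No. 18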
class DDPGAgent:

    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.env = env
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        self.noise = OUNoise(self.env.action_space)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()

        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        expected_Q = reward_batch + self.gamma * next_Q

        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update actor
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
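A minimal driver sketch for the DDPGAgent above. It adds simple Gaussian exploration noise in the loop instead of the stored OUNoise instance, whose interface is not shown in this example; the environment name, hyperparameters and noise scale are placeholders:

import gym
import numpy as np

env = gym.make("Pendulum-v1")                  # placeholder continuous-control environment
agent = DDPGAgent(env, gamma=0.99, tau=0.005, buffer_maxlen=100000,
                  critic_learning_rate=1e-3, actor_learning_rate=1e-4)

batch_size = 64
total_steps = 0
for episode in range(100):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        # Gaussian exploration noise, clipped to the valid action range
        action = np.clip(action + 0.1 * np.random.randn(*action.shape),
                         env.action_space.low, env.action_space.high)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        total_steps += 1
        if total_steps > batch_size:           # wait until a full batch can be sampled
            agent.update(batch_size)
        state = next_state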
Example No. 19
class QCommander(Commander):
    def __init__(self,
                 thisRegiment,
                 enemyRegiment,
                 nbatcommand,
                 gamma=0.95,
                 buffer_size=256):
        super().__init__(thisRegiment, enemyRegiment, nbatcommand)
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = None
        self.model = None
        self.optimizer = None
        self.MSE_loss = None

    def set_model(self):
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        input_dim = self.thisRegiment_size + self.enemyRegiment_size
        output_dim = len(self.order_action_map)
        self.model = DQN(input_dim, output_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def order(self, state, eps=0.2):
        '''
        state: [array of regiment1 health + array of regiment2 health]
        e.g. action: (4, None, None, 4) -> battalion 4 will not attack any enemy battalion (wasteful!).
        This only happens when the agent chooses according to the Q-table (early stage).
        '''
        if np.random.uniform(0, 1) < eps:
            # Make sure all actions are chosen. Otherwise, some are not going to get visited and updated.
            thisaction = random.sample(list(self.action_order_map.keys()),
                                       self.nbatcommand)[0]
            return self.action_order_map[thisaction], thisaction

        state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        #self.model.eval() # need this when forward passing one sample into nn with batchnorm layer
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        return self.action_order_map[action], action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = np.array([int(done) for done in dones])
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(
            1, actions.unsqueeze(1))  # [batch_size, 1]
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)  # [batch_size, naction]
        max_next_Q = torch.max(next_Q, 1)[0]  # [batch_size, 1]
        expected_Q = rewards.squeeze(1) + self.gamma * (1 - dones) * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        #batch =self.replay_buffer.sample_sequence(batch_size)
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()
Example No. 20
class TD3Agent:
    def __init__(self, env, gamma, tau, buffer_maxlen, delay_step, noise_std,
                 noise_bound, critic_lr, actor_lr):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.update_step = 0
        self.delay_step = delay_step

        # initialize actor and critic networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic1_target = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.critic2_target = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic1_target.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.critic2_target.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param.data)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(),
                                            lr=critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(),
                                            lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()

        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(
            batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        action_space_noise = self.generate_action_space_noise(action_batch)
        next_actions = self.actor.forward(state_batch) + action_space_noise
        next_Q1 = self.critic1_target.forward(next_state_batch, next_actions)
        next_Q2 = self.critic2_target.forward(next_state_batch, next_actions)
        expected_Q = reward_batch + self.gamma * torch.min(next_Q1, next_Q2)

        # critic loss
        curr_Q1 = self.critic1.forward(state_batch, action_batch)
        curr_Q2 = self.critic2.forward(state_batch, action_batch)
        critic1_loss = F.mse_loss(curr_Q1, expected_Q.detach())
        critic2_loss = F.mse_loss(curr_Q2, expected_Q.detach())

        # update critics
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # delayed update for actor & target networks
        if (self.update_step % self.delay_step == 0):
            # actor
            self.actor_optimizer.zero_grad()
            policy_gradient = -self.critic1(state_batch,
                                            self.actor(state_batch)).mean()
            policy_gradient.backward()
            self.actor_optimizer.step()

            # target networks
            self.update_targets()

        self.update_step += 1

    def generate_action_space_noise(self, action_batch):
        noise = torch.normal(torch.zeros(action_batch.size()),
                             self.noise_std).clamp(-self.noise_bound,
                                                   self.noise_bound).to(
                                                       self.device)
        return noise

    def update_targets(self):
        for target_param, param in zip(self.critic1_target.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic2_target.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
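Instantiating the TD3Agent above; the keyword names follow the constructor signature, while the values are typical TD3 choices given here only as an illustration:

import gym

env = gym.make("Pendulum-v1")      # placeholder continuous-control environment
agent = TD3Agent(env,
                 gamma=0.99,       # discount factor
                 tau=0.005,        # Polyak soft-update rate used in update_targets()
                 buffer_maxlen=1000000,
                 delay_step=2,     # actor and targets update every 2 critic updates
                 noise_std=0.2,    # std of the action-space smoothing noise
                 noise_bound=0.5,  # clipping bound for that noise
                 critic_lr=1e-3,
                 actor_lr=1e-3)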
Example No. 21
class DQNAgent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000,
                 resume=False):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.epsilon = 1
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.0005
        self.losses = []

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.path = 'saved-models/qwop_cnn.game.model'
        self.model = CNN().to(self.device)
        self.date = datetime.now().strftime("%b-%d-%Y-%H-%M-%S")
        self.save_path = 'saved-models/' + self.date + '-qwop_cnn.game' + '.model'

        if resume:
            self.model.load_state_dict(torch.load(self.path))
            self.model.eval()
            with open('states/epsilon_decay.txt') as f:
                self.epsilon = float(f.readline())

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state):
        state = torch.unsqueeze(state, 0).float().to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        if self.epsilon > self.epsilon_min:
            self.epsilon *= (1 - self.epsilon_decay)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # Stack the per-sample state tensors into batches; a single batched forward
        # pass keeps the computation graph intact so the loss can backpropagate
        # into the model parameters
        states = torch.stack([torch.as_tensor(s) for s in states]).float().to(self.device)
        next_states = torch.stack([torch.as_tensor(s) for s in next_states]).float().to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_Q = self.model.forward(next_states)
            max_next_Q = torch.max(next_Q, 1)[0]
            expected_Q = rewards.squeeze(1) + (1 - dones) * self.gamma * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        try:
            batch = self.replay_buffer.sample(batch_size)
            loss = self.compute_loss(batch)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.losses.append(loss.item())
        except ValueError:
            print('Error')
            pass

    def update_buffer(self, prev_state, action, reward, next_state, done):
        self.replay_buffer.push(prev_state, action, reward, next_state, done)

    def save_model(self):
        torch.save(self.model.state_dict(), self.save_path)
        f = open("states/epsilon_decay_" + self.date + '.txt', "w")
        f.write(str(self.epsilon))
        f.close()
Example No. 22
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: as a simple demo, I changed here; for the implementation, we should pass this as parameters
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    # pi: state -> action
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        # Map each action component from the [-1, 1] network output to its own [low, high] range
        scaled_action = []
        for idx, a in enumerate(action):
            action_range = self.action_range[idx]
            a = a * (action_range[1] - action_range[0]) / 2.0 + (
                action_range[1] + action_range[0]) / 2.0
            scaled_action.append(a)
        return scaled_action

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value Loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # TODO: Question: why use 2 Q-networks?
        # Taking the minimum of two Q-networks reduces overestimation bias during training.

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value nets
        # TODO: Question: what does this part do?
        # The original paper mentioned 2 methods for approximating the value function
        # 1. the EMA of policy weights to update the Q network
        # 2. periodical update of the policy network, which is used in this code
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
Example No. 23
class SACAgent():
    def __init__(self, env: object, gamma: float, tau: float,
                 buffer_maxlen: int, critic_lr: float, actor_lr: float,
                 reward_scale: int):

        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Creating the Gym environments for training and evaluation
        self.env = env
        # Get max and min values of the action of this environment
        self.action_range = [
            self.env.action_space.low, self.env.action_space.high
        ]
        # Get the dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.reward_scale = reward_scale

        # Scale and bias for the actions: each environment has its own min and max action values, so the [-1, 1] network output must be rescaled
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy = PolicyNetwork(self.obs_dim,
                                    self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(),
                                       lr=self.critic_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(),
                                       lr=self.critic_lr)
        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int):
        # Sampling experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = self.reward_scale * torch.FloatTensor(rewards).to(
            self.device)  # in SAC we do reward scaling for the sampled rewards
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Please refer to equation (6) in the paper for details
        # Sample actions for the next states (s_t+1) using the current policy
        next_actions, next_log_pi, _, _ = self.policy.sample(
            next_states, self.scale)
        next_actions = self.rescale_action(next_actions)

        # Compute Q(s_t+1,a_t+1) by giving the states and actions to the Q network and choose the minimum from 2 target Q networks
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        min_q = torch.min(next_q1,
                          next_q2)  # find minimum between next_q1 and next_q2

        # Compute the next Q_target (Q(s_t,a_t)-alpha(next_log_pi))
        next_q_target = (min_q - next_log_pi)

        # Compute the Q(s_t,a_t) using s_t and a_t from the replay buffer
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)

        # Find expected Q, i.e., r(t) + gamma*next_q_target
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # Compute loss between Q network and expected Q
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # Policy update (computing the loss)
        # Sample new actions for the current states (s_t) using the current policy
        new_actions, log_pi, _, _ = self.policy.sample(states, self.scale)
        new_actions = self.rescale_action(new_actions)

        # Compute Q(s_t,a_t) and choose the minimum from 2 Q networks
        new_q1 = self.q_net1.forward(states, new_actions)
        new_q2 = self.q_net2.forward(states, new_actions)
        min_q = torch.min(new_q1, new_q2)

        # Compute the next policy loss, i.e., alpha*log_pi - Q(s_t,a_t) eq. (7)
        policy_loss = (log_pi - min_q).mean()

        # Backpropagate the losses and update policy network parameters
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Updating target networks with soft update using update rate tau
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(self.tau * param +
                                    (1 - self.tau) * target_param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(self.tau * param +
                                    (1 - self.tau) * target_param)

    def get_action(
            self, state: np.ndarray,
            stochastic: bool) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]:
        # state: the state input to the pi network
        # stochastic: boolean (True -> noisy/stochastic action, False -> deterministic action)
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        # Get mean and sigma from the policy network
        mean, log_std = self.policy.forward(state)
        std = log_std.exp()

        # Stochastic mode is used for training; deterministic mode is used for evaluation
        if stochastic:
            normal = Normal(mean, std)
            z = normal.sample()
        else:
            # Deterministic action: use the mean directly
            # (torch.distributions.Normal requires a strictly positive scale)
            z = mean
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        # Return the rescaled action together with the mean and standard deviation of the action distribution.
        # Rescaling is needed because the policy network outputs actions in [-1, 1], while the MuJoCo
        # environments' action ranges may be [-n, n] for an arbitrary real n.
        return self.rescale_action(action), mean, std

    def rescale_action(self, action: np.ndarray) -> np.ndarray:
        # Rescale because the policy network outputs actions in [-1, 1], while the environment's
        # action range may be [-n, n] for an arbitrary real n.
        # scale -> scalar multiplication
        # bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # Save each node's model
        print("Save the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"
        torch.save(self.policy.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # Load each node's model
        print("load the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"  # Best
        self.policy = PolicyNetwork(self.obs_dim,
                                    self.action_dim).to(self.device)
        self.policy.load_state_dict(torch.load(savePath))
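The soft (Polyak) target-network update performed at the end of the `update` method above can be factored into a small helper. The sketch below is not part of the original example; it assumes only `torch`, and the two `nn.Linear` modules are placeholders for the online and target critics.

import torch
import torch.nn as nn

def soft_update(target_net: nn.Module, online_net: nn.Module, tau: float) -> None:
    # target <- tau * online + (1 - tau) * target, applied parameter-wise
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(),
                                       online_net.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1.0 - tau) * target_param.data)

online_critic = nn.Linear(4, 1)   # placeholder for q_net1 / q_net2
target_critic = nn.Linear(4, 1)   # placeholder for target_q_net1 / target_q_net2
target_critic.load_state_dict(online_critic.state_dict())  # hard copy at initialization
soft_update(target_critic, online_critic, tau=0.01)         # slow tracking during training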
Example no. 24
class TD3Agent():
    def __init__(self, env: object, gamma: float, delay_step: int, tau: float,
                 buffer_maxlen: int, noise_std: float, noise_bound: float,
                 critic_lr: float, actor_lr: float):

        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # Creating the Gym environments for training and evaluation
        self.env = env
        # Get max and min values of the action of this environment
        self.action_range = [
            self.env.action_space.low, self.env.action_space.high
        ]
        # Get the dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Initialize the total step counter
        self.steps = 0

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.delay_step = delay_step

        # Scale and bias for the actions: needed because the actor outputs values in [-1, 1] while each environment has its own action bounds
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic1 = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic2 = Critic(self.obs_dim,
                                     self.action_dim).to(self.device)
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # copy weight parameters to the target Q network and actor network
        for target_param, param in zip(self.target_critic1.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_critic2.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(),
                                            lr=self.critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(),
                                            lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int, steps: int):
        self.steps = steps

        # Sampling experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Sample actions for the next states (s_t+1) using the target actor
        next_actions = self.target_actor.forward(next_states)
        next_actions = self.rescale_action(next_actions)

        # Add clipped Gaussian noise to the next actions (target policy smoothing)
        noise = self.get_noise(next_actions, self.noise_std + 0.1,
                               -self.noise_bound, self.noise_bound)
        noisy_next_actions = next_actions + noise

        # Compute Q(s_t+1,a_t+1)
        next_q1 = self.target_critic1(next_states, noisy_next_actions)
        next_q2 = self.target_critic2(next_states, noisy_next_actions)

        # Choose minimum Q
        min_q = torch.min(next_q1, next_q2)

        # Find expected Q, i.e., r(t) + gamma*next_q
        expected_q = rewards + (1 - dones) * self.gamma * min_q

        # Find current Q values for the given states and actions from replay buffer
        curr_q1 = self.critic1.forward(states, actions)
        curr_q2 = self.critic2.forward(states, actions)

        # Compute loss between Q network and expected Q
        critic1_loss = F.mse_loss(curr_q1, expected_q.detach())
        critic2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # Delayed actor update (computing the loss)

        if self.steps % self.delay_step == 0:
            # Sample new actions for the current states (s_t) using the current actor
            new_actions = self.actor.forward(states)

            # Compute Q(s_t,a_t)
            new_q1 = self.critic1.forward(states, new_actions)

            # Compute the actor loss, i.e., -Q1
            actor_loss = -new_q1.mean()

            # Backpropagate the losses and update actor network parameters
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the target networks
            for target_param, param in zip(self.target_critic1.parameters(),
                                           self.critic1.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.target_critic2.parameters(),
                                           self.critic2.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.target_actor.parameters(),
                                           self.actor.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

    def get_noise(self, action: torch.Tensor, sigma: float, bottom: float,
                  top: float) -> torch.Tensor:
        # sigma: standard deviation of the noise
        # bottom, top: minimum and maximum values for the generated noise
        return torch.normal(torch.zeros(action.size()),
                            sigma).clamp(bottom, top).to(self.device)

    def get_action(self, state: np.ndarray, stochastic: bool) -> np.ndarray:
        # state: the state input to the pi network
        # stochastic: boolean (True -> add exploration noise, False -> deterministic action)
        # Convert state numpy to tensor
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)

        if stochastic:
            # Add Gaussian noise to the rescaled action for exploration
            action = self.rescale_action(action) + self.get_noise(
                action, self.noise_std, -self.noise_bound, self.noise_bound)
        else:
            action = self.rescale_action(action)

        # Convert action tensor to numpy
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def rescale_action(self, action: torch.Tensor) -> torch.Tensor:
        # Rescale because the actor network outputs actions in [-1, 1], while the environment's
        # action range may be [-n, n] for an arbitrary real n.
        # scale -> scalar multiplication
        # bias -> scalar offset
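        # Example: for an action space of [-2, 2], scale = (2 - (-2)) / 2 = 2 and
        # bias = (2 + (-2)) / 2 = 0, so a network output of 0.5 maps to 0.5 * 2 + 0 = 1.0.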
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # Save each node's model
        print("Save the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"
        torch.save(self.actor.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # Load each node's model
        print("load the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"  # Best
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor.load_state_dict(torch.load(savePath))
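The clipped Gaussian noise produced by `get_noise` above (used both for exploration and for smoothing the target actions) can be exercised in isolation. A minimal sketch assuming only `torch`; the tensor shape and the sigma/bound values are illustrative.

import torch

def clipped_gaussian_noise(like: torch.Tensor, sigma: float, bound: float) -> torch.Tensor:
    # Zero-mean Gaussian noise with standard deviation sigma, clamped to [-bound, bound]
    return torch.normal(torch.zeros_like(like), sigma).clamp(-bound, bound)

actions = torch.tanh(torch.randn(5, 2))  # stand-in for actor outputs in [-1, 1]
noisy_actions = actions + clipped_gaussian_noise(actions, sigma=0.2, bound=0.5)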
Example no. 26
class TD3Agent:
    """
    Each joint will be the agent. Thus we will have one action (Agent) value on each joint.
    """
    def __init__(self, env: object, gamma: float, tau: float,
                 buffer_maxlen: int, delay_step: int, noise_std: float,
                 noise_bound: float, critic_lr: float, actor_lr: float):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # Set up the state (observation) and action spaces from the environment
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.update_step = 0
        self.delay_step = delay_step
        self.buffer_maxlen = buffer_maxlen
        self.critic1 = []
        self.critic2 = []
        self.critic_target1 = []
        self.critic_target2 = []
        self.actor = []
        self.actor_target = []
        self.critic_optimizer1 = []
        self.critic_optimizer2 = []
        self.actor_optimizer = []

        # initialize actor and critic networks per action dimension (multi-agent: one agent per joint)
        for _ in range(self.action_dim):
            self.critic1.append(
                Critic(self.obs_dim, self.action_dim).to(self.device))
            self.critic2.append(
                Critic(self.obs_dim, self.action_dim).to(self.device))
            self.critic_target1.append(
                Critic(self.obs_dim, self.action_dim).to(self.device))
            self.critic_target2.append(
                Critic(self.obs_dim, self.action_dim).to(self.device))

        for _ in range(self.action_dim):
            self.actor.append(
                Actor(self.obs_dim, self.action_dim).to(self.device))
            self.actor_target.append(
                Actor(self.obs_dim, self.action_dim).to(self.device))

        # Copy critic target parameters
        for i in range(self.action_dim):
            for target_param, param in zip(self.critic_target1[i].parameters(),
                                           self.critic1[i].parameters()):
                target_param.data.copy_(param.data)
            for target_param, param in zip(self.critic_target2[i].parameters(),
                                           self.critic2[i].parameters()):
                target_param.data.copy_(param.data)

        # initialize optimizers
        for i in range(self.action_dim):
            self.critic_optimizer1.append(
                optim.Adam(self.critic1[i].parameters(), lr=critic_lr))
            self.critic_optimizer2.append(
                optim.Adam(self.critic2[i].parameters(), lr=critic_lr))
            self.actor_optimizer.append(
                optim.Adam(self.actor[i].parameters(), lr=actor_lr))

        self.replay_buffer = BasicBuffer(10000)
        self.replay_buffer_base = BasicBuffer(self.buffer_maxlen)

    def get_action(self, obs: np.ndarray) -> Tuple[list, list]:
        # Receive the state in order to compute the actions
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action_list = []
        action_list_ = []
        # Then run inference with each node's network and append the results:
        # `action_list` (with noise) is used for training, `action_list_` (without noise) for testing.
        for i in range(self.action_dim):
            action_list.append(
                (self.actor[i].forward(state[0, i])).cpu().detach() +
                (self.generate_action_space_noise(0.4)).cpu().detach())
            action_list_.append(
                (self.actor[i].forward(state[0, i])).cpu().detach())

        return action_list, action_list_

    def update(self, batch_size: int, step_env: int):

        # Sample a batch from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, dones = self.replay_buffer.sample(
            batch_size)

        # Convert the sampled batch into numpy arrays
        state_batch = np.array(state_batch)
        action_batch = np.array(action_batch)
        reward_batch = np.array(reward_batch)
        next_state_batch = np.array(next_state_batch)

        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Update each node
        for i in range(self.action_dim):
            # Check for null actions and zero out the corresponding node entries
            action_null = np.where(action_batch[:, i] == 0)
            state_batch_null = state_batch[:, i]
            state_batch_null[action_null] = 0
            state_batch_null = torch.FloatTensor(state_batch_null).to(
                self.device)

            action_batch_null = action_batch[:, i]
            action_batch_null[action_null] = 0
            action_batch_null = torch.FloatTensor(action_batch_null).to(
                self.device)

            next_state_batch_null = next_state_batch[:, i]
            next_state_batch_null[action_null] = 0
            next_state_batch_null = torch.FloatTensor(
                next_state_batch_null).to(self.device)

            reward_batch_null = reward_batch
            reward_batch_null[action_null] = 0
            reward_batch_null = torch.FloatTensor(reward_batch_null).to(
                self.device)

            # Add noise to the next action (target policy smoothing)
            action_space_noise = self.generate_action_space_noise(0.2)

            next_actions = self.actor[i].forward(
                next_state_batch_null) + action_space_noise

            # Compute the expected Q value using the next state (s_t+1)
            next_Q1 = self.critic_target1[i].forward(next_state_batch_null,
                                                     next_actions)
            next_Q2 = self.critic_target2[i].forward(next_state_batch_null,
                                                     next_actions)
            expected_Q = reward_batch_null + (
                1 - dones) * self.gamma * torch.min(next_Q1, next_Q2)
            expected_Q = expected_Q.cpu().detach().numpy()
            expected_Q[action_null] = 0
            expected_Q = torch.FloatTensor(expected_Q).to(self.device)

            # Create a masking array to remove the effect of null nodes
            masking_torch = np.ones([batch_size, 1])
            masking_torch[action_null] = 0
            masking_torch = torch.FloatTensor(masking_torch).to(self.device)

            # Critic value inference for training
            curr_Q1 = self.critic1[i].forward(state_batch_null,
                                              action_batch_null.reshape(-1, 1))
            curr_Q1 *= masking_torch.detach()
            curr_Q2 = self.critic2[i].forward(state_batch_null,
                                              action_batch_null.reshape(-1, 1))
            curr_Q2 *= masking_torch.detach()
            # Critic loss: (Q(s_t, a_t) - (r + gamma * min Q(s_t+1, a_t+1)))^2
            critic1_loss = F.mse_loss(curr_Q1, expected_Q.detach())
            critic2_loss = F.mse_loss(curr_Q2, expected_Q.detach())

            # Optimizer steps for both critics
            self.critic_optimizer1[i].zero_grad()
            critic1_loss.backward()
            self.critic_optimizer1[i].step()

            self.critic_optimizer2[i].zero_grad()
            critic2_loss.backward()
            self.critic_optimizer2[i].step()

            # Delayed update for the actor & target networks
            if (self.update_step % self.delay_step == 0):
                # actor
                new_actions = self.actor[i](state_batch_null)
                policy_gradient = -self.critic1[i](state_batch_null,
                                                   new_actions)
                policy_gradient *= masking_torch.detach()
                policy_gradient = policy_gradient.mean()
                self.actor_optimizer[i].zero_grad()
                policy_gradient.backward()
                self.actor_optimizer[i].step()

                # target networks
                self.update_targets(i)

        self.update_step += 1

    def generate_action_space_noise(self, noise_std: float) -> torch.Tensor:
        noise = torch.normal(torch.zeros(1),
                             noise_std).clamp(-self.noise_bound,
                                              self.noise_bound).to(self.device)
        return noise

    def update_targets(self, i: int):
        for target_param, param in zip(self.critic_target1[i].parameters(),
                                       self.critic1[i].parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic_target2[i].parameters(),
                                       self.critic2[i].parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.actor_target[i].parameters(),
                                       self.actor[i].parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

    def Actor_save(self):
        # Save each node's model
        print("Save the torch model")
        for i in range(self.action_dim):
            savePath = "./actor_model5_Hop_" + str(i) + ".pth"
            torch.save(self.actor[i].state_dict(), savePath)

    def Actor_load(self):
        # Load each node's model
        print("load the torch model")
        for i in range(self.action_dim):
            savePath = "./actor_model_wlk" + str(i) + ".pth"  # Best
            self.actor[i] = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)
            self.actor[i].load_state_dict(torch.load(savePath))
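The delayed actor/target update pattern (`update_step % delay_step == 0`) shared by the TD3 examples above can be summarized on its own. A minimal sketch using only the standard library; `critic_step` and `actor_step` are hypothetical callables standing in for the real gradient steps.

def delayed_update(total_updates: int, delay_step: int, critic_step, actor_step) -> None:
    # The critics are updated on every call, while the actor (and the target
    # networks) are updated only once every `delay_step` calls.
    for step in range(total_updates):
        critic_step()
        if step % delay_step == 0:
            actor_step()

counts = {"critic": 0, "actor": 0}
delayed_update(10, 2,
               critic_step=lambda: counts.update(critic=counts["critic"] + 1),
               actor_step=lambda: counts.update(actor=counts["actor"] + 1))
print(counts)  # {'critic': 10, 'actor': 5}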