Example #1
class Agent():
    # needs functions init, choose_action, store_transition
    def __init__(self, input_dims, fc1_dims, fc2_dims, n_actions, alpha, beta,
                batch_size=100, max_size=1e6, mu=0, sigma=0.1, clip=0.5):
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.alpha = alpha
        self.beta = beta
        self.clip = clip

        self.batch_size = batch_size
        self.max_size = max_size
        # store the noise parameters; fresh Gaussian noise is drawn for every action
        self.noise_mu = mu
        self.noise_sigma = sigma
        #self.clamp = max(0.5, x)?

        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, n_actions, 'actor_net')
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, n_actions, 'critic_net')
        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, n_actions, 'target_critic')
        self.memory = ReplayBuffer(self.max_size, input_dims, n_actions, batch_size=self.batch_size)

    def choose_action(self, observation):
        self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        noise = T.tensor(np.random.normal(self.noise_mu, self.noise_sigma,
                                          size=self.n_actions),
                         dtype=T.float).to(self.actor.device)
        mu_prime = mu + noise
        # clamp the noisy action to the allowed range
        mu_prime = T.clamp(mu_prime, -self.clip, self.clip)
        self.actor.train()

        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)
Example #2
class Simple_DQNAgent(Agent):
	"""
	This agent can handle the networks ConvDQN and LinearDQN. This agent uses a single DQN and a replay buffer for learning.
	"""
	def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size):
		super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)

		if self.network == "SimpleConvDQN":
			self.model = ConvDQN(env.env_shape, env.no_of_actions)
		elif self.network == "LinearDQN":
			self.model = LinearDQN(env.env_shape, env.no_of_actions)

		self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape)

	def get_action(self, state):

		if np.random.random() <= self.eps:
			return self.env.sample_action()

		else:
			state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.model.device)
			actions = self.model.forward(state)
			return T.argmax(actions).item()

	def update(self, batch_size):
		self.model.optimizer.zero_grad()

		batch = self.replay_buffer.sample(batch_size)
		states, actions, rewards, next_states, dones = batch
		states_t = T.tensor(states, dtype=T.float).to(self.model.device)
		actions_t = T.tensor(actions).to(self.model.device)
		rewards_t = T.tensor(rewards, dtype=T.float).to(self.model.device)
		next_states_t = T.tensor(next_states, dtype=T.float).to(self.model.device)
		dones_t = T.tensor(dones, dtype=T.bool).to(self.model.device)

		curr_Q = self.model.forward(states_t).gather(1, actions_t.unsqueeze(1))
		curr_Q = curr_Q.squeeze(1)
		# do not backpropagate through the bootstrap target; terminal next states contribute nothing
		next_Q = self.model.forward(next_states_t).detach()
		max_next_Q = T.max(next_Q, 1)[0]
		max_next_Q[dones_t] = 0.0
		expected_Q = rewards_t + self.gamma * max_next_Q


		loss = self.model.MSE_loss(curr_Q, expected_Q).to(self.model.device)

		loss.backward()
		self.model.optimizer.step()

		self.dec_eps()

	def learn(self,state, action, reward, next_state, done, batch_size):
		self.replay_buffer.store_transition(state, action, reward, next_state, done)

		if len(self.replay_buffer) > batch_size:
			self.update(batch_size)
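
A minimal usage sketch (not part of the original example): it assumes an environment wrapper `env` exposing `env_shape`, `no_of_actions`, `reset()`, `step(action)` and `sample_action()`, matching how the class uses it above; the hyperparameter values are illustrative.

# Hypothetical training loop for Simple_DQNAgent; `env` and all values are assumptions.
agent = Simple_DQNAgent(env, network="LinearDQN", learning_rate=1e-3, gamma=0.99,
                        eps_max=1.0, eps_min=0.01, eps_dec=1e-4, buffer_size=100000)

for episode in range(500):
    state, done = env.reset(), False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        # store the transition and update once the buffer holds more than batch_size samples
        agent.learn(state, action, reward, next_state, done, batch_size=64)
        state = next_state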
Example #3
class DDPGAgent():
    def __init__(self, env_id, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                max_size=1000000, fc1_dims=256, fc2_dims=256, batch_size=256):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        #self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                    n_actions=n_actions, name=env_id+'_actor')
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                    n_actions=n_actions, name=env_id+'_critic')
        self.target_actor  = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                    n_actions=n_actions, name=env_id+'_target_actor')
        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                    n_actions=n_actions, name=env_id+'_target_critic')
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu #+ T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        self.actor.train()

        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def load_models(self):
        print("... loading checkpoint")
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)

        target_actions = self.target_actor.forward(states_)
        critic_value_ = self.target_critic.forward(states_, target_actions)
        critic_value = self.critic.forward(states, actions)

        critic_value_[dones] = 0.0
        critic_value_ = critic_value_.view(-1)

        target = rewards + self.gamma * critic_value_
        target = target.view(self.batch_size, 1)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                    (1-tau) * target_critic_dict[name].clone()
        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                    (1-tau) * target_actor_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
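
The dict-based update in `update_network_parameters` implements the usual Polyak averaging, target = tau * online + (1 - tau) * target. An equivalent sketch written over the parameter iterators (same behaviour, assuming the networks are ordinary PyTorch `nn.Module` instances) is:

# Equivalent soft-update sketch; assumes standard torch nn.Module networks.
import torch as T

def soft_update(target_net, online_net, tau):
    with T.no_grad():
        for target_param, online_param in zip(target_net.parameters(),
                                              online_net.parameters()):
            target_param.data.copy_(tau * online_param.data +
                                    (1 - tau) * target_param.data)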
Example #4
class DQNAgent(Agent):
	"""
	Uses a replay buffer and has two DQNs, one that is used to get best actions and updated every step and the other, a target network,
	used to compute the target Q value every step. This target network is only updated with the first DQN only after a fixed number of steps.
	"""
	def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size, replace_cnt):
		super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)

		self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape)

		self.learn_step_counter = 0
		self.replace_cnt = replace_cnt
		self.q_eval = ConvDQN(env.env_shape, env.no_of_actions)
		self.q_target = ConvDQN(env.env_shape, env.no_of_actions)


	def get_action(self, state):

		if np.random.random() <= self.eps:
			return self.env.sample_action()

		else:
			state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.q_eval.device)
			actions = self.q_eval.forward(state)
			return T.argmax(actions).item()

	def replace_target_network(self):
		if self.learn_step_counter % self.replace_cnt == 0:
			self.q_target.load_state_dict(self.q_eval.state_dict())

	def get_batch_tensors(self, batch_size):
		batch = self.replay_buffer.sample(batch_size)
		states, actions, rewards, next_states, dones = batch
		states_t = T.tensor(states, dtype=T.float).to(self.q_eval.device)
		actions_t = T.tensor(actions).to(self.q_eval.device)
		rewards_t = T.tensor(rewards, dtype=T.float).to(self.q_eval.device)
		next_states_t = T.tensor(next_states, dtype=T.float).to(self.q_eval.device)
		dones_t = T.tensor(dones, dtype=T.bool).to(self.q_eval.device)
		return states_t, actions_t, rewards_t, next_states_t, dones_t

	def update(self, batch_size):

		states_t, actions_t, rewards_t, next_states_t, dones_t = self.get_batch_tensors(batch_size)
		self.q_eval.optimizer.zero_grad()

		self.replace_target_network()

		indices = np.arange(batch_size)
		curr_Q = self.q_eval.forward(states_t)[indices, actions_t]
		max_next_Q = self.q_target.forward(next_states_t).max(1)[0]
		max_next_Q[dones_t] = 0.0  # terminal next states contribute no future value
		expected_Q = rewards_t + self.gamma * max_next_Q

		loss = self.q_eval.MSE_loss(curr_Q, expected_Q).to(self.q_eval.device)

		loss.backward()
		self.q_eval.optimizer.step()
		self.learn_step_counter += 1

		self.dec_eps()

	def learn(self,state, action, reward, next_state, done, batch_size):
		self.replay_buffer.store_transition(state, action, reward, next_state, done)

		if len(self.replay_buffer) > batch_size:
			self.update(batch_size)
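
A hypothetical instantiation (values are illustrative assumptions, not from the source); compared with Simple_DQNAgent, the only additional choice is `replace_cnt`, the number of learning steps between hard copies of `q_eval` into `q_target`:

# Hypothetical construction of the double-network agent; all values are assumptions.
agent = DQNAgent(env, network="ConvDQN", learning_rate=1e-4, gamma=0.99,
                 eps_max=1.0, eps_min=0.05, eps_dec=1e-5,
                 buffer_size=50000, replace_cnt=1000)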
Example #5
class DuelingDQNAgent(Agent):
	def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size, replace_cnt):
		super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec)

		self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape)

		self.learn_step_counter = 0
		self.replace_cnt = replace_cnt
		self.q_eval = DuelingDQN(env.env_shape, env.no_of_actions)
		self.q_target = DuelingDQN(env.env_shape, env.no_of_actions)


	def get_action(self, state):

		if np.random.random() <= self.eps:
			return self.env.sample_action()

		else:
			state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.q_eval.device)
			_, advantage = self.q_eval.forward(state)
			return T.argmax(advantage).item()

	def replace_target_network(self):
		if self.learn_step_counter % self.replace_cnt == 0:
			self.q_target.load_state_dict(self.q_eval.state_dict())

	def get_batch_tensors(self, batch_size):
		batch = self.replay_buffer.sample(batch_size)
		states, actions, rewards, next_states, dones = batch
		states_t = T.tensor(states, dtype=T.float).to(self.q_eval.device)
		actions_t = T.tensor(actions).to(self.q_eval.device)
		rewards_t = T.tensor(rewards, dtype=T.float).to(self.q_eval.device)
		next_states_t = T.tensor(next_states, dtype=T.float).to(self.q_eval.device)
		dones_t = T.tensor(dones, dtype=T.bool).to(self.q_eval.device)
		return states_t, actions_t, rewards_t, next_states_t, dones_t

	def update(self, batch_size):

		states_t, actions_t, rewards_t, next_states_t, dones_t = self.get_batch_tensors(batch_size)
		self.q_eval.optimizer.zero_grad()

		self.replace_target_network()

		indices = np.arange(batch_size)
		Vs, As = self.q_eval.forward(states_t)
		curr_Q = T.add(Vs, (As - As.mean(dim=1, keepdim=True)))[indices, actions_t]

		# bootstrap target from the target network's value/advantage streams
		Vns, Ans = self.q_target.forward(next_states_t)
		max_next_Q = T.add(Vns, (Ans - Ans.mean(dim=1, keepdim=True))).max(1)[0]
		max_next_Q[dones_t] = 0.0

		expected_Q = rewards_t + self.gamma * max_next_Q

		loss = self.q_eval.MSE_loss(curr_Q, expected_Q).to(self.q_eval.device)

		loss.backward()
		self.q_eval.optimizer.step()
		self.learn_step_counter += 1

		self.dec_eps()

	def learn(self,state, action, reward, next_state, done, batch_size):
		self.replay_buffer.store_transition(state, action, reward, next_state, done)

		if len(self.replay_buffer) > batch_size:
			self.update(batch_size)
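
The Q-values above are assembled from the two streams as Q = V + (A - mean(A)); a standalone sketch of that aggregation (shapes are assumptions: V is (batch, 1) and A is (batch, n_actions)):

# Dueling aggregation as used in update(); subtracting the mean advantage keeps the
# value/advantage decomposition identifiable. Shapes are assumptions.
import torch as T

def dueling_q(V, A):
    return T.add(V, A - A.mean(dim=1, keepdim=True))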
Example #6
class TD3Agent():
    def __init__(self,
                 env_id,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 gamma=0.99,
                 update_actor_interval=2,
                 warmup=1000,
                 n_actions=2,
                 max_size=1000000,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 noise=0):
        self.gamma = gamma
        self.tau = tau
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.n_actions = n_actions
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  layer1_size,
                                  layer2_size,
                                  n_actions=n_actions,
                                  name=env_id + '_actor')

        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      layer1_size,
                                      layer2_size,
                                      n_actions=n_actions,
                                      name=env_id + '_critic_1')
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      layer1_size,
                                      layer2_size,
                                      n_actions=n_actions,
                                      name=env_id + '_critic_2')

        self.target_actor = ActorNetwork(alpha,
                                         input_dims,
                                         layer1_size,
                                         layer2_size,
                                         n_actions=n_actions,
                                         name=env_id + '_target_actor')
        self.target_critic_1 = CriticNetwork(beta,
                                             input_dims,
                                             layer1_size,
                                             layer2_size,
                                             n_actions=n_actions,
                                             name=env_id + '_target_critic_1')
        self.target_critic_2 = CriticNetwork(beta,
                                             input_dims,
                                             layer1_size,
                                             layer2_size,
                                             n_actions=n_actions,
                                             name=env_id + '_target_critic_2')

        self.noise = noise
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        if self.time_step < self.warmup:
            mu = T.tensor(
                np.random.normal(scale=self.noise,
                                 size=(self.n_actions, ))).to(
                                     self.actor.device)
        else:
            state = T.tensor(observation, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                                 dtype=T.float).to(self.actor.device)

        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        #mu_prime = T.clamp(mu_prime, self.min_action, self.max_action)
        self.time_step += 1

        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done).to(self.critic_1.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device)
        state = T.tensor(state, dtype=T.float).to(self.critic_1.device)
        action = T.tensor(action, dtype=T.float).to(self.critic_1.device)

        target_actions = self.target_actor.forward(state_)
        target_actions = target_actions + \
                T.clamp(T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5)
        target_actions = T.clamp(target_actions, self.min_action[0],
                                 self.max_action[0])

        q1_ = self.target_critic_1.forward(state_, target_actions)
        q2_ = self.target_critic_2.forward(state_, target_actions)

        q1 = self.critic_1.forward(state, action)
        q2 = self.critic_2.forward(state, action)

        q1_[done] = 0.0
        q2_[done] = 0.0

        q1_ = q1_.view(-1)
        q2_ = q2_.view(-1)

        critic_value_ = T.min(q1_, q2_)

        target = reward + self.gamma * critic_value_
        target = target.view(self.batch_size, 1)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q1_loss = F.mse_loss(target, q1)
        q2_loss = F.mse_loss(target, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1

        if self.learn_step_cntr % self.update_actor_iter != 0:
            return

        self.actor.optimizer.zero_grad()
        actor_q1_loss = self.critic_1.forward(state, self.actor.forward(state))
        actor_loss = -T.mean(actor_q1_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_1_params = self.critic_1.named_parameters()
        critic_2_params = self.critic_2.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_1_params = self.target_critic_1.named_parameters()
        target_critic_2_params = self.target_critic_2.named_parameters()

        critic_1 = dict(critic_1_params)
        critic_2 = dict(critic_2_params)
        actor = dict(actor_params)
        target_actor = dict(target_actor_params)
        target_critic_1 = dict(target_critic_1_params)
        target_critic_2 = dict(target_critic_2_params)

        for name in critic_1:
            critic_1[name] = tau*critic_1[name].clone() + \
                    (1-tau)*target_critic_1[name].clone()

        for name in critic_2:
            critic_2[name] = tau*critic_2[name].clone() + \
                    (1-tau)*target_critic_2[name].clone()

        for name in actor:
            actor[name] = tau*actor[name].clone() + \
                    (1-tau)*target_actor[name].clone()

        self.target_critic_1.load_state_dict(critic_1)
        self.target_critic_2.load_state_dict(critic_2)
        self.target_actor.load_state_dict(actor)

    def load_models(self):
        print('... loading checkpoint ...')
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
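
A hypothetical driver for this agent using the classic Gym API (the environment name, hyperparameters, and episode count are assumptions):

# Hypothetical TD3 training driver; environment and values are assumptions.
import gym

env = gym.make('BipedalWalker-v3')
agent = TD3Agent('BipedalWalker-v3', alpha=0.001, beta=0.001,
                 input_dims=env.observation_space.shape, tau=0.005, env=env,
                 n_actions=env.action_space.shape[0], noise=0.1)

for episode in range(1000):
    obs, done = env.reset(), False
    while not done:
        action = agent.choose_action(obs)
        obs_, reward, done, _ = env.step(action)
        agent.remember(obs, action, reward, obs_, done)
        agent.learn()
        obs = obs_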
Example #7
class DQNAgent:
    def __init__(self, env, render, config_info):
        self.env = env
        self._reset_env()
        self.render = render

        # Set seeds
        self.seed = 0
        env.seed(self.seed)
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define checkpoint
        checkpoint = Checkpoint(self.device, **config_info)

        # Create / load checkpoint dict
        (
            self.ckpt,
            self.path_ckpt_dict,
            self.path_ckpt,
            config,
        ) = checkpoint.manage_checkpoint()

        # Unroll useful parameters from config dict
        self.batch_size = config["training"]["batch_size"]
        self.max_timesteps = config["training"]["max_timesteps"]
        self.replay_size = config["training"]["replay_size"]
        self.start_temp = config["training"]["start_temperature"]
        self.final_temp = config["training"]["final_temperature"]
        self.decay_temp = config["training"]["decay_temperature"]
        self.gamma = config["training"]["gamma"]
        self.early_stopping = config["training"]["early_stopping"]
        self.update_frequency = config["training"]["update_frequency"]
        self.eval_frequency = config["training"]["eval_frequency"]

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        # Define Q-network and target Q-network
        self.network = DQN(state_dim, action_dim, **config["model"]).to(self.device)
        self.target_network = DQN(state_dim, action_dim, **config["model"]).to(
            self.device
        )

        # Loss and optimizer
        self.criterion = nn.MSELoss()
        lr = config["optimizer"]["learning_rate"]
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)

        # Load network's weight if resume training
        checkpoint.load_weights(
            self.ckpt, self.network, self.target_network, self.optimizer
        )

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.replay_size)

        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

    def _reset_env(self):
        self.state, self.done = self.env.reset(), False
        self.episode_reward = 0.0

    def play_step(self, temperature=1):
        reward_signal = None

        # Boltzmann exploration
        state_v = torch.tensor(self.state, dtype=torch.float32).to(self.device)
        q_values = self.network(state_v)
        probas = Categorical(F.softmax(q_values / temperature, dim=0))
        action = probas.sample().item()

        # Perform one step in the environment
        next_state, reward, self.done, _ = self.env.step(action)

        # Create a tuple for the new transition
        new_transition = self.transition(
            self.state, action, reward, self.done, next_state
        )

        # Add transition to the replay buffer
        self.replay_buffer.store_transition(new_transition)

        self.state = next_state
        self.episode_reward += reward

        if self.render:
            self.env.render()

        if self.done:
            reward_signal = self.episode_reward
            self._reset_env()

        return reward_signal

    def train(self):

        # Initializations
        all_episode_rewards = []
        episode_timestep = 0
        best_mean_reward = None
        episode_num = 0
        temp = self.start_temp  # start temperature for exploration while the buffer fills
        writer = SummaryWriter(log_dir=self.path_ckpt, comment="-dqn")

        # Evaluate untrained policy
        evaluations = [self.eval_policy()]

        # Training loop
        for t in range(int(self.max_timesteps)):
            episode_timestep += 1

            # -> is None if episode is not terminated
            # -> is episode reward when episode is terminated
            reward_signal = self.play_step(temp)

            # when episode is terminated
            if reward_signal is not None:
                episode_reward = reward_signal
                all_episode_rewards.append(episode_reward)

                mean_reward = np.mean(all_episode_rewards[-10:])

                print(
                    f"Timestep [{t + 1}/{int(self.max_timesteps)}] ; "
                    f"Episode num : {episode_num + 1} ; "
                    f"Episode length : {episode_timestep} ; "
                    f"Reward : {episode_reward:.2f} ; "
                    f"Mean reward {mean_reward:.2f}"
                )

                # Reset counters
                episode_timestep = 0
                episode_num += 1

                # Save checkpoint
                self.ckpt["episode_num"] = episode_num
                self.ckpt["all_episode_rewards"].append(episode_reward)
                self.ckpt["optimizer_state_dict"] = self.optimizer.state_dict()
                torch.save(self.ckpt, self.path_ckpt_dict)

                writer.add_scalar("episode reward", episode_reward, t)
                writer.add_scalar("mean reward", mean_reward, t)

                # Save network if performance is better than average
                if best_mean_reward is None or best_mean_reward < mean_reward:
                    self.ckpt["best_mean_reward"] = mean_reward
                    self.ckpt["model_state_dict"] = self.network.state_dict()
                    self.ckpt[
                        "target_model_state_dict"
                    ] = self.target_network.state_dict()
                    if best_mean_reward is not None:
                        print(f"Best mean reward updated : {best_mean_reward}")
                    best_mean_reward = mean_reward

                # Criterion to early stop training
                if mean_reward > self.early_stopping:
                    self.plot_reward()
                    print(f"Solved in {t + 1}  timesteps!")
                    break

            # Fill the replay buffer
            if len(self.replay_buffer) < self.replay_size:
                continue
            else:
                # Adjust exploration parameter
                temp = np.maximum(
                    self.final_temp, self.start_temp - (t / self.decay_temp)
                )
            writer.add_scalar("temperature", temp, t)

            # Get the weights of the network before update
            weights_network = self.network.state_dict()

            # when it's time perform a batch gradient descent
            if t % self.update_frequency == 0:
                # Backward and optimize
                self.optimizer.zero_grad()
                batch = self.replay_buffer.sample_buffer(self.batch_size)
                loss = self.train_on_batch(batch)
                loss.backward()
                self.optimizer.step()

            # Synchronize target network
            self.target_network.load_state_dict(weights_network)

            # Evaluate episode
            if (t + 1) % self.eval_frequency == 0:
                evaluations.append(self.eval_policy())
                np.save(self.path_ckpt, evaluations)

    def train_on_batch(self, batch_samples):
        # Unpack batch_size of transitions randomly drawn from the replay buffer
        states, actions, rewards, dones, next_states = batch_samples

        # Transform np arrays into tensors and send them to device
        states_v = torch.tensor(states).to(self.device)
        next_states_v = torch.tensor(next_states).to(self.device)
        actions_v = torch.tensor(actions).to(self.device)
        rewards_v = torch.tensor(rewards).to(self.device)
        dones_bool = torch.tensor(dones, dtype=torch.bool).to(self.device)

        # Vectorized version
        q_vals = self.network(states_v)  # dim=batch_size x num_actions
        # Get the Q-values corresponding to the action
        q_vals = q_vals.gather(1, actions_v.view(-1, 1))
        q_vals = q_vals.view(1, -1)[0]

        target_next_q_vals = self.target_network(next_states_v)
        # Max action of the target Q-values
        target_max_next_q_vals, _ = torch.max(target_next_q_vals, dim=1)
        # If state is terminal
        target_max_next_q_vals[dones_bool] = 0.0
        # No update of the target during backpropagation
        target_max_next_q_vals = target_max_next_q_vals.detach()

        # Bellman approximation for target Q-values
        target_q_vals = rewards_v + self.gamma * target_max_next_q_vals

        return self.criterion(q_vals, target_q_vals)

    def eval_policy(self, eval_episodes=10):
        # Runs policy for X episodes and returns average reward
        # A fixed seed is used for the eval environment
        self.env.seed(self.seed + 100)

        avg_reward = 0.0
        temperature = 1
        for _ in range(eval_episodes):
            self._reset_env()
            reward_signal = None
            while reward_signal is None:
                reward_signal = self.play_step(temperature)
            avg_reward += reward_signal

        avg_reward /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        return avg_reward

    def plot_reward(self):
        plt.plot(self.ckpt["all_episode_rewards"])
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title(f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment")
        plt.tight_layout()
        path_fig = os.path.join(self.path_ckpt, "figure.png")
        plt.savefig(path_fig)
        print(f"Figure saved to {path_fig}")
        plt.show()
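
The constructor above unrolls a nested config dict; a hypothetical `config` containing exactly the keys it reads (key names taken from the code, values purely illustrative) could look like:

# Hypothetical config dict; the keys mirror what __init__ reads, the values are illustrative.
config = {
    "training": {
        "batch_size": 64,
        "max_timesteps": 1e5,
        "replay_size": 10000,
        "start_temperature": 5.0,
        "final_temperature": 0.1,
        "decay_temperature": 2000,
        "gamma": 0.99,
        "early_stopping": 195,
        "update_frequency": 4,
        "eval_frequency": 1000,
    },
    "model": {},  # keyword arguments forwarded to DQN(state_dim, action_dim, **config["model"])
    "optimizer": {"learning_rate": 1e-3},
}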
Example #8
class Agent:
    def __init__(self, input_dims, n_actions, env,
                 fc1_dims, fc2_dims, alpha, beta,
                 gamma, tau, noise1, noise2, clamp,
                 delay, max_size, batch_size, warmup):

        self.gamma = gamma
        self.tau = tau
        self.noise1 = noise1
        self.noise2 = noise2
        self.clamp = clamp
        self.delay = delay
        self.batch_size = batch_size
        self.warmup = warmup
        self.learn_cntr = 0
        self.env = env
        self.n_actions = n_actions

        self.actor = ActorNetwork(
                     input_shape=input_dims,
                     n_actions=n_actions,
                     fc1_dims=fc1_dims,
                     fc2_dims=fc2_dims,
                     alpha=alpha,
                     name='Actor_TD3PG.cpt',
                     checkpoint_dir='tmp/models')

        self.critic_1 = CriticNetwork(
                        input_shape=input_dims,
                        n_actions=n_actions,
                        fc1_dims=fc1_dims,
                        fc2_dims=fc2_dims,
                        beta=beta,
                        name='Critic_1_TD3PG.cpt',
                        checkpoint_dir='tmp/models')

        self.critic_2 = CriticNetwork(
                        input_shape=input_dims,
                        n_actions=n_actions,
                        fc1_dims=fc1_dims,
                        fc2_dims=fc2_dims,
                        beta=beta,
                        name='Critic_2_TD3PG.cpt',
                        checkpoint_dir='tmp/models')

        self.target_actor = ActorNetwork(
                            input_shape=input_dims,
                            n_actions=n_actions,
                            fc1_dims=fc1_dims,
                            fc2_dims=fc2_dims,
                            alpha=alpha,
                            name='Target_Actor_TD3PG.cpt',
                            checkpoint_dir='tmp/models')

        self.target_critic_1 = CriticNetwork(
                               input_shape=input_dims,
                               n_actions=n_actions,
                               fc1_dims=fc1_dims,
                               fc2_dims=fc2_dims,
                               beta=beta,
                               name='Target_Critic_1_TD3PG.cpt',
                               checkpoint_dir='tmp/models')

        self.target_critic_2 = CriticNetwork(
                               input_shape=input_dims, 
                               n_actions=n_actions, 
                               fc1_dims=fc1_dims,
                               fc2_dims=fc2_dims, 
                               beta=beta, 
                               name='Target_Critic_2_TD3PG.cpt',
                               checkpoint_dir='tmp/models')

        self.memory = ReplayBuffer(
                      max_size=max_size, 
                      input_shape=input_dims, 
                      n_actions=n_actions)

        self.update_target_networks(tau=1)  # hard copy the online weights into the targets

    def update_target_networks(self, tau=None):
        if tau is None:
            tau = self.tau

        actor = dict(self.actor.named_parameters())
        critic_1 = dict(self.critic_1.named_parameters())
        critic_2 = dict(self.critic_2.named_parameters())
        target_actor = dict(self.target_actor.named_parameters())
        target_critic_1 = dict(self.target_critic_1.named_parameters())
        target_critic_2 = dict(self.target_critic_2.named_parameters())
        
        for name in actor:
            actor[name] = tau*actor[name].clone() + (1-tau)*target_actor[name].clone()
        
        for name in critic_1:
            critic_1[name] = tau*critic_1[name].clone() + (1-tau)*target_critic_1[name].clone()
        
        for name in critic_2:
            critic_2[name] = tau*critic_2[name].clone() + (1-tau)*target_critic_2[name].clone()
        
        self.target_actor.load_state_dict(actor)
        self.target_critic_1.load_state_dict(critic_1)
        self.target_critic_2.load_state_dict(critic_2)
    
    def choose_action(self, observation):
        if self.learn_cntr < self.warmup:
            mu = np.random.normal(scale=self.noise1, 
                                  size=self.n_actions)
            mu = T.tensor(mu).to(self.actor.device)
        else:
            state = T.tensor(observation,
                             dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state)
        noise = T.tensor(np.random.normal(scale=self.noise1,
                                          size=self.n_actions), 
                         dtype=T.float).to(self.actor.device)
        mu_ = T.clamp(T.add(mu, noise), min=self.env.action_space.low[0],
                                        max=self.env.action_space.high[0])
        self.learn_cntr += 1
        return mu_.cpu().detach().numpy()
    
    def save_models(self):
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
        
    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)
        
    def sample(self):
        states, actions, rewards, states_, done = \
                                self.memory.sample_buffer(self.batch_size)
        
        states = T.tensor(states, dtype=T.float).to(self.critic_1.device)
        actions = T.tensor(actions, dtype=T.float).to(self.critic_1.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.critic_1.device)
        states_ = T.tensor(states_, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done, dtype=T.int).to(self.critic_1.device)
        
        return states, actions, rewards, states_, done
        
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        
        states, actions, rewards, states_, done = self.sample()

        Vs1 = self.critic_1.forward(states, actions)
        Vs2 = self.critic_2.forward(states, actions)

        actions_ = self.target_actor.forward(states_)

        # target policy smoothing: perturb the target action with the second noise scale
        noise = T.tensor(np.random.normal(scale=self.noise2,
                                          size=self.n_actions),
                         dtype=T.float).to(self.actor.device)
        noise = T.clamp(noise, min=-self.clamp, max=self.clamp)
        
        actions_ = T.add(actions_, noise)
        actions_ = T.clamp(actions_, min=self.env.action_space.low[0], 
                                     max=self.env.action_space.high[0])

        critic_1_Vs_ = self.target_critic_1.forward(states_, actions_)
        critic_2_Vs_ = self.target_critic_2.forward(states_, actions_)
        min_Vs_ = T.min(critic_1_Vs_, critic_2_Vs_)

        target = rewards + self.gamma*min_Vs_*(1-done)

        self.critic_1.optim.zero_grad()
        self.critic_2.optim.zero_grad()
        critic_1_loss = F.mse_loss(Vs1, target)
        critic_2_loss = F.mse_loss(Vs2, target)
        critic_loss = T.add(critic_1_loss, critic_2_loss)
        critic_loss.backward()
        self.critic_1.optim.step()
        self.critic_2.optim.step()

        if self.learn_cntr % self.delay == 0:
            self.actor.optim.zero_grad()
            # delayed actor update: maximize Q1 over the current states, not the next states
            actor_loss = self.critic_1.forward(states, self.actor.forward(states))
            actor_loss = -T.mean(actor_loss)
            actor_loss.backward()
            self.actor.optim.step()
            
            self.update_target_networks()
Example #9
class MADDPG:
    def __init__(self, agent_init_params,batch_size=1024,replay_buffer_capacity=100000,
                 gamma=0.95, tau=0.01, lr=0.01, hidden_dim=64,
                 discrete_action=False,env='simple_reference'):
        """
        Inputs:
            agent_init_params (list of dict): List of dicts with parameters to
                                              initialize each agent
                input_dims (int): Input dimensions of the agent's observation
                n_actions_physical (int): Number of physical actions
                n_actions_communication (int): Number of communication actions
            batch_size (int): Batch size used for learning updates
            replay_buffer_capacity (int): Maximum number of stored transitions
            env (str): Name of the multi-agent scenario passed to make_env
            gamma (float): Discount factor
            tau (float): Target update rate
            lr (float): Learning rate for policy and critic
            hidden_dim (int): Number of hidden dimensions for networks
            discrete_action (bool): Whether or not to use discrete action space
        """
        self.nagents = len(agent_init_params)
        self.agents=[]

        n_actions_total=np.sum([agent['n_actions_physical']+agent['n_actions_communication'] for agent in agent_init_params])

        n_observations_total=np.sum([agent['input_dims'] for agent in agent_init_params])

        for i in range(self.nagents):

            current_agent=Agent(id_number=i,**agent_init_params[i])
            current_agent.initialize_critic(n_actions_total+n_observations_total)
            self.agents.append(current_agent)

        self.agent_init_params = agent_init_params
        self.gamma = gamma
        self.tau = tau
        self.lr = lr

        '''
        self.discrete_action = discrete_action
        self.pol_dev = 'cpu'  # device for policies
        self.critic_dev = 'cpu'  # device for critics
        self.trgt_pol_dev = 'cpu'  # device for target policies
        self.trgt_critic_dev = 'cpu'  # device for target critics
        self.niter = 0
        '''
        
        self.replay_buffer=ReplayBuffer(replay_buffer_capacity)
        self.batch_size=batch_size

        self.env=make_env(env)

        self.loss=nn.MSELoss()

    def reset(self):
        return self.env.reset()

    def step(self,observations): #You have all the agents and policies. Once you have observations, you can just calculate actions to step

        #'observations' is a list of np arrays containing the observations of each agent.

        #'actions' should be a list of np arrays containing the actions of each agent. There's one np array per agent
        actions=[]


        for i,observation in enumerate(observations):
            current_action=self.agents[i].Action(observation)
            actions.append(current_action)

        

        new_states,rewards,terminals,_= self.env.step(actions)

        self.replay_buffer.store_transition(observations,actions,rewards,new_states,terminals)
        return new_states,rewards,terminals

    def update_networks(self,hard=False): #Carry out soft updates
        for agent in self.agents:
            agent.update_network_parameters(self.tau)

    def next_actions(self,next_states):
        next_step_actions=[]

        for i,agent in enumerate(self.agents):
            action=agent.Action(next_states[i],target=True)
            next_step_actions.append(action)

        return next_step_actions

    def learn(self):
        if self.replay_buffer.mem_cntr<self.batch_size: return

        states_batch,actions_batch,rewards_batch,next_states_batch,terminal_batch=self.replay_buffer.sample_buffer(self.batch_size)

        for agent in self.agents:
            #Critic Update
            self.update_critic(agent,states_batch,actions_batch,rewards_batch,next_states_batch,terminal_batch)
            self.update_actor(agent,states_batch,actions_batch,rewards_batch,next_states_batch,terminal_batch)



    def update_critic(self,agent,states_batch,actions_batch,rewards_batch,next_states_batch,terminal_batch):
        critic_losses=[]
        for i in range(self.batch_size):
            current_states=states_batch[i]
            current_actions=actions_batch[i]
            current_rewards=rewards_batch[i]
            next_states=next_states_batch[i]
            next_step_actions=[]

            for j,next_agent in enumerate(self.agents):
                action=next_agent.Action(next_states[j],target=True)
                next_step_actions.append(action)

            agent.critic.eval()                        
            Q=agent.critic.forward(current_states,current_actions).to(agent.critic.device)
            target=current_rewards[agent.id]+self.gamma*agent.critic.forward(next_states,next_step_actions).to(agent.critic.device).detach()
            
            loss=self.loss(Q,target)
            critic_losses.append(loss)

        agent.critic.train()
        critic_losses=torch.stack(critic_losses,0)
        mean_critic_loss=torch.mean(critic_losses).to(agent.critic.device)

        agent.critic.optimizer.zero_grad()
        mean_critic_loss.backward()

        nn.utils.clip_grad_norm_(agent.critic.parameters(), 0.5)
        agent.critic.optimizer.step()

    def update_actor(self,agent,states_batch,actions_batch,rewards_batch,next_states_batch,terminal_batch):
        Q_values=[]
        for i in range(self.batch_size):
            current_states=states_batch[i]
            current_actions=actions_batch[i]
            current_rewards=rewards_batch[i]

            agent.actor.eval()
            agent_action=agent.actor.forward(current_states[agent.id])#This is a tensor, not discretized. We could use gumbel softmax to approximate the discretization

            physical_actions=agent_action[0:agent.n_actions_physical]
            comm_actions=F.gumbel_softmax(agent_action[agent.n_actions_physical:],hard=True)

            agent_action=torch.cat((physical_actions,comm_actions))


            actions_for_critic=deepcopy(current_actions)

            for j in range(self.nagents):  # use j to avoid shadowing the batch index i
                if j == agent.id:
                    actions_for_critic[agent.id] = agent_action
                else:
                    actions_for_critic[j] = torch.tensor(actions_for_critic[j], dtype=torch.float32).to(agent.critic.device)

            actions_for_critic=list(chain.from_iterable(actions_for_critic))
            actions_for_critic=torch.stack(actions_for_critic)

            Q= -agent.critic.forward(current_states,actions_for_critic,actions_need_processing=False)
            Q_values.append(Q)

        Q_values=torch.stack(Q_values,0)

        mean_Q=torch.mean(Q_values)
            
        agent.actor.optimizer.zero_grad()
        agent.actor.train()

        mean_Q.backward()

        nn.utils.clip_grad_norm_(agent.actor.parameters(), 0.5)
        agent.actor.optimizer.step()
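
From the constructor and critic sizing above, each entry of `agent_init_params` must at least provide `input_dims`, `n_actions_physical` and `n_actions_communication`; a hypothetical two-agent setup (dimension values are assumptions, and any extra keys required by Agent.__init__ are not shown) could be:

# Hypothetical agent_init_params; keys are inferred from the code above, values are assumptions.
agent_init_params = [
    {"input_dims": 21, "n_actions_physical": 5, "n_actions_communication": 10},
    {"input_dims": 21, "n_actions_physical": 5, "n_actions_communication": 10},
]
maddpg = MADDPG(agent_init_params, batch_size=1024, env='simple_reference')
observations = maddpg.reset()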
Example #10
class Agent():
    def __init__(self,
                 env_id,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(env_id,
                                  alpha,
                                  input_dims,
                                  n_actions=n_actions,
                                  name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(env_id,
                                      beta,
                                      input_dims,
                                      n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(env_id,
                                      beta,
                                      input_dims,
                                      n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(env_id, beta, input_dims, name='value')
        self.target_value = ValueNetwork(env_id,
                                         beta,
                                         input_dims,
                                         name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                    (1-tau)*target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
Example #11
class SACAgent():
    def __init__(self, alpha, beta, tau, env, env_id, input_dims, gamma=0.99, n_actions=2,
                max_size=1000000, layer1_size=256, layer2_size=256, batch_size=256,
                reward_scale=2):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        # can use shared critic input layer and different outputs
        # but in this case using 2 separate critics
        # env.action_space.max_action switched for env.action_space.high for LunarLanderContinuous
        self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                env.action_space.high[0], n_actions, env_id+'_actor')
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                n_actions, env_id + '_critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                n_actions, env_id + '_critic_2')
        self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size, env_id+'_value')
        self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size, env_id+'_target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0] # returned as arr of arrays on gpu as torch tensor

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        # overwriting parameters - setting new values
        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                (1-tau)*target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print("... saving models")
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()

    def load_models(self):
        print("... loading models")
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        # self.critic_1.device or self.actor.device
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)
        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)

        # passing states and new states through value and target value networks
        # collapsing along batch dimension since we don't need 2d tensor for scalar quantities
        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0 # setting terminal states to 0

        # pass current states through current policy get action & log prob values
        actions, log_probs = self.actor.sample_normal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        # critic values for current policy state action pairs
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        # take critic min and collapse
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        # actor loss (using reparam trick)
        actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
        log_probs = log_probs.view(-1)
        # take critic min for the freshly sampled (new policy) actions and collapse
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        # calculating actor loss
        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        q_hat = self.scale * reward + self.gamma*value_ # qhat
        q1_old_policy = self.critic_1.forward(state, action).view(-1) # old policy (from replay buffer)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
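
A hypothetical instantiation, using the LunarLanderContinuous environment already mentioned in the comments above (the environment wiring and values are assumptions; most arguments fall back to the constructor defaults):

# Hypothetical SAC setup; the environment id comes from the comment in __init__,
# everything else is an assumption or a constructor default.
import gym

env = gym.make('LunarLanderContinuous-v2')
agent = SACAgent(alpha=0.0003, beta=0.0003, tau=0.005, env=env,
                 env_id='LunarLanderContinuous-v2',
                 input_dims=env.observation_space.shape,
                 n_actions=env.action_space.shape[0])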
Example #12
class Agent():
    # needs functions init, choose_action, store_transition
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 gamma=0.99,
                 update_actor_interval=2,
                 n_actions=2,
                 warmup=1000,
                 max_size=1e6,
                 layer1_size=400,
                 layer2_size=300,
                 batch_size=100,
                 noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        #self.max_action = n_actions
        #self.min_action = 0

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.learn_step_cntr = 0  # counts learning steps; the actor is updated every update_actor_interval steps
        self.time_step = 0  # counts environment steps until the warmup period ends
        self.warmup = warmup
        self.n_actions = n_actions
        self.update_actor_iter = update_actor_interval

        self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                                  n_actions, 'actor_net')
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions, 'critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions, 'critic_2')

        self.target_actor = ActorNetwork(alpha,
                                         input_dims,
                                         layer1_size,
                                         layer2_size,
                                         n_actions,
                                         name='target_actor')
        self.target_critic_1 = CriticNetwork(beta,
                                             input_dims,
                                             layer1_size,
                                             layer2_size,
                                             n_actions,
                                             name='target_critic_1')
        self.target_critic_2 = CriticNetwork(beta,
                                             input_dims,
                                             layer1_size,
                                             layer2_size,
                                             n_actions,
                                             name='target_critic_2')

        self.noise = noise
        self.update_network_parameters(
            tau=1)  # tau=1 hard-copies the online network parameters into the targets

    def choose_action(self, observation):
        if self.time_step < self.warmup:
            # pure exploration during warmup: sample a random Gaussian action
            mu = T.tensor(
                np.random.normal(scale=self.noise,
                                 size=(self.n_actions, ))).to(
                                     self.actor.device)
        else:
            state = T.tensor(observation, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)

        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                                 dtype=T.float).to(self.actor.device)
        # clamping on action to make sure it stays in range
        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        self.time_step += 1
        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, state_, done = self.memory.sample_buffer(
            self.batch_size)

        state = T.tensor(state, dtype=T.float).to(self.critic_1.device)
        action = T.tensor(action, dtype=T.float).to(self.critic_1.device)
        reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device)
        state_ = T.tensor(state_, dtype=T.float).to(self.critic_1.device)
        done = T.tensor(done).to(self.critic_1.device)

        # target policy smoothing: the target actor picks actions for the new states,
        # then clipped noise is added before clamping back into the action bounds
        target_actions = self.target_actor.forward(state_)
        target_actions = target_actions + T.clamp(
            T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5)
        target_actions = T.clamp(target_actions, self.min_action[0],
                                 self.max_action[0])
        # note: clamping with min_action[0]/max_action[0] assumes every action dimension shares the same bounds

        q1_ = self.target_critic_1.forward(state_, target_actions)
        q2_ = self.target_critic_2.forward(state_, target_actions)

        # needed for loss function
        q1 = self.critic_1.forward(state, action)
        q2 = self.critic_2.forward(state, action)

        #handle when new states are terminal
        q1_[done] = 0.0
        q2_[done] = 0.0

        # collapse on batch dimension
        q1_ = q1_.view(-1)
        q2_ = q2_.view(-1)

        critic_value_ = T.min(q1_, q2_)  # clipped double-Q: element-wise minimum of the two target critics
        target = reward + self.gamma * critic_value_
        target = target.view(self.batch_size, 1)  # reshape to match the critics' (batch_size, 1) output

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        # sum the two critic losses so a single backward pass updates both critics
        q1_loss = F.mse_loss(target, q1)
        q2_loss = F.mse_loss(target, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()  # backprop

        # step optimizer
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learn_step_cntr += 1
        if self.learn_step_cntr % self.update_actor_iter != 0:
            return

        self.actor.optimizer.zero_grad()
        actor_q1_loss = self.critic_1.forward(state, self.actor.forward(state))
        # deterministic policy gradient: maximize critic_1's value of the actor's actions
        actor_loss = -T.mean(actor_q1_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):  # soft (Polyak) update rule
        # called with tau=1 from the initializer to hard-copy the online parameters into the targets
        if tau is None:
            tau = self.tau

        # get the named parameters of every network
        actor_params = self.actor.named_parameters()
        critic_1_params = self.critic_1.named_parameters()
        critic_2_params = self.critic_2.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_1_params = self.target_critic_1.named_parameters()
        target_critic_2_params = self.target_critic_2.named_parameters()

        # converting to dicts
        actor_state_dict = dict(actor_params)
        critic_1_state_dict = dict(critic_1_params)
        critic_2_state_dict = dict(critic_2_params)
        target_actor_state_dict = dict(target_actor_params)
        target_critic_1_state_dict = dict(target_critic_1_params)
        target_critic_2_state_dict = dict(target_critic_2_params)

        # soft (Polyak) update: theta_target <- tau*theta_online + (1 - tau)*theta_target
        for name in critic_1_state_dict:
            critic_1_state_dict[name] = tau*critic_1_state_dict[name].clone() + \
                (1-tau)*target_critic_1_state_dict[name].clone()

        for name in critic_2_state_dict:
            critic_2_state_dict[name] = tau*critic_2_state_dict[name].clone() + \
                (1-tau)*target_critic_2_state_dict[name].clone()

        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                (1-tau)*target_actor_state_dict[name].clone()

        self.target_critic_1.load_state_dict(critic_1_state_dict)
        self.target_critic_2.load_state_dict(critic_2_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)

    def save_model(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()

    def load_model(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
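
# --- Usage sketch (not from the original example) ---
# Driving the TD3-style Agent above with a continuous-control environment.
# The gym import, environment id, and hyperparameter values are illustrative
# assumptions; the classic Gym API (4-tuple step return) is assumed.
import gym

env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha=0.001, beta=0.001,
              input_dims=env.observation_space.shape, tau=0.005,
              env=env, n_actions=env.action_space.shape[0])

for episode in range(1000):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()  # critics update every step; the actor every update_actor_interval steps
        score += reward
        observation = observation_
    print(f'episode {episode} score {score:.1f}')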
Exemplo n.º 13
0
class DQN:
    def __init__(
        self,
        input_dims=198,
        n_actions=6,
        gamma=0.1,
        epsilon=0.9,
        lr=0.0005,
        mem_size=10000,
        batch_size=32,
        eps_min=0.01,
        eps_dec=5e-10,
        replace=1000,
        algo="dnqagent",
        env_name="minerai",
        chkpt_dir="tmp/dqn",
    ):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DQNetwork(
            self.lr,
            self.n_actions,
            input_dims=self.input_dims,
            name=self.env_name + "_" + self.algo + "_q_eval",
            chkpt_dir=self.chkpt_dir,
        )

        self.q_next = DQNetwork(
            self.lr,
            self.n_actions,
            input_dims=self.input_dims,
            name=self.env_name + "_" + self.algo + "_q_next",
            chkpt_dir=self.chkpt_dir,
        )
        # self.load_models()

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation], dtype=torch.float).to(
                self.q_eval.device
            )
            actions = self.q_eval.forward(state, self.get_state2(observation))
            action = torch.argmax(actions).item()
        else:
            action = randrange(self.n_actions)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size
        )

        states = torch.tensor(state).to(self.q_eval.device)
        rewards = torch.tensor(reward).to(self.q_eval.device)
        dones = torch.tensor(done).to(self.q_eval.device)
        actions = torch.tensor(action).to(self.q_eval.device)
        states_ = torch.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.replace_target_cnt is not None and \
           self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = (
            self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
        )

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)
        # Q(s, a) predicted for the actions actually taken in the batch
        q_pred = self.q_eval.forward(states, self.get_state2(states))[indices, actions]
        # bootstrap from the target network's greedy action value in the next state
        q_next = self.q_next.forward(states_, self.get_state2(states_)).max(dim=1)[0]
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()

    def get_state2(self, observation):
        observation = np.array(torch.tensor(observation, requires_grad=False).cpu()).reshape(-1, 198)
        for i in range(observation.shape[0]):
            observation[
                i, min(int(observation[i, 192]) + int(observation[i, 193]) * 9, 0)
            ] = 1000
            observation[
                i, min(int(observation[i, 194]) + int(observation[i, 195]) * 9, 0)
            ] = 1000
            observation[
                i, min(int(observation[i, 196]) + int(observation[i, 197]) * 9, 0)
            ] = 1000
            observation[
                i, min(int(observation[i, 189]) + int(observation[i, 190]) * 9, 0)
            ] = 10000

        return (
            torch.tensor([observation], dtype=torch.float, requires_grad=True)
            .to(self.q_eval.device)
            .view(-1, 198)
        )