def __init__(self, state_dim, action_dim, max_action, args):
     self.actor = Actor(state_dim, action_dim, max_action).to(args.device)
     self.actor_target = Actor(state_dim, action_dim, max_action).to(args.device)
     self.actor_target.load_state_dict(self.actor.state_dict())
     self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
     self.critic = Critic(state_dim, action_dim).to(args.device)
     self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
     self.list_target_critic = []
     # create the different target critic networks
     for c in range(args.num_q_target):
         critic_target = Critic(state_dim, action_dim).to(args.device)
         critic_target.load_state_dict(self.critic.state_dict())
         self.list_target_critic.append(critic_target)
     
     self.target_critic = Critic(state_dim, action_dim).to(args.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.max_action = max_action
     self.num_q_target = args.num_q_target
     self.batch_size = args.batch_size
     self.discount = args.discount
     self.tau = args.tau 
     self.policy_noise = args.policy_noise
     self.noise_clip = args.noise_clip
     self.policy_freq = args.policy_freq
     self.device = args.device
     self.update_counter = 0
     self.step = 0 
     self.currentQNet = 0
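The constructor above reads every hyper-parameter from an `args` object. A minimal sketch of such an object, assuming an argparse-style namespace; the numeric values are illustrative assumptions, not the original project's settings:

from types import SimpleNamespace

import torch

args = SimpleNamespace(
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    num_q_target=4,      # number of target critics kept in list_target_critic
    batch_size=256,
    discount=0.99,
    tau=0.005,
    policy_noise=0.2,
    noise_clip=0.5,
    policy_freq=2,
)
# agent = <AgentClass>(state_dim, action_dim, max_action, args)  # class name not shown in the snippet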
Example #2
class MultiAgent(object):
    def __init__(self, config: DefaultMunch):
        self.config = config
        self.memory = self.config.memory
        self.n_agents = self.config.n_agents
        self.action_size = self.config.action_size
        self.state_size = self.config.state_size
        self.critic_local = Critic(self.state_size, self.config.action_size,
                                   self.config.n_agents).to(self.config.device)
        self.critic_target = Critic(self.state_size, self.config.action_size,
                                    self.config.n_agents).to(
                                        self.config.device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.config.lr_critic)
        self.agents = [Agent(self.config, self) for i in range(self.n_agents)]

    def step(self, states, actions, rewards, next_states, dones):
        self.memory.add((states[0], actions[0], rewards[0], next_states[0],
                         dones[0], states[1], actions[1], next_states[1]))
        self.agents[0].step()
        self.memory.add((states[1], actions[1], rewards[1], next_states[1],
                         dones[1], states[0], actions[0], next_states[0]))
        self.agents[1].step()

    def act(self, states, add_noise=True):
        actions1: torch.Tensor = self.agents[0].act(states[0], add_noise)
        actions2: torch.Tensor = self.agents[1].act(states[1], add_noise)
        actions = torch.stack([actions1, actions2], dim=0)
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def save(self, path, episode):
        for i, agent in enumerate(self.agents):
            agent.save(path + str(i), episode)

    def load(self, path):
        for i, agent in enumerate(self.agents):
            agent.load(path + str(i))
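MultiAgent expects a DefaultMunch config carrying the shared replay memory, the sizes, and the critic learning rate. A minimal config sketch, assuming the munch package; the sizes and learning rate are illustrative placeholders and the memory entry must be replaced by the project's replay buffer instance:

from munch import DefaultMunch
import torch

config = DefaultMunch.fromDict({
    "n_agents": 2,
    "state_size": 24,    # illustrative; use the environment's state size
    "action_size": 2,    # illustrative; use the environment's action size
    "lr_critic": 1e-3,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "memory": None,      # replace with the shared replay buffer instance
})
# multi_agent = MultiAgent(config)  # also requires the Critic/Agent classes from the snippet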
Example #3
class Agent():
    def __init__(self, nS, nA, indicies, config):
        self.nS = nS
        self.nA = nA
        self.indicies = indicies
        self.vector_size = self.indicies[-1][1]
        self.grade_mask = config.grade_technique_keys
        self.terrain_mask = config.terrain_technique_keys
        self.action_low = config.action_low
        self.action_high = config.action_high
        self.seed = config.seed

        self.clip_norm = config.clip_norm
        self.tau = config.tau
        self.gamma = config.gamma
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.L2 = config.L2
        self.SGD_epoch = config.SGD_epoch
        # noise
        self.noise = OUnoise(nA, config.seed)
        self.noise_scale = 1.0
        self.noise_decay = config.noise_decay

        # Priority Replay Buffer
        self.batch_size = config.batch_size
        self.buffer_size = config.buffer_size
        self.alpha = config.ALPHA
        self.beta = self.start_beta = config.START_BETA
        self.end_beta = config.END_BETA

        # actors networks
        self.actor = Actor(self.seed, nS, nA, self.grade_mask,
                           self.terrain_mask, indicies).to(self.device)
        self.actor_target = Actor(self.seed, nS, nA, self.grade_mask,
                                  self.terrain_mask, indicies).to(self.device)

        # Param noise
        self.param_noise = AdaptiveParamNoise()
        self.actor_perturbed = Actor(self.seed, nS, nA, self.grade_mask,
                                     self.terrain_mask,
                                     indicies).to(self.device)

        # critic networks
        self.critic = Critic(self.seed, nS, nA).to(self.device)
        self.critic_target = Critic(self.seed, nS, nA).to(self.device)

        # Copy the weights from local to target
        hard_update(self.critic, self.critic_target)
        hard_update(self.actor, self.actor_target)

        # optimizer
        self.actor_opt = optim.Adam(self.actor.parameters(),
                                    lr=1e-4,
                                    weight_decay=self.L2)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=1e-3,
                                     weight_decay=self.L2)

        # replay buffer
        self.PER = PriorityReplayBuffer(self.buffer_size,
                                        self.batch_size,
                                        self.seed,
                                        alpha=self.alpha,
                                        device=self.device)

        # reset agent for training
        self.reset_episode()
        self.it = 0

    def save_weights(self, path):
        params = {}
        params['actor'] = self.actor.state_dict()
        params['critic'] = self.critic.state_dict()
        torch.save(params, path)

    def load_weights(self, path):
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.critic_target.load_state_dict(checkpoint['critic'])

    def reset_episode(self):
        self.noise.reset()

    def ddpg_distance_metric(self, actions1, actions2):
        """
        Distance metric needed for adaptive parameter-space noise.
        Computes the distance between actions taken by two different policies.
        Expects numpy arrays.
        """
        diff = actions1 - actions2
        mean_diff = np.mean(np.square(diff), axis=0)
        dist = np.sqrt(np.mean(mean_diff))
        return dist

    def norm_action(self, action):
        for index in self.indicies:
            action[index[0]:index[1]] = action[index[0]:index[1]] / np.sum(
                action[index[0]:index[1]])
        return action

    def act(self, state):
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(self.tensor(state)).cpu().numpy()
        action += np.random.rand(self.indicies[-1][1]) * self.noise_scale
        self.noise_scale = max(self.noise_scale * self.noise_decay, 0.01)
        self.actor.train()
        action = self.norm_action(action)
        return action

    def act_perturbed(self, state):
        """
        Select an action using the noise-perturbed copy of the actor.
        """
        with torch.no_grad():
            action = self.actor_perturbed(self.tensor(state)).cpu().numpy()
        return action

    def perturbed_update(self):
        """
        Refresh the perturbed actor and add parameter-space noise to its weights.
        """
        hard_update(self.actor, self.actor_perturbed)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                continue  # leave layer-norm parameters unperturbed
            param = params[name]
            random = torch.randn(param.shape).to(self.device)
            param += random * self.param_noise.current_stddev

    def evaluate(self, state):
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(self.tensor(state)).cpu().numpy()
        self.actor.train()
        return action

    def step(self, obs, actions, rewards, next_obs):
        # cast as torch tensors
        next_obs = torch.from_numpy(next_obs.reshape(
            self.vector_size)).float().to(self.device)
        obs = torch.from_numpy(obs.reshape(self.vector_size)).float().to(
            self.device)
        actions = torch.from_numpy(actions.reshape(
            self.vector_size)).float().to(self.device)
        # Calc TD error
        next_action = self.actor(next_obs)
        next_value = self.critic_target(next_obs, next_action)
        target = rewards + self.gamma * next_value
        local = self.critic(obs, actions)
        TD_error = (target - local).squeeze(0)
        self.PER.add(obs, actions, rewards, next_obs, TD_error)
        for _ in range(self.SGD_epoch):
            samples, indicies, importances = self.PER.sample()
            self.learn(samples, indicies, importances)

    def add_replay_warmup(self, obs, actions, rewards, next_obs):
        next_obs = torch.from_numpy(next_obs.reshape(
            self.vector_size)).float().to(self.device)
        obs = torch.from_numpy(obs.reshape(self.vector_size)).float().to(
            self.device)
        actions = torch.from_numpy(actions.reshape(
            self.vector_size)).float().to(self.device)
        # Calculate TD_error
        next_action = self.actor(next_obs)
        next_value = self.critic_target(next_obs, next_action)
        target = np.max(rewards) + self.gamma * next_value
        local = self.critic(obs, actions)
        TD_error = (target - local).squeeze(0)
        self.PER.add(obs, actions, np.max(rewards), next_obs, TD_error)

    def learn(self, samples, indicies, importances):

        states, actions, rewards, next_states = samples

        with torch.no_grad():
            target_actions = self.actor_target(next_states)
        next_values = self.critic_target(next_states, target_actions)
        y_target = rewards + self.gamma * next_values
        y_current = self.critic(states, actions)
        TD_error = y_current - y_target
        # update critic
        critic_loss = ((torch.tensor(importances).to(self.device) *
                        TD_error)**2).mean()
        self.critic.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic.parameters(),self.clip_norm)
        self.critic_opt.step()

        # update actor
        local_actions = self.actor(states)
        actor_loss = -self.critic(states, local_actions).mean()
        self.actor.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_norm)
        self.actor_opt.step()

        # Update PER
        TD_errors = TD_error.squeeze(1).detach().cpu().numpy()
        self.PER.sum_tree.update_priorities(TD_errors, indicies)

        # soft update networks
        self.soft_update()

    def soft_update(self):
        """Soft update of target network
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def tensor(self, x):
        return torch.from_numpy(x).float().to(self.device)
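hard_update is called above (source first, target second) but is not included in the snippet; a common implementation, shown here as an assumption, simply copies parameters in place:

import torch.nn as nn


def hard_update(source: nn.Module, target: nn.Module) -> None:
    # Copy every parameter of `source` into `target` (presumed behaviour of the helper above).
    for src_param, tgt_param in zip(source.parameters(), target.parameters()):
        tgt_param.data.copy_(src_param.data)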
Example #4
File: ddpg.py  Project: marsXyr/DP-ERL
class DDPG(object):
    def __init__(self, state_dim, action_dim, max_action, memory, args):

        # actor
        self.actor = Actor(state_dim,
                           action_dim,
                           max_action,
                           layer_norm=args.layer_norm)
        self.actor_target = Actor(state_dim,
                                  action_dim,
                                  max_action,
                                  layer_norm=args.layer_norm)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=args.actor_lr)

        # critic
        self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target = Critic(state_dim,
                                    action_dim,
                                    layer_norm=args.layer_norm)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=args.critic_lr)

        # cuda
        if torch.cuda.is_available():
            self.actor = self.actor.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        # misc
        self.criterion = nn.MSELoss()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        self.memory = memory

        # hyper-parameters
        self.tau = args.tau
        self.discount = args.discount
        self.batch_size = args.batch_size

    def show_lr(self):
        print(self.actor_optimizer.state_dict())

    def select_action(self, state, noise=None):
        state = FloatTensor(state.reshape(-1, self.state_dim))
        action = self.actor(state).cpu().data.numpy().flatten()

        if noise is not None:
            action += noise.sample()

        return np.clip(action, -self.max_action, self.max_action)

    def train(self, iterations):

        for _ in tqdm(range(iterations)):

            # Sample replay buffer
            x, y, u, r, d = self.memory.sample(self.batch_size)
            state = FloatTensor(x)
            action = FloatTensor(u)
            next_state = FloatTensor(y)
            done = FloatTensor(1 - d)
            reward = FloatTensor(r)

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(next_state,
                                              self.actor_target(next_state))
                target_Q = reward + (done * self.discount * target_Q)

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def train_critic(self, iterations):

        for _ in tqdm(range(iterations)):

            # Sample replay buffer
            states, n_states, actions, rewards, dones = self.memory.sample(
                self.batch_size)

            sys.stdout.flush()

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(n_states,
                                              self.actor_target(n_states))
                target_Q = rewards + (1 - dones) * self.discount * target_Q

            # Get current Q estimate
            current_Q = self.critic(states, actions)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(states, self.actor(states)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def load(self, filename):
        self.actor.load_model(filename, "actor")
        self.critic.load_model(filename, "critic")

    def save(self, output):
        self.actor.save_model(output, "actor")
        self.critic.save_model(output, "critic")
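FloatTensor is used in select_action and train but is not defined in this snippet; a common module-level alias, assumed here, picks the CUDA tensor type when a GPU is available:

import torch

FloatTensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor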
Example #5
class ActorCriticAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 gamma=0.99,
                 tau=1e-3,
                 batch_size=128,
                 hidden_layer_size=(512, 256),
                 lr_actor_critic=(1e-3, 1e-4),
                 noise=(0.6, 0.995)):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            batch_size (int): minibatch size
            hidden_layer_size (tuple(int, int)): tuple of hidden layer size for the actor and critic network
            lr_actor_critic (tuple(float, float)): tuple of learning rates of the actor and of the critic
            noise (tuple(float, float)): tuple containing the noise factor and the rate to apply to the factor after each episode
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)
        self.seed = seed
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.name = f'agent'

        lr_actor, lr_critic = lr_actor_critic
        # Actor Networks (local one and target one)
        fc1_units, fc2_units = hidden_layer_size
        self.actor_local = Actor(state_size,
                                 action_size,
                                 seed,
                                 fc1_units=fc1_units,
                                 fc2_units=fc2_units).to(device)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  seed,
                                  fc1_units=fc1_units,
                                  fc2_units=fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (local one and target one)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=WEIGHT_DECAY)

        # Initialize the target model weights with the local ones (same values)
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Noise process
        factor, decay_rate = noise
        self.noise = GaussianNoise(action_size, factor, decay_rate=decay_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for i in range(len(states)):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy.

        Params
        ======
            states (array_like): current state
            add_noise: indicates if noise should be added
        """
        states = torch.from_numpy(states).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    def end(self):
        """ Method applied at the end of each episode """
        self.noise.end()

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # as suggested in the "Benchmark implementation" section of the course
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states.detach(), actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
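GaussianNoise is constructed above with a factor and decay_rate and exposes sample(), end(), and reset(), but its implementation is not shown; a plausible sketch consistent with those calls (all details here are assumptions) is:

import numpy as np


class GaussianNoise:
    """Zero-mean Gaussian action noise whose scale decays after each episode."""

    def __init__(self, size, factor, decay_rate=0.995):
        self.size = size
        self.initial_factor = factor
        self.factor = factor
        self.decay_rate = decay_rate

    def sample(self):
        # Noise added to the deterministic action in act().
        return self.factor * np.random.standard_normal(self.size)

    def end(self):
        # Called at the end of an episode: shrink the noise scale.
        self.factor *= self.decay_rate

    def reset(self):
        # Restore the initial noise scale.
        self.factor = self.initial_factor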
Example #6
def train(BATCH_SIZE, DISCOUNT, ENTROPY_WEIGHT, HIDDEN_SIZE, LEARNING_RATE,
          MAX_STEPS, POLYAK_FACTOR, REPLAY_SIZE, TEST_INTERVAL,
          UPDATE_INTERVAL, UPDATE_START, ENV, OBSERVATION_LOW, VALUE_FNC,
          FLOW_TYPE, FLOWS, DEMONSTRATIONS, PRIORITIZE_REPLAY,
          BEHAVIOR_CLONING, ARM, BASE, RPA, REWARD_DENSE, logdir):

    ALPHA = 0.3
    BETA = 1
    epsilon = 0.0001  #0.1
    epsilon_d = 0.1  #0.3
    weights = 1  #1
    lambda_ac = 0.85  #0.7
    lambda_bc = 0.3  #0.4

    setup_logger(logdir, locals())
    ENV = __import__(ENV)
    if ARM and BASE:
        env = ENV.youBotAll('youbot_navig2.ttt',
                            obs_lowdim=OBSERVATION_LOW,
                            rpa=RPA,
                            reward_dense=REWARD_DENSE,
                            boundary=1)
    elif ARM:
        env = ENV.youBotArm('youbot_navig.ttt',
                            obs_lowdim=OBSERVATION_LOW,
                            rpa=RPA,
                            reward_dense=REWARD_DENSE)
    elif BASE:
        env = ENV.youBotBase('youbot_navig.ttt',
                             obs_lowdim=OBSERVATION_LOW,
                             rpa=RPA,
                             reward_dense=REWARD_DENSE,
                             boundary=1)

    action_space = env.action_space
    obs_space = env.observation_space()
    step_limit = env.step_limit()

    if OBSERVATION_LOW:
        actor = SoftActorGated(HIDDEN_SIZE,
                               action_space,
                               obs_space,
                               flow_type=FLOW_TYPE,
                               flows=FLOWS).float().to(device)
        critic_1 = Critic(HIDDEN_SIZE,
                          1,
                          obs_space,
                          action_space,
                          state_action=True).float().to(device)
        critic_2 = Critic(HIDDEN_SIZE,
                          1,
                          obs_space,
                          action_space,
                          state_action=True).float().to(device)
    else:
        actor = ActorImageNet(HIDDEN_SIZE,
                              action_space,
                              obs_space,
                              flow_type=FLOW_TYPE,
                              flows=FLOWS).float().to(device)
        critic_1 = Critic(HIDDEN_SIZE,
                          1,
                          obs_space,
                          action_space,
                          state_action=True).float().to(device)
        critic_2 = Critic(HIDDEN_SIZE,
                          1,
                          obs_space,
                          action_space,
                          state_action=True).float().to(device)
        critic_1.load_state_dict(
            torch.load(
                'data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl'
            ))
        critic_2.load_state_dict(
            torch.load(
                'data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl'
            ))

    actor.apply(weights_init)
    # critic_1.apply(weights_init)
    # critic_2.apply(weights_init)

    if VALUE_FNC:
        value_critic = Critic(HIDDEN_SIZE, 1, obs_space,
                              action_space).float().to(device)
        target_value_critic = create_target_network(value_critic).float().to(
            device)
        value_critic_optimiser = optim.Adam(value_critic.parameters(),
                                            lr=LEARNING_RATE)
    else:
        target_critic_1 = create_target_network(critic_1)
        target_critic_2 = create_target_network(critic_2)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(list(critic_1.parameters()) +
                                   list(critic_2.parameters()),
                                   lr=LEARNING_RATE)

    # Replay buffer
    if PRIORITIZE_REPLAY:
        # D = PrioritizedReplayBuffer(REPLAY_SIZE, ALPHA)
        D = ReplayMemory(device, 3, DISCOUNT, 1, BETA, ALPHA, REPLAY_SIZE)
    else:
        D = deque(maxlen=REPLAY_SIZE)

    eval_ = evaluation_sac(env, logdir, device)

    #Automatic entropy tuning init
    target_entropy = -np.prod(action_space).item()
    log_alpha = torch.zeros(1, requires_grad=True, device=device)
    alpha_optimizer = optim.Adam([log_alpha], lr=LEARNING_RATE)

    home = os.path.expanduser('~')
    if DEMONSTRATIONS:
        dir_dem = os.path.join(home, 'robotics_drl/data/demonstrations/',
                               DEMONSTRATIONS)
        D, n_demonstrations = load_buffer_demonstrations(
            D, dir_dem, PRIORITIZE_REPLAY, OBSERVATION_LOW)
    else:
        n_demonstrations = 0

    if not BEHAVIOR_CLONING:
        behavior_loss = 0

    os.mkdir(os.path.join(home, 'robotics_drl', logdir, 'models'))
    dir_models = os.path.join(home, 'robotics_drl', logdir, 'models')

    state, done = env.reset(), False
    if OBSERVATION_LOW:
        state = state.float().to(device)
    else:
        state['low'] = state['low'].float()
        state['high'] = state['high'].float()
    pbar = tqdm(range(1, MAX_STEPS + 1), unit_scale=1, smoothing=0)

    steps = 0
    success = 0
    eval_c2 = False  # ensures evaluation waits until at least one episode has finished
    for step in pbar:
        with torch.no_grad():
            if step < UPDATE_START and not DEMONSTRATIONS:
                # To improve exploration take actions sampled from a uniform random distribution over actions at the start of training
                action = torch.tensor(env.sample_action(),
                                      dtype=torch.float32,
                                      device=device).unsqueeze(dim=0)
            else:
                # Observe state s and select action a ~ μ(a|s)
                if not OBSERVATION_LOW:
                    state['low'] = state['low'].float().to(device)
                    state['high'] = state['high'].float().to(device)
                action, _ = actor(state, log_prob=False, deterministic=False)
                if not OBSERVATION_LOW:
                    state['low'] = state['low'].float().cpu()
                    state['high'] = state['high'].float().cpu()
                #if (policy.mean).mean() > 0.4:
                #    print("GOOD VELOCITY")
            # Execute a in the environment and observe next state s', reward r, and done signal d to indicate whether s' is terminal
            next_state, reward, done = env.step(
                action.squeeze(dim=0).cpu().tolist())
            if OBSERVATION_LOW:
                next_state = next_state.float().to(device)
            else:
                next_state['low'] = next_state['low'].float()
                next_state['high'] = next_state['high'].float()
            # Store (s, a, r, s', d) in replay buffer D
            if PRIORITIZE_REPLAY:
                if OBSERVATION_LOW:
                    D.add(state.cpu().tolist(),
                          action.cpu().squeeze().tolist(), reward,
                          next_state.cpu().tolist(), done)
                else:
                    D.append(state['high'], state['low'],
                             action.cpu().squeeze().tolist(), reward, done)
            else:
                D.append({
                    'state':
                    state.unsqueeze(dim=0) if OBSERVATION_LOW else state,
                    'action':
                    action,
                    'reward':
                    torch.tensor([reward], dtype=torch.float32, device=device),
                    'next_state':
                    next_state.unsqueeze(
                        dim=0) if OBSERVATION_LOW else next_state,
                    'done':
                    torch.tensor([True if reward == 1 else False],
                                 dtype=torch.float32,
                                 device=device)
                })

            state = next_state

            # If s' is terminal, reset environment state
            steps += 1

            if done or steps > step_limit:  #TODO: incorporate step limit in the environment
                eval_c2 = True  #TODO: multiprocess pyrep with a session for each testing and training
                steps = 0
                if OBSERVATION_LOW:
                    state = env.reset().float().to(device)
                else:
                    state = env.reset()
                    state['low'] = state['low'].float()
                    state['high'] = state['high'].float()
                if reward == 1:
                    success += 1

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            for _ in range(1):
                # Randomly sample a batch of transitions B = {(s, a, r, s', d)} from D
                if PRIORITIZE_REPLAY:
                    if OBSERVATION_LOW:
                        state_batch, action_batch, reward_batch, state_next_batch, done_batch, weights_pr, idxes = D.sample(
                            BATCH_SIZE, BETA)
                        state_batch = torch.from_numpy(state_batch).float().to(
                            device)
                        next_state_batch = torch.from_numpy(
                            state_next_batch).float().to(device)
                        action_batch = torch.from_numpy(
                            action_batch).float().to(device)
                        reward_batch = torch.from_numpy(
                            reward_batch).float().to(device)
                        done_batch = torch.from_numpy(done_batch).float().to(
                            device)
                        weights_pr = torch.from_numpy(weights_pr).float().to(
                            device)
                    else:
                        idxes, high_state_batch, low_state_batch, action_batch, reward_batch, high_state_next_batch, low_state_next_batch, done_batch, weights_pr = D.sample(
                            BATCH_SIZE)

                        state_batch = {
                            'low':
                            low_state_batch.float().to(device).view(-1, 32),
                            'high':
                            high_state_batch.float().to(device).view(
                                -1, 12, 128, 128)
                        }
                        next_state_batch = {
                            'low':
                            low_state_next_batch.float().to(device).view(
                                -1, 32),
                            'high':
                            high_state_next_batch.float().to(device).view(
                                -1, 12, 128, 128)
                        }

                        action_batch = action_batch.float().to(device)
                        reward_batch = reward_batch.float().to(device)
                        done_batch = done_batch.float().to(device)
                        weights_pr = weights_pr.float().to(device)
                        # for j in range(BATCH_SIZE):
                        #     new_state_batch['high'] = torch.cat((new_state_batch['high'], state_batch[j].tolist()['high'].view(-1,(3+1)*env.frames,128,128)), dim=0)
                        #     new_state_batch['low'] = torch.cat((new_state_batch['low'], state_batch[j].tolist()['low'].view(-1,32)), dim=0)
                        #     new_next_state_batch['high'] = torch.cat((new_next_state_batch['high'], state_next_batch[j].tolist()['high'].view(-1,(3+1)*env.frames,128,128)), dim=0)
                        #     new_next_state_batch['low'] = torch.cat((new_next_state_batch['low'], state_next_batch[j].tolist()['low'].view(-1,32)), dim=0)
                        # new_state_batch['high'] = new_state_batch['high'].to(device)
                        # new_state_batch['low'] = new_state_batch['low'].to(device)
                        # new_next_state_batch['high'] = new_next_state_batch['high'].to(device)
                        # new_next_state_batch['low'] = new_next_state_batch['low'].to(device)

                    batch = {
                        'state': state_batch,
                        'action': action_batch,
                        'reward': reward_batch,
                        'next_state': next_state_batch,
                        'done': done_batch
                    }
                    state_batch = []
                    state_next_batch = []

                else:
                    batch = random.sample(D, BATCH_SIZE)
                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    state_next_batch = []
                    done_batch = []
                    for d in batch:
                        state_batch.append(d['state'])
                        action_batch.append(d['action'])
                        reward_batch.append(d['reward'])
                        state_next_batch.append(d['next_state'])
                        done_batch.append(d['done'])

                    batch = {
                        'state': torch.cat(state_batch, dim=0),
                        'action': torch.cat(action_batch, dim=0),
                        'reward': torch.cat(reward_batch, dim=0),
                        'next_state': torch.cat(state_next_batch, dim=0),
                        'done': torch.cat(done_batch, dim=0)
                    }

                action, log_prob = actor(batch['state'],
                                         log_prob=True,
                                         deterministic=False)

                #Automatic entropy tuning
                alpha_loss = -(
                    log_alpha.float() *
                    (log_prob + target_entropy).float().detach()).mean()
                alpha_optimizer.zero_grad()
                alpha_loss.backward()
                alpha_optimizer.step()
                alpha = log_alpha.exp()
                weighted_sample_entropy = (alpha.float() * log_prob).view(
                    -1, 1)

                # Compute targets for Q and V functions
                if VALUE_FNC:
                    y_q = batch['reward'] + DISCOUNT * (
                        1 - batch['done']) * target_value_critic(
                            batch['next_state'])
                    y_v = torch.min(
                        critic_1(batch['state']['low'], action.detach()),
                        critic_2(batch['state']['low'], action.detach())
                    ) - weighted_sample_entropy.detach()
                else:
                    # No value function network
                    with torch.no_grad():
                        next_actions, next_log_prob = actor(
                            batch['next_state'],
                            log_prob=True,
                            deterministic=False)
                        target_qs = torch.min(
                            target_critic_1(
                                batch['next_state']['low'] if
                                not OBSERVATION_LOW else batch['next_state'],
                                next_actions),
                            target_critic_2(
                                batch['next_state']['low'] if
                                not OBSERVATION_LOW else batch['next_state'],
                                next_actions)) - alpha * next_log_prob
                    y_q = batch['reward'] + DISCOUNT * (
                        1 - batch['done']) * target_qs.detach()

                td_error_critic1 = critic_1(
                    batch['state']['low'] if not OBSERVATION_LOW else
                    batch['state'], batch['action']) - y_q
                td_error_critic2 = critic_2(
                    batch['state']['low'] if not OBSERVATION_LOW else
                    batch['state'], batch['action']) - y_q

                q_loss = (td_error_critic1).pow(2).mean() + (
                    td_error_critic2).pow(2).mean()
                # q_loss = (F.mse_loss(critic_1(batch['state'], batch['action']), y_q) + F.mse_loss(critic_2(batch['state'], batch['action']), y_q)).mean()
                critics_optimiser.zero_grad()
                q_loss.backward()
                critics_optimiser.step()

                # Compute priorities, taking demonstrations into account
                if PRIORITIZE_REPLAY:
                    td_error = weights_pr * (td_error_critic1.detach() +
                                             td_error_critic2.detach()).mean()
                    action_dem = torch.tensor([]).to(device)
                    if OBSERVATION_LOW:
                        state_dem = torch.tensor([]).to(device)
                    else:
                        state_dem = {
                            'low': torch.tensor([]).float().to(device),
                            'high': torch.tensor([]).float().to(device)
                        }
                    priorities = torch.abs(td_error).tolist()
                    i = 0
                    count_dem = 0
                    for idx in idxes:
                        priorities[i] += epsilon
                        if idx < n_demonstrations:
                            priorities[i] += epsilon_d
                            count_dem += 1
                            if BEHAVIOR_CLONING:
                                action_dem = torch.cat(
                                    (action_dem, batch['action'][i].view(
                                        1, -1)),
                                    dim=0)
                                if OBSERVATION_LOW:
                                    state_dem = torch.cat(
                                        (state_dem, batch['state'][i].view(
                                            1, -1)),
                                        dim=0)
                                else:
                                    state_dem['high'] = torch.cat(
                                        (state_dem['high'],
                                         batch['state']['high'][i, ].view(
                                             -1,
                                             (3 + 1) * env.frames, 128, 128)),
                                        dim=0)
                                    state_dem['low'] = torch.cat(
                                        (state_dem['low'],
                                         batch['state']['low'][i, ].view(
                                             -1, 32)),
                                        dim=0)
                        i += 1
                    if not action_dem.nelement() == 0:
                        actual_action_dem, _ = actor(state_dem,
                                                     log_prob=False,
                                                     deterministic=True)
                        # q_value_actor = (critic_1(batch['state'][i], batch['action'][i]) + critic_2(batch['state'][i], batch['action'][i]))/2
                        # q_value_actual = (critic_1(batch['state'][i], actual_action_dem) + critic_2(batch['state'][i], actual_action_dem))/2
                        # if q_value_actor > q_value_actual: # Q Filter
                        behavior_loss = F.mse_loss(
                            action_dem, actual_action_dem).unsqueeze(dim=0)
                    else:
                        behavior_loss = 0

                    D.update_priorities(idxes, priorities)
                    lambda_bc = (count_dem / BATCH_SIZE) / 5

                # Update V-function by one step of gradient descent
                if VALUE_FNC:
                    v_loss = (value_critic(batch['state']) -
                              y_v).pow(2).mean().to(device)

                    value_critic_optimiser.zero_grad()
                    v_loss.backward()
                    value_critic_optimiser.step()

                # Update policy by one step of gradient ascent
                with torch.no_grad():
                    new_qs = torch.min(
                        critic_1(
                            batch["state"]['low'] if not OBSERVATION_LOW else
                            batch['state'], action),
                        critic_2(
                            batch["state"]['low'] if not OBSERVATION_LOW else
                            batch['state'], action))
                policy_loss = lambda_ac * (weighted_sample_entropy.view(
                    -1) - new_qs).mean().to(device) + lambda_bc * behavior_loss
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Update target value network
                if VALUE_FNC:
                    update_target_network(value_critic, target_value_critic,
                                          POLYAK_FACTOR)
                else:
                    update_target_network(critic_1, target_critic_1,
                                          POLYAK_FACTOR)
                    update_target_network(critic_2, target_critic_2,
                                          POLYAK_FACTOR)
        state_dem = []

        # Continues to sample transitions till episode is done and evaluation is on
        if step > UPDATE_START and step % TEST_INTERVAL == 0: eval_c = True
        else: eval_c = False

        if eval_c == True and eval_c2 == True:
            eval_c = False
            eval_c2 = False
            actor.eval()
            critic_1.eval()
            critic_2.eval()
            q_value_eval = eval_.get_qvalue(critic_1, critic_2)
            return_ep, steps_ep = eval_.sample_episode(actor)

            logz.log_tabular('Training steps', step)
            logz.log_tabular('Cumulative Success', success)
            logz.log_tabular('Validation return', return_ep.mean())
            logz.log_tabular('Validation steps', steps_ep.mean())
            logz.log_tabular('Validation return std', return_ep.std())
            logz.log_tabular('Validation steps std', steps_ep.std())
            logz.log_tabular('Q-value evaluation', q_value_eval)
            logz.log_tabular('Q-network loss', q_loss.detach().cpu().numpy())
            if VALUE_FNC:
                logz.log_tabular('Value-network loss',
                                 v_loss.detach().cpu().numpy())
            logz.log_tabular('Policy-network loss',
                             policy_loss.detach().cpu().squeeze().numpy())
            logz.log_tabular('Alpha loss', alpha_loss.detach().cpu().numpy())
            logz.log_tabular('Alpha', alpha.detach().cpu().squeeze().numpy())
            logz.log_tabular('Demonstrations current batch', count_dem)
            logz.dump_tabular()

            logz.save_pytorch_model(actor.state_dict())

            torch.save(actor.state_dict(),
                       os.path.join(dir_models, 'actor_model_%s.pkl' % (step)))
            torch.save(
                critic_1.state_dict(),
                os.path.join(dir_models, 'critic1_model_%s.pkl' % (step)))
            torch.save(
                critic_2.state_dict(),
                os.path.join(dir_models, 'critic2_model_%s.pkl' % (step)))

            #pbar.set_description('Step: %i | Reward: %f' % (step, return_ep.mean()))

            actor.train()
            critic_1.train()
            critic_2.train()

    env.terminate()
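create_target_network and update_target_network are used throughout train() but are not part of the snippet; common definitions, assumed here, are a frozen deep copy and a Polyak average (the averaging direction below keeps POLYAK_FACTOR of the target weights):

import copy

import torch
import torch.nn as nn


def create_target_network(network: nn.Module) -> nn.Module:
    # Frozen copy used as the slowly-moving target.
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False
    return target


def update_target_network(network: nn.Module, target: nn.Module, polyak: float) -> None:
    # target <- polyak * target + (1 - polyak) * network
    with torch.no_grad():
        for param, target_param in zip(network.parameters(), target.parameters()):
            target_param.data.mul_(polyak).add_((1.0 - polyak) * param.data)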
Example #7
class TD3:
    def __init__(self,
                 env,
                 state_dim,
                 action_dim,
                 max_action,
                 gamma=0.99,
                 tau=0.005,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2):
        self.actor = Actor(state_dim, action_dim)
        self.actor_target = Actor(state_dim, action_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-3)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.max_action = max_action
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.actor.to(self.device)
        self.actor_target.to(self.device)
        self.critic.to(self.device)
        self.critic_target.to(self.device)

        self.env = env
        self.total_it = 0

    def select_action(self, state, noise=0.1):
        action = self.actor(state.to(self.device)).data.cpu().numpy().flatten()
        if noise != 0:
            action = (action + np.random.normal(
                0, noise, size=self.env.action_space.shape[0]))

        return action.clip(self.env.action_space.low,
                           self.env.action_space.high)

    def train(self, replay_buffer, batch_size=128):
        self.total_it += 1

        states, states_, actions, rewards, terminal = replay_buffer.sample_buffer(
            batch_size)

        with torch.no_grad():
            noise = (torch.randn_like(actions.to(self.device)) *
                     self.policy_noise).clamp(-self.noise_clip,
                                              self.noise_clip)

            next_action = (self.actor_target(states_.to(self.device)) +
                           noise).clamp(-self.max_action, self.max_action)

            # compute the target Q value
            target_q1, target_q2 = self.critic_target(
                states_.to(self.device), next_action.to(self.device))
            target_q = torch.min(target_q1, target_q2)
            # target_q = rewards + terminal * self.gamma + target_q.cpu()
            # target_q = rewards + (terminal.reshape(256, 1) * self.gamma * target_q).detach()
            target_q = rewards + terminal * self.gamma * target_q[:, 0].cpu()

        # Get current Q value
        current_q1, current_q2 = self.critic(states.to(self.device),
                                             actions.to(self.device))

        # Compute critic loss
        critic_loss = F.mse_loss(current_q1[:, 0], target_q.to(
            self.device)) + F.mse_loss(current_q2[:, 0],
                                       target_q.to(self.device))

        # optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Compute actor loss
            actor_loss = -self.critic.q1(states.to(
                self.device), self.actor(states.to(self.device))).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def save(self, filename):
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(),
                   filename + "_critic_optimizer")
        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(),
                   filename + "_actor_optimizer")

    def load(self, filename):
        self.critic.load_state_dict(torch.load(filename + "_critic"))
        self.critic_optimizer.load_state_dict(
            torch.load(filename + "_critic_optimizer"))
        self.actor.load_state_dict(torch.load(filename + "_actor"))
        self.actor_optimizer.load_state_dict(
            torch.load(filename + "_actor_optimizer"))
Example #8
            loss_critic.backward(retain_graph=True)
            opt_critic.step()

        gen_fake = critic(fake)
        loss_gen = -torch.mean(gen_fake)
        generator.zero_grad()
        loss_gen.backward()
        opt_gen.step()

        if batch_idx % 20 == 0:
            print(
                f"Epoch [{epoch} / {epochs_nums}] Batch [{batch_idx} / {len(loader)}]  Loss C: {-loss_critic:.4f} , Loss G: {loss_gen:.4f}"
            )
            with torch.no_grad():
                fake = generator(fixed_sample).to(device)
                data = real.to(device)
                img_grid_fake = torchvision.utils.make_grid(fake[:32],
                                                            normalize=True)
                img_grid_real = torchvision.utils.make_grid(real[:32],
                                                            normalize=True)

                Writer_fake.add_image("Images Fake",
                                      img_grid_fake,
                                      global_step=step)
                Writer_real.add_image("Images Real",
                                      img_grid_real,
                                      global_step=step)
                step = step + 1
                torch.save(generator.state_dict(), "gen_weights.pt")
                torch.save(critic.state_dict(), "critic_weights.pt")
Example #9
class DDPG(object):
    def __init__(self, seed, nA, nS, L2, index):
        self.seed = seed
        self.nA = nA
        self.nS = nS
        self.nO = 52  # 24 * 2 state space + 2 * 2 action space
        self.L2 = L2
        self.index = index
        self.noise = OUnoise(nA, seed)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.local_critic = Critic(seed, self.nO, nA).to(self.device)
        self.target_critic = Critic(seed, self.nO, nA).to(self.device)
        self.local_actor = Actor(seed, nS, nA).to(self.device)
        self.target_actor = Actor(seed, nS, nA).to(self.device)

        # Copy the weights from local to target
        hard_update(self.local_critic, self.target_critic)
        hard_update(self.local_actor, self.target_actor)

        self.critic_optimizer = optim.Adam(self.local_critic.parameters(),
                                           lr=1e-3,
                                           weight_decay=self.L2)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(),
                                          lr=1e-4)

    def load_weights(self, critic_path, actor_path):
        # Load weights for both networks
        self.local_critic.load_state_dict(
            torch.load(critic_path + 'local_critic_' + str(self.index) +
                       '.ckpt'))
        self.local_actor.load_state_dict(
            torch.load(actor_path + 'local_actor_' + str(self.index) +
                       '.ckpt'))
        self.target_critic.load_state_dict(
            torch.load(critic_path + 'target_critic_' + str(self.index) +
                       '.ckpt'))
        self.target_actor.load_state_dict(
            torch.load(actor_path + 'target_actor_' + str(self.index) +
                       '.ckpt'))
        self.local_actor.eval()

    def save_weights(self, critic_path, actor_path):
        # Save weights for both
        torch.save(self.local_actor.state_dict(),
                   actor_path + 'local_actor_' + str(self.index) + '.ckpt')
        torch.save(self.target_actor.state_dict(),
                   actor_path + 'target_actor_' + str(self.index) + '.ckpt')
        torch.save(self.local_critic.state_dict(),
                   critic_path + 'local_critic_' + str(self.index) + '.ckpt')
        torch.save(self.target_critic.state_dict(),
                   critic_path + 'target_critic_' + str(self.index) + '.ckpt')

    def act(self, state):
        action = self.local_actor(
            state).detach().cpu().numpy() + self.noise.sample()
        return action

    def target_act(self, next_state):
        action = self.target_actor(
            next_state).detach().cpu().numpy() + self.noise.sample()
        return action

    def step(self):
        pass

    def learn(self):
        pass
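
# `hard_update` and `OUnoise` are referenced above but defined elsewhere. A minimal
# sketch of the weight-copy helper, assuming the call convention used above
# (source network first, target network second):
def hard_update(source, target):
    """Copy every parameter of `source` into `target` in place."""
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(source_param.data)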
Example #10
class ddpg_agent:
    def __init__(self, args, env):
        self.args = args
        self.env = env
        # get the number of inputs...
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        self.action_scale = self.env.action_space.high[0]
        # build up the network
        self.actor_net = Actor(num_inputs, num_actions)
        self.critic_net = Critic(num_inputs, num_actions)
        # get the target network...
        self.actor_target_net = Actor(num_inputs, num_actions)
        self.critic_target_net = Critic(num_inputs, num_actions)
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()
        # copy the parameters..
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())
        # setup the optimizer...
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                lr=self.args.actor_lr)
        self.optimizer_critic = torch.optim.Adam(
            self.critic_net.parameters(),
            lr=self.args.critic_lr,
            weight_decay=self.args.critic_l2_reg)
        # setting up the noise
        self.ou_noise = OUNoise(num_actions)
        # check some dir
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        self.model_path = self.args.save_dir + self.args.env_name + '/'
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    # start to train the network..
    def learn(self):
        # init the brain memory
        replay_buffer = []
        total_timesteps = 0
        running_reward = None
        for episode_idx in range(self.args.max_episode):
            state = self.env.reset()
            # get the scale of the ou noise...
            self.ou_noise.scale = (self.args.noise_scale - self.args.final_noise_scale) * max(0, self.args.exploration_length - episode_idx) / \
                                self.args.exploration_length + self.args.final_noise_scale
            self.ou_noise.reset()
            # start the training
            reward_total = 0
            while True:
                state_tensor = torch.tensor(state,
                                            dtype=torch.float32).unsqueeze(0)
                if self.args.cuda:
                    state_tensor = state_tensor.cuda()
                with torch.no_grad():
                    policy = self.actor_net(state_tensor)
                # start to select the actions...
                actions = self._select_actions(policy)
                # step
                state_, reward, done, _ = self.env.step(actions *
                                                        self.action_scale)
                total_timesteps += 1
                reward_total += reward
                # start to store the samples...
                replay_buffer.append((state, reward, actions, done, state_))
                # check if the buffer size is out of range
                if len(replay_buffer) > self.args.replay_size:
                    replay_buffer.pop(0)
                if len(replay_buffer) > self.args.batch_size:
                    mini_batch = random.sample(replay_buffer,
                                               self.args.batch_size)
                    # start to update the network
                    _, _ = self._update_network(mini_batch)
                if done:
                    break
                state = state_
            running_reward = reward_total if running_reward is None else running_reward * 0.99 + reward_total * 0.01
            if episode_idx % self.args.display_interval == 0:
                torch.save(self.actor_net.state_dict(),
                           self.model_path + 'model.pt')
                print('[{}] Episode: {}, Frames: {}, Rewards: {}'.format(
                    datetime.now(), episode_idx, total_timesteps,
                    running_reward))

        self.env.close()

    # select actions
    def _select_actions(self, policy):
        actions = policy.detach().cpu().numpy()[0]
        actions = actions + self.ou_noise.noise()
        actions = np.clip(actions, -1, 1)
        return actions

    # update the network
    def _update_network(self, mini_batch):
        state_batch = np.array([element[0] for element in mini_batch])
        state_batch = torch.tensor(state_batch, dtype=torch.float32)
        # reward batch
        reward_batch = np.array([element[1] for element in mini_batch])
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float32).unsqueeze(1)
        # done batch
        done_batch = np.array([int(element[3]) for element in mini_batch])
        done_batch = 1 - done_batch
        done_batch = torch.tensor(done_batch, dtype=torch.float32).unsqueeze(1)
        # action batch
        actions_batch = np.array([element[2] for element in mini_batch])
        actions_batch = torch.tensor(actions_batch, dtype=torch.float32)
        # next state
        state_next_batch = np.array([element[4] for element in mini_batch])
        state_next_batch = torch.tensor(state_next_batch, dtype=torch.float32)
        # check if use the cuda
        if self.args.cuda:
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            done_batch = done_batch.cuda()
            actions_batch = actions_batch.cuda()
            state_next_batch = state_next_batch.cuda()

        # update the critic network...
        with torch.no_grad():
            actions_out = self.actor_target_net(state_next_batch)
            expected_q_value = self.critic_target_net(state_next_batch,
                                                      actions_out)
        # get the target value
        target_value = reward_batch + self.args.gamma * expected_q_value * done_batch
        target_value = target_value.detach()
        values = self.critic_net(state_batch, actions_batch)
        critic_loss = (target_value - values).pow(2).mean()
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        self.optimizer_critic.step()
        # start to update the actor network
        actor_loss = -self.critic_net(state_batch,
                                      self.actor_net(state_batch)).mean()
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()
        # then, start to softupdate the network...
        self._soft_update_target_network(self.critic_target_net,
                                         self.critic_net)
        self._soft_update_target_network(self.actor_target_net, self.actor_net)

        return actor_loss.item(), critic_loss.item()

    # soft update the network
    def _soft_update_target_network(self, target, source):
        # update the critic network firstly...
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(self.args.tau * param.data +
                                    (1 - self.args.tau) * target_param.data)

    # functions to test the network
    def test_network(self):
        model_path = self.args.save_dir + self.args.env_name + '/model.pt'
        self.actor_net.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        self.actor_net.eval()
        # start to test
        for _ in range(5):
            state = self.env.reset()
            reward_sum = 0
            while True:
                self.env.render()
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    actions = self.actor_net(state)
                actions = actions.detach().numpy()[0]
                state_, reward, done, _ = self.env.step(self.action_scale *
                                                        actions)
                reward_sum += reward
                if done:
                    break
                state = state_
            print('The reward of this episode is {}.'.format(reward_sum))
        self.env.close()
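
# `OUNoise` is used above (attributes `scale`, `reset()`, `noise()`) but is not part
# of this excerpt. A minimal Ornstein-Uhlenbeck noise sketch matching that interface;
# the theta/sigma defaults below are assumptions, not the original values:
import numpy as np


class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.scale = 1.0  # rescaled per episode by the training loop above
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), then scale the result
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state * self.scale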
Example #11
class MADDPGAgent(object):
    """Multi Agent DDPG Implementation

    Paper: https://arxiv.org/abs/1706.02275
    I used their code to understand how the agents were implemented
    https://github.com/openai/maddpg
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 agent_index,
                 writer,
                 random_seed,
                 dirname,
                 print_every=1000,
                 model_path=None,
                 saved_config=None,
                 eval_mode=False):
        """Initialize an Agent object.
        
        Parameters:    
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            agent_index (int): index (id) of current agent
            writer (object): visdom visualiser for realtime visualisations            
            random_seed (int): random seed
            dirname (string): output directory to store config, losses
            print_every (int): how often to print progress
            model_path (string): if defined, load saved model to resume training
            eval_mode (bool): whether to use eval mode
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.agent_index = agent_index
        self.writer = writer
        self.dirname = dirname
        self.print_every = print_every
        # save config params
        if not saved_config:
            self.config = CONFIG
            save_to_json(self.config,
                         '{}/hyperparams.json'.format(self.dirname))
        else:
            self.config = json.load(open(saved_config, 'r'))
            logger.info(
                'Loading config from saved location {}'.format(saved_config))

        # Create Critic network
        self.local_critic = Critic(self.state_size * num_agents,
                                   self.action_size * num_agents,
                                   random_seed,
                                   fc1_units=self.config['FC1'],
                                   fc2_units=self.config['FC2']).to(device)
        self.target_critic = Critic(self.state_size * num_agents,
                                    self.action_size * num_agents,
                                    random_seed,
                                    fc1_units=self.config['FC1'],
                                    fc2_units=self.config['FC2']).to(device)
        # Optimizer
        self.critic_optimizer = optim.Adam(
            self.local_critic.parameters(),
            lr=self.config['LR_CRITIC'],
            weight_decay=self.config['WEIGHT_DECAY'])

        # Create Actor network
        self.local_actor = Actor(self.state_size,
                                 self.action_size,
                                 random_seed,
                                 fc1_units=self.config['FC1'],
                                 fc2_units=self.config['FC2']).to(device)
        self.target_actor = Actor(self.state_size,
                                  self.action_size,
                                  random_seed,
                                  fc1_units=self.config['FC1'],
                                  fc2_units=self.config['FC2']).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(),
                                          lr=self.config['LR_ACTOR'])

        # Load saved model (if available)
        if model_path:
            logger.info('Loading model from {}'.format(model_path))
            self.local_actor.load_state_dict(
                torch.load('{}/checkpoint_actor_{}.pth'.format(
                    model_path, self.agent_index)))
            self.target_actor.load_state_dict(
                torch.load('{}/checkpoint_actor_{}.pth'.format(
                    model_path, self.agent_index)))
            self.local_critic.load_state_dict(
                torch.load('{}/checkpoint_critic_{}.pth'.format(
                    model_path, self.agent_index)))
            self.target_critic.load_state_dict(
                torch.load('{}/checkpoint_critic_{}.pth'.format(
                    model_path, self.agent_index)))
            if eval_mode:
                logger.info('agent {} set to eval mode'.format(self.agent_index))
                self.local_actor.eval()

        self.noise = OUNoise(self.action_size,
                             random_seed,
                             sigma=self.config['SIGMA'])
        self.learn_step = 0

    def act(self, state, add_noise=True, noise_weight=1):
        """Get the actions to take under the supplied states

        Parameters:
            state (array_like): Game state provided by the environment
            add_noise (bool): Whether we should apply the noise
            noise_weight (int): How much weight should be applied to the noise
        """
        state = torch.from_numpy(state).float().to(device)
        # Run inference in eval mode
        self.local_actor.eval()
        with torch.no_grad():
            action = self.local_actor(state).cpu().data.numpy()
        self.local_actor.train()
        # add noise if true
        if add_noise:
            action += self.noise.sample() * noise_weight
        return np.clip(action, -1, 1)

    def reset(self):
        """Resets the noise"""
        self.noise.reset()

    def learn(self, agents, experience, gamma):
        """Use the experience to allow agents to learn. 
        The critic of each agent can see the actions taken by all agents 
        and incorporate that in the learning.

        Parameters:
            agents (MADDPGAgent): instance of all the agents
            experience (Tuple[torch.Tensor]):  tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        num_agents = len(agents)
        states, actions, rewards, next_states, dones = experience
        # ---------------central critic-------------------
        # use target actor to get action, here we get target actors from
        # all agents to predict the next action
        next_actions = torch.zeros(
            (len(states), num_agents, self.action_size)).to(device)
        for i, agent in enumerate(agents):
            next_actions[:, i] = agent.target_actor(next_states[:, i, :])

        # Flatten state and action
        # e.g from state (100,2,24) --> (100, 48)
        critic_states = flatten(next_states)
        next_actions = flatten(next_actions)

        # calculate target and expected
        Q_targets_next = self.target_critic(critic_states, next_actions)
        Q_targets = rewards[:, self.agent_index, :] + (
            gamma * Q_targets_next * (1 - dones[:, self.agent_index, :]))
        Q_expected = self.local_critic(flatten(states), flatten(actions))

        # use mse loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss_value = critic_loss.item()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.config['CLIP_GRADS']:
            for param in self.local_critic.parameters():
                param.grad.data.clamp_(-1 * self.config['CLAMP_VALUE'],
                                       self.config['CLAMP_VALUE'])
        self.critic_optimizer.step()

        # ---------------actor---------------------
        # Only update the predicted action of current agent
        predicted_actions = torch.zeros(
            (len(states), num_agents, self.action_size)).to(device)
        predicted_actions.data.copy_(actions.data)
        predicted_actions[:, self.agent_index] = self.local_actor(
            states[:, self.agent_index])
        actor_loss = -self.local_critic(flatten(states),
                                        flatten(predicted_actions)).mean()
        # Kept to remind myself about the mistake that took several hours of investigation
        # and was only found when I looked at grads from self.local_actor.parameters()
        # actor_loss = -self.local_critic(flatten(states), flatten(actions)).mean()

        actor_loss_value = actor_loss.item()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if self.config['CLIP_GRADS']:
            for param in self.local_actor.parameters():
                # import pdb; pdb.set_trace()
                param.grad.data.clamp_(-1 * self.config['CLAMP_VALUE'],
                                       self.config['CLAMP_VALUE'])
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        if self.learn_step == 0:
            # One time only, start local and target with same parameters
            self._copy_weights(self.local_critic, self.target_critic)
            self._copy_weights(self.local_actor, self.target_actor)
        else:
            self.soft_update(self.local_critic, self.target_critic,
                             self.config["TAU"])
            self.soft_update(self.local_actor, self.target_actor,
                             self.config["TAU"])

        self.learn_step += 1
        return actor_loss_value, critic_loss_value

    def _copy_weights(self, source_network, target_network):
        """Copy source network weights to target"""
        for target_param, source_param in zip(target_network.parameters(),
                                              source_network.parameters()):
            target_param.data.copy_(source_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def checkpoint(self):
        """Checkpoint actor and critic models"""
        if not os.path.exists('{}/multi'.format(self.dirname)):
            os.makedirs('{}/multi'.format(self.dirname))
        torch.save(
            self.local_critic.state_dict(),
            '{}/multi/checkpoint_critic_{}.pth'.format(self.dirname,
                                                       self.agent_index))
        torch.save(
            self.local_actor.state_dict(),
            '{}/multi/checkpoint_actor_{}.pth'.format(self.dirname,
                                                      self.agent_index))
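
# The `flatten` helper used in `learn()` above is defined elsewhere. Based on the
# comment "from state (100, 2, 24) --> (100, 48)", an equivalent sketch simply
# merges the agent dimension into the feature dimension:
def flatten(tensor):
    """Reshape (batch, n_agents, dim) -> (batch, n_agents * dim) (assumed behaviour)."""
    return tensor.reshape(tensor.shape[0], -1)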
Example #12
File: training.py  Project: xamm/DRL_HFV
    if epoch % LOG_INTERVAL == 0:

        writer.add_scalar("Actor_loss:{}".format(agent_id),
                          actor_loss.detach().cpu().numpy(),
                          epoch + trainend_steps)
        writer.add_scalar("Critic_loss:{}".format(agent_id),
                          critic_loss.detach().cpu().numpy(),
                          epoch + trainend_steps)

    o, d, r, ep_ret, ep_len = env.reset(), False, 0, 0, 0

    #-----------------SAVE CHECKPOINT----------------
    if epoch % LOG_INTERVAL == 0:
        save_dict = {}
        save_dict['steps'] = epoch + trainend_steps
        save_dict[value_name] = value_func.state_dict()
        save_dict[crtc_op_name] = critic_optimizers.state_dict()
        for agent_id in range(N_VEHICLES):
            p_name = policy_name.replace("X", str(agent_id))
            a_opt = actr_op_name.replace('X', str(agent_id))
            save_dict[p_name] = policy[agent_id].state_dict()
            save_dict[a_opt] = actor_optimizers[agent_id].state_dict()
        if not os.path.isdir('weights'):
            os.mkdir('weights')
        torch.save(save_dict, save_dir)
#-----------------TEST TRAINED POLICY----------------
    if epoch % (LOG_INTERVAL * 2) == 0:
        o_t, d_t, r_t = env_test.reset(), False, 0
class DDPGAgent(Agent):
    """Interacts with and learns from the environment."""
    def __init__(self, idx, params):
        """Initialize an Agent object.
        
        Params
        ======
            params (dict-like): dictionary of parameters for the agent
        """
        super().__init__(params)

        self.idx = idx
        self.params = params
        self.update_every = params['update_every']
        self.gamma = params['gamma']
        self.num_agents = params['num_agents']
        self.name = "BATCH D4PG"
        # self.her = params['her']

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(params['actor_params']).to(device)
        self.actor_target = Actor(params['actor_params']).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['actor_params']['lr'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(params['critic_params']).to(device)
        self.critic_target = Critic(params['critic_params']).to(device)

        print("\n################ ACTOR ################\n")
        print(self.actor_local)

        print("\n################ CRITIC ################\n")
        print(self.critic_local)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=params['critic_params']['lr'],
            weight_decay=params['critic_params']['weight_decay'])

        # Noise process
        self.noise = OUNoise(self.params['noise_params'])

        # Replay memory
        self.memory = params['experience_replay']

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # next_state = torch.from_numpy(next_states[self.idx]).float().unsqueeze(0).to(device)
        # state = torch.from_numpy(states[self.idx]).float().unsqueeze(0).to(device)

        # # print("\nSTATE\n", state, "\nACTION\n", actions[self.idx], "\nREWARD\n", rewards[self.idx], "\nNEXT STATE\n", next_state, "\nDONE\n", dones[self.idx])
        # # Save experience / reward
        # self.memory.add(state.cpu(), actions[self.idx], rewards[self.idx], next_state.cpu(), dones[self.idx])

        next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(
            device)
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # print("\nSTATE\n", state, "\nACTION\n", action, "\nREWARD\n", reward, "\nNEXT STATE\n", next_state, "\nDONE\n", done)
        # Save experience / reward

        self.memory.add(state.cpu(), action, reward, next_state.cpu(), done)

    def step_her(self, agent_idx, timestep, state, action, reward, next_state,
                 done, goal):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(
            device)
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.memory.add_to_episode(agent_idx, timestep, state.cpu(), action,
                                   reward, next_state.cpu(), done, goal)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        # self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        # self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1., 1.)

    def reset(self):
        self.noise.reset()

    def learn(self):
        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        # print(self.t_step)
        # # self.t_step = (self.t_step + 1) % self.update_every
        # if self.t_step % self.update_every == 0:
        #     print("LEARNING", self.t_step)
        #     # If enough samples are available in memory, get random subset and learn
        #     if self.memory.ready():
        #         experiences = self.memory.sample()
        #         # print("################################## LEARN XP LENGTH",len(experiences))
        #         self.learn_(experiences)

        # If enough samples are available in memory, get random subset and learn
        if self.memory.ready():
            experiences = self.memory.sample()
            # print("################################## LEARN XP LENGTH",len(experiences))
            self.learn_(experiences)

    def learn_(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def add_param_noise(self, noise):
        """Adds noise to the weights of the agent"""
        with torch.no_grad():
            for param in self.actor_local.parameters():
                param.add_(torch.randn(param.size()).to(device) * noise)
            for param in self.critic_local.parameters():
                param.add_(torch.randn(param.size()).to(device) * noise)

    def save_agent(self, average_reward, episode, save_history=False):
        """Save the checkpoint"""
        checkpoint = {
            'actor_state_dict': self.actor_target.state_dict(),
            'critic_state_dict': self.critic_target.state_dict(),
            'average_reward': average_reward,
            'episode': episode
        }

        if not os.path.exists("checkpoints"):
            os.makedirs("checkpoints")

        filePath = os.path.join('checkpoints', self.name + '.pth')
        # print("\nSaving checkpoint\n")
        torch.save(checkpoint, filePath)

        if save_history:
            filePath = os.path.join(
                'checkpoints', self.name + '_' + str(episode) + '.pth')
            torch.save(checkpoint, filePath)

    def load_agent(self):
        """Load the checkpoint"""
        # print("\nLoading checkpoint\n")
        filePath = os.path.join('checkpoints', self.name + '.pth')

        if os.path.exists(filePath):
            checkpoint = torch.load(filePath,
                                    map_location=lambda storage, loc: storage)

            self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
            self.actor_target.load_state_dict(checkpoint['actor_state_dict'])
            self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
            self.critic_target.load_state_dict(checkpoint['critic_state_dict'])

            average_reward = checkpoint['average_reward']
            episode = checkpoint['episode']

            print(
                "Loading checkpoint - Average Reward {} at Episode {}".format(
                    average_reward, episode))
        else:
            print(
                "\nCannot find {} checkpoint... Proceeding to create fresh neural network\n"
                .format(self.name))
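
# A small stand-alone illustration of the parameter-space noise idea used in
# `add_param_noise()` above: perturb every weight of a (toy) network in place.
# The layer sizes and noise scale are made-up numbers for the example only.
import torch
import torch.nn as nn

toy_actor = nn.Linear(4, 2)
noise_scale = 0.01
with torch.no_grad():
    for param in toy_actor.parameters():
        param.add_(torch.randn_like(param) * noise_scale)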
class Actor_Crtic_Agent():
    def __init__(self,
                 name,
                 id,
                 device,
                 state_size,
                 action_size,
                 load_agent=False):

        self.device = device

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(RANDOM_SEED)
        self.name = name
        self.id = id

        # Hyperparameters
        self.gamma = GAMMA
        self.tau = TAU
        self.lr_actor = LR_ACTOR
        self.lr_critic = LR_CRITIC
        self.weight_decay = LEARNING_RATE_DECAY

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 RANDOM_SEED).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  RANDOM_SEED).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   RANDOM_SEED).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    RANDOM_SEED).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        if load_agent:
            self.load_agent(self.name)

        # Noise process
        self.noise = OUNoise(action_size, RANDOM_SEED)

    def step(self, state, action, reward, next_state, done, shared_memory):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Save experience / reward
        shared_memory.add(state, action, reward, next_state, done)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            noise = self.noise.sample()
            action += noise
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, shared_memory):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        indices, weights, experiences = shared_memory.sample()
        # if shared_memory.priority:
        #     states, actions, rewards, next_states, dones, indices = experiences
        # else:
        critic_losses = []
        actor_losses = []

        # print(weights)
        for experience, index, weight in zip(experiences, indices, weights):
            # print(index, weight)
            state, action, reward, next_state, done = experience

            state = torch.from_numpy(state).float().to(self.device)
            action = torch.from_numpy(action).float().to(self.device)
            # reward = torch.from_numpy(reward).float().to(self.device)
            next_state = torch.from_numpy(next_state).float().to(self.device)
            # done = torch.from_numpy(done).astype(np.uint8).float().to(self.device)

            # ---------------------------- update critic ---------------------------- #

            # Get predicted next-state actions and Q values from target models
            action_next = self.actor_target(next_state)
            Q_target_next = self.critic_target(next_state, action_next)

            # Compute Q target for current state (y_i)
            Q_target = reward + (self.gamma * Q_target_next * (1 - done))

            # Compute critic loss
            Q_expected = self.critic_local(state, action)
            critic_loss = F.mse_loss(Q_expected, Q_target)

            # Minimize the loss
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
            self.critic_optimizer.step()

            # ---------------------------- update actor ---------------------------- #

            # Compute actor loss
            actions_pred = self.actor_local(state)
            actor_loss = -self.critic_local(state, actions_pred).mean()

            # Minimize the loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            critic_losses.append(critic_loss.detach().cpu().numpy())
            actor_losses.append(actor_loss.detach().cpu().numpy())

            # ----------------------- update target networks ----------------------- #
            self.soft_update(self.critic_local, self.critic_target)
            self.soft_update(self.actor_local, self.actor_target)

        if shared_memory.priority:
            # prios = actor_loss.detach().cpu().numpy() * weights + 1e-5
            shared_memory.update(indices,
                                 np.ndarray.flatten(np.array(actor_losses)))

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        tau = self.tau
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_agent(self, fileName):
        """Save the checkpoint"""
        checkpoint = {
            'actor_state_dict': self.actor_target.state_dict(),
            'critic_state_dict': self.critic_target.state_dict(),
            'best_reward': self.best_reward
        }

        if not os.path.exists("checkpoints"):
            os.makedirs("checkpoints")

        filePath = os.path.join('checkpoints', fileName + '.pth')
        # print("\nSaving checkpoint\n")
        torch.save(checkpoint, filePath)

    def load_agent(self, fileName):
        """Load the checkpoint"""
        # print("\nLoading checkpoint\n")
        filePath = os.path.join('checkpoints', fileName + '.pth')

        if os.path.exists(filePath):
            checkpoint = torch.load(filePath,
                                    map_location=lambda storage, loc: storage)
            self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
            self.actor_target.load_state_dict(checkpoint['actor_state_dict'])
            self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
            self.critic_target.load_state_dict(checkpoint['critic_state_dict'])
            self.best_reward = checkpoint['best_reward']

            print("Loading checkpoint - Last Best Reward {:.2f} (%)".format(
                (np.exp(self.best_reward) - 1) * 100))
        else:
            print(
                "\nCannot find {} checkpoint... Proceeding to create fresh neural network\n"
                .format(fileName))
Example #15
class SingleDDPGAgent:
    """
        Single agent DDPG.
        Interacts with and learns from the environment.
    """
    def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            cfg (config object): main configuration with other passed settings
            num_agents (int): optional (default: 1). If >1, the state and action
                            space sizes used by the critic are multiplied accordingly. Used with MADDPG.
            agent_id (int): optional (default: 0). Set agent id for MADDPG.
        """
        print("Initializing single DDPG agent!")

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(cfg.random_seed)
        self.n_agents = num_agents
        self.agent_id = agent_id

        self.cfg = cfg

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                                 cfg.dense_layers_actor).to(device)
        self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                                  cfg.dense_layers_actor).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=cfg.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents, cfg.random_seed,
                                   cfg.dense_layers_critic).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents, cfg.random_seed,
                                    cfg.dense_layers_critic).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=cfg.lr_critic,
                                           weight_decay=cfg.weight_decay)

        self.hard_copy_weights(self.critic_local, self.critic_target)
        self.hard_copy_weights(self.actor_local, self.actor_target)

        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # Replay memory
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        max_prio = self.memory.get_max_priority()
        self.memory.add(state, action, reward, next_state, max_prio, done)

        # Learn, if enough samples are available in memory
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.cfg.update_every
        if self.t_step == 0:
            if len(self.memory) > self.cfg.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.cfg.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state.view(
                1, -1)).squeeze().cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def target_act(self, state):
        """ Let target network return action."""
        self.actor_target.eval()
        with torch.no_grad():
            action_target = self.actor_target(state)

        return np.clip(action_target, -1, 1)

    def reset(self):
        self.t_step = 0
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', prio, done, indices) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, priorities, dones, indices = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        if self.cfg.prioritized_replay:
            weights = 1. / (
                (self.cfg.batch_size * priorities)**self.cfg.priority_beta)
            weights /= max(weights)
            # calculating new transition priorities based on residuals
            # between target and local network predictions
            diffs = Q_targets - Q_expected  # TD-error
            diffs = np.abs(np.squeeze(diffs.tolist()))
            self.memory.update_prios(indices, diffs)
            # bias-annealing weights
            Q_expected *= weights
            Q_targets *= weights

        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.cfg.tau)
        self.soft_update(self.actor_local, self.actor_target, self.cfg.tau)

    @staticmethod
    def hard_copy_weights(local_model, target_model):
        """Update model parameters.

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self, model_save_path, suffix=""):
        """
        Simple method to save network weights.
        """
        # actors
        torch.save(
            self.actor_local.state_dict(),
            os.path.join(model_save_path,
                         "weights_actor_local{:s}.pth".format(suffix)))
        torch.save(
            self.actor_target.state_dict(),
            os.path.join(model_save_path,
                         "weights_actor_target{:s}.pth".format(suffix)))
        # critics
        torch.save(
            self.critic_local.state_dict(),
            os.path.join(model_save_path,
                         "weights_critic_local{:s}.pth".format(suffix)))
        torch.save(
            self.critic_target.state_dict(),
            os.path.join(model_save_path,
                         "weights_critic_target{:s}.pth".format(suffix)))

    def load_weights(self, model_save_path, suffix=""):
        """
        Method to load network weights from saved files.
        """
        self.actor_local.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_actor_local{:s}.pth".format(suffix))))
        self.actor_target.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_actor_target{:s}.pth".format(suffix))))

        self.critic_local.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_critic_local{:s}.pth".format(suffix))))
        self.critic_target.load_state_dict(
            torch.load(
                os.path.join(model_save_path,
                             "weights_critic_target{:s}.pth".format(suffix))))
Example #16
class MADDPGAgent():
    """Multi Agent DDPG Implementation.
    Algorithm: https://arxiv.org/abs/1706.02275.
    """
    def __init__(self,
                 num_agents: int,
                 state_size: int,
                 action_size: int,
                 agent_id: int,
                 writer: utils.VisdomWriter,
                 hparams: hp.HyperParams,
                 result_dir: str = 'results',
                 print_every=1000,
                 model_path=None,
                 saved_config=None,
                 eval_mode=False):
        """Initialize an Agent object
        Params
        =====
            num_agents: number of agents in the game
            state_size: dimension of the state for each agent
            action_size: dimension of the action space for each agent
            agent_id: id(index) for current agent
            writer: for realtime training visualization
            hparams: a set of hyper parameters
            result_dir: relative_path for saving artifacts
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.agent_id = agent_id
        self.writer = writer

        # random.seed(self.seed)
        self.seed = hparams.seed
        random.seed(self.seed)
        # param for critic loss calculation
        self.gamma = hparams.gamma
        # param for soft update
        self.tau = hparams.tau

        # learning rates
        self.lr_actor = hparams.lr_actor
        self.lr_critic = hparams.lr_critic
        # param for critic optimizer initialization
        self.weight_decay = hparams.weight_decay

        # Critic network
        self.critic_local = Critic(
            self.num_agents,
            self.state_size,
            self.action_size,
            self.seed,
            fcs1_units=hparams.critic_fcs1_units,
            fc2_units=hparams.critic_fc2_units).to(device)
        self.critic_target = Critic(
            self.num_agents,
            self.state_size,
            self.action_size,
            self.seed,
            fcs1_units=hparams.critic_fcs1_units,
            fc2_units=hparams.critic_fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        # Actor network
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.seed,
                                 fc1_units=hparams.actor_fc1_units,
                                 fc2_units=hparams.actor_fc2_units).to(device)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.seed,
                                  fc1_units=hparams.actor_fc1_units,
                                  fc2_units=hparams.actor_fc2_units).to(device)
        self.actor_optimzer = optim.Adam(self.actor_local.parameters(),
                                         lr=self.lr_actor)

        # Noise Process for action space exploration
        self.noise = noise.OUNoise(action_size, self.seed, sigma=hparams.sigma)

        # Replay buffer
        self.buffer_size = hparams.buffer_size
        self.batch_size = hparams.batch_size

        self.learn_step = 0
        self.result_dir = result_dir

    def act(self, state: np.ndarray, add_noise=True):
        """Returns actions for given states as per current policy
        Params:
            states: game states from environment
            add_noise: whether we should apply noise. True when in training, otherwise false
        Return:
            action clipped
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agents, experiences, gamma: float):
        """Update policy and value parameters using giving batch of experince tuples
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
    
        For MADDPG, each agent's critic uses the states and actions from all agents to reduce
        the effect of the non-stationary environment.
        Each agent draws its action based on its own state.

        Params:
        =====
            agents: MADDPGAgent objects
            experiences: sampled experiences from agents
            gamma:discount factor for Q-target calculation

        Return:
            critic_loss and actor_loss
        """

        # unpack
        states, actions, rewards, next_states, dones = experiences
        # ------------ update critic -----------------#
        # get predicted actions from all agents actor_target network
        next_pred_actions = torch.zeros(len(actions), self.num_agents,
                                        self.action_size).to(device)
        for i, agent in enumerate(agents):
            next_pred_actions[:, i] = agent.actor_target(next_states[:, i, :])

        # flatten states and actions to produce inputs to Critic network
        critic_next_states = utils.flatten(next_states)
        next_pred_actions = utils.flatten(next_pred_actions)

        Q_targets_next = self.critic_target(critic_next_states,
                                            next_pred_actions)

        # compute the target Q-value, update current agent only
        Q_targets = rewards[:, self.agent_id, :] + gamma * Q_targets_next * (
            1 - dones[:, self.agent_id, :])

        # use local network to get expected Q-value and calculate loss
        critic_states = utils.flatten(states)
        critic_actions = utils.flatten(actions)
        Q_expected = self.critic_local(critic_states, critic_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # clip the gradient, how to decide the max_norm?
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ------------ update actor -----------------#
        # compute actor loss
        pred_actions = torch.zeros(len(actions), self.num_agents,
                                   self.action_size)
        pred_actions.data.copy_(actions.data)
        # update action for this agent only !
        pred_actions[:,
                     self.agent_id] = self.actor_local(states[:,
                                                              self.agent_id])
        critic_states = utils.flatten(states)
        critic_pred_actions = utils.flatten(pred_actions)
        actor_loss = -self.critic_local(critic_states,
                                        critic_pred_actions).mean()
        self.actor_optimzer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimzer.step()

        # ------------ update target networks --------#
        if self.learn_step == 0:
            utils.hard_update(self.actor_local, self.actor_target)
            utils.hard_update(self.critic_local, self.critic_target)
        else:
            utils.soft_update(self.actor_local, self.actor_target, self.tau)
            utils.soft_update(self.critic_local, self.critic_target, self.tau)

        self.learn_step += 1
        # return the losses
        return actor_loss.item(), critic_loss.item()

    def check_point(self):
        """
        Save model checkpoints and configurations
        """
        critic_pth = os.path.join(self.result_dir,
                                  f'checkpoint_critic_{self.agent_id}.pth')
        actor_pth = os.path.join(self.result_dir,
                                 f'checkpoint_actor_{self.agent_id}.pth')

        torch.save(self.actor_local.state_dict(), actor_pth)
        torch.save(self.critic_local.state_dict(), critic_pth)
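
# A small illustration of the "update this agent's action only" trick used in the
# actor update above: copy the sampled joint actions, then overwrite just this
# agent's slice with the local actor's output. The shapes below are made up.
import torch

batch, n_agents, action_size, agent_id = 4, 2, 2, 0
actions = torch.rand(batch, n_agents, action_size)  # sampled joint actions
new_actions = torch.rand(batch, action_size)  # stand-in for actor_local(states[:, agent_id])
pred_actions = torch.zeros(batch, n_agents, action_size)
pred_actions.data.copy_(actions.data)
pred_actions[:, agent_id] = new_actions  # only this agent's column changes
assert torch.equal(pred_actions[:, 1], actions[:, 1])  # the other agent is untouched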
Example #17
class DDPG(object):

    def __init__(self, gamma, tau,num_inputs, env,device, results_path=None):

        self.gamma = gamma
        self.tau = tau
        self.min_action,self.max_action = env.action_range()
        self.device = device
        self.num_actions = env.action_space()
        self.noise_stddev = 0.3

        self.results_path = results_path
        self.checkpoint_path = os.path.join(self.results_path, 'checkpoint/')
        os.makedirs(self.checkpoint_path, exist_ok=True)

        # Define the actor
        self.actor = Actor(num_inputs, self.num_actions).to(device)
        self.actor_target = Actor(num_inputs, self.num_actions).to(device)

        # Define the critic
        self.critic = Critic(num_inputs, self.num_actions).to(device)
        self.critic_target = Critic(num_inputs, self.num_actions).to(device)

        # Define the optimizers for both networks
        self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4)  # optimizer for the actor network
        self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-4, weight_decay=0.002)  # optimizer for the critic network

        self.hard_swap()

        self.ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.num_actions),
                                            sigma=float(self.noise_stddev) * np.ones(self.num_actions))
        self.ou_noise.reset()

    def eval_mode(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic.eval()

    def train_mode(self):
        self.actor.train()
        self.actor_target.train()
        self.critic_target.train()
        self.critic.train()


    def get_action(self, state, episode, action_noise=True):
        x = state.to(self.device)

        # Get the continuous action value to perform in the env
        self.actor.eval()  # Sets the actor in evaluation mode
        mu = self.actor(x)
        self.actor.train()  # Sets the actor in training mode
        mu = mu.data

        # During training we add noise for exploration
        if action_noise:
            noise = torch.Tensor(self.ou_noise.noise()).to(self.device) * 1.0/(1.0 + 0.1*episode)
            noise = noise.clamp(0, 0.1)
            mu = mu + noise  # add decayed, clamped OU noise for exploration (Spinning Up notes plain Gaussian noise works just as well: https://spinningup.openai.com/en/latest/algorithms/ddpg.html)

        # Clip the output according to the action space of the env
        mu = mu.clamp(self.min_action, self.max_action)

        return mu

    def update_params(self, batch):
        # Get tensors from the batch
        state_batch = torch.cat(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        done_batch = torch.cat(batch.done).to(self.device)
        next_state_batch = torch.cat(batch.next_state).to(self.device)

        # Get the actions and the state values to compute the targets
        next_action_batch = self.actor_target(next_state_batch)
        next_state_action_values = self.critic_target(next_state_batch, next_action_batch.detach())

        # Compute the target
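        # TD target: y = r + (1 - done) * gamma * Q'(s', mu'(s'))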
        reward_batch = reward_batch.unsqueeze(1)
        done_batch = done_batch.unsqueeze(1)
        expected_values = reward_batch + (1.0 - done_batch) * self.gamma * next_state_action_values

        # Update the critic network
        self.critic_optimizer.zero_grad()
        state_action_batch = self.critic(state_batch, action_batch)
        value_loss = F.mse_loss(state_action_batch, expected_values.detach())
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor network
        self.actor_optimizer.zero_grad()
        policy_loss = -self.critic(state_batch, self.actor(state_batch))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
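        # clamp each actor gradient element to [-1, 1] to limit the size of the policy update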
        for param in self.actor.parameters():
                param.grad.data.clamp_(-1, 1)
        self.actor_optimizer.step()

        # Update the target networks
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.item(), policy_loss.item()
    
    def hard_swap(self):
        # Make sure both targets are with the same weight
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

    def store_model(self):
        print("Storing model at: ", self.checkpoint_path)
        checkpoint = {
            'actor': self.actor.state_dict(),
            'actor_optim': self.actor_optimizer.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_optim': self.critic_optimizer.state_dict()
        }
        torch.save(checkpoint, os.path.join(self.checkpoint_path, 'checkpoint.pth') )

    def load_model(self):
        files = os.listdir(self.checkpoint_path)
        if files:
            print("Loading models checkpoints!")
            model_dicts = torch.load(os.path.join(self.checkpoint_path, 'checkpoint.pth'),map_location=self.device)
            self.actor.load_state_dict(model_dicts['actor'])
            self.actor_optimizer.load_state_dict(model_dicts['actor_optim'])
            self.critic.load_state_dict(model_dicts['critic'])
            self.critic_optimizer.load_state_dict(model_dicts['critic_optim'])
        else:
            print("Checkpoints not found!")
Example #18
class TD3Agent:
    """
    Encapsulates the functioning of the TD3 agent
    """
    def __init__(self,
                 state_dim,
                 action_dim,
                 max_action,
                 device,
                 memory_capacity=10000,
                 discount=0.99,
                 update_freq=2,
                 tau=0.005,
                 policy_noise_std=0.2,
                 policy_noise_clip=0.5,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 train_mode=True):
        self.train_mode = train_mode  # whether the agent is in training or testing mode

        self.state_dim = state_dim  # dimension of the state space
        self.action_dim = action_dim  # dimension of the action space

        self.device = device  # defines which cuda or cpu device is to be used to run the networks
        self.discount = discount  # denoted as gamma in the equation for computing the Q-value
        self.update_freq = update_freq  # defines how frequently should the actor and target be updated
        self.tau = tau  # defines the factor used for Polyak averaging (i.e., soft updating of the target networks)
        self.max_action = max_action  # the max value of the range in the action space (assumes a symmetric range in the action space)
        self.policy_noise_clip = policy_noise_clip  # max range within which the noise for the target policy smoothing must be contained
        self.policy_noise_std = policy_noise_std  # standard deviation, i.e. sigma, of the Gaussian noise applied for target policy smoothing

        # create an instance of the replay buffer
        self.memory = ReplayMemory(memory_capacity)

        # instances of the networks for the actor and the two critics
        self.actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.critic = Critic(
            state_dim, action_dim, critic_lr
        )  # the critic class encapsulates two copies of the neural network for the two critics used in TD3

        # instance of the target networks for the actor and the two critics
        self.target_actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.target_critic = Critic(state_dim, action_dim, critic_lr)

        # initialise the targets to the same weight as their corresponding current networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # since we do not learn/train on the target networks
        self.target_actor.eval()
        self.target_critic.eval()

        # for test mode
        if not self.train_mode:
            self.actor.eval()
            self.critic.eval()

        self.actor.to(self.device)
        self.critic.to(self.device)
        self.target_actor.to(self.device)
        self.target_critic.to(self.device)

    def select_action(self, state, exploration_noise=0.1):
        """
        Function to return the appropriate action for the given state.
        During training, it adds zero-mean Gaussian noise with std=exploration_noise to the action to encourage exploration.
        No noise is added to the action decision during testing mode.

        Parameters
        ---
        state: vector or tensor
            The current state of the environment as observed by the agent
        exploration_noise: float, optional
            Standard deviation, i.e. sigma, of the Gaussian noise to be added to the agent's action to encourage exploration

        Returns
        ---
        A numpy array representing the noisy action to be performed by the agent in the current state
        """

        if not torch.is_tensor(state):
            state = torch.tensor([state], dtype=torch.float32).to(self.device)

        act = self.actor(state).cpu().data.numpy().flatten(
        )  # performs inference using the actor based on the current state as the input and returns the corresponding np array

        if not self.train_mode:
            exploration_noise = 0.0  # since we do not need noise to be added to the action during testing

        noise = np.random.normal(
            0.0, exploration_noise, size=act.shape
        )  # generate the zero-mean gaussian noise with standard deviation determined by exploration_noise

        noisy_action = act + noise
        noisy_action = noisy_action.clip(
            min=-self.max_action, max=self.max_action
        )  # to ensure that the noisy action being returned is within the limit of "legal" actions afforded to the agent; assumes action range is symmetric

        return noisy_action

    def learn(self, current_iteration, batchsize):
        """
        Function to perform the updates on the 6 neural networks that run the TD3 algorithm.

        Parameters
        ---
        current_iteration: int
            Total number of steps that have been performed by the agent
        batchsize: int
            Number of experiences to be randomly sampled from the memory for the agent to learn from

        Returns
        ---
        none
        """

        if len(self.memory) < batchsize:
            return
        states, actions, next_states, rewards, dones = self.memory.sample(
            batchsize, self.device
        )  # a batch of experiences randomly sampled from the memory

        # ensure that the actions and rewards tensors have the appropriate shapes
        actions = actions.view(-1, self.action_dim)
        rewards = rewards.view(-1, 1)

        # generate noisy target actions for target policy smoothing
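        # target policy smoothing: a~ = clip(mu'(s') + clip(eps, -c, c), -max_action, max_action),
        # with eps ~ N(0, policy_noise_std) and c = policy_noise_clip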
        pred_action = self.target_actor(next_states)
        noise = torch.zeros_like(pred_action).normal_(
            0, self.policy_noise_std).to(self.device)
        noise = torch.clamp(noise,
                            min=-self.policy_noise_clip,
                            max=self.policy_noise_clip)
        noisy_pred_action = torch.clamp(pred_action + noise,
                                        min=-self.max_action,
                                        max=self.max_action)

        # calculate TD-Target using Clipped Double Q-learning
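        # y = r + gamma * min(Q1'(s', a~), Q2'(s', a~)), with the bootstrap term zeroed at terminal states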
        target_q1, target_q2 = self.target_critic(next_states,
                                                  noisy_pred_action)
        target_q = torch.min(target_q1, target_q2)
        target_q[
            dones] = 0.0  # being in a terminal state implies there are no more future states that the agent would encounter in the given episode and so set the associated Q-value to 0
        y = rewards + self.discount * target_q

        current_q1, current_q2 = self.critic(
            states, actions
        )  # the critic class encapsulates two copies of the neural network thereby returning two Q values with each forward pass

        critic_loss = F.mse_loss(current_q1, y) + F.mse_loss(
            current_q2, y
        )  # the losses of the two critics need to be added as there is only one optimiser shared between the two networks
        critic_loss = critic_loss.mean()

        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # delayed policy and target updates
        if current_iteration % self.update_freq == 0:

            # actor loss is calculated by gradient ascent along critic 1, so the negative sign converts it into gradient descent
            pred_current_actions = self.actor(states)
            pred_current_q1, _ = self.critic(
                states, pred_current_actions
            )  # since we only need the Q-value from critic 1, we can ignore the second value obtained through the forward pass

            actor_loss = -pred_current_q1.mean()

            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            # apply slow-update to all three target networks
            self.soft_update_targets()

    def soft_update_net(self, source_net_params, target_net_params):
        """
        Function to perform Polyak averaging to update the parameters of the provided network

        Parameters
        ---
        source_net_params: list
            trainable parameters of the source, ie. current version of the network
        target_net_params: list
            trainable parameters of the corresponding target network

        Returns
        ---
        none
        """

        for source_param, target_param in zip(source_net_params,
                                              target_net_params):
            target_param.data.copy_(self.tau * source_param.data +
                                    (1 - self.tau) * target_param.data)

    def soft_update_targets(self):
        """
        Function that calls Polyak averaging on all three target networks

        Parameters
        ---
        none

        Returns
        ---
        none
        """

        self.soft_update_net(self.actor.parameters(),
                             self.target_actor.parameters())
        self.soft_update_net(self.critic.parameters(),
                             self.target_critic.parameters())

    def save(self, path, model_name):
        """
        Function to save the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is to be saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """

        self.actor.save_model('{}/{}_actor'.format(path, model_name))
        self.critic.save_model('{}/{}_critic'.format(path, model_name))

    def load(self, model_name):
        """
        Function to load the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """

        self.actor.load_model('{}/{}_actor'.format(path, model_name))
        self.critic.load_model('{}/{}_critic'.format(path, model_name))
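

# --- Illustrative sketch (not part of the original listing) ---
# A minimal, runnable example of the TD3 target computation used by the agents above
# (target policy smoothing + clipped double Q-learning) on toy tensors; the batch size,
# hyper-parameter values and random stand-ins below are placeholders, assuming only torch.
import torch

batch, gamma, sigma, noise_clip, max_action = 4, 0.99, 0.2, 0.5, 1.0
rewards = torch.rand(batch, 1)
dones = torch.zeros(batch, 1)                       # 1.0 would mark a terminal transition
next_action = torch.rand(batch, 1) * 2 - 1          # stand-in for target_actor(next_states)

# target policy smoothing: clipped Gaussian noise added to the target action
noise = torch.clamp(torch.randn_like(next_action) * sigma, -noise_clip, noise_clip)
smoothed_action = torch.clamp(next_action + noise, -max_action, max_action)

# clipped double Q-learning: stand-ins for the two target critics evaluated at (s', a~)
q1, q2 = torch.rand(batch, 1), torch.rand(batch, 1)
y = rewards + gamma * (1.0 - dones) * torch.min(q1, q2)   # TD target
# --- end of sketch ---
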
class TD31v1(object):
    """ TD3 plus the Ensemble of Critics as an agent object
    to act and update the networkweights, save and laod the weights
    """
    def __init__(self, state_dim, action_dim, max_action, args):
        self.actor = Actor(state_dim, action_dim, max_action).to(args.device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(args.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(args.device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.list_target_critic = []
        # create the ensemble of delayed target critics
        for c in range(args.num_q_target):
            critic_target = Critic(state_dim, action_dim).to(args.device)
            critic_target.load_state_dict(self.critic.state_dict())
            self.list_target_critic.append(critic_target)
        
        self.target_critic = Critic(state_dim, action_dim).to(args.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.max_action = max_action
        self.num_q_target = args.num_q_target
        self.batch_size = args.batch_size
        self.discount = args.discount
        self.tau = args.tau 
        self.policy_noise = args.policy_noise
        self.noise_clip = args.noise_clip
        self.policy_freq = args.policy_freq
        self.device = args.device
        self.update_counter = 0
        self.step = 0 
        self.currentQNet = 0
    
    def select_action(self, state):
        state = torch.Tensor(state.reshape(1, -1)).to(self.device)
        return self.actor(state).cpu().data.numpy().flatten()
    
    
    def train(self, replay_buffer, writer, iterations):
        """ Update function for the networkweights of the Actor and Critis 
            current and Target by useing the 3 new features of the TD3 paper
            to the DDPG implementation
            1. Delay the policy Updates 
            2. Two crtitc networks take the min Q value
            3. Target Policy Smoothing
            Own use an Ensemble Approach of delayed updated critics 
        
        """
        self.step += 1
        
        for it in range(iterations):
            
            # Step 1: Sample a batch of transitions (s, s’, a, r) from the memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(self.batch_size)
            # convert the numpy arrays to tensors object 
            # if cuda is available send data to gpu
            state = torch.Tensor(batch_states).to(self.device)
            next_state = torch.Tensor(batch_next_states).to(self.device)
            action = torch.Tensor(batch_actions).to(self.device)
            reward = torch.Tensor(batch_rewards).to(self.device)
            done = torch.Tensor(batch_dones).to(self.device)
            
            # Step 2: use the target actor to create the action of the next
            # state (part of the TD target)
            next_action = self.actor_target(next_state)
            
            # Step 3: add clipped Gaussian noise to the target action (target policy
            # smoothing) and clip the result to the valid action range
            noise = torch.Tensor(batch_actions).data.normal_(0, self.policy_noise).to(self.device)
            noise = noise.clamp(-self.noise_clip, self.noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
            
            # Step 4: use the different target critics (delayed updates) and the
            # min over the two critics from TD3 to create the individual Q targets,
            # then average them into a single target value
            target_Q = 0
            for critic in self.list_target_critic:
                target_Q1, target_Q2 = critic(next_state, next_action) 
                target_Q += torch.min(target_Q1, target_Q2)
            
            target_Q *= 1./ self.num_q_target  
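            # i.e. target_Q = (1/K) * sum_k min(Q1_k(s', a~), Q2_k(s', a~)), with K = num_q_target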
           
            # Step 5: create the TD target based on the Bellman equation
            target_Q = reward + ((1 - done) * self.discount * target_Q).detach()
            
            # Step 6: use the critic to compute the Q estimates for the current state and action
            current_Q1, current_Q2 = self.critic(state, action) 
            
            # Step 7: compute the critic loss with the mean squared error
            # loss function
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
            writer.add_scalar('critic_loss', critic_loss , self.step)
            

            # Step 8: backpropagate the critic loss and update the parameters
            # of the two critic networks with the Adam optimizer
            
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()
            
            # Step 9: delayed update of the actor model
            if it % self.policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                writer.add_scalar('actor_loss', actor_loss , self.step)
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()
                
                # Step 10: update the weights of the actor and critic targets by Polyak
                # averaging; the hyperparameter tau determines how current and
                # target weights are mixed
                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
                
                for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
                    target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
     
                
    def hardupdate(self):
        """ for the critic ensembles """
        
        self.update_counter +=1
        self.currentQNet = self.update_counter % self.num_q_target
        # Step 11: every n steps, overwrite the weights of the selected ensemble member with the current target critic
        for target_param, param in zip(self.target_critic.parameters(), self.list_target_critic[self.currentQNet].parameters()):
            param.data.copy_(target_param.data)
    
    # Making a save method to save a trained model
    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
    
    # Making a load method to load a pre-trained model
    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
Example #20
class DDPGAgent:
    """
    Encapsulates the functioning of the DDPG agent
    """

    def __init__(self, state_dim, action_dim, max_action, device, memory_capacity=10000, discount=0.99, tau=0.005, sigma=0.2, theta=0.15, actor_lr=1e-4, critic_lr=1e-3, train_mode=True):
        self.train_mode = train_mode # whether the agent is in training or testing mode

        self.state_dim = state_dim # dimension of the state space
        self.action_dim = action_dim # dimension of the action space
        
        self.device = device # defines which cuda or cpu device is to be used to run the networks
        self.discount = discount # denoted as gamma in the equation for computing the Q-value
        self.tau = tau # defines the factor used for Polyak averaging (i.e., soft updating of the target networks)
        self.max_action = max_action # the max value of the range in the action space (assumes a symmetric range in the action space)
        
        # create an instance of the replay buffer
        self.memory = ReplayMemory(memory_capacity)

        # create an instance of the noise generating process
        self.ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(self.action_dim), sigma=sigma, theta=theta)

        # instances of the networks for the actor and the critic
        self.actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.critic = Critic(state_dim, action_dim, critic_lr)

        # instance of the target networks for the actor and the critic
        self.target_actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.target_critic = Critic(state_dim, action_dim, critic_lr)

        # initialise the targets to the same weight as their corresponding current networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # since we do not learn/train on the target networks
        self.target_actor.eval()
        self.target_critic.eval()

        # for test mode
        if not self.train_mode:
            self.actor.eval()
            self.critic.eval()
            self.ou_noise = None  # no exploration noise is needed during testing

        self.actor.to(self.device)
        self.critic.to(self.device)

        self.target_actor.to(self.device)
        self.target_critic.to(self.device)

    def select_action(self, state):
        """
        Function to return the appropriate action for the given state.
        During training, it adds a zero-mean OU noise to the action to encourage exploration.
        During testing, no noise is added to the action decision.

        Parameters
        ---
        state: vector or tensor
            The current state of the environment as observed by the agent
        
        Returns
        ---
        A numpy array representing the noisy action to be performed by the agent in the current state
        """

        if not torch.is_tensor(state):
            state = torch.tensor([state], dtype=torch.float32).to(self.device)
        
        self.actor.eval()
        act = self.actor(state).cpu().data.numpy().flatten() # performs inference using the actor based on the current state as the input and returns the corresponding np array
        self.actor.train()

        noise = 0.0

        ## for adding Gaussian noise (to use, update the code pass the exploration noise as input)
        #if self.train_mode:
        #	noise = np.random.normal(0.0, exploration_noise, size=act.shape) # generate the zero-mean gaussian noise with standard deviation determined by exploration_noise

        # for adding OU noise
        if self.train_mode:
            noise = self.ou_noise.generate_noise()

        noisy_action = act + noise
        noisy_action = noisy_action.clip(min=-self.max_action, max=self.max_action) # to ensure that the noisy action being returned is within the limit of "legal" actions afforded to the agent; assumes action range is symmetric

        return noisy_action

    def learn(self, batchsize):
        """
        Function to perform the updates on the 4 neural networks that run the DDPG algorithm.

        Parameters
        ---
        batchsize: int
            Number of experiences to be randomly sampled from the memory for the agent to learn from

        Returns
        ---
        none
        """

        if len(self.memory) < batchsize:
            return
        states, actions, next_states, rewards, dones = self.memory.sample(batchsize, self.device) # a batch of experiences randomly sampled from the memory

        # ensure that the actions and rewards tensors have the appropriate shapes
        actions = actions.view(-1, self.action_dim) 
        rewards = rewards.view(-1, 1)

        with torch.no_grad():
            # generate target actions
            target_action = self.target_actor(next_states)

            # calculate TD-Target
            target_q = self.target_critic(next_states, target_action)
            target_q[dones] = 0.0 # being in a terminal state implies there are no more future states that the agent would encounter in the given episode and so set the associated Q-value to 0
            y = rewards + self.discount * target_q

        current_q = self.critic(states, actions)
        critic_loss = F.mse_loss(current_q, y).mean()
        
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # actor loss is calculated by gradient ascent along the critic, so the negative sign converts it into gradient descent
        pred_current_actions = self.actor(states)
        pred_current_q = self.critic(states, pred_current_actions)
        actor_loss = - pred_current_q.mean()

        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # apply slow-update to the target networks
        self.soft_update_targets()


    def soft_update_net(self, source_net_params, target_net_params):
        """
        Function to perform Polyak averaging to update the parameters of the provided network

        Parameters
        ---
        source_net_params: list
            trainable parameters of the source, ie. current version of the network
        target_net_params: list
            trainable parameters of the corresponding target network

        Returns
        ---
        none
        """

        for source_param, target_param in zip(source_net_params, target_net_params):
            target_param.data.copy_(self.tau * source_param.data + (1 - self.tau) * target_param.data)

    def soft_update_targets(self):
        """
        Function that calls Polyak averaging on both target networks

        Parameters
        ---
        none

        Returns
        ---
        none
        """

        self.soft_update_net(self.actor.parameters(), self.target_actor.parameters())
        self.soft_update_net(self.critic.parameters(), self.target_critic.parameters())

    def save(self, path, model_name):
        """
        Function to save the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is to be saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """

        self.actor.save_model('{}/{}_actor'.format(path, model_name))
        self.critic.save_model('{}/{}_critic'.format(path, model_name))

    def load(self, path, model_name):
        """
        Function to load the actor and critic networks

        Parameters
        ---
        path: str
            Location where the model is saved
        model_name: str
            Name of the model

        Returns
        ---
        none
        """

        self.actor.load_model('{}/{}_actor'.format(path, model_name))
        self.critic.load_model('{}/{}_critic'.format(path, model_name))
Example #21
class AgentDDPG:
    """Deep Deterministic Policy Gradient implementation for continuous action space reinforcement learning tasks"""
    def __init__(self,
                 state_size,
                 hidden_size,
                 action_size,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 gamma=0.99,
                 tau=1e-2,
                 use_cuda=False,
                 actor_path=None,
                 critic_path=None):
        # Params
        self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size
        self.gamma, self.tau = gamma, tau
        self.use_cuda = use_cuda

        # Networks
        self.actor = Actor(state_size, hidden_size, action_size)
        self.actor_target = Actor(state_size, hidden_size, action_size)

        self.critic = Critic(state_size + action_size, hidden_size,
                             action_size)
        self.critic_target = Critic(state_size + action_size, hidden_size,
                                    action_size)

        # Load model state_dicts from saved file
        if actor_path and path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))

        if critic_path and path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

        # Hard copy params from original networks to target networks
        copy_params(self.actor, self.actor_target)
        copy_params(self.critic, self.critic_target)

        if self.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Create replay buffer for storing experience
        self.replay_buffer = ReplayBuffer(cache_size=int(1e6))

        # Training
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)

    def save_to_file(self, actor_file, critic_file):
        # Save the state_dict's of the Actor and Critic networks
        torch.save(self.actor.state_dict(), actor_file)
        torch.save(self.critic.state_dict(), critic_file)

    def get_action(self, state):
        """Select action with respect to state according to current policy and exploration noise"""
        state = Variable(torch.from_numpy(state).float())

        if self.use_cuda:
            state = state.cuda()

        a = self.actor.forward(state)

        if self.use_cuda:
            return a.detach().cpu().numpy()

        return a.detach().numpy()

    def save_experience(self, state_t, action_t, reward_t, state_t1):
        self.replay_buffer.add_sample(state_t, action_t, reward_t, state_t1)

    def update(self, batch_size):
        states, actions, rewards, next_states = self.replay_buffer.get_samples(
            batch_size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)

        if self.use_cuda:
            states = states.cuda()
            next_states = next_states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()

        # Critic loss
        Qvals = self.critic.forward(states, actions)
        next_actions = self.actor_target.forward(next_states)
        next_Q = self.critic_target.forward(next_states, next_actions.detach())
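        # TD target for the critic: Qprime = r + gamma * Q'(s', mu'(s'));
        # note this variant does not mask terminal transitions with (1 - done)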
        Qprime = rewards + self.gamma * next_Q
        critic_loss = self.critic_criterion(Qvals, Qprime)

        # Update critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        policy_loss = -self.critic.forward(states,
                                           self.actor.forward(states)).mean()

        # Update actor
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        soft_copy_params(self.actor, self.actor_target, self.tau)
        soft_copy_params(self.critic, self.critic_target, self.tau)

    def add_noise_to_weights(self, amount=0.1):
        self.actor.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.actor_target.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic_target.apply(
            lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
Example #22
class Agent:
    def __init__(self,env, env_params, args, models=None, record_episodes=[0,.1,.25,.5,.75,1.]):
        self.env= env
        self.env_params = env_params
        self.args = args


        # networks
        if models == None:
                self.actor = Actor(self.env_params).double()
                self.critic = Critic(self.env_params).double()
        else:
                self.actor , self.critic = self.LoadModels()
        # target networks, used to compute stable TD targets
        self.actor_target = Actor(self.env_params,).double()
        self.critic_target = Critic(self.env_params).double()

        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        if self.args.cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()


        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.001)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=0.001)

        self.normalize = Normalizer(env_params,self.args.gamma)
        self.buffer = ReplayBuffer(1_000_000, self.env_params)
        self.tensorboard = ModifiedTensorBoard(log_dir = f"logs")
        self.record_episodes = [int(eps * self.args.n_epochs) for eps in record_episodes]

    def ModelsEval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def ModelsTrain(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def GreedyAction(self, state):
        self.ModelsEval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0)
            if self.args.cuda:
                state = state.cuda()
            action = self.actor.forward(state).detach().cpu().numpy().squeeze()
        return action

    def NoiseAction(self, state):
        self.ModelsEval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0)
            if self.args.cuda:
                state = state.cuda()
            action = self.actor.forward(state).detach().cpu().numpy()
            action += self.args.noise_eps * self.env_params['max_action'] * np.random.randn(*action.shape)
            action = np.clip(action, -self.env_params['max_action'], self.env_params['max_action'])
        return action.squeeze()

    def Update(self):
        self.ModelsTrain()
        for i in range(self.args.n_batch):
            state, a_batch, r_batch, nextstate, d_batch = self.buffer.SampleBuffer(self.args.batch_size)
            a_batch = torch.tensor(a_batch,dtype=torch.double)
            r_batch = torch.tensor(r_batch,dtype=torch.double)
            # d_batch = torch.tensor(d_batch,dtype=torch.double)
            state = torch.tensor(state,dtype=torch.double)
            nextstate = torch.tensor(nextstate,dtype=torch.double)
            # d_batch = 1 - d_batch

            if self.args.cuda:
                a_batch = a_batch.cuda()
                r_batch = r_batch.cuda()
                # d_batch = d_batch.cuda()
                state = state.cuda()
                nextstate = nextstate.cuda()

            with torch.no_grad():
                action_next = self.actor_target.forward(nextstate)
                q_next = self.critic_target.forward(nextstate,action_next)
                q_next = q_next.detach().squeeze()
                q_target = r_batch + self.args.gamma * q_next
                q_target = q_target.detach().squeeze()

            q_prime = self.critic.forward(state, a_batch).squeeze()
            critic_loss = F.mse_loss(q_target, q_prime)

            action = self.actor.forward(state)
            actor_loss = -self.critic.forward(state, action).mean()
            # params = torch.cat([x.view(-1) for x in self.actor.parameters()])
            # l2_reg = self.args.l2_norm *torch.norm(params,2)
            # actor_loss += l2_reg

            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            self.critic_optim.zero_grad()
            critic_loss.backward()
            self.critic_optim.step()

        self.SoftUpdateTarget(self.critic, self.critic_target)
        self.SoftUpdateTarget(self.actor, self.actor_target)

    def Explore(self):
        for epoch in range(self.args.n_epochs +1):
            start_time = time.process_time()
            for cycle in range(self.args.n_cycles):
                for _ in range(self.args.num_rollouts_per_mpi):
                    state = self.env.reset()
                    for t in range(self.env_params['max_timesteps']):
                        action = self.NoiseAction(state)
                        nextstate, reward, done, info = self.env.step([action])
                        nextstate = nextstate.squeeze()
                        reward = self.normalize.normalize_reward(reward)
                        self.buffer.StoreTransition(state, action, reward, nextstate, done)
                        state = nextstate
                    self.Update()
            avg_reward = self.Evaluate()
            self.tensorboard.step = epoch
            elapsed_time = time.process_time() - start_time
            print(f"Epoch {epoch} of total of {self.args.n_epochs +1} epochs, average reward is: {avg_reward}.\
                    Elapsedtime: {int(elapsed_time /60)} minutes {int(elapsed_time %60)} seconds")
            if epoch % 5 or epoch + 1 == self.args.n_epochs:
                self.SaveModels(epoch)
                self.record(epoch)


    def Evaluate(self):
        self.ModelsEval()
        total_reward = []
        episode_reward = 0
        succes_rate = []
        for episode in range(self.args.n_evaluate):
            state = self.env.reset()
            episode_reward = 0
            for t in range(self.env_params['max_timesteps']):
                action = self.GreedyAction(state)
                nextstate, reward, done, info = self.env.step([action])
                episode_reward += reward
                state = nextstate
                if done or t + 1 == self.env_params['max_timesteps']:
                    total_reward.append(episode_reward)
                    episode_reward = 0

        average_reward = sum(total_reward)/len(total_reward)
        min_reward = min(total_reward)
        max_reward = max(total_reward)
        self.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward)
        return average_reward

    def record(self, epoch):
        self.ModelsEval()
        try:
            if not os.path.exists("videos"):
                os.mkdir('videos')
            recorder = VideoRecorder(self.env, path=f'videos/epoch-{epoch}.mp4')
            for _ in range(self.args.n_record):
                done = False
                state = self.env.reset()
                while not done:
                    recorder.capture_frame()
                    action = self.GreedyAction(state)
                    nextstate,reward,done,info = self.env.step([action])
                    state = nextstate
                recorder.close()
        except Exception as e:
            print(e)

    def SaveModels(self, ep):
        if not os.path.exists("models"):
            os.mkdir('models')
        torch.save(self.actor.state_dict(), os.path.join('models', 'Actor.pt'))
        torch.save(self.critic.state_dict(), os.path.join('models', 'Critic.pt'))

    def LoadModels(self, actorpath=os.path.join('models', 'Actor.pt'), criticpath=os.path.join('models', 'Critic.pt')):
        actor = Actor(self.env_params).double()
        critic = Critic(self.env_params).double()
        actor.load_state_dict(torch.load(actorpath))
        critic.load_state_dict(torch.load(criticpath))
        return actor, critic

    def SoftUpdateTarget(self, source, target):
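        # note: here 'polyak' weights the old target parameters, i.e.
        # theta_target <- (1 - polyak) * theta_source + polyak * theta_target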
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)
Example #23
class DDPG:
    """Implementation of DDPG.

    This implementation is adapted to this particular environment, which runs several agents.
    At each time step, the same actor controls each agent sequentially.
    """

    def __init__(self, state_size, action_size, config):
        """Initialize algorithm."""
        if config.PER:
            self.memory = PrioritizeReplayBuffer(
                config.BUFFER_SIZE, config.BATCH_SIZE, config.SEED
            )
        else:
            self.memory = ReplayBuffer(
                config.BUFFER_SIZE, config.BATCH_SIZE, config.SEED
            )

        # Randomly initialize the critic and actor networks
        self.actor = Actor(state_size, action_size, config.SEED).to(device)
        self.critic = Critic(state_size, action_size, config.SEED).to(device)

        # Initialize target networks with weights from actor critic
        # Actor
        self.actor_target = Actor(state_size, action_size, config.SEED).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        # Critic
        self.critic_target = Critic(state_size, action_size, config.SEED).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Actor optimizer
        self.actor_optimizer = torch.optim.Adam(
            self.actor.parameters(), lr=config.LR_ACTOR
        )
        # Critic optimizer
        self.critic_optimizer = torch.optim.Adam(
            self.critic.parameters(), lr=config.LR_CRITIC
        )

        self.config = config

        self.t_step = 0

        self.expl_noise = config.EXPL_NOISE

    def step(self, target_sample=None, **kwargs):
        """Run a step of algorithm update."""
        # Sample a random minibatch of transitions
        states, actions, rewards, next_states, dones = self._draw_minibatch()

        # Compute the target Q value
        target_Q = self.critic_target(
            next_states, self.actor_target(next_states)
        ).detach()
        y = rewards + (1 - dones) * self.config.GAMMA * target_Q

        # Update critic by minimizing the loss
        current_Q = self.critic(states, actions)

        # Compute TD error
        td_error = y - current_Q

        if self.config.PER:
            # Get importance_sampling_weights
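            # (assumed to follow the usual PER form w_i ~ (N * P(i)) ** (-beta), normalised by the max weight)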
            weights = torch.Tensor(self.memory.importance_sampling()).unsqueeze(1)
            # Update priorities
            self.memory.update_priorities(td_error.detach().cpu().numpy())
            # Compute critic loss
            critic_loss = torch.mean(weights * td_error ** 2)
        else:
            # Compute critic loss
            critic_loss = torch.mean(td_error ** 2)

        # Optimize critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradient
        nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # Update the actor policy using the sampled policy gradient:
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # Clip gradient
        nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update()

    def train(self, env, num_episode):
        """Train a DDPG agent."""
        scores = []
        scores_window = deque(maxlen=100)

        for episode in range(num_episode):
            # Init state and episode score
            states = env.reset(train_mode=True)
            score = np.zeros(states.shape[0])
            done = False

            # Run episode
            while not done:
                # Select and run action
                actions = self.predict_actions(states)
                # TODO: dynamic low and high selection
                actions = self.add_gaussian_noise(actions, -1, 1)
                next_states, rewards, dones = env.step(actions)

                # Store all n_agent episodes in replay buffer
                for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones
                ):
                    self.memory.add(state, action, reward, next_state, done)

                # Update time step
                self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY

                # Optimisation step if UPDATE_EVERY and enough examples in memory
                if self.t_step == 0 and len(self.memory) > self.config.BATCH_SIZE:
                    for _ in range(self.config.UPDATE_STEPS):
                        self.step()

                # Update state and scores
                states = next_states
                score += rewards

                # End the episode if any of the agents is done, to avoid storing too
                # many done transitions in the replay buffer
                done = any(dones)

            # Keep track of running mean
            scores_window.append(max(score))

            # Append current mean to scores list
            scores.append(np.mean(scores_window))

            # Logging
            print(
                "\rEpisode {}\tAverage Score: {:.2f}, Last Score: {:.2f}".format(
                    episode, np.mean(scores_window), max(score)
                ),
                end="",
            )
            if (episode + 1) % 100 == 0:
                print(
                    "\rEpisode {}\tAverage Score: {:.2f}".format(
                        episode, np.mean(scores_window)
                    )
                )

        return scores

    def soft_update(self):
        """Update the frozen target models."""
        tau = self.config.TAU
        # Critic
        for param, target_param in zip(
            self.critic.parameters(), self.critic_target.parameters()
        ):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        # Actor
        for param, target_param in zip(
            self.actor.parameters(), self.actor_target.parameters()
        ):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def predict_actions(self, states, **kwargs):
        """Predict next actions based on current policy."""
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)

        # Set actor to eval mode
        self.actor.eval()

        actions = []
        with torch.no_grad():
            for state in states:
                action = self.actor(state)
                actions.append(action.detach().numpy())

        # Set actor to train mode
        self.actor.train()

        return np.array(actions).squeeze()

    def add_gaussian_noise(self, action, low, high):
        """Add Gaussian noise to action, and clip between low and high."""
        return (action + np.random.normal(0, self.expl_noise, size=action.shape)).clip(
            low, high
        )

    def _draw_minibatch(self):
        """Draw a minibatch in the replay buffer."""
        states, actions, rewards, next_states, done = zip(*self.memory.sample())

        states = torch.Tensor(states).to(device)
        actions = torch.Tensor(actions).to(device)
        rewards = torch.Tensor(rewards).unsqueeze(1).to(device)
        next_states = torch.Tensor(next_states).to(device)
        done = torch.Tensor(done).unsqueeze(1).to(device)

        return states, actions, rewards, next_states, done

    def save_model(self, path, **kwargs):
        """Save actor model weights."""
        torch.save(self.actor.state_dict(), path)
Example #24
File: ddpg.py  Project: marsXyr/DP-ERL
class D3PG(object):
    def __init__(self, state_dim, action_dim, max_action, memory, args):

        # misc
        self.criterion = nn.MSELoss()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        self.memory = memory
        self.n = args.n_actor

        # actors
        self.actors = [
            Actor(state_dim,
                  action_dim,
                  max_action,
                  layer_norm=args.layer_norm) for i in range(self.n)
        ]
        self.actors_target = [
            Actor(state_dim,
                  action_dim,
                  max_action,
                  layer_norm=args.layer_norm) for i in range(self.n)
        ]
        self.actors_optimizer = [
            torch.optim.Adam(self.actors[i].parameters(), lr=args.actor_lr)
            for i in range(self.n)
        ]

        for i in range(self.n):
            self.actors_target[i].load_state_dict(self.actors[i].state_dict())

        # critic
        self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target = Critic(state_dim,
                                    action_dim,
                                    layer_norm=args.layer_norm)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=args.critic_lr)

        # cuda
        if torch.cuda.is_available():
            for i in range(self.n):
                self.actors[i] = self.actors[i].cuda()
                self.actors_target[i] = self.actors_target[i].cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        # shared memory
        for i in range(self.n):
            self.actors[i].share_memory()
            self.actors_target[i].share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

        # hyper-parameters
        self.tau = args.tau
        self.discount = args.discount
        self.batch_size = args.batch_size
        self.reward_scale = args.reward_scale

    def train(self, iterations, actor_index):

        for _ in tqdm(range(iterations)):

            # Sample replay buffer
            states, n_states, actions, rewards, dones = self.memory.sample(
                self.batch_size)

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(
                    n_states, self.actors_target[actor_index](n_states))
                target_Q = self.reward_scale * rewards + \
                    (1 - dones) * self.discount * target_Q

            # Get current Q estimate
            current_Q = self.critic(states, actions)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = - \
                self.critic(states, self.actors[actor_index](states)).mean()

            # Optimize the actor
            self.actors_optimizer[actor_index].zero_grad()
            actor_loss.backward()
            self.actors_optimizer[actor_index].step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(
                    self.actors[actor_index].parameters(),
                    self.actors_target[actor_index].parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def load(self, filename):
        for i in range(self.n):
            self.actors[i].load_model(filename, "actor_" + str(i))
        self.critic.load_model(filename, "critic")

    def save(self, output):
        for i in range(self.n):
            self.actors[i].save_model(output, "actor_" + str(i))
        self.critic.save_model(output, "critic")
Example #25
                              args.embedding_dim, args.latent_dim,
                              vocab.size(), args.dropout, args.seq_len)
    autoencoder.load_state_dict(
        torch.load('autoencoder.th', map_location=lambda x, y: x))
    generator = Generator(args.n_layers, args.block_dim)
    critic = Critic(args.n_layers, args.block_dim)

    g_optimizer = optim.Adam(generator.parameters(), lr=args.lr)
    c_optimizer = optim.Adam(critic.parameters(), lr=args.lr)

    if args.cuda:
        autoencoder = autoencoder.cuda()
        generator = generator.cuda()
        critic = critic.cuda()

    print('G Parameters:', sum([p.numel() for p in generator.parameters() if \
                                p.requires_grad]))
    print('C Parameters:', sum([p.numel() for p in critic.parameters() if \
                                p.requires_grad]))

    best_loss = np.inf

    for epoch in range(1, args.epochs + 1):
        g_loss, c_loss = train(epoch)
        loss = g_loss + c_loss
        if loss < best_loss:
            best_loss = loss
            print('* Saved')
            torch.save(generator.state_dict(), 'generator.th')
            torch.save(critic.state_dict(), 'critic.th')
Example #26
LAMBDA = .95
EPSILON = .2
TARGET_DISCOUNT = .4
N_TIMESTEPS_PER_UPDATE = 300
# ~~~~~~~~~~~~~~~~~~

# Initialization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
env = gym.make('CartPole-v1')

replay_memory = ReplayMemory(memory_capacity)

policy_net = Actor(sum(env.observation_space.shape), 200, env.action_space.n)
value_net = Critic(sum(env.observation_space.shape), 200, 1)
target_value_net = Critic(sum(env.observation_space.shape), 200, 1)
target_value_net.load_state_dict(value_net.state_dict())
target_value_net.eval()

params = list(policy_net.parameters()) + list(value_net.parameters())
optimizer = optim.SGD(params, lr=1e-3, momentum=.9, weight_decay=1e-6)
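# note that a single optimiser updates the actor (policy_net) and critic (value_net) parameters jointly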

writer = SummaryWriter()

reward_normalizer = RewardNormalizer()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

global_t = 0
for ep in range(10000):

    # episode loop
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~