Example #1
    def train(self,
              transitions: int,
              sigma_max: float = 1.,
              sigma_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              progress_upd_step: int = None,
              start_training: int = 1000,
              shaping_coef: float = 300.):
        history = ReplayBuffer(buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "sigma_max": sigma_max,
            "sigma_min": sigma_min,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()
        t = tqdm(range(transitions))
        for i in t:
            sigma = sigma_max - (sigma_max - sigma_min) * i / transitions
            action = self.act(state)
            noise = np.random.normal(scale=sigma, size=action.shape)
            action = np.clip(action + noise, -1, 1)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                batch = history.sample(batch_size)
                self.update_critic(batch)
                self.update_actor(batch)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log
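Example #1 (and most of the examples below) assumes a ReplayBuffer with add, sample, and __len__, which is not shown on this page. The following is a minimal uniform-sampling sketch matching the usage above; the returned batch layout is an assumption, not the original class:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Minimal FIFO experience buffer (sketch, not the original implementation)."""

    def __init__(self, size: int = 10000):
        self.buffer = deque(maxlen=size)

    def add(self, transition):
        # transition is a (state, action, next_state, reward, done) tuple, as in Example #1
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        batch = random.sample(self.buffer, batch_size)
        # unpack into per-field arrays: states, actions, next_states, rewards, dones
        return [np.array(field) for field in zip(*batch)]

    def __len__(self):
        return len(self.buffer)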
Example #2
    def train(self,
              transitions: int,
              eps_max: float = 0.5,
              eps_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              shaping_coef: float = 300.,
              progress_upd_step: int = None,
              start_training: int = 10000):
        history = ReplayBuffer(size=buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "eps_max": eps_max,
            "eps_min": eps_min,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()

        t = tqdm(range(transitions))
        for i in t:
            eps = eps_max - (eps_max - eps_min) * i / transitions
            if random() < eps:
                action = self.env.action_space.sample()
            else:
                action = self.act(state)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                self.update(history.sample(batch_size))

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log
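Both train variants above return the log dict. A hypothetical driver (the DDPGAgent name, its constructor, and the plotting code are illustrative assumptions, not part of the original snippets) might use it like this:

import matplotlib.pyplot as plt
import numpy as np

agent = DDPGAgent(env)  # Example #1 style agent: continuous actions plus Gaussian exploration noise
log = agent.train(transitions=100_000, sigma_max=1.0, sigma_min=0.0)

# the returned log dict can be plotted directly
mean = np.array(log["reward_mean"])
std = np.array(log["reward_std"])
plt.plot(log["step"], mean)
plt.fill_between(log["step"], mean - std, mean + std, alpha=0.3)
plt.xlabel("transition")
plt.ylabel("evaluation reward")
plt.show()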
Example #3
class Agent:
    def __init__(self,
                 input_dim,
                 output_dim,
                 tau=0.001,
                 gamma=0.99,
                 train_batch_size=640):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.tau = tau
        self.gamma = gamma
        self.train_batch_size = train_batch_size
        self.main_critic = Critic(input_dim, output_dim, tau, gamma)
        self.target_critic = Critic(input_dim, output_dim, tau, gamma)

        self.main_actor = Actor(input_dim, output_dim, tau, gamma)
        self.target_actor = Actor(input_dim, output_dim, tau, gamma)

        self.target_critic.model.set_weights(
            self.main_critic.model.get_weights())
        self.target_actor.model.set_weights(
            self.main_actor.model.get_weights())

        self.memory = ReplayBuffer(batch_size=train_batch_size)

    def get_action(self, state):
        return self.main_actor.get_action(state)

    def train(self):
        data = self.memory.sample()
        states = np.vstack([e.state for e in data if e is not None])
        actions = np.array([e.action for e in data if e is not None
                            ]).astype(np.float32).reshape(-1, self.output_dim)
        rewards = np.array([e.reward for e in data if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in data
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in data if e is not None])

        actions_next = self.target_actor.model.predict_on_batch(next_states)
        Q_targets_next = self.target_critic.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        self.main_critic.train(states, actions, Q_targets)
        action_gradients = np.reshape(
            self.main_critic.get_gradient(states, actions),
            (-1, self.output_dim))

        self.main_actor.train(states, action_gradients)

        self.target_actor.model = self.main_actor.soft_update(
            self.target_actor.model)
        self.target_critic.model = self.main_critic.soft_update(
            self.target_critic.model)
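Example #3 assumes Keras-based Actor/Critic wrappers that expose soft_update(target_model) and return the updated target model. Below is a sketch of such a method, assuming the wrapper stores its Keras network as self.model and its blend rate as self.tau (both assumptions inferred from the call sites above):

# sketch of the Actor/Critic soft_update method assumed by Example #3
def soft_update(self, target_model):
    """Blend the main network's weights into the target: w_target <- tau*w_main + (1 - tau)*w_target."""
    new_weights = [self.tau * w_main + (1.0 - self.tau) * w_target
                   for w_main, w_target in zip(self.model.get_weights(),
                                               target_model.get_weights())]
    target_model.set_weights(new_weights)
    return target_model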
Example #4
class AgentCommon():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Noise process
        #self.noise = OUNoise(action_size, random_seed)
        self.noise = OUNoise((self.num_agents, action_size), seed = random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
        self.actorL = ActorAgent(state_size, action_size, num_agents, self.noise, LR_ACTOR, self.memory, random_seed)
        self.actorR = ActorAgent(state_size, action_size, num_agents, self.noise, LR_ACTOR, self.memory, random_seed)
        self.sharedcritic = CriticAgent(state_size, action_size, num_agents, LR_CRITIC, WEIGHT_DECAY, TAU, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        self.actorL.step(state[0], action[0], reward[0], next_state[0], done[0])
        self.actorR.step(state[1], action[1], reward[1], next_state[1], done[1])
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences1 = self.memory.sample()
            experiences2 = self.memory.sample()
            self.sharedcritic.learn(self.actorL,experiences1, GAMMA)
            self.sharedcritic.learn(self.actorR,experiences2, GAMMA)

    def act(self, state, add_noise=True):
        actionL = self.actorL.act(state[0], add_noise=add_noise)
        actionR = self.actorR.act(state[1], add_noise=add_noise)
        return [actionL, actionR]
    
    def reset(self):
        self.noise.reset()
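Example #4 references module-level constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY) that are defined outside the snippet. The values below are common DDPG defaults and are shown purely as illustrative assumptions:

# illustrative values only; the original snippet defines these elsewhere
BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer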
Example #5
class Agent():
    """ DDPG Agent, interacts with environment and learns from environment """
    def __init__(self, device, state_size, n_agents, action_size, random_seed, \
                         buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,  \
                         learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'):

        # Set Computational device
        self.DEVICE = device

        # Init State, action and agent dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Init Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Init Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Init Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Init Noise Process
        self.noise = OUNoise((n_agents, action_size),
                             random_seed,
                             mu=0.,
                             theta=ou_theta,
                             sigma=ou_sigma)

        # Init Replay Memory
        self.memory = ReplayBuffer(device, action_size, buffer_size,
                                   batch_size, random_seed)

    # think
    def act(self, states, add_noise=True):
        """ Decide what action to take next """

        # evaluate state through actor_local
        states = torch.from_numpy(states).float().to(self.DEVICE)
        actions = np.zeros((self.n_agents, self.action_size))

        self.actor_local.eval()  # put actor_local network in "evaluation" mode
        with torch.no_grad():
            for n, state in enumerate(states):
                actions[n, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # put actor_local back into "training" mode

        # add noise for better performance
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    # embody
    def step(self, t, s, a, r, s_, done):
        """ Commit step into the brain """

        # Save SARS' to replay buffer --- state-action-reward-next_state tuple
        for n in range(self.n_agents):
            self.memory.add(s[n], a[n], r[n], s_[n], done[n])

        if t % self.LEARN_INTERVAL != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.BATCH_SIZE:
            for _ in range(self.LEARN_NUM):
                experiences = self.memory.sample()  # get a memory sample
                self.learn(experiences, self.GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Learn from experiences, with discount factor gamma
        
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ------ Update Critic ------ #

        # get predicted next-state actions and Q values from target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #         torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ------ Update Actor ------ #

        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------ Update Target Networks ------ #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)


    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
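Examples #4 and #5 construct an OUNoise process with a per-agent shape, but the class itself is not included. A minimal Ornstein-Uhlenbeck noise sketch compatible with the OUNoise((n_agents, action_size), seed, mu=..., theta=..., sigma=...) call above; the exact update rule used in the original is an assumption:

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch): dx = theta*(mu - x) + sigma*N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        np.random.seed(seed)
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = copy.copy(self.mu)

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state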
Example #6
    buffer = ReplayBuffer(BUFFER_SIZE)

    losses = np.zeros(N_EPISODES)
    # Loop over episodes
    for episode in range(N_EPISODES):

        episode_losses = np.zeros(EPISODE_LENGTH)
        # Reset the environment for the start of the episode.
        agent.reset()
        # Loop over steps within this episode. The episode length here is 20.
        for step_num in range(EPISODE_LENGTH):
            # Step the agent once, and get the transition tuple for this step
            transition = agent.step()

            buffer.append(transition)

            if len(buffer) >= BATCH_SIZE:
                loss = dqn.batch_train_q_network(buffer.sample(BATCH_SIZE))
                episode_losses[step_num] = loss

            # time.sleep(0.2)

        losses[episode] = np.average(episode_losses)
        print("Finished episode {}, average loss = {}".format(
            episode, losses[episode]))

    # plot the per-episode average loss on a log scale
    ax.plot(losses, color='blue')
    plt.yscale('log')
    fig.savefig("dqn_erb_loss_vs_episodes.png")
Example #7
class TD3():
    """ Twin Delayed Deep Deterministic Policy Gradient Model """

    def __init__(self, state_size, action_size, random_seed):
        """ Initialize the model with arguments as follows:

            ARGUMENTS
            =========
                - state_size (int) = dimension of input space
                - action_size (int) = dimension of action space
                - random_seed (int) = random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # create noise
        self.noise = OUNoise(action_size, random_seed)
        self.noise_decay = NOISE_DECAY

        # create memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)

        # Actor Networks (local online net + target net)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks (local online net + target net)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # initialize the online and target networks with the same weights
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        self.learn_counter = 0
                
    def act(self, state, add_noise=True):
        """ Choose an action while interacting and learning in the environment """

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.noise_decay
            self.noise_decay *= NOISE_DECAY  # decay the exploration noise scale geometrically
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, noise_clip=0.5, policy_freq=2):
        """ Sample from experiences and learn """

        # update the learn counter
        self.learn_counter += 1

        # get experience tuples
        states, actions, rewards, next_states, dones  = experiences
            
        # build noise on the action
        # CAVE: actions must be moved to cpu() first to create a CPU tensor before sending it to CUDA with .to(device)
        # noise = torch.FloatTensor(actions.cpu()).data.normal_(0, policy_noise).to(device)
        # noise = noise.clamp(-noise_clip, noise_clip)
        # <<--- Gaussian target-policy noise like this is what the paper's GitHub implementation adds,
        # but OU noise is used in the act method, so the same noise process is reused while learning

        noise = torch.FloatTensor([self.noise.sample() for _ in range(len(actions))]).to(device)
        noise = noise.clamp(-noise_clip, noise_clip)  
        # clip between -/+ max action dims because action+noise might run oor
        next_action = (self.actor_target(next_states) + noise).clamp(-1, 1)

        # compute the target Q value
        target_Q1, target_Q2 = self.critic_target(next_states, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = rewards + (gamma * target_Q * (1-dones)).detach()

        # get current Q estimates
        current_Q1, current_Q2 = self.critic_local(states, actions)

        # compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # update the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # delay the policy update
        if self.learn_counter % policy_freq == 0:

            # get actor_local's predicted action and use critic_local to compute the policy loss
            actions_pred = self.actor_local.forward(states)
            actor_loss = -self.critic_local.Q1(states, actions_pred).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # delayed update of the actor and critic target models
            self.soft_update(self.actor_local, self.actor_target, TAU)
            self.soft_update(self.critic_local, self.critic_target, TAU)


    def soft_update(self, local_model, target_model, tau):
        # Perform a soft update of the target networks:
        # at every time step keep (1 - tau) of the target network weights
        # and blend in only a small fraction (tau) of the current online network weights
        # to prevent oscillation
        for local_param, target_param in zip(local_model.parameters(), target_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def step(self, state, action, reward, next_state, done):
        # at every iteration, add new SARS' trajectory to memory, then learn from batches 
        # if learning_step is reached and enough samples are in the buffer
        
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
Example #8
def main():

    ##########
    # CONFIG #
    ##########
    # Target Reward
    tgt_score = 0.5
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed
    seed = 7
    seeding(seed)
    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0
    # Episodes
    number_of_episodes = 10000
    episode_length = 2000
    # Buffer
    buffer_size = int(1e6)
    batchsize = 512
    # Agent Update Frequency
    episode_per_update = 1
    # Rewards Discounts Factor
    discount_factor = 0.95
    # Soft Update Weight
    tau = 1e-2
    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0
    # Window
    win_len = 100
    # Save Frequency
    save_interval = 200
    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)
    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")
    # Get brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print('Brain Name:', brain_name)
    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Number of Agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    ####################
    # Show Progressbar #
    ####################
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
    start = time.time()

    ###############
    # Multi Agent #
    ###############
    maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                    hidden_out_actor, lr_actor, hidden_in_critic,
                    hidden_out_critic, lr_critic, weight_decay_critic,
                    discount_factor, tau, seed, device)

    if load_model:
        load_dict_list = torch.load(os.path.join(model_dir,
                                                 'episode-saved.pt'))
        for i in range(num_agents):
            maddpg.maddpg_agent[i].actor.load_state_dict(
                load_dict_list[i]['actor_params'])
            maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(
                load_dict_list[i]['actor_optim_params'])
            maddpg.maddpg_agent[i].critic.load_state_dict(
                load_dict_list[i]['critic_params'])
            maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(
                load_dict_list[i]['critic_optim_params'])

    #################
    # Replay Buffer #
    #################
    rebuffer = ReplayBuffer(buffer_size, seed, device)

    #################
    # TRAINING LOOP #
    #################
    # initialize scores
    scores_history = []
    scores_window = deque(maxlen=save_interval)

    # i_episode = 0
    for i_episode in range(number_of_episodes):
        timer.update(i_episode)

        # Reset Environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)

        # Reset Agent
        maddpg.reset()

        # episode_t = 0
        for episode_t in range(episode_length):

            # Explore with decaying noise factor
            actions = maddpg.act(states, noise_factor=noise_factor)
            env_info = env.step(actions)[brain_name]  # Environment reacts
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards  # get the rewards
            dones = env_info.local_done  # see if episode has finished

            ###################
            # Save Experience #
            ###################
            rebuffer.add(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states

            if any(dones):
                break

        scores_history.append(np.max(scores))  # save most recent score
        scores_window.append(np.max(scores))  # save most recent score
        avg_rewards = np.mean(scores_window)
        noise_factor = max(noise_floor, noise_factor *
                           noise_reduction)  # Reduce Noise Factor

        #########
        # LEARN #
        #########
        if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = rebuffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # Soft Update
            maddpg.update_targets()

        ##################
        # Track Progress #
        ##################
        if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count,
                        episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")

        ##############
        # Save Model #
        ##############
        save_info = (i_episode % save_interval == 0
                     or i_episode == number_of_episodes - 1)
        if save_info:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-Latest.pt'))

            pd.Series(scores_history).to_csv(
                os.path.join(model_dir, "scores.csv"))

            # plot the scores
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            # plt.show()
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        if avg_rewards > tgt_score:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count,
                        episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")
            break

    env.close()
    logger.close()
    timer.finish()
Example #9
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
print("s_t: ", s_t)
print("s_t size: ", s_t.size)
a = [[0, 1]]
#t_start = timeit.default_timer()
for i in range(max_step):
    ob, r_t, done, info = env.step(a[0])
    if done:
        break
    s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                      ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
    memory.put((s_t, a[0], r_t, s_t1, done))
    s_t = s_t1
#t_end = timeit.default_timer()
s_done = s_t
print('done?: ', s_done)
#print('{}steps, {} time spent'.format(i,t_end-t_start))
env.end()
s, a, r, sp, d = memory.sample(3)
print('s: ', s)
print('a: ', a)
print('r: ', r)
print('sp: ', sp)
print('d: ', d)

# # -- Test the noise. --
# noise = OrnsteinUhlenbeckNoise(mu = np.zeros(1),theta=0.1,dt=0.2,sigma = 0.1, x0 = np.array([0.5]))
# for i in range(300):
#     noise()
#     print(noise)
Example #10
class DDQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 head_name="DuelingDQN",
                 head_scale="max"):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int ; optional): number of each layer nodes
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Assign model parameters and assign device
        model_params = [
            state_size, action_size, seed, hidden_layers, head_name, head_scale
        ]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)

        # Set up optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Initialize Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Update time step
        self.t_step = self.t_step + 1

        # Learn every self.update_every time steps.
        if self.t_step % self.update_every == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Go to evaluation mode and get Q values for current state
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # get back to train mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences

        # Go to evaluation mode
        self.qnetwork_target.eval()
        with torch.no_grad():
            # get Q values for the next state
            Q_dash_local = self.qnetwork_local(next_states)
            Q_dash_target = self.qnetwork_target(next_states)

            # Find the predicted action based on the local Q_network
            argmax_action = torch.max(Q_dash_local, dim=1, keepdim=True)[1]

            # Get the Q-value from the target network
            Q_dash_max = Q_dash_target.gather(1, argmax_action)

            # Update the target value
            y = rewards + gamma * Q_dash_max * (1 - dones)

        # Go back to train mode
        self.qnetwork_target.train()

        # Predict Q-values based on the local network
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # TD-error/loss function
        loss = torch.sum((y - y_pred)**2)

        # Optimize the network
        loss.backward()
        self.optimizer.step()

        # Update the target network using the local and target networks
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
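The DDQNAgent above exposes act(state, eps) and step(state, action, reward, next_state, done). A minimal episode loop driving it, assuming the classic Gym step/reset API used throughout these examples and an illustrative epsilon schedule (both assumptions, not part of the original snippet):

import gym

env = gym.make("CartPole-v1")
agent = DDQNAgent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # stores the transition and learns periodically
        state = next_state
    eps = max(eps_end, eps_decay * eps)  # anneal epsilon between episodes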
Example #11
class DQNAgent(Agent):
    class SAVE:
        MEMORY = 1
        TARGETNETWORK = 2
        TRAINNETWORK = 4
        HYPERPARAM = 8
        ALL = 15

    def __init__(self,
                 DQNType,
                 input_shape,
                 replaybuffersize=100000,
                 input_preprocess=[]):
        super().__init__(MOVEMENTS.COMPLEX)
        self.memory = ReplayBuffer(replaybuffersize)
        self.train_network = DQNType(input_shape, len(self.movements))
        self.target_network = self.train_network.clone_model()
        self.input_preprocess = input_preprocess

        ## Initialize
        self.counter = 0
        self.epsilon = 1

        ## hyperparameters
        self.hyperparams = {
            "burn_in": 10000,
            "copy_each": 5000,
            "learn_each": 1,
            "save_each": 5000,
            "final_epsilon": 0.1,
            "epsilon_decay_rate": 0.99998,
            "batch_size": 32,
            "gamma": 0.99
        }

    def setparam(self, **kwargs):
        for key, val in kwargs.items():
            self.hyperparams[key] = val
        return self

    def getparams(self):
        return self.hyperparams

    def preprocess(self, image):
        for pc in self.input_preprocess:
            image = pc(image)
        return image

    def reward(self, reward, info_old, info_new):
        return reward + (info_new["score"] - info_old["score"]) / 100

    def action(self, states):
        self.action_states = self.preprocess(states)
        ## Random exploration
        if random.uniform(0, 1) < self.epsilon:
            self.action_num = random.choice(range(len(self.movements)))
        ## Make a decision based on the network
        else:
            normalized_states = figure.normalize()(
                self.action_states)  # convert to 0-1 scale
            output = self.train_network.predict(normalized_states[None, ...])
            self.action_num = np.argmax(output)
        return self.movements[self.action_num]

    def feedback(self, states, reward, info, done):
        # what to do after getting a reward
        self.counter += 1
        self.memory.append((
            self.action_states,  # already preprocessed
            self.action_num,
            reward,
            info,
            done,
            self.preprocess(states)))
        self.updateNetwork()

    def save(self, file_path, saveMethod=None):
        if saveMethod is None:
            saveMethod = self.SAVE.ALL
        if (saveMethod & self.SAVE.MEMORY):
            self.memory.save(file_path + "memory")
        if (saveMethod & self.SAVE.TARGETNETWORK):
            self.target_network.save_model(file_path + "target_net")
        if (saveMethod & self.SAVE.TRAINNETWORK):
            self.train_network.save_model(file_path + "train_net")
        # if (saveMethod & self.SAVE.HYPERPARAM):
        #     with open(file_path + "hyperparam.json", "w") as f:
        #         json.dump(self.hyperparams, f, indent=2)

    def load(self, file_path):
        try:
            self.target_network.load_model(file_path + "target_net")
            self.train_network.load_model(file_path + "train_net")
            with open(file_path + "hyperparam.json", "r") as f:
                self.hyperparams = json.load(f)
        except Exception as e:
            print(e)

    def updateNetwork(self):
        if self.counter < self.hyperparams["burn_in"]:
            return
        self.epsilon *= self.hyperparams["epsilon_decay_rate"]
        self.epsilon = max(self.epsilon, self.hyperparams["final_epsilon"])
        if (self.counter - self.hyperparams["burn_in"]
            ) % self.hyperparams["learn_each"] == 0:
            self.learn()
        if (self.counter - self.hyperparams["burn_in"]
            ) % self.hyperparams["copy_each"] == 0:
            self.target_network = self.train_network.clone_model()
        if (self.counter - self.hyperparams["burn_in"]
            ) % self.hyperparams["save_each"] == 0:
            self.save("./autosave/step_" + str(self.counter))

    def learn(self):
        learn_sample = self.memory.sample(self.hyperparams["batch_size"])
        state_raw = np.stack(
            [states for states, _, _, _, _, _ in learn_sample], axis=0)
        actions = [action for _, action, _, _, _, _ in learn_sample]
        rewards = [reward for _, _, reward, _, _, _ in learn_sample]
        not_done = [not done for _, _, _, _, done, _ in learn_sample]
        next_state_raw = np.stack(
            [states for _, _, _, _, _, states in learn_sample], axis=0)
        state = figure.normalize()(state_raw)
        next_state = figure.normalize()(next_state_raw)
        best_action_next = np.argmax(self.train_network.predict(next_state),
                                     axis=1)
        # Predicts the Q values calculated at the best_action_next
        # We shall only keep those entries corresponding to the real actions taken
        # Terminal states should not involve calculating the expected Q value.
        Q_value_next_target_mat = self.target_network.predict(
            next_state, actions=best_action_next)
        Q_value_next_target_vec = np.max(Q_value_next_target_mat, axis=1)
        Q_value_target_vec = np.array(rewards) + self.hyperparams[
            "gamma"] * np.array(not_done) * Q_value_next_target_vec
        Q_value_target_mat = np.zeros(Q_value_next_target_mat.shape)
        for id, num in enumerate(actions):
            Q_value_target_mat[id, num] = Q_value_target_vec[id]

        self.train_network.fit(state, actions, Q_value_target_mat, verbose=0)
Example #12
    # Loop over episodes
    for episode in range(N_EPISODES):
        epsilon = min(10 / (episode + 1), 1)

        episode_losses = np.zeros(EPISODE_LENGTH)
        # Reset the environment for the start of the episode.
        agent.reset()
        # Loop over steps within this episode. The episode length here is 20.
        for step_num in range(EPISODE_LENGTH):
            # Step the agent once, and get the transition tuple for this step
            transition = agent.step(dqn, epsilon)

            buffer.append(transition)

            if len(buffer) >= BATCH_SIZE:
                loss = dqn.batch_train_q_network(buffer.sample(BATCH_SIZE), target_network=target)
                episode_losses[step_num] = loss

            if (episode * EPISODE_LENGTH + step_num) % TARGET_SWAP == 0:
                print("Swapped target network on step {}".format(episode * EPISODE_LENGTH + step_num))
                target.q_network.load_state_dict(dqn.q_network.state_dict())
            # time.sleep(0.05)
        
        losses[episode] = np.average(episode_losses)
        print("Finished episode {}, average loss = {}".format(episode, losses[episode]))

    
    # evaluate Q-value
    q_values = np.zeros((10, 10, 4))
    for col in range(10):
        x = col / 10 + 0.05
Example #13
class MADDPG:
    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 discount_factor=0.95,
                 tau=0.02,
                 device=device,
                 random_seed=4,
                 lr_critic=1.0e-4,
                 weight_decay=0.0):
        super(MADDPG, self).__init__()

        # parameter configuration
        self.num_agents = num_agents
        self.device = device
        self.discount_factor = discount_factor
        self.tau = tau
        self.global_action_size = global_action_size
        self.global_obs_dim = global_obs_dim
        torch.manual_seed(random_seed)
        random.seed(random_seed)
        self.random_seed = random_seed
        self.weight_decay = weight_decay

        # define actors
        self.actors = [
            DDPGActor(num_agents,
                      local_obs_dim,
                      local_action_size,
                      global_obs_dim,
                      global_action_size,
                      device=device) for _ in range(num_agents)
        ]
        # define centralized critic
        self.critic = Critic(global_obs_dim, global_action_size,
                             self.random_seed).to(self.device)
        self.target_critic = Critic(global_obs_dim, global_action_size,
                                    self.random_seed).to(self.device)
        hard_update(self.target_critic, self.critic)

        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=self.weight_decay)

        # noise coef
        self.noise_coef = 1.0
        self.noise_coef_decay = 1e-6

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, obs_all_agents):
        actions = [
            ddpg_actor.act(local_obs, self.noise_coef)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def target_act(self, obs_all_agents):
        actions = [
            ddpg_actor.target_act(local_obs, noise_coef=0, add_noise=False)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def step(self, obs, obs_full, actions, rewards, next_obs, next_obs_full,
             dones, timestep):
        self.memory.add(obs, obs_full, actions, rewards, next_obs,
                        next_obs_full, dones)

        timestep = timestep % TRAIN_EVERY

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep == 0:
            for _ in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, self.discount_factor)

    def learn(self, experiences, gamma):
        obs, obs_full, action, reward, next_obs, next_obs_full, done = experiences

        obs = obs.permute(1, 0, -1)  # agent_id * batch_size * state_size
        obs_full = obs_full.view(-1, self.global_obs_dim)
        next_obs = next_obs.permute(1, 0, -1)
        next_obs_full = next_obs_full.view(-1, self.global_obs_dim)
        action = action.reshape(-1, self.global_action_size)

        # ---------------- update centralized critic ----------------------- #
        self.critic_optimizer.zero_grad()

        # get target actions from all target_actors
        target_actions = np.array(self.target_act(next_obs))
        target_actions = torch.from_numpy(target_actions).float().permute(
            1, 0, -1)
        target_actions = target_actions.reshape(-1, self.global_action_size)

        # update critic
        with torch.no_grad():
            q_next = self.target_critic.forward(next_obs_full,
                                                target_actions.to(self.device))

        y = reward + gamma * q_next * (1 - done)

        q = self.critic.forward(obs_full, action)

        critic_loss = 0
        for i in range(self.num_agents):
            critic_loss += F.mse_loss(q, y[:, i].detach().reshape(
                -1, 1)) / self.num_agents
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------- update actor for all agents --------------------- #
        for ii in range(len(self.actors)):
            self.actors[ii].actor_optimizer.zero_grad()

            q_action = [ self.actors[i].actor_local(ob) if i == ii \
                   else self.actors[i].actor_local(ob).detach()
                   for i, ob in enumerate(obs) ]

            q_action = torch.stack(q_action).permute(1, 0, -1)
            q_action = q_action.reshape(-1, self.global_action_size).to(
                self.device)

            # policy_gradient
            actor_loss = -self.critic.forward(obs_full, q_action).mean()
            actor_loss.backward()
            self.actors[ii].actor_optimizer.step()

        # --------------- soft update all target networks ------------------- #
        soft_update(self.target_critic, self.critic, self.tau)
        for actor in self.actors:
            actor.update_target(self.tau)

        # -------------- reset noise --------------------------------------- #
        for actor in self.actors:
            actor.action_noise.reset()

        self.noise_coef -= self.noise_coef_decay
        if self.noise_coef < 0.01:
            self.noise_coef = 0.01
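Example #13 calls the free functions hard_update and soft_update, which are defined outside the snippet. A sketch consistent with its call sites and with the soft-update rule θ_target = τ*θ_local + (1 - τ)*θ_target used in the other examples (the exact originals are not shown here):

# sketch of the helpers assumed by Example #13; argument order follows its call sites
def hard_update(target, source):
    """Copy source network parameters into the target network."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(target, source, tau):
    """Polyak-average source parameters into the target: theta_t <- tau*theta + (1 - tau)*theta_t."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)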
Example #14
        done = 0
        total_reward = 0
        step = agent.step_move()
        epsilon = max(1 - step * arg.epsilon_decrease, arg.epsilon_min)
        while not done:
            if np.random.uniform(0, 1) < epsilon:
                action = agent.random_action()
            else:
                action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(
                action + 1)  # offset by 1 because the environment uses only three actions
            replay_buffer.store_transition(obs, obs_, action, reward, done)
            total_reward += reward
            obs = obs_

        print('in {}, {}th game: the reward {} '.format(
            arg.run_name, step, total_reward))

        if step % train_period == 0:
            s1, s2, a, r, d = replay_buffer.sample(batch_size=train_batch)
            if step % record_period == 0:
                loss = agent.train(s1, s2, a, r, d, True)
                agent.log_reward(total_reward)
                agent.save()
            else:
                loss = agent.train(s1, s2, a, r, d, False)
            print('{}th game: the training loss {}'.format(step, loss))

        if step % arg.update_period == 0:
            agent.update_target_network()
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, seed, device,
                 buffer_size=int(1e5), batch_size=128, num_batches = 5, update_every=10,
                 gamma=0.99, tau=8e-3,
                 learning_rate_actor=1e-3, learning_rate_critic=1e-3, weight_decay=0.0001,                
                 hidden_layers_actor=[32,32], hidden_layers_critic=[32, 32, 32],
                 add_noise=True, start_eps=5.0, end_eps=0.0, end_eps_episode=500,
                 agent_id=-1):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
            hidden_layers (list of int ; optional): number of each layer nodes
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate_X (float; optional): learning rate for X=actor or critic
        """
        print('In DDPG_AGENT: seed = ', seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.device = device
        
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.num_batches = num_batches
        
        self.gamma = gamma
        self.tau = tau
        
        self.lr_actor = learning_rate_actor
        self.lr_critic = learning_rate_critic
        self.weight_decay_critic = weight_decay
        
        self.add_noise = add_noise
        self.start_eps = start_eps
        self.eps = start_eps
        self.end_eps = end_eps
        self.eps_decay = 1/(end_eps_episode*num_batches)  # set decay rate based on epsilon end target
        self.timestep = 0
        
        self.agent_id = agent_id
     
        ### SET UP THE ACTOR NETWORK ###
        # Assign model parameters and assign device
        model_params_actor  = [state_size, action_size, seed, hidden_layers_actor]
        
        # Create the Actor Network (w/ Target Network)
        self.actor_local = Actor(*model_params_actor).to(self.device)
        self.actor_target = Actor(*model_params_actor).to(self.device)
        #print('actor_local network is: ', print(self.actor_local))
        
        # Set up optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)       
        
        ### SET UP THE CRITIC NETWORK ###
        model_params_critic = [state_size, action_size, seed, hidden_layers_critic]

        # Create the Critic Network (w/ Target Network)
        self.critic_local = Critic(*model_params_critic).to(self.device)
        self.critic_target = Critic(*model_params_critic).to(self.device)
        
        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)
        
        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, device)

    def step(self, states, actions, rewards, next_states, dones, agent_number):
        # Increment timestep by 1
        self.timestep += 1
        
        # Save experience in replay memory
        self.memory.add(states, actions, rewards, next_states, dones)
        
         # If there are enough samples and a model update is to be made at this time step
        if len(self.memory) > self.batch_size and self.timestep%self.update_every == 0:
            # For each batch
            for i in range(self.num_batches):
                # Sample experiences from memory
                experiences = self.memory.sample()
        
                # Learn from the experience
                self.learn(experiences, self.gamma, agent_number)

    def act(self, state, scale_noise=True):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().to(self.device)
        
        # Go to evaluation mode and get Q values for current state
        self.actor_local.eval()
        with torch.no_grad():
            # Get action for the agent and concatenate them
            action = [self.actor_local(state[0]).cpu().data.numpy()]
            
        # get back to train mode
        self.actor_local.train()
        
        # Add noise to the action probabilities
        # Note, we want the magnitude of noise to decrease as the agent keeps learning
        action += int(scale_noise)*(self.eps)*self.noise.sample()
        
        return np.clip(action, -1.0, 1.0)
    
    def reset(self):
        """
        Reset the noise, and all neural network parameters for the current agent
        """
        self.noise.reset()
        self.eps = self.start_eps
        self.timestep = 0
        self.critic_local.reset_parameters()
        self.actor_local.reset_parameters()
        self.critic_target.reset_parameters()
        self.actor_target.reset_parameters()
        
        # ReSet up optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)
        
        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)
        
        # Clear the experience buffer
        self.memory.clear_buffer()
        
    def reset_noise(self):
        """
        Reset the noise only
        """
        self.noise.reset()
   
    def learn(self, experiences, gamma, agent_number):
        ####     DRAW FROM MEMORY AND PREPARE SARS DATA        ####
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences
        
        # NOTE: actions has shape (batch_size, concatenated actions of all agents)
      
        # get the next action for the current agent for the entire batch
        actions_next = self.actor_target(next_states)
    
        # Construct next action vector for the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:,2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:,:2], actions_next), dim=1)
        
        ####    UPDATE CRITIC   ####
        # Get predicted next-state actions and Q values from target models
        # Get the next targets
        Q_targets_next = self.critic_target(next_states, actions_next)
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        
        # Define the loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        
        # Clip the gradient norm at 1
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
   

        # --------------UPDATE ACTOR -----------------------#
        # Compute actor loss
        actions_pred = self.actor_local(states)

        # Construct action prediction vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:,2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:,:2], actions_pred), dim=1)
        
        # Calculate the loss. Note the negative sign since we use steepest ascent
        actor_loss = -self.critic_local(states, actions_pred).mean()
        
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks using the local and target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        
        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, self.end_eps)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        X_target = tau*X_local + (1 - tau)*X_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
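
The learn() method above builds the joint action vector for the critic by hard-coding two agents with 2-dimensional actions (the actions[:, :2] and actions[:, 2:] slices). A minimal sketch of the same construction for an arbitrary number of agents, assuming equal per-agent action sizes; the helper name is illustrative and not part of the class above:

import torch

def build_joint_actions(own_actions, stored_actions, agent_number, action_size):
    # Replace only the current agent's slice of the stored joint actions,
    # keeping the other agents' actions as sampled from the replay buffer.
    start = agent_number * action_size
    joint = stored_actions.clone()
    joint[:, start:start + action_size] = own_actions
    return joint

With agent_number=0 and action_size=2 this reproduces torch.cat((actions_next, actions[:, 2:]), dim=1) from the code above.
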
class Agent(object):
    """
    The Agent interacts with and learns from the environment.
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed=0,
                 params=params):
        """
        Initialize an Agent object.
        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['LR_CRITIC'],
            weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local to same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)

    def hard_update(self, local_model, target_model):
        """
        Hard update model parameters.
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """
        Save experiences in replay memory and use random sample from buffer to learn.
        """

        # Save experience / reward, cater for when multiples
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.params['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['GAMMA'])

    def act(self, states, add_noise=True):
        """
        Returns actions for a given state as per current policy.
        """
        states = torch.from_numpy(states).float().to(self.params['DEVICE'])
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma=params['GAMMA']):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update Critic(Value)
        # Get predicted next-state actions and Q-Values from target Network
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimise the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # Stabilize learning per benchmark guidelines
        self.critic_optimizer.step()

        # Update Actor (Policy)
        # Compute Actor Loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local,
                         self.critic_target,
                         tau=self.params['TAU'])
        self.soft_update(self.actor_local,
                         self.actor_target,
                         tau=self.params['TAU'])

    def soft_update(self, local_model, target_model, tau=params['TAU']):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
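
The Agent class above only defines the per-step API (act, step, learn); the episode loop that drives it is not shown. A minimal driver sketch, assuming a gym-style multi-agent environment whose reset() and step() return per-agent numpy arrays (the env object is an assumption, not part of the snippet):

import numpy as np

def run_episode(env, agent, max_t=1000):
    states = env.reset()
    scores = np.zeros(agent.num_agents)
    for _ in range(max_t):
        actions = agent.act(states)  # noisy actions clipped to [-1, 1]
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    return scores
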
Example #17
def train(graph_distr, epochs, batch_size, eps, n_step, discount, capacity,
          gcn_params, opt_params):
    '''
    graph_distr: object that wraps the graph generating distr
    epochs: int
    batch_size: int
    eps: float for exploration probability
    n_step: int, number of steps for n-step Q-learning
    discount: float, how much to discount future state/action value
    capacity: int, number of episodes to keep in memory
    gcn_params: dictionary of graph conv net parameters
    opt_params: dictionary of params for optimizer
    '''
    qnet = QNetwork(gcn_params)
    memory = ReplayBuffer(capacity)
    opt_params['params'] = qnet.parameters()
    optimizer = get_optimizer(opt_params)

    for e in range(epochs):
        node_labels, edge_weights, adj = graph_distr.next()
        embedding = qnet.embed_graph(node_labels, edge_weights, adj)

        state = []  # s_0
        state_vec = Variable(torch.zeros((1, qnet.embed_dim)))
        state_vec_prev = None
        actions = []
        rewards = []
        s_complement = set(range(len(adj)))
        losses = []
        best_actions = []

        for t in range(len(adj)):
            if t > 0:
                v_best_t = qnet.best_action(state, list(s_complement),
                                            embedding)
            if random.random() < eps or t == 0:
                v_t = random.choice(tuple(s_complement))
            else:
                v_t = v_best_t

            action_vec = embedding[v_t].unsqueeze(0)
            vprev = None if t == 0 else state[-1]
            r_t = 0 if t == 0 else -edge_weights.data[vprev, v_t]
            s_complement.remove(v_t)

            # ideally store: s_0 , a_0, r_0, s_1, v_best_1
            # ideally store: s_1 , a_1, r_1, s_2, v_best_2
            if t >= n_step:
                new_state = state[:]
                # action_vec_prev is the action that was actually taken;
                # v_best_t must be the argmax action of the current state
                v_best_embedding = embedding[v_best_t].unsqueeze(0)
                episode = (state_vec_prev, action_vec_prev, rewards[-1],
                           state_vec, v_best_embedding)
                # should try to add v_best_t so we don't recompute it later

                memory.push(*episode)
                if len(memory) > batch_size:
                    batch = memory.sample(batch_size)
                    batch_loss = qnet.backprop_batch(batch, optimizer)
                    losses.append(batch_loss)

            state_vec_prev = state_vec
            action_vec_prev = action_vec
            state.append(v_t)
            state_vec = state_vec + action_vec
            rewards.append(r_t)

        epoch_loss = torch.mean(torch.cat(losses))
        print('Epoch {} | avg loss: {:.3f} | Exploration rate: {:.3f}'.format(
            e, float(epoch_loss), eps))
        eps = update_exploration(eps)
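
update_exploration() is called at the end of every epoch but is not defined in this snippet. A minimal sketch under the assumption of multiplicative decay with a floor; the decay rate and floor are illustrative and the original helper may differ:

def update_exploration(eps, decay=0.995, eps_min=0.05):
    # Hypothetical schedule: shrink the exploration rate geometrically,
    # but never let it drop below eps_min.
    return max(eps * decay, eps_min)
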
Example #18
class Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4):
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every

        # model settings
        self.qnet_local = Model(state_size, action_size).to(self.device)
        self.qnet_target = Model(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

        # replay buffer settings
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.update_step = 0

    def step(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)

        self.update_step = (self.update_step + 1) % self.update_every
        if (self.update_step
                == 0) and (len(self.replay_buffer) > self.batch_size):
            experiences = self.replay_buffer.sample()
            self.learn(experiences)

    def act(self, state, eps=0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        self.qnet_local.eval()
        with torch.no_grad():
            action_values = self.qnet_local(state)
        self.qnet_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return np.random.choice(self.action_size)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # convert to tensors and send to device
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).long().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        next_states = torch.from_numpy(next_states).float().to(self.device)
        dones = torch.from_numpy(dones).float().to(self.device)

        # max returns max values (0) and indices (1)
        # unsqueeze is needed to add batch dim B x 1
        q_max = self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1)
        y = rewards + self.gamma * q_max * (1 - dones)

        # select action values corresponding to actions
        # this is what .gather does
        # note for the expected we pass states, not next_states
        q_expected = self.qnet_local(states).gather(1, actions)

        loss = F.mse_loss(q_expected, y)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update()

    def soft_update(self):
        for target_param, local_param in zip(self.qnet_target.parameters(),
                                             self.qnet_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1 - self.tau) * target_param.data)

    def train(self,
              env,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        scores = []
        scores_window = deque(maxlen=100)
        eps = eps_start

        brain_name = env.brain_names[0]

        for i_episode in range(1, n_episodes + 1):

            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]

            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]

                self.step(state, action, reward, next_state, done)

                state = next_state
                score += reward

                if done:
                    break
            scores_window.append(score)
            scores.append(score)
            avg_scores = np.mean(scores_window)
            eps = max(eps_end, eps_decay * eps)

            print(f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}',
                  end='')
            if i_episode % 100 == 0:
                print(
                    f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}')
            if avg_scores >= 13.0:
                print(f'\nEnvironment solved in {i_episode - 100} episodes!'
                      f'\tAverage Score: {np.mean(scores_window):.2f}')
                torch.save(self.qnet_local.state_dict(), 'checkpoint.pth')
                break

        return scores

    def evaluate(self, env):

        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]

        score = 0
        for i in range(2000):

            action = self.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            state = next_state

            score += reward
            if done:
                break

        print(f'Total score: {score:.2f}')
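
The agent above expects a ReplayBuffer whose sample() returns plain numpy arrays, since learn() converts them with torch.from_numpy. A minimal sketch of a buffer matching that interface (an assumption about the interface only, not the buffer actually used):

import random
from collections import deque

import numpy as np

class SimpleReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Stack into (batch, dim) arrays; actions, rewards and dones get a trailing dim of 1
        return (np.vstack(states).astype(np.float32),
                np.vstack(actions).astype(np.int64),
                np.vstack(rewards).astype(np.float32),
                np.vstack(next_states).astype(np.float32),
                np.vstack(dones).astype(np.float32))

    def __len__(self):
        return len(self.memory)
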
class Agent():
    """Interacts with and learns from the environment"""

    def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')):
        """DQN agent

        Args:
          state_size (int): dimension of each state
          action_size (int): dimension of each action (or the number of action choices)
          seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Initialize qnetwork_target parameters to qnetwork_local
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device)

        # Initialize the time step counter (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
          state (array_like): current state
          eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Set qnetwork_local to evaluation mode
        self.qnetwork_local.eval()

        # This operation should not be included in gradient calculation
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Set back qnetwork_local to training mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Args:
          experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
          gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states with actual rewards
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update the target network -----
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Args:
          local_model (torch.nn.Module): weights will be copied from
          target_model (torch.nn.Module): weights will be copied to
          tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
Example #20
class DDPG(Model):
    """ Interface """
    def __init__(self,
                 name,
                 args,
                 sess=None,
                 reuse=False,
                 log_tensorboard=True,
                 save=True):
        self.learn_steps = 0

        # hyperparameters
        self.gamma = args[name]['gamma']
        self.tau = args[name]['tau']
        self.init_noise_sigma = args[name]['init_noise_sigma']
        self.noise_decay = args[name]['noise_decay']

        # replay buffer
        self.buffer = ReplayBuffer(sample_size=args['batch_size'],
                                   max_len=args[name]['buffer_size'])

        super(DDPG, self).__init__(name,
                                   args,
                                   sess=sess,
                                   reuse=reuse,
                                   build_graph=True,
                                   log_tensorboard=log_tensorboard,
                                   save=save)

        self._initialize_target_net()

    @property
    def main_variables(self):
        return self.actor_critic.trainable_variables

    @property
    def _target_variables(self):
        return self._target_actor_critic.trainable_variables

    def act(self, state):
        self.sess.run(self.noise_op)
        state = state.reshape((-1, self.state_size))
        action = self.sess.run(self.actor_critic.actor_action,
                               feed_dict={self.actor_critic.state: state})
        self.sess.run(self.denoise_op)
        return np.squeeze(action)

    def step(self, state, action, reward, next_state, done):
        self.buffer.add(state, action, reward, next_state, done)

        if len(self.buffer) > self.buffer.sample_size + 100:
            self._learn()

    """ Implementation """

    def _build_graph(self):
        # env info
        self._setup_env()

        # main actor-critic
        self.actor_critic = self._create_actor_critic()
        # target actor-critic
        self._target_actor_critic = self._create_actor_critic(is_target=True)

        # losses
        self.actor_loss, self.critic_loss = self._loss()

        # optimization operation
        self.opt_op = self._optimize([self.actor_loss, self.critic_loss])

        # target net update operations
        self.init_target_op, self.update_target_op = self._targetnet_ops()

        # operations that add/remove noise from parameters
        self.noise_op, self.denoise_op = self._noise_params()

    def _setup_env(self):
        self.state_size = self._args[self.name]['state_size']
        self.action_size = self._args[self.name]['action_size']
        self.env_info = {}
        with tf.name_scope('placeholders'):
            self.env_info['state'] = tf.placeholder(tf.float32,
                                                    shape=(None,
                                                           self.state_size),
                                                    name='state')
            self.env_info['action'] = tf.placeholder(tf.float32,
                                                     shape=(None,
                                                            self.action_size),
                                                     name='action')
            self.env_info['next_state'] = tf.placeholder(
                tf.float32, shape=(None, self.state_size), name='next_state')
            self.env_info['reward'] = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='reward')
            self.env_info['done'] = tf.placeholder(tf.uint8,
                                                   shape=(None, 1),
                                                   name='done')

    def _create_actor_critic(self, is_target=False):
        name = 'target_actor_critic' if is_target else 'actor_critic'
        log_tensorboard = not is_target
        actor_critic = ActorCritic(name,
                                   self._args,
                                   self.env_info,
                                   self.action_size,
                                   reuse=self.reuse,
                                   log_tensorboard=log_tensorboard,
                                   is_target=is_target)

        return actor_critic

    def _loss(self):
        with tf.name_scope('loss'):
            with tf.name_scope('l2_loss'):
                encoder_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/state_encoder',
                    name='encoder_l2_loss')
                actor_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/actor',
                    name='actor_l2_loss')
                critic_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/critic',
                    name='critic_l2_loss')

            with tf.name_scope('actor_loss'):
                actor_loss = tf.negative(
                    tf.reduce_mean(self.actor_critic.Q_with_actor),
                    name='actor_loss') + encoder_l2_loss + actor_l2_loss

            with tf.name_scope('critic_loss'):
                target_Q = tf.stop_gradient(
                    self.env_info['reward'] +
                    self.gamma * tf.cast(1 - self.env_info['done'], tf.float32)
                    * self._target_actor_critic.Q_with_actor,
                    name='target_Q')
                critic_loss = tf.losses.mean_squared_error(
                    target_Q,
                    self.actor_critic.Q) + encoder_l2_loss + critic_l2_loss

            if self.log_tensorboard:
                tf.summary.scalar('actor_l2_loss_', actor_l2_loss)
                tf.summary.scalar('critic_l2_loss_', critic_l2_loss)
                tf.summary.scalar('encoder_l2_loss_', encoder_l2_loss)
                tf.summary.scalar('actor_loss_', actor_loss)
                tf.summary.scalar('critic_loss_', critic_loss)

        return actor_loss, critic_loss

    def _optimize(self, losses):
        with tf.variable_scope('optimizer'):
            actor_loss, critic_loss = losses
            actor_opt_op = self._optimize_objective(actor_loss, 'actor')
            critic_opt_op = self._optimize_objective(critic_loss, 'critic')

            opt_op = tf.group(actor_opt_op, critic_opt_op)

        return opt_op

    def _optimize_objective(self, loss, name):
        # params for optimizer (fall back to defaults when not provided)
        net_args = self._args['actor_critic'][name]
        learning_rate = net_args.get('learning_rate', 1e-3)
        beta1 = net_args.get('beta1', .9)
        beta2 = net_args.get('beta2', .999)
        clip_norm = self._args['actor_critic'].get('clip_norm', 5.)

        with tf.variable_scope(name + '_opt', reuse=self.reuse):
            # setup optimizer
            self._optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate, beta1=beta1, beta2=beta2)

            tvars = self.actor_critic.actor_trainable_variables if name == 'actor' else self.actor_critic.critic_trainable_variables
            grads, tvars = list(
                zip(*self._optimizer.compute_gradients(loss, var_list=tvars)))
            grads, _ = tf.clip_by_global_norm(grads, clip_norm)
            opt_op = self._optimizer.apply_gradients(zip(grads, tvars))

        if self.log_tensorboard:
            with tf.name_scope(name):
                with tf.name_scope('gradients_'):
                    for grad, var in zip(grads, tvars):
                        if grad is not None:
                            tf.summary.histogram(var.name.replace(':0', ''),
                                                 grad)
                with tf.name_scope('params_'):
                    for var in tvars:
                        tf.summary.histogram(var.name.replace(':0', ''), var)

        return opt_op

    def _targetnet_ops(self):
        with tf.name_scope('target_net_op'):
            target_main_var_pairs = list(
                zip(self._target_variables, self.main_variables))
            init_target_op = list(
                map(lambda v: tf.assign(v[0], v[1], name='init_target_op'),
                    target_main_var_pairs))
            update_target_op = list(
                map(
                    lambda v: tf.assign(v[0],
                                        self.tau * v[1] +
                                        (1. - self.tau) * v[0],
                                        name='update_target_op'),
                    target_main_var_pairs))

        return init_target_op, update_target_op

    def _learn(self):
        states, actions, rewards, next_states, dones = self.buffer.sample()

        feed_dict = {
            self.env_info['state']: states,
            self.env_info['action']: actions,
            self.env_info['reward']: rewards,
            self.env_info['next_state']: next_states,
            self.env_info['done']: dones,
        }

        # update the main networks
        if self.log_tensorboard:
            _, summary = self.sess.run([self.opt_op, self.merged_op],
                                       feed_dict=feed_dict)
            self.writer.add_summary(summary, self.learn_steps)
        else:
            _ = self.sess.run(self.opt_op, feed_dict=feed_dict)

        # update the target networks
        self.sess.run(self.update_target_op)

        self.learn_steps += 1

    def _noise_params(self):
        with tf.variable_scope('noise'):
            noise_sigma = tf.get_variable('noise_sigma',
                                          initializer=self.init_noise_sigma,
                                          trainable=False)

            noise_decay_op = tf.assign(noise_sigma,
                                       self.noise_decay * noise_sigma,
                                       name='noise_decay_op')

            param_noise_pairs = []
            for var in self.actor_critic.actor_perturbable_variables:
                noise = tf.truncated_normal(tf.shape(var), stddev=noise_sigma)
                param_noise_pairs.append((var, noise))

            with tf.control_dependencies([noise_decay_op]):
                noise_op = list(
                    map(
                        lambda v: tf.assign(v[0], v[0] + v[1], name='noise_op'
                                            ), param_noise_pairs))
                denoise_op = list(
                    map(
                        lambda v: tf.assign(
                            v[0], v[0] - v[1], name='denoise_op'),
                        param_noise_pairs))

        return noise_op, denoise_op

    def _initialize_target_net(self):
        self.sess.run(self.init_target_op)
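
act() above implements parameter-space exploration: noise_op adds noise to the actor's perturbable variables (while decaying the noise scale), the action is computed from the perturbed parameters, and denoise_op subtracts the same noise again. A minimal framework-agnostic sketch of that add-act-remove pattern, using plain Gaussian noise instead of the truncated normal above; the names are illustrative:

import numpy as np

def act_with_parameter_noise(params, policy_fn, state, sigma):
    # params: list of mutable weight arrays; policy_fn reads them when called
    noise = [np.random.normal(scale=sigma, size=p.shape) for p in params]
    for p, n in zip(params, noise):  # analogue of noise_op
        p += n
    action = policy_fn(state)
    for p, n in zip(params, noise):  # analogue of denoise_op
        p -= n
    return action
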
Example #21
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, args):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Q-Network
        self.actor_network = ActorNetwork(state_size, action_size,
                                          args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        # Model takes too long to run --> load model weights from a previous run (took > 24 hours on my machine)
        if not agent_id:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p0_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p0_path']),
                                              strict=False)
        else:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p1_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p1_path']),
                                              strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):

        # Evaluate the actor without tracking gradients
        self.actor_network.eval()
        with torch.no_grad():
            input_state = torch.from_numpy(current_state).float().to(self.device)
            action = self.actor_network(input_state).cpu().data.numpy()
        self.actor_network.train()

        # Add exploration noise
        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def train(self, experiences):

        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
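
train() above updates a critic through the module-level globals mCritic, GAMMA and TAU, none of which are defined in this snippet. A minimal sketch of the container shape train() appears to assume; the constructor arguments and hyperparameter values are placeholders, not the original code:

import torch.optim as optim

class SharedCritic:
    # mCritic is expected to expose .network, .target and .optimizer
    def __init__(self, critic_network, critic_target, lr=1e-3):
        self.network = critic_network
        self.target = critic_target
        self.target.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)

GAMMA = 0.99  # discount factor (assumed value)
TAU = 1e-3    # soft-update rate (assumed value)
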
Example #22
class Agent():
    """ Interacts with and learns from then environment."""
    def __init__(self, state_size, action_size, seed, model=QNetwork):
        """Initialize an Agent object.
        
        Param
        =====
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (object): model to use
            
        Return
        ======
            None
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.qnetwork_local = model(state_size, action_size, seed).to(device)
        self.qnetwork_target = model(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=hyperparameters["lr"])

        # Replay memory
        self.memory = ReplayBuffer(action_size, hyperparameters["buffer_size"],
                                   hyperparameters["batch_size"], seed, device)
        # Initialize time step (for updating every hyperparameters["update_every"] steps)
        self.t_step = 0

        # Init tracking of params
        wandb.login()
        wandb.init(project=project_name, name=name, config=hyperparameters)
        jovian.log_hyperparams(hyperparameters)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every hyperparameters["update_every"] time steps.
        self.t_step = (self.t_step + 1) % hyperparameters["update_every"]
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > hyperparameters["batch_size"]:
                experiences = self.memory.sample()
                self.learn(experiences, hyperparameters["gamma"])

    def act(self, state, eps=0.):
        """Return actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        
        Params:
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', d) tuples
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ---------------- update target network ----------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target,
                         hyperparameters["tau"])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def get_model_name(self):
        return name

    def get_project_name(self):
        return project_name
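
The agent above reads its settings from a module-level hyperparameters dict (and from name / project_name for experiment tracking). A sketch of the keys it accesses; the values shown are illustrative defaults, not the ones used in the original run:

hyperparameters = {
    "lr": 5e-4,               # Adam learning rate
    "buffer_size": int(1e5),  # replay buffer capacity
    "batch_size": 64,         # minibatch size
    "gamma": 0.99,            # discount factor
    "tau": 1e-3,              # soft-update interpolation
    "update_every": 4,        # learn every N environment steps
}
project_name = "dqn-example"  # placeholder
name = "dqn-run"              # placeholder
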
Example #23
class Agent():
    """Code adapted from the Udacity course"""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: select the best next action with the local network,
        # then evaluate that action with the target network
        indexes_of_Q_local_for_next_states = self.qnetwork_local(
            next_states).detach().max(1)[1].unsqueeze(1)
        Q_target_for_next_states = self.qnetwork_target(next_states).detach()
        Q_thetas = Q_target_for_next_states.gather(
            1, indexes_of_Q_local_for_next_states)

        Q_targets = rewards + (gamma * Q_thetas * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        Polyak averaging
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
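
The learn() above uses Double DQN: the local network selects the greedy next action and the target network evaluates it, which reduces the overestimation bias of the plain max used in the earlier DQN examples. A compact sketch of the two target rules side by side; the function names are illustrative:

import torch

def vanilla_dqn_target(qnet_target, rewards, next_states, dones, gamma):
    q_next = qnet_target(next_states).detach().max(1)[0].unsqueeze(1)
    return rewards + gamma * q_next * (1 - dones)

def double_dqn_target(qnet_local, qnet_target, rewards, next_states, dones, gamma):
    best_actions = qnet_local(next_states).detach().argmax(dim=1, keepdim=True)
    q_next = qnet_target(next_states).detach().gather(1, best_actions)
    return rewards + gamma * q_next * (1 - dones)
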
Example #24
class DDPG:
    def __init__(self, env, batch_size, mem_size, discount, actor_params,
                 critic_params):
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount
        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)
        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high
        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim,
                              self._action_dim, critic_params)
        self._memory = ReplayBuffer(mem_size)

    def get_action(self, state):
        return self._actor._model.predict(state)

    def train(self):
        '''
        No training takes place until the replay buffer contains
        at least batch size number of experiences
        '''

        if (self._memory.size() > self._batch_size):
            self._train()

    def _train(self):
        states, actions, rewards, done, next_states = self._memory.sample(
            self._batch_size)
        self._train_critic(states, actions, rewards, done, next_states)
        action_gradients = self._critic.action_gradients(states, actions)
        self._actor.train(states, action_gradients)

    def q_estimate(self, state, action):
        return self._critic._model.predict(state, action)

    def _get_q_targets(self, next_states, done, rewards):
        '''
        q = r if done, else q = r + gamma * qnext
        '''
        # use actor network to determine the next action under current policy
        # estimate Q values from the critic network

        actions = self.get_action(next_states)
        qnext = self.q_estimate(next_states, actions)

        q_targets = [
            reward if end else reward + self._discount * next_q
            for (reward, next_q, end) in zip(rewards, qnext, done)
        ]
        return q_targets

    def _train_critic(self, states, actions, rewards, done, next_states):
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def experience(self, state, action, reward, done, next_state):
        # store in replay buffer
        self._memory.add(state, action, reward, done, next_state)

        self.train()
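
_get_q_targets() above computes the targets element by element; the same rule can be written in vectorized form. A minimal numpy sketch (an illustrative helper, not part of the class):

import numpy as np

def q_targets_vectorized(rewards, q_next, done, discount):
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
    q_next = np.asarray(q_next, dtype=np.float32).reshape(-1)
    not_done = 1.0 - np.asarray(done, dtype=np.float32).reshape(-1)
    return rewards + discount * not_done * q_next
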
Example #25
class QMixTrainer:
    def __init__(self, env, args):
        self.env = env
        self.args = args
        self.agents = MultiAgents(args)
        self.train_datacollector = DataCollector(self.env, self.agents, args)
        self.replaybuffer = ReplayBuffer(args)

    def evaluate(self):
        mean_episode_reward = 0
        for epsd in range(self.args.eval_episodes):
            _, episode_reward = self.train_datacollector.collect_one_episode_data(
                if_train=False)
            mean_episode_reward += episode_reward

        return mean_episode_reward / self.args.eval_episodes

    def train(self):
        episode_rewards = []
        loss_history = []
        eval_episode_rewards = []
        train_steps = 0

        print("Initializing replay buffer...")
        episodes_data = []
        for epsd in range(10000):
            #print("simulating {} episode ...".format(epsd))
            episode_data, episode_reward = self.train_datacollector.collect_one_episode_data(
                epsd, if_train=False, if_init_buffer=True)
            if episode_reward == 1:
                print("goal !!!!!")
                episodes_data.append(episode_data)
        print("collected {} episodes".format(len(episodes_data)))
        l = len(episodes_data)
        batch_data = {}
        for key in episodes_data[0].keys():
            batch_data[key] = np.zeros((l, ) + episodes_data[0][key].shape)

        #batch_data = episodes_data[0]
        #episodes_data.pop(0)
        for epsd in range(l):
            for key in batch_data.keys():
                #print("key {} shape {}".format(key,batch_data[key].shape))
                batch_data[key][epsd] = episodes_data[epsd][key]

        self.replaybuffer.store_episode(batch_data)

        print("Start to train")
        plt.figure()
        for epoch in range(self.args.n_epoch):
            print("Training Epoch {} epsilon: {}".format(
                epoch, self.train_datacollector.epsilon))

            episodes_data = []
            reward_sum = 0
            for epsd in range(self.args.n_episodes_per_epoch):
                episode_data, episode_reward = self.train_datacollector.collect_one_episode_data(
                    epsd, if_train=True)
                #print("Episode {} reward is {}".format(epsd, episode_reward))
                episodes_data.append(episode_data)
                #reward_sum += episode_reward
                episode_rewards.append(episode_reward)
            #episode_rewards.append(reward_sum / self.args.n_episodes_per_epoch)

            batch_data = {}
            for key in episodes_data[0].keys():
                batch_data[key] = np.zeros((self.args.n_episodes_per_epoch, ) +
                                           episodes_data[0][key].shape)

            #batch_data = episodes_data[0]
            #episodes_data.pop(0)
            for epsd in range(self.args.n_episodes_per_epoch):
                for key in batch_data.keys():
                    #print("key {} shape {}".format(key,batch_data[key].shape))
                    batch_data[key][epsd] = episodes_data[epsd][key]

            self.replaybuffer.store_episode(batch_data)
            for t_stps in range(self.args.n_train_steps_per_epoch):
                mini_batch = self.replaybuffer.sample(
                    min(self.replaybuffer.current_size, self.args.batch_size))
                loss = self.agents.train(mini_batch, train_steps)
                loss_history.append(loss)
                train_steps = train_steps + 1

            if epoch % self.args.evaluate_freq == 0:
                mean_episode_reward = self.evaluate()
                eval_episode_rewards.append(mean_episode_reward)

                print(
                    "Evaluation Result (Mean Episode Reward) of Epoch {} is : {}"
                    .format(epoch, mean_episode_reward))

                plt.cla()
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('episode')
                plt.ylabel('episode reward')
                plt.savefig(
                    os.path.join(self.args.resource_dir,
                                 "episode_reward_epoch_{}.png".format(epoch)))
                '''
                plt.figure()
                plt.plot(range(len(eval_episode_rewards)), eval_episode_rewards)
                plt.xlabel('episode')
                plt.ylabel('episode reward')
                plt.savefig(os.path.join(self.args.resource_dir,"eval_episode_reward_epoch_{}.png".format(epoch)))
                '''

                np.savetxt(os.path.join(self.args.resource_dir,
                                        "episode_rewards.txt"),
                           episode_rewards,
                           fmt="%.4f")
                np.savetxt(os.path.join(self.args.resource_dir,
                                        "eval_episode_rewards.txt"),
                           eval_episode_rewards,
                           fmt="%.4f")
                np.savetxt(
                    os.path.join(self.args.resource_dir, "loss_history.txt"),
                    loss_history)
        plt.cla()
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('episode')
        plt.ylabel('episode reward')
        plt.savefig(
            os.path.join(self.args.resource_dir,
                         "episode_reward_epoch_{}.png".format(epoch)))
        '''
        plt.figure()
        plt.plot(range(len(eval_episode_rewards)), eval_episode_rewards)
        plt.xlabel('episode')
        plt.ylabel('episode reward')
        plt.savefig(os.path.join(self.args.resource_dir,"eval_episode_reward_epoch_{}.png".format(epoch)))
        '''

        np.savetxt(os.path.join(self.args.resource_dir, "episode_rewards.txt"),
                   episode_rewards,
                   fmt="%.4f")
        np.savetxt(os.path.join(self.args.resource_dir,
                                "eval_episode_rewards.txt"),
                   eval_episode_rewards,
                   fmt="%.4f")
        np.savetxt(os.path.join(self.args.resource_dir, "loss_history.txt"),
                   loss_history)
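
train() above stacks a list of per-episode dicts into a single batch dict twice, once when initializing the buffer and once per epoch. A small helper that performs the same stacking, assuming every episode dict has identical keys and per-key shapes:

import numpy as np

def stack_episodes(episodes_data):
    # Add a leading batch dimension per key, like the zeros-and-assign loops above
    return {
        key: np.stack([ep[key] for ep in episodes_data], axis=0)
        for key in episodes_data[0].keys()
    }
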
Example #26
    def train(self, transitions: int, eps_max: float = 0.5, eps_min: float = 0., buffer_size: int = 10000,
              batch_size: int = 128, shaping_coef: float = 300., progress_upd_step: int = 0,
              start_training: int = 10000, to_sink: bool = False):
        history = ReplayBuffer(size=buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "eps_max": eps_max,
            "eps_min": eps_min,
            "bins": self.num_bins,
            "to_sink": to_sink,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()

        t = tqdm(range(transitions))
        for i in t:
            eps = eps_max - (eps_max - eps_min) * i / transitions
            if random() < eps:
                action = self.env.action_space.sample()
            else:
                action = self.act(state)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1]))
            done_ = next_state[0] > 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                self.update(history.sample(batch_size))

            # soft update
            with torch.no_grad():
                for param, param_target in zip(self.dqn.parameters(), self.dqn_target.parameters()):
                    param_target.data.mul_(1 - self.tau)
                    param_target.data.add_(self.tau * param.data)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}")

                if to_sink and reward_mean >= 90 and self.evaluate_policy(episodes=100)[0] >= 90:
                    self.sink(history, start_training, eps, shaping_coef)
                    shaping_coef = 1
                    to_sink = False

        return log
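
The with torch.no_grad() loop above is a Polyak (soft) target update, the same idea the later examples wrap as soft_update. A minimal standalone sketch of that operation (the helper name and signature are illustrative, not from the source):

import torch


def soft_update(online_net: torch.nn.Module, target_net: torch.nn.Module,
                tau: float) -> None:
    """theta_target <- tau * theta_online + (1 - tau) * theta_target."""
    with torch.no_grad():
        for param, target_param in zip(online_net.parameters(),
                                       target_net.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * param.data)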
Example #27
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0  # 0.0
        self.exploration_theta = 0.1  # 0.15
        self.exploration_sigma = 0.1  # 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def act_no_noise(self, state):
        """Returns actions for given state(s) as per current policy, without exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action)  # deterministic action (no exploration noise)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
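
The DDPG class above exposes reset_episode, act, and step but no driver loop. A hedged usage sketch, assuming the task object follows a gym-like step(action) -> (next_state, reward, done) protocol; the episode budget and reward bookkeeping are illustrative, not part of the original example:

# `task` is whatever environment object the DDPG constructor above expects.
agent = DDPG(task)

for episode in range(1000):  # episode budget is an assumption
    state = agent.reset_episode()  # resets the task and the OU noise process
    episode_reward, done = 0.0, False
    while not done:
        action = agent.act(state)  # noisy action for exploration
        next_state, reward, done = task.step(action)  # assumed task API
        agent.step(action, reward, next_state, done)  # store transition, learn once buffer is warm
        state = next_state
        episode_reward += reward
    print("episode {}: reward = {:.2f}".format(episode, episode_reward))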
Example #28
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.eps = 3.0
        self.eps_decay = 0.9999

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size * 2, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size * 2, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * 2, action_size * 2,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise((1, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self,
             state,
             action,
             reward,
             next_state,
             done,
             agent_number,
             learn_iterations=5):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        #self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory
               ) > BATCH_SIZE:  #and self.timestep % LEARN_EVERY == 0:
            for _ in range(learn_iterations):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        states = torch.from_numpy(states).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Since the critic takes the actions of both agents we need to update only
        # one part of the given action
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        elif agent_number == 1:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Since the critic takes the actions of both agents we need to update only
        # one part of the given action
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        elif agent_number == 1:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update epsilon
        self.eps *= self.eps_decay
        self.eps = max(self.eps, 1)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
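
The agent_number branches in learn splice one agent's freshly predicted actions into the stored joint action before feeding the shared critic (each agent owns a 2-dimensional slice of the joint action). A tiny self-contained sketch of that splice with made-up shapes, only for illustration:

import torch

batch_size, per_agent_dim = 4, 2
joint_actions = torch.zeros(batch_size, 2 * per_agent_dim)  # stored actions of both agents
fresh_actions = torch.ones(batch_size, per_agent_dim)  # one agent's new actor output

agent_number = 0  # agent 0 owns columns [:2], agent 1 owns columns [2:]
if agent_number == 0:
    spliced = torch.cat((fresh_actions, joint_actions[:, per_agent_dim:]), dim=1)
else:
    spliced = torch.cat((joint_actions[:, :per_agent_dim], fresh_actions), dim=1)

print(spliced)  # first two columns replaced, last two kept from the stored batch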
Example #29
0
def train_v1(eps_start, eps_end, eps_decay, n_step, mem_capacity, num_episodes,
             embed_dim, iters):
    graph_generator = GraphGenerator(16, 16)
    memory = ReplayBuffer(mem_capacity)
    steps_done = 0
    gnn = Struc2Vec(embed_dim, iters)
    qnet = QNet(embed_dim)
    optimizer = optim.Adam(list(gnn.parameters()) + list(qnet.parameters()),
                           lr=0.0001,
                           weight_decay=1e-4)
    for e in range(num_episodes):
        node_labels, adj, edge_weights = graph_generator.next()
        vtx_feats = gnn(node_labels, adj, edge_weights)
        remaining_vertices = set([i for i in range(len(adj))])
        state = Variable(torch.zeros(embed_dim))
        curr_tour = []
        T = len(adj)
        rewards = []
        states = [state]

        for t in range(T):
            eps_threshold = util.get_eps_threshold(eps_start, eps_end,
                                                   eps_decay, steps_done)
            if random.random() > eps_threshold:
                # greedy action (arg_max_action is assumed to return a (vertex, value) pair)
                curr_vtx, _ = arg_max_action(qnet, vtx_feats,
                                             remaining_vertices)
            else:
                # random action
                curr_vtx = random.sample(remaining_vertices, 1)[0]

            action = vtx_feats[curr_vtx]
            # reward maintenance
            est_reward = qnet(state, curr_vtx)
            reward = get_reward(curr_tour, curr_vtx, edge_weights)
            rewards.append(reward)

            # update states
            curr_tour.append(curr_vtx)
            remaining_vertices.remove(curr_vtx)
            states.append(state + action)
            # wait till after doing the memory stuff to add the state

            # we only do these updates after n steps
            if t >= n_step:
                _, next_reward = arg_max_action(qnet, vtx_feats,
                                                remaining_vertices)
                state_tminusn = states[-n_step]  # this is a torch tensor
                action_tminusn = vtx_feats[
                    curr_tour[-n_step]]  # embedding of the vertex chosen n_step steps ago
                reward_tminusn = sum(rewards[-n_step:])
                memory.push(state_tminusn, action_tminusn, reward_tminusn,
                            state, action)

                transitions = memory.sample(batch_size)  # batch_size assumed to be a module-level constant
                # batch.state, batch.action, batch.reward, etc. are now tuples
                # TODO: this zip(*transitions) unpacking looks a bit gross
                batch = Transition(*zip(*transitions))
                state_batch = torch.cat([s.unsqueeze(0) for s in batch.state],
                                        dim=0)
                action_batch = torch.cat(
                    [a.unsqueeze(0) for a in batch.action], dim=0)
                reward_batch = torch.cat(batch.reward)
                newstate_batch = torch.cat(
                    [ns.unsqueeze(0) for ns in batch.new_state], dim=0)
                max_action_batch = torch.cat(
                    [ma.unsqueeze(0) for ma in batch.max_action], dim=0)

                # TODO: make qnet allow batch
                # does the experience replay memory contain state/action/reward/next_state
                # from only the current episode's graph? Or can any graph seen before be
                # in the memory?
                # The argmax action is the thing taken at time t-n_step right?
                oldstate_action_value = qnet(state_batch, action_batch)
                newstate_action_value = qnet(newstate_batch, max_action_batch)
                expected_sa_values = reward_batch + gamma * newstate_action_value  # gamma assumed to be a global discount factor
                loss = F.mse_loss(oldstate_action_value, expected_sa_values)

                optimizer.zero_grad()
                loss.backward()
                # optionally clip gradients here
                optimizer.step()

            state += action
            steps_done += 1
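
The training function above relies on a Transition namedtuple and a ReplayBuffer with push/sample that are not shown. A minimal sketch of what those assumed helpers could look like, with field names inferred from the batch.state / batch.new_state / batch.max_action accesses in the loop:

import random
from collections import deque, namedtuple

# Field names inferred from the batch.* accesses in the training loop above.
Transition = namedtuple(
    "Transition", ["state", "action", "reward", "new_state", "max_action"])


class ReplayBuffer:
    """Fixed-size FIFO buffer of Transition tuples (assumed interface)."""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))

    def __len__(self):
        return len(self.buffer)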
Example #30
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 drop_p=0.3,
                 with_dueling=False,
                 isDDQN=False):
        """Initialize an Agent object.
        
        Params  
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (array): Hidden number of nodes in each layer
            drop_p (float [0-1]) : Probability of dropping nodes (implementation of dropout)
            with_dueling (boolean) : If true, network is dueling network, otherwise false.
            isDDQN (boolean) : If true, double dqn in implemented, otherwise false.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       hidden_layers=hidden_layers,
                                       drop_p=drop_p,
                                       dueling=with_dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        hidden_layers=hidden_layers,
                                        drop_p=drop_p,
                                        dueling=with_dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Parameter instance of DDQN.
        self.isDDQN = isDDQN

    def step(self, state, action, reward, next_state, done):
        """Takes a step and with each time step sample from buffer and learn"""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if self.isDDQN:
            # Double DQN: select the greedy action for next_states with the local (online) network
            best_local_actions = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            double_dqn_targets = self.qnetwork_target(next_states).detach()
            # Evaluate the target network at the locally selected optimal action
            Q_targets_next = torch.gather(double_dqn_targets, 1,
                                          best_local_actions)
        else:
            # Get max predicted Q values (for next states) from target model (without ddqn)
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
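
The agent above leaves epsilon decay and episode bookkeeping to the caller. A hedged driver sketch, assuming a classic gym environment with the old 4-tuple step API; the environment choice, episode count, and epsilon schedule are illustrative, not from the source:

import gym  # assumed gym-style environment

env = gym.make("CartPole-v1")  # illustrative choice: 4-dim state, 2 actions
agent = Agent(state_size=4, action_size=2, seed=0, isDDQN=True)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995  # illustrative schedule
for episode in range(1, 1001):  # episode budget is an assumption
    state = env.reset()
    score, done = 0.0, False
    while not done:
        action = agent.act(state, eps)  # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)  # old gym 4-tuple API
        agent.step(state, action, reward, next_state, done)  # store + learn every UPDATE_EVERY
        state = next_state
        score += reward
    eps = max(eps_end, eps_decay * eps)  # decay exploration
    print("episode {}: score = {:.1f} | eps = {:.3f}".format(episode, score, eps))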