Example #1
def test_ReplayBuffer(self):
    mem = ReplayBuffer(2)
    mem.push(1)
    mem.push(2)
    [sample] = mem.sample(2)
    self.assertEqual(sorted(sample), [1, 2])
    mem.push(3)
    [sample] = mem.sample(2)
    self.assertEqual(sorted(sample), [2, 3])
    mem.push(4)
    [sample] = mem.sample(2)
    self.assertEqual(sorted(sample), [3, 4])
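
The ReplayBuffer under test is not shown. Below is a minimal sketch that would satisfy these assertions, assuming the interface is just push/sample over a fixed-capacity ring buffer; the real class may differ.

import random
from collections import deque

class ReplayBuffer:
    """Fixed-capacity buffer: once full, the oldest item is evicted on push."""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, item):
        self.buffer.append(item)

    def sample(self, batch_size, num_batches=1):
        # return `num_batches` batches, each with `batch_size` items drawn without replacement
        return [random.sample(list(self.buffer), batch_size) for _ in range(num_batches)]

    def __len__(self):
        return len(self.buffer)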
Example #2
class MaddpgAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):

        self.agents = [
            Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=random_seed),
            Agent(state_size=state_size,
                  action_size=action_size,
                  random_seed=random_seed)
        ]

        self.seed = random.seed(random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # self.soft_update(self.critic_local, self.critic_target, 1)
        # self.soft_update(self.actor_local, self.actor_target, 1)

    def act(self, states, add_noise=True):
        actions = [
            agent.act(state, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):

        # Shared replay buffer
        for i, _ in enumerate(self.agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:

            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def save_checkpont(self):
        for i, agent in enumerate(self.agents):
            agent.save_checkpont(i)
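
A hedged usage sketch for this wrapper. The two-agent environment below is a random stand-in so the loop runs; a real two-agent task would replace it, and the Agent/ReplayBuffer classes plus the BUFFER_SIZE/BATCH_SIZE/UPDATE_EVERY/GAMMA constants are assumed to be defined in the surrounding module. State and action sizes, episode count, and step limit are illustrative.

import numpy as np

class DummyTwoAgentEnv:
    """Stand-in environment: 2 agents, 24-dim states, 2-dim continuous actions."""
    def reset(self):
        return np.random.randn(2, 24)

    def step(self, actions):
        next_states = np.random.randn(2, 24)
        rewards = np.zeros(2)
        dones = [False, False]
        return next_states, rewards, dones, {}

env = DummyTwoAgentEnv()
agent = MaddpgAgent(state_size=24, action_size=2, random_seed=0)
for episode in range(10):      # illustrative episode count
    states = env.reset()
    agent.reset()
    for t in range(100):       # illustrative step limit
        actions = agent.act(states)
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        if any(dones):
            break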
Example #3
def learn(env,
          policy,
          q_func,
          optimizer_spec,
          session,
          stopping_criterion=None,
          replay_buffer_size=1000000,
          batch_size=32,
          gamma=0.99,
          learning_starts=50000,
          learning_freq=4,
          frame_history_len=4,
          target_update_freq=10000,
          grad_norm_clipping=10,
          lr_multiplier=1.0):
    """Run Deep Q-learning algorithm.
    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    session: tf.Session
        tensorflow session to use.
    policy: object
        exploration policy used to pick actions; it must provide
        select_action(q_func, observation, obs_placeholder) and expose a
        current_eps attribute (used for logging below).
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_shape = env.observation_space.shape
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_shape = (img_h, img_w, frame_history_len * img_c)
    num_actions = env.action_space.n

    # set up placeholders
    # placeholder for current observation (or state)
    obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for current action
    act_t_ph = tf.placeholder(tf.int32, [None])
    # placeholder for current reward
    rew_t_ph = tf.placeholder(tf.float32, [None])
    # placeholder for next observation (or state)
    obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for end of episode mask
    # this value is 1 if the next state corresponds to the end of an episode,
    # in which case there is no Q-value at the next state; at the end of an
    # episode, only the current state reward contributes to the target, not the
    # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
    done_mask_ph = tf.placeholder(tf.float32, [None])

    # casting to float on GPU ensures lower data transfer times.
    obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
    obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0

    # Declare variables for logging
    t_log = []
    mean_reward_log = []
    best_mean_log = []
    episodes_log = []
    exploration_log = []
    learning_rate_log = []

    # Create a network to produce the current q values for each possible action
    current_q_func = q_func(obs_t_float,
                            num_actions,
                            scope="q_func",
                            reuse=False)  # Current Q-Value Function
    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope='q_func')

    # Create the target Q-function network
    target_q_func = q_func(obs_tp1_float,
                           num_actions,
                           scope="target_q_func",
                           reuse=False)  # Target Q-Value Function
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_q_func')

    # Encode actions as a one-hot vector, based on the action that was chosen
    act_t = tf.one_hot(act_t_ph,
                       depth=num_actions,
                       dtype=tf.float32,
                       name="action_one_hot")
    q_act_t = tf.reduce_sum(act_t * current_q_func, axis=1)

    # Compute the Bellman target y and the squared TD error; the done mask zeroes
    # out the next-state value at episode ends (see done_mask_ph above)
    y = rew_t_ph + (1.0 - done_mask_ph) * gamma * tf.reduce_max(target_q_func, axis=1)
    total_error = tf.square(y - q_act_t)  # (reward + gamma * max_a' Q_target(s', a') - Q(s, a))**2

    # construct optimization op (with gradient clipping)
    learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = optimizer_spec.constructor(learning_rate=learning_rate,
                                           **optimizer_spec.kwargs)
    train_fn = minimize_and_clip(optimizer,
                                 total_error,
                                 var_list=q_func_vars,
                                 clip_val=grad_norm_clipping)

    # update_target_fn will be called periodically to copy Q network to target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    update_target_fn = tf.group(*update_target_fn)

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_EVERY_N_STEPS = 200000

    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator.
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.

        # Store last_obs into replay buffer
        idx = replay_buffer.store_frame(last_obs)

        if t == 0:
            act, reward, done = env.action_space.sample(), 0, False

        # Choose action
        if not model_initialized:
            # choose random action
            act = env.action_space.sample()
        else:
            input_batch = replay_buffer.encode_recent_observation()
            act = policy.select_action(current_q_func, input_batch, obs_t_ph)

        # Step simulator forward one step
        last_obs, reward, done, info = env.step(act)
        replay_buffer.store_effect(
            idx, act, reward,
            done)  # Store action taken after last_obs and corresponding reward

        if done:  # the latest transition ended the episode; it has already been stored
            last_obs = env.reset()  # Reset observation
            done = False

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # 3.b: initialize the model if it has not been initialized yet; to do
            # that, call
            #    initialize_interdependent_variables(session, tf.global_variables(), {
            #        obs_t_ph: obs_t_batch,
            #        obs_tp1_ph: obs_tp1_batch,
            #    })
            # where obs_t_batch and obs_tp1_batch are the batches of observations at
            # the current and next time step. The boolean variable model_initialized
            # indicates whether or not the model has been initialized.
            # Remember that you have to update the target network too (see 3.d)!
            # 3.c: train the model. To do this, you'll need to use the train_fn and
            # total_error ops that were created earlier: total_error is what you
            # created to compute the total Bellman error in a batch, and train_fn
            # will actually perform a gradient step and update the network parameters
            # to reduce total_error. When calling session.run on these you'll need to
            # populate the following placeholders:
            # obs_t_ph
            # act_t_ph
            # rew_t_ph
            # obs_tp1_ph
            # done_mask_ph
            # (this is needed for computing total_error)
            # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t)
            # (this is needed by the optimizer to choose the learning rate)
            # 3.d: periodically update the target network by calling
            # session.run(update_target_fn)
            # you should update every target_update_freq steps, and you may find the
            # variable num_param_updates useful for this (it was initialized to 0)
            #####

            # 3.a Sample a batch of transitions
            obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = replay_buffer.sample(
                batch_size)

            # 3.b Initialize model if not initialized yet
            if not model_initialized:
                initialize_interdependent_variables(
                    session, tf.global_variables(), {
                        obs_t_ph: obs_t_batch,
                        obs_tp1_ph: obs_tp1_batch,
                    })
                session.run(update_target_fn)
                model_initialized = True

            # 3.c Train the model using train_fn and total_error
            session.run(
                train_fn, {
                    obs_t_ph: obs_t_batch,
                    act_t_ph: act_batch,
                    rew_t_ph: rew_batch,
                    obs_tp1_ph: obs_tp1_batch,
                    done_mask_ph: done_mask,
                    learning_rate: optimizer_spec.lr_schedule.value(t)
                })

            # 3.d Update target network every target_update_freq steps
            if t % target_update_freq == 0:
                session.run(update_target_fn)
                num_param_updates += 1
            #####

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            print("Timestep %d" % (t, ))
            t_log.append(t)
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            mean_reward_log.append(mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            best_mean_log.append(best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            episodes_log.append(len(episode_rewards))
            print("exploration %f" % policy.current_eps)
            exploration_log.append(policy.current_eps)
            print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
            learning_rate_log.append(optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()

        if t % SAVE_EVERY_N_STEPS == 0 and model_initialized:
            training_log = ({
                't_log': t_log,
                'mean_reward_log': mean_reward_log,
                'best_mean_log': best_mean_log,
                'episodes_log': episodes_log,
                'exploration_log': exploration_log,
                'learning_rate_log': learning_rate_log
            })
            output_file_name = 'ram_lr' + str(lr_multiplier) + '_' + str(
                t) + '_data.pkl'
            with open(output_file_name, 'wb') as f:
                pickle.dump(training_log, f)
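
A hedged sketch of how the optimizer_spec argument might be constructed. The docstring and the calls to optimizer_spec.constructor, optimizer_spec.kwargs, and optimizer_spec.lr_schedule.value(t) above suggest a simple namedtuple; ConstantSchedule is an illustrative stand-in for the project's own schedule class, and the hyperparameter values are arbitrary.

import collections
import tensorflow as tf

OptimizerSpec = collections.namedtuple("OptimizerSpec",
                                       ["constructor", "kwargs", "lr_schedule"])

class ConstantSchedule:
    """Illustrative schedule: the same learning rate at every step t."""
    def __init__(self, value):
        self._value = value

    def value(self, t):
        return self._value

optimizer_spec = OptimizerSpec(
    constructor=tf.train.AdamOptimizer,  # TF1-style optimizer, matching the tf.placeholder graph above
    kwargs=dict(epsilon=1e-4),
    lr_schedule=ConstantSchedule(1e-4),
)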
Example #4
def lunarworker(wid):
    import tensorflow as tf
    import numpy as np
    import gym
    import time
    import os

    from distagent import DistAgent
    from memory import ReplayBuffer
    from util import Linear, scale, RewMonitor, SkipEnv, StackEnv

    gpus = tf.config.experimental.get_visible_devices("GPU")

    # Select single gpu depending on wid
    total_gpus = 2
    gpu_nr = wid % total_gpus
    tf.config.set_visible_devices(gpus[gpu_nr], 'GPU')

    # Restrict memory growth so multiple TF processes can share one GPU
    tf.config.experimental.set_memory_growth(gpus[gpu_nr], True)

    # Train parameters
    N = int(8e6)
    eps = Linear(startval=0.1, endval=0.01, exploresteps=int(200e3))
    gamma = 0.99
    updatefreq = 4
    targetfreq = 1000
    savefreq = 80000

    # Setup
    env = gym.make("LunarLander-v2")
    env = RewMonitor(env)
    env = SkipEnv(env, skip=4)
    # env = StackEnv(env, n_frames=4)
    action_len = env.action_space.n
    agent = DistAgent(action_len,
                      dense=16,
                      supportsize=29,
                      vmin=-7.0,
                      vmax=7.0)
    mem = ReplayBuffer(size=int(20e3), batchsize=32)

    # Prefill
    tf.print("Collecting history...")
    prefill_end = int(10e3)
    state = env.reset()
    buff = []
    for t in range(1, prefill_end + 1):
        action = env.action_space.sample()
        endstate, rew, done, _ = env.step(action)
        data = (state, action, scale(rew), gamma, endstate, float(done))
        buff.append(data)
        if done:
            state = env.reset()
        else:
            state = endstate
        if t % 10000 == 0:
            tf.print(f"Collected {t} samples.")
    tf.print("Done.")

    tf.print("Storing history...")
    for data in buff:
        mem.add(data)
    tf.print("Done.")

    # Warm up
    states, _, _, _, _, _ = mem.sample()
    agent.probvalues(states)
    agent.t_probvalues(states)
    agent.update_target()

    # Initial dispatch
    tottime = time.time()

    # Training loop
    tf.print(f"Worker {wid} learning...")
    state = env.reset()
    episode_rewards = []
    buff = []
    for t in range(1, N + 1):
        t_eps = tf.constant(eps(t), dtype=tf.float32)
        action = agent.eps_greedy_action(
            state=np.reshape(state, [1, 8]).astype(np.float32),
            epsval=t_eps,
        )[0].numpy()
        endstate, rew, done, info = env.step(action)
        data = (state, action, scale(rew), gamma, endstate, float(done))
        buff.append(data)
        if info["Game Over"]:
            score = info["Episode Score"]
            episode_rewards.append(score)
            state = env.reset()
            if len(episode_rewards) % 100 == 0:
                tmptime = time.time()
                msit = (tmptime - tottime) / t * 1000
                ma100 = np.mean(episode_rewards[-100:])
                epstr = (f"Episode: {len(episode_rewards)}, " +
                         f"Step: {t}, " + f"MA100: {ma100}, " +
                         f"AvgSpeed: {msit:4.2f} ms/it")
                tf.print(epstr)
        else:
            state = endstate

        if t % updatefreq == 0:
            for data in buff:
                mem.add(data)
            buff = []
            (states, actions, drews, gexps, endstates, dones) = mem.sample()
            agent.train(states, actions, drews, gexps, endstates, dones)

        if t % targetfreq == 0:
            agent.update_target()

        if t % savefreq == 0:
            dir_str = f"lunarmodels/step{t}/"
            os.makedirs(dir_str, exist_ok=True)
            file_str = dir_str + "model-id-" + f"{wid}" + ".h5"
            agent.save(file_str)

    env.close()
    tmptime = time.time()
    tottime = tmptime - tottime
    msit = tottime / N * 1000
    tf.print(f"Learning done in {tottime:6.0f}s using {msit:4.2f} ms/it.")
    tf.print("Done.")
Example #5
class Maddpg():
    """MADDPG Agent : Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize a MADDPG Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        
        super(Maddpg, self).__init__()
        
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        
        # Instantiate multiple agents
        self.agents = [ Agent(state_size,action_size, random_seed, num_agents) 
                       for i in range(num_agents) ]
        
        # Instantiate the replay memory buffer (shared between agents)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
                  
    def reset(self):
        """Reset all the agents"""
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise):
        """Return action to perform for each agents (per policy)"""        
        return [ agent.act(state, noise) for agent, state in zip(self.agents, states) ]
                
    
    def step(self, states, actions, rewards, next_states, dones, num_current_episode):
        """ # Save experience in replay memory, and use random sample from buffer to learn"""
 
        self.memory.add(encode(states), 
                        encode(actions), 
                        rewards,
                        encode(next_states),
                        dones)

        # If enough samples in the replay memory and if it is time to update
        if (len(self.memory) > BATCH_SIZE) and (num_current_episode % UPDATE_EVERY_NB_EPISODE == 0):
            
            # Note: this code only expects 2 agents
            assert len(self.agents) == 2
            
            # Learn several times in a row within the same episode
            for i in range(MULTIPLE_LEARN_PER_UPDATE):
                # Sample a batch of experience from the replay buffer 
                experiences = self.memory.sample()   
                # Update Agent #0
                self.maddpg_learn(experiences, own_idx=0, other_idx=1)
                # Sample another batch of experience from the replay buffer 
                experiences = self.memory.sample()   
                # Update Agent #1
                self.maddpg_learn(experiences, own_idx=1, other_idx=0)
                
    
    def maddpg_learn(self, experiences, own_idx, other_idx, gamma=GAMMA):
        """
        Update the policy of the MADDPG "own" agent. Each actor only has access to its own
        agent's information, whereas the critics have access to all agents' information.
        
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(states) -> action
            critic_target(all_states, all_actions) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            own_idx (int) : index of the own agent to update in self.agents
            other_idx (int) : index of the other agent to update in self.agents
            gamma (float): discount factor
        """
        
        states, actions, rewards, next_states, dones = experiences
               
        # Filter out the agent OWN states, actions and next_states batch
        own_states =  decode(self.state_size, self.num_agents, own_idx, states)
        own_actions = decode(self.action_size, self.num_agents, own_idx, actions)
        own_next_states = decode(self.state_size, self.num_agents, own_idx, next_states) 
                
        # Filter out the OTHER agent states, actions and next_states batch
        other_states =  decode(self.state_size, self.num_agents, other_idx, states)
        other_actions = decode(self.action_size, self.num_agents, other_idx, actions)
        other_next_states = decode(self.state_size, self.num_agents, other_idx, next_states)
        
        # Concatenate both agents' information (own agent first, other agent in second position)
        all_states = torch.cat((own_states, other_states), dim=1).to(device)
        all_actions = torch.cat((own_actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((own_next_states, other_next_states), dim=1).to(device)
   
        agent = self.agents[own_idx]
        
            
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models        
        all_next_actions = torch.cat((agent.actor_target(own_next_states),
                                      agent.actor_target(other_next_states)),
                                     dim=1).to(device)
        Q_targets_next = agent.critic_target(all_next_states, all_next_actions)
        
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        Q_expected = agent.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        if CLIP_CRITIC_GRADIENT:
            torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        all_actions_pred = torch.cat((agent.actor_local(own_states), agent.actor_local(other_states).detach()),
                                     dim = 1).to(device)      
        actor_loss = -agent.critic_local(all_states, all_actions_pred).mean()
        
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()        
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, TAU)                   
    
    
                        
    def checkpoints(self):
        """Save checkpoints for all Agents"""
        for idx, agent in enumerate(self.agents):
            actor_local_filename = 'models/checkpoint_actor_local_' + str(idx) + '.pth'
            critic_local_filename = 'models/checkpoint_critic_local_' + str(idx) + '.pth'           
            actor_target_filename = 'models/checkpoint_actor_target_' + str(idx) + '.pth'
            critic_target_filename = 'models/checkpoint_critic_target_' + str(idx) + '.pth'            
            torch.save(agent.actor_local.state_dict(), actor_local_filename) 
            torch.save(agent.critic_local.state_dict(), critic_local_filename)             
            torch.save(agent.actor_target.state_dict(), actor_target_filename) 
            torch.save(agent.critic_target.state_dict(), critic_target_filename)
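
The Maddpg class above relies on encode and decode helpers that are not shown. A hedged sketch of what they might look like, assuming encode flattens the per-agent rows into a single vector before storage and decode slices one agent's chunk back out of a batched tensor; the project's real helpers may differ.

import numpy as np

def encode(per_agent_array):
    """Flatten a (num_agents, size) array into a single 1-D vector for storage."""
    return np.asarray(per_agent_array).reshape(-1)

def decode(size, num_agents, agent_idx, batched_tensor):
    """Slice agent `agent_idx`'s `size`-wide chunk out of a (batch, num_agents * size) tensor.
    `num_agents` is kept only to match the call signature used above."""
    start = agent_idx * size
    return batched_tensor[:, start:start + size]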
Example #6
class DQN:
    def __init__(self,
                 n_states,
                 n_actions,
                 gamma=0.99,
                 epsilon_start=0.9,
                 epsilon_end=0.05,
                 epsilon_decay=200,
                 memory_capacity=10000,
                 policy_lr=0.01,
                 batch_size=128,
                 device="cpu"):

        self.n_actions = n_actions  # total number of actions
        self.device = device  # device: cpu, gpu, etc.
        self.gamma = gamma  # reward discount factor
        # parameters of the epsilon-greedy policy
        self.actions_count = 0  # step counter used for epsilon decay
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = MLP(n_states, n_actions).to(self.device)
        self.target_net = MLP(n_states, n_actions).to(self.device)
        # target_net starts as an exact copy of policy_net's parameters
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # disable BatchNormalization and Dropout
        # note the difference between parameters() and state_dict(): the former has requires_grad=True
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def choose_action(self, state, train=True):
        '''Choose an action (epsilon-greedy during training, greedy otherwise).
        '''
        if train:
            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                math.exp(-1. * self.actions_count / self.epsilon_decay)
            self.actions_count += 1
            if random.random() > self.epsilon:
                with torch.no_grad():
                    # convert to a tensor before feeding the network; the state elements are originally float64
                    # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
                    state = torch.tensor([state],
                                         device=self.device,
                                         dtype=torch.float32)
                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                    q_value = self.policy_net(state)
                    # tensor.max(1) returns each row's maximum value and its index,
                    # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
                    # so tensor.max(1)[1] is the index of the maximum value, i.e. the action
                    action = q_value.max(1)[1].item()
            else:
                action = random.randrange(self.n_actions)
            return action
        else:
            with torch.no_grad():  # no gradients are tracked here
                # convert to a tensor before feeding the network; the state elements are originally float64
                # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
                state = torch.tensor(
                    [state], device='cpu', dtype=torch.float32
                )  # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                q_value = self.target_net(state)
                # tensor.max(1) returns each row's maximum value and its index,
                # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
                # so tensor.max(1)[1] is the index of the maximum value, i.e. the action
                action = q_value.max(1)[1].item()
            return action

    def update(self):

        if len(self.memory) < self.batch_size:
            return
        # sample a random batch of transitions from memory
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
        '''convert to tensors,
        e.g. tensor([[-4.5543e-02, -2.3910e-01,  1.8344e-02,  2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02,  2.3400e-01]])'''
        state_batch = torch.tensor(state_batch,
                                   device=self.device,
                                   dtype=torch.float)
        action_batch = torch.tensor(action_batch,
                                    device=self.device).unsqueeze(
                                        1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(
            reward_batch, device=self.device,
            dtype=torch.float)  # e.g. tensor([1., 1.,...,1])
        next_state_batch = torch.tensor(next_state_batch,
                                        device=self.device,
                                        dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch),
                                  device=self.device).unsqueeze(
                                      1)  # convert bool to float, then to a tensor
        '''compute Q(s_t, a) for the current (s_t, a) pairs'''
        '''torch.gather: for a=torch.Tensor([[1,2],[3,4]]), a.gather(1, torch.Tensor([[0],[1]])) = torch.Tensor([[1],[3]])'''
        q_values = self.policy_net(state_batch).gather(
            dim=1, index=action_batch)  # equivalent to calling self.forward
        # compute V(s_{t+1}) for all next states, i.e. the maximum Q-value under the target net
        next_state_values = self.target_net(next_state_batch).max(
            1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
        # compute expected_q_value
        # for terminal states the done flag is 1, so expected_q_value reduces to the reward
        expected_q_values = reward_batch + self.gamma * \
            next_state_values * (1 - done_batch.squeeze(1))
        # self.loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))  # Huber loss
        self.loss = nn.MSELoss()(q_values,
                                 expected_q_values.unsqueeze(1))  # mean squared error loss
        # optimize the model
        self.optimizer.zero_grad(
        )  # zero_grad clears the old gradients from the previous step
        # loss.backward() uses backpropagation to compute the gradient of the loss w.r.t. all parameters that require gradients
        self.loss.backward()
        for param in self.policy_net.parameters():  # clip to avoid exploding gradients
            param.grad.data.clamp_(-1, 1)

        self.optimizer.step()  # update the model

    def save_model(self, path):
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        self.target_net.load_state_dict(torch.load(path))
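
A hedged usage sketch for this agent on CartPole-v0, assuming the surrounding module defines MLP and a ReplayBuffer whose transitions are stored with a push(state, action, reward, next_state, done) method; the storage method name and the episode budget are assumptions.

import gym

env = gym.make('CartPole-v0')
agent = DQN(n_states=env.observation_space.shape[0],
            n_actions=env.action_space.n)
for episode in range(200):                 # illustrative episode budget
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, next_state, done)  # assumed storage API
        agent.update()
        state = next_state
env.close()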
Example #7
class NECAgent:
    """
    NEC agent
    """
    def __init__(self, config):
        self.nec_net = NEC(config).to(config['device'])
        self.train_eps = config['train_eps']
        self.eval_eps = config['eval_eps']
        self.num_actions = config['num_actions']
        self.replay_buffer = ReplayBuffer(config['observation_shape'],
                                          config['replay_buffer_size'])
        self.batch_size = config['batch_size']
        self.discount = config['discount']
        self.n_step_horizon = config['horizon']
        self.episode = 0
        self.logger = ScoreLogger(config['env_name'], config['exp_name'])
        self.env_name = config['env_name']
        self.exp_name = config['exp_name']
        self.device = config['device']
        self.train()

        # make sure model is on appropriate device at this point before constructing optimizer
        self.optimizer = RMSprop(self.nec_net.parameters(),
                                 lr=config['learning_rate'],
                                 alpha=config['rmsprop_alpha'],
                                 eps=config['rmsprop_epsilon'])
        self.loss_fn = MSELoss()

    def train(self):
        self.training = True
        self.nec_net.train()

    def eval(self):
        self.training = False
        self.nec_net.eval()

    def new_episode(self):
        # trackers for computing N-step returns and updating replay and dnd memories at the end of episode
        self.observations, self.keys, self.actions, self.values, self.rewards = [], [], [], [], []
        self.episode += 1

    def set_epsilon(self, eps):
        self.train_eps = eps

    def step(self, obs):
        q_values, key = self.nec_net.lookup(obs)

        eps = self.train_eps if self.training else self.eval_eps

        # epsilon-greedy action selection
        action = np.random.choice(np.arange(
            self.num_actions)) if np.random.rand() < eps else _argmax(q_values)

        # update trackers
        if self.training:
            self.actions.append(action)
            self.observations.append(obs)
            self.keys.append(key)
            self.values.append(np.max(q_values))

        return action

    def update(self, consequence):
        """
        Called from main training loop to inform agent of consequence of last action including
        reward and if the episode terminated
        """
        reward, done = consequence

        if self.env_name.startswith("CartPole"):
            reward = reward if not done else -reward

        # update reward tracker
        self.rewards.append(reward)

        if done:
            episode_length = len(self.actions)

            # compute N-step returns in reverse order
            returns, n_step_returns = [None] * (episode_length +
                                                1), [None] * episode_length
            returns[episode_length] = 0

            for t in range(episode_length - 1, -1, -1):
                returns[t] = self.rewards[t] + self.discount * returns[t + 1]
                if episode_length - t > self.n_step_horizon:
                    n_step_returns[t] = returns[
                        t] + self.discount**self.n_step_horizon * (
                            self.values[t + self.n_step_horizon] -
                            returns[t + self.n_step_horizon])
                else:  # use on-policy monte carlo returns when below horizon
                    n_step_returns[t] = returns[t]

            self.keys, n_step_returns = torch.stack(self.keys), np.array(
                n_step_returns, dtype=np.float32)  # for fancy indexing

            # batch update of replay memory
            self.replay_buffer.append_batch(
                np.stack(self.observations),
                np.asarray(self.actions, dtype=np.int64), n_step_returns)

            # batch update of episodic memories
            unique_actions = np.unique(self.actions)
            for action in unique_actions:
                action_idxs = np.nonzero(np.asarray(self.actions) == action)[0]
                self.nec_net.update_memory(action, self.keys[action_idxs],
                                           n_step_returns[action_idxs])

            # save/log metrics for plotting
            solved = self.logger.add_score(sum(self.rewards), self.episode)
            if solved:
                path = f'{os.getcwd()}/cartpole/trained_agents/nec_{self.exp_name}.pth'
                torch.save(self.nec_net.state_dict(), path)
                return True

        return False

    def optimize(self):
        """
        Here, we sample from the replay buffer and train the NEC model end-to-end with backprop
        """
        if self.replay_buffer.size() < self.batch_size:
            return

        observations, actions, returns = self.replay_buffer.sample(
            self.batch_size)
        self.optimizer.zero_grad()
        q_values = self.nec_net(observations.to(self.device))[range(
            self.batch_size), actions]  # pick q_values for chosen actions
        loss = self.loss_fn(q_values, returns.to(self.device))
        loss.backward()
        self.optimizer.step()

    def get_q_values(self, observations, actions):
        """
        Computes q_values for observation, action pairs passed in.

        Used for testing
        """
        with torch.no_grad():
            self.eval()
            observations = torch.from_numpy(observations)
            q_values = self.nec_net(observations)[range(len(actions)), actions]

            return q_values.numpy()
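
A hedged sketch of the outer loop this agent appears to expect, inferred from its new_episode/step/update/optimize methods; make_config is a hypothetical helper that builds the same config dict passed to NECAgent above, and the episode budget is illustrative.

import gym

config = make_config()                 # hypothetical helper producing the config dict used above
env = gym.make(config['env_name'])
agent = NECAgent(config)
for _ in range(1000):                  # illustrative episode budget
    obs = env.reset()
    done = False
    agent.new_episode()
    while not done:
        action = agent.step(obs)
        obs, reward, done, _ = env.step(action)
        solved = agent.update((reward, done))
        agent.optimize()
    if solved:
        break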
Example #8
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 mnoise=True,
                 split_state=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            mnoise (bool): if True, use a (2, action_size) noise process shared by two agents
            split_state (bool): if True, store each agent's transition in the replay buffer separately
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.mnoise = mnoise
        self.split_state = split_state

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # initialize targets same as original networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        # Noise process
        if self.mnoise:
            self.noise = OUNoise((2, action_size), random_seed)
        else:
            self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        if self.split_state:
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                self.memory.add(state, action, reward, next_state, done)
        else:
            self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        """
        Copy network parameters from source to target
        Inputs:
            target (torch.nn.Module): Net to copy parameters to
            source (torch.nn.Module): Net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
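
The Agent above depends on an OUNoise process that is not shown. A hedged sketch of a typical Ornstein-Uhlenbeck noise class matching the (size, seed) constructor used here; the mu/theta/sigma defaults are common choices, not taken from the original.

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state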
Example #9
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, number_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            number_agents (int): number of agents
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.number_agents = number_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise processes
        self.noise = OUNoise((number_agents, action_size), random_seed)
        #self.noise = GaussianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1)
        #self.noise = GeometricBrownianNoise(size=[number_agents,action_size], seed = 0,sigma=2e-1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experiences in replay memory, and use random sample from buffer to learn."""

        # We save experience tuples in the memory for each agent.
        for i in range(self.number_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn, if enough samples are available in memory (threshold: BATCH_SIZE); perform UPDATE_RATE learning passes per step
        if len(self.memory) > BATCH_SIZE:
            for _ in range(UPDATE_RATE):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

#     def act(self, states, add_noise=True):
#         """Returns actions for given state as per current policy."""
#                                                                   # The code has been adapted to implement batch normalization.
#         actions = np.zeros((self.number_agents, self.action_size))
#         self.actor_local.eval()
#         with torch.no_grad():
#             for agent_number, state in enumerate(states):
#                 state = torch.from_numpy(state).float().unsqueeze(0).to(device)   # The code has been adapted to implement batch normalization.
#                 action = self.actor_local(state).cpu().data.numpy()
#                 actions[agent_number, :] = action
#         self.actor_local.train()
#         if add_noise:
#             actions += self.noise.sample()
#         return np.clip(actions, -1, 1)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.number_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_number, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_number, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
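
A hedged sketch of the ReplayBuffer these DDPG-style agents construct with (action_size, buffer_size, batch_size, seed); the namedtuple fields and the torch conversions follow a common pattern, which the project's real buffer may or may not match.

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store one transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random minibatch as float tensors (CPU; move to a device as needed)."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)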
Example #10
class DDPG_Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 actor_hidden=[400, 300],
                 critic_hidden=[400, 300],
                 id=0):
        super(DDPG_Agent, self).__init__()

        self.actor_local = Actor(state_size,
                                 action_size,
                                 random_seed,
                                 hidden_layer_param=actor_hidden).to(DEVICE)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  random_seed,
                                  hidden_layer_param=actor_hidden).to(DEVICE)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   random_seed,
                                   hidden_layer_param=critic_hidden).to(DEVICE)
        self.critic_target = Critic(
            state_size,
            action_size,
            random_seed,
            hidden_layer_param=critic_hidden).to(DEVICE)

        self.actor_opt = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        self.critic_opt = optim.Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC)

        self.memory = ReplayBuffer(action_size, random_seed)

        self.seed = random.seed(random_seed)
        self.id = id
        print(critic_hidden)
        print("")
        print("--- Agent {} Params ---".format(self.id))
        print("Going to train on {}".format(DEVICE))
        print("Learning Rate:: Actor: {} | Critic: {}".format(
            LR_ACTOR, LR_CRITIC))
        print(
            "Replay Buffer:: Buffer Size: {} | Sampled Batch size: {}".format(
                BUFFER_SIZE, BATCH_SIZE))
        print("")
        print("Actor paramaters:: Input: {} | Hidden Layers: {} | Output: {}".
              format(state_size, actor_hidden, action_size))
        print("Critic paramaters:: Input: {} | Hidden Layers: {} | Output: {}".
              format(state_size,
                     [critic_hidden[0] + action_size, *critic_hidden[1:]], 1))
        print(self.actor_local)
        print(self.critic_local)
        print("")
        print("")

    # def act(self, state):
    #     state = torch.from_numpy(state).float().to(DEVICE)

    #     self.actor_local.eval()
    #     with torch.no_grad():
    #         actions = self.actor_local(state).cpu().data.numpy()
    #     self.actor_local.train()

    #     return actions

    def act(self, obs, noise=0.0):
        obs = obs.to(DEVICE)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(obs)  #+ noise*self.noise.noise()

        return action

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

    def learn(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        # ---                   Teach Critic (with TD)              --- #
        recommended_actions = self.actor_target(next_states)
        Q_nexts = self.critic_target(next_states, recommended_actions)
        Q_targets = (rewards + GAMMA * Q_nexts * (1 - dones)
                     )  # TD target: observed reward plus the bootstrapped next-state value
        Q_expected = self.critic_local(
            states, actions
        )  # the local critic's current estimate for this state-action pair
        critic_loss = CRITERION(Q_targets, Q_expected)

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # ---                   Teach Actor                          --- #
        next_actions = self.actor_local(states)
        # Here we get the value of each state-actions.
        # This will be backpropagated to the weights that produced the action in the actor network.
        # Large values will make weights stronger, smaller values (less expected return for that state-action) weaker
        actor_loss = -self.critic_local(states, next_actions).mean()

        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # Mix model parameters in both Actor and Critic #
        self.soft_update(self.actor_local, self.actor_target)
        self.soft_update(self.critic_local, self.critic_target)

    def soft_update(self, local, target):
        """Soft update model parameters.
            θ_target = τ*θ_local + (1 - τ)*θ_target

            Params
            ======
                local_model: PyTorch model (weights will be copied from)
                target_model: PyTorch model (weights will be copied to)
                tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)
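
The agent above exposes act()/step()/learn(); a minimal driver-loop sketch follows, assuming a Gym-style environment and the DEVICE/BATCH_SIZE/GAMMA/TAU constants defined elsewhere in this example. The names run_episodes, env and n_episodes are placeholders, not part of the original code.

# Hypothetical driver loop for the DDPG-style agent above (sketch, not the original training script).
import numpy as np
import torch

def run_episodes(agent, env, n_episodes=100):
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score, done = 0.0, False
        while not done:
            obs = torch.from_numpy(np.asarray(state)).float()  # act() expects a tensor
            action = agent.act(obs).cpu().numpy()
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)  # store and (maybe) learn
            state = next_state
            score += reward
        scores.append(score)
    return scores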
Exemplo n.º 11
0
class DQN():
    def __init__(self,
                 env,
                 save_location,
                 start_episode=1,
                 saved_model=None,
                 prioritized_replay=False):
        self.env = env
        self.num_actions = env.action_space.n
        self.start_episode = start_episode
        self.save_location = save_location
        self.saved_model = saved_model
        self.prioritized_replay = prioritized_replay
        self.alpha = 0.6
        self.beta = 0

        self.learning_rate = 1e-4
        self.gamma = 0.98
        self.buffer_limit = 10**5
        self.training_frame_start = 10000 * 5
        self.batch_size = 32

        self.eps_start = 1
        self.eps_end = 0.01
        self.decay_factor = 10**5

        if prioritized_replay:
            self.memory = PrioritizedReplayBuffer(size=self.buffer_limit,
                                                  alpha=self.alpha)
            self.prioritized_replay_eps = 1e-5
        else:
            self.memory = ReplayBuffer(size=self.buffer_limit)

        if saved_model:
            self.epsilon_decay = lambda x: self.eps_end
        else:
            self.epsilon_decay = lambda x: self.eps_end + (
                self.eps_start - self.eps_end) * math.exp(
                    -1. * x / self.decay_factor)

        self.save_interval = 100000
        self.update_target_interval = 10000

        self.device = device

        self.q = Qnet(84, 84, in_channels=4,
                      n_actions=self.num_actions).to(device)
        self.q_target = Qnet(84, 84, in_channels=4,
                             n_actions=self.num_actions).to(device)

        #[self.q, self.q_target], self.optimizer = amp.initialize([self.q, self.q_target], self.optimizer, opt_level="O1") #playing around with mixed-precision training

    def train(self):
        s, a, r, s_prime, done_mask = self.memory.sample(self.batch_size)

        s = torch.as_tensor(s).to(device)
        a = torch.LongTensor(a).to(device)
        r = torch.as_tensor(r).to(device)
        s_prime = torch.as_tensor(s_prime).to(device)
        done_mask = torch.as_tensor(done_mask).to(device)

        q_out = self.q(s)
        # collect output from the chosen action dimension
        q_a = q_out.gather(1, a)

        # most reward we get in next state s_prime
        max_q_prime = self.q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + self.gamma * max_q_prime * done_mask

        # how much is our policy different from the true target
        loss = F.smooth_l1_loss(q_a, target)

        self.optimizer.zero_grad()

        #with amp.scale_loss(loss, self.optimizer) as scaled_loss: # playing around with mixed-precision training
        #	scaled_loss.backward()
        loss.backward()
        self.optimizer.step()

    def run(self, num_episodes):
        self.q_target.load_state_dict(
            self.q.state_dict())  # Load policy weights into target network
        self.optimizer = optim.Adam(self.q.parameters(), lr=self.learning_rate)

        if self.saved_model:
            self.q.load_state_dict(
                torch.load(self.saved_model))  # Load pretrained model

        self.beginLogging()
        #watcher = tw.Watcher()
        env = self.env
        best_episode_score = float('-Inf')
        score = 0.0
        total_frames = 0
        state = get_state(env.reset())  # Start first game
        for episode in tqdm(
                range(self.start_episode, self.start_episode + num_episodes)):
            # anneal 100% to 1% over training
            epsilon = self.epsilon_decay(total_frames)
            episode_score = 0
            done = False
            while not done:
                action = self.q.sample_action(
                    torch.Tensor(state).unsqueeze(0).to(device), epsilon)

                obs, reward, done, info = env.step(action)

                next_state = get_state(obs)

                done_mask = 0.0 if done else 1.0
                self.memory.put((state, action, reward, next_state, done_mask))
                state = next_state

                score += reward
                episode_score += reward

                if total_frames > self.training_frame_start:
                    self.train()

                # Copy policy weights to target
                if total_frames % self.update_target_interval == 0:
                    self.q_target.load_state_dict(self.q.state_dict())
                # Save policy weights
                if total_frames % self.save_interval == 0:
                    torch.save(
                        self.q.state_dict(),
                        os.path.join(self.save_location,
                                     'policy_%s.pt' % episode))
                # Reset environment for the next game
                if done:
                    state = get_state(env.reset())
                total_frames += 1

            best_episode_score = max(best_episode_score, episode_score)
            # Print updates every episode
            out = "n_episode : {}, Total Frames : {}, Average Score : {:.1f}, Episode Score : {:.1f}, Best Score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                episode, total_frames, score / episode, episode_score,
                best_episode_score, len(self.memory), epsilon * 100)
            print(out)
            self.log(out)

            # Microsoft Tensorwatch Watcher for Visualizing Training
            #watcher.observe(
            #    episode = episode,
            #    episode_score = episode_score,
            #    total_score = score,
            #    buffer_size = self.memory.size(),
            #    epsilon = epsilon,
            #    frames = total_frames,
            #)

        # save final model weights
        torch.save(self.q.state_dict(),
                   os.path.join(self.save_location, 'policy_final.pt'))

    def beginLogging(self):
        with open(os.path.join(self.save_location, 'log.out'), 'w') as f:
            f.write('')

    def log(self, out):
        with open(os.path.join(self.save_location, 'log.out'), 'a') as f:
            f.write('%s\n' % out)
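
A hedged usage sketch for the DQN class above: it assumes an Atari-style environment that has already been wrapped so that get_state() yields the stacked 84x84 frames Qnet expects; the environment id and save directory are placeholders.

# Hypothetical usage of the DQN trainer above (environment wrappers are assumed, not shown).
import gym

env = gym.make("BreakoutNoFrameskip-v4")  # placeholder; wrap for frame-skip, stacking, etc.
agent = DQN(env, save_location="./checkpoints", prioritized_replay=False)
agent.run(num_episodes=1000)  # trains, appends to log.out, and saves policy checkpoints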
Exemplo n.º 12
0
class DQN:
    def __init__(self,
                 n_states,
                 n_actions,
                 gamma=0.99,
                 epsilon_start=0.9,
                 epsilon_end=0.05,
                 epsilon_decay=200,
                 memory_capacity=10000,
                 policy_lr=0.01,
                 batch_size=128,
                 device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device  # device: cpu or gpu, etc.
        self.gamma = gamma
        # parameters of the epsilon-greedy policy
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # Initialize target_net with an exact copy of policy_net's parameters
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # disable BatchNormalization and Dropout
        # See the difference between parameters() and state_dict(); the former has requires_grad=True
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def choose_action(self, state, train=True):
        '''Choose an action.
        '''
        if train:
            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
                math.exp(-1. * self.actions_count / self.epsilon_decay)
            self.actions_count += 1
            if random.random() > self.epsilon:
                with torch.no_grad():
                    # Convert to a tensor first so it can be fed to the network; the state elements are originally float64.
                    # Note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state]).
                    state = torch.tensor([state],
                                         device=self.device,
                                         dtype=torch.float32)
                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                    q_value = self.policy_net(state)
                    # tensor.max(1) returns the max value of each row and its index,
                    # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
                    # so tensor.max(1)[1] returns the index of the max value, i.e. the action.
                    action = q_value.max(1)[1].item()
            else:
                action = random.randrange(self.n_actions)
            return action
        else:
            with torch.no_grad():
                # Convert to a tensor first so it can be fed to the network; the state elements are originally float64.
                # Note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state]).
                state = torch.tensor([state],
                                     device='cpu',
                                     dtype=torch.float32)
                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                q_value = self.target_net(state)
                # tensor.max(1) returns the max value of each row and its index,
                # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
                # so tensor.max(1)[1] returns the index of the max value, i.e. the action.
                action = q_value.max(1)[1].item()
            return action

    def update(self):

        if len(self.memory) < self.batch_size:
            return
        # Randomly sample a batch of transitions from memory
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
        # Convert to tensors,
        # e.g. tensor([[-4.5543e-02, -2.3910e-01,  1.8344e-02,  2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02,  2.3400e-01]])
        state_batch = torch.tensor(state_batch,
                                   device=self.device,
                                   dtype=torch.float)
        action_batch = torch.tensor(action_batch,
                                    device=self.device).unsqueeze(
                                        1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(
            reward_batch, device=self.device,
            dtype=torch.float)  # tensor([1., 1.,...,1])
        next_state_batch = torch.tensor(next_state_batch,
                                        device=self.device,
                                        dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch),
                                  device=self.device).unsqueeze(
                                      1)  # convert bool to float, then to a tensor

        # Compute Q(s_t, a) for the current (s_t, a)
        q_values = self.policy_net(state_batch)
        next_q_values = self.policy_net(next_state_batch)
        # Plug in the chosen actions to get Q(s_t|a=a_t)
        q_value = q_values.gather(dim=1, index=action_batch)
        '''The following is the Nature DQN way of computing q_target:
        # Compute the max of Q'(s_{t+1}) over all next states, where Q' is the target network's Q-function
        next_q_state_value = self.target_net(
            next_state_batch).max(1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
        # Compute q_target
        # For a terminal state, done_batch[0]=1 and the corresponding expected_q_value equals the reward
        q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
        '''
        '''The following is the Double DQN way of computing q_target, slightly different from Nature DQN'''
        next_target_values = self.target_net(next_state_batch)
        # Select the action that maximizes Q(s_t', a) under the policy net, then look it up in next_target_values to get the target net's next_q_value, i.e. Q'(s_t'|a=argmax Q(s_t', a))
        next_target_q_value = next_target_values.gather(
            1,
            torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        q_target = reward_batch + self.gamma * next_target_q_value * (
            1 - done_batch[0])
        self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1))  # mean squared error loss
        # Optimize the model
        self.optimizer.zero_grad()  # clear all old gradients from the last step
        # loss.backward() uses backpropagation to compute the gradients of the loss w.r.t. all parameters that require gradients
        self.loss.backward()
        for param in self.policy_net.parameters():  # clip to prevent exploding gradients
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # update the model

    def save_model(self, path):
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        self.target_net.load_state_dict(torch.load(path))
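
The update() above contrasts the Nature DQN target with the Double DQN target it actually uses. A standalone sketch of the two computations on batched tensors makes the difference explicit; the function names and toy arguments are illustrative only, not part of the original code.

# Illustrative comparison of the two TD targets discussed in update() above.
import torch

def nature_dqn_target(reward, next_q_target, done, gamma=0.99):
    # Nature DQN: the target net both picks and evaluates the greedy action.
    return reward + gamma * next_q_target.max(1)[0].detach() * (1 - done)

def double_dqn_target(reward, next_q_policy, next_q_target, done, gamma=0.99):
    # Double DQN: the policy net picks the action, the target net evaluates it.
    best_actions = next_q_policy.max(1)[1].unsqueeze(1)
    evaluated = next_q_target.gather(1, best_actions).squeeze(1).detach()
    return reward + gamma * evaluated * (1 - done)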
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):

        self.env = env
        self.max_episode_steps = env._max_episode_steps
        """
            beta: The discounted factor of Q-value function
            (epsilon): The explore or exploit policy epsilon.
            initial_epsilon: When the 'steps' is 0, the epsilon is initial_epsilon, 1
            final_epsilon: After the number of 'steps' reach 'epsilon_decay_steps',
                The epsilon set to the 'final_epsilon' determinately.
            epsilon_decay_steps: The epsilon will decrease linearly along with the steps from 0 to 'epsilon_decay_steps'.
        """
        self.beta = hyper_params['beta']
        self.initial_epsilon = 1
        self.final_epsilon = hyper_params['final_epsilon']
        self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
        """
            episode: Record training episode
            steps: Add 1 when predicting an action
            learning: The trigger of agent learning. It is on while training agent. It is off while testing agent.
            action_space: The action space of the current environment, e.g 2.
        """
        self.episode = 0
        self.steps = 0
        self.best_reward = 0
        self.learning = True
        self.action_space = action_space
        """
            input_len The input length of the neural network. It equals to the length of the state vector.
            output_len: The output length of the neural network. It is equal to the action space.
            eval_model: The model for predicting action for the agent.
            target_model: The model for calculating Q-value of next_state to update 'eval_model'.
            use_target_model: Trigger for turn 'target_model' on/off
        """
        state = env.reset()
        input_len = len(state)
        output_len = action_space
        self.eval_model = DQNModel(input_len,
                                   output_len,
                                   learning_rate=hyper_params['learning_rate'])
        self.use_target_model = hyper_params['use_target_model']
        if self.use_target_model:
            self.target_model = DQNModel(input_len, output_len)
        # memory: Stores and samples experience replay.
        self.memory = ReplayBuffer(hyper_params['memory_size'])
        """
            batch_size: Mini batch size for training model.
            update_steps: The frequence of traning model
            model_replace_freq: The frequence of replacing 'target_model' by 'eval_model'
        """
        self.batch_size = hyper_params['batch_size']
        self.update_steps = hyper_params['update_steps']
        self.model_replace_freq = hyper_params['model_replace_freq']

        print("agent initialized")

    # Linear decrease function for epsilon
    def linear_decrease(self, initial_value, final_value, curr_steps,
                        final_decay_steps):
        decay_rate = curr_steps / final_decay_steps
        if decay_rate > 1:
            decay_rate = 1
        return initial_value - (initial_value - final_value) * decay_rate

    def explore_or_exploit_policy(self, state):
        p = uniform(0, 1)
        # Get decreased epsilon
        epsilon = self.linear_decrease(self.initial_epsilon,
                                       self.final_epsilon, self.steps,
                                       self.epsilon_decay_steps)
        #if(np.random.randint(1000)==4):
        #print("epsilon",epsilon)
        if p < epsilon:
            #return action
            return randint(0, self.action_space - 1)
        else:
            #return action
            return self.greedy_policy(state)

    def greedy_policy(self, state):
        return self.eval_model.predict(state)

    # This function is called in the main RL loop to update the neural network model given a batch of experience:
    # 1) Sample a 'batch_size' batch of experiences from the memory.
    # 2) Predict the Q-values from the 'eval_model' based on (states, actions).
    # 3) Predict the Q-values from the 'target_model' based on (next_states), and take the max of each Q-value vector, Q_max.
    # 4) If is_terminal == 1, q_target = reward; otherwise q_target = reward + discount factor * Q_max.
    # 5) Call fit() to run back-propagation for 'eval_model'.
    def update_batch(self):
        if len(self.memory) < self.batch_size or self.steps % self.update_steps != 0:
            return

        #print("fetching minibatch from replay memory")
        batch = self.memory.sample(self.batch_size)

        (states, actions, reward, next_states, is_terminal) = batch

        terminal = FloatTensor([1 if t else 0 for t in is_terminal])
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)

        # Current Q Values
        _, q_values = self.eval_model.predict_batch(states)

        #q_values = q_values[np.arange(self.batch_size), actions]
        q_values = q_values[batch_index, actions]

        # Calculate target
        if self.use_target_model:
            #print("target_model.predict")
            best_actions, q_next = self.target_model.predict_batch(next_states)
        else:
            best_actions, q_next = self.eval_model.predict_batch(next_states)

        q_max = q_next[batch_index, best_actions]

        terminal = 1 - terminal
        q_max *= terminal
        q_target = reward + self.beta * q_max

        # update model
        self.eval_model.fit(q_values, q_target)

    def learn_and_evaluate(self, training_episodes, test_interval):
        test_number = training_episodes // test_interval
        all_results = []

        for i in range(test_number):
            # learn
            self.learn(test_interval)

            # evaluate
            avg_reward = self.evaluate()
            all_results.append(avg_reward)

        return all_results

    def learn(self, test_interval):
        for episode in tqdm(range(test_interval), desc="Training"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                # add experience from explore-exploit policy to memory
                action = self.explore_or_exploit_policy(state)
                next_state, reward, done, info = self.env.step(action)
                self.memory.add(state, action, reward, next_state, done)

                # update the model every 'update_steps' of experience
                self.update_batch()

                # update the target network (if the target network is being used) every 'model_replace_freq' of experiences
                if self.use_target_model and (self.steps %
                                              self.model_replace_freq == 0):
                    self.target_model.replace(self.eval_model)

                self.steps += 1
                steps += 1
                state = next_state

    def evaluate(self, trials=30):
        total_reward = 0
        for _ in tqdm(range(trials), desc="Evaluating"):
            state = self.env.reset()
            done = False
            steps = 0

            while steps < self.max_episode_steps and not done:
                steps += 1
                action = self.greedy_policy(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward

        avg_reward = total_reward / trials
        print(avg_reward)
        f = open(result_file, "a+")
        f.write(str(avg_reward) + "\n")
        f.close()
        if avg_reward >= self.best_reward:
            self.best_reward = avg_reward
            self.save_model()
        return avg_reward

    # save model
    def save_model(self):
        self.eval_model.save(result_floder + '/best_model.pt')

    # load model
    def load_model(self):
        self.eval_model.load(result_floder + '/best_model.pt')
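
The constructor above reads several keys from a hyper_params dict; a hypothetical example of that dict is sketched below. The concrete values are placeholders, not taken from the original code.

# Hypothetical hyper_params for DQN_agent above; every value here is a placeholder.
hyper_params = {
    'beta': 0.99,                  # discount factor
    'final_epsilon': 0.05,         # epsilon after the linear decay finishes
    'epsilon_decay_steps': 100000,
    'learning_rate': 5e-4,
    'use_target_model': True,
    'memory_size': 100000,
    'batch_size': 64,
    'update_steps': 4,             # train every 4 environment steps
    'model_replace_freq': 2000,    # sync target model every 2000 steps
}
# agent = DQN_agent(env, hyper_params)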
Exemplo n.º 14
0
class DQNAgent_Vanila_simple(agent):
    def __init__(self, model, opt, learning=True):
        super().__init__()
        self.memory = ReplayBuffer(3000)
        self.previous_state = None
        self.previous_action = None
        self.previous_legal_actions = None
        self.step = 0
        self.model = model
        self.opt = opt
        self.loss = 0
        self.batch_size = 10
        self.test_q = 0
        self.max_tile = 0
        #self.test_q = 0
        self.epsilon_schedule = LinearSchedule(1000000,
                                               initial_p=0.99,
                                               final_p=0.01)
        self.learning = learning

    def should_explore(self):
        self.epsilon = self.epsilon_schedule.value(self.step)
        return random.random() < self.epsilon

    def action(self):
        if self.learning:
            self.step += 1

        legalActions = self.legal_actions(deepcopy(self.gb.board))
        if len(legalActions) == 0:
            print("Warning: no legal actions available")
        board = deepcopy(self.gb.board)
        board = oneHotMap(board)

        if self.learning and self.should_explore():
            q_values = None
            action = random.choice(legalActions)
            choice = self.actions[action]
        else:
            #mark
            state = torch.from_numpy(board).type(
                torch.FloatTensor).cuda().view(-1, 17, 4, 4)
            action, q_values = self.predict(state, legalActions)
            choice = self.actions[action]
        if self.learning:
            reward = self.gb.currentReward
            if reward != 0:
                reward = np.log2(reward)
            if (self.previous_state is not None
                    and self.previous_action is not None):
                self.memory.add(self.previous_state, self.previous_action,
                                self.previous_legal_actions, reward,
                                legalActions, board, 0)

        self.previous_state = board
        self.previous_action = action
        self.previous_legal_actions = legalActions

        if self.learning:
            self.update()
        return choice

    def enableLearning(self):
        self.model.train()
        self.learning = True
        self.max_tile = 0
        self.reset()

    def disableLearning(self):
        self.model.eval()
        self.learning = False

    def end_episode(self):
        if not self.learning:
            m = np.max(self.gb.board)
            if m > self.max_tile:
                self.max_tile = m
            return
        #print(self.gb.board)

        board = deepcopy(self.gb.board)
        board = oneHotMap(board)

        #legalActions = self.legal_actions(deepcopy(self.gb.board))
        #print(legalActions)
        self.memory.add(self.previous_state, self.previous_action,
                        self.previous_legal_actions, self.gb.currentReward, [],
                        board, 1)
        self.reset()

    def reset(self):

        self.previous_state = None
        self.previous_action = None
        self.previous_legal_actions = None

    def update(self):
        if self.step < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)
        (states, actions, legal_actions, reward, next_legal_actions,
         next_states, is_terminal) = batch

        terminal = torch.tensor(is_terminal).type(torch.cuda.FloatTensor)
        reward = torch.tensor(reward).type(torch.cuda.FloatTensor)
        states = torch.from_numpy(states).type(torch.FloatTensor).cuda().view(
            -1, 17, 4, 4)
        next_states = torch.from_numpy(next_states).type(
            torch.FloatTensor).cuda().view(-1, 17, 4, 4)
        # Current Q Values

        _, q_values = self.predict_batch(states)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        #print(actions)
        #print(q_values)

        q_values = q_values[batch_index, actions]
        #print(q_values)
        # Calculate target
        q_actions_next, q_values_next = self.predict_batch(
            next_states, legalActions=next_legal_actions)
        #print(q_values_next)
        q_max = q_values_next.max(1)[0].detach()

        q_max = (1 - terminal) * q_max
        # if sum(terminal == 1) > 0:
        #     print(reward)
        #     print( (terminal == 1).nonzero())
        #     print(terminal)
        #     print(next_legal_actions)
        #     print(q_max)
        #     input()
        q_target = reward + 0.99 * q_max
        self.opt.zero_grad()
        loss = self.model.loss_function(q_target, q_values)

        loss.backward()

        self.opt.step()

        #train_loss = loss_vae.item() + loss_dqn.item()

        self.loss += loss.item() / len(states)

    def predict_batch(self, input, legalActions=None):
        #print(legalActions)

        q_values = self.model(input)
        if legalActions is None:
            values, q_actions = q_values.max(1)
        else:
            isNotlegal = True

            # print(legalActions)
            # print(q_values)
            q_values_true = torch.full((self.batch_size, 4), -100000000).cuda()
            for i, action in enumerate(legalActions):
                q_values_true[i, action] = q_values[i, action]
            values, q_actions = q_values_true.max(1)
            q_values = q_values_true
            #print(q_values_true)
            '''
            while isNotlegal:
                isNotlegal = False
                values, q_actions = q_values.max(1)
                #print(q_values)
                #print(values)
                #print(q_actions)


                for i, action in enumerate(q_actions):
                    #print(legalActions[i])
                    if len(legalActions[i]) == 0:
                        continue

                    if action.item() not in legalActions[i]:
                        isNotlegal = True
                        # print(i)
                        # print(action.item())
                        # print(q_values)
                        q_values[i, action] = -1
                #         print(q_values)
                # print("*********************")
            '''
        return q_actions, q_values

    def predict(self, input, legalActions):
        q_values = self.model(input)
        for action in range(4):
            if action not in legalActions:
                q_values[0, action] = -100000000

        action = torch.argmax(q_values)
        if int(action.item()) not in legalActions:
            print(legalActions, q_values, action)
            print("!!!!!!!!!!!!!!!!!!!!!!!!!")
        return action.item(), q_values

    def legal_actions(self, copy_gb):
        legalActions = []
        for i in range(4):
            try_gb = gameboard(4, deepcopy(copy_gb))
            changed = try_gb.takeAction(self.actions[i])
            if changed:
                legalActions.append(i)
        return legalActions

    '''
Exemplo n.º 15
0
class MADDPG():
    """Interacts with and learns from the environment."""
    def __init__(self, config):
        """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
        self.state_size = config.state_size
        self.action_size = config.action_size
        self.seed = random.seed(config.random_seed)
        self.config = config
        self.t_step = 0
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 config.random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  config.random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   config.random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    config.random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic,
                                           weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(self.action_size, config.random_seed)

        # Replay memory (note: overwritten below by the config-provided buffer)
        self.memory = ReplayBuffer(self.action_size, config.buffer_size,
                                   config.batch_size, config.random_seed)
        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

        if config.shared_replay_buffer:
            self.memory = config.memory
        else:
            self.memory = config.memory_fn()

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.config.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.config.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
    Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
    where:
        actor_target(state) -> action
        critic_target(state, action) -> Q-value

    Params
    ======
        experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
        gamma (float): discount factor
    """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target

    Params
    ======
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        tau (float): interpolation parameter 
    """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Exemplo n.º 16
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        #self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, ALPHA, BETA, ANNEAL_OVER)

        # Tensorboard interface
        self.writer = SummaryWriter(comment="-ddpg-no-pri")
        self.tb_tracker = TBMeanTracker(self.writer, batch_size=10)
        self.step_t = 0

    def step(self, state, action, reward, next_state, done, timestamp):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        
        #for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        self.memory.add(state, action, reward, next_state, done)

        # Learn at defined interval, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestamp % self.num_agents == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
                self.step_t += 1

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        #states, actions, rewards, next_states, dones, idxs, weights = experiences
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()

        critic_loss.backward()
        
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # update priorities
        # updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy()
        # self.memory.update_priorities(idxs, updates)

        self.tb_tracker.track("loss_critic", critic_loss.to("cpu"), self.step_t)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.tb_tracker.track("loss_actor", actor_loss.to("cpu"), self.step_t)

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Exemplo n.º 17
0
    def train(self):
        # initialize memory buffer
        buffer = ReplayBuffer(int(500000), self.batch_size, self.num_agents, 0)

        # use keep_awake to keep workspace from disconnecting
        for episode in range(self.number_of_episodes):
            env_info = self.env.reset(train_mode=True)[self.brain_name]

            agent_episode_rewards = [0, 0]

            for agent in self.maddpg.ddpg_agents:
                agent.noise.reset()

            for episode_t in range(self.max_episode_len):
                states = env_info.vector_observations
                states_t = to_tensor(states)

                with torch.no_grad():
                    action_ts = self.maddpg.act(states_t, noise=self.noise)
                    self.noise *= self.noise_reduction

                actions = torch.stack(action_ts).numpy()
                env_info = self.env.step(actions)[self.brain_name]

                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                for i in range(self.num_agents):
                    agent_episode_rewards[i] += rewards[i]

                full_state = np.concatenate(states)
                full_next_state = np.concatenate(next_states)

                buffer.add((states, full_state, actions, rewards, next_states, full_next_state, dones))

                # update once after every episode_per_update
                critic_losses = []
                actor_losses = []
                if len(buffer) > self.batch_size and episode % self.episode_per_update == 0:
                    for i in range(self.num_agents):
                        samples = buffer.sample()
                        cl, al = self.maddpg.update(samples, i)
                        critic_losses.append(cl)
                        actor_losses.append(al)
                    self.maddpg.update_targets()  # soft update the target network towards the actual networks

                if np.any(dones):
                    # if any of the agents are done break
                    break

            episode_reward = max(agent_episode_rewards)
            self.episode_rewards.append(episode_reward)
            self.last_100_episode_rewards.append(episode_reward)
            self.avg_rewards.append(np.mean(self.last_100_episode_rewards))
            # scores.append(episode_reward)
            print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format(episode, self.avg_rewards[-1],
                                                                              episode_reward),
                  end="")

            if episode % self.print_period == 0:
                print('\rEpisode {}\tAverage Score: {:.4f}'.format(episode, self.avg_rewards[-1]))

            # saving successful model
            # training ends when the threshold value is reached.
            if self.avg_rewards[-1] >= self.threshold:
                save_dict_list = []

                for i in range(self.num_agents):
                    save_dict = {'actor_params': self.maddpg.ddpg_agents[i].actor.state_dict(),
                                 'actor_optim_params': self.maddpg.ddpg_agents[i].actor_optimizer.state_dict(),
                                 'critic_params': self.maddpg.ddpg_agents[i].critic.state_dict(),
                                 'critic_optim_params': self.maddpg.ddpg_agents[i].critic_optimizer.state_dict()}
                    save_dict_list.append(save_dict)

                torch.save(save_dict_list, self.ckpt)

                raw_score_plotter(self.episode_rewards)
                plotter('Tennis', len(self.episode_rewards), self.avg_rewards, self.threshold)
                break
Exemplo n.º 18
0
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 batch_size=128,
                 gamma=0.99,
                 mean_lambda=1e-3,
                 std_lambda=1e-3,
                 z_lambda=0.0):

        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size)

        self.mean_lambda = mean_lambda
        self.std_lambda = std_lambda
        self.z_lambda = z_lambda

        self.current_value = Value(state_size).to(device)
        self.target_value = Value(state_size).to(device)

        self.softQ = soft_Q(state_size, action_size)
        self.policy = Policy(state_size, action_size)

        self.value_optimizer = optim.Adam(self.current_value.parameters(),
                                          lr=3e-4)
        self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4)

    def act(self, state):

        #state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.policy.act(state)

        if len(self.memory) > self.batch_size:
            self.update()

        return action

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

    def update(self):

        state, action, reward, next_state, done = self.memory.sample()

        expected_soft_q_value = self.softQ.forward(state, action)
        expected_value = self.current_value.forward(state)

        new_action, log_prob, z, mean, log_std = self.policy.evaluate(state)

        target_value = self.target_value.forward(next_state)
        next_soft_q_value = reward + self.gamma * target_value * (1 - done)

        q_val_mse = F.mse_loss(expected_soft_q_value,
                               next_soft_q_value.detach())

        expected_new_q_val = self.softQ.forward(state, new_action)
        next_value = expected_new_q_val - log_prob
        val_loss = F.mse_loss(expected_value, next_value.detach())

        log_prob_target = expected_new_q_val - expected_value
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        mean_loss = self.mean_lambda * mean.pow(2).mean()
        std_loss = self.std_lambda * log_std.pow(2).mean()
        z_loss = self.z_lambda * z.pow(2).sum(1).mean()

        policy_loss += mean_loss + std_loss + z_loss

        self.soft_q_optimizer.zero_grad()
        q_val_mse.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        val_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.soft_update(self.current_value, self.target_value, TAU)

    def soft_update(self, local_model, target_model, TRANSFER_RATE):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(TRANSFER_RATE * local_param.data +
                                    (1.0 - TRANSFER_RATE) * target_param.data)
Exemplo n.º 19
0
class MADDPG():
    def __init__(self, state_size, action_size, n_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.seed = random.seed(seed)

        # Actor-Critic agents
        self.ActorCriticAgents = [
            Agent(state_size, action_size, n_agents, seed)
            for _ in range(n_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE,
                                   seed)

    def OUNoise_reset(self):
        for agent in self.ActorCriticAgents:
            agent.exploration_noise.reset()

    def act(self, state):
        actions = []
        for i, agent in enumerate(self.ActorCriticAgents):
            agent_action = agent.act(state[i])
            actions.append(agent_action[0])
        return np.stack(actions, axis=0)

    def step(self, ep, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > BATCH_SIZE:
            for i in range(self.n_agents):
                self.learn(i)

    def learn(self, agent_index):
        states, actions, rewards, next_states, dones = self.memory.sample()

        target_next_actions = torch.from_numpy(
            np.zeros(shape=actions.shape)).float().to(device)
        for idx, agent in enumerate(self.ActorCriticAgents):
            current_states = states[:, idx]
            target_next_actions[:, idx, :] = agent.actor_target(current_states)

        target_next_actions = torch.reshape(target_next_actions,
                                            shape=(BATCH_SIZE, -1))

        current_agent_states = states[:, agent_index, :]
        current_agent_actions = actions[:, agent_index, :]
        current_agent_rewards = torch.reshape(rewards[:, agent_index],
                                              shape=(BATCH_SIZE, 1))
        current_agent_dones = torch.reshape(dones[:, agent_index],
                                            shape=(BATCH_SIZE, 1))

        action_preds = actions.clone()
        action_preds[:, agent_index, :] = self.ActorCriticAgents[
            agent_index].actor_local(current_agent_states)
        action_preds = torch.reshape(action_preds, shape=(BATCH_SIZE, -1))

        self.ActorCriticAgents[agent_index].update(
            states, current_agent_states, actions, current_agent_actions,
            target_next_actions, rewards, current_agent_rewards, next_states,
            dones, current_agent_dones, action_preds)

    def save_checkpoint(self):
        for i in range(self.n_agents):
            torch.save(self.ActorCriticAgents[i].actor_local.state_dict(),
                       f'actor_checkpoint{i}.pth')
            torch.save(self.ActorCriticAgents[i].critic_local.state_dict(),
                       f'critic_checkpoint{i}.pth')
Exemplo n.º 20
0
class Agent():
    def __init__(self,
                 state_space,
                 action_space,
                 memory_size=1000000,
                 batch_size=32,
                 seed=0,
                 q_size=51):

        self.state_space = state_space
        self.action_space = action_space
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.seed = seed
        self.q_size = q_size

        self.current_model = QDQN(self.state_space,
                                  self.action_space,
                                  n_quantiles=self.q_size).to(device)
        self.target_model = QDQN(self.state_space,
                                 self.action_space,
                                 n_quantiles=self.q_size).to(device)
        self.optimizer = Adam(self.current_model.parameters(), lr=LR)

        self.memory = ReplayBuffer(self.action_space, self.memory_size,
                                   self.batch_size, self.seed)
        self.update_every = 0

        self.tau = (torch.Tensor(
            (2 * np.arange(self.current_model.n_quantiles) + 1) /
            (2.0 * self.current_model.n_quantiles)).view(1, -1)).to(device)

    def soft_update(self, local_model, target_model, TRANSFER_RATE):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(TRANSFER_RATE * local_param.data +
                                    (1.0 - TRANSFER_RATE) * target_param.data)

    def act(self, state, epsilon):

        if random.random() <= epsilon:
            action = random.choice(np.arange(self.action_space))
        else:
            action = self.current_model.act(state).cpu().numpy()
            #action = self.current_model.act(state, epsilon).cpu().numpy()
        return action

    def step(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

        self.update_every += 1
        if self.update_every % UPDATE_FREQUENCY == 0:
            if len(self.memory) >= self.batch_size:
                experience = self.memory.sample()
                self.learn(experience, GAMMA)

    def learn(self, experience, gamma):

        sampled_state, sampled_action, sampled_reward, sampled_next_state, sampled_done = experience

        #print(self.current_model(sampled_state).shape)
        #print(self.current_model(sampled_state)[0:self.batch_size, 0: self.action_space])
        #print(self.current_model(sampled_state))
        #print(self.current_model(sampled_state).shape)

        #print(sampled_action.shape)
        #print(sampled_action.expand(self.batch_size, self.q_size))
        #print(sampled_action.unsqueeze(1).expand(self.batch_size, 1, self.q_size).shape)
        action = sampled_action.unsqueeze(1).expand(self.batch_size, 1,
                                                    self.q_size)

        #print(self.current_model(sampled_state))
        #print(self.current_model(sampled_state).gather(1, action).squeeze(1))

        theta = self.current_model(sampled_state).gather(1, action).squeeze(1)
        #theta = self.current_model(sampled_state).mean(2)

        z_next = self.target_model(sampled_next_state).detach()
        #print(z_next)
        #print(z_next.shape)

        z_next_max = z_next[np.arange(self.batch_size),
                            z_next.mean(2).max(1)[1]]
        #print(z_next_max)
        Ttheta = sampled_reward + GAMMA * (1 - sampled_done) * z_next_max
        #print(Ttheta)
        #print(Ttheta.shape)
        #print(theta.shape)
        diff = Ttheta.t().unsqueeze(-1) - theta

        loss = self.huber(diff) * (self.tau -
                                   (diff.detach() < 0).float()).abs()
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.current_model, self.target_model, TRANSFER_RATE)

    def huber(self, x, k=1.0):
        return torch.where(x.abs() < k, 0.5 * x.pow(2),
                           k * (x.abs() - 0.5 * k))
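
learn() above implements a quantile-regression (QR-DQN style) loss. The toy sketch below reproduces the same diff/tau/Huber arithmetic on random tensors just to show the shapes involved; the sizes and values are arbitrary and not taken from the original.

# Shape check of the quantile Huber loss used in learn() above (toy data, illustrative only).
import torch

batch_size, n_quantiles = 4, 5
tau = ((2 * torch.arange(n_quantiles).float() + 1) / (2.0 * n_quantiles)).view(1, -1)
theta = torch.randn(batch_size, n_quantiles)    # predicted quantiles of the taken actions
Ttheta = torch.randn(batch_size, n_quantiles)   # target quantiles
diff = Ttheta.t().unsqueeze(-1) - theta         # broadcasts to (n_quantiles, batch_size, n_quantiles)
huber = torch.where(diff.abs() < 1.0, 0.5 * diff.pow(2), diff.abs() - 0.5)
loss = (huber * (tau - (diff.detach() < 0).float()).abs()).mean()
print(loss.item())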
Exemplo n.º 21
0
def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render",
                        action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval",
                        type=int,
                        default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts",
                        type=int,
                        default=-1,
                        help="Number of max rollouts")
    parser.add_argument("--logfile",
                        type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument(
        "--load_params",
        type=str,
        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params",
                        type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0,
                                timeInterval=0.1,
                                goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 0, 0, 0]),
                                initialAngles=np.array([0, 45, 0, 0, 0, 0, 0]))

    # create space
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionRange = env.action_range()
    actionSpace = DiscreteSpace(
        intervals=[15 for i in range(2)] + [1],
        ranges=[actionRange[1], actionRange[2], actionRange[7]])
    processor = JointProcessor(actionSpace)

    # create the model and policy functions
    modelFn = MxFullyConnected(sizes=[stateSpace.n + actionSpace.n, 64, 32, 1],
                               alpha=0.001,
                               use_gpu=True)
    if args.load_params:
        print("loading params...")
        modelFn.load_params(args.load_params)

    softmax = lambda s: np.exp(s - np.max(s)) / np.sum(np.exp(s - np.max(s)))  # shifted for numerical stability
    policyFn = EpsilonGreedyPolicy(
        epsilon=0.5,
        getActionsFn=lambda state: actionSpace.sample(1024),
        distributionFn=lambda qstate: softmax(modelFn(qstate)))
    dataset = ReplayBuffer()
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done:
            if stopsig:
                break
            action = policyFn(state)
            nextState, reward, done, info = env.step(
                createAction(processor.process_env_action(action)))
            dataset.append(state, action, reward, nextState)
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        dataset.reset()  # push trajectory into the dataset buffer
        modelFn.fit(processor.process_Q(dataset.sample(1024)), num_epochs=10)
        print("Reward:", reward if (reward >= 0.00001) else 0, "with Error:",
              modelFn.score(), "with steps:", steps)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(reward) + ", " +
                      str(modelFn.score()) + "]\n")

        rollout += 1
        if rollout % 100 == 0:
            policyFn.epsilon *= 0.95
            print("Epsilon is now:", policyFn.epsilon)

    if args.logfile:
        log.close()
    if args.save_params:
        print("saving params...")
        modelFn.save_params(args.save_params)
Exemplo n.º 22
0
class DQN:
    def __init__(self,
                 n_actions=100,
                 gamma=0.99,
                 epsilon_start=0.95,
                 epsilon_end=0.05,
                 epsilon_decay=500,
                 memory_capacity=1000,
                 policy_lr=0.01,
                 batch_size=64,
                 device="cuda",
                 path="D:/unity2017/water/ai/saved_model/checkpoint1.pth",
                 pretrained=False):
        self.path = path
        self.device = device  # device: cpu or gpu
        self.gamma = gamma  # discount factor for rewards
        self.n_actions = n_actions
        # epsilon-greedy policy parameters
        self.actions_count = 0  # counter used for epsilon decay
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = resnet50()
        num_ftrs = self.policy_net.fc.in_features
        self.policy_net.fc = nn.Linear(num_ftrs, self.n_actions)
        self.policy_net.conv1 = nn.Conv2d(1,
                                          64,
                                          kernel_size=7,
                                          stride=2,
                                          padding=3,
                                          bias=False)
        self.policy_net.to(self.device)
        if pretrained:
            self.policy_net.load_state_dict(torch.load(self.path))
        self.target_net = resnet50()
        self.target_net.conv1 = nn.Conv2d(1,
                                          64,
                                          kernel_size=7,
                                          stride=2,
                                          padding=3,
                                          bias=False)
        self.target_net.fc = nn.Linear(num_ftrs, self.n_actions)
        self.target_net.to(self.device)
        # initialize target_net with an exact copy of policy_net's parameters
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # disable BatchNorm/Dropout training behaviour
        # note the difference between parameters() and state_dict(): the former yields tensors with requires_grad=True
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)
        self.pretrained = pretrained

    def choose_action(self, state, train=True):
        '''Choose an action (epsilon-greedy during training).
        '''
        if train:
            self.epsilon = self.epsilon_end + (
                self.epsilon_start - self.epsilon_end) * math.exp(
                    -1. * self.actions_count / self.epsilon_decay)
            self.actions_count += 1
            #if self.pretrained:
            #   self.epsilon = self.epsilon_end
            if random.random() > self.epsilon:
                with torch.no_grad():
                    # convert to a tensor before feeding the network; state elements are originally float64
                    # note: torch.tensor(state).unsqueeze(0) is equivalent to torch.tensor([state])
                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                    state = state.unsqueeze(0).to(self.device)
                    q_value = self.policy_net(state)
                    action = q_value.max(1)[1].item()
            else:
                action = random.randrange(self.n_actions)  # random exploratory action
            return action
        else:
            with torch.no_grad():  # no gradients needed at evaluation time
                # convert to a tensor before feeding the network; state elements are originally float64
                # note: torch.tensor(state).unsqueeze(0) is equivalent to torch.tensor([state])
                state = torch.tensor(
                    state, device=self.device, dtype=torch.float32
                )  # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                q_value = self.target_net(state)
                action = q_value.max(1)[1].item()
            return action

    def update(self):

        if len(self.memory) < self.batch_size:
            return
        # sample a random batch of transitions from memory
        state_batch_, action_batch, reward_batch, next_state_batch_, done_batch = self.memory.sample(
            self.batch_size)
        state_batch = torch.ones((self.batch_size, 1, 150, 6),
                                 device=self.device,
                                 dtype=torch.float)
        for i in range(self.batch_size):
            state_batch[i] = state_batch_[i]
        next_state_batch = torch.ones((self.batch_size, 1, 150, 6),
                                      device=self.device,
                                      dtype=torch.float)
        for i in range(self.batch_size):
            next_state_batch[i] = next_state_batch_[i]
        # convert to tensors,
        # e.g. tensor([[-4.5543e-02, -2.3910e-01,  1.8344e-02,  2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02,  2.3400e-01]])
        # state_batch = torch.tensor(state_batch, device=self.device,dtype=torch.float)
        action_batch = torch.tensor(action_batch,
                                    device=self.device).unsqueeze(
                                        1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(
            reward_batch, device=self.device,
            dtype=torch.float)  # tensor([1., 1.,...,1])
        # next_state_batch = torch.tensor(
        #     next_state_batch, device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch),
                                  device=self.device).unsqueeze(
                                      1)  # convert bool to float, then to a tensor
        # compute Q(s_t, a) for the actions that were actually taken
        # torch.gather: for a = torch.Tensor([[1, 2], [3, 4]]), a.gather(1, torch.LongTensor([[0], [1]])) == torch.Tensor([[1], [3]])
        q_values = self.policy_net(state_batch).gather(
            dim=1, index=action_batch)  # calling the module is equivalent to self.forward
        # compute V(s_{t+1}) for all next states, i.e. the target_net's maximum Q-value per state
        next_state_values = self.target_net(next_state_batch).max(
            1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
        # compute expected_q_values; for terminal states (done == 1) the target reduces to the reward
        expected_q_values = reward_batch + self.gamma * \
                            next_state_values * (1 - done_batch.squeeze(1))
        # self.loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))  # Huber-loss alternative
        self.loss = nn.MSELoss()(q_values,
                                 expected_q_values.unsqueeze(1))  # mean-squared-error loss
        # optimize the model
        self.optimizer.zero_grad(
        )  # zero_grad clears the old gradients left over from the last step
        # loss.backward() computes the gradient of the loss w.r.t. all parameters that require gradients
        self.loss.backward()
        for param in self.policy_net.parameters():  # clip gradients to prevent explosion
            param.grad.data.clamp_(-1, 1)

        self.optimizer.step()  # update model parameters

    def save_model(self, path):
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        self.target_net.load_state_dict(torch.load(path))
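
The DQN-style examples in this collection all regress the online network toward the same one-step bootstrap target; as a compact summary of the update above (notation matching the code, with d the done flag):

\[
y = r + \gamma\,(1 - d)\,\max_{a'} Q_{\text{target}}(s', a'),
\qquad
\mathcal{L} = \big(Q_{\text{policy}}(s, a) - y\big)^{2}
\]
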
Exemplo n.º 23
0
class DDPG():
    """Reinforcement Learning agent , learning using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.08
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor 0.99
        self.tau = 0.001  # for soft update of target parameters 0.01

        # Score tracker and learning parameters
        self.total_reward = None
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
        self.last_state = None

    def reset_episode(self):

        self.total_reward = None
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        if self.total_reward:
            self.total_reward += reward
        else:
            self.total_reward = reward

        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of reward tuples."""

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from the target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # track best score
        self.score = self.total_reward / float(
            self.count) if self.count else -np.inf
        if self.best_score < self.score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
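
The OUNoise class used above is not included in this example; the following is a minimal sketch of a standard Ornstein-Uhlenbeck process consistent with the (size, mu, theta, sigma) constructor and the reset()/sample() calls, not the repository's actual implementation:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the long-run mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) dt + sigma * dW, with dt = 1
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
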
Exemplo n.º 24
0
class Christophers_Agent():
    def __init__(self, task):
        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        self.w = np.random.normal(
            size=(
                self.state_size, self.action_size
            ),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size)
                   ))  # start producing actions in a decent range

        self.actor = Actor(self.state_size, self.action_size, self.action_low,
                           self.action_high)
        self.critic = Critic(self.state_size, self.action_size)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.gamma = 0.95
        self.tau = 0.001

        self.best_w = None
        self.best_score = -np.inf

        self.exploration_mu = 0.5
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.4
        self.noise = Noise(self.action_size, self.exploration_mu,
                           self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 32
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_score = -np.inf
        self.num_steps = 0

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        if self.get_score() > self.best_score:
            self.best_score = self.get_score()
        self.total_reward = 0.0
        self.num_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.num_steps += 1

        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor.model.predict(state)[0]
        action = list(action +
                      self.noise.sample())  # add some noise for exploration
        return action

    def get_score(self):
        return -np.inf if self.num_steps == 0 else self.total_reward / self.num_steps

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        done = np.array([e.done for e in experiences
                         if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - done)

        self.critic.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic.model, self.critic_target.model)
        self.soft_update(self.actor.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Exemplo n.º 25
0
class Agent():
    def __init__(self,
                 env,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2):

        self.states = env.observation_space
        self.state_size = env.observation_space.shape[0]
        self.actions = env.action_space
        self.action_size = env.action_space.shape[0]
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, step, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.actor.forward(state)
        action = action.detach().cpu().numpy()

        if epsilon:
            noise = np.random.normal(0, 0.1, action.shape[0])
            action += noise

        return action

    def update(self, step):

        state, action, reward, next_state, done = self.memory.sample()

        next_state_action = self.target_actor(next_state)

        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
Exemplo n.º 26
0
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 action_sigma=0.1,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2,
                 seed=0):
        '''
        TD3 Agent
        :param state_size: state dimension
        :param action_size: action dimension
        :param action_sigma: standard deviation of the noise added to the action at acting time
        :param memory_size: replay buffer capacity
        :param batch: mini-batch size drawn from the replay buffer
        :param sigma: standard deviation of the noise added to the target policy (Section 5.3 of the TD3 paper)
        :param noise_clip: clipping range for the target-policy smoothing noise
        :param gamma: discount factor
        :param update_frequency: number of critic updates per (delayed) actor/target update
        :param seed: random seed
        '''

        self.state_size = state_size
        self.action_size = action_size

        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # second critic as described in the TD3 paper:
        # https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        # second critic as described in the TD3 paper:
        # https://arxiv.org/pdf/1802.09477.pdf
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, epsilon=True):

        state = torch.from_numpy(np.asarray(state)).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor.forward(state).cpu().data.numpy()
        self.actor.train()

        if epsilon:
            #if we want to inject some noise
            noise = np.random.normal(0, self.action_sigma, action.shape[0])
            action += noise

        return action

    def update(self, step):
        '''
        https://arxiv.org/pdf/1802.09477.pdf
        This update is very similar to the typical DDPG update, except that
        1) there are two critics to update
        2) the target uses the minimum of the two critics' outputs
        3) noise is injected into the target policy's action (Section 5.3 of the paper)
        4) the actor (and the target networks) are updated only every few steps
        A sketch of the resulting target computation is given after this class.

        :param step: current step counter, used to delay the actor/target updates
        :return:
        '''

        state, action, reward, next_state, done = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        next_state_action = self.target_actor(next_state)

        # sample Gaussian noise for target-policy smoothing (clipped below)
        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip,
                            self.noise_clip).to(device)

        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)

        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)

        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss

        # as described in the paper, actor updates are delayed relative to critic updates

        if step % self.update_frequency == 0:

            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # ----------------------- update target networks ------------------- #
            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)
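
As referenced in the update() docstring above, the target this agent computes is the clipped double-Q target with target-policy smoothing; roughly (sigma the smoothing noise scale, c the noise_clip, d the done flag):

\[
\tilde{a} = \pi_{\phi'}(s') + \epsilon, \qquad \epsilon \sim \operatorname{clip}\big(\mathcal{N}(0, \sigma),\, -c,\, c\big),
\]
\[
y = r + \gamma\,(1 - d)\,\min_{i = 1, 2} Q_{\theta'_i}(s', \tilde{a}).
\]
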
Exemplo n.º 27
0
class DQN:
    def __init__(self,
                 n_states,
                 n_actions,
                 gamma=0.99,
                 epsilon_start=0.9,
                 epsilon_end=0.05,
                 epsilon_decay=200,
                 memory_capacity=10000,
                 policy_lr=0.01,
                 batch_size=128,
                 device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions
        self.device = device
        self.gamma = gamma
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # disable BatchNorm/Dropout training behaviour
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def select_action(self, state):
        '''Choose an action (epsilon-greedy).
        Args:
            state [array]: current state
        Returns:
            [int]: index of the chosen action
        '''
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.actions_count / self.epsilon_decay)
        self.actions_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                state = torch.tensor(
                    [state], device=self.device, dtype=torch.float32
                )  # convert to a tensor before feeding the network (state elements are originally float64); torch.tensor(state).unsqueeze(0) is equivalent to torch.tensor([state])
                q_value = self.policy_net(
                    state
                )  # tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return action

    def update(self):

        if len(self.memory) < self.batch_size:
            return

        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)

        state_batch = torch.tensor(
            state_batch, device=self.device, dtype=torch.float
        )  # e.g. tensor([[-4.5543e-02, -2.3910e-01,  1.8344e-02,  2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02,  2.3400e-01]])
        action_batch = torch.tensor(action_batch,
                                    device=self.device).unsqueeze(
                                        1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(
            reward_batch, device=self.device,
            dtype=torch.float)  # tensor([1., 1.,...,1])
        next_state_batch = torch.tensor(next_state_batch,
                                        device=self.device,
                                        dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch),
                                  device=self.device).unsqueeze(
                                      1)  # convert bool to float, then to a tensor
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        q_values = self.policy_net(state_batch).gather(
            1, action_batch)  # calling the module is equivalent to self.forward
        # Compute V(s_{t+1}) for all next states, based on the "older" target_net,
        # taking the best action value with max(1)[0]; terminal states are zeroed
        # out below via the (1 - done) factor.

        next_state_values = self.target_net(next_state_batch).max(
            1)[0].detach()  # tensor([ 0.0060, -0.0171,...,])
        # Compute the expected Q values
        expected_q_values = reward_batch + self.gamma * next_state_values * (
            1 - done_batch.squeeze(1))

        # Compute the loss (MSE here; F.smooth_l1_loss would give a Huber loss instead)
        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
        # Optimize the model
        self.optimizer.zero_grad(
        )  # zero_grad clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls).
        self.loss.backward(
        )  # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
        for param in self.policy_net.parameters():  # clip gradients to prevent explosion
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step(
        )  # causes the optimizer to take a step based on the gradients of the parameters.
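
The ReplayBuffer class itself is not shown in these snippets; below is a minimal deque-based sketch consistent with how the DQN update above unpacks memory.sample(batch_size). The storing method is assumed to be push here (other examples call it add), and the actual buffers in each repository may differ:

import random
from collections import deque


class ReplayBuffer:
    """Fixed-size uniform buffer of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into tuples of batched fields.
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)
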
Exemplo n.º 28
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_episodes, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_episodes (int): number of training episodes (used to anneal beta)
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.anneal_beta = (1. - BETA) / num_episodes

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA, BETA)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.t_learning_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def update_weights(self):
        self.memory.anneal_beta(self.anneal_beta)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, idxs, weights = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # update priorities
        updates = torch.abs(Q_expected - Q_targets).cpu().data.squeeze(1).numpy()
        self.memory.update_priorities(idxs, updates)

        # Compute the per-sample loss so the importance-sampling weights apply element-wise
        loss = F.l1_loss(Q_expected, Q_targets, reduction='none')

        # Minimize the weighted loss
        self.optimizer.zero_grad()
        # reshape the weights so they broadcast over the (batch, 1) per-sample losses
        (loss * weights.view(-1, 1)).mean().backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.t_learning_step += 1
        if self.t_learning_step % UPDATE_TARGET_STEPS == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            # PyTorch in-place copy: destination.data.copy_(source.data)
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
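
The prioritized buffer behind this agent (ALPHA, BETA, update_priorities, anneal_beta) is not shown; for reference, the proportional prioritization of Schaul et al. that such a buffer typically implements samples transition i with probability P(i) and corrects the resulting bias with importance-sampling weights w_i:

\[
P(i) = \frac{p_i^{\alpha}}{\sum_k p_k^{\alpha}},
\qquad
w_i = \big(N \cdot P(i)\big)^{-\beta},
\qquad
p_i \leftarrow |\delta_i| + \varepsilon,
\]

where \(\delta_i\) is the TD error (the updates array computed in learn()) and \(\beta\) is annealed toward 1 over training, which is what anneal_beta appears to do here.
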
Exemplo n.º 29
0
def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render",
                        action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval",
                        type=int,
                        default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts",
                        type=int,
                        default=1000,
                        help="Number of max rollouts")
    parser.add_argument("--logfile",
                        type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument(
        "--load_params",
        type=str,
        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params",
                        type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    parser.add_argument("--gamma",
                        type=float,
                        default=0.99,
                        help="Discount factor")
    parser.add_argument("--test", action="store_true", help="Test the params")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0,
                                timeInterval=0.1,
                                goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 1, 0, 1]),
                                initialAngles=np.array(
                                    [0, 45, -20, -20, 0, -20, 0]))

    # create space
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionSpace = ContinuousSpace(ranges=env.action_range())

    # create the model and policy functions
    modelFn = PoWERDistribution(stateSpace.n,
                                actionSpace.n,
                                sigma=5.0 if not args.test else 0)
    if args.load_params:
        print("Loading params...")
        modelFn.load_params(args.load_params)

    replayBuffer = ReplayBuffer(1024)
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done and steps < 5:
            if stopsig:
                break
            action, eps = modelFn.predict(
                state, replayBuffer.sample(gamma=args.gamma))
            if steps == 4:
                action[-1] = 1.0
            nextState, reward, done, info = env.step(action)
            replayBuffer.append(state,
                                action,
                                reward,
                                nextState=nextState,
                                info={"eps": eps})
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        # no importance sampling yet; implement it when working with small datasets
        replayBuffer.reset()
        dataset = replayBuffer.sample(gamma=args.gamma)
        modelFn.fit(dataset)

        avgR = np.sum(dataset["rewards"]) / float(len(dataset["rewards"]))
        avgQ = np.sum(dataset["values"]) / float(len(dataset["values"]))
        print("Rollouts:", rollout, "Error:", modelFn.score(), "Average Q",
              avgQ, "Average R", avgR)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(modelFn.score()) + ", " +
                      str(avgQ) + ", " + str(avgR) + "]\n")
        rollout += 1

    if args.logfile:
        log.close()
    if args.save_params:
        print("Saving params...")
        modelFn.save_params(args.save_params)
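
PoWERDistribution.fit is not shown in this snippet; for context, the PoWER algorithm of Kober and Peters performs a return-weighted update of the exploration parameters, roughly of the form below, which is what the rollout-then-refit loop above appears to approximate (eps_t is the per-step exploration noise stored in the buffer's info dict):

\[
\theta_{k+1} = \theta_k + \frac{\mathbb{E}\big[\sum_t \epsilon_t\, Q^{\pi}(s_t, a_t)\big]}{\mathbb{E}\big[\sum_t Q^{\pi}(s_t, a_t)\big]}
\]
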
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, apply_dueling=False, apply_double=False):
        """
        Initialize a Unity agent object.
        :param state_size: (int) dimension of each state
        :param action_size: (int) dimension of each action
        :param seed: (int) random seed
        """
        # assert on the boolean itself, not on a tuple (a non-empty tuple is always truthy)
        assert self._true_xor(apply_dueling, apply_double), \
            "Choose one between dueling networks or DDQN"

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.apply_dueling = apply_dueling
        self.apply_double = apply_double

        # Q-Network
        self.q_net_target = QNetwork(state_size, action_size, seed, apply_dueling=apply_dueling).to(device)
        self.q_net_local = QNetwork(state_size, action_size, seed, apply_dueling=apply_dueling).to(device)
        self.opt = optim.Adam(self.q_net_local.parameters(), lr=LR)

        # Replay memory
        self.memory_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    @staticmethod
    def _true_xor(*args):
        return sum(args) == 1

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory buffer for future experience replay
        :param state: The current state of the agent
        :param action: The action that the agent has taken in given state
        :param reward: The reward associated with the state action combination
        :param next_state: The resulting state after taking action in previous state
        :param done: (bool) Has the terminal state been reached?
        :return: None
        """
        self.memory_buffer.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_CYCLE
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn from it
            if BATCH_SIZE < len(self.memory_buffer):
                experiences = self.memory_buffer.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """
        Returns actions for given state as per current policy.
        :param state: (array_like) current state
        :param eps: (float) epsilon, for epsilon-greedy action selection
        :return: (int) The index of the action to be taken by the agent
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_net_local.eval()
        with torch.no_grad():  # Do not perform a forward pass in this context
            action_values = self.q_net_local(state)
        self.q_net_local.train()

        # Epsilon-greedy action selection
        greed_p = random.random()

        return np.argmax(action_values.cpu().data.numpy()) if greed_p > eps else \
            random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """
        Update value parameters using given batch of experience tuples.
        :param experiences: (Tuple[torch.Tensor]) tuple of (s, a, r, s', done) tuples
        :param gamma: (float) discount factor
        :return:
        """
        states, actions, rewards, next_states, done_signals = experiences

        if not self.apply_double:
            # Get max predicted Q values for the next state of the target model.
            Q_targets_next = self.q_net_target(next_states).detach().max(1)[0].unsqueeze(1)
        else:
            # In the case of Double-DQN, select the best next action with the local network...
            indices = torch.argmax(self.q_net_local(next_states).detach(), 1)  # indices of the selected next actions
            # ...and evaluate that action with the target network's set of parameters
            Q_targets_next = self.q_net_target(next_states).detach().gather(1, indices.unsqueeze(1))

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - done_signals))

        # Get expected Q values from the local model (the one being trained)
        # x.gather(1, actions) selects, for each row of x, the Q-value at the index
        # of the action that was actually taken
        Q_expected = self.q_net_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        # perform network update
        self.soft_update(self.q_net_local, self.q_net_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """
        Soft update model parameters, given by the function:
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: (PyTorch model) weights will be copied from
        :param target_model: (PyTorch model) weights will be copied to
        :param tau: (float) interpolation parameter
        :return:
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
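
As a compact summary of the apply_double branch above, Double DQN decouples action selection (local network) from action evaluation (target network):

\[
y^{\text{DDQN}} = r + \gamma\,(1 - d)\; Q_{\text{target}}\!\big(s',\, \arg\max_{a'} Q_{\text{local}}(s', a')\big),
\]

versus the vanilla target \(y = r + \gamma\,(1 - d)\,\max_{a'} Q_{\text{target}}(s', a')\) used when apply_double is False.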