Example #1
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.95,
                 epsilon_min=0.05,
                 epsilon_decay=0.995,
                 exploration_type='e-annealing',
                 learning_type='dq',
                 replay_buffer_size=1e5):
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        self.exploration_type = exploration_type
        self.learning_type = learning_type

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size)

        # start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
Example #2
    def __init__(self,
                 name,
                 Q_current,
                 Q_target,
                 num_actions,
                 discount_factor,
                 batch_size,
                 epsilon,
                 epsilon_decay,
                 boltzmann,
                 double_q,
                 buffer_capacity,
                 random_probs=None):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        # save hyperparameters in folder

        self.name = name  # probably useless
        self.Q_current = Q_current
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.boltzmann = boltzmann

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.buffer_capacity = buffer_capacity

        self.double_q = double_q

        self.random_probs = random_probs

        # define replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
Example #3
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.05,
                 act_probabilities=None,
                 double_q=False,
                 buffer_capacity=100000,
                 prefill_bs_percentage=5):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity,
                                          min_fill=prefill_bs_percentage *
                                          batch_size)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

        # <JAB>
        if act_probabilities is None:
            self.act_probabilities = np.ones(num_actions) / num_actions
        else:
            self.act_probabilities = act_probabilities

        self.double_dqn = double_q
Example #4
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 game="cartpole",
                 explore_type="epsilon_greedy",
                 epsilon_decay=1,
                 epsilon_min=0.05,
                 tau=1,
                 method="CQL",
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.05):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        # currently supports two games: cartpole and carracing
        self.game = game
        # self.state_dim = Q.
        self.epsilon = epsilon
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        # currently supports CQL (classic Q-learning) or DQL (double Q-learning)
        self.method = method
        self.explore_type = explore_type
        # for epsilon annealing
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        # for boltzmann exploration
        self.tau = tau
        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
Example #5
    def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q      
        self.Q_target = Q_target
        
        self.epsilon = epsilon

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
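
The snippet above stops at the constructor. As a reference, here is a minimal sketch of the batch update such a constructor is typically paired with; it assumes the Q.predict / Q.update / Q_target.update and ReplayBuffer.next_batch interfaces used in the fuller examples below (e.g. Example #10), so treat it as illustrative rather than the original project's code.

    def train(self, state, action, next_state, reward, terminal):
        # Sketch only: standard DQN batch update against the slowly-updated target network.
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
            self.replay_buffer.next_batch(self.batch_size)

        # TD target: r + gamma * max_a Q_target(s', a) for non-terminal transitions
        td_target = batch_rewards.copy()
        non_terminal = np.logical_not(batch_dones)
        td_target[non_terminal] += self.discount_factor * np.max(
            self.Q_target.predict(self.sess, batch_next_states), axis=1)[non_terminal]

        loss = self.Q.update(self.sess, batch_states, batch_actions, td_target)
        self.Q_target.update(self.sess)  # soft update of the target network
        return loss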
Example #6
    def __init__(self, Q, Q_target, num_actions, gamma=0.95, batch_size=64, epsilon=0.1, tau=0.01, lr=1e-4, history_length=0):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            gamma: discount factor of future rewards.
            batch_size: Number of samples per batch.
            tau: indicates the speed of adjustment of the slowly updated target network.
            epsilon: Chance to sample a random action. Float between 0 and 1.
            lr: learning rate of the optimizer
        """
        # setup networks
        self.Q = Q.cuda()
        self.Q_target = Q_target.cuda()
        self.Q_target.load_state_dict(self.Q.state_dict())

        # define replay buffer
        self.replay_buffer = ReplayBuffer(history_length)
        
        # parameters
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon

        self.loss_function = torch.nn.MSELoss()
        self.optimizer = optim.Adam(self.Q.parameters(), lr=lr)

        self.num_actions = num_actions
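
This PyTorch variant also ends at the constructor, although the docstring already mentions tau (soft target updates) and the Adam optimizer. Below is a minimal sketch of the update step those parameters imply; the batch format (numpy arrays) and the Q-network output shape (batch_size x num_actions) are assumptions.

    def soft_update(self):
        # theta_target <- tau * theta + (1 - tau) * theta_target
        for target_param, param in zip(self.Q_target.parameters(), self.Q.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1.0 - self.tau) * target_param.data)

    def train_step(self, states, actions, next_states, rewards, dones):
        # Sketch only: assumes numpy batches of shape (batch_size, ...) from the replay buffer.
        states = torch.tensor(states, dtype=torch.float32).cuda()
        actions = torch.tensor(actions, dtype=torch.int64).cuda()
        next_states = torch.tensor(next_states, dtype=torch.float32).cuda()
        rewards = torch.tensor(rewards, dtype=torch.float32).cuda()
        dones = torch.tensor(dones, dtype=torch.float32).cuda()

        # TD target: r + gamma * max_a Q_target(s', a), zeroed out on terminal transitions
        with torch.no_grad():
            max_next_q = self.Q_target(next_states).max(dim=1)[0]
            targets = rewards + self.gamma * max_next_q * (1.0 - dones)

        q_values = self.Q(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        loss = self.loss_function(q_values, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update()
        return loss.item()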
Example #7
    def __init__(self,
                 id,
                 env,
                 state_size,
                 action_size,
                 n_episodes,
                 lr,
                 gamma,
                 global_network,
                 target_network,
                 q,
                 max_t=1000,
                 eps_start=1.0,
                 eps_end=0.01,
                 eps_decay=0.995):
        super(DynaQAgent, self).__init__()
        self.id = id
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.n_episodes = n_episodes
        self.gamma = gamma

        self.q = q

        self.local_memory = ReplayBuffer(self.action_size, BUFFER_SIZE,
                                         BATCH_SIZE)

        self.t_step = 0
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

        self.global_network = global_network
        self.target_network = target_network

        self.optimizer = optim.SGD(self.global_network.parameters(),
                                   lr=lr,
                                   momentum=.5)

        self.scores_window = deque(maxlen=100)  # last 100 scores
Example #8
    def __init__(self,
                 id,
                 env,
                 do_render,
                 state_size,
                 action_size,
                 n_episodes,
                 lr,
                 gamma,
                 update_every,
                 global_network,
                 target_network,
                 max_t=1000,
                 eps_start=1.0,
                 eps_end=0.01,
                 eps_decay=0.995):
        super(DQNAgent, self).__init__()
        self.id = id
        self.env = env
        self.do_render = do_render
        self.state_size = state_size
        self.action_size = action_size
        self.n_episodes = n_episodes
        self.gamma = gamma
        self.update_every = update_every

        self.local_memory = ReplayBuffer(env.action_space.n, BUFFER_SIZE,
                                         BATCH_SIZE)

        self.global_network = global_network
        self.qnetwork_target = target_network

        self.optimizer = optim.SGD(self.global_network.parameters(),
                                   lr=lr,
                                   momentum=.5)

        self.t_step = 0
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
Example #9
    def __init__(self, observation_space, action_space):
        """
        Changes the frame to match the input format produced by the PyTorchFrame wrapper,
        then creates a replay buffer and a vanilla DQN into which the weights are loaded.
        """
        shape = observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0,
                                                high=1.0,
                                                shape=(shape[-1], shape[0],
                                                       shape[1]),
                                                dtype=np.uint8)
        self.action_space = action_space
        self.memory = ReplayBuffer(int(5e3))
        self.policy_network = DQN(self.observation_space, self.action_space)
        self.policy_network.load_state_dict(
            torch.load("checkpoints/40000.pth",
                       map_location=torch.device(device)))
        self.policy_network.eval()
Example #10
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 game,
                 exploration,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.2,
                 epsilon_decay=0.99,
                 epsilon_min=0.03):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        self.exploration = exploration

        self.game = game

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, done):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #       2.1 compute td targets:
        #              td_target =  reward + discount * max_a Q_target(next_state_batch, a)
        #       2.2 update the Q network
        #              self.Q.update(...)
        #       2.3 call soft update for target network
        #              self.Q_target.update(...)

        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          done)
        batch_state, batch_action, batch_next_state, batch_rewards, batch_done = self.replay_buffer.next_batch(
            self.batch_size)

        td_target = batch_rewards
        #td_target += self.discount_factor * np.amax(self.Q_target.predict(self.sess, batch_next_state)) #use this or think of something better

        # double Q-learning: select the greedy action with Q, evaluate it with Q_target
        best_action = np.argmax(
            self.Q.predict(self.sess,
                           batch_next_state)[np.logical_not(batch_done)], 1)
        td_target[np.logical_not(
            batch_done)] += self.discount_factor * self.Q_target.predict(
                self.sess, batch_next_state)[np.logical_not(batch_done),
                                             best_action]

        self.Q.update(self.sess, batch_state, batch_action, td_target)
        self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()

        if deterministic:

            action_id = np.argmax(self.Q.predict(self.sess, [state]))

        else:

            if self.exploration == "greedy":

                if self.epsilon > self.epsilon_min:

                    self.epsilon *= self.epsilon_decay

                r = np.random.uniform()

                if r > self.epsilon:
                    # TODO: take greedy action (argmax)
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))

                else:

                    # TODO: sample random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
                    # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight.
                    # To see how the agent explores, turn the rendering in the training on and look what the agent is doing.
                    # action_id = ...

                    if self.game == "cartpole":
                        action_id = np.random.randint(
                            self.num_actions)  # uniform over the action set

                    # elif self.game == "carracing":
                    #     action_id = ...  (biased sampling, see the hint above
                    #     and the sketch after this example)

                    else:
                        print('Please enter a valid game.')
                        # fall back to a uniform random action rather than
                        # leaving action_id undefined
                        action_id = np.random.randint(self.num_actions)

    # if exploration == "boltzmann":

    #  else:

        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
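
Example #10 leaves the CarRacing random-action branch and the Boltzmann branch as TODOs. The following self-contained sketch shows one common way to fill them in; the probability vector and the temperature tau are illustrative assumptions, patterned on Examples #11 and #16.

import numpy as np

def sample_exploratory_action(q_values, num_actions, game="carracing",
                              exploration="greedy", tau=1.0):
    # Sketch only. For "greedy" exploration the random branch is biased towards
    # accelerating / going straight in CarRacing; the probability vector must
    # have num_actions entries and sum to 1.
    if exploration == "greedy":
        if game == "carracing":
            probabilities = np.array([2.0, 5.0, 5.0, 10.0, 1.0])
            probabilities /= probabilities.sum()
            return np.random.choice(num_actions, p=probabilities)
        return np.random.randint(num_actions)
    # Boltzmann exploration: softmax over Q-values with temperature tau
    prob = np.exp((q_values - np.max(q_values)) / tau)
    prob /= prob.sum()
    return np.random.choice(num_actions, p=prob)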
Example #11
class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05,epsilon_decay=1, epsilon_min=0.05,tau=1, game='cartpole',exploration="epsilon_greedy", history_length=0) :#, load_data=False):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q      
        self.Q_target = Q_target
        self.game = game

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.tau = tau
        self.epsilon_min = epsilon_min

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        self.exploration = exploration

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()


    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update: 
        #       2.1 compute td targets: 
        #              td_target =  reward + discount * max_a Q_target(next_state_batch, a)
        #       2.2 update the Q network
        #              self.Q.update(...)
        #       2.3 call soft update for target network
        #              self.Q_target.update(...)
   
        '''
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)        
        states, actions, next_states, rewards, dones = self.replay_buffer.next_batch (self.batch_size)
 
        target_f = np.zeros((self.batch_size))

        for i in range(self.batch_size):
            if dones[i]:
                target_f[i] = rewards[i]
            else:
                target_f[i] = rewards[i] + self.discount_factor * np.max(self.Q_target.predict(self.sess, [next_states[i]]), 1)
                

        loss = self.Q.update(self.sess, states, actions, target_f)

        self.Q_target.update(self.sess)
        '''
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)
        batch_state, batch_action, batch_next_state, batch_rewards, batch_done = self.replay_buffer.next_batch(self.batch_size)

        td_target =  batch_rewards

        best_action = np.argmax(self.Q.predict(self.sess, batch_next_state)[np.logical_not(batch_done)], 1)
	
        td_target[np.logical_not(batch_done)] += self.discount_factor * self.Q_target.predict(self.sess, batch_next_state)[np.logical_not(batch_done), best_action]

        loss = self.Q.update(self.sess, batch_state, batch_action, td_target)
        self.Q_target.update(self.sess)


        return loss


    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        '''
        r = np.random.uniform()

        if deterministic or r > self.epsilon:
            # TODO: take greedy action (argmax)
            # action_id = ...
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
        
            if self.game == 'cartpole':
                action_id = random.randrange(self.num_actions)
            elif self.game == 'carracing':

            # TODO: sample random action
            # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. 
            # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight.
            # To see how the agent explores, turn the rendering in the training on and look what the agent is doing.
            # action_id = ...
                
                probabilities = [0.1, 0.2, 0.2, 0.45, 0.05]
                
                action_id = np.random.choice (self.num_actions, p=probabilities)
        '''

        if deterministic:
            
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
            if self.exploration == "greedy":
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay
                r = np.random.uniform()
                if r > self.epsilon:
                    # TODO: take greedy action (argmax)
                    
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))
                else:
                    # TODO: sample random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. 
                    # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight.
                    # To see how the agent explores, turn the rendering in the training on and look at what the agent is doing.
                    if self.game == "cartpole" :
                        action_id = np.random.randint(self.num_actions)
                    elif self.game == "carracing":
                        probabilities = [0.15, 0.15, 0.15, 0.3, 0.05, 0.1, 0.1]
                
                        action_id = np.random.choice (self.num_actions, p=probabilities)
                    else:
                        print("Invalid game")
            elif self.exploration == "boltzmann":
                action_value = self.Q.predict(self.sess, [state])[0]
                prob = self.softmax(action_value/self.tau)
                action_id = np.random.choice(self.num_actions, p=prob)
            else:
                print("Invalid Exploration Type")


        return action_id


    def softmax(self, input):
        """
        Safe Softmax function to avoid overflow
        Args:
            input: input vector
        Returns:
            prob: softmax of input
        """
        input_max = np.max(input)
        e = np.exp(input-input_max)
        prob = e / np.sum(e)
        return prob


    def load(self, file_name):
        self.saver.restore(self.sess, file_name)


    def check_early_stop(self, reward, totalreward):
        return self.Q_target.check_early_stop (reward, totalreward)
Example #12
def main():
    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           worker_id=1, retro=True, realtime_mode=False, config=config)
    print(env.observation_space)
    print(env.action_space)

    hyper_params = {
        "seed": 6,  # which seed to use
        "replay-buffer-size": int(5e3),  # replay buffer size
        "learning-rate": 1e-4,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": int(1e6),  # total number of steps to run the environment for
        "batch-size": 32,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 1,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 1000,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.05,  # fraction of num-steps
        "print-freq": 10
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    #assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip"
    #env = gym.make(hyper_params["env"])
    env.seed(hyper_params["seed"])

    #env = NoopResetEnv(env, noop_max=30)
    #env = MaxAndSkipEnv(env, skip=4)
    #env = EpisodicLifeEnv(env)
    #env = FireResetEnv(env)
    # env = WarpFrame(env)
    env = PyTorchFrame(env)
    # env = ClipRewardEnv(env)
    # env = FrameStack(env, 4)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=hyper_params["use-double-dqn"],
        lr=hyper_params["learning-rate"],
        batch_size=hyper_params["batch-size"],
        gamma=hyper_params["discount-factor"]
    )

    model_num = 500
    agent.policy_network.load_state_dict(torch.load('./Models/' + str(model_num) + '_policy.pt',map_location=torch.device(device)))

    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]
    ep_nums = model_num

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()
        # TODO
        #  select a random action if sample is less than or equal to eps_threshold
        # take a step in the env
        # add state, action, reward, next_state, float(done) to the replay memory - cast done to float
        # add reward to episode_reward
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)
            ep_nums += 1
            if ep_nums % 50 == 0:
                agent.save_models(ep_nums)
                plot(episode_rewards,ep_nums)




        if t > hyper_params["learning-starts"] and t % hyper_params["learning-freq"] == 0:
            agent.optimise_td_loss()

        if t > hyper_params["learning-starts"] and t % hyper_params["target-update-freq"] == 0:
            agent.update_target_network()

        num_episodes = len(episode_rewards)

        if done and hyper_params["print-freq"] is not None and len(episode_rewards) % hyper_params[
            "print-freq"] == 0:
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            print("********************************************************")
            print("steps: {}".format(t))
            print("episodes: {}".format(num_episodes))
            print("mean 100 episode reward: {}".format(mean_100ep_reward))
            print("% time spent exploring: {}".format(int(100 * eps_threshold)))
            print("********************************************************")


        #if done and ep_nums % 10 == 0:
        #    animate(env,agent,"anim/progress_"+str(ep_nums))
        #    state = env.reset()

    animate(env,agent,"anim/final")


    env.close()
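
The loop above anneals eps_threshold linearly over eps-fraction * num-steps environment steps; with the hyperparameters in this example that is 0.05 * 1e6 = 50,000 steps, after which the threshold stays at eps-end = 0.01. The same schedule as a standalone function (name chosen here for illustration):

def linear_epsilon(t, eps_start=1.0, eps_end=0.01, eps_fraction=0.05, num_steps=int(1e6)):
    # fraction of the annealing window already consumed, capped at 1.0
    fraction = min(1.0, float(t) / (eps_fraction * float(num_steps)))
    return eps_start + fraction * (eps_end - eps_start)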
Example #13
File: train_atari.py, Project: raillab/dqn
    np.random.seed(args.seed)
    random.seed(args.seed)

    assert "NoFrameskip" in args.env, "Require environment with no frameskip"
    env = gym.make(args.env)
    env.seed(args.seed)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 4)

    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=args.use_double_dqn,
        lr=args.lr,
        batch_size=args.batch_size,
        gamma=args.gamma
    )

    eps_timesteps = args.eps_fraction * float(args.num_steps)
    episode_rewards = [0.0]
    loss = [0.0]
Example #14
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.995,
                 batch_size=64,
                 epsilon=0.05):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.99

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        self.neg_reward_counter = 0
        self.max_neg_rewards = 100

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)
        # 2. sample next batch and perform batch update:
        #self.gas_actions = np.array([a == 3 for a in self.replay_buffer._data.actions])
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)

        td_target = batch_rewards
        td_target[np.logical_not(
            batch_dones)] += self.discount_factor * np.amax(
                self.Q_target.predict(self.sess, batch_next_states),
                1)[np.logical_not(batch_dones)]
        #print(batch_actions)
        loss = self.Q.update(self.sess, batch_states, batch_actions, td_target)

        self.Q_target.update(self.sess)

        #if self.epsilon > self.epsilon_min:
        #   self.epsilon *= self.epsilon_decay
        #print(self.epsilon)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            act_values = self.Q.predict(self.sess, [state])
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
            #print("I PREDICTED")
            #print("action_id_predicted: ", action_id)
            return action_id
        else:
            action_id = np.random.choice(
                [0, 1, 2, 3, 4],
                p=[0.3, 0.1, 0.1, 0.49,
                   0.01])  #straight, left, right, accelerate, brake
            # TODO: sample random action
            # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
            # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight.
            # To see how the agent explores, turn the rendering in the training on and look what the agent is doing.
            # print("action_id: ", action_id)
            #print("action_id_random: ", action_id)
            return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
Example #15
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 4)
    env = gym.wrappers.Monitor(
        env,
        './video/',
        video_callable=lambda episode_id: episode_id % 50 == 0,
        force=True)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=hyper_params["use-double-dqn"],
        lr=hyper_params['learning-rate'],
        batch_size=hyper_params['batch-size'],
        gamma=hyper_params['discount-factor'],
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        dqn_type=hyper_params["dqn_type"])

    if (args.load_checkpoint_file):
        print(f"Loading a policy - { args.load_checkpoint_file } ")
        agent.policy_network.load_state_dict(
Example #16
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 game="cartpole",
                 explore_type="epsilon_greedy",
                 epsilon_decay=1,
                 epsilon_min=0.05,
                 tau=1,
                 method="CQL",
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.05):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        # currently supports two games: cartpole and carracing
        self.game = game
        # self.state_dim = Q.
        self.epsilon = epsilon
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        # currently supports CQL (classic Q-learning) or DQL (double Q-learning)
        self.method = method
        self.explore_type = explore_type
        # for epsilon annealing
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        # for boltzmann exploration
        self.tau = tau
        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #       2.1 compute td targets:
        #              td_target =  reward + discount * max_a Q_target(next_state_batch, a)
        #       2.2 update the Q network
        #              self.Q.update(...)
        #       2.3 call soft update for target network
        #              self.Q_target.update(...)
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)
        td_target = batch_rewards
        if self.method == "CQL":
            td_target[np.logical_not(
                batch_dones)] += self.discount_factor * np.max(
                    self.Q_target.predict(self.sess, batch_next_states),
                    1)[np.logical_not(batch_dones)]
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)
        elif self.method == "DQL":
            best_action = np.argmax(
                self.Q.predict(self.sess,
                               batch_next_states)[np.logical_not(batch_dones)],
                1)
            td_target[np.logical_not(
                batch_dones)] += self.discount_factor * self.Q_target.predict(
                    self.sess, batch_next_states)[np.logical_not(batch_dones),
                                                  best_action]
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        if deterministic:
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
            if self.explore_type == "epsilon_greedy":
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay
                r = np.random.uniform()
                if r > self.epsilon:
                    # TODO: take greedy action (argmax)
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))
                else:
                    # TODO: sample random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
                    # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight.
                    # To see how the agent explores, turn the rendering in the training on and look at what the agent is doing.
                    if self.game == "cartpole" or self.game == "mountaincar":
                        action_id = np.random.randint(self.num_actions)
                    elif self.game == "carracing":
                        # action_probability = np.array([1, 2, 2, 10, 1, 1, 1])
                        action_probability = np.array([2, 5, 5, 10, 1])
                        action_probability = action_probability / np.sum(
                            action_probability)
                        action_id = np.random.choice(self.num_actions,
                                                     p=action_probability)
                    else:
                        print("Invalid game")
            elif self.explore_type == "boltzmann":
                action_value = self.Q.predict(self.sess, [state])[0]
                prob = self.softmax(action_value / self.tau)
                action_id = np.random.choice(self.num_actions, p=prob)
            else:
                print("Invalid Exploration Type")
        return action_id

    def softmax(self, input):
        """
        Safe Softmax function to avoid overflow
        Args:
            input: input vector
        Returns:
            prob: softmax of input
        """
        input_max = np.max(input)
        e = np.exp(input - input_max)
        prob = e / np.sum(e)
        return prob

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
Example #17
class DQNAgent(mp.Process):
    def __init__(self,
                 id,
                 env,
                 do_render,
                 state_size,
                 action_size,
                 n_episodes,
                 lr,
                 gamma,
                 update_every,
                 global_network,
                 target_network,
                 max_t=1000,
                 eps_start=1.0,
                 eps_end=0.01,
                 eps_decay=0.995):
        super(DQNAgent, self).__init__()
        self.id = id
        self.env = env
        self.do_render = do_render
        self.state_size = state_size
        self.action_size = action_size
        self.n_episodes = n_episodes
        self.gamma = gamma
        self.update_every = update_every

        self.local_memory = ReplayBuffer(env.action_space.n, BUFFER_SIZE,
                                         BATCH_SIZE)

        self.global_network = global_network
        self.qnetwork_target = target_network

        self.optimizer = optim.SGD(self.global_network.parameters(),
                                   lr=lr,
                                   momentum=.5)

        self.t_step = 0
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

    def act(self, state, eps=0.):
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            with torch.no_grad():
                action_values = self.global_network(state)

            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.local_memory.add(state, action, reward, next_state, done)

        # Increment local timer
        self.t_step += 1

        # If enough samples are available in memory, get random subset and learn
        # Learn every UPDATE_EVERY time steps.
        if self.t_step % self.update_every == 0:
            if self.t_step > BATCH_SIZE:
                experiences = self.local_memory.sample(BATCH_SIZE)
                self.learn(experiences)

    def compute_loss(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target.forward(
            next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        # Q_expected = self.qnetwork_local(states).gather(1, actions)
        Q_expected = self.global_network.forward(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def learn(self, experiences):

        loss = self.compute_loss(experiences)

        # Update gradients per HogWild! algorithm
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.global_network, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def run(self):
        scores = []
        scores_window = deque(maxlen=100)  # last 100 scores
        eps = self.eps_start  # initialize epsilon
        start_time = time.time()
        for i_episode in range(1, self.n_episodes + 1):
            state = self.env.reset()
            score = 0
            for t in range(self.max_t):
                action = self.act(state, eps)
                if self.do_render:
                    self.env.render()
                next_state, reward, done, _ = self.env.step(action)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(self.eps_end, self.eps_decay * eps)  # decrease epsilon
            elapsed_time = time.time() - start_time
            if self.id == 0:
                print(
                    '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                    .format(self.id, i_episode, np.mean(scores_window)) +
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if i_episode % 100 == 0:
                print(
                    '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                    .format(self.id, i_episode, np.mean(scores_window)) +
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if np.mean(scores_window) >= 200.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.global_network.state_dict(), 'checkpoint.pth')
                break
Example #18
              'allowed-floors': 0,
              }
    worker_id = int(np.random.randint(999, size=1))
    print(worker_id)
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', docker_training=False, worker_id=worker_id, retro=True,
                            realtime_mode=False, config=config)
    env.seed(random_seed)

    # Run with specific wrappers #
    # This is the only wrapper we used, as the others didn't add enough value
    env = PyTorchFrame(env)
    # env = FrameStack(env, 3)
    # env = HumanActionEnv(env)

    # Create Agent to Train
    replay_buffer = ReplayBuffer(int(5e3))
    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=True,
        lr=args.lr,
        batch_size=hyper_params["batch-size"],
        gamma=hyper_params["discount-factor"],
    )

    # If we have pretrained weights, load them
    if(args.checkpoint):
        print(f"Loading a policy - { args.checkpoint } ")
        agent.policy_network.load_state_dict(torch.load(args.checkpoint))
Example #19
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.95,
                 batch_size=64,
                 epsilon=1):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)
        # 2. sample next batch and perform batch update:
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)
        for i in range(self.batch_size):
            # print("next state: ", batch_next_states[i])
            td_target = batch_rewards[i]
            if not batch_dones[i]:
                td_target = batch_rewards[i] + self.discount_factor * np.amax(
                    self.Q_target.predict(self.sess, [batch_next_states[i]]))
            target_f = self.Q_target.predict(self.sess, [batch_states[i]])

            target_f[0][batch_actions[i]] = td_target
            loss = self.Q.update(self.sess, [batch_states[i]],
                                 [batch_actions[i]], target_f[0])  #td_targets)
            self.Q_target.update(self.sess)
        #print("loss:", loss)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        #print("epsilon: ", self.epsilon)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # TODO: take greedy action (argmax)
            #state = np.reshape(state, (1,4))
            act_values = self.Q.predict(self.sess, [state])  #it was q target
            # we should be using act_values[0], i guess
            # print("act values: ", act_values)             # act values:  [[0.05641035 0.06138265]]
            # print("act values[0]: ", act_values[0])       # act values[0]:  [0.05641035 0.06138265]

            action_id = np.argmax(act_values[0])

            #print("predicted action. deterministic: {}. epsilon cond: {}. action_id: {}."
            #.format(deterministic, (r > self.epsilon), action_id))
        else:
            action_id = random.randrange(self.num_actions)
            #print("random action. deterministic: {}. epsilon cond.: {}. action_id: {}."
            #.format(deterministic, (r > self.epsilon), action_id))
            # TODO: sample random action
            # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
            # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight.
            # To see how the agent explores, turn the rendering in the training on and look what the agent is doing.
        # print("action_id: ", action_id)
        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
Example #20
class Agent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.95,
                 epsilon_min=0.05,
                 epsilon_decay=0.995,
                 exploration_type='e-annealing',
                 learning_type='dq',
                 replay_buffer_size=1e5):
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        self.exploration_type = exploration_type
        self.learning_type = learning_type

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size)

        # start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    # add transition to the replay buffer
    def add(self, state, action, next_state, reward, terminal):
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)

    # train network
    def train(self):
        # sample batch from the replay buffer
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)

        # compute td targets using q- or double q-learning
        if self.learning_type == 'q':  # q learning
            batch_rewards[np.logical_not(
                batch_dones)] += self.discount_factor * np.max(
                    self.Q_target.predict(self.sess, batch_next_states),
                    axis=1)[np.logical_not(batch_dones)]
        else:  # double q learning
            q_actions = np.argmax(self.Q.predict(self.sess, batch_next_states),
                                  axis=1)
            batch_rewards[np.logical_not(
                batch_dones)] += self.discount_factor * self.Q_target.predict(
                    self.sess,
                    batch_next_states)[np.arange(self.batch_size),
                                       q_actions][np.logical_not(batch_dones)]

        # update network and target network
        loss = self.Q.update(self.sess, batch_states, batch_actions,
                             batch_rewards)
        self.Q_target.update(self.sess)

        return loss

    # get action for state
    def act(self, state, deterministic):
        r = np.random.uniform()
        if deterministic or (self.exploration_type != 'boltzmann'
                             and r > self.epsilon):
            # take greedy action (argmax)
            a_pred = self.Q.predict(self.sess, [state])
            action_id = np.argmax(a_pred)
        else:
            if self.exploration_type == 'boltzmann':
                actions = self.Q.predict(self.sess, [state])[0]

                # softmax calculation, subtracting max for stability
                actions = np.exp((actions - max(actions)) / self.epsilon)
                actions /= np.sum(actions)

                # sample an action index according to the softmax probabilities
                action_id = np.random.choice(len(actions), p=actions)
            else:
                # sample random action
                action_id = np.random.randint(0, self.num_actions)
        return action_id

    # anneal epsilon
    def anneal(self, e=0):
        self.epsilon = max(self.epsilon_min,
                           self.epsilon * self.epsilon_decay)  # multiplicative (geometric) decay
        #self.epsilon = max(self.epsilon_min, self.epsilon * np.exp(-(1 - self.epsilon_decay) * e))

    # load trained network
    def load(self, folder):
        self.saver.restore(self.sess, tf.train.latest_checkpoint(folder))
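
A minimal standalone sketch of the Boltzmann (softmax) action selection used in act() above, with the temperature made explicit (in the agent, epsilon plays this role); the function name and the NumPy-only setting are illustrative, not part of the original agent.

import numpy as np

def boltzmann_action(q_values, temperature):
    # subtract the max before exponentiating for numerical stability
    logits = (np.asarray(q_values) - np.max(q_values)) / temperature
    probs = np.exp(logits)
    probs /= probs.sum()
    # sample an action index according to the softmax probabilities
    return np.random.choice(len(probs), p=probs)

# low temperature -> nearly greedy, high temperature -> nearly uniform, e.g.:
# boltzmann_action([1.0, 2.0, 0.5], temperature=0.1)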
Example #21
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.05):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         ########################################################################
         TD: the update target is R + discount_factor * Q(S', A')
         off-policy: old data collected under a different policy can be reused
         #######################################################################
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer(use_manual_data=False)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self,
              state,
              action,
              next_state,
              reward,
              terminal,
              collect_data_first=False):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)
        # if the replay buffer should be filled up first and does not yet contain
        # enough samples, skip the training step
        if collect_data_first and len(
                self.replay_buffer._data.states) < self.batch_size:
            print("No training yet. Filling up replay buffer..")

            # return 0 for loss and q_values
            return 0, [0, 0]

        # If the ReplayBuffer should not be filled up or is full enough, do the following
        else:
            # get a random batch from the ReplayBuffer
            batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
                self.replay_buffer.next_batch(self.batch_size)

            batch_targets = np.zeros((self.batch_size))

            for i in range(self.batch_size):
                # if a state is a final state, only use the direct reward
                if batch_dones[i]:
                    batch_targets[i] = batch_rewards[i]
                # otherwise compute the td_target
                else:
                    td_target = batch_rewards[i] + self.discount_factor * \
                        np.max(self.Q_target.predict(self.sess, [batch_next_states[i]]))
                    batch_targets[i] = td_target

            # update Q network
            loss = self.Q.update(self.sess, batch_states, batch_actions,
                                 batch_targets)
            # get predictions to check q-values -> e.g. are they diverging?
            q_preds = self.Q.predict(self.sess, batch_states)

            # update target network
            self.Q_target.update(self.sess)

        return loss, q_preds

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # take greedy action (argmax)
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
            # print("Deterministic action:", action_id)
        else:

            # sample a random action
            # Note: for CarRacing, uniform sampling does not explore well; the agent
            # should prefer to accelerate or go straight, so the action probabilities
            # are weighted accordingly (they must sum up to 1)
            if self.num_actions == 5:
                # CarRacing
                action_id = np.random.choice(range(5),
                                             p=[0.32, 0.09, 0.09, 0.4, 0.1])
            else:
                # CartPole
                action_id = np.random.randint(self.num_actions)
            # print("Explorative action:", action_id)

        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
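
The per-sample loop in train() above can be expressed in a single vectorized step; below is a minimal sketch, assuming q_next holds the Q_target predictions for the next states with shape (batch_size, num_actions) and batch_dones is a boolean array (names are illustrative, not part of the original code).

import numpy as np

def compute_td_targets(batch_rewards, batch_dones, q_next, discount_factor):
    rewards = np.asarray(batch_rewards, dtype=np.float32)
    not_done = 1.0 - np.asarray(batch_dones, dtype=np.float32)
    # terminal transitions keep only the immediate reward;
    # the others bootstrap from the best next-state value
    return rewards + discount_factor * not_done * np.max(q_next, axis=1)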
Example #22
class Agent_DQN:
    def __init__(self, args, env):
        self.args = args
        self.env = env
        self.input_channels = 3 if 'SpaceInvaders' in args.env_id else 4
        self.num_actions = self.env.action_space.n

        # if testing, simply load the model we have trained
        if args.test_dqn:
            self.load(args.model)
            self.online_net.eval()
            self.target_net.eval()
            return
        # DQN variants setting
        self.prioritized = args.prioritized
        self.double = args.double
        self.n_steps = args.n_steps
        self.noise_linear = args.noise_linear
        if self.prioritized:
            self.memory = PrioritizedReplayBuffer(10000, alpha=0.6)
            self.beta_schedule = LinearSchedule(args.num_timesteps,
                                                initial_p=0.4,
                                                final_p=1.0)

            self.criterion = MSELoss
        else:
            self.memory = ReplayBuffer(10000)
            self.criterion = nn.MSELoss()

        if args.atari:
            DQN = DQN_Atari
            input_feature = self.input_channels
        else:
            DQN = DQN_Simple
            input_feature = env.observation_space.shape[0]

        # build target, online network
        self.target_net = DQN(input_feature,
                              self.num_actions,
                              dueling=args.dueling,
                              noise_linear=args.noise_linear)
        self.target_net = self.target_net.cuda(
        ) if use_cuda else self.target_net
        self.online_net = DQN(input_feature,
                              self.num_actions,
                              dueling=args.dueling,
                              noise_linear=args.noise_linear)
        self.online_net = self.online_net.cuda(
        ) if use_cuda else self.online_net

        # discounted reward
        self.GAMMA = 0.99

        # exploration setting
        self.exploration = LinearSchedule(schedule_timesteps=int(
            0.1 * args.num_timesteps),
                                          initial_p=1.0,
                                          final_p=0.05)

        # training settings
        self.train_freq = 4
        self.learning_start = 10000
        self.batch_size = args.batch_size
        self.num_timesteps = args.num_timesteps
        self.display_freq = args.display_freq
        self.save_freq = args.save_freq
        self.target_update_freq = args.target_update_freq
        self.optimizer = optim.RMSprop(self.online_net.parameters(), lr=1e-4)
        # global status
        self.episodes_done = 0
        self.steps = 0

    def make_action(self, observation, test=True):
        return self.act(observation, test)

    def save(self, save_path):
        print('save model to', save_path)
        torch.save(self.online_net, save_path + '_online')
        torch.save(self.target_net, save_path + '_target')

    def load(self, load_path):
        if use_cuda:
            self.online_net = torch.load(load_path + '_online')
            self.target_net = torch.load(load_path + '_target')
        else:
            self.online_net = torch.load(
                load_path + '_online',
                map_location=lambda storage, loc: storage)
            self.target_net = torch.load(
                load_path + '_target',
                map_location=lambda storage, loc: storage)

    def act(self, state, test=False):
        sample = random.random()
        if test:
            eps_threshold = 0.01
            state = torch.from_numpy(state).permute(2, 0, 1).unsqueeze(0)
            state = state.cuda() if use_cuda else state
        else:
            eps_threshold = self.exploration.value(self.steps)

        if sample > eps_threshold:
            action = self.online_net(
                Variable(state,
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1)
        else:
            action = LongTensor([[random.randrange(self.num_actions)]])
        return action if not test else action[0, 0]

    def reset_noise(self):
        assert self.noise_linear
        self.online_net.reset_noise()
        self.target_net.reset_noise()

    def update(self):
        if self.prioritized:
            batch, weight, batch_idxes = self.memory.sample(
                self.batch_size, beta=self.beta_schedule.value(self.steps))
            weight_batch = Variable(Tensor(weight)).squeeze()
        else:
            batch = self.memory.sample(self.batch_size)
        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))

        # We don't want to backprop through the expected action values and volatile
        # will save us on temporarily changing the model parameters'
        # requires_grad to False!
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True)
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.online_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = Variable(torch.zeros(self.batch_size).type(Tensor))
        q_next = self.target_net(non_final_next_states)
        if self.double:
            _, best_actions = self.online_net(non_final_next_states).max(1)
            next_state_values[non_final_mask] = q_next.gather(
                1, best_actions.unsqueeze(1)).squeeze(1)
        else:
            next_state_values[non_final_mask] = q_next.max(1)[0]

        # Now, we don't want to mess up the loss with a volatile flag, so let's
        # clear it. After this, we'll just end up with a Variable that has
        # requires_grad=False
        next_state_values.volatile = False
        # Compute the expected Q values
        expected_state_action_values = (
            next_state_values * (self.GAMMA**(self.n_steps))) + reward_batch

        # Compute loss
        if self.prioritized:
            loss = self.criterion(state_action_values,
                                  expected_state_action_values)
            loss = torch.mul(loss, weight_batch)
            new_priorities = np.abs(loss.cpu().data.numpy()) + 1e-6
            self.memory.update_priorities(batch_idxes, new_priorities)
            loss = loss.mean()
        else:
            loss = self.criterion(state_action_values,
                                  expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.data[0]

    def process_state(self, state):
        state = np.array(state)
        if self.args.atari:
            # map shape: (84,84,4) --> (1,4,84,84)
            state = torch.from_numpy(state).permute(2, 0, 1).unsqueeze(0)
        else:
            state = torch.Tensor(state).unsqueeze(0)
        return state.cuda() if use_cuda else state

    def train(self):
        total_reward = 0
        loss = 0
        # set training mode
        self.online_net.train()
        while (True):
            if self.noise_linear:
                self.reset_noise()

            state = self.process_state(self.env.reset())
            done = False
            episode_duration = 0
            while (not done):
                # select and perform action
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                total_reward += reward
                reward = Tensor([reward])

                # process new state
                next_state = self.process_state(next_state)
                if done:
                    next_state = None

                # store the transition in memory
                self.memory.push(state, action, next_state, reward)

                # move to the next state
                state = next_state

                # Perform one step of the optimization (on the target network)
                if self.steps > self.learning_start and self.steps % self.train_freq == 0:
                    loss = self.update()
                    if self.noise_linear:
                        self.reset_noise()

                # update target network
                if self.steps > self.learning_start and self.steps % self.target_update_freq == 0:
                    self.target_net.load_state_dict(
                        self.online_net.state_dict())

                if self.steps % self.save_freq == 0:
                    self.save('dqn.cpt')

                self.steps += 1
                episode_duration += 1

            if self.episodes_done % self.display_freq == 0:
                print(
                    'Episode: %d | Steps: %d/%d | Exploration: %f | Avg reward: %f | loss: %f | Episode Duration: %d'
                    % (self.episodes_done, self.steps, self.num_timesteps,
                       self.exploration.value(self.steps), total_reward /
                       self.display_freq, loss, episode_duration))
                writer.add_scalar('reward', total_reward / self.display_freq,
                                  self.steps)
                total_reward = 0

            self.episodes_done += 1
            if self.steps > self.num_timesteps:
                break
        self.save('dqn_final.model')

    def nsteps_train(self):
        '''
        Training procedure for multi-steps learning
        '''
        total_reward = 0
        loss = 0
        # set training mode
        self.online_net.train()
        while (True):
            if self.noise_linear:
                self.reset_noise()
            state_buffer = deque()  # store states for future use
            action_buffer = deque()  # store actions for future use
            reward_buffer = deque()  # store rewards for future use
            nstep_reward = 0  # calculate n-step discounted reward

            state = self.process_state(self.env.reset())
            state_buffer.append(state)

            done = False
            episode_duration = 0

            # run n-1 steps
            for _ in range(1, self.n_steps):
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                next_state = self.process_state(next_state)
                if done:
                    next_state = None
                state_buffer.append(next_state)
                action_buffer.append(action)
                # accumulate the n-step return: the i-th reward in the
                # window is weighted by GAMMA**i
                nstep_reward += (self.GAMMA**len(reward_buffer)) * reward
                reward_buffer.append(reward)

                state = next_state
                episode_duration += 1

            while (not done):
                # select and perform action
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                total_reward += reward

                # process new state
                next_state = self.process_state(next_state)
                if done:
                    next_state = None

                # save new state, action, reward
                state_buffer.append(next_state)
                action_buffer.append(action)
                reward_buffer.append(reward)
                # the newest reward closes the window and is weighted by GAMMA**(n_steps - 1)
                nstep_reward += (self.GAMMA**(len(reward_buffer) - 1)) * reward

                # store the transition in memory
                self.memory.push(state_buffer.popleft(),
                                 action_buffer.popleft(), next_state,
                                 Tensor([nstep_reward]))

                # drop the oldest reward and rescale so that the remaining
                # window again starts at discount GAMMA**0
                nstep_reward = (nstep_reward -
                                reward_buffer.popleft()) / self.GAMMA

                # move to the next state
                state = next_state

                # Perform one step of the optimization (on the target network)
                if self.steps > self.learning_start and self.steps % self.train_freq == 0:
                    loss = self.update()
                    if self.noise_linear:
                        self.reset_noise()

                # update target network
                if self.steps > self.learning_start and self.steps % self.target_update_freq == 0:
                    self.target_net.load_state_dict(
                        self.online_net.state_dict())

                if self.steps % self.save_freq == 0:
                    self.save('dqn.cpt')

                self.steps += 1
                episode_duration += 1

            if self.episodes_done % self.display_freq == 0:
                print(
                    'Episode: %d | Steps: %d/%d | Exploration: %f | Avg reward: %f | loss: %f | Episode Duration: %d'
                    % (self.episodes_done, self.steps, self.num_timesteps,
                       self.exploration.value(self.steps), total_reward /
                       self.display_freq, loss, episode_duration))
                writer.add_scalar('reward', total_reward / self.display_freq,
                                  self.steps)
                total_reward = 0

            self.episodes_done += 1
            if self.steps > self.num_timesteps:
                break
        self.save('dqn_final.model')
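
For reference, a minimal (unoptimized) sketch of the n-step return that nsteps_train() maintains incrementally above, G_t = sum_{i=0}^{n-1} gamma**i * r_{t+i}; bootstrapping and terminal handling are left out, and the function name is illustrative.

def nstep_returns(rewards, gamma, n):
    # brute-force n-step returns over a reward sequence
    returns = []
    for t in range(len(rewards) - n + 1):
        g = 0.0
        for i in range(n):
            g += (gamma ** i) * rewards[t + i]
        returns.append(g)
    return returns

# e.g. nstep_returns([1, 0, 0, 1], gamma=0.9, n=3) -> [1.0, 0.81]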
Example #23
class DynaQAgent(mp.Process):
    def __init__(self,
                 id,
                 env,
                 state_size,
                 action_size,
                 n_episodes,
                 lr,
                 gamma,
                 global_network,
                 target_network,
                 q,
                 max_t=1000,
                 eps_start=1.0,
                 eps_end=0.01,
                 eps_decay=0.995):
        super(DynaQAgent, self).__init__()
        self.id = id
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.n_episodes = n_episodes
        self.gamma = gamma

        self.q = q

        self.local_memory = ReplayBuffer(self.action_size, BUFFER_SIZE,
                                         BATCH_SIZE)

        self.t_step = 0
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

        self.global_network = global_network
        self.target_network = target_network

        self.optimizer = optim.SGD(self.global_network.parameters(),
                                   lr=lr,
                                   momentum=.5)

        self.scores_window = deque(maxlen=100)  # last 100 scores

    def act(self, state, eps=0.):
        if random.random() > eps:
            # Turn the state into a tensor
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            with torch.no_grad():
                action_values = self.global_network(
                    state)  # Make choice based on local network

            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.local_memory.add(state, action, reward, next_state, done)

        # Increment local timer
        self.t_step += 1

        if self.t_step > BATCH_SIZE:
            experiences = self.local_memory.sample(BATCH_SIZE)
            self.learn(experiences)

            # TODO: Better way to do this??
            if self.q[0].empty() and np.mean(self.scores_window) < 180:
                experiences = self.local_memory.sample(BATCH_SIZE)
                for i in range(5):
                    self.q[i].put(experiences[i].detach().share_memory_())

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.target_network(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.global_network(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.global_network, self.target_network, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def get_experience_as_tensor(self, e):
        states = torch.from_numpy(np.vstack([e.state])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state
                                                  ])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done]).astype(
            np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def get_action_values(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        with torch.no_grad():
            action_values = self.target_network(state)

        return action_values.cpu().data.numpy()[0]

    def get_delta(self, state, action, next_state, reward):
        priority = reward + self.gamma * np.max(
            self.get_action_values(next_state)) - self.get_action_values(
                state)[action]
        return priority

    def run(self):
        scores = []

        eps = self.eps_start  # initialize epsilon
        start_time = time.time()
        for i_episode in range(1, self.n_episodes + 1):
            state = self.env.reset()
            score = 0

            for t in range(self.max_t):
                action = self.act(state, eps)
                # if do_render:
                #     self.env.render()
                next_state, reward, done, _ = self.env.step(action)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            self.scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(self.eps_end, self.eps_decay * eps)  # decrease epsilon
            elapsed_time = time.time() - start_time
            if self.id == 0:
                print(
                    '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                    .format(self.id, i_episode, np.mean(self.scores_window)) +
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if i_episode % 100 == 0:
                print(
                    '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                    .format(self.id, i_episode, np.mean(self.scores_window)) +
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if np.mean(self.scores_window) >= 200.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(self.scores_window)))
                break
Example #24
    def __init__(self, args, env):
        self.args = args
        self.env = env
        self.input_channels = 3 if 'SpaceInvaders' in args.env_id else 4
        self.num_actions = self.env.action_space.n

        # if testing, simply load the model we have trained
        if args.test_dqn:
            self.load(args.model)
            self.online_net.eval()
            self.target_net.eval()
            return
        # DQN variants setting
        self.prioritized = args.prioritized
        self.double = args.double
        self.n_steps = args.n_steps
        self.noise_linear = args.noise_linear
        if self.prioritized:
            self.memory = PrioritizedReplayBuffer(10000, alpha=0.6)
            self.beta_schedule = LinearSchedule(args.num_timesteps,
                                                initial_p=0.4,
                                                final_p=1.0)

            self.criterion = MSELoss
        else:
            self.memory = ReplayBuffer(10000)
            self.criterion = nn.MSELoss()

        if args.atari:
            DQN = DQN_Atari
            input_feature = self.input_channels
        else:
            DQN = DQN_Simple
            input_feature = env.observation_space.shape[0]

        # build target, online network
        self.target_net = DQN(input_feature,
                              self.num_actions,
                              dueling=args.dueling,
                              noise_linear=args.noise_linear)
        self.target_net = self.target_net.cuda(
        ) if use_cuda else self.target_net
        self.online_net = DQN(input_feature,
                              self.num_actions,
                              dueling=args.dueling,
                              noise_linear=args.noise_linear)
        self.online_net = self.online_net.cuda(
        ) if use_cuda else self.online_net

        # discounted reward
        self.GAMMA = 0.99

        # exploration setting
        self.exploration = LinearSchedule(schedule_timesteps=int(
            0.1 * args.num_timesteps),
                                          initial_p=1.0,
                                          final_p=0.05)

        # training settings
        self.train_freq = 4
        self.learning_start = 10000
        self.batch_size = args.batch_size
        self.num_timesteps = args.num_timesteps
        self.display_freq = args.display_freq
        self.save_freq = args.save_freq
        self.target_update_freq = args.target_update_freq
        self.optimizer = optim.RMSprop(self.online_net.parameters(), lr=1e-4)
        # global status
        self.episodes_done = 0
        self.steps = 0
Example #25
class DQNAgent:
    def __init__(self,
                 name,
                 Q_current,
                 Q_target,
                 num_actions,
                 discount_factor,
                 batch_size,
                 epsilon,
                 epsilon_decay,
                 boltzmann,
                 double_q,
                 buffer_capacity,
                 random_probs=None):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        # save hyperparameters in folder

        self.name = name  # probably useless
        self.Q_current = Q_current
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.boltzmann = boltzmann

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.buffer_capacity = buffer_capacity

        self.double_q = double_q

        self.random_probs = random_probs

        # define replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)

        # 2. sample next batch
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)

        # find optimal actions for the sampled s' states
        if self.double_q:
            # double Q learning (select actions using current network, rather than target network)
            # ...in order to decorrelate noise between selection and evaluation
            # (Q(state,action) is still evaluated using target network in any case)
            action_selector = self.Q_current
        else:
            action_selector = self.Q_target

        # the Q network returns a vector of predicted values, one per possible action
        a_prime = np.argmax(action_selector.predict(self.sess,
                                                    batch_next_states),
                            axis=1)

        # pick the a_prime-th value from each row of the target network's prediction
        # (predictions for terminal states are masked out below)
        q_values_next = self.Q_target.predict(
            self.sess, batch_next_states)[np.arange(self.batch_size), a_prime]

        # 2.1 compute td targets:
        # if done, there will be no next state
        td_targets = batch_rewards + np.where(
            batch_dones, 0, self.discount_factor * q_values_next)

        # 2.2 update the Q (current) network
        self.Q_current.update(self.sess, batch_states, batch_actions,
                              td_targets)

        # 2.3 soft update of the target network
        # (the soft/assign update op is implemented inside Q_target.update)
        self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """

        # get Q-value estimates for the current state from the current network
        Q_values = np.squeeze(
            self.Q_current.predict(self.sess, np.expand_dims(state, axis=0)))

        argmax_a = np.argmax(Q_values)

        if deterministic:
            # take greedy action
            return argmax_a

        if self.boltzmann:
            # epsilon interacts with Boltzmann exploration by controlling the
            # temperature of the softmax, so that, as before,
            # higher epsilon -> more exploration
            action_probs = softmax(Q_values,
                                   temperature=1 / (1 - self.epsilon)**2)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)

        else:
            action_probs = np.zeros_like(Q_values)

            if np.random.uniform() > self.epsilon:
                # choose the best action
                action = argmax_a
            else:
                # explore
                if self.random_probs is None:
                    action = np.random.randint(self.num_actions, size=1)[0]

                else:
                    action = np.random.choice(np.arange(self.num_actions),
                                              p=self.random_probs)

        # we decay epsilon AFTER we've checked it
        # (nb: if deterministic, epsilon will never decay, but of course this doesn't matter)
        if self.epsilon_decay > 0:
            self.epsilon *= (1 - self.epsilon_decay)

        return action

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
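
The act() method above relies on a softmax helper with a temperature argument that is not shown in this snippet; below is a minimal NumPy sketch of such a helper (an assumption about its signature, not the original implementation), together with a small illustration of how the temperature 1 / (1 - epsilon)**2 shrinks as epsilon decays.

import numpy as np

def softmax(q_values, temperature=1.0):
    # shift by the max for numerical stability, then scale by the temperature
    z = (np.asarray(q_values) - np.max(q_values)) / temperature
    exp_z = np.exp(z)
    return exp_z / exp_z.sum()

# as epsilon decays multiplicatively, the temperature falls toward 1,
# i.e. exploration decreases but never becomes fully greedy
for epsilon in (0.9, 0.5, 0.1, 0.01):
    print(epsilon, 1.0 / (1.0 - epsilon) ** 2)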
Example #26
    random.seed(hyper_params['seed'])

    assert "NoFrameskip" in hyper_params[
        'env'], "Require environment with no frameskip"
    env = create_env(0, 1)
    env.seed(hyper_params['seed'])
    #env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    #env = EpisodicLifeEnv(env)
    #env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 3)

    replay_buffer = ReplayBuffer(hyper_params['replay_buffer_size'])

    agent = DQNAgent(env.observation_space,
                     env.action_space,
                     replay_buffer,
                     use_double_dqn=hyper_params['use_double_dqn'],
                     lr=hyper_params['learning_rate'],
                     batch_size=hyper_params['batch_size'],
                     gamma=hyper_params['discount_factor'])

    eps_timesteps = hyper_params['eps_fraction'] * float(
        hyper_params['num_steps'])
    episode_rewards = [0.0]
    loss = [0.0]
    policy_actions = unpickle_object('action_map')
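
eps_timesteps above defines the window over which epsilon is annealed; below is a minimal sketch of the corresponding linear schedule, assuming epsilon goes from eps_start to eps_end over the first eps_fraction of training (function name and defaults are illustrative).

def linear_epsilon(step, eps_timesteps, eps_start=1.0, eps_end=0.01):
    # fraction of the annealing window that has elapsed, clipped to [0, 1]
    fraction = min(1.0, step / float(eps_timesteps))
    return eps_start + fraction * (eps_end - eps_start)

# e.g. with eps_timesteps=1000: step 0 -> 1.0, step 500 -> 0.505, step >= 1000 -> 0.01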
Example #27
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.05,
                 act_probabilities=None,
                 double_q=False,
                 buffer_capacity=100000,
                 prefill_bs_percentage=5):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity,
                                          min_fill=prefill_bs_percentage *
                                          batch_size)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

        # <JAB>
        if act_probabilities is None:
            self.act_probabilities = np.ones(num_actions) / num_actions
        else:
            self.act_probabilities = act_probabilities

        self.double_dqn = double_q

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #       2.1 compute td targets:
        #              td_target =  reward + discount * argmax_a Q_target(next_state_batch, a)
        #       2.2 update the Q network
        #              self.Q.update(...)
        #       2.3 call soft update for target network
        #              self.Q_target.update(...)

        # <JAB>
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)

        # Let the buffer fill up first, otherwise we would train on a lot of
        # low-quality early transitions
        if self.replay_buffer.has_min_items():
            buffer = self.replay_buffer.next_batch(self.batch_size)
            batch_states = buffer[0]
            batch_actions = buffer[1]
            batch_next_states = buffer[2]
            batch_rewards = buffer[3]
            batch_dones = buffer[4]

            non_terminal_states = np.logical_not(batch_dones)

            if self.double_dqn:
                a_predictions = self.Q.predict(self.sess, batch_next_states)
                a_predictions = np.argmax(a_predictions, axis=1)
                action_indexes = (np.arange(len(a_predictions)), a_predictions)
                q_predictions = self.Q_target.predict(self.sess,
                                                      batch_next_states)
                q_predictions = q_predictions[action_indexes]

            else:
                q_predictions = self.Q_target.predict(self.sess,
                                                      batch_next_states)
                q_predictions = np.max(q_predictions, axis=1)

            td_target = batch_rewards
            # If episode is not finished, add predicted Q values to the current rewards
            td_target[
                non_terminal_states] += self.discount_factor * q_predictions[
                    non_terminal_states]

            # Update Step
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # <JAB>
            action_id = np.argmax(self.Q.predict(self.sess, state))
            # </JAB>

        else:

            # Hint for the exploration in CarRacing: sampling the action from a uniform
            # distribution will probably not work. You can sample the agent's actions with
            # different probabilities (they need to sum up to 1) so that the agent prefers
            # to accelerate or go straight. To see how the agent explores, turn on
            # rendering during training and watch what the agent is doing.
            # <JAB>
            action_id = np.random.choice(np.arange(self.num_actions),
                                         p=self.act_probabilities)
            # </JAB>

        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
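
The ReplayBuffer used above (with capacity, min_fill, has_min_items and next_batch) is not shown in this snippet; below is a minimal sketch of such a buffer, for illustration only.

import random
from collections import deque

class SimpleReplayBuffer:
    def __init__(self, capacity, min_fill):
        self.buffer = deque(maxlen=capacity)
        self.min_fill = min_fill

    def add_transition(self, state, action, next_state, reward, done):
        self.buffer.append((state, action, next_state, reward, done))

    def has_min_items(self):
        # only start training once enough transitions have been collected
        return len(self.buffer) >= self.min_fill

    def next_batch(self, batch_size):
        batch = random.sample(list(self.buffer), batch_size)
        # transpose the list of transitions into
        # (states, actions, next_states, rewards, dones)
        return tuple(map(list, zip(*batch)))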