Example #1
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  methods="q-learning")
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in tqdm(range(num_episodes)):
        state = env.reset()
        action_prob = policy(state)
        action = np.random.choice(np.arange(env.action_space.n),
                                       p=action_prob)

        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action_prob = policy(next_state)
            next_action = np.random.choice(np.arange(env.action_space.n),
                                            p=next_action_prob)
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            td_target = reward + discount_factor*np.max(Q[next_state])
            td_error = td_target - Q[state][action]
            Q[state][action] += alpha * td_error

            if done:
                break
            state = next_state
            action = next_action
    return Q, stats
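Several of these examples call make_epsilon_greedy_policy without showing it. A minimal sketch of the tabular helper they appear to assume (the exact signature and internals are an assumption, modeled on the usual (Q, epsilon, nA) convention):

import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    # Returns a function state -> action-probability vector that is
    # epsilon-greedy with respect to the tabular Q passed in (assumed helper).
    def policy_fn(state):
        probs = np.ones(nA, dtype=float) * epsilon / nA  # uniform exploration mass
        probs[np.argmax(Q[state])] += 1.0 - epsilon      # extra mass on the greedy action
        return probs
    return policy_fn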
def sarsa_control_epsilon_greedy(env, n_episodes, epsilon, discount_factor, alpha):

    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    final_policy = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(n_episodes),
                                  episode_rewards=np.zeros(n_episodes))

    for i in range(n_episodes):
        state = env.reset()
        done = False
        prob = epsilon_greedy_policy(state, Q, epsilon, env.action_space.n)
        action = np.random.choice(np.arange(len(prob)), p=prob)
        while not done:
            next_state, reward, done, _ = env.step(action)
            next_prob = epsilon_greedy_policy(next_state, Q, epsilon, env.action_space.n)
            next_action = np.random.choice(np.arange(len(next_prob)), p=next_prob)

            # SARSA update: Q(s, a) += alpha * (r + gamma * Q(s', a') - Q(s, a))
            Q[state][action] += alpha * (reward + discount_factor * Q[next_state][next_action]
                                         - Q[state][action])

            stats.episode_rewards[i] += reward
            stats.episode_lengths[i] += 1

            state = next_state
            action = next_action

    # Extract the final greedy (epsilon = 0) policy from the learned Q-values.
    for _state in Q:
        final_policy[_state] = epsilon_greedy_policy(_state, Q, 0.0, env.action_space.n)

    return Q, final_policy, stats
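The epsilon_greedy_policy helper used above (and, with an estimator in place of Q, by the function-approximation examples further down) is likewise not shown. A tabular sketch under the same assumption; the estimator-based variants presumably swap Q[state] for estimator.predict(state):

import numpy as np

def epsilon_greedy_policy(state, Q, epsilon, nA):
    # Epsilon-greedy action probabilities for a single state (assumed helper).
    probs = np.ones(nA, dtype=float) * epsilon / nA
    probs[np.argmax(Q[state])] += 1.0 - epsilon
    return probs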
Example #3
    def __init__(self, sess, env, saver, q_estimator, target_estimator,
                 state_processor, config):
        self.sess = sess
        self.env = env
        self.saver = saver
        self.q_estimator = q_estimator
        self.target_estimator = target_estimator
        self.params = config
        self.replay_memory = []
        self.state_processor = state_processor
        self.stats = plotting.EpisodeStats(
            episode_lengths=np.zeros(config.num_episodes),
            episode_rewards=np.zeros(config.num_episodes))

        latest_checkpoint = tf.train.latest_checkpoint(config.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(sess, latest_checkpoint)

        self.total_t = sess.run(tf.contrib.framework.get_global_step())

        self.epsilons = np.linspace(config.epsilon_start, config.epsilon_end,
                                    config.epsilon_decay_steps)
        self.policy = make_epsilon_greedy_policy(q_estimator,
                                                 len(VALID_ACTIONS))
    def gibbs_sampling(self, num_episodes):
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes))
        for i_episode in range(num_episodes):
            self.temperature = 1
            self.reset()
            #self.learning_rate = self.learning_rate * 0.9
            self.temperature = self.temperature - 0.9 / 50
            for i in itertools.count():
                self._update_hiddens()
                self.update_action(update=True)
                free_energy_1 = self.get_free_energy()
                reward_ = self._update_states()
                #print 'New State', self.old_state
                stats.episode_rewards[i_episode] += reward_
                stats.episode_lengths[i_episode] = i

                if self.old_state == 15 or self.old_state == 0:
                    print('Ith episode, episode len', i_episode, i)
                    break
                free_energy_2 = self.update_action(update=True)
                diff = reward_ + self.discount_factor * free_energy_2 - free_energy_1
                self._update_action_weights(diff)
                self._update_state_weights(diff)
        return stats
    def run_sarsa(self,
                  max_number_of_episodes=100,
                  interactive=False,
                  display_frequency=1):

        # repeat for each episode
        for episode_number in range(max_number_of_episodes):

            # initialize state
            state = self.env.reset()

            done = False  # used to indicate terminal state
            R = 0  # used to display accumulated rewards for an episode
            t = 0  # used to display accumulated steps for an episode i.e episode length

            # choose action from state using policy derived from Q
            action = self.agent.act(state)

            # repeat for each step of episode, until state is terminal
            while not done:

                t += 1  # increase step counter - for display

                # take action, observe reward and next state
                next_state, reward, done, _ = self.env.step(action)

                # choose next action from next state using policy derived from Q
                next_action = self.agent.act(next_state)

                # agent learn (SARSA update)
                self.agent.learn(state, action, reward, next_state,
                                 next_action)

                # state <- next state, action <- next_action
                state = next_state
                action = next_action

                R += reward  # accumulate reward - for display

                # if interactive display, show update for each step
                if interactive:
                    self.update_display_step()

            self.episode_length = np.append(
                self.episode_length, t)  # keep episode length - for display
            self.episode_reward = np.append(
                self.episode_reward, R)  # keep episode reward - for display

            # if interactive display, show update for the episode
            if interactive:
                self.update_display_episode()

        # if not interactive display, show graph at the end
        if not interactive:
            self.fig.clf()
            stats = plotting.EpisodeStats(
                episode_lengths=self.episode_length,
                episode_rewards=self.episode_reward,
                episode_running_variance=np.zeros(max_number_of_episodes))
            plotting.plot_episode_stats(stats, display_frequency)
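run_sarsa delegates action selection and learning to self.agent, which is not shown here. A minimal tabular agent with the act/learn interface it expects might look like this (the class name and constructor arguments are assumptions):

import numpy as np
from collections import defaultdict

class SarsaAgent:
    def __init__(self, n_actions, alpha=0.5, gamma=1.0, epsilon=0.1):
        self.Q = defaultdict(lambda: np.zeros(n_actions))
        self.n_actions = n_actions
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon

    def act(self, state):
        # Epsilon-greedy action selection from the current Q estimate.
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.Q[state]))

    def learn(self, state, action, reward, next_state, next_action):
        # On-policy SARSA update.
        td_target = reward + self.gamma * self.Q[next_state][next_action]
        self.Q[state][action] += self.alpha * (td_target - self.Q[state][action])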
Example #6
def qLearning(env, num_episodes, discount_factor=1.0, alpha=0.6, epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control.
    Finds the optimal greedy policy while improving
    following an epsilon-greedy policy"""

    # Action value function
    # A nested dictionary that maps
    # state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.no_actions))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create an epsilon greedy policy function
    # appropriately for environment action space
    policy = createEpsilonGreedyPolicy(Q, epsilon, env.no_actions)

    # For every episode
    for ith_episode in range(num_episodes):

        # Reset the environment and pick the first action
        state = env.reset()

        for i in range(env.number_of_vehicles):

            # get probabilities of all actions from current state
            action_probabilities = policy(state)

            # choose action according to
            # the probability distribution
            action = np.random.choice(np.arange(len(action_probabilities)),
                                      p=action_probabilities)

            # take action and get reward, transit to next state
            reward, next_state, done = env.step(action)
            # print(state)
            # print(action)
            # print(next_state)
            # print("___________________________________________")
            # Update statistics
            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = i

            # TD Update
            best_next_action = np.argmax(Q[str(next_state)])
            td_target = reward + discount_factor * Q[str(
                next_state)][best_next_action]
            td_delta = td_target - Q[str(state)][action]
            Q[str(state)][action] += alpha * td_delta

            # done is True if episode terminated
            if done:
                break

            state = next_state

    return Q, stats
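For reference, the TD update in this and the other Q-learning examples is the standard rule Q(s, a) <- Q(s, a) + alpha * [r + gamma * max_a' Q(s', a') - Q(s, a)]; the SARSA examples differ only in replacing the max with the Q-value of the action actually sampled next.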
Example #7
    def g_learning(self,
                   num_episodes,
                   max_ep_steps=10000,
                   discount=1.0,
                   epsilon=0.1):
        """ The G-learning algorithm.
    
        Args:
            num_episodes: Number of episodes to run.
            max_ep_steps: Maximum time steps allocated to one episode.
            discount: Standard discount factor, usually denoted as \gamma.
            epsilon: Probability of taking random actions during exploration.
    
        Returns:
            A tuple (G, stats) of the G-values and statistics, which should be
            plotted and thoroughly analyzed.
        """
        cum_t = 0
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes),
                                      kl_divergence=np.zeros(num_episodes))

        for i_episode in tqdm(range(num_episodes)):
            state = env.reset()

            # Run this episode until we finish as indicated by the environment.
            for t in range(1, max_ep_steps + 1):

                # Uses exploration policy to take a step.
                action = self.policy_exploration(state, epsilon)
                next_state, reward, done, _ = env.step(action)
                # cost = -reward
                # print(reward)

                # Collect statistics (cum_t currently not used).
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                stats.kl_divergence[i_episode] = self.kl_divergence()
                self.N[state, action] += 1
                cum_t += 1

                # Intermediate terms for the G-learning update.
                alpha = self.alpha_schedule(t, state, action)
                beta = self.beta_schedule(t)
                temp = np.sum(self.rho[next_state, :] *
                              np.exp(-beta * self.G[next_state, :]))

                # Official G-learning update at last. Equation 18 in the paper.
                td_target = -reward - (discount / beta) * np.log(temp)
                td_delta = td_target - self.G[state, action]
                self.G[state, action] += (alpha * td_delta)

                if done:
                    break
                state = next_state

        print("")
        return self.G, stats
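Reading off the code above, the "Equation 18" update it refers to is the G-learning soft update with cost c = -r:

    G(s, a) <- G(s, a) + alpha_t * [ c(s, a) - (gamma / beta_t) * ln( sum_a' rho(a' | s') * exp(-beta_t * G(s', a')) ) - G(s, a) ]

where rho is the prior policy stored in self.rho and beta_t is the inverse-temperature schedule returned by self.beta_schedule.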
Example #8
    def simulate(self, pi, n_trial, n_episode, return_stats=False):
        """TODO: Docstring for simulate

        Parameters
        ----------
        pi : behavior policy

        Returns
        -------
        D: a collection of transition samples

        """

        stats = plotting.EpisodeStats(episode_lengths=np.zeros(n_episode),
                                      episode_rewards=np.zeros(n_episode))

        D = []

        env = self._env
        for trial_i in range(n_trial):
            #D_t = D[trial_i]

            for epi_i in range(n_episode):

                last_reward = stats.episode_rewards[epi_i - 1]
                sys.stdout.flush()
                #D_e = D_t[epi_i]
                traj = []
                s = env.reset()

                for t in count():
                    a = pi.choose_action(s)
                    s_next, r, done, _ = env.step(a)

                    stats.episode_rewards[epi_i] += r
                    stats.episode_lengths[epi_i] = t

                    logging.debug("s {} a {} s_next {} r {} done {}".format(
                        s, a, r, s_next, done))
                    transition = T(s=s, a=a, r=r, s_next=s_next, done=done)
                    traj.append(transition)

                    s = s_next

                    if done:
                        logging.debug("done after {} steps".format(t))
                        break

                    print("\rStep {} @ Episode {}/{} ({})".format(
                        t, epi_i + 1, n_episode, last_reward),
                          end="")

                D.append(traj)
        if return_stats:
            return D, stats
        else:
            return D
Example #9
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.6, epsilon=0.1):
    """ 
    Q-Learning algorithm: Off-policy TD control. 
    Finds the optimal greedy policy while improving 
    following an epsilon-greedy policy
    """
    # Action value function
    # A nested dictionary that maps
    # state -> (action -> action-value).
    q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create an epsilon greedy policy function
    # appropriate for environment action space
    policy = create_epsilon_greedy_policy(q, epsilon, env.action_space.n)

    # For every episode
    for ith_episode in range(num_episodes):

        # Reset the environment and pick the first action
        state = env.reset()

        for t in itertools.count():

            # get probabilities of all actions from current state
            actions_probabilities = policy(state)

            # choose action according to
            # the probability distribution
            action = np.random.choice(np.arange(len(actions_probabilities)),
                                      p=actions_probabilities)

            # take action and get reward, transit to next state
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD Update
            best_next_action = np.argmax(q[next_state])
            td_target = reward + discount_factor * q[next_state][
                best_next_action]
            td_delta = td_target - q[state][action]
            q[state][action] += alpha * td_delta

            # done is TRUE if episode terminated
            if done:
                break

            state = next_state

    return q, stats
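A hypothetical driver for the tabular examples above, assuming the classic gym API these snippets rely on (env.reset() returning the state and env.step() returning a 4-tuple) and that the shared plotting helper is importable:

import gym

env = gym.make("CliffWalking-v0")   # any small discrete-state environment works
q, stats = q_learning(env, num_episodes=500)
plotting.plot_episode_stats(stats)  # same helper used by the SARSA runner above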
Example #10
def Q_learning(env, num_episodes, discount_factor=0.4, alpha=0.9, epsilon=0.5):
    """ 
    Q-Learning algorithm: Off-policy TD control. 
    Finds the optimal greedy policy while improving 
    following an epsilon-greedy policy"""

    # Action value function
    # A nested dictionary that maps
    # state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create an epsilon greedy policy function
    # appropriately for environment action space
    policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n)

    # For every episode
    for ith_episode in range(num_episodes):
        # Reset the environment and pick the first action
        state = env.reset()

        for t in itertools.count():
            # get probabilities of all actions from current state
            action_probabilities = policy(state)

            # choose action according to
            # the probability distribution
            action = np.random.choice(np.arange(len(action_probabilities)),
                                      p=action_probabilities)

            # take action and get reward, transit to next state
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD Update
            best_next_action = np.argmax(Q[next_state])
            TD = reward + discount_factor * Q[next_state][
                best_next_action] - Q[state][action]
            Q[state][action] += alpha * TD
            # env.render()

            # done is True if episode terminated
            if done:
                # env.render()
                break

            state = next_state

    return Q, stats
Example #11
    def q_learning(self,
                   num_episodes,
                   max_ep_steps=10000,
                   discount=1.0,
                   epsilon=0.1):
        """ The Q-learning algorithm.
    
        Args:
            num_episodes: Number of episodes to run.
            max_ep_steps: Maximum time steps allocated to one episode.
            discount: Standard discount factor, usually denoted as \gamma.
            epsilon: Probability of taking random actions during exploration.
    
        Returns:
            A tuple (Q, stats) of the Q-values and statistics, which should be
            plotted and thoroughly analyzed.
        """
        cum_t = 0
        stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                      episode_rewards=np.zeros(num_episodes),
                                      kl_divergence=np.zeros(num_episodes))

        for i_episode in tqdm(range(num_episodes)):
            state = env.reset()

            # Run this episode until we finish as indicated by the environment.
            for t in range(1, max_ep_steps + 1):

                # Uses exploration policy to take a step.
                action = self.policy_exploration(state, epsilon)
                next_state, reward, done, _ = env.step(action)

                # Collect statistics (cum_t currently not used).
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                stats.kl_divergence[i_episode] = self.kl_divergence()
                self.N[state, action] += 1
                cum_t += 1

                # The official Q-learning update.
                alpha = self.alpha_schedule(t, state, action)
                best_next_action = np.argmax(self.Q[next_state, :])
                td_target = reward + discount * self.Q[next_state,
                                                       best_next_action]
                td_delta = td_target - self.Q[state, action]
                self.Q[state, action] += (alpha * td_delta)

                if done:
                    break
                state = next_state

        print("")
        return self.Q, stats
Example #12
def n_step_sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1, n=5):
    """
        args:
            n: definite the n-step
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  methods="n_step_sarsa")
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in tqdm(range(num_episodes)):
        state = env.reset()
        action_prob = policy(state)
        action = np.random.choice(np.arange(env.action_space.n),
                                            p=action_prob)

        # Store S_0 and A_0 so that state_store[k] = S_k, action_store[k] = A_k
        # and reward_store[k] = R_{k+1}.
        state_store = [state]
        action_store = [action]
        reward_store = []
        T = float("inf")
        for t in itertools.count():
            if t < T:
                next_state, reward, done, _ = env.step(action)
                state_store.append(next_state)
                reward_store.append(reward)
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                if done:
                    T = t + 1
                else:
                    next_action_prob = policy(next_state)
                    next_action = np.random.choice(np.arange(env.action_space.n),
                                                   p=next_action_prob)
                    action_store.append(next_action)

                    state = next_state
                    action = next_action
            tau = t - n + 1
            if tau >= 0:
                # n-step return G for the state-action pair visited at time tau
                G = 0.0
                for i in range(tau + 1, min(tau + n, T) + 1):
                    G += (discount_factor ** (i - tau - 1)) * reward_store[i - 1]
                if tau + n < T:
                    G += (discount_factor ** n) * Q[state_store[tau + n]][action_store[tau + n]]
                Q[state_store[tau]][action_store[tau]] += alpha * (G - Q[state_store[tau]][action_store[tau]])
            if tau == T - 1:
                break
    return Q, stats
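The target used in the update above is the n-step SARSA return

    G_{tau:tau+n} = R_{tau+1} + gamma * R_{tau+2} + ... + gamma^(n-1) * R_{min(tau+n, T)}
                    + gamma^n * Q(S_{tau+n}, A_{tau+n})   (bootstrap term only if tau + n < T)

i.e. n real rewards followed by a bootstrap from the current Q estimate.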
    def __init__(self, name, lock, batch, env, actor, critic, gamma,
                 max_episodes, episodes_to_train):
        threading.Thread.__init__(self)
        self.name = name
        self.env = env
        self.batch = batch
        self.actor = actor
        self.critic = critic
        self.lock = lock
        self.gamma = gamma
        self.max_episodes = max_episodes
        self.episodes_to_train = episodes_to_train
        self.stats = plotting.EpisodeStats(
            episode_lengths=np.zeros(10000),
            episode_rewards=np.zeros(10000))
Example #14
def sarsa(env,
          num_episodes,
          discount_factor=1.0,
          alpha=0.5,
          epsilon=0.1,
          exp=False):
    """
        Args:
            env: environment
            num_episodes
            discount_factor: gamma in the updated equ
            alpha: learning_rate
            epsilon: the prob of exploray
            exp: whether use Expected SARSA or not
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    method = "Expected SARSAR" if exp else "SARSA"
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  methods=method)
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in tqdm(range(num_episodes)):
        state = env.reset()
        action_prob = policy(state)
        action = np.random.choice(np.arange(env.action_space.n), p=action_prob)
        for t in itertools.count():
            next_state, reward, done, _ = env.step(action)
            next_action_prob = policy(next_state)
            next_action = np.random.choice(np.arange(env.action_space.n),
                                           p=next_action_prob)
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            if exp:
                # use expected sarsa!
                td_target = reward + discount_factor * np.dot(
                    Q[next_state], next_action_prob)
                td_error = td_target - Q[state][action]
                Q[state][action] += alpha * td_error
            else:
                td_target = reward + discount_factor * Q[next_state][
                    next_action]
                td_error = td_target - Q[state][action]
                Q[state][action] += alpha * td_error

            if done:
                break
            action = next_action
            state = next_state
    return Q, stats
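With exp=True the sampled next action in the target is replaced by an expectation over the current epsilon-greedy policy, td_target = r + gamma * sum_a' pi(a' | s') * Q(s', a'), which is exactly the np.dot(Q[next_state], next_action_prob) term above; this is the Expected SARSA variant.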
def q_learning_testing_rewards(env,
                               estimator,
                               reward_fn,
                               num_episodes,
                               discount_factor=1.0,
                               epsilon=0.0,
                               epsilon_decay=1.0,
                               render=False,
                               ep_details=False):
    '''
    Given the reward function, the RL agent learns the best policy.
    (Used for generating final results to compare with learning with the default handcrafted rewards.)
    '''
    # Statistics during learning process
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i in tqdm(range(num_episodes)):
        state = env.reset()
        done = False
        d = 0

        while not done and d <= 2000:

            prob = epsilon_greedy_policy(state, estimator,
                                         epsilon * epsilon_decay**i,
                                         env.action_space.n)
            action = np.random.choice(np.arange(len(prob)), p=prob)
            step = env.step(action)

            next_state = step[0]
            done = step[2]
            reward = reward_fn(state)
            if render:
                env.render()

            stats.episode_rewards[i] += reward
            stats.episode_lengths[i] += 1

            q_values_next = estimator.predict(next_state)
            td_target = reward + discount_factor * np.max(q_values_next)
            estimator.update(state, action, td_target)
            state = next_state
            d += 1

        if ep_details:
            print("Episode {} completed in {} timesteps".format(i, d))

    return stats
Example #16
def sarsa(env,
          estimator,
          num_episodes,
          discount_factor=1.0,
          epsilon=0.1,
          epsilon_decay=1.0):

    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):

        policy = make_epsilon_greedy_policy(estimator,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)
        state = env.reset()
        action_probs = policy(state)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)

        for t in itertools.count():

            next_state, reward, end, _ = env.step(action)
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            q_values_next = estimator.predict(next_state)
            td_target = reward + discount_factor * q_values_next[next_action]

            estimator.update(state, action, td_target)

            if i_episode % 10 == 0:
                print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, reward))

            if end:
                break

            state = next_state
            action = next_action
    return stats
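The function-approximation examples in this listing call estimator.predict(state) and estimator.update(state, action, td_target) without defining the estimator. A minimal per-action linear sketch with that interface (the class name, feature handling and learning rate are assumptions):

import numpy as np

class LinearEstimator:
    def __init__(self, n_features, n_actions, lr=0.01):
        # One weight vector per action: Q(s, a) = w_a . phi(s).
        self.w = np.zeros((n_actions, n_features))
        self.lr = lr

    def predict(self, state):
        # Q-values for all actions at this state (state is taken to be the feature vector).
        return self.w @ np.asarray(state, dtype=float)

    def update(self, state, action, td_target):
        # Semi-gradient step toward the TD target for the taken action.
        state = np.asarray(state, dtype=float)
        td_error = td_target - self.w[action] @ state
        self.w[action] += self.lr * td_error * state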
Example #17
def q_learning(env,
               estimator,
               num_episodes,
               discount_factor=1.0,
               epsilon=0.1,
               epsilon_decay=1.0):
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):

        # The policy we're following
        policy = make_epsilon_greedy_policy(estimator,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)

        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        print("\rEpisode {}/{} ({})".format(i_episode + 1, num_episodes,
                                            last_reward),
              end="")
        sys.stdout.flush()

        done = False
        state = env.reset()
        while not done:
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            nextstate, reward, done, _ = env.step(action)

            target = reward + discount_factor * np.max(
                estimator.predict(nextstate))
            estimator.update(state, action, target)

            stats.episode_lengths[i_episode] += 1
            stats.episode_rewards[i_episode] += reward

            state = nextstate

    return stats
    def initialize(self):
        self.step_idx = 0
        self.done_episodes = 0
        self.states, self.actions, self.adv, self.not_done_idx, self.last_states = [], [], [], [], []
        self.discounted_rewards = []
        self.epochs = 0
        self.done = False
        self.show = False
        self.video = 0
        self.solved = False
        self.episodes = 0
        self.video_index = 0
        self.last_rewards = collections.deque()
        self.state = self.env.reset()
        self.stats = plotting.EpisodeStats(
            episode_lengths=np.zeros(10000),
            episode_rewards=np.zeros(10000))
        if self.save_rendering:
            self.rec = gym.wrappers.monitoring.video_recorder.VideoRecorder(
                self.env, path=self.videopath + "/video1.mp4")
        self.actor_model = self.actor.build_model()
        self.critic_model = self.critic.build_model()
Example #19
def expected_sarsa(env,
                   estimator,
                   num_episodes,
                   discount_factor=1.0,
                   epsilon=0.015,
                   epsilon_decay=1.0):
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))
    rlist = []
    for i_episode in range(num_episodes):
        policy = make_epsilon_greedy_policy(estimator,
                                            epsilon * epsilon_decay**i_episode,
                                            env.action_space.n)
        #last_reward = stats.episode_rewards[i_episode - 1]
        state = env.reset()
        next_action = None
        cum_reward = 0
        for j in itertools.count():
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
            else:
                action = next_action
            env.render()
            next_state, reward, done, _ = env.step(action)

            cum_reward += reward
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = j
            Q_val = estimator.predict(next_state)
            action_probs = policy(next_state)
            td_target = reward + discount_factor * sum(action_probs * Q_val)
            estimator.update(state, action, td_target)
            if done:
                print('Episode no', i_episode)
                rlist.append(cum_reward)
                break
            state = next_state
    return stats
Example #20
def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes),
                  end="")
            sys.stdout.flush()

        done = False
        t = 0
        state = env.reset()
        props = policy(state)
        action = np.random.choice(np.arange(len(props)), p=props)
        while not done:
            nextstate, reward, done, _ = env.step(action)
            props = policy(nextstate)
            nextaction = np.random.choice(np.arange(len(props)), p=props)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            Q[state][action] += alpha * (
                reward + discount_factor * Q[nextstate][nextaction] -
                Q[state][action])

            state = nextstate
            action = nextaction
            t += 1

    return Q, stats
def q_learning_best_policy(env,
                           estimator,
                           num_episodes,
                           discount_factor=1.0,
                           epsilon=0.0,
                           epsilon_decay=1.0,
                           print_ep_lens=False):
    '''
    RL code for the learning part of the expert agent. This does not take the reward function;
    it uses the default environment reward function.
    '''
    # Statistics during learning process
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    for i in tqdm(range(num_episodes)):
        state = env.reset()
        done = False
        d = 0
        while not done:

            prob = epsilon_greedy_policy(state, estimator,
                                         epsilon * epsilon_decay**i,
                                         env.action_space.n)
            action = np.random.choice(np.arange(len(prob)), p=prob)
            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[i] += reward
            stats.episode_lengths[i] += 1

            q_values_next = estimator.predict(next_state)
            td_target = reward + discount_factor * np.max(q_values_next)
            estimator.update(state, action, td_target)
            state = next_state
            d += 1
        if print_ep_lens:
            print("Episode {} completed in {} timesteps".format(i, d))

    return stats
Example #22
def fa_test(env, estimator, num_episodes):
    stats_test = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                       episode_rewards=np.zeros(num_episodes),
                                       episode_transbag=np.zeros(num_episodes))

    for i_episode in range(num_episodes):

        # The policy we're following
        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats_test.episode_transbag[i_episode - 1]
        sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset_test()

        # One step in the environment
        for t in itertools.count():

            q_values = estimator.predict(state)
            best_action = np.argmax(q_values)
            next_state, reward, done, data_overflow = env.step(best_action)

            # Update statistics
            stats_test.episode_rewards[i_episode] += reward
            stats_test.episode_lengths[i_episode] = t
            stats_test.episode_transbag[i_episode] += data_overflow
            if done:
                break
            if t >= 1000:
                break
            state = next_state
        print("\r@ Episode {}/{} ({})".format(i_episode + 1, num_episodes,
                                              last_reward),
              end="")

    return stats_test
inputs = Input(shape=(env.observation_space.shape[0], ))
x = Dense(128, activation='relu')(inputs)
predictions = Dense(env.action_space.n, activation='softmax')(x)

model = Model(inputs, predictions)

total_rewards = []
step_idx = 0
done_episodes = 0

batch_episodes = 0
batch_states, batch_actions, batch_scales = [], [], []
cur_rewards = []
gamma = 0.99

stats = plotting.EpisodeStats(episode_lengths=np.zeros(TOTAL_EPISODES),
                              episode_rewards=np.zeros(TOTAL_EPISODES))

model.reset_states()
discounted_reward = 0
step_idx = 0
for i in range(1, TOTAL_EPISODES):

    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:

        step_idx += 1
        action = np.random.choice(
            [a for a in range(env.action_space.n)],
Example #24
        RANDOM_incentives = []

        POPULAR_rewards = []
        POPULAR_serveratio = []
        POPULAR_incentives = []

        MIN_rewards = []
        MIN_serveratio = []
        MIN_incentives = []

        rewards = []

        # Number of trials (episodes)
        no_episodes = 50

        stats = plotting.EpisodeStats(episode_lengths=np.zeros(no_episodes),
                                      episode_rewards=np.zeros(no_episodes))

        T = 2000
        number_of_contents = 10
        myenv = MyEnv(density=density,
                      T=T,
                      number_of_contents=number_of_contents)

        if (RL == False):
            RL = DeepQNetwork(myenv.no_actions,
                              myenv.observation_length,
                              learning_rate=0.001,
                              reward_decay=0.9,
                              e_greedy=0.9,
                              replace_target_iter=5000,
                              memory_size=2000,
Example #25
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma, the time discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    print('replay_memory_size = ' + str(replay_memory_size))
    print('replay_memory_init_size = ' + str(replay_memory_init_size))
    print('update_target_estimator_every = ' +
          str(update_target_estimator_every))
    print('epsilon_decay_steps = ' + str(epsilon_decay_steps))
    print('batch_size = ' + str(batch_size))
    print('numBlocks = ' + str(numBlocks))
    print('n_steps = ' + str(n_steps))
    print('n_hidden = ' + str(n_hidden))
    #    dict = {'replay_memory_size': str(replay_memory_size)}
    params_dict = {
        'replay_memory_size': replay_memory_size,
        'replay_memory_init_size': replay_memory_init_size,
        'update_target_estimator_every': update_target_estimator_every,
        'epsilon_decay_steps': epsilon_decay_steps,
        'batch_size': batch_size,
        'numBlocks': numBlocks,
        'n_steps': n_steps,
        'n_hidden': n_hidden
    }
    with open(experiment_dir + '/dump.txt', 'wb') as dump_file:
        pickle.dump(str.encode(str(params_dict)), dump_file)

    #Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    Transition = namedtuple(
        "Transition",
        ["state", "action", "reward", "next_state", "done", "prev_states"])
    # The replay memory
    replay_memory = []

    # Make model copier object
    estimator_copy = ModelParametersCopier(q_estimator, target_estimator)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # For 'system/' summaries, useful to check if the current process looks healthy
    current_process = psutil.Process()

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # The following code will show all the trainable variables
    #variables_names = [v.name for v in tf.trainable_variables()]
    #values = sess.run(variables_names)
    #for k, v in zip(variables_names, values):
    #    print ("Variable: ", k)
    #    print ("Shape: ", v.shape)
    #    print (v)
    #input ('Press any key')

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    #    print (state.shape)
    #    print (state)
    #    state = np.stack([state] * 4, axis=2)
    t = 0
    for i in range(replay_memory_init_size):
        #New version follows the existing policy, only useful if the estimator has been pre-loaded
        prev_states_current, prev_states_next = computePreviousStates(
            replay_memory, n_steps)
        action_probs = policy(sess, prev_states_current,
                              epsilons[min(total_t,
                                           epsilon_decay_steps - 1)], True)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        #print ("\nTaking action " + str(VALID_ACTIONS[action]))
        #Old version without following the policy
        #action = np.random.choice(VALID_ACTIONS)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        #next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        prev_states_current, prev_states_next = computePreviousStates(
            replay_memory, n_steps)
        # Incorrect code, delete
        #if (t!=0):
        #    print ('Inside loop t not zero')
        #    print (state)
        #    state[numBlocks:]=numBlocks+1
        #    print (state)
        # Move the following line inside the if
        #replay_memory.append(Transition(state, action, reward, next_state, done,prev_states_next))
        # Trick to consider the case where we end the episode, the next state will change
        # because we reset the environment
        orig_state = state
        if done:
            t = 0
            state = env.reset()
            state = state_processor.process(sess, state)
            # Note that we use state as the next_state because we have reset the environment
            replay_memory.append(
                Transition(orig_state, action, reward, state, done,
                           prev_states_next))
            #state = np.stack([state] * 4, axis=2)

        else:
            replay_memory.append(
                Transition(state, action, reward, next_state, done,
                           prev_states_next))
            state = next_state
            t = t + 1

    # Record videos
    # Add env Monitor wrapper
    #env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        env.render()
        state = state_processor.process(sess, state)
        #state = np.stack([state] * 4, axis=2)
        loss = None
        print('\n******** NEW EPISODE ***********************************\n')
        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                estimator_copy.make(sess)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end="")
            sys.stdout.flush()

            # Take a step
            prev_states_current, prev_states_next = computePreviousStates(
                replay_memory, n_steps)

            #action_probs = policy(sess, state, epsilon)
            action_probs = policy(sess, prev_states_current, epsilon, True)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            print("\nTaking action " + str(VALID_ACTIONS[action]))
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            #env.render()
            next_state = state_processor.process(sess, next_state)
            #            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            # Here we should consider that when the episode is done next_state should be replaced
            # by the first state in the next episode?
            replay_memory.append(
                Transition(state, action, reward, next_state, done,
                           prev_states_next))
            #replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch, prev_next_states_batch = map(
                np.array, zip(*samples))
            # Shape of next_states_batch is batch_size*n_input
            #print (next_states_batch.shape)
            #print (prev_next_states_batch.shape)
            # Calculate q values and targets
            #            q_values_next = target_estimator.predict(sess, next_states_batch)
            #print (prev_next_states_batch[0])
            # Shape of prev_next_states_batch is batch_size*n_steps*n_output
            prev_next_states_batch = np.reshape(prev_next_states_batch,
                                                (-1, n_steps, numBlocks * 2))
            #print (prev_next_states_batch[0])
            #print (prev_next_states_batch.shape)
            q_values_next = target_estimator.predict(sess,
                                                     prev_next_states_batch,
                                                     False)
            #print ('Q values next')
            #print (q_values_next.shape)
            #print (q_values_next)
            targets_batch = reward_batch + np.invert(done_batch).astype(
                np.float32) * discount_factor * np.amax(q_values_next, axis=1)
            #print ('Targets')
            #print (targets_batch.shape)
            #print (targets_batch)
            # Perform gradient descent update
            #states_batch = np.array(states_batch)
            states_batch = np.array(prev_next_states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch,
                                      targets_batch)

            if done:
                print('PROBLEM SOLVED')
                print('\n*******************************************\n')
                #We just make this call because we want to store the last state of the episode
                prev_states_current, prev_states_next = computePreviousStates(
                    replay_memory, n_steps)
                action_probs = policy(sess, prev_states_current, epsilon, True)
                break
            #if (loss>500):
            #    break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon")
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            tag="episode/reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            tag="episode/length")
        episode_summary.value.add(simple_value=current_process.cpu_percent(),
                                  tag="system/cpu_usage_percent")
        episode_summary.value.add(
            simple_value=current_process.memory_percent(memtype="vms"),
            tag="system/v_memeory_usage_percent")
        q_estimator.summary_writer.add_summary(episode_summary, i_episode)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    return stats
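ModelParametersCopier (and ModelParametersCopy in the next example) is not shown; it copies the trainable variables of the Q network into the target network. A sketch under the assumption that each estimator was built in its own TF1 variable scope exposed as estimator.scope:

import tensorflow as tf

class ModelParametersCopier:
    def __init__(self, estimator1, estimator2):
        # Pair up variables by name order and build assign ops (the .scope attribute is assumed).
        e1_params = sorted([v for v in tf.trainable_variables()
                            if v.name.startswith(estimator1.scope)], key=lambda v: v.name)
        e2_params = sorted([v for v in tf.trainable_variables()
                            if v.name.startswith(estimator2.scope)], key=lambda v: v.name)
        self.update_ops = [e2_v.assign(e1_v) for e1_v, e2_v in zip(e1_params, e2_params)]

    def make(self, sess):
        # Run the copy: target network <- Q network.
        sess.run(self.update_ops)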
def actor_critic(env, actor, critic, num_episodes,
    num_timesteps=5000, discount_factor=1.0):
    """
    Actor Critic Algorithm. Optimizes the policy 
    function approximator using policy gradient.
    
    Args:
        env: My self-created environment, specified above.
        actor: Policy function to be optimized
        critic: Value function approximator
        num_episodes: Number of episodes to run for
        num_timesteps: Number of timesteps per episode
        discount_factor: Time-discount factor
    
    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and
        episode_rewards.
    """
    
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))    
    
    Transition = collections.namedtuple("Transition", ["state", "action",
                                                       "reward", "next_state",
                                                       "done"])
    
    funds_wealth_all_episodes = []
    funds_return_all_ep = []    
    learning_fund_stats = np.zeros((num_episodes, num_timesteps, 6))
    
    for i_episode in range(num_episodes):
        
        # Reset everything
        prices = []
        funds_wealth = []
        funds_returns = []
        # Create our learning_fund
        learning_fund = LearningFund()
        
        # Create the funds (DynamicFund class is described in thurner_model.py) 
        number_of_funds = 10
        funds = [DynamicFund((i+1)*5) for i in range(number_of_funds)]
        
        # Add our learning fund
        funds.append(learning_fund)

        # Reset the environment 
        env.reset() 
        
        episode = []
        
        # One step in the environment
        for t in range(num_timesteps):
            
            # get the demand of the learning fund
            # (via getting demand from actor)
            
            demand = learning_fund.get_demand(env.p_t) 
            
            state = learning_fund.get_state(env.p_t)
            
            # Simulate a step in the environment,
            # record the wealth of all funds in current_wealth
            current_wealth, current_returns = env.step(funds)
            
            # record the wealth of all funds and the current price
            funds_wealth.append(current_wealth)
            funds_returns.append(current_returns)
            prices.append(env.p_t)
            
            # only update learning if learning fund is not bankrupt
            if learning_fund.is_active():
                # we assume one learning fund for the moment
                next_state = learning_fund.get_state(env.p_t) 
                
                reward = learning_fund.ret

                # experiment: high negative reward if learning_fund goes bankrupt
                #if learning_fund.activation_delay == 100:            
                #   reward = -100
 
                # Keep track of the transition
                episode.append(Transition(state=state, action=demand,
                                          reward=reward, next_state=next_state,
                                          done=env.done))
                
                # Update statistics
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                
                # Calculate TD Target
                value_next = critic.predict(next_state)
                td_target = reward + discount_factor * value_next
                td_error = td_target - critic.predict(state)
                
                # Update the value estimator
                critic.update(state, td_target)
                
                # Update the policy estimator
                # using the td error as our advantage estimate
                actor.update(state, td_error, demand)
            
            learning_fund_stats[i_episode][t] = np.array([env.p_t,
                                                          demand,
                                                          learning_fund.get_wealth(env.p_t),
                                                          learning_fund.cash,
                                                          learning_fund.shares,
                                                          learning_fund.ret])
            
            # Print out which step we're on, useful for debugging.
            print("\rt: {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes,
                    stats.episode_rewards[i_episode - 1]), end="")
            
                        
            # env.done is True if one fund increases its wealth 50-fold
            #if env.done:
            #    break
            
            state = next_state

        # After each episode, record the wealth of all funds
        funds_wealth_all_episodes.append(funds_wealth)
        funds_return_all_ep.append(funds_returns) 
        # Save the variables to disk.
        checkpoint = "./checkpoints/{}-ep{}".format(experiment_name,i_episode)
        save_path = saver.save(sess,checkpoint)         
        print("\nModel saved in path: {}\n".format(save_path))
    
    return stats, funds_wealth_all_episodes, funds_return_all_ep, learning_fund_stats
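
For reference, the critic/actor update in the loop above is a one-step TD (advantage) actor-critic step: the critic's estimate of the next state forms the TD target, and the TD error is used as the advantage for the policy update. Below is a minimal, self-contained sketch of that step with a hypothetical linear critic and a dummy actor update; none of these helper names come from the code above.

import numpy as np

# Hypothetical stand-in for the critic estimator used above (not the real class).
class LinearCritic:
    def __init__(self, n_features, lr=0.1):
        self.w = np.zeros(n_features)
        self.lr = lr

    def predict(self, state):
        return float(np.dot(self.w, state))

    def update(self, state, td_target):
        # move V(s) a small step toward the TD target
        self.w += self.lr * (td_target - self.predict(state)) * np.asarray(state, dtype=float)

def one_step_actor_critic_update(critic, actor_update, state, action, reward,
                                 next_state, discount_factor=0.99):
    """One TD(0) advantage update, mirroring the loop body above."""
    td_target = reward + discount_factor * critic.predict(next_state)
    td_error = td_target - critic.predict(state)   # advantage estimate
    critic.update(state, td_target)                # value-function step
    actor_update(state, td_error, action)          # policy step driven by the TD error
    return td_error

# Tiny usage example with a dummy actor update that just records the advantage.
critic = LinearCritic(n_features=2)
advantages = []
one_step_actor_critic_update(critic,
                             actor_update=lambda s, adv, a: advantages.append(adv),
                             state=[1.0, 0.0], action=0.3,
                             reward=0.05, next_state=[0.0, 1.0])
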
Exemplo n.º 27
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.0,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    '''
    The function that will train our network to play Breakout
    Args:
        sess: tensorflow session
        env: game environment
        q_estimator: q-value network
        target_estimator: target network
        state_processor: ProcessState class which will process the states
        num_episodes: number of episodes to run for
        experiment_dir: Directory to save tensorflow summaries in
        replay_memory_size: size of the replay memory
        replay_memory_init_size: number of random experiences to sample when initializing the replay memory.
        update_target_estimator_every: copy parameters from the Q estimator to the target estimator every N iterations
        discount_factor: discount factor
        epsilon_start: starting value of epsilon
        epsilon_end: ending value of epsilon
        epsilon_decay_steps: number of steps over which to decay the epsilon value
        batch_size: size of batch during training
        record_video_every: record video after these episodes
    Returns:
        a tuple with two numpy arrays, one for episode_lengths and the other for episode_rewards
    '''

    Transition = namedtuple(
        'Transition', ['state', 'action', 'reward', 'next_state', 'done'])

    # replay memory
    replay_memory = []

    # make model copier
    estimator_copy = ModelParametersCopy(q_estimator, target_estimator)

    # keep statistics of the important stuff
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # for 'system/' summaries, useful to check if the current process looks healthy
    current_process = psutil.Process()

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    # create directories if not exist
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    # saver
    saver = tf.train.Saver()

    # get to the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # policy
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # populate the replay memory with initial experience
    print("Populating replay memory...")

    # start new game
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)

    # run the loop to collect initial experience
    for i in range(replay_memory_init_size):
        # get action probabilities
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        # sample an action according to the epsilon-greedy probabilities
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        # perform one step
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        # process the state
        next_state = state_processor.process(sess, next_state)
        # append the new frame and drop the oldest, keeping a stack of the last 4 frames
        next_state = np.append(state[:, :, 1:],
                               np.expand_dims(next_state, 2),
                               axis=2)
        # add the state to replay memory
        replay_memory.append(
            Transition(state, action, reward, next_state, done))
        # if done reset
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # record videos
    env = Monitor(env,
                  directory=monitor_path,
                  video_callable=lambda count: count % record_video_every == 0,
                  resume=True)

    # run the model for num_episodes
    for i_episode in range(num_episodes):

        # save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # one step in the environment
        for t in itertools.count():

            # epsilon at the current time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # update the target network if required
            if total_t % update_target_estimator_every == 0:
                estimator_copy.make(sess)
                print("[!]Copying model parameters...\n")

            # print out which step we're on, helps in debugging
            print('\r[!]Step {} ({}) @ Episode {}/{}, loss: {}'.format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end=" ")
            sys.stdout.flush()

            # take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            # process the next state
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:],
                                   np.expand_dims(next_state, 2),
                                   axis=2)

            # if replay memory is full delete the first element
            if len(replay_memory) == replay_memory_size:
                del replay_memory[0]

            # Save the transition to replay memory
            replay_memory.append(
                Transition(state, action, reward, next_state, done))

            # update the statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # sample a minibatch from replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # calculate q_values and targets
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(
                np.float32) * discount_factor * np.amax(q_values_next,
                                                        axis=1)

            # perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch,
                                      targets_batch)

            # if done
            if done:
                break

            # else update the state and go on
            state = next_state
            total_t += 1

        # add summaries to the tensorboard
        episode_summary = tf.Summary()

        # add values
        episode_summary.value.add(simple_value=epsilon, tag='episode/epsilon')
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            tag='episode/reward')
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            tag='episode/length')
        episode_summary.value.add(simple_value=current_process.cpu_percent(),
                                  tag='system/cpu_usage_percent')
        episode_summary.value.add(
            simple_value=current_process.memory_percent(memtype='vms'),
            tag='system/v_memory_usage_percent')
        q_estimator.summary_writer.add_summary(episode_summary, i_episode)
        q_estimator.summary_writer.flush()

        # yield
        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    return stats
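
A note on the state handling used throughout these DQN examples: the processed frame is stacked four times along the channel axis at reset, and at every step the newest frame is appended while the oldest is dropped. Below is a standalone numpy sketch of that shape bookkeeping (the 84x84 frame size is an assumption, typical of Atari preprocessing, not something taken from the code above).

import numpy as np

# Assume an 84x84 grayscale frame, as produced by a typical Atari state processor.
frame = np.zeros((84, 84), dtype=np.uint8)

# At episode start, the same frame is repeated four times -> shape (84, 84, 4).
state = np.stack([frame] * 4, axis=2)
assert state.shape == (84, 84, 4)

# At each step, drop the oldest frame and append the newest one.
next_frame = np.ones((84, 84), dtype=np.uint8)
next_state = np.append(state[:, :, 1:], np.expand_dims(next_frame, 2), axis=2)
assert next_state.shape == (84, 84, 4)
assert next_state[:, :, -1].max() == 1   # the newest frame sits in the last channel
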
Exemplo n.º 28
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing 
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []
    
    # Make model copier object
    estimator_copy = ModelParametersCopier(q_estimator, target_estimator)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))
    
    # For 'system/' summaries, useful to check if the current process looks healthy
    current_process = psutil.Process()

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")
    
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    
    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = one_hot_encode_environment_state(state) # state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done = env.step(VALID_ACTIONS[action])
        next_state = one_hot_encode_environment_state(next_state) # state_processor.process(sess, next_state)
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = one_hot_encode_environment_state(state) # state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state


    # Record videos
    # Add env Monitor wrapper
    # env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = one_hot_encode_environment_state(state) # state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                estimator_copy.make(sess)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done = env.step(VALID_ACTIONS[action])
            next_state = one_hot_encode_environment_state(next_state) # state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))   

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1)

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon")
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag="episode/reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag="episode/length")
        episode_summary.value.add(simple_value=current_process.cpu_percent(), tag="system/cpu_usage_percent")
        episode_summary.value.add(simple_value=current_process.memory_percent(memtype="vms"), tag="system/v_memory_usage_percent")
        q_estimator.summary_writer.add_summary(episode_summary, i_episode)
        q_estimator.summary_writer.flush()
        
        yield total_t, plotting.EpisodeStats(episode_lengths=stats.episode_lengths[:i_episode+1], episode_rewards=stats.episode_rewards[:i_episode+1])

    return stats
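
Because this variant yields (total_t, EpisodeStats) after every episode, the caller drives training by iterating over the generator. The toy generator below only mimics that calling convention so the consumption pattern is runnable on its own; it is not the estimator-based training loop above.

import collections
import numpy as np

EpisodeStats = collections.namedtuple("EpisodeStats", ["episode_lengths", "episode_rewards"])

def toy_training_loop(num_episodes):
    """Stand-in generator: yields (total_t, stats so far) after each episode."""
    lengths = np.zeros(num_episodes)
    rewards = np.zeros(num_episodes)
    total_t = 0
    for i_episode in range(num_episodes):
        lengths[i_episode] = np.random.randint(10, 50)
        rewards[i_episode] = np.random.randn()
        total_t += int(lengths[i_episode])
        yield total_t, EpisodeStats(episode_lengths=lengths[:i_episode + 1],
                                    episode_rewards=rewards[:i_episode + 1])

# Consume the generator the same way one would consume the training loop above.
for total_t, episode_stats in toy_training_loop(num_episodes=5):
    print("step {}: latest episode reward {:.2f}".format(
        total_t, episode_stats.episode_rewards[-1]))
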
Exemplo n.º 29
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    DQN algorithm with off-policy Temporal Difference control.
    Returns an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    replay_memory = []

    # useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # get current time step
    total_t = sess.run(tf.train.get_global_step())

    # epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # q policy we are following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    action_mapping = compose_action_from_id()
    # load initial experience into replay memory
    print("Populating replay memory...")
    state = env.reset()

    state, _, _ = env.step(actions.FunctionCall(_SELECT_ARMY, [_SELECT_ADD]))

    # make the minimap data the state.
    #state = state[0].observation["rgb_minimap"].astype(np.uint8)
    #state = state_processor.process(sess, state)
    #state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        if i % 1000 == 0:
            print("iteration " + str(i))
        # according to policy, create a action probability array
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        # randomly select an action according to action probs from policy
        action = np.random.choice(np.arange(len(VALID_ACTIONS)),
                                  p=action_probs)
        # action = action_mapping[action]
        action = action_mapping[0]

        # openAI gym take a step in action space
        next_state, reward, done = env.step(action)
        # process image data
        #next_state = state_processor.process(sess, next_state)
        #next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)
        # add action to replay memory
        replay_memory.append(
            Transition(state, action, reward, next_state, done))
        if done:
            # if found goal, start over
            state = env.reset()
            state, _, _ = env.step(
                actions.FunctionCall(_SELECT_ARMY, [_SELECT_ADD]))
            # make the minimap data the state.
            #state = state[0].observation["rgb_minimap"].astype(np.uint8)
            #state = state_processor.process(sess, state)
            #state = np.stack([state] * 4, axis=2)

        else:
            # if not found goal, update state to next state
            state = next_state

    # record videos
    # ad env monitor wrapper
    # Does this work for PySC2?
    #env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0,
    #              resume=True)

    for i_episode in range(num_episodes):
        # save the current checkpoint
        if i_episode % 100 == 0:
            print("episode: " + str(i_episode))
        saver.save(tf.get_default_session(), checkpoint_path)

        # reset openAI environment
        state = env.reset()
        state, _, _ = env.step(
            actions.FunctionCall(_SELECT_ARMY, [_SELECT_ADD]))
        #state = state_processor.process(sess, state)
        #state = np.stack([state] * 4, axis=2)
        loss = None
        # main forloop after loading initial state
        for t in itertools.count():

            # epsilon for this timestep
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # add epsilon to tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # maybe update the target estimator
            # update means copying parameters from q estimator -> target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end="")
            sys.stdout.flush()

            # take the next step in the environment
            # similar to earlier when loading replay memory with first step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(VALID_ACTIONS)),
                                      p=action_probs)
            action = action_mapping[action]
            action = action_mapping[0]
            next_state, reward, done = env.step(action)
            #next_state = state_processor.process(sess, next_state)
            #next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            # if replay memory is full, pop
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # save transition to replay memory
            replay_memory.append(
                Transition(state, action, reward, next_state, done))

            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # sample minibatch from replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # calculate qvalues and targets
            # this is the core Q-learning update target
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(
                np.float32) * discount_factor * np.max(q_values_next, axis=1)

            # gradient descent
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch,
                                      targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add per-episode summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode],
                                  node_name="episode_reward",
                                  tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode],
                                  node_name="episode_length",
                                  tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    return stats
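
The target computation shared by these DQN examples masks out the bootstrap term for terminal transitions via the boolean done flags, i.e. y = r + gamma * (1 - done) * max_a Q_target(s', a). A small numeric sketch of that masking, with made-up Q-values for illustration:

import numpy as np

discount_factor = 0.99
reward_batch = np.array([1.0, 0.0, -1.0])
done_batch = np.array([False, False, True])          # last transition is terminal

# Hypothetical target-network Q-values for the three next states.
q_values_next = np.array([[0.2, 0.5],
                          [1.0, 0.1],
                          [3.0, 4.0]])

# np.invert on a boolean array flips it, so terminal rows contribute no bootstrap term.
targets_batch = (reward_batch
                 + np.invert(done_batch).astype(np.float32)
                 * discount_factor * np.amax(q_values_next, axis=1))

print(targets_batch)   # approximately [1.495, 0.99, -1.0]; the terminal target is just the reward
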
def reinforce(env,
              estimator_policy,
              estimator_value,
              num_episodes,
              discount_factor=1.0):
    """
    REINFORCE (Monte Carlo Policy Gradient) Algorithm. Optimizes the policy
    function approximator using policy gradient.
    
    Args:
        env: OpenAI environment.
        estimator_policy: Policy Function to be optimized 
        estimator_value: Value function approximator, used as a baseline
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
    
    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    for i_episode in range(num_episodes):
        # Reset the environment and start a new episode
        state = env.reset()

        episode = []

        # One step in the environment
        for t in itertools.count():

            # Take a step
            #action_means = np.ndarray.flatten(estimator_policy.predict(state))
            #action = np.random.multivariate_normal(mean=action_means, cov=full_var)

            action = estimator_policy.predict(state)
            '''
            max_idx = np.argmax(np.abs(action))
            a_max = action[max_idx]

            if a_max > high_threshold or a_max < low_threshold: 
                action_clipped = action / (10*np.abs(a_max))
            '''
            '''
            action_clipped = [np.max([np.min([action[0], high_threshold]), low_threshold]), 
                          np.max([np.min([action[1], high_threshold]),
                           low_threshold])]
            '''

            next_state, reward, done, _ = env.step(action)
            '''
            if t > 50:
                done = True
            '''

            # Keep track of the transition
            episode.append(
                Transition(state=state,
                           action=action,
                           reward=reward,
                           next_state=next_state,
                           done=done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            '''
            # Print out which step we're on, useful for debugging.
            print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end="")
            # sys.stdout.flush()
            '''

            if done:
                break

            state = next_state

        monitor_epoch = 50
        if i_episode % monitor_epoch == 0 and i_episode > 0:
            print("avg reward : %f" % (np.mean(
                stats.episode_rewards[i_episode - monitor_epoch:i_episode])))

        baseline_value = np.mean([cur_trans.reward for cur_trans in episode])

        # Go through the episode and make policy updates
        for t, transition in enumerate(episode):
            # The return after this timestep
            total_return = sum(discount_factor**i * trans.reward
                               for i, trans in enumerate(episode[t:]))

            advantage = total_return - baseline_value
            #advantage += np.max([0, baseline_value - v_prev])

            # Update our policy estimator
            estimator_policy.update(transition.state, advantage,
                                    transition.action)

            #print(p_action)

            if i_episode % 200 == 0 and i_episode > 0:
                plt.figure(1)
                plt.plot(transition.state[0], transition.state[1], 'bo')
                if t == len(episode) - 1:
                    plt.show()

    return stats
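
The return computation in the update loop above re-sums the discounted rewards from scratch at every timestep, which is O(T^2) per episode. The same returns can be accumulated in a single backward pass; the sketch below shows that equivalent computation together with the mean-reward baseline used above (the reward values are arbitrary examples):

import numpy as np

def discounted_returns(rewards, discount_factor=1.0):
    """G_t = sum_i gamma**i * r_{t+i}, computed in one backward pass."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_factor * running
        returns[t] = running
    return returns

rewards = [1.0, 0.0, 2.0]
returns = discounted_returns(rewards, discount_factor=0.9)
# G_2 = 2.0, G_1 = 0 + 0.9*2.0 = 1.8, G_0 = 1.0 + 0.9*1.8 = 2.62
print(returns)                      # [2.62 1.8  2.  ]

# As in the loop above, subtracting a simple baseline turns returns into advantages.
baseline = np.mean(rewards)
advantages = returns - baseline
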