Example #1
    def test(self, num_actions):
        self.saver.restore(self.session, FLAGS.checkpoint_path)
        print "Restored model weights from ", FLAGS.checkpoint_path
        monitor_env = gym.make(FLAGS.game)
        monitor_env.monitor.start("/tmp/" + FLAGS.game ,force=True)
        env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)
        
        for i_episode in xrange(FLAGS.num_eval_episodes):
            state = env.get_initial_state()
            episode_reward = 0
            done = False
            
            # create state sequence
            state_sequence = np.zeros((t_max, FLAGS.history_length, FLAGS.width, FLAGS.height))
            state_sequence[t_max -1, :, :, :] = state
            
            while not done:
                monitor_env.render()
                q_values = self.q_values.eval(session = self.session, feed_dict = {self.state : [state_sequence]})
                action_index = np.argmax(q_values)
                new_state, reward, done = env.step(action_index)
                state = new_state

                # update state sequence
                state_sequence = np.delete(state_sequence, 0, 0)
                state_sequence = np.insert(state_sequence, t_max-1, state, 0)
                episode_reward += reward
            print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)
        
        monitor_env.monitor.close()
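
The examples in this listing all rely on an Env wrapper that preprocesses the raw Gym/Doom frames and exposes get_initial_state() plus step(action_index) returning (new_state, reward, done). The wrapper itself is not included here; the class below is only a minimal sketch of the assumed interface, and the grayscale/resize preprocessing and stacked frame history are assumptions rather than the original implementation.

import numpy as np
from collections import deque

try:
    import cv2  # used only for resizing frames; optional in this sketch
except ImportError:
    cv2 = None


class Env(object):
    """Minimal sketch of the wrapper interface assumed by the examples."""

    def __init__(self, gym_env, width, height, history_length, game_type='Atari'):
        self.env = gym_env
        self.width = width
        self.height = height
        self.history_length = history_length
        self.game_type = game_type
        self.frames = deque(maxlen=history_length)

    def _preprocess(self, frame):
        # convert the RGB frame to grayscale and resize it (details are assumptions)
        gray = np.mean(frame, axis=2).astype(np.uint8)
        if cv2 is not None:
            gray = cv2.resize(gray, (self.width, self.height))
        return gray / 255.0

    def get_initial_state(self):
        # reset the underlying environment and fill the frame history
        frame = self._preprocess(self.env.reset())
        for _ in range(self.history_length):
            self.frames.append(frame)
        return np.stack(self.frames, axis=0)

    def step(self, action_index):
        # old Gym API: step returns (observation, reward, done, info)
        frame, reward, done, _ = self.env.step(action_index)
        self.frames.append(self._preprocess(frame))
        return np.stack(self.frames, axis=0), reward, done
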
Example #2
    def test(self, env):

        # initialize environment
        env = Env(env, 84, 84, 4)

        terminal = False
        # Get initial game observation
        state = env.get_initial_state()

        # episode's reward and cost
        episode_reward = 0

        for _ in range(100):
            while not terminal:

                # forward pass of network. Get probability of all actions
                probs, v = self.sess.run((self.policy, self.state_value),
                                         feed_dict={self.input_state: [state]})

                probs = probs[0]
                v = v[0][0]

                if random.random() < 0.01:
                    action_index = random.choice([0, 1, 2, 3])
                else:
                    action_index = np.argmax(probs)

                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, terminal = env.step(action_index)
                env.env.render()
                # Update the state
                state = new_state
                # update episode's counter
                episode_reward += reward

            if terminal:

                terminal = False
                print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                    episode_reward, "/ COST"
                episode_reward = 0
                counter = 0
                # Get initial game observation
                state = env.get_initial_state()
Example #3
    def test(self, num_actions):
        self.saver.restore(self.session, FLAGS.checkpoint_path)
        print "Restored model weights from ", FLAGS.checkpoint_path
        monitor_env = gym.make(FLAGS.game)
        monitor_env.monitor.start("/tmp/" + FLAGS.game ,force=True)
        env = Env(monitor_env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

        for i_episode in xrange(FLAGS.num_eval_episodes):
            state = env.get_initial_state()
            episode_reward = 0
            done = False
            while not done:
                monitor_env.render()
                probs = self.session.run(self.policy_values, feed_dict={self.state: [state]})[0]
                action_index = sample_policy_action(num_actions, probs)
                new_state, reward, done = env.step(action_index)
                state = new_state
                episode_reward += reward
            print "Finished episode " + str(i_episode + 1) + " with score " + str(episode_reward)
        
        monitor_env.monitor.close()
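
sample_policy_action(num_actions, probs) is used here and in Example #4 but is not defined in this listing. A plausible minimal implementation, assuming it simply draws an action index from the policy's probability vector (with a renormalisation step to guard against floating-point drift), is:

import numpy as np


def sample_policy_action(num_actions, probs):
    # renormalise so the probabilities sum to exactly 1 before sampling
    probs = np.asarray(probs, dtype=np.float64)
    probs = probs / probs.sum()
    # draw one sample from the categorical distribution over the num_actions actions
    histogram = np.random.multinomial(1, probs)
    return int(np.argmax(histogram))

Note that Example #5 calls it with a single argument, sample_policy_action(probs), so the exact signature differs between the projects these snippets come from.
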
Example #4
    def actor_learner_thread(self, env, thread_id, num_actions):

        # create instance of Doom environment
        env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)
   
        print 'Starting thread ' + str(thread_id) 
        time.sleep(3*thread_id)
        
        # Get initial game observation
        state = env.get_initial_state()

        # episode's counter
        episode_reward = 0
        counter = 0

        while self.T < self.TMAX:
                    
            done = False
            
            # clear gradients
            states = []
            actions = []
            prev_reward = []

            t = 0
            t_start = t
            
            # synchronize policy and value network
            self.session.run(self.update_policy[thread_id])
            self.session.run(self.update_value[thread_id])
            
            while not (done or ((t - t_start) == t_max)):
                
                # forward pass of network. Get probability of all actions
                probs = self.session.run(self.local_policy[thread_id], feed_dict={self.local_states[thread_id]: [state]})[0]

                # define a one-hot action vector: all values are zero except
                # the one for the action that is executed
                action_list = np.zeros([num_actions])

                # choose action based on policy
                action_index = sample_policy_action(num_actions, probs)
                action_list[action_index] = 1

                # add state and action to list
                actions.append(action_list)
                states.append(state)
                
                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, done = env.step(action_index)

                # clip reward to -1, 1
                clipped_reward = np.clip(reward, -1, 1)
                prev_reward.append(clipped_reward)

                # Update the state and global counters
                state = new_state
                self.T += 1
                t += 1
                counter += 1
                # update episode's counter
                episode_reward += reward
    
    
                # Save model progress
                if counter % FLAGS.checkpoint_interval == 0:
                    if FLAGS.game_type == 'Doom':
                        self.saver.save(self.session, FLAGS.checkpoint_dir+"/" + FLAGS.game.split("/")[1] + ".ckpt" , global_step = counter)
                    else:
                        self.saver.save(self.session, FLAGS.checkpoint_dir+"/" + FLAGS.game + ".ckpt" , global_step = counter)

            if done:
                R_t = 0
            else:
                R_t = self.session.run(self.local_value[thread_id], feed_dict = {self.local_states[thread_id] : [state]})[0][0]

            targets = np.zeros((t - t_start))
                
            for i in range(t - t_start - 1, -1, -1):
                R_t = prev_reward[i] + FLAGS.gamma * R_t
                targets[i] = R_t

            # update the global policy and value networks
            self.session.run(self.grad_update, feed_dict = {self.state: states,
                                                          self.actions: actions,
                                                          self.targets: targets})
                
            if done:
                print "THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", counter, "/ REWARD", episode_reward
                episode_reward = 0
                
                # Get initial game observation
                state = env.get_initial_state()
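
The backwards loop near the end of this thread (and the equivalent loops in Examples #5 and #7) computes bootstrapped n-step returns: each target is R_i = r_i + gamma * R_{i+1}, starting from the value estimate of the last state, or from 0 if the rollout ended at a terminal state. The same computation, pulled out into a standalone helper for clarity (this helper is not part of the original code):

import numpy as np


def n_step_targets(rewards, bootstrap_value, gamma):
    # discounted n-step returns, computed backwards exactly as in the loop above
    targets = np.zeros(len(rewards))
    R_t = bootstrap_value  # 0 if the rollout ended at a terminal state
    for i in range(len(rewards) - 1, -1, -1):
        R_t = rewards[i] + gamma * R_t
        targets[i] = R_t
    return targets


# e.g. rewards [1, 0, 1], bootstrap 0.5, gamma 0.99:
# targets[2] = 1 + 0.99 * 0.5
# targets[1] = 0 + 0.99 * targets[2]
# targets[0] = 1 + 0.99 * targets[1]
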
Example #5
    def train(self,
              env,
              checkpoint_interval,
              checkpoint_dir,
              saver,
              gamma=0.99):
        global T
        self.saver = saver

        # initialize environment
        time.sleep(3 * self.thread_id)
        env = Env(env, 84, 84, 4)

        print 'Starting thread ' + str(self.thread_id)

        terminal = False
        # Get initial game observation

        state = env.get_initial_state()

        # episode's reward and cost
        episode_reward = 0
        total_cost = 0
        counter = 0

        while T < self.TMAX:

            # lists for feeding placeholders
            states = []
            actions = []
            prev_reward = []
            state_values = []

            t = 0
            t_start = t
            self.sess.run(self.sync_op)
            while not (terminal or ((t - t_start) == self.tmax)):

                # forward pass of network. Get probability of all actions
                probs, v = self.sess.run((self.policy, self.state_value),
                                         feed_dict={self.input_state: [state]})

                probs = probs[0]
                v = v[0][0]
                # print the outputs of the neural network for a sanity check
                if T % 2000 == 0:
                    print probs
                    print v

                # define a one-hot action vector: all values are zero except
                # the one for the action that is executed
                action_list = np.zeros([self.output_size])

                # choose action based on policy
                action_index = sample_policy_action(probs)

                action_list[action_index] = 1

                # add state and action to list
                actions.append(action_list)
                states.append(state)

                state_values.append(v)

                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, terminal = env.step(action_index)

                # clip reward to -1, 1
                clipped_reward = np.clip(reward, -1, 1)
                prev_reward.append(clipped_reward)

                # Update the state and global counters
                state = new_state
                T += 1
                t += 1
                counter += 1

                # update episode's counter
                episode_reward += reward

                # Save model progress
                if T % checkpoint_interval < 200:
                    T += 200
                    self.saver.save(self.sess,
                                    checkpoint_dir + "/breakout.ckpt",
                                    global_step=T)

            if terminal:
                R_t = 0
            else:
                R_t = self.sess.run(self.state_value,
                                    feed_dict={self.input_state: [state]})
                R_t = R_t[0][0]

            state_values.append(R_t)
            targets = np.zeros((t - t_start))

            for i in range(t - t_start - 1, -1, -1):
                R_t = prev_reward[i] + gamma * R_t
                targets[i] = R_t

            # compute the advantage based on GAE
            # code from https://github.com/openai/universe-starter-agent
            delta = np.array(prev_reward) + gamma * np.array(
                state_values[1:]) - np.array(state_values[:-1])
            advantage = scipy.signal.lfilter([1], [1, -gamma],
                                             delta[::-1],
                                             axis=0)[::-1]

            # update the global network
            cost, _ = self.sess.run(
                (self.loss, self.opt),
                feed_dict={
                    self.input_state: states,
                    self.actions: actions,
                    self.targets: targets,
                    self.advantage: advantage
                })
            total_cost += cost

            if terminal:

                terminal = False
                print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                    episode_reward, "/ COST", total_cost/counter
                episode_reward = 0
                total_cost = 0
                counter = 0

                # Get initial game observation
                state = env.get_initial_state()
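
The scipy.signal.lfilter([1], [1, -gamma], delta[::-1], axis=0)[::-1] call above (taken from openai/universe-starter-agent) is a vectorised discounted cumulative sum: it satisfies y[i] = delta[i] + gamma * y[i + 1], the same recursion used for the targets a few lines earlier. A small self-contained check of that equivalence:

import numpy as np
import scipy.signal


def discount(x, gamma):
    # y[i] = x[i] + gamma * y[i + 1], computed for the whole sequence at once
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]


delta = np.array([0.1, -0.2, 0.3])
gamma = 0.99

# explicit backwards recursion for comparison
expected = np.zeros_like(delta)
running = 0.0
for i in range(len(delta) - 1, -1, -1):
    running = delta[i] + gamma * running
    expected[i] = running

assert np.allclose(discount(delta, gamma), expected)
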
Example #6
    def actor_learner_thread(self, env, thread_id, num_actions):

        # create instance of Doom environment
        env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)

        # Initialize network gradients
        states = []
        actions = []
        targets = []

        initial_epsilon = 1
        epsilon = 1
        final_epsilon = self.sample_final_epsilon()
        print('Starting thread ' + str(thread_id) + ' with final epsilon ' + str(final_epsilon))

        time.sleep(3 * thread_id)
        t = 0

        while self.T < self.TMAX:

            # Get initial game observation
            state = env.get_initial_state()
            done = False

            # episode's counter
            episode_reward = 0
            mean_q = 0
            frames = 0

            while not done:
                # forward pass of network. Get Q(s,a)
                q_values = self.q_values.eval(session=self.session, feed_dict={self.state: [state]})

                # define a one-hot action vector: all values are zero except
                # the one for the action that is executed
                action_list = np.zeros([num_actions])

                action_index = 0

                # choose action based on current policy
                if random.random() <= epsilon:
                    action_index = random.randrange(num_actions)
                else:
                    action_index = np.argmax(q_values)
                action_list[action_index] = 1

                # reduce epsilon
                if epsilon > final_epsilon:
                    epsilon -= (initial_epsilon - final_epsilon) / FLAGS.anneal_epsilon_timesteps

                # decrease learning rate
                if self.lr > 0:
                    self.lr -= FLAGS.learning_rate / self.TMAX

                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, done = env.step(action_index)

                # forward pass of target network. Get Q(s',a)
                target_q_values = self.target_q_values.eval(session=self.session,
                                                            feed_dict={self.new_state: [new_state]})

                # clip reward to -1, 1
                clipped_reward = np.clip(reward, -1, 1)

                # compute targets based on Q-learning update rule
                # targets = r + gamma*max(Q(s',a))
                if done:
                    targets.append(clipped_reward)
                else:
                    targets.append(clipped_reward + FLAGS.gamma * np.max(target_q_values))

                actions.append(action_list)
                states.append(state)

                # Update the state and global counters
                state = new_state
                self.T += 1
                t += 1

                # update episode's counter
                frames += 1
                episode_reward += reward
                mean_q += np.max(q_values)

                # update_target_network
                if self.T % FLAGS.target_network_update_frequency == 0:
                    self.session.run(self.update_target)

                # train online network
                if t % FLAGS.network_update_frequency == 0 or done:
                    if states:
                        self.session.run(self.grad_update, feed_dict={self.state: states,
                                                                      self.actions: actions,
                                                                      self.targets: targets,
                                                                      self.learning_rate: self.lr})
                    # Clear gradients
                    states = []
                    actions = []
                    targets = []

                # Save model progress
                if t % FLAGS.checkpoint_interval == 0:
                    if FLAGS.game_type == 'Doom':
                        self.saver.save(self.session, FLAGS.checkpoint_dir + "/" + FLAGS.game.split("/")[1] + ".ckpt",
                                        global_step=t)
                    else:
                        self.saver.save(self.session, FLAGS.checkpoint_dir + "/" + FLAGS.game + ".ckpt", global_step=t)

                # Print end of episode stats
                if done:
                    print("THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", t, "/ EPSILON", epsilon, "/ REWARD",
                          episode_reward, "/ Q_MAX %.4f" % (mean_q / float(frames)), "/ EPSILON PROGRESS",
                          t / float(FLAGS.anneal_epsilon_timesteps))
                    break
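
self.sample_final_epsilon() is not shown in this listing. In the asynchronous one-step Q-learning setup of Mnih et al. (2016), each actor-learner thread samples its final exploration rate from {0.1, 0.01, 0.5} with probabilities {0.4, 0.3, 0.3}; assuming this example follows that scheme, a sketch could look like:

import numpy as np


def sample_final_epsilon():
    # per-thread final epsilon, as described in the asynchronous methods paper
    final_epsilons = np.array([0.1, 0.01, 0.5])
    probabilities = np.array([0.4, 0.3, 0.3])
    return float(np.random.choice(final_epsilons, p=probabilities))
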
Example #7
    def actor_learner_thread(self, env, thread_id, num_actions):

        # create instance of Doom environment
        env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length,
                  FLAGS.game_type)

        initial_epsilon = 1
        epsilon = 1
        final_epsilon = self.sample_final_epsilon()
        print 'Starting thread ' + str(
            thread_id) + ' with final epsilon ' + str(final_epsilon)
        time.sleep(3 * thread_id)

        # Get initial game observation
        state = env.get_initial_state()

        # episode's counter
        episode_reward = 0
        mean_q = 0
        frames = 0
        counter = 0
        while self.T < self.TMAX:

            done = False

            # clear gradients
            states = []
            actions = []
            targets = []
            prev_reward = []

            t = 0
            t_start = t
            self.session.run(self.update_local_model[thread_id])

            while not (done or ((t - t_start) == t_max)):
                # forward pass of network. Get Q(s,a)
                q_values = self.local_values[thread_id].eval(
                    session=self.session,
                    feed_dict={self.local_states[thread_id]: [state]})

                # define a one-hot action vector: all values are zero except
                # the one for the action that is executed
                action_list = np.zeros([num_actions])

                # choose action based on current policy
                if random.random() <= epsilon:
                    action_index = random.randrange(num_actions)
                else:
                    action_index = np.argmax(q_values)
                action_list[action_index] = 1

                # add state and action to list
                actions.append(action_list)
                states.append(state)

                # reduce epsilon
                if epsilon > final_epsilon:
                    epsilon -= (initial_epsilon -
                                final_epsilon) / FLAGS.anneal_epsilon_timesteps

                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, done = env.step(action_index)

                # clip reward to -1, 1
                clipped_reward = np.clip(reward, -1, 1)
                prev_reward.append(clipped_reward)

                # Update the state and global counters
                state = new_state
                self.T += 1
                t += 1
                counter += 1
                # update episode's counter
                frames += 1
                episode_reward += reward
                mean_q += np.max(q_values)

                # update_target_network
                if self.T % FLAGS.target_network_update_frequency == 0:
                    print "Target Network Updated"
                    self.session.run(self.update_target)

                # Save model progress
                if self.T % FLAGS.checkpoint_interval < 400:
                    self.T += 400
                    if FLAGS.game_type == 'Doom':
                        self.saver.save(self.session,
                                        FLAGS.checkpoint_dir + "/" +
                                        FLAGS.game.split("/")[1] + ".ckpt",
                                        global_step=self.T)
                    else:
                        self.saver.save(self.session,
                                        FLAGS.checkpoint_dir + "/" +
                                        FLAGS.game + ".ckpt",
                                        global_step=self.T)

            if done:
                R_t = 0
            else:
                R_t = np.max(
                    self.target_q_values.eval(
                        session=self.session,
                        feed_dict={self.new_state: [state]}))

            targets = np.zeros((t - t_start))

            for i in range(t - t_start - 1, -1, -1):
                R_t = prev_reward[i] + FLAGS.gamma * R_t
                targets[i] = R_t

            # update Q-value network
            self.session.run(self.grad_update[thread_id],
                             feed_dict={
                                 self.state: states,
                                 self.actions: actions,
                                 self.targets: targets
                             })

            if done:
                print "THREAD:", thread_id, "/ TIME", self.T, "/ TIMESTEP", counter, "/ EPSILON", epsilon, "/ REWARD", episode_reward, "/ Q_MAX %.4f" % (
                    mean_q /
                    float(frames)), "/ EPSILON PROGRESS", counter / float(
                        FLAGS.anneal_epsilon_timesteps)
                file_path = 'rewards'
                try:
                    with open(file_path, 'a+') as f:
                        f.write(
                            str(episode_reward) + ', ' +
                            str((mean_q / float(frames))) + '\n')
                except IOError:
                    with open(file_path, 'w+') as f:
                        f.write(
                            str(episode_reward) + ', ' +
                            str((mean_q / float(frames))) + '\n')
                # Get initial game observation
                episode_reward = 0
                mean_q = 0
                frames = 0
                state = env.get_initial_state()
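
Both Q-learning examples call self.session.run(self.update_target) to copy the online network's weights into the target network. The op itself is defined elsewhere; with TensorFlow 1.x it is typically built as a group of per-variable assign ops, along the lines of the sketch below (the variable scope names 'online' and 'target' are assumptions, not taken from the original code):

import tensorflow as tf


def build_update_target_op(online_scope='online', target_scope='target'):
    # collect the matching variable lists from both networks
    online_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=online_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    # one assign per variable pair, grouped into a single op
    assigns = [tf.assign(t, o) for o, t in zip(online_vars, target_vars)]
    return tf.group(*assigns)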