Example #1
    def run(self):
        env = gym.make("CartPole-v0")  # Create the environment

        episode_done = True  # Force a reset on the first loop iteration
        trajectory = Trajectory()
        next_state = []
        episode_num = 0
        episode_reward = 1  # TODO: check for alternative init rewards

        while True:

            if self.stop:  # Check whether this worker thread should stop
                print("Worker:", self.worker_id, "STOP")
                break

            if episode_done:  # Report the finished episode and start a new one
                state = env.reset()  # Reset the environment and get the initial state
                episode_done = False
                template = "in worker {}, episode {} done after {} steps"
                print(template.format(self.worker_id, episode_num, episode_reward))
                self.res_queue.put(episode_reward)
                episode_num += 1
                episode_reward = 1

            for i in range(LOOKAHEAD):
                if self.is_eval():  # Render the current state of the environment
                    env.render()
                state = tf.convert_to_tensor(state)  # Convert state to a tensor
                state = tf.expand_dims(state, 0)     # Add a batch dimension
                action = self.act(state)  # Select an action from the current policy
                trajectory.store(s=state, a=action)

                state, reward, episode_done, _ = env.step(action)  # Next state, reward, done flag

                if episode_done:  # Penalize failing the task
                    reward = -1
                trajectory.store(r=reward)  # Store the reward
                episode_reward += reward
                next_state = state  # Keep the latest state for bootstrapping in train()

                if episode_done:
                    next_state = []  # Terminal state: nothing to bootstrap from
                    break

            if episode_done and self.is_eval():  # In evaluation mode, stop after one episode
                env.close()
                template = "in worker {}, episode {} done after {} steps"
                print(template.format(self.worker_id, episode_num, episode_reward))
                break

            if not self.is_eval():
                self.train(trajectory, next_state)  # Update the network using the trajectory
            trajectory.clear()  # Start the next lookahead segment with an empty buffer
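
Both examples call into a `Trajectory` helper that is not shown. From the usage (`trajectory.store(s=state, a=action)`, `trajectory.store(r=reward)`, and `trajectory.clear()`), a minimal sketch could look like the following; the list-based storage and field names are assumptions, not the original implementation:

    class Trajectory:
        """Minimal rollout buffer matching the assumed store()/clear() interface."""

        def __init__(self):
            self.states, self.actions, self.rewards = [], [], []

        def store(self, s=None, a=None, r=None):
            # Append only the fields that were passed in this call.
            if s is not None:
                self.states.append(s)
            if a is not None:
                self.actions.append(a)
            if r is not None:
                self.rewards.append(r)

        def clear(self):
            self.states, self.actions, self.rewards = [], [], []

Note that rewards are stored one call after states and actions, matching how run() records (s, a) before stepping the environment and r after.
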
Example #2
    def run(self):
        env = gym.make("CartPole-v0")
        episode_done = True
        trajectory = Trajectory()
        next_state = []
        episode_num = 0
        episode_reward = 1

        while True:
            if episode_done:
                state = env.reset()
                episode_done = False
                template = "in worker {}, episode {} done after {} steps"
                print(
                    template.format(self.worker_id, episode_num,
                                    episode_reward))
                self.res_queue.put(episode_reward)
                episode_num += 1
                episode_reward = 1

            for i in range(LOOKAHEAD):
                # env.render()
                state = tf.convert_to_tensor(state)
                state = tf.expand_dims(state, 0)
                action = self.act(state)
                trajectory.store(s=state, a=action)
                state, reward, episode_done, _ = env.step(action)
                if episode_done:
                    reward = -1

                trajectory.store(r=reward)
                episode_reward += reward
                next_state = state

                if episode_done:
                    next_state = []
                    break

            self.train(trajectory, next_state)
            trajectory.clear()
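
The `train()` method is likewise outside the snippet. The `LOOKAHEAD` loop and the `next_state` argument suggest an n-step actor-critic update in the A3C style, where `next_state = []` marks a terminal state whose bootstrap value is zero. Below is a hedged sketch of the return computation such a method would need; `GAMMA` is an assumed discount factor, not taken from the source:

    GAMMA = 0.99  # Assumed discount factor

    def n_step_returns(rewards, bootstrap_value, gamma=GAMMA):
        """Discounted returns R_t = r_t + gamma * R_{t+1}, seeded with the
        critic's value estimate of next_state (0.0 when the episode ended)."""
        returns = []
        running = bootstrap_value
        for r in reversed(rewards):
            running = r + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    # Terminal episode with rewards [1, 1, -1] and no bootstrap value:
    print(n_step_returns([1.0, 1.0, -1.0], 0.0))  # ~[1.0099, 0.01, -1.0]
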