def run(self): env = gym.make("CartPole-v0") # Create the environment episode_done = True trajectory = Trajectory() next_state = [] episode_num = 0 episode_reward = 1 # TODO: check for alternative init rewards while True: if self.stop: # Check if we need to stop the current thread print("Worker:",self.worker_id, "STOP") break if episode_done: # Check if the episode is done state = env.reset() # Reset the environment and recover initial state episode_done = False template = "in worker {}, episode {} done after {} steps" # Template for print print(template.format(self.worker_id, episode_num, episode_reward)) self.res_queue.put(episode_reward) episode_num += 1 episode_reward = 1 for i in range(LOOKAHEAD): #print("I=",i) if self.is_eval(): # Render curretn state of the environement env.render() state = tf.convert_to_tensor(state) # Convert state to tensor state = tf.expand_dims(state, 0) action = self.act(state) # Select action based on the current policy. trajectory.store(s=state, a=action) state, reward, episode_done, _ = env.step(action) # Recover next state, reward and if the episode is done. #print("|New state\n",state,"\n|reward: \n",reward," \n|episode done \n",episode_done) if episode_done: # If the agent fails the task reward = -1 # Store the reward trajectory.store(r=reward) episode_reward += reward next_state = state # update current state if episode_done: next_state = [] break if episode_done and self.is_eval(): if self.is_eval(): env.close() template = "in worker {}, episode {} done after {} steps" # Template for print print(template.format(self.worker_id, episode_num, episode_reward)) break if not self.is_eval(): self.train(trajectory, next_state)# Update network using the trajectory trajectory.clear()
def run(self): env = gym.make("CartPole-v0") episode_done = True trajectory = Trajectory() next_state = [] episode_num = 0 episode_reward = 1 while True: if episode_done: state = env.reset() episode_done = False template = "in worker {}, episode {} done after {} steps" print( template.format(self.worker_id, episode_num, episode_reward)) self.res_queue.put(episode_reward) episode_num += 1 episode_reward = 1 for i in range(LOOKAHEAD): # env.render() state = tf.convert_to_tensor(state) state = tf.expand_dims(state, 0) action = self.act(state) trajectory.store(s=state, a=action) state, reward, episode_done, _ = env.step(action) if episode_done: reward = -1 trajectory.store(r=reward) episode_reward += reward next_state = state if episode_done: next_state = [] break self.train(trajectory, next_state) trajectory.clear()