def __init__(self, id, prediction_q, training_q, episode_log_q, display=False):
    super(ProcessAgent, self).__init__()
    self.id = id
    self.prediction_q = prediction_q
    self.training_q = training_q
    self.episode_log_q = episode_log_q

    self.env = Environment(display=display)
    self.discount_factor = Config.DISCOUNT
    # one frame at a time
    self.wait_q = Queue(maxsize=1)
    self.exit_flag = Value('i', 0)
def __init__(self, id, prediction_q, training_q, episode_log_q, reward_modifier_q=None):
    super(ProcessAgent, self).__init__()
    self.id = id
    self.prediction_q = prediction_q
    self.training_q = training_q
    self.episode_log_q = episode_log_q
    self.reward_modifier_q = reward_modifier_q

    self.env = Environment()
    self.num_actions = self.env.get_num_actions()
    self.onehots = np.eye(self.num_actions)
    self.actions = np.arange(self.num_actions)

    self.discount_factor = Config.DISCOUNT
    # one frame at a time
    self.wait_q = Queue(maxsize=1)
    self.exit_flag = Value('i', 0)
def __init__(self, reward_modifier=None):
    self.stats = ProcessStats()

    if reward_modifier:
        self.reward_modifier_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)
        self.reward_modifier = reward_modifier

    self.training_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)
    self.prediction_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)

    self.model = NetworkVP(Config.DEVICE, Config.NETWORK_NAME,
                           Environment().get_num_actions())
    if Config.LOAD_CHECKPOINT:
        self.stats.episode_count.value = self.model.try_to_load()

    self.training_step = 0
    self.frame_counter = 0

    self.agents = []
    self.predictors = []
    self.trainers = []
    self.dynamic_adjustment = ThreadDynamicAdjustment(self)
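# The agents below put (agent_id, done, path) on reward_modifier_q and then
# block on their wait_q until path["rewards"] comes back, which implies a
# server-side consumer. A hedged sketch of one follows; the class name
# ThreadRewardModifier and the predict_reward(path) method on the reward
# modifier are assumptions, not code from this repository.
from threading import Thread


class ThreadRewardModifier(Thread):
    def __init__(self, server):
        super(ThreadRewardModifier, self).__init__()
        self.setDaemon(True)
        self.server = server
        self.exit_flag = False

    def run(self):
        while not self.exit_flag:
            # Agents enqueue (agent_id, done, path) and block on their wait_q.
            agent_id, done, path = self.server.reward_modifier_q.get()
            # Assumed: the reward modifier scores each step of the path.
            rewards = self.server.reward_modifier.predict_reward(path)
            self.server.agents[agent_id].wait_q.put(rewards)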
import gym
import numpy as np

from ga3c.NetworkVP import NetworkVP
from ga3c.Environment import Environment
from ga3c.Config import Config as Ga3cConfig

np.set_printoptions(precision=2, linewidth=150)

if __name__ == '__main__':
    env_id = 'MontezumaRevenge-v0'
    model_names = ['left']

    Ga3cConfig.ATARI_GAME = env_id
    Ga3cConfig.MAKE_ENV_FUNCTION = gym.make
    Ga3cConfig.PLAY_MODE = True

    env = Environment()
    actions = np.arange(env.get_num_actions())

    done = False
    command = None
    command_steps = -1
    preprogrammed_sequences = {
        'BOTTOM_RIGHT': [11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
    }
    models = {
        name: NetworkVP('cpu:0', name, len(actions))
        for name in model_names  # assumed completion of the truncated comprehension
    }
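    # Hedged sketch of the play loop the excerpt stops before: step from the
    # active preprogrammed sequence, or greedily from a loaded model otherwise.
    # predict_p_and_v is the batch call GA3C's predictor threads use and is
    # assumed to be available on NetworkVP here; only the (reward, done, ...)
    # prefix of Environment.step's return is relied on, since the two
    # Environment versions in this section differ in what step() returns.
    # command is presumably set elsewhere (e.g., from user input), omitted here.
    while not done:
        if command is not None and command_steps + 1 < len(preprogrammed_sequences[command]):
            command_steps += 1
            action = preprogrammed_sequences[command][command_steps]
        elif env.current_state is None:
            action = 0  # NOOP until the frame stack is filled
        else:
            command, command_steps = None, -1
            p, v = models['left'].predict_p_and_v(np.array([env.current_state]))
            action = actions[int(np.argmax(p[0]))]
        result = env.step(action)
        done = result[1]  # (reward, done) or (reward, done, info)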
class ProcessAgent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q, display=False):
        super(ProcessAgent, self).__init__()
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        self.env = Environment(display=display)
        self.discount_factor = Config.DISCOUNT
        # one frame at a time
        self.wait_q = Queue(maxsize=1)
        self.exit_flag = Value('i', 0)

    @staticmethod
    def _accumulate_rewards(experiences, discount_factor, terminal_reward):
        reward_sum = terminal_reward
        for t in reversed(range(0, len(experiences) - 1)):
            r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX)
            reward_sum = discount_factor * reward_sum + r
            experiences[t].reward = reward_sum
        return experiences[:-1]

    @staticmethod
    def convert_data(experiences):
        x_ = np.array([exp.state for exp in experiences])
        a_ = np.array([exp.action for exp in experiences])
        r_ = np.array([exp.reward for exp in experiences])
        return x_, r_, a_

    def predict(self, state):
        # put the state in the prediction q
        # print('agent%d put one prediction' % self.id)
        self.prediction_q.put((self.id, state))
        # wait for the prediction to come back
        p, v = self.wait_q.get()
        return p, v

    def select_action(self, prediction):
        return prediction

    def run_episode(self):
        self.env.reset()
        done = False
        experiences = []

        time_count = 0
        reward_sum = 0.0

        while not done:
            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)
            reward, done = self.env.step(action)
            reward_sum += reward
            exp = Experience(self.env.previous_state, action, prediction, reward, done)
            experiences.append(exp)

            if done or time_count == Config.TIME_MAX:
                terminal_reward = 0 if done else value

                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, a_ = self.convert_data(updated_exps)
                yield x_, r_, a_, reward_sum

                # reset the tmax count
                time_count = 0
                # keep the last experience for the next batch
                experiences = [experiences[-1]]
                reward_sum = 0.0

            time_count += 1

    def run(self):
        # randomly sleep up to 1 second. helps agents boot smoothly.
        time.sleep(np.random.rand())
        np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))
        print('start agent')

        while self.exit_flag.value == 0:
            total_reward = 0
            total_length = 0
            for x_, r_, a_, reward_sum in self.run_episode():
                total_reward += reward_sum
                if self.id == 0:
                    print('sum of reward is %f' % total_reward)
                total_length += len(r_) + 1  # +1 for last frame that we drop
                self.training_q.put((x_, r_, a_))
            self.episode_log_q.put((datetime.now(), total_reward, total_length))
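# A hedged wiring sketch showing how a Server instance might spawn and retire
# these agents against its shared queues, mirroring stock GA3C. The helper
# names and the episode_log_q attribute on ProcessStats are assumptions.
def add_agent(server):
    agent = ProcessAgent(len(server.agents), server.prediction_q,
                         server.training_q, server.stats.episode_log_q)
    server.agents.append(agent)
    agent.start()


def remove_agent(server):
    agent = server.agents.pop()
    agent.exit_flag.value = 1  # run() loops while exit_flag.value == 0
    agent.join()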
class ProcessAgent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q, reward_modifier_q=None):
        super(ProcessAgent, self).__init__()
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q
        self.reward_modifier_q = reward_modifier_q

        self.env = Environment()
        self.num_actions = self.env.get_num_actions()
        self.onehots = np.eye(self.num_actions)
        self.actions = np.arange(self.num_actions)

        self.discount_factor = Config.DISCOUNT
        # one frame at a time
        self.wait_q = Queue(maxsize=1)
        self.exit_flag = Value('i', 0)

    @staticmethod
    def _accumulate_rewards(experiences, discount_factor, terminal_reward):
        reward_sum = terminal_reward
        for t in reversed(range(0, len(experiences) - 1)):
            r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX)
            reward_sum = discount_factor * reward_sum + r
            experiences[t].reward = reward_sum
        return experiences[:-1]

    def convert_data(self, experiences):
        x_ = np.array([exp.state for exp in experiences])
        a_ = self.onehots[np.array([exp.action for exp in experiences], dtype=int)].astype(np.float32)
        r_ = np.array([exp.reward for exp in experiences])
        return x_, r_, a_

    def predict(self, state):
        # put the state in the prediction q
        self.prediction_q.put((self.id, state))
        # wait for the prediction to come back
        p, v = self.wait_q.get()
        return p, v

    def select_action(self, prediction):
        if Config.PLAY_MODE:
            action = np.argmax(prediction)
        else:
            action = np.random.choice(self.actions, p=prediction)
        return action

    def run_episode(self):
        self.env.reset()
        done = False
        experiences = []
        path = {
            "obs": [],
            "original_rewards": [],
            "actions": [],
            "human_obs": [],
        }

        time_count = 0
        while not done:
            # very first few frames
            if self.env.current_state is None:
                self.env.step(0)  # 0 == NOOP
                continue

            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)
            reward, done, info = self.env.step(action)
            exp = Experience(self.env.previous_state, action, prediction, reward, done, info["human_obs"])
            experiences.append(exp)

            if done or time_count == Config.TIME_MAX:
                terminal_reward = 0 if done else value

                ################################
                #  START REWARD MODIFICATIONS  #
                ################################
                if self.reward_modifier_q:
                    # Translate the experiences into the "path" that RL-Teacher expects
                    if len(path["obs"]) > 0:
                        # Cut off the first item in the list because it's from an old episode
                        new_experiences = experiences[1:]
                    else:
                        new_experiences = experiences
                    path["obs"] += [e.state for e in new_experiences]
                    path["original_rewards"] += [e.reward for e in new_experiences]
                    path["actions"] += [e.action for e in new_experiences]
                    path["human_obs"] += [e.human_obs for e in new_experiences]

                    # TODO SPEED UP!! THIS IS SLOWING THINGS DOWN!
                    self.reward_modifier_q.put((self.id, done, path))
                    path["rewards"] = self.wait_q.get()

                    # Translate new rewards back into the experiences
                    for i in range(len(experiences)):
                        # Work backwards because the path is longer than the experience list, but their ends are synced
                        experiences[-(1 + i)].reward = path["rewards"][-(1 + i)]
                ################################
                #   END REWARD MODIFICATIONS   #
                ################################

                reward_sum = sum([x.reward for x in experiences])
                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, a_ = self.convert_data(updated_exps)
                yield x_, r_, a_, reward_sum

                # reset the tmax count
                time_count = 0
                # keep the last experience for the next batch
                experiences = [experiences[-1]]
                reward_sum = 0.0

            time_count += 1

    def run(self):
        # randomly sleep up to 1 second. helps agents boot smoothly.
        time.sleep(np.random.rand())
        np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))

        while self.exit_flag.value == 0:
            total_reward = 0
            total_length = 0
            for x_, r_, a_, reward_sum in self.run_episode():
                total_reward += reward_sum
                total_length += len(r_) + 1  # +1 for last frame that we drop
                self.training_q.put((x_, r_, a_))
            self.episode_log_q.put((datetime.now(), total_reward, total_length))
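# Both agent versions assume an Experience container shaped like this; a
# minimal sketch, with human_obs optional so it covers the constructor calls
# in both classes above.
class Experience(object):
    def __init__(self, state, action, prediction, reward, done, human_obs=None):
        self.state = state
        self.action = action
        self.prediction = prediction
        self.reward = reward
        self.done = done
        self.human_obs = human_obs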