def __init__(self, agent, name, environment, network, global_dict, report_frequency,
             network_update_steps=5, reward_clip_thresholds=(-1, 1),
             auxiliary_model_path=None, alpha=0.2, epsilon=0.02):
    super().__init__(agent=agent, name=name, environment=environment, network=network,
                     global_dict=global_dict, report_frequency=report_frequency)

    self.async_update_steps = network_update_steps
    self.reward_clip_thresholds = reward_clip_thresholds
    self.initial_learning_rate = network.get_config().get_initial_learning_rate()
    self.current_learning_rate = self.initial_learning_rate
    self.learning_rate_annealer = Annealer(self.initial_learning_rate, 0,
                                           self.agent.max_training_steps)

    self.auxiliary_model_path = auxiliary_model_path
    self.alpha = alpha
    self.epsilon = epsilon
    self.load_model()
def __init__(self, environment, num_of_epochs=10, steps_per_epoch=100000, log_dir='./train/moq',
             using_e_greedy=True, report_frequency=100, summary_frequency=900000,
             discounted_factor=0.9, learning_rate=0.9, traces_factor=0.9, batch_size=5,
             epsilon_annealing_start=0.9, load_model_path=None, thresholds=None,
             target_reward=None, is_linear=False):
    super().__init__(None, environment, num_of_threads=1, num_of_epochs=num_of_epochs,
                     steps_per_epoch=steps_per_epoch, log_dir=log_dir,
                     using_e_greedy=using_e_greedy, anneal_learning_rate=False,
                     report_frequency=report_frequency, save_frequency=summary_frequency)

    self.initial_learning_rate = learning_rate
    self.current_learning_rate = learning_rate
    self.load_model_path = load_model_path
    self.gamma = discounted_factor
    self.traces_factor = traces_factor
    self.is_linear = is_linear

    # Disable annealing learning rate
    self.learning_rate_annealer = Annealer(self.initial_learning_rate, 0, None)

    self.num_of_objectives = environment.get_num_of_objectives()
    self.init_q_values = [0.] * self.num_of_objectives
    self.thresholds = [0.] * (self.num_of_objectives - 1)

    if not is_linear:
        self.table = TLOLookupTable(environment=environment, init_value=0.,
                                    thresholds=self.thresholds)
    else:
        self.table = LinearLookupTable(environment=environment, init_value=0.,
                                       thresholds=self.thresholds)

    self.env_pool = [self.env.clone() for _ in range(self.num_of_threads)]

    self.thread_host = AgentMonitor(self, network=None, log_dir=self.log_dir,
                                    save_interval=summary_frequency,
                                    max_training_epochs=self.num_of_epochs,
                                    steps_per_epoch=self.steps_per_epoch,
                                    multi_objectives=True, idle_time=0)

    self.thread_pool = [MOQWorker(self, name='MOQWorker: ' + str(t),
                                  environment=self.env_pool[t],
                                  global_dict=self.thread_host.shared_dict,
                                  num_of_objs=self.num_of_objectives,
                                  async_update_steps=1,
                                  using_e_greedy=self.using_e_greedy,
                                  report_frequency=report_frequency,
                                  epsilon_annealing_start=epsilon_annealing_start,
                                  epsilon_annealing_choices=[0],
                                  epsilon_annealing_probabilities=[1.0],
                                  epsilon_annealing_steps=num_of_epochs * steps_per_epoch,
                                  global_epsilon_annealing=True,
                                  gamma=discounted_factor,
                                  traces_factor=traces_factor,
                                  batch_size=batch_size,
                                  load_model_path=load_model_path,
                                  lookup_table=self.table,
                                  thresholds=thresholds,
                                  target_reward=target_reward,
                                  is_linear=is_linear)
                        for t in range(self.num_of_threads)]
class MOQLearner(Learner):
    def __init__(self, agent, name, environment, network, global_dict, report_frequency,
                 batch_size=5, discounted_factor=0.9, learning_rate=0.9, traces_factor=0.9,
                 epsilon_annealing_start=0.9, epsilon_annealing_end=0, load_model_path=None,
                 thresholds=None, target_reward=None, is_linear=False, using_e_greedy=True,
                 async_update_steps=1):
        super().__init__(agent=agent, name=name, environment=environment, network=network,
                         global_dict=global_dict, report_frequency=report_frequency)

        self.load_model_path = load_model_path
        self.target_reward = target_reward
        self.is_linear = is_linear
        self.discounted_factor = discounted_factor
        self.traces_factor = traces_factor
        self.using_e_greedy = using_e_greedy
        self.async_update_steps = async_update_steps

        self.num_of_objectives = environment.get_number_of_objectives()
        self.init_q_values = [0.] * self.num_of_objectives

        if thresholds is None:
            if not is_linear:
                self.thresholds = [0.] * (self.num_of_objectives - 1)
            else:
                self.thresholds = [1. / self.num_of_objectives] * self.num_of_objectives
        else:
            self.thresholds = thresholds

        global table
        with global_dict[AgentMonitor.Q_LOCK]:
            if table is None:
                if not is_linear:
                    table = TLOLookupTable(environment=environment, init_value=0.,
                                           thresholds=self.thresholds)
                else:
                    table = LinearLookupTable(environment=environment, init_value=0.,
                                              thresholds=self.thresholds)
        self.table = table

        self.batch_size = batch_size
        self.epsilon_annealer = Annealer(epsilon_annealing_start, epsilon_annealing_end,
                                         self.agent.max_training_steps)
        self.current_learning_rate = learning_rate
        self.current_epsilon = epsilon_annealing_start
        self.converged = False

    @staticmethod
    def get_default_number_of_learners():
        return 1

    def load_model(self):
        self.table.load_value_function(self.load_model_path)
        print("Load values:")
        self.table.print_values()

    def save_model(self, file_name):
        print("Save values:")
        self.table.print_values()
        self.table.save_value_function(file_name)

    def get_action(self, state):
        if self.using_e_greedy:
            if np.random.uniform(0, 1) <= self.current_epsilon:
                e_greedy = np.random.randint(self.num_actions)
                return e_greedy
            else:
                return self.table.select_greedy_action(state)
        else:
            return self.table.select_greedy_action(state)

    def report(self, reward):
        print(self.name, 'Episode Count:', self.eps_count, 'Episode reward:', reward,
              'Steps:', self.environment.get_current_steps(), 'Step count:', self.step_count,
              'Learning rate:', self.current_learning_rate, 'Epsilon:', self.current_epsilon,
              'Thresholds:', self.thresholds)

        # Testing purpose
        if self.target_reward is not None and self.thresholds is not None:
            backup_epsilon = self.current_epsilon
            self.current_epsilon = 0
            greedy_reward = self.run_episode()
            self.global_dict[AgentMonitor.Q_ADD_REWARD](greedy_reward,
                                                        self.environment.get_current_steps())
            self.current_epsilon = backup_epsilon

            converged = True
            for i in range(len(greedy_reward)):
                if greedy_reward[i] != self.target_reward[i]:
                    converged = False
                    break
            if converged:
                print("Converged")
                self.converged = True

    def update(self, state, action, reward, next_state, terminal):
        self.step_count += 1
        self.global_dict['counter'] += 1

        if not self.testing:
            if self.step_count % self.async_update_steps == 0:
                if not terminal:
                    greedy = self.get_action(state)
                    self.table.calculate_td_errors(action, state, greedy, next_state,
                                                   self.discounted_factor, reward)
                else:
                    self.table.calculate_terminal_td_errors(action, state,
                                                            self.discounted_factor, reward)
                self.table.update(action, state, 1.0, self.current_learning_rate)

            self.current_epsilon = self.epsilon_annealer.anneal(
                self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
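# Illustrative sketch (not part of the library): TLOLookupTable.select_greedy_action, used by
# MOQLearner above, is assumed to rank multi-objective Q-vectors by thresholded lexicographic
# ordering (TLO), i.e. every objective except the last is clipped at its threshold before the
# vectors are compared lexicographically. The helper below is hypothetical and only shows that
# ordering; it is not the library's implementation.
def _tlo_greedy_action_sketch(q_values_per_action, thresholds):
    # q_values_per_action: one Q-vector per action; thresholds: one per objective except the last.
    def tlo_key(q_vector):
        clipped = [min(q, t) for q, t in zip(q_vector[:-1], thresholds)]
        return tuple(clipped) + (q_vector[-1],)

    best_action, best_key = 0, tlo_key(q_values_per_action[0])
    for action, q_vector in enumerate(q_values_per_action[1:], start=1):
        key = tlo_key(q_vector)
        if key > best_key:
            best_action, best_key = action, key
    return best_action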
class TDLearner(Learner):
    def __init__(self, agent, name, environment, network, global_dict, report_frequency,
                 step_size=0.9, epsilon_annealing_start=0.9, epsilon_annealing_end=0,
                 load_model_path=None, using_e_greedy=True):
        super().__init__(agent=agent, name=name, environment=environment, network=network,
                         global_dict=global_dict, report_frequency=report_frequency)

        self.step_size = step_size
        self.epsilon_start = epsilon_annealing_start
        self.epsilon_end = epsilon_annealing_end
        self.load_model_path = load_model_path
        self.using_e_greedy = using_e_greedy

        r, _ = environment.get_state_space().get_range()
        self.num_of_states = len(r)
        self.table = [0.5 for _ in range(self.num_of_states)]

        self.epsilon_annealer = Annealer(epsilon_annealing_start, epsilon_annealing_end,
                                         self.agent.max_training_steps)
        self.current_epsilon = epsilon_annealing_start

    def report(self, reward):
        print(self.name, 'Episode Count:', self.eps_count, 'Episode reward:', reward,
              'Steps:', self.environment.get_current_steps(), 'Step count:', self.step_count,
              'Learning rate:', self.step_size, 'Epsilon:', self.current_epsilon)

    def get_action(self, state):
        if self.using_e_greedy:
            if np.random.uniform(0, 1) <= self.current_epsilon:
                e_greedy = np.random.randint(self.num_actions)
                return e_greedy
            else:
                return self.table.select_greedy_action(state)
        else:
            return self.table.select_greedy_action(state)

    def update(self, state, action, reward, next_state, terminal):
        self.step_count += 1
        self.global_dict[AgentMonitor.Q_GLOBAL_STEPS] += 1

        if not self.testing:
            if terminal:
                self.data_dict['returns'].append(0)
                for i in range(len(self.data_dict['states'])):
                    new_val = self.data_dict['returns'][-i - 2] = \
                        self.data_dict['rewards'][-i - 1] + \
                        self.discounted_factor * self.data_dict['returns'][-i - 1]
                    cur_val = self.table.get_q_values(self.data_dict['actions'][-i - 1],
                                                      self.data_dict['states'][-i - 1])
                    new_val = (cur_val * self.eps_count + new_val) / (self.eps_count + 1)
                    self.table.set_q_values(self.data_dict['actions'][-i - 1],
                                            self.data_dict['states'][-i - 1], new_val)

            self.current_epsilon = self.epsilon_annealer.anneal(
                self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
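# Illustrative sketch (not the library's Annealer): the learners above treat
# Annealer(start, end, steps) as a schedule whose anneal(step) / anneal_to(step) return a value
# moved from `start` towards `end` over `steps` steps. A minimal linear version under that
# assumption, including the `steps=None` case used to disable annealing:
class _LinearAnnealerSketch:
    def __init__(self, start, end, steps):
        self.start, self.end, self.steps = start, end, steps

    def anneal(self, step):
        if self.steps is None or self.steps <= 0:
            return self.start  # annealing disabled
        fraction = min(max(step / self.steps, 0.0), 1.0)
        return self.start + (self.end - self.start) * fraction

    anneal_to = anneal  # both names appear above; assumed to behave the same way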
class A3CLearner(Learner):
    def __init__(self, agent, name, environment, network, global_dict, report_frequency,
                 network_update_steps=5, reward_clip_thresholds=(-1, 1)):
        super().__init__(agent=agent, name=name, environment=environment, network=network,
                         global_dict=global_dict, report_frequency=report_frequency)

        self.async_update_steps = network_update_steps
        self.reward_clip_thresholds = reward_clip_thresholds
        self.initial_learning_rate = network.get_config().get_initial_learning_rate()
        self.current_learning_rate = self.initial_learning_rate
        self.learning_rate_annealer = Annealer(self.initial_learning_rate, 0,
                                               self.agent.max_training_steps)

    @staticmethod
    def get_default_number_of_learners():
        return multiprocessing.cpu_count()

    def reset(self):
        super().reset()
        self.reset_batch()

    def reset_batch(self):
        self.data_dict['states'] = []
        self.data_dict['actions'] = []
        self.data_dict['rewards'] = []
        self.data_dict['next_states'] = []
        self.data_dict['terminals'] = []

    def get_action(self, state):
        probs = self.get_probs(state)
        action_probs = probs - np.finfo(np.float32).epsneg
        try:
            sample = np.random.multinomial(1, action_probs)
            action_index = int(np.nonzero(sample)[0])
        except:
            print('Select greedy action', action_probs)
            action_index = np.argmax(probs)
        return action_index

    def update(self, state, action, reward, next_state, terminal):
        if self.history_length > 1:
            self.frame_buffer.add_state(state)

        if self.reward_clip_thresholds is not None:
            reward = np.clip(reward, self.reward_clip_thresholds[0],
                             self.reward_clip_thresholds[1])

        if not self.testing:
            if self.history_length > 1:
                current_s = self.frame_buffer.get_buffer()[0]
                next_s = self.frame_buffer.get_buffer_add_state(next_state)[0]
            else:
                current_s = state
                next_s = next_state

            self.data_dict['states'].append(current_s)
            self.data_dict['actions'].append(action)
            self.data_dict['rewards'].append(reward)
            self.data_dict['next_states'].append(next_s)
            self.data_dict['terminals'].append(terminal)

        self.step_count += 1
        self.global_dict[AgentMonitor.Q_GLOBAL_STEPS] += 1

        if not self.testing:
            if self.step_count % self.async_update_steps == 0 or terminal:
                logging = self.global_dict[AgentMonitor.Q_LOGGING]

                self.current_learning_rate = self.learning_rate_annealer.anneal(
                    self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                self.data_dict['learning_rate'] = self.current_learning_rate
                self.global_dict[AgentMonitor.Q_LEARNING_RATE] = self.current_learning_rate

                if logging:
                    self.global_dict[AgentMonitor.Q_LOGGING] = False
                    self.data_dict['logging'] = True
                    summary = self.network.train_network(self.data_dict)
                    self.global_dict[AgentMonitor.Q_WRITER].\
                        add_summary(summary,
                                    global_step=self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                else:
                    self.data_dict['logging'] = False
                    self.network.train_network(self.data_dict)

                self.reset_batch()
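# Note on A3CLearner.get_action above: np.random.multinomial raises ValueError when the
# probability vector sums to slightly more than 1 because of float32 rounding, which is why
# np.finfo(np.float32).epsneg is subtracted before sampling. A standalone illustration of that
# trick, with hypothetical probabilities (not library code):
import numpy as np


def _multinomial_sampling_demo():
    probs = np.array([0.2, 0.3, 0.5], dtype=np.float32)  # hypothetical policy output
    safe_probs = probs - np.finfo(np.float32).epsneg     # nudge the sum just below 1
    sample = np.random.multinomial(1, safe_probs)        # one-hot draw over the actions
    return int(np.nonzero(sample)[0][0])                 # index of the sampled action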
class PrioritizedDQNLearner(Learner):
    def __init__(self, agent, name, environment, network, global_dict, report_frequency,
                 batch_size=32, warmup_steps=50000, training_frequency=4,
                 experience_replay_size=2**19, epsilon_annealing_start=1,
                 epsilon_annealing_end=0.1, initial_beta=0.4, prioritized_alpha=0.6,
                 epsilon_annealing_steps=1e6, reward_clip_thresholds=(-1, 1)):
        super().__init__(agent=agent, name=name, environment=environment, network=network,
                         global_dict=global_dict, report_frequency=report_frequency)

        global experience_replay
        with global_dict[AgentMonitor.Q_LOCK]:
            if experience_replay is None:
                experience_replay = SyncSumTree(
                    alpha=prioritized_alpha, size=experience_replay_size,
                    state_history=network.get_config().get_history_length(), debug=False)
        self.replay = experience_replay

        self.batch_size = batch_size
        self.warmup_steps = warmup_steps
        self.training_frequency = training_frequency
        self.reward_clip_thresholds = reward_clip_thresholds
        self.epsilon_annealer = Annealer(epsilon_annealing_start, epsilon_annealing_end,
                                         epsilon_annealing_steps)
        self.current_learning_rate = network.get_config().get_initial_learning_rate()
        self.current_epsilon = epsilon_annealing_start
        self.beta_annealer = Annealer(initial_beta, 1, self.agent.max_training_steps)
        self.current_beta = initial_beta
        self.initial_beta = initial_beta
        self.prioritized_alpha = prioritized_alpha

    def initialize(self):
        self.data_dict = {
            'states': [],
            'actions': [],
            'rewards': [],
            'next_states': [],
            'terminals': [],
            'learning_rate': self.network.get_config().get_initial_learning_rate(),
            'logging': False,
            'global_step': 0,
            'is_weights': None
        }

    @staticmethod
    def get_default_number_of_learners():
        return 1

    def get_action(self, state):
        probs = self.get_probs(state)
        if self.current_epsilon is not None:
            if np.random.uniform(0, 1) < self.current_epsilon:
                return np.random.randint(0, len(probs))
            else:
                return np.argmax(probs)
        else:
            return np.argmax(probs)

    def report(self, reward):
        print(self.name, 'Episode Count:', self.eps_count, 'Episode reward:', reward,
              'Steps:', self.environment.get_current_steps(), 'Step count:', self.step_count,
              'Learning rate:', self.global_dict[AgentMonitor.Q_LEARNING_RATE],
              'Epsilon:', self.current_epsilon, 'Beta:', self.current_beta)

    def update(self, state, action, reward, next_state, terminal):
        if self.history_length > 1:
            self.frame_buffer.add_state(state)

        if self.reward_clip_thresholds is not None:
            reward = np.clip(reward, self.reward_clip_thresholds[0],
                             self.reward_clip_thresholds[1])

        if not self.testing:
            if self.history_length > 1:
                current_s = self.frame_buffer.get_buffer()[0]
                next_s = self.frame_buffer.get_buffer_add_state(next_state)[0]
            else:
                current_s = state
                next_s = next_state
            self.replay.append(current_s, action, reward, next_s, terminal)

        self.step_count += 1
        self.global_dict['counter'] += 1

        if self.step_count < self.warmup_steps:
            return

        if not self.testing:
            if self.step_count % self.training_frequency == 0:
                logging = self.global_dict[AgentMonitor.Q_LOGGING]

                s, a, r, n, t, e, w, p, mw = self.replay.get_mini_batch(
                    batch_size=self.batch_size, current_beta=self.current_beta)
                self.current_beta = self.beta_annealer.anneal(
                    self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])

                self.data_dict['states'] = s
                self.data_dict['actions'] = a
                self.data_dict['rewards'] = r
                self.data_dict['next_states'] = n
                self.data_dict['terminals'] = t
                self.data_dict['learning_rate'] = self.current_learning_rate
                self.data_dict['global_step'] = self.global_dict[AgentMonitor.Q_GLOBAL_STEPS]
                self.data_dict['is_weights'] = w

                if logging:
                    self.global_dict[AgentMonitor.Q_LOGGING] = False
                    self.data_dict['logging'] = True
                    summary = self.network.train_network(self.data_dict)
                    self.global_dict[AgentMonitor.Q_WRITER]. \
                        add_summary(summary,
                                    global_step=self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                else:
                    self.data_dict['logging'] = False
                    self.network.train_network(self.data_dict)

                td_errors = self.network.network_config.get_td_errors(
                    self.network.get_session(), self.data_dict)
                if self.reward_clip_thresholds is not None:
                    td_errors = np.clip(td_errors, self.reward_clip_thresholds[0],
                                        self.reward_clip_thresholds[1])
                self.replay.update_mini_batch(e, td_errors)

                if self.step_count % 100000 == 0:
                    print('###################################################################')
                    print('TD Errors:', td_errors)
                    print('Beta:', self.current_beta)
                    print('Mini Batches:', e)
                    print('Weights:', w)
                    print('Max Weight:', mw)
                    print('Probability:', p)
                    print('###################################################################')

            self.current_epsilon = self.epsilon_annealer.anneal(
                self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
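# Illustrative sketch (assumed semantics, not the SyncSumTree implementation): in proportional
# prioritized replay the sampling probability of transition i is P(i) = p_i**alpha / sum_k p_k**alpha,
# and get_mini_batch is assumed to return importance-sampling weights
# w_i = (N * P(i))**(-beta), normalised by the largest weight, which is what feeds
# data_dict['is_weights'] above. The helper and its arguments are hypothetical.
import numpy as np


def _prioritized_is_weights_sketch(priorities, batch_indices, alpha=0.6, beta=0.4):
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()                                 # P(i) for every stored transition
    weights = (len(priorities) * probs[batch_indices]) ** (-beta)
    return weights / weights.max()                       # normalised IS weights for the batch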
class MOBaseThreadLearner(threading.Thread, MOBaseLearner):
    def __init__(self, agent, name, environment, global_dict, num_of_objs=1,
                 async_update_steps=5, using_e_greedy=True, epsilon_annealing_start=1,
                 epsilon_annealing_choices=[0.1, 0.01, 0.5],
                 epsilon_annealing_probabilities=[0.4, 0.3, 0.3],
                 epsilon_annealing_steps=10000, global_epsilon_annealing=True,
                 report_frequency=1, gamma=0.9, traces_factor=0.9, batch_size=5,
                 load_model_path=None, lookup_table=None, thresholds=None,
                 target_reward=None, is_linear=False):
        MOBaseLearner.__init__(self, num_of_objs)
        threading.Thread.__init__(self)

        range, is_range = environment.get_action_space().get_range()
        if not is_range:
            raise ValueError("Does not support this type of action space")

        self.using_e_greedy = using_e_greedy
        if using_e_greedy:
            end_rand = np.random.choice(epsilon_annealing_choices,
                                        p=epsilon_annealing_probabilities)
            self.epsilon_annealer = Annealer(epsilon_annealing_start, end_rand,
                                             epsilon_annealing_steps)
        self.current_epsilon = epsilon_annealing_start

        self.step_count = 0
        self.eps_count = 0
        self.environment = environment
        self.name = name
        self.agent = agent
        self.gamma = gamma
        self.traces_factor = traces_factor
        self.batch_size = batch_size
        self.load_model_path = load_model_path
        self.num_actions = len(range)
        self.async_update_step = async_update_steps
        self.global_dict = global_dict
        self.global_epsilon_annealing = global_epsilon_annealing
        self.report_frequency = report_frequency
        self.minibatch_vars = {}
        self.reset_minibatch()
        self.testing = False

        self.target_reward = target_reward
        self.is_linear = is_linear
        self.thresholds = thresholds
        if self.thresholds is None:
            self.thresholds = [0] * (self.num_of_objs - 1)

        self.pareto_solutions = self.environment.get_pareto_solutions()
        # if self.pareto_solutions is not None:
        #     for i in range(len(self.pareto_solutions)):

        self.table = lookup_table
        self.table.set_threshold(self.thresholds)

        if self.load_model_path is not None:
            self.agent.load_model()

        self.alpha = self.agent.get_current_learning_rate()

    def reset(self):
        self.testing = self.agent.is_testing_mode
        self.reset_minibatch()
        # self.environment.render()

    def run(self):
        while not self.global_dict['done']:
            reward = self.run_episode(self.environment)
            self.eps_count += 1
            if self.target_reward is None:
                self.global_dict['add_reward'](reward, self.environment.get_current_steps())

            if self.eps_count % self.report_frequency == 0:
                current_epsilon = ''
                if self.using_e_greedy:
                    current_epsilon = 'Current epsilon: {0}'.format(self.current_epsilon)
                print(self.name, 'Episode Count:', self.eps_count, 'Episode reward:', reward,
                      'Steps:', self.environment.get_current_steps(),
                      'Step count:', self.step_count, current_epsilon)

                # Testing purpose
                if self.target_reward is not None and self.thresholds is not None:
                    backup_epsilon = self.current_epsilon
                    self.current_epsilon = 0
                    greedy_reward = self.run_episode(self.environment)
                    self.global_dict['add_reward'](greedy_reward,
                                                   self.environment.get_current_steps())
                    self.current_epsilon = backup_epsilon

                    converged = True
                    for i in range(len(greedy_reward)):
                        if greedy_reward[i] != self.target_reward[i]:
                            converged = False
                            break
                    if converged:
                        print("Converged")
                        self.agent.converged = True

            if not self.testing:
                print(self.current_epsilon)
                self.anneal_epsilon()

    def update(self, *args, **kwargs):
        return NotImplemented

    def anneal_epsilon(self):
        if self.using_e_greedy:
            anneal_step = self.global_dict['counter'] \
                if self.global_epsilon_annealing else self.step_count
            self.current_epsilon = self.epsilon_annealer.anneal_to(anneal_step)

    def get_action(self, state):
        if self.using_e_greedy:
            # print(self.table.select_greedy_action(state))
            if np.random.uniform(0, 1) <= self.current_epsilon:
                e_greedy = np.random.randint(self.num_actions)
                return e_greedy
            else:
                return self.table.select_greedy_action(state)
        else:
            return self.table.select_greedy_action(state)

    def reset_minibatch(self):
        pass
class MOExpReplayBaseThreadLearner(threading.Thread, MOBaseLearner):
    def __init__(self, agent, name, environment, network, global_dict, async_update_steps=1,
                 reward_clip_vals=None, using_e_greedy=True, epsilon_annealing_start=1,
                 epsilon_annealing_choices=[0.1, 0.01, 0.5],
                 epsilon_annealing_probabilities=[0.4, 0.3, 0.3],
                 epsilon_annealing_steps=100000, global_epsilon_annealing=True,
                 report_frequency=1):
        MOBaseLearner.__init__(self, network.get_config().get_num_of_objectives())
        threading.Thread.__init__(self)

        range, is_range = environment.get_action_space().get_range()
        if not is_range:
            raise ValueError("Does not support this type of action space")

        self.using_e_greedy = using_e_greedy
        if using_e_greedy:
            end_rand = np.random.choice(epsilon_annealing_choices,
                                        p=epsilon_annealing_probabilities)
            self.epsilon_annealer = Annealer(epsilon_annealing_start, end_rand,
                                             epsilon_annealing_steps)
        self.current_epsilon = epsilon_annealing_start

        self.step_count = 0
        self.eps_count = 0
        self.environment = environment
        self.reward_clip_vals = reward_clip_vals
        self.name = name
        self.agent = agent
        self.num_actions = len(range)
        self.network = network
        self.config = network.network_config
        self.history_length = self.config.get_history_length()
        if self.history_length > 1:
            self.frame_buffer = StateBuffer([1] + self.config.get_input_shape())
        self.async_update_step = async_update_steps
        self.global_dict = global_dict
        self.global_epsilon_annealing = global_epsilon_annealing
        self.report_frequency = report_frequency
        self.minibatch_vars = {}
        self.reset_minibatch()
        self.testing = False

    def reset(self):
        self.testing = self.agent.is_testing_mode
        self.reset_minibatch()
        self.network.reset_network()
        if self.history_length > 1:
            self.frame_buffer.reset()
        state = self.environment.get_state()
        if self.history_length > 1:
            for _ in range(self.history_length):
                self.frame_buffer.add_state(state)

    def run(self):
        while not self.global_dict['done']:
            reward = self.run_episode(self.environment)
            self.eps_count += 1
            # self.global_dict['add_reward'](reward, self.environment.get_current_steps())

            if self.eps_count % self.report_frequency == 0:
                current_epsilon = ''
                if self.using_e_greedy:
                    current_epsilon = 'Current epsilon: {0}'.format(self.current_epsilon)
                print(self.name, 'Episode Count:', self.eps_count, 'Episode reward:', reward,
                      'Steps:', self.environment.get_current_steps(),
                      'Step count:', self.step_count, current_epsilon)

                # Testing purpose
                backup_epsilon = self.current_epsilon
                self.current_epsilon = 0
                greedy_reward = self.run_episode(self.environment)
                print("Greedy reward:", greedy_reward)
                self.global_dict['add_reward'](greedy_reward,
                                               self.environment.get_current_steps())
                self.current_epsilon = backup_epsilon

    def update(self, *args, **kwargs):
        return NotImplemented

    def anneal_epsilon(self):
        if self.using_e_greedy:
            anneal_step = self.global_dict['counter'] \
                if self.global_epsilon_annealing else self.step_count
            self.current_epsilon = self.epsilon_annealer.anneal_to(anneal_step)

    def get_action(self, state):
        if self.using_e_greedy:
            if np.random.uniform(0, 1) <= self.current_epsilon:
                e_greedy = np.random.randint(self.num_actions)
                return e_greedy
            else:
                if self.history_length > 1:
                    return self.network.get_output(
                        self.frame_buffer.get_buffer_add_state(state))
                else:
                    return self.network.get_output(state)
        else:
            if self.history_length > 1:
                return self.network.get_output(self.frame_buffer.get_buffer_add_state(state))
            else:
                return self.network.get_output(state)

    def reset_minibatch(self):
        pass
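# Illustrative sketch (assumed behaviour of StateBuffer, not the library class): the learners
# above treat it as a rolling stack of the most recent `history_length` frames, where
# get_buffer_add_state(s) returns the stack as if `s` had just been appended, without mutating
# the buffer. The real class apparently also carries a leading batch dimension (StateBuffer is
# built with [1] + input_shape and the result is indexed with [0]); the sketch below omits that.
from collections import deque

import numpy as np


class _FrameStackSketch:
    def __init__(self, history_length):
        self.history_length = history_length
        self.frames = deque(maxlen=history_length)

    def reset(self):
        self.frames.clear()

    def add_state(self, state):
        self.frames.append(np.asarray(state))

    def get_buffer(self):
        # Stack along a new leading axis: shape (history_length, *frame_shape)
        return np.stack(self.frames)

    def get_buffer_add_state(self, state):
        # Peek at the stack that would result from appending `state`, without mutating it
        frames = (list(self.frames) + [np.asarray(state)])[-self.history_length:]
        return np.stack(frames)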
class DQNLearner(Learner):
    def __init__(self, agent, name, environment, network, global_dict, report_frequency,
                 batch_size=32, warmup_steps=50000, training_frequency=4,
                 experience_replay_size=2**19, epsilon_annealing_start=1,
                 epsilon_annealing_end=0.1, epsilon_annealing_steps=1e6,
                 reward_clip_thresholds=(-1, 1)):
        super().__init__(agent=agent, name=name, environment=environment, network=network,
                         global_dict=global_dict, report_frequency=report_frequency)

        global experience_replay
        with global_dict[AgentMonitor.Q_LOCK]:
            if experience_replay is None:
                experience_replay = SyncExperienceReplay(
                    experience_replay_size,
                    state_history=network.network_config.get_history_length())
        self.replay = experience_replay

        self.batch_size = batch_size
        self.warmup_steps = warmup_steps
        self.training_frequency = training_frequency
        self.reward_clip_thresholds = reward_clip_thresholds
        self.epsilon_annealer = Annealer(epsilon_annealing_start, epsilon_annealing_end,
                                         epsilon_annealing_steps)
        self.current_learning_rate = network.get_config().get_initial_learning_rate()
        self.current_epsilon = epsilon_annealing_start

    @staticmethod
    def get_default_number_of_learners():
        return 1

    def get_action(self, state):
        probs = self.get_probs(state)
        if self.current_epsilon is not None:
            if np.random.uniform(0, 1) < self.current_epsilon:
                return np.random.randint(0, len(probs))
            else:
                return np.argmax(probs)
        else:
            return np.argmax(probs)

    def report(self, reward):
        print(self.name, 'Episode Count:', self.eps_count, 'Episode reward:', reward,
              'Steps:', self.environment.get_current_steps(), 'Step count:', self.step_count,
              'Learning rate:', self.global_dict[AgentMonitor.Q_LEARNING_RATE],
              'Epsilon:', self.current_epsilon)

    def update(self, state, action, reward, next_state, terminal):
        if self.history_length > 1:
            self.frame_buffer.add_state(state)

        if self.reward_clip_thresholds is not None:
            reward = np.clip(reward, self.reward_clip_thresholds[0],
                             self.reward_clip_thresholds[1])

        if not self.testing:
            if self.history_length > 1:
                current_s = self.frame_buffer.get_buffer()[0]
                next_s = self.frame_buffer.get_buffer_add_state(next_state)[0]
            else:
                current_s = state
                next_s = next_state
            self.replay.append(current_s, action, reward, next_s, terminal)

        self.step_count += 1
        self.global_dict['counter'] += 1

        if self.step_count < self.warmup_steps:
            return

        if not self.testing:
            if self.step_count % self.training_frequency == 0:
                logging = self.global_dict[AgentMonitor.Q_LOGGING]

                s, a, r, n, t = self.replay.get_mini_batch(batch_size=self.batch_size)
                self.data_dict['states'] = s
                self.data_dict['actions'] = a
                self.data_dict['rewards'] = r
                self.data_dict['next_states'] = n
                self.data_dict['terminals'] = t
                self.data_dict['learning_rate'] = self.current_learning_rate
                self.data_dict['global_step'] = self.global_dict[AgentMonitor.Q_GLOBAL_STEPS]

                if logging:
                    self.global_dict[AgentMonitor.Q_LOGGING] = False
                    self.data_dict['logging'] = True
                    summary = self.network.train_network(self.data_dict)
                    self.global_dict[AgentMonitor.Q_WRITER]. \
                        add_summary(summary,
                                    global_step=self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                else:
                    self.data_dict['logging'] = False
                    self.network.train_network(self.data_dict)

            self.current_epsilon = self.epsilon_annealer.anneal(
                self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
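# Illustrative sketch (assumed interface of SyncExperienceReplay, not the library class):
# DQNLearner above only relies on append(...) storing a transition and get_mini_batch(...)
# returning uniformly sampled (states, actions, rewards, next_states, terminals) arrays. A
# minimal single-threaded version under that assumption; the real buffer is shared between
# learners and created under AgentMonitor.Q_LOCK, so it is presumably synchronised internally.
import random
from collections import deque

import numpy as np


class _UniformReplaySketch:
    def __init__(self, size):
        self.buffer = deque(maxlen=size)

    def append(self, state, action, reward, next_state, terminal):
        self.buffer.append((state, action, reward, next_state, terminal))

    def get_mini_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, terminals = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, terminals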