class DQNAdaptive(object):
    """Adaptive which uses the DQN algorithm"""

    def __init__(self, name, choices, network_config, reinforce_config, log=True):
        super(DQNAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency
        self.replay_memory = Memory(self.reinforce_config.memory_size)
        self.learning = True
        self.explanation = False
        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.current_reward = 0
        self.total_reward = 0
        self.log = log

        if self.log:
            self.summary = SummaryWriter()

        self.target_model = DQNModel(self.name + "_target", self.network_config)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config)

        self.episode = 0

    def __del__(self):
        pass

    def should_explore(self):
        epsilon = np.max([
            0.1,
            self.reinforce_config.starting_epsilon *
            (self.reinforce_config.decay_rate **
             (self.steps / self.reinforce_config.decay_steps))
        ])

        if self.log:
            self.summary.add_scalar(tag='epsilon',
                                    scalar_value=epsilon,
                                    global_step=self.steps)

        return np.random.choice([True, False], p=[epsilon, 1 - epsilon])

    def predict(self, state):
        self.steps += 1
        saliencies = []

        # Add the previous transition to the replay memory.
        if self.previous_state is not None:
            experience = Experience(self.previous_state, self.previous_action,
                                    self.current_reward, state)
            self.replay_memory.add(experience)

        if self.learning and self.should_explore():
            action = np.random.choice(len(self.choices))
            q_values = [None] * len(self.choices)  # TODO should it be output shape or from choices?
            choice = self.choices[action]
        else:
            _state = Variable(torch.Tensor(state)).unsqueeze(0)
            q_values = self.eval_model.predict(_state)
            q_values = q_values.data.numpy()[0]
            action = np.argmax(q_values)
            choice = self.choices[action]

            if self.explanation:
                eb.use_eb(True)
                # Generate a saliency map for every possible choice by
                # back-propagating excitation from a one-hot output target.
                for output_index in range(len(self.choices)):
                    prob_outputs = Variable(torch.zeros((len(self.choices),)))
                    prob_outputs[output_index] = 1
                    saliency = eb.excitation_backprop(self.eval_model.model,
                                                      _state,
                                                      prob_outputs,
                                                      contrastive=False)
                    saliency = np.squeeze(
                        saliency.view(*_state.shape).data.numpy())
                    saliencies.append(saliency)

        if self.learning and self.steps % self.update_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        self.update()

        self.current_reward = 0
        self.previous_state = state
        self.previous_action = action

        return choice, q_values, saliencies

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.eval_model.save_network()
        self.target_model.save_network()
        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        logger.info("End of Episode %d with total reward %d" %
                    (self.episode + 1, self.total_reward))

        self.episode += 1

        if self.log:
            self.summary.add_scalar(tag='%s agent reward' % self.name,
                                    scalar_value=self.total_reward,
                                    global_step=self.episode)

        experience = Experience(self.previous_state, self.previous_action,
                                self.current_reward, state, True)
        self.replay_memory.add(experience)

        self.current_reward = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None

        if self.replay_memory.current_size > 30:
            self.update()

    def reward(self, r):
        self.total_reward += r
        self.current_reward += r

    def update(self):
        if self.replay_memory.current_size < self.reinforce_config.batch_size:
            return

        batch = self.replay_memory.sample(self.reinforce_config.batch_size)

        states = [experience.state for experience in batch]
        next_states = [experience.next_state for experience in batch]

        states = Variable(torch.Tensor(states))
        next_states = Variable(torch.Tensor(next_states))

        is_terminal = [0 if experience.is_terminal else 1 for experience in batch]
        actions = [experience.action for experience in batch]
        reward = [experience.reward for experience in batch]

        q_next = self.target_model.predict(next_states)
        q_max = torch.max(q_next, dim=1)[0].data.numpy()
        # Zero the bootstrap term for terminal transitions (mask is 0 there).
        q_max = np.array([a * b if a == 0 else b for a, b in zip(is_terminal, q_max)])

        q_predict = self.eval_model.predict(states)
        q_target = q_predict.data.numpy()

        batch_index = np.arange(self.reinforce_config.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = reward + self.reinforce_config.discount_factor * q_max

        q_target = Variable(torch.Tensor(q_target))

        self.eval_model.fit(states, q_target, self.steps)
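
# A minimal usage sketch for DQNAdaptive, not taken from this repository. It
# assumes an already-constructed agent and a Gym-style environment; only the
# predict -> reward -> end_episode protocol comes from the class above.
def run_dqn_episodes(agent, env, num_episodes=100):
    """Drive a Gym-style `env` with a DQNAdaptive `agent`."""
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            # predict() returns the chosen item from `choices`, the Q-values,
            # and any saliency maps when explanations are enabled.
            choice, q_values, saliencies = agent.predict(state)
            state, step_reward, done, _ = env.step(choice)
            agent.reward(step_reward)
        # end_episode() stores the terminal transition and logs the episode reward.
        agent.end_episode(state)
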
class PGAdaptive(object):
    """PGAdaptive using Vanilla Policy Gradient"""

    def __init__(self, name, choices, network_config, reinforce_config):
        super(PGAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency
        self.replay_memory = Memory(self.reinforce_config.batch_size)
        self.learning = True  # referenced by end_episode(); mirrors the other adaptives
        self.steps = 0
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None
        self.clear_rewards()

        self.model = ActorModel(self.name + "_actor", self.network_config)
        self.summary = SummaryWriter(
            log_dir=self.reinforce_config.summaries_path + "/" + self.name)

        self.episode = 0
        self.epsilon_schedule = LinearSchedule(
            10 * 1000,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=0.1)

    def __del__(self):
        self.summary.close()

    def predict(self, state):
        self.steps += 1

        if self.previous_state is not None and self.previous_action is not None:
            self.replay_memory.add((self.previous_state, self.previous_action,
                                    self.current_reward, state, False))

        _state = Variable(torch.Tensor(state)).unsqueeze(0)
        action_probs = self.model.predict(_state)

        # TODO continuous action
        m = Categorical(action_probs)
        action = m.sample()
        choice = self.choices[action]

        self.update()
        self.clear_rewards()

        self.previous_state = state
        self.previous_action = action

        return choice, action_probs

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.model.save_network()
        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        logger.info("End of Episode %d with total reward %.2f" %
                    (self.episode + 1, self.total_reward))

        self.episode += 1
        self.summary.add_scalar(tag='%s agent reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)

        self.replay_memory.add((self.previous_state, self.previous_action,
                                self.current_reward, state, True))

        self.clear_rewards()
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None

        self.update()

    def clear_rewards(self):
        self.current_reward = 0

    def reward(self, value):
        self.current_reward += value
        self.total_reward += value

    def update(self):
        if self.steps <= self.reinforce_config.batch_size:
            return

        # TODO: this update is unfinished -- it samples with a prioritized-replay
        # style signature and references `beta_schedule`, `q_target` and
        # `td_errors`, none of which are defined in this class.
        states, actions, reward, next_states, is_terminal, weights, batch_idxes = \
            self.replay_memory.sample(self.reinforce_config.batch_size,
                                      self.beta_schedule.value(self.steps))

        states = Variable(torch.Tensor(states))
        next_states = Variable(torch.Tensor(next_states))
        is_terminal = [0 if t else 1 for t in is_terminal]

        self.replay_memory.clear()

        self.model.fit(states, q_target, self.steps)

        return td_errors
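
# PGAdaptive.update() above is left unfinished. For reference, a vanilla
# policy-gradient step needs the discounted return for every step of an
# episode; the helper below is a generic sketch of that computation and is
# not part of PGAdaptive or this repository's API.
def discounted_returns(rewards, gamma=0.99):
    """Return G_t = r_t + gamma * G_{t+1} for a list of per-step rewards."""
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Example: discounted_returns([1, 0, 1], gamma=0.9) -> approximately [1.81, 0.9, 1.0]
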
class A3CAdaptive(object):
    """A3CAdaptive using the Actor-Critic algorithm"""

    def __init__(self, name, choices, network_config, reinforce_config):
        super(A3CAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency
        self.replay_memory = Memory(self.reinforce_config.memory_size)
        self.learning = True
        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.reward_types = len(self.network_config.networks)
        # One accumulator per decomposed reward type (see reward() below).
        self.current_reward = [0] * self.reward_types
        self.total_reward = 0

        self.session = tf.Session()

        self.critic_model = CriticModel(self.name + "_critic", self.network_config, self.session)
        self.actor_model = ActorModel(self.name + "_actor", self.network_config, self.session)

        # TODO:
        # * Add more information/summaries related to reinforcement learning
        # * Option to disable summary?
        clear_summary_path(self.reinforce_config.summaries_path + "/" + self.name)
        self.summaries_writer = tf.summary.FileWriter(
            self.reinforce_config.summaries_path + "/" + self.name,
            graph=self.session.graph)

        self.episode = 0

    def __del__(self):
        self.summaries_writer.close()
        self.session.close()

    def should_explore(self):
        epsilon = np.max([
            0.1,
            self.reinforce_config.starting_epsilon *
            (self.reinforce_config.decay_rate **
             (self.steps / self.reinforce_config.decay_steps))
        ])

        epsilon_summary = tf.Summary()
        epsilon_summary.value.add(tag='epsilon', simple_value=epsilon)
        self.summaries_writer.add_summary(epsilon_summary, self.steps)

        return np.random.choice([True, False], p=[epsilon, 1 - epsilon])

    def predict(self, state):
        self.steps += 1

        if self.learning:
            # TODO add noise when learning is True
            actor_prob = self.actor_model.predict(state)
            critic_values = self.critic_model.predict(state)
            action = np.random.choice(range(len(self.choices)), p=actor_prob)
            choice = self.choices[action]
        else:
            actor_prob = self.actor_model.predict(state)
            critic_values = self.critic_model.predict(state)
            action = np.random.choice(range(len(self.choices)), p=actor_prob)
            choice = self.choices[action]

        # Add the previous transition to the replay memory.
        if self.previous_state is not None and self.previous_action is not None:
            experience = Experience(self.previous_state, self.previous_action,
                                    self.current_reward, state)
            self.replay_memory.add(experience)

        # TODO
        # if self.learning and self.steps % self.update_frequency == 0:
        #     logger.debug("Replacing target model for %s" % self.name)
        #     self.target_model.replace(self.eval_model)

        self.update()

        self.current_reward = [0] * self.reward_types
        self.previous_state = state
        self.previous_action = action

        return choice, actor_prob, critic_values

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.actor_model.save_network()
        self.critic_model.save_network()
        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        logger.info("End of Episode %d with total reward %d" %
                    (self.episode + 1, self.total_reward))

        self.episode += 1

        reward_summary = tf.Summary()
        reward_summary.value.add(tag='%s agent reward' % self.name,
                                 simple_value=self.total_reward)
        self.summaries_writer.add_summary(reward_summary, self.episode)

        experience = Experience(self.previous_state, self.previous_action,
                                self.current_reward, state, is_terminal=True)
        self.replay_memory.add(experience)

        self.current_reward = [0] * self.reward_types
        self.total_reward = 0
        self.previous_state = None
        self.previous_action = None

        self.update()

    def reward(self, decomposed_rewards):
        # `decomposed_rewards` is assumed to be a sequence with one entry per
        # reward type (one per network head).
        self.total_reward += sum(decomposed_rewards)
        for i in range(self.reward_types):
            self.current_reward[i] += decomposed_rewards[i]

    def update_critic(self, batch):
        # TODO: Convert to tensor operations instead of for loops
        states = [experience.state for experience in batch]
        next_states = [experience.next_state for experience in batch]
        is_terminal = np.array([0 if experience.is_terminal else 1 for experience in batch])
        actions = [experience.action for experience in batch]
        reward = np.array([experience.reward for experience in batch])

        v_next = self.critic_model.predict_batch(next_states)
        # Zero the bootstrap value for terminal transitions.
        v_next = is_terminal.reshape(self.reinforce_config.batch_size, 1) * v_next

        v_current = self.critic_model.predict_batch(states)
        v_target = reward.reshape(self.reinforce_config.batch_size, -1) + \
            self.reinforce_config.discount_factor * v_next

        self.critic_model.fit(states, v_target, self.steps)

    def update_actor(self, batch):
        states = [experience.state for experience in batch]
        is_terminal = np.array([0 if experience.is_terminal else 1 for experience in batch])
        actions = np.array([experience.action for experience in batch]).reshape(
            self.reinforce_config.batch_size, 1)

        v_current = is_terminal.reshape(self.reinforce_config.batch_size, 1) * \
            self.critic_model.predict_batch(states)

        self.actor_model.fit(states, actions, v_current, self.steps)

    def update(self):
        if self.replay_memory.current_size < self.reinforce_config.batch_size:
            return

        batch = self.replay_memory.sample(self.reinforce_config.batch_size)

        self.update_critic(batch)
        self.update_actor(batch)
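
# Hypothetical driver for A3CAdaptive, not from this repository. The only
# protocol assumed is what the class exposes: predict() returns
# (choice, actor_prob, critic_values), reward() takes one value per reward
# type, and end_episode() closes out the episode. `env` and the
# "decomposed_rewards" entry in `info` are placeholders.
def run_a3c_episode(agent, env):
    """Run one episode of a Gym-style `env` with decomposed rewards."""
    state = env.reset()
    done = False
    while not done:
        choice, actor_prob, critic_values = agent.predict(state)
        state, _, done, info = env.step(choice)
        agent.reward(info["decomposed_rewards"])
    agent.end_episode(state)
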
class QPredictor(object):
    """Predictors are equivalent to General Value Functions (GVFs)."""

    # TODO
    # * discount factor: how should it be chosen?
    # * batch size: should it match the agent's?
    # * save predictor

    def __init__(self, name, network_config, discount_factor=0.99, batch_size=32):
        super(QPredictor, self).__init__()
        self.name = name
        self.session = tf.Session()
        self.eval_model = DQNModel(name + "_eval", network_config, self.session)
        self.target_model = DQNModel(name + "_target", network_config, self.session)
        self.previous_state = None
        self.replay_memory = Memory(5000)
        self.discount_factor = discount_factor
        self.update_frequency = 1000
        self.batch_size = batch_size
        self.steps = 0

    def __del__(self):
        self.eval_model.save_network()
        self.target_model.save_network()
        self.session.close()

    def learn(self, current_state, action, reward, is_terminal, terminal_reward):
        self.steps += 1

        if is_terminal:
            reward = terminal_reward

        if action is None:
            action = 0

        if self.previous_state is not None:
            experience = Experience(self.previous_state, action, reward,
                                    current_state, is_terminal)
            self.replay_memory.add(experience)

        if self.steps % self.update_frequency == 0:
            logger.info("Predictor -- Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        self.previous_state = current_state

        self.update()

    def update(self):
        if self.replay_memory.current_size < self.batch_size:
            return

        batch = self.replay_memory.sample(self.batch_size)

        # TODO: Convert to tensor operations instead of for loops
        states = [experience.state for experience in batch]
        next_states = [experience.next_state for experience in batch]
        is_terminal = [0 if experience.is_terminal else 1 for experience in batch]
        actions = [experience.action for experience in batch]
        reward = [experience.reward for experience in batch]

        q_next = self.target_model.predict_batch(next_states)
        # Bootstrap from the mean of the next-state action values and zero it
        # out for terminal transitions.
        q_mean = np.mean(q_next, axis=1)
        q_mean = np.array([a * b if a == 0 else b for a, b in zip(is_terminal, q_mean)])

        q_values = self.eval_model.predict_batch(states)
        q_target = q_values.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = reward + self.discount_factor * q_mean

        self.eval_model.fit(states, q_target, self.steps)

    def predict(self, state):
        action, q_values = self.eval_model.predict(state)
        return q_values
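
# A small driver sketch for QPredictor, not from this repository. It assumes a
# Gym-style `env` and a callable `cumulant_fn(state)` returning the signal the
# GVF should track; both are placeholders for whatever the surrounding project
# provides.
def run_predictor(predictor, env, cumulant_fn, num_steps=1000, terminal_value=0.0):
    """Feed transitions into the predictor and return its latest estimates."""
    state = env.reset()
    estimates = None
    for _ in range(num_steps):
        action = env.action_space.sample()  # random behaviour policy
        next_state, _, done, _ = env.step(action)
        # learn() tracks the previous state internally, so each call only needs
        # the state reached, the action taken, the cumulant, the terminal flag,
        # and the terminal value.
        predictor.learn(next_state, action, cumulant_fn(state), done, terminal_value)
        estimates = predictor.predict(next_state)
        state = env.reset() if done else next_state
    return estimates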