Example #1
    def __init__(self,
                 name,
                 choices,
                 network_config,
                 reinforce_config,
                 log=True):
        super(DQNAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = Memory(self.reinforce_config.memory_size)
        self.learning = True
        self.explanation = False

        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.current_reward = 0
        self.total_reward = 0
        self.log = log
        if self.log:
            self.summary = SummaryWriter()

        self.target_model = DQNModel(self.name + "_target",
                                     self.network_config)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config)

        self.episode = 0
Example #2
    def __init__(self, name, choices, network_config, reinforce_config):
        super(PGAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = Memory(self.reinforce_config.batch_size)

        self.steps = 0
        self.total_reward = 0

        self.previous_state = None
        self.previous_action = None
        self.clear_rewards()

        self.model = ActorModel(self.name + "_actor", self.network_config)
        self.summary = SummaryWriter(
            log_dir=self.reinforce_config.summaries_path + "/" + self.name)

        self.episode = 0
        self.epsilon_schedule = LinearSchedule(
            10 * 1000,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=0.1)
Example #3
    def __init__(self, name, choices, network_config, reinforce_config):
        super(A3CAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = Memory(self.reinforce_config.memory_size)
        self.learning = True

        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.reward_types = len(self.network_config.networks)
        self.current_reward = 0
        self.total_reward = 0
        self.session = tf.Session()

        self.critic_model = CriticModel(self.name + "_critic", self.network_config, self.session)
        self.actor_model = ActorModel(self.name + "_actor", self.network_config, self.session)

        #TODO:
        # * Add more information/summaries related to reinforcement learning
        # * Option to disable summary?
        clear_summary_path(self.reinforce_config.summaries_path + "/" + self.name)

        self.summaries_writer = tf.summary.FileWriter(self.reinforce_config.summaries_path + "/" + self.name, graph = self.session.graph)

        self.episode = 0
Example #4
    def __init__(self, name, network_config, discount_factor = 0.99, batch_size = 32):
        super(QPredictor, self).__init__()
        self.name = name
        self.session = tf.Session()

        self.eval_model = DQNModel(name + "_eval", network_config, self.session)
        self.target_model = DQNModel(name + "_target", network_config, self.session)

        self.previous_state = None
        self.replay_memory = Memory(5000)
        self.discount_factor = discount_factor
        self.update_frequency = 1000
        self.batch_size = batch_size
        self.steps = 0
Example #5
class DQNAdaptive(object):
    """Adaptive which uses the  DQN algorithm"""
    def __init__(self,
                 name,
                 choices,
                 network_config,
                 reinforce_config,
                 log=True):
        super(DQNAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = Memory(self.reinforce_config.memory_size)
        self.learning = True
        self.explanation = False

        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.current_reward = 0
        self.total_reward = 0
        self.log = log
        if self.log:
            self.summary = SummaryWriter()

        self.target_model = DQNModel(self.name + "_target",
                                     self.network_config)
        self.eval_model = DQNModel(self.name + "_eval", self.network_config)

        self.episode = 0

    def __del__(self):
        pass

    def should_explore(self):
        epsilon = np.max([
            0.1, self.reinforce_config.starting_epsilon *
            (self.reinforce_config.decay_rate
             **(self.steps / self.reinforce_config.decay_steps))
        ])
        if self.log:
            self.summary.add_scalar(tag='epsilon',
                                    scalar_value=epsilon,
                                    global_step=self.steps)
        return np.random.choice([True, False], p=[epsilon, 1 - epsilon])

    def predict(self, state):
        self.steps += 1
        saliencies = []

        # add to experience
        if self.previous_state is not None:
            experience = Experience(self.previous_state, self.previous_action,
                                    self.current_reward, state)
            self.replay_memory.add(experience)

        _state = Variable(torch.Tensor(state)).unsqueeze(0)

        if self.learning and self.should_explore():
            action = np.random.choice(len(self.choices))
            q_values = [None] * len(
                self.choices
            )  # TODO should it be output shape or from choices?
            choice = self.choices[action]
        else:
            q_values = self.eval_model.predict(_state)
            q_values = q_values.data.numpy()[0]
            action = np.argmax(q_values)
            choice = self.choices[action]

        if self.explanation:
            eb.use_eb(True)
            for explained_action in range(len(self.choices)):
                # Build a fresh one-hot output selecting the action to explain,
                # without clobbering the chosen `action` above.
                prob_outputs = Variable(torch.zeros((len(self.choices), )))
                prob_outputs[explained_action] = 1
                saliency = eb.excitation_backprop(self.eval_model.model,
                                                  _state,
                                                  prob_outputs,
                                                  contrastive=False)
                saliency = np.squeeze(
                    saliency.view(*_state.shape).data.numpy())
                saliencies.append(saliency)

        if self.learning and self.steps % self.update_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        self.update()

        self.current_reward = 0

        self.previous_state = state
        self.previous_action = action

        return choice, q_values, saliencies

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.eval_model.save_network()
        self.target_model.save_network()

        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        logger.info("End of Episode %d with total reward %d" %
                    (self.episode + 1, self.total_reward))

        self.episode += 1
        if self.log:
            self.summary.add_scalar(tag='%s agent reward' % self.name,
                                    scalar_value=self.total_reward,
                                    global_step=self.episode)
        experience = Experience(self.previous_state, self.previous_action,
                                self.current_reward, state, True)
        self.replay_memory.add(experience)

        self.current_reward = 0
        self.total_reward = 0

        self.previous_state = None
        self.previous_action = None

        if self.replay_memory.current_size > 30:
            self.update()

    def reward(self, r):
        self.total_reward += r
        self.current_reward += r

    def update(self):
        if self.replay_memory.current_size < self.reinforce_config.batch_size:
            return

        batch = self.replay_memory.sample(self.reinforce_config.batch_size)

        states = [experience.state for experience in batch]
        next_states = [experience.next_state for experience in batch]

        states = Variable(torch.Tensor(states))
        next_states = Variable(torch.Tensor(next_states))

        is_terminal = [
            0 if experience.is_terminal else 1 for experience in batch
        ]

        actions = [experience.action for experience in batch]
        reward = [experience.reward for experience in batch]

        q_next = self.target_model.predict(next_states)
        q_max = torch.max(q_next, dim=1)[0].data.numpy()
        q_max = np.array(
            [a * b if a == 0 else b for a, b in zip(is_terminal, q_max)])

        q_predict = self.eval_model.predict(states)

        q_target = q_predict.data.numpy()
        batch_index = np.arange(self.reinforce_config.batch_size,
                                dtype=np.int32)
        q_target[
            batch_index,
            actions] = reward + self.reinforce_config.discount_factor * q_max
        q_target = Variable(torch.Tensor(q_target))

        self.eval_model.fit(states, q_target, self.steps)
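
A minimal driving-loop sketch for the DQNAdaptive class above. The environment, the episode count, and the two config objects are placeholders assumed for illustration; only the agent methods used here (predict, reward, end_episode, disable_learning) come from the class itself.

# Hypothetical setup: `env`, `network_config`, and `reinforce_config` stand in
# for objects the surrounding project would provide.
agent = DQNAdaptive(name="pilot",
                    choices=[0, 1, 2],              # assumed action labels
                    network_config=network_config,
                    reinforce_config=reinforce_config)

for _ in range(100):                                # assumed episode count
    state = env.reset()
    done = False
    while not done:
        choice, q_values, saliencies = agent.predict(state)
        state, reward, done, _ = env.step(choice)
        agent.reward(reward)
    agent.end_episode(state)

agent.disable_learning()                            # saves both networks and stops training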
Example #6
class PGAdaptive(object):
    """PGAdaptive using Vanilla Policy Gradient"""
    def __init__(self, name, choices, network_config, reinforce_config):
        super(PGAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = Memory(self.reinforce_config.batch_size)
        self.learning = True

        self.steps = 0
        self.total_reward = 0

        self.previous_state = None
        self.previous_action = None
        self.clear_rewards()

        self.model = ActorModel(self.name + "_actor", self.network_config)
        self.summary = SummaryWriter(
            log_dir=self.reinforce_config.summaries_path + "/" + self.name)

        self.episode = 0
        self.epsilon_schedule = LinearSchedule(
            10 * 1000,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=0.1)

    def __del__(self):
        self.summary.close()

    def predict(self, state):
        self.steps += 1

        if self.previous_state is not None and self.previous_action is not None:
            self.replay_memory.add((self.previous_state, self.previous_action,
                                    self.current_reward, state, False))

        _state = Variable(torch.Tensor(state)).unsqueeze(0)
        action_probs = self.model.predict(_state)

        #TODO continuous action
        m = Categorical(action_probs)
        action = m.sample()

        choice = self.choices[action]

        self.update()

        self.clear_rewards()

        self.previous_state = state
        self.previous_action = action

        return choice, action_probs

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.model.save_network()

        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        logger.info("End of Episode %d with total reward %.2f" %
                    (self.episode + 1, self.total_reward))

        self.episode += 1

        self.summary.add_scalar(tag='%s agent reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)

        self.replay_memory.add((self.previous_state, self.previous_action,
                                self.current_reward, state, True))

        self.clear_rewards()
        self.total_reward = 0

        self.previous_state = None
        self.previous_action = None

        self.update()

    def clear_rewards(self):
        self.current_reward = 0

    def reward(self, value):
        self.current_reward += value
        self.total_reward += value

    def update(self):
        if self.replay_memory.current_size < self.reinforce_config.batch_size:
            return

        # Assumes Memory.sample returns a list of the (state, action, reward,
        # next_state, is_terminal) tuples stored in predict/end_episode.
        batch = self.replay_memory.sample(self.reinforce_config.batch_size)
        states, actions, rewards, next_states, is_terminal = zip(*batch)

        states = Variable(torch.Tensor(states))

        # Discounted returns, reset at episode boundaries.
        returns = []
        running_return = 0
        for reward, terminal in zip(reversed(rewards), reversed(is_terminal)):
            if terminal:
                running_return = 0
            running_return = reward + self.reinforce_config.discount_factor * running_return
            returns.insert(0, running_return)
        returns = Variable(torch.Tensor(returns))

        self.replay_memory.clear()

        # Assumes ActorModel.fit takes (states, actions, returns, step),
        # mirroring the actor update in the A3C example below.
        self.model.fit(states, actions, returns, self.steps)
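
The action selection in predict above relies on torch.distributions.Categorical. A small self-contained check of that sampling step, with made-up probabilities and choice labels:

import torch
from torch.distributions import Categorical

action_probs = torch.tensor([0.2, 0.5, 0.3])        # made-up policy output for 3 choices
choices = ["left", "stay", "right"]                 # made-up choice labels

m = Categorical(action_probs)
action = m.sample()                                 # 0-dim integer tensor
print(choices[action.item()], m.log_prob(action))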
Example #7
class A3CAdaptive(object):
    """A3CAdaptive using Actor Critic Algorithm"""
    def __init__(self, name, choices, network_config, reinforce_config):
        super(A3CAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.update_frequency = reinforce_config.update_frequency

        self.replay_memory = Memory(self.reinforce_config.memory_size)
        self.learning = True

        self.steps = 0
        self.previous_state = None
        self.previous_action = None
        self.reward_types = len(self.network_config.networks)
        self.current_reward = 0
        self.total_reward = 0
        self.session = tf.Session()

        self.critic_model = CriticModel(self.name + "_critic", self.network_config, self.session)
        self.actor_model = ActorModel(self.name + "_actor", self.network_config, self.session)

        #TODO:
        # * Add more information/summaries related to reinforcement learning
        # * Option to disable summary?
        clear_summary_path(self.reinforce_config.summaries_path + "/" + self.name)

        self.summaries_writer = tf.summary.FileWriter(self.reinforce_config.summaries_path + "/" + self.name, graph = self.session.graph)

        self.episode = 0

    def __del__(self):
        self.summaries_writer.close()
        self.session.close()

    def should_explore(self):
        epsilon = np.max([0.1, self.reinforce_config.starting_epsilon * (self.reinforce_config.decay_rate ** (self.steps / self.reinforce_config.decay_steps))])

        epsilon_summary = tf.Summary()
        epsilon_summary.value.add(tag='epsilon', simple_value = epsilon)
        self.summaries_writer.add_summary(epsilon_summary, self.steps)

        return np.random.choice([True, False],  p = [epsilon, 1 - epsilon])


    def predict(self, state):
        self.steps += 1

        # TODO add exploration noise when learning is True
        actor_prob = self.actor_model.predict(state)
        critic_values = self.critic_model.predict(state)
        action = np.random.choice(range(len(self.choices)), p = actor_prob)
        choice = self.choices[action]

        # add to experience
        if self.previous_state is not None and self.previous_action is not None:
            experience = Experience(self.previous_state, self.previous_action, self.current_reward, state)
            self.replay_memory.add(experience)

        # TODO
        # if self.learning and self.steps % self.update_frequency == 0:
        #     logger.debug("Replacing target model for %s" % self.name)
        #     self.target_model.replace(self.eval_model)

        self.update()

        self.current_reward = 0

        self.previous_state = state
        self.previous_action = action

        return choice, actor_prob, critic_values

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.actor_model.save_network()
        self.critic_model.save_network()

        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        logger.info("End of Episode %d with total reward %d" % (self.episode + 1, self.total_reward))

        self.episode += 1

        reward_summary = tf.Summary()
        reward_summary.value.add(tag='%s agent reward' % self.name, simple_value = self.total_reward)
        self.summaries_writer.add_summary(reward_summary, self.episode)

        experience = Experience(self.previous_state, self.previous_action, self.current_reward, state,  is_terminal = True)
        self.replay_memory.add(experience)

        self.current_reward = 0
        self.total_reward = 0

        self.previous_state = None
        self.previous_action = None

        self.update()

    def reward(self, reward):
        # TODO support decomposed rewards (one value per network/reward type)
        self.total_reward += reward
        self.current_reward += reward


    def update_critic(self, batch):
        # TODO: Convert to tensor operations instead of for loops

        states = [experience.state for experience in batch]

        next_states = [experience.next_state for experience in batch]

        is_terminal = np.array([ 0 if experience.is_terminal else 1 for experience in batch])

        actions = [experience.action for experience in batch]

        reward = np.array([experience.reward for experience in batch])

        v_next = self.critic_model.predict_batch(next_states)

        v_next = is_terminal.reshape(self.reinforce_config.batch_size, 1) * v_next

        v_current = self.critic_model.predict_batch(states)

        v_target = reward.reshape(self.reinforce_config.batch_size, 1) + self.reinforce_config.discount_factor * v_next

        self.critic_model.fit(states, v_target, self.steps)


    def update_actor(self, batch):
        states = [experience.state for experience in batch]

        is_terminal = np.array([ 0 if experience.is_terminal else 1 for experience in batch])

        actions = np.array([experience.action for experience in batch]).reshape(self.reinforce_config.batch_size, 1)

        v_current = is_terminal.reshape(self.reinforce_config.batch_size, 1) * self.critic_model.predict_batch(states)

        self.actor_model.fit(states, actions, v_current, self.steps)


    def update(self):
        if self.replay_memory.current_size < self.reinforce_config.batch_size:
            return

        batch = self.replay_memory.sample(self.reinforce_config.batch_size)

        self.update_critic(batch)

        self.update_actor(batch)
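
update_critic above bootstraps a one-step target, v_target = reward + discount_factor * v_next, with terminal transitions masked out. A small numeric sketch of that computation with made-up values:

import numpy as np

discount_factor = 0.95                              # assumed value
reward = np.array([1.0, 0.0, 2.0]).reshape(3, 1)
v_next = np.array([[0.5], [1.2], [0.4]])            # critic estimates for the next states
not_terminal = np.array([1, 1, 0]).reshape(3, 1)    # 0 masks the terminal transition

v_next = not_terminal * v_next                      # no bootstrapping past the episode end
v_target = reward + discount_factor * v_next
print(v_target)                                     # [[1.475], [1.14], [2.0]]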
Example #8
class QPredictor(object):
    """ Predictor are equivalent to General Value Functions (GVFs) """

    #TODO
    # * discount factor how to decide?
    # * batch size should it be the same?
    # * save predictor

    def __init__(self, name, network_config, discount_factor = 0.99, batch_size = 32):
        super(QPredictor, self).__init__()
        self.name = name
        self.session = tf.Session()

        self.eval_model = DQNModel(name + "_eval", network_config, self.session)
        self.target_model = DQNModel(name + "_target", network_config, self.session)

        self.previous_state = None
        self.replay_memory = Memory(5000)
        self.discount_factor = discount_factor
        self.update_frequency = 1000
        self.batch_size = batch_size
        self.steps = 0

    def __del__(self):
        self.eval_model.save_network()
        self.target_model.save_network()
        self.session.close()

    def learn(self, current_state, action, reward, is_terminal, terminal_reward):
        self.steps += 1

        if is_terminal:
            reward = terminal_reward

        if action is None:
            action = 0

        if self.previous_state is not None:
            experience = Experience(self.previous_state, action, reward, current_state, is_terminal)
            self.replay_memory.add(experience)

        if self.steps % self.update_frequency == 0:
            logger.info("Predictor -- Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        self.previous_state = current_state

        self.update()

    def update(self):
        if self.replay_memory.current_size < self.batch_size:
            return

        batch = self.replay_memory.sample(self.batch_size)

        # TODO: Convert to tensor operations instead of for loops

        states = [experience.state for experience in batch]

        next_states = [experience.next_state for experience in batch]

        is_terminal = [ 0 if experience.is_terminal else 1 for experience in batch]

        actions = [experience.action for experience in batch]

        reward = [experience.reward for experience in batch]

        q_next = self.target_model.predict_batch(next_states)

        q_mean = np.mean(q_next, axis = 1)

        q_mean = np.array([ a * b if a == 0 else b for a,b in zip(is_terminal, q_mean)])

        q_values = self.eval_model.predict_batch(states)

        q_target = q_values.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)

        q_target[batch_index, actions] = reward + self.discount_factor * q_mean

        self.eval_model.fit(states, q_target, self.steps)

    def predict(self, state):
        action, q_values = self.eval_model.predict(state)
        return q_values
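
A sketch of how the QPredictor above might be driven from an agent's transition stream. The trajectory iterable, the predictor name, network_config, and current_state are placeholders assumed for illustration; learn and predict are the methods defined above.

# Hypothetical wiring; every name except QPredictor's own methods is a placeholder.
predictor = QPredictor("enemy_health", network_config)

for state, action, reward, done in trajectory:
    predictor.learn(state, action, reward,
                    is_terminal=done,
                    terminal_reward=reward)

q_estimates = predictor.predict(current_state)      # predicted general value per action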