Example #1

import numpy as np

# Assumes ReplayMemory, the NN wrapper class and the hyperparameter constants
# (MEMORY_CAPACITY, BATCH_SIZE, SIZE_HIDDEN, LEARNING_RATE, ACTIVATION,
#  EPSILON_MIN, EPSILON_MAX, EPSILON_DECAY, TARGET_UPDATE, MAX_STEPS,
#  N_EPISODES, GAMMA, LEARNING_START) are defined elsewhere in the project.
class DQNAgent:
    def __init__(self, environment):
        self.env = environment
        self.memory = ReplayMemory(MEMORY_CAPACITY)
        self.dim_actions = self.env.action_space.n
        self.dim_states = self.env.observation_space.shape
        self.NN = NN(self.env.observation_space.shape, self.env.action_space.n,
                     BATCH_SIZE, SIZE_HIDDEN, LEARNING_RATE, ACTIVATION)
        self.observers = []
        self.episode_count = 0
        self.step_count_total = 1
        self.step_count_episode = 1
        self.epsilon_min = EPSILON_MIN
        self.epsilon_max = EPSILON_MAX
        self.epsilon_decay = EPSILON_DECAY
        self.target_update = TARGET_UPDATE
        self.max_steps = MAX_STEPS
        self.n_episodes = N_EPISODES
        self.epsilon = EPSILON_MAX
        self.batch_size = BATCH_SIZE
        self.usetarget = False
        self.gamma = GAMMA
        self.loss = 0
        self.done = False
        self.reward = 0
        self.reward_episode = 0
        self.learning_switch = False
        self.learning_start = LEARNING_START

    def notify(self, event):
        for observer in self.observers:
            observer(event)

    def act(self, state):
        self.step_count_total += 1
        action = self.choose_action(state)
        return action

    def learn(self, obs):
        self.memory.store(obs)
        if self.learning_switch:
            self.backup()
        self.notify('step_done')

    def backup(self):
        self.flashback()
        if self.step_count_total % self.target_update == 0:
            print('updating target network (epsilon = {})'.format(self.epsilon))
            self.NN.update_target()
            self.usetarget = True

    def flashback(self):
        X, y = self._make_batch()
        self.loss = self.NN.train(X, y)
        if np.isnan(self.loss.history['loss']).any():
            print('Warning, loss is {}'.format(self.loss.history['loss']))

    def choose_action(self, state):
        if np.random.rand() <= self.epsilon:
            choice = self.random_choice()
        else:
            choice = self.greedy_choice(state)
        return choice

    def greedy_choice(self, state):
        greedy_choice = self.NN.best_action(state, usetarget=False)
        return greedy_choice

    def random_choice(self):
        random_choice = np.random.randint(0, self.dim_actions)
        return random_choice

    def _make_batch(self):
        # Build (state, target) training pairs. The online network selects the
        # best next action, while the (optional) target network evaluates it -
        # a Double-DQN style target once usetarget is True.
        X = []
        y = []
        batch = self.memory.get_batch(self.batch_size)
        for state, action, newstate, reward, done in batch:
            X.append(state)
            target = self.NN.predict(state, False)
            q_vals_new_t = self.NN.predict(newstate, self.usetarget)
            a_select = self.NN.best_action(newstate, False)
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * q_vals_new_t[a_select]
            y.append(target)
        return X, y

    def add_observer(self, observer):
        self.observers.append(observer)
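
A minimal training-loop sketch for driving the DQNAgent above. It assumes a
Gym-style CartPole environment with the classic 4-tuple step API; the epsilon
decay rule and the point at which learning_switch is flipped are illustrative
choices, not taken from the original project.

import gym

env = gym.make('CartPole-v1')
agent = DQNAgent(env)

for episode in range(agent.n_episodes):
    state = env.reset()
    for step in range(agent.max_steps):
        action = agent.act(state)
        newstate, reward, done, _ = env.step(action)
        # the agent stores the transition and, once learning is on, trains
        agent.learn((state, action, newstate, reward, done))
        state = newstate

        # start training after an initial exploration phase (assumed convention)
        if agent.step_count_total > agent.learning_start:
            agent.learning_switch = True

        # simple multiplicative epsilon decay towards epsilon_min (assumed)
        agent.epsilon = max(agent.epsilon_min,
                            agent.epsilon * agent.epsilon_decay)
        if done:
            break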

Example #2

import logging

import numpy as np
import tensorflow as tf

# Assumes ReplayMemory, EpsilonDecayer, Qfunc, Normalizer and random_uniform
# are defined elsewhere in the project (TensorFlow 1.x graph-mode code).
class Agent(object):
    """
    The learner and decision maker.
    Based on the DQN algorithm - ref Mnih et. al 2015
    i.e. Q-Learning with experience replay & a target network

    All calls to tensorflow are wrapped into methods.

    Support for environments is currently manually configured.
    """
    def __init__(self,
                 env,
                 discount,
                 tau,
                 sess,
                 total_steps,
                 batch_size,
                 layers,
                 learning_rate,
                 epsilon_decay_fraction=0.5,
                 memory_fraction=0.25,
                 process_observation=False,
                 process_target=False,
                 **kwargs):

        self.env = env
        self.discount = discount
        self.tau = tau
        self.sess = sess
        self.batch_size = batch_size

        #  number of steps where epsilon is decayed from 1.0 to 0.1
        decay_steps = total_steps * epsilon_decay_fraction
        self.epsilon_getter = EpsilonDecayer(decay_steps)

        #  the counter is stepped up every time we act or learn
        self.counter = 0

        if repr(env) == '<TimeLimit<CartPoleEnv<CartPole-v1>>>':
            obs_space_shape = env.observation_space.shape
            #  the shape of the gym Discrete space is the number of actions
            #  not the shape of a single action array
            #  create a tuple to specify the action space
            self.action_space_shape = (1, )
            #  a list of all possible actions
            self.actions = list(range(env.action_space.n))

        elif repr(env) == '<TimeLimit<PendulumEnv<Pendulum-v0>>>':
            #  continuous action support is a work in progress - the code
            #  below is unreachable until this raise is removed
            raise ValueError('Build in progress')
            obs_space_shape = env.observation_space.shape
            self.action_space_shape = env.action_space.shape
            self.actions = np.linspace(env.action_space.low,
                                       env.action_space.high,
                                       num=20,
                                       endpoint=True).tolist()

        elif repr(env) == '<TimeLimit<MountainCarEnv<MountainCar-v0>>>':
            obs_space_shape = env.observation_space.shape
            self.action_space_shape = (1, )
            self.actions = list(range(env.action_space.n))
        else:
            raise ValueError('Environment not supported')

        self.memory = ReplayMemory(obs_space_shape,
                                   self.action_space_shape,
                                   size=int(total_steps * memory_fraction))

        model_config = {
            'input_shape': obs_space_shape,
            'output_shape': (len(self.actions), ),
            'layers': layers,
            'learning_rate': learning_rate
        }

        #  the two approximations of Q(s,a)
        #  use the same config dictionary for both
        self.online = Qfunc(model_config, scope='online')
        self.target = Qfunc(model_config, scope='target')

        #  set up the operations to copy the online network parameters to
        #  the target network
        self.update_ops = self.make_target_net_update_ops()

        if process_observation:
            self.observation_processor = Normalizer(obs_space_shape[0])

        if process_target:
            self.target_processor = Normalizer(1)

        self.acting_writer = tf.summary.FileWriter('./results/acting',
                                                   graph=self.sess.graph)

        self.learning_writer = tf.summary.FileWriter('./results/learning',
                                                     graph=self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.update_target_network()

    def __repr__(self):
        return '<class DQN Agent>'

    def make_target_net_update_ops(self):
        """
        Creates the Tensorflow operations to update the target network.

        The two lists of Tensorflow Variables (one for the online net, one
        for the target net) are iterated over together and new weights
        are assigned to the target network
        """
        with tf.variable_scope('update_target_network'):
            update_ops = []
            for online, target in zip(self.online.params, self.target.params):
                logging.debug('copying {} to {}'.format(
                    online.name, target.name))
                val = tf.add(tf.multiply(online, self.tau),
                             tf.multiply(target, 1 - self.tau))

                operation = target.assign(val)
                update_ops.append(operation)
        return update_ops

    def remember(self, observation, action, reward, next_observation, done):
        """
        Store experience in the agent's memory.

        args
            observation (np.array)
            action (np.array)
            reward (np.array)
            next_observation (np.array)
            done (np.array)
        """
        if hasattr(self, 'observation_processor'):
            observation = self.observation_processor(observation)
            next_observation = self.observation_processor(next_observation)

        return self.memory.remember(observation, action, reward,
                                    next_observation, done)

    def predict_target(self, observations):
        """
        Target network is used to predict the maximum discounted expected
        return for the next_observation as experienced by the agent

        args
            observations (np.array)

        returns
            max_q (np.array) shape=(batch_size, 1)
        """
        fetches = [
            self.target.q_values, self.target.max_q, self.target.acting_summary
        ]

        feed_dict = {self.target.observation: observations}

        q_vals, max_q, summary = self.sess.run(fetches, feed_dict)
        self.learning_writer.add_summary(summary, self.counter)

        logging.debug('predict_target - next_obs {}'.format(observations))
        logging.debug('predict_target - q_vals {}'.format(q_vals))
        logging.debug('predict_target - max_q {}'.format(max_q))

        return max_q.reshape(observations.shape[0], 1)

    def predict_online(self, observation):
        """
        We use our online network to choose actions.

        args
            observation (np.array) a single observation

        returns
            action
        """
        obs = observation.reshape((1, *self.env.observation_space.shape))

        fetches = [
            self.online.q_values, self.online.max_q,
            self.online.optimal_action_idx, self.online.acting_summary
        ]

        feed_dict = {self.online.observation: obs}
        q_values, max_q, action_idx, summary = self.sess.run(
            fetches, feed_dict)
        self.acting_writer.add_summary(summary, self.counter)

        max_q = max_q.flatten()[0]
        max_q_sum = tf.Summary(
            value=[tf.Summary.Value(tag='max_q_acting', simple_value=max_q)])

        self.acting_writer.add_summary(max_q_sum, self.counter)
        self.acting_writer.flush()

        #  index at zero because TF returns an array
        action = self.actions[action_idx[0]]

        logging.debug('predict_online - observation {}'.format(obs))
        logging.debug('predict_online - pred_q_values {}'.format(q_values))
        logging.debug('predict_online - max_q {}'.format(max_q))
        logging.debug('predict_online - action_index {}'.format(action_idx))
        logging.debug('predict_online - action {}'.format(action))

        return action

    def update_target_network(self):
        """
        Updates the target network weights using the parameter tau

        Relies on the sorted lists of tf.Variables kept in each Qfunc object
        """
        logging.debug('updating target net at count {}'.format(self.counter))

        return self.sess.run(self.update_ops)

    def act(self, observation):
        """
        Our agent attempts to manipulate the world.

        Acts according to an epsilon-greedy policy.

        args
            observation (np.array)

        returns
            action (np.array)
        """
        self.counter += 1
        epsilon = self.epsilon_getter.epsilon
        logging.debug('epsilon is {}'.format(epsilon))

        if epsilon > random_uniform():
            action = self.env.action_space.sample()
            logging.debug('acting randomly - action is {}'.format(action))
        else:
            action = self.predict_online(observation)
            logging.debug('acting optimally action is {}'.format(action))

        epsilon_sum = tf.Summary(
            value=[tf.Summary.Value(tag='epsilon', simple_value=epsilon)])
        self.acting_writer.add_summary(epsilon_sum, self.counter)
        self.acting_writer.flush()

        # return np.array(action).reshape(1, *self.action_space_shape)
        return action

    def learn(self):
        """
        Our agent attempts to make sense of the world.

        A batch sampled using experience replay is used to train the online
        network using targets from the target network.

        returns
            train_info (dict)
        """
        batch = self.memory.get_batch(self.batch_size)
        observations = batch['observations']
        actions = batch['actions']
        rewards = batch['rewards']
        terminals = batch['terminal']
        next_observations = batch['next_observations']

        next_obs_q = self.predict_target(next_observations)

        #  if next state is terminal, set the value to zero
        next_obs_q[terminals] = 0

        #  creating a target for Q(s,a) using the Bellman equation
        rewards = rewards.reshape(rewards.shape[0], 1)
        target = rewards + self.discount * next_obs_q

        if hasattr(self, 'target_processor'):
            target = self.target_processor(target)

        #  build (batch_index, action_index) pairs so the online network can
        #  pick out the Q value of the action that was actually taken
        indices = np.zeros((actions.shape[0], 1), dtype=int)

        for arr, action in zip(indices, actions):
            idx = self.actions.index(action)
            arr[0] = idx

        rng = np.arange(actions.shape[0]).reshape(actions.shape[0], 1)
        indices = np.concatenate([rng, indices], axis=1)

        fetches = [
            self.online.q_values, self.online.q_value, self.online.loss,
            self.online.train_op, self.online.learning_summary
        ]

        feed_dict = {
            self.online.observation: observations,
            self.online.action: indices,
            self.online.target: target
        }

        q_vals, q_val, loss, train_op, train_sum = self.sess.run(
            fetches, feed_dict)

        logging.debug('learning - observations {}'.format(observations))

        logging.debug('learning - rewards {}'.format(rewards))
        logging.debug('learning - terminals {}'.format(terminals))
        logging.debug('learning - next_obs_q {}'.format(next_obs_q))

        logging.debug('learning - actions {}'.format(actions))
        logging.debug('learning - indices {}'.format(indices))
        logging.debug('learning - q_values {}'.format(q_vals))
        logging.debug('learning - q_value {}'.format(q_val))

        logging.debug('learning - target {}'.format(target))
        logging.debug('learning - loss {}'.format(loss))

        self.learning_writer.add_summary(train_sum, self.counter)

        self.update_target_network()

        return {'loss': loss}
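
A minimal driver sketch for the session-based Agent above, assuming a
CartPole-v1 environment and a TensorFlow 1.x session; the hyperparameter
values (tau, layers, learning_rate, total_steps) are placeholders, not taken
from the original project.

import gym
import tensorflow as tf

env = gym.make('CartPole-v1')
total_steps = 10000

with tf.Session() as sess:
    agent = Agent(env=env,
                  discount=0.99,
                  tau=0.01,
                  sess=sess,
                  total_steps=total_steps,
                  batch_size=32,
                  layers=(64, 64),
                  learning_rate=1e-3)

    observation = env.reset()
    for step in range(total_steps):
        action = agent.act(observation)
        next_observation, reward, done, _ = env.step(action)
        agent.remember(observation, action, reward, next_observation, done)

        #  train once the memory holds more than one batch of experience
        if step > agent.batch_size:
            agent.learn()

        observation = env.reset() if done else next_observation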

Example #3

import datetime
import math
import pickle

import numpy as np
import progressbar
import tensorflow as tf

# Assumes ReplayMemory (a prioritized experience replay buffer) and
# NoisyNetDense are defined elsewhere in the project.
class Agent:
    def __init__(self,
                 environment,
                 optimizer,
                 memory_length,
                 dueling=True,
                 loss='mse',
                 noisy_net=False,
                 egreedy=False,
                 save_memory=None,
                 save_weights=None,
                 verbose_action=False,
                 ):

        self.environment = environment
        self._optimizer = optimizer
        self._loss = loss
        self.dueling = dueling
        self.egreedy = egreedy
        self.noisy_net = noisy_net

        # Initialize discount and exploration rate, etc
        self.total_steps = 0
        self.gamma = 0.99
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.00005
        self.tau = 0.05
        self.pretraining_steps = 0

        # Build networks
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model(how='hard')

        self.memory = ReplayMemory(memory_length)

        self.save_weights_fp = save_weights
        self.save_memory_fp = save_memory
        self.start_time = datetime.datetime.now()
        self.verbose_action = verbose_action

    def load_memory(self, fp):
        with open(fp, 'rb') as f:
            self.memory.load_memory(pickle.load(f))
            print(f'loading {self.memory.length} memories...')

    def save_memory(self, fp):
        if fp:
            with open(fp, 'wb') as f:
                print('saving replay memory...')
                pickle.dump(self.memory.get_memory(), f)

    def load_weights(self, weights_fp):
        if weights_fp:
            print('loading weights...')
            self.q_network.load_weights(weights_fp)
            self.align_target_model(how='hard')

    def save_weights(self, weights_fp):
        if weights_fp:
            self.q_network.save_weights(weights_fp)

    def set_epsilon_decay_schedule(self, epsilon, epsilon_min, annealed_steps):
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = math.log(self.epsilon / self.epsilon_min) / annealed_steps

    def set_beta_schedule(self, beta_start, beta_max, annealed_samplings):
        self.memory.beta = beta_start
        self.memory.beta_max = beta_max
        self.memory.beta_increment_per_sampling = (self.memory.beta_max - self.memory.beta) / annealed_samplings

    def predict(self, state, use_target=False):
        if use_target:
            return self.target_network.predict(state)
        else:
            return self.q_network.predict(state)

    def _decay_epsilon(self):
        self.epsilon = self.epsilon * np.exp(-self.epsilon_decay)

    def store(self, state, action, reward, next_state, terminated):
        self.memory.add((state, action, reward, next_state, terminated))
        self.total_steps += 1

        if not self.egreedy:
            if (self.epsilon > self.epsilon_min) and (self.memory.length > self.pretraining_steps):
                self._decay_epsilon()

    def batch_store(self, batch_load):
        batch_load[-2][2] = -0.1  # custom reward altering
        for row in batch_load:
            self.store(*row)

    def _build_compile_model(self):
        inputs = tf.keras.layers.Input(shape=(32, 290, 4))
        conv1 = tf.keras.layers.Conv2D(32, (8, 8), strides=4, padding='same', activation='relu')(inputs)
        conv2 = tf.keras.layers.Conv2D(64, (4, 4), strides=2, padding='same', activation='relu')(conv1)
        conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu')(conv2)
        conv3 = tf.keras.layers.Flatten()(conv3)

        # Advantage stream; the output width of 2 is the number of actions,
        # hard-coded for this environment
        if self.noisy_net:
            advt = NoisyNetDense(256, activation='relu')(conv3)
            final = NoisyNetDense(2)(advt)
        else:
            advt = tf.keras.layers.Dense(256, activation='relu')(conv3)
            final = tf.keras.layers.Dense(2)(advt)

        if self.dueling:
            # Value stream, combined with the mean-centred advantages:
            # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
            if self.noisy_net:
                value = NoisyNetDense(256, activation='relu')(conv3)
                value = NoisyNetDense(1)(value)
            else:
                value = tf.keras.layers.Dense(256, activation='relu')(conv3)
                value = tf.keras.layers.Dense(1)(value)

            advt = tf.keras.layers.Lambda(lambda x: x - tf.reduce_mean(x, axis=1, keepdims=True))(final)
            final = tf.keras.layers.Add()([value, advt])

        model = tf.keras.models.Model(inputs=inputs, outputs=final)
        model.compile(optimizer=self._optimizer,
                      loss=self._loss,
                      metrics=['accuracy'])
        return model

    def align_target_model(self, how):
        assert how in ('hard', 'soft'), '"how" must be either "hard" or "soft"'

        if how == 'hard':
            self.target_network.set_weights(self.q_network.get_weights())

        elif how == 'soft':
            for t, e in zip(self.target_network.trainable_variables, self.q_network.trainable_variables):
                t.assign(t * (1 - self.tau) + (e * self.tau))

    def choose_action(self, state):
        if not self.egreedy:
            if np.random.rand() <= self.epsilon:
                action = self.environment.action_space.sample()
                if self.verbose_action:
                    print(f'action: {action}, q: random')
                return action

        q_values = self.predict(state, use_target=False)
        action = np.argmax(q_values[0])
        if self.verbose_action:
            print(f'action: {action}, q: {q_values}')
        return action

    def train(self, batch, is_weights):

        td_errors = np.zeros(len(batch))
        states = np.zeros((len(batch), 32, 290, 4))
        targets = np.zeros((len(batch), 2))

        for i, (state, action, reward, next_state, terminated) in enumerate(batch):
            target, td_error = self._get_target(state, action, reward, next_state, terminated)
            states[i] = state.reshape(32, 290, 4)
            targets[i] = target
            td_errors[i] = td_error

        self.q_network.fit(states, targets, sample_weight=is_weights, batch_size=32, epochs=1, verbose=0)
        self.align_target_model(how='soft')

        return td_errors

    def replay(self, batch_size, epoch_steps=None):

        num_batches = 1
        if epoch_steps:
            num_batches = int(np.max([np.floor(epoch_steps / 4), 1]))

        bar = progressbar.ProgressBar(maxval=num_batches,
                                      widgets=['training - ', progressbar.widgets.Counter(), f'/{num_batches} ',
                                               progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()

        for i in range(num_batches):
            leaf_idx, batch, is_weights = self.memory.get_batch(batch_size)  # prioritized experience replay
            td_errors = self.train(batch, is_weights)
            self.memory.update_sum_tree(leaf_idx, td_errors)

            bar.update(i + 1)

        bar.finish()
        self.save_weights(self.save_weights_fp)

    def _get_target(self, state, action, reward, next_state, terminated):
        target = self.predict(state, use_target=False)
        prev_target = target[0][action]

        if terminated:
            target[0][action] = reward
        else:
            a = np.argmax(self.predict(next_state, use_target=False)[0])
            target[0][action] = reward + (self.gamma * self.predict(next_state, use_target=True)[0][a])  # double Q Network

        td_error = abs(prev_target - target[0][action])

        return target, td_error
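
A minimal interaction-loop sketch for the class above. It assumes an
environment (env) whose observations already match the hard-coded
(32, 290, 4) input shape and its two discrete actions, and that the
prioritized ReplayMemory behind get_batch is available; the episode count and
the replay-after-each-episode cadence are illustrative only.

agent = Agent(environment=env,
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              memory_length=50000)

for episode in range(100):
    state = env.reset().reshape(1, 32, 290, 4)
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = next_state.reshape(1, 32, 290, 4)
        agent.store(state, action, reward, next_state, done)
        state = next_state

    # one epoch of prioritized replay at the end of each episode
    agent.replay(batch_size=32)

agent.save_memory(agent.save_memory_fp)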