Example #1
def test_fit_observations():
    memory = SequentialMemory(100,
                              window_length=2,
                              ignore_episode_boundaries=False)
    agent = TestAgent(memory)
    env = TestEnv()
    agent.compile()
    agent.fit(env, 20, verbose=0)

    # Inspect memory to see if observations are correct.
    experiencies = memory.sample(batch_size=8, batch_idxs=range(8))

    assert experiencies[0].reward == .2
    assert experiencies[0].action == 1
    assert_allclose(experiencies[0].state0, np.array([0, 1]))
    assert_allclose(experiencies[0].state1, np.array([1, 2]))
    assert experiencies[0].terminal1 is False

    assert experiencies[1].reward == .3
    assert experiencies[1].action == 2
    assert_allclose(experiencies[1].state0, np.array([1, 2]))
    assert_allclose(experiencies[1].state1, np.array([2, 3]))
    assert experiencies[1].terminal1 is False

    assert experiencies[2].reward == .4
    assert experiencies[2].action == 3
    assert_allclose(experiencies[2].state0, np.array([2, 3]))
    assert_allclose(experiencies[2].state1, np.array([3, 4]))
    assert experiencies[2].terminal1 is False

    assert experiencies[3].reward == .5
    assert experiencies[3].action == 4
    assert_allclose(experiencies[3].state0, np.array([3, 4]))
    assert_allclose(experiencies[3].state1, np.array([4, 5]))
    assert experiencies[3].terminal1 is False

    assert experiencies[4].reward == .6
    assert experiencies[4].action == 5
    assert_allclose(experiencies[4].state0, np.array([4, 5]))
    assert_allclose(experiencies[4].state1, np.array([5, 6]))
    assert experiencies[4].terminal1 is True

    # Experience 5 has been re-sampled, since its state0 would otherwise contain the terminal
    # observation, in which case there is no meaningful transition (the environment gets
    # reset). We thus just ensure that state0 is not the terminal state.
    assert not np.all(experiencies[5].state0 == np.array([5, 6]))

    assert experiencies[6].reward == .2
    assert experiencies[6].action == 1
    assert_allclose(experiencies[6].state0, np.array([0, 1]))
    assert_allclose(experiencies[6].state1, np.array([1, 2]))
    assert experiencies[6].terminal1 is False

    assert experiencies[7].reward == .3
    assert experiencies[7].action == 2
    assert_allclose(experiencies[7].state0, np.array([1, 2]))
    assert_allclose(experiencies[7].state1, np.array([2, 3]))
    assert experiencies[7].terminal1 is False
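A note on the asserted state pairs: they follow from window_length=2, where each state stacks the two most recent observations and the first state of an episode is zero-padded. Below is a minimal, self-contained sketch of that stacking with illustrative scalar observations (no TestAgent/TestEnv needed); the concrete numbers are my own, not from the test above.

import numpy as np
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=100, window_length=2, ignore_episode_boundaries=False)
# append() records (observation, action, reward, terminal-of-the-new-observation).
for step in range(6):
    memory.append(np.array([step]), action=step, reward=float(step), terminal=False)

# Requesting index 1 yields the transition whose state0 stacks observations 0 and 1.
exp = memory.sample(batch_size=1, batch_idxs=[1])[0]
# exp.state0 == [[0], [1]], exp.state1 == [[1], [2]], exp.action == 1, exp.reward == 1.0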
Example #2
def test_fit_observations():
    memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False)
    agent = TestAgent(memory)
    env = TestEnv()
    agent.compile()
    agent.fit(env, 20, verbose=0)

    # Inspect memory to see if observations are correct.
    experiencies = memory.sample(batch_size=6, batch_idxs=range(2, 8))

    assert experiencies[0].reward == .4
    assert experiencies[0].action == 3
    assert_allclose(experiencies[0].state0, np.array([2, 3]))
    assert_allclose(experiencies[0].state1, np.array([3, 4]))
    assert experiencies[0].terminal1 is False

    assert experiencies[1].reward == .5
    assert experiencies[1].action == 4
    assert_allclose(experiencies[1].state0, np.array([3, 4]))
    assert_allclose(experiencies[1].state1, np.array([4, 5]))
    assert experiencies[1].terminal1 is False

    assert experiencies[2].reward == .6
    assert experiencies[2].action == 5
    assert_allclose(experiencies[2].state0, np.array([4, 5]))
    assert_allclose(experiencies[2].state1, np.array([5, 6]))
    assert experiencies[2].terminal1 is True

    # Experience 3 has been re-sampled, since its state0 would otherwise contain the terminal
    # observation, in which case there is no meaningful transition (the environment gets
    # reset). We thus just ensure that state0 is not the terminal state.
    assert not np.all(experiencies[3].state0 == np.array([5, 6]))

    assert experiencies[4].reward == .2
    assert experiencies[4].action == 1
    assert_allclose(experiencies[4].state0, np.array([0, 1]))
    assert_allclose(experiencies[4].state1, np.array([1, 2]))
    assert experiencies[4].terminal1 is False

    assert experiencies[5].reward == .3
    assert experiencies[5].action == 2
    assert_allclose(experiencies[5].state0, np.array([1, 2]))
    assert_allclose(experiencies[5].state1, np.array([2, 3]))
    assert experiencies[5].terminal1 is False
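The re-sampling mentioned in the comments above can be reproduced directly: if the requested index would put a terminal observation into state0, SequentialMemory silently draws a different index instead. A small illustrative sketch with its own data (independent of the test fixtures above):

import numpy as np
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=100, window_length=2, ignore_episode_boundaries=False)
terminals = [False, False, True, False, False, False]   # the third append ends an episode
for step in range(6):
    memory.append(np.array([step]), action=step, reward=float(step), terminal=terminals[step])

# Index 3 would place the terminal observation inside state0, so the memory re-draws
# another index; the returned experience therefore never has state0 == [[2], [3]].
exp = memory.sample(batch_size=1, batch_idxs=[3])[0]
assert not np.all(np.array(exp.state0) == np.array([[2], [3]]))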
Example #3
def test_sampling():
    memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False)
    obs_size = (3, 4)
    actions = range(5)
    
    obs0 = np.random.random(obs_size)
    terminal0 = False
    action0 = np.random.choice(actions)
    reward0 = np.random.random()
    
    obs1 = np.random.random(obs_size)
    terminal1 = False
    action1 = np.random.choice(actions)
    reward1 = np.random.random()
    
    obs2 = np.random.random(obs_size)
    terminal2 = False
    action2 = np.random.choice(actions)
    reward2 = np.random.random()
    
    obs3 = np.random.random(obs_size)
    terminal3 = True
    action3 = np.random.choice(actions)
    reward3 = np.random.random()

    obs4 = np.random.random(obs_size)
    terminal4 = False
    action4 = np.random.choice(actions)
    reward4 = np.random.random()

    obs5 = np.random.random(obs_size)
    terminal5 = False
    action5 = np.random.choice(actions)
    reward5 = np.random.random()

    obs6 = np.random.random(obs_size)
    terminal6 = False
    action6 = np.random.choice(actions)
    reward6 = np.random.random()
    
    # memory.append takes the current observation, the action, the reward received after
    # taking that action, and whether the *new* observation is terminal; pairing `obs0`
    # with `terminal1` is therefore correct. (A standalone usage sketch follows this example.)
    memory.append(obs0, action0, reward0, terminal1)
    memory.append(obs1, action1, reward1, terminal2)
    memory.append(obs2, action2, reward2, terminal3)
    memory.append(obs3, action3, reward3, terminal4)
    memory.append(obs4, action4, reward4, terminal5)
    memory.append(obs5, action5, reward5, terminal6)
    assert memory.nb_entries == 6

    experiences = memory.sample(batch_size=5, batch_idxs=[0, 1, 2, 3, 4])
    assert len(experiences) == 5

    assert_allclose(experiences[0].state0, np.array([np.zeros(obs_size), obs0]))
    assert_allclose(experiences[0].state1, np.array([obs0, obs1]))
    assert experiences[0].action == action0
    assert experiences[0].reward == reward0
    assert experiences[0].terminal1 is False

    assert_allclose(experiences[1].state0, np.array([obs0, obs1]))
    assert_allclose(experiences[1].state1, np.array([obs1, obs2]))
    assert experiences[1].action == action1
    assert experiences[1].reward == reward1
    assert experiences[1].terminal1 is False

    assert_allclose(experiences[2].state0, np.array([obs1, obs2]))
    assert_allclose(experiences[2].state1, np.array([obs2, obs3]))
    assert experiences[2].action == action2
    assert experiences[2].reward == reward2
    assert experiences[2].terminal1 is True

    # The next experience has been re-sampled, since its state0 would otherwise contain the
    # terminal observation, in which case there is no meaningful transition (the environment
    # gets reset). We thus just ensure that state0 is not the terminal state.
    assert not np.all(experiences[3].state0 == np.array([obs2, obs3]))

    assert_allclose(experiences[4].state0, np.array([np.zeros(obs_size), obs4]))
    assert_allclose(experiences[4].state1, np.array([obs4, obs5]))
    assert experiences[4].action == action4
    assert experiences[4].reward == reward4
    assert experiences[4].terminal1 is False
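Outside of tests, the same two calls are typically all that is needed: one append per environment step (with the terminal flag describing the new observation, as the comment above notes) and one sample call per training batch. A hypothetical driver loop follows, with stand-ins for the environment and the policy; the shapes and episode length are illustrative only.

import numpy as np
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=100000, window_length=2)

# Stand-ins: random 4-dim observations, 3 discrete actions, 10-step episodes.
observation, t = np.zeros(4), 0
for step in range(200):
    action = np.random.randint(3)               # placeholder policy
    next_observation = np.random.random(4)      # placeholder environment dynamics
    reward, t = 1.0, t + 1
    done = (t % 10 == 0)
    # As in the tests, the terminal flag refers to the *new* observation.
    memory.append(observation, action, reward, terminal=done)
    observation = np.zeros(4) if done else next_observation

batch = memory.sample(batch_size=32)            # list of Experience namedtuples
states0 = np.array([e.state0 for e in batch])   # shape (32, 2, 4) with window_length=2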
Example #4
class MACE(Agent):
    def __init__(self, env: gym.Env, **kwargs):

        super(MACE, self).__init__(**kwargs)
        self.nb_actions = env.action_space.shape[0]

        obs_input_actor = Input(shape=(1, ) + env.observation_space.shape,
                                name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1, ) + env.observation_space.shape,
                                 name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)

        x_critic = Dense(units=128, activation='relu')(x_cr)
        value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=self.nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=obs_input_critic, outputs=value)

        critic_metrics = [mean_q]

        critic_optimizer = Adam(lr=1e-3)
        actor_optimizer = Adam(lr=1e-3)

        #        critic_optimizer = SGD(lr=1e-4, momentum=0.9)
        #        actor_optimizer = SGD(lr=1e-3, momentum=0.9)

        self.actor = actor
        self.critic = critic

        self.target_actor = clone_model(self.actor)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        self.target_model_update = 1e-3
        #self.target_model_update=500

        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            critic_updates = get_soft_target_model_updates(
                self.target_critic, self.critic, self.target_model_update)
            critic_optimizer = AdditionalUpdatesOptimizer(
                critic_optimizer, critic_updates)
            actor_updates = get_soft_target_model_updates(
                self.target_actor, self.actor, self.target_model_update)
            actor_optimizer = AdditionalUpdatesOptimizer(
                actor_optimizer, actor_updates)

        self.delta_clip = np.inf

        # Note: defined but not used below; both models are compiled with plain MSE.
        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        actor.compile(actor_optimizer, loss='mse')
        critic.compile(critic_optimizer, loss='mse', metrics=critic_metrics)

        self.compiled = True

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.memory_interval = 1
        self.memory_actor = SequentialMemory(limit=100000, window_length=1)
        self.memory_critic = SequentialMemory(limit=100000, window_length=1)

        self.nb_steps_warmup = 50000

        self.train_interval = 4
        self.batch_size = 64
        self.gamma = 0.99

        self.processor = None
        self.random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                       mu=0.,
                                                       sigma=.3,
                                                       size=self.nb_actions)
        self.eps = 0.9

    def process_state_batch(self, batch):
        batch = np.array(batch)
        if self.processor is None:
            return batch
        return self.processor.process_state_batch(batch)

    def select_action(self, state):
        batch = [state]
        action = self.actor.predict_on_batch(np.asarray(batch)).flatten()
        # Apply noise, if a random process is set.
        if self.training and self.random_process is not None:
            # Optional Bernoulli-gated actor exploration (disabled): with probability
            # self.eps, add noise and set self.action_exploration accordingly.
            noise = self.random_process.sample()
            assert noise.shape == action.shape
            action += noise
        return action

    def forward(self, observation):
        # Select an action.

        state = self.memory.get_recent_state(observation)
        action = self.select_action(state)  # TODO: move this into policy

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action

        return action

    def backward(self, reward, terminal=False):
        # Store most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation,
                               self.recent_action,
                               reward,
                               terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        can_train_either = self.step > self.nb_steps_warmup
        if can_train_either and self.step % self.train_interval == 0:
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            # Update the critic, if warm-up is over.
            if self.step > self.nb_steps_warmup:
                if len(self.critic.inputs) >= 3:
                    state1_batch_with_action = state1_batch[:]
                else:
                    state1_batch_with_action = [state1_batch]
                target_q_values = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                assert target_q_values.shape == (self.batch_size, )

                # Compute r_t + gamma * max_a Q(s_t+1, a) and update the target ys accordingly,
                # but only for the affected output units (as given by action_batch).
                discounted_reward_batch = self.gamma * target_q_values
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                targets = (reward_batch + discounted_reward_batch).reshape(
                    self.batch_size, 1)

                # Perform a single batch update on the critic network.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                #state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                metrics = self.critic.train_on_batch(state0_batch_with_action,
                                                     targets)
                if self.processor is not None:
                    metrics += self.processor.metrics

            # Re-sample a fresh batch for the actor update.
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            if self.step > self.nb_steps_warmup:
                # Actor update: recompute the TD error on the freshly sampled batch
                # (rebuilding the critic inputs so they match this batch, not the
                # previous one).
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                    state1_batch_with_action = state1_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                    state1_batch_with_action = [state1_batch]
                target_q_values1 = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                discounted_reward_batch = self.gamma * target_q_values1
                discounted_reward_batch *= terminal1_batch
                targets = reward_batch + discounted_reward_batch
                target_q_values0 = self.target_critic.predict_on_batch(
                    state0_batch_with_action).flatten()
                delta = targets - target_q_values0
                if len(self.actor.inputs) >= 2:
                    inputs = state0_batch[:]
                else:
                    #inputs = [state0_batch]
                    inputs = state0_batch
                pos_dif = delta > 0
                #                if self.step%1000==0:
                #                    print(np.sum(pos_dif))
                inputs = np.asarray(inputs)[pos_dif]
                actions_target = action_batch[pos_dif]
                #state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                self.actor.train_on_batch(inputs, actions_target)

        if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
            self.update_target_models_hard()

        return metrics

    def reset_states(self):
        if self.random_process is not None:
            self.random_process.reset_states()
        self.recent_action = None
        self.recent_observation = None
        if self.compiled:
            self.actor.reset_states()
            self.critic.reset_states()
            self.target_actor.reset_states()
            self.target_critic.reset_states()

    def update_target_models_hard(self):
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_actor.set_weights(self.actor.get_weights())

    @property
    def metrics_names(self):
        names = self.critic.metrics_names[:]
        if self.processor is not None:
            names += self.processor.metrics_names[:]
        return names
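Since MACE subclasses the keras-rl Agent and marks itself compiled in __init__, it can presumably be driven through the inherited fit/test loop like any other keras-rl agent. A hypothetical usage sketch (the environment name is only an example of a continuous-action task, not something used by the class itself):

import gym

env = gym.make('Pendulum-v1')      # any continuous-action env; the name is illustrative
agent = MACE(env)
# fit() and test() come from rl.core.Agent; nb_steps should exceed nb_steps_warmup
# (50000 here) so that the networks actually start training.
agent.fit(env, nb_steps=200000, verbose=1)
agent.test(env, nb_episodes=5, visualize=False)

The distinguishing design choice sits in backward(): the actor is trained only on the sampled actions whose temporal-difference error is positive (delta > 0), rather than through a deterministic policy gradient.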
Example #5
class deepAMDP():
    def __init__(self,
                 inputDim=16,
                 alpha=1e-4,
                 gamma=0.99,
                 epsilon=0.1,
                 numberOfActions=0,
                 tau=1e-1):

        self.predictionModel = Sequential()
        self.predictionModel.add(
            Dense(16, input_dim=inputDim, activation='relu'))
        self.predictionModel.add(Dense(16, activation='relu'))
        self.predictionModel.add(Dense(numberOfActions, activation='linear'))
        self.predictionModel.compile(loss="mse", optimizer=Adam(lr=alpha))

        self.targetModel = Sequential()
        self.targetModel.add(Dense(16, input_dim=inputDim, activation='relu'))
        self.targetModel.add(Dense(16, activation='relu'))
        self.targetModel.add(Dense(numberOfActions, activation="linear"))
        self.targetModel.compile(loss="mse", optimizer=Adam(lr=alpha))

        self.memory = SequentialMemory(limit=100000, window_length=1)

        self.otherMemory = deque(maxlen=2000)

        self.numberOfActions = numberOfActions
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon

        self.initialEpsilon = 0.1
        self.finalEpsilon = 0.01
        self.currentEpsilon = self.initialEpsilon
        self.episodesToDecay = 500

    def addExperience(self, latentState, action, reward, done):
        self.memory.append(latentState, action, reward, done)
        #print(latentState, action, reward)

    def memorize(self, state, action, reward, next_state, done):
        self.otherMemory.append((state, action, reward, next_state, done))

    def action(self, state):
        if np.random.random() < self.currentEpsilon:
            return np.random.randint(self.numberOfActions)
        state = state.reshape(1, -1)
        qValues = self.predictionModel.predict(state)
        return np.argmax(qValues[0])

    def replay(self, batchSize=8):
        #print("replay")
        #if len(self.memory) < batchSize:
        #    return

        experiences = self.memory.sample(batchSize)

        # Start by extracting the necessary parameters (we use a vectorized implementation).
        state0Batch = []
        rewardBatch = []
        actionBatch = []
        terminal1Batch = []
        state1Batch = []
        for e in experiences:
            # print(e.state0, e.state1, e.reward, e.action)
            state0Batch.append(e.state0[0])
            state1Batch.append(e.state1[0])
            rewardBatch.append(e.reward)
            actionBatch.append(e.action)
            terminal1Batch.append(0. if e.terminal1 else 1.)

        state0Batch = np.array(state0Batch)
        rewardBatch = np.array(rewardBatch)
        actionBatch = np.array(actionBatch)
        terminal1Batch = np.array(terminal1Batch)
        state1Batch = np.array(state1Batch)

        #state0Batch = normalize(state0Batch, axis=-1)
        #state1Batch = normalize(state1Batch, axis=-1)

        targetQValues = self.targetModel.predict_on_batch(state1Batch)

        #print("Target Q Values")
        #print(targetQValues)
        #print("Target Q 1")
        #print(state1Batch[0])
        qBatch = np.max(targetQValues, axis=1).flatten()
        #targets = np.zeros((batchSize, self.numberOfActions))
        #targets = np.random.rand(batchSize, self.numberOfActions)
        # Note: the base targets here are the target network's Q-values for state1Batch;
        # a standard DQN update would instead start from the prediction network's
        # Q-values for state0Batch.
        targets = targetQValues

        discountedRewardBatch = self.gamma * qBatch
        discountedRewardBatch *= terminal1Batch
        Rs = rewardBatch + discountedRewardBatch

        #print("Our Targets")
        #print(targets)
        for (target, r, action) in zip(targets, Rs, actionBatch):
            target[action] = r
        #print("Updated Targets")
        #print(targets)

        self.predictionModel.fit(state0Batch, targets, verbose=0)
        #self.updateTargetModel()

    def otherReplay(self, batch_size=8):
        minibatch = random.sample(self.otherMemory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            state = state.reshape(1, -1)
            next_state = next_state.reshape(1, -1)
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.predictionModel.predict(next_state)[0]))
            target_f = self.predictionModel.predict(state)
            target_f[0][action] = target
            self.predictionModel.fit(state, target_f, epochs=1, verbose=0)
        #self.updateTargetModel()

    def updateTargetModel(self):
        predictionWeights = self.predictionModel.get_weights()
        targetWeights = self.targetModel.get_weights()

        # Soft (Polyak) update: blend the prediction weights into the target weights.
        for i in range(len(targetWeights)):
            targetWeights[i] = (self.tau * predictionWeights[i] +
                                (1. - self.tau) * targetWeights[i])
        self.targetModel.set_weights(targetWeights)
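A hypothetical driver for deepAMDP, assuming the class and its Keras dependencies are defined as above and that the latent states come from some upstream encoder (random vectors stand in for them here): one addExperience per step, periodic replay, and the soft target update governed by tau.

import numpy as np

agent = deepAMDP(inputDim=16, numberOfActions=4)

latent = np.random.random(16)                    # placeholder for an encoded state
for step in range(1000):
    action = agent.action(latent)
    next_latent = np.random.random(16)           # placeholder for the next encoded state
    reward, done = 0.0, (step % 50 == 49)
    agent.addExperience(latent, action, reward, done)
    latent = np.random.random(16) if done else next_latent
    if step > 64 and step % 10 == 0:             # wait until the memory has enough entries
        agent.replay(batchSize=8)
        agent.updateTargetModel()                # soft update controlled by tau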