# 80% for train and 20% for test.
N_split = N_ - int(N_ / 5)
train_uvs, train_obs, train_acs, train_states = transform(
    uvs[:N_split], obs[:N_split], acs[:N_split], states[:N_split])
test_uvs, test_obs, test_acs, test_states = transform(
    uvs[N_split:], obs[N_split:], acs[N_split:], states[N_split:])


# Uncomment to see visualisation.
#plotter = PyGame2D()
#plotter.draw_path(testenv, outputss[0])

sfpnn = StatefulACPNN()

# Start fitting the model.
sfpnn.fit(train_states, train_uvs, train_obs, epochs=2)

# TxNxC
preds = sfpnn.predict(test_states, test_uvs)

verify_cost = sfpnn.verify(test_states, test_uvs, test_obs)
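# verify_cost is assumed to be the model's cost on the held-out 20% of traces.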

print("Cross-validation cost: ", verify_cost)
# Unwrap preds.
# N_xTxWxC
preds = preds.reshape([preds.shape[0], N_ - N_split, W, preds.shape[2]]).transpose([1,0,2,3])

plotter = PyGame2D()
plotter.draw_path_cmp(testenv, outputss[-1], preds[-1])

#print(outputss[-1])
#print(preds[-1])
#print(train_states[:,-1,:])
Example #2
    def iter(self, episode_length=10):

        # TxN_xA
        trA = np.zeros((1, self.num_episodes, self.num_actions))

        # TxN_x(A-1)
        dbgQ = np.zeros((1, self.num_episodes, self.num_actions - 1))
        # One Hot Action: Initialise.
        trA[:, :, 6] = 1
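        # Assumption: with 7 action slots, index 6 is the extra padding/"start"
        # action used to bootstrap the rollout; cropLast() drops it again before
        # the DQN update below.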
        for epno in range(0, episode_length):
            # TxN_xS
            trS = self.acpnn.next_state(trA)
            # One hot action samples.
            # N_x(A-1)
            nA = self.dqn.sample(trS[-1, :, :])
            # N_x(A-1)
            nQ = self.dqn.q(trS[-1, :, :])

            # N_xA
            nA = np.concatenate([nA, np.zeros((nA.shape[0], 1))], axis=1)
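            # Append a zero column for the padding slot (index A-1) so the
            # sampled (A-1)-dim actions match trA's A-dim one-hot layout.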

            for env, ac in zip(self.envs, nA):
                env.action(ac.argmax())

            trA = np.concatenate([trA, nA.reshape((1, ) + nA.shape)], axis=0)
            dbgQ = np.concatenate([dbgQ, nQ.reshape((1, ) + nQ.shape)], axis=0)

        outputss = []
        for env in self.envs:
            env.flush()
            # WARN: Check get_trace() index.
            outputss.append(env.get_trace()[-1])

        # N_xTxWxC
        obs = np.asarray([[output[0][0] for output in outputs]
                          for outputs in outputss])
        # N_xTxA
        acs = np.asarray([[to_one_hot(output[1], 7) for output in outputs]
                          for outputs in outputss])
        # N_xTx2
        poss = np.asarray([[output[2] for output in outputs]
                           for outputs in outputss])
        # N_xTx2
        dirs = np.asarray([[output[3] for output in outputs]
                           for outputs in outputss])
        # N_xTxW
        uvs = np.asarray(
            [[np.linspace(-1, 1, obs.shape[2]) for output in outputs]
             for outputs in outputss])

        states = np.concatenate([poss, dirs], axis=2)
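        # N_xTx4: each state is the (x, y) position concatenated with the
        # direction vector.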

        N_ = obs.shape[0]
        W = obs.shape[2]

        # The 80/20 train/test split is disabled here: all traces go to training.
        N_split = N_
        train_uvs, train_obs, train_acs, train_states = transform(
            uvs, obs, acs, states)
        #test_uvs, test_obs, test_acs, test_states = transform(uvs[N_split:], obs[N_split:], acs[N_split:],
        #                                                      states[N_split:])

        # Action Values.

        # Train the ACPNN on these traces.
        self.acpnn.fit(train_acs,
                       train_uvs,
                       train_obs,
                       epochs=self.pnn_epochs,
                       qvals=dbgQ.transpose([1, 0, 2]))

        # TxNxC
        preds = self.acpnn.predict(train_acs, train_uvs)
        # TxNxC
        actuals = train_obs

        # TxN_xWxC
        preds = preds.reshape([preds.shape[0], N_, W, preds.shape[2]])
        actuals = actuals.reshape([actuals.shape[0], N_, W, actuals.shape[2]])

        # TxN_xR
        trR = np.tanh(
            self.reward_multiplier *
            np.sum(np.sum(np.square(preds - actuals), axis=3), axis=2))
        trR = trR.reshape((trR.shape + (1, )))
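        # Reward is the tanh-squashed, reward_multiplier-scaled sum of squared
        # prediction errors over the window and channels, so poorly predicted
        # observations give rewards near 1 (presumably a curiosity-style signal).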

        rewards = trR

        # We have TxN_xS state vector now
        trS = self.acpnn.next_state(trA)
        attns = self.acpnn.predict_attention(train_acs, train_uvs)

        attns = attns.reshape([attns.shape[0], N_, W, attns.shape[2]])

        # Leave the first one to make S prime.
        # T-1xN_xS
        trS_ = trS[1:, :, :]
        # Leave the last one
        # T-1xN_xS
        trS = trS[:-1, :, :]
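        # trS[t] and trS_[t] are now consecutive states, giving aligned (S, S')
        # pairs for the DQN update below.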

        # Get a random shuffle index.
        shuf = np.arange(trS.shape[0] * trS.shape[1])
        np.random.shuffle(shuf)
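        # Flattening over (time, episode) and applying one shared permutation to
        # S, A, R and S' decorrelates transitions from the same episode before the
        # DQN update (a lightweight stand-in for sampling from a replay buffer).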

        # Flatten and Shuffle all tensors.
        # TODO: Check that the index for trA and trR are correct.
        # (N*)xA
        trA = trA[1:, :, :].reshape(
            ((trA.shape[0] - 1) * trA.shape[1], trA.shape[2]))[shuf, :]
        # (N*)x(A-1)
        trA = cropLast(trA)
        # (N*)xS
        trS = trS.reshape((trS.shape[0] * trS.shape[1], trS.shape[2]))[shuf, :]
        # (N*)xS
        trS_ = trS_.reshape(
            (trS_.shape[0] * trS_.shape[1], trS_.shape[2]))[shuf, :]
        # (N*)xR
        trR = trR[1:, :, :].reshape(
            ((trR.shape[0] - 1) * trR.shape[1], trR.shape[2]))[shuf, :]
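        # trS, trA, trR and trS_ now share the same row count and the same shuf
        # permutation, so row i of each tensor describes the same transition.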

        # Train DQN
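        # Only the first self.buffer_samples shuffled transitions are used for
        # this update.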
        self.dqn.fit(trS[:self.buffer_samples],
                     trA[:self.buffer_samples],
                     trR[:self.buffer_samples, 0],
                     trS_[:self.buffer_samples],
                     epochs=self.dqn_epochs)

        plotter = PyGame2D()
        plotter.draw_path_cmp(self.envs[0].env, outputss[0],
                              preds.transpose([1, 0, 2, 3])[0],
                              attns.transpose([1, 0, 2, 3])[0],
                              rewards.transpose([1, 0, 2])[0])