Example #1
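
            # Estimate the TD error for this transition; Q_out holds the online
            # network's Q-values for the current state.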
            loss = reward - np.amax(Q_out)

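            # For non-terminal transitions, add the discounted bootstrap term
            # from the next state (Double DQN target).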
            if not done:
                new_state = prepare_state(new_state)

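                # The online network Q selects the next action, while the
                # target network Q_targ evaluates it.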
                Q_out = Q(FloatTensor(new_state)).detach().cpu().numpy()[0]
                Q_targ_out = Q_targ(
                    FloatTensor(new_state)).detach().cpu().numpy()[0]

                loss += gamma * Q_targ_out[np.argmax(Q_out)]

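            # Keep the magnitude of the TD error; it is stored with the
            # transition as its replay priority.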
            loss = abs(loss)

            replay_mem.add_element((curr_state, action, reward, new_state),
                                   loss)

            curr_state = new_state

            #
            # Learning
            #

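            # Sample a minibatch of (state, action, reward, next_state) tuples
            # from the (prioritized) replay memory.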
            sarses = replay_mem.get_batch(batch_size)

            # Build the learning targets (Q_true) for the sampled batch

            Q_true = []
