Example #1
    critic = CriticNet(state_dim, action_dim, HIDDEN1_UNITS, HIDDEN2_UNITS,
                       HIDDEN2_UNITS, action_dim)
    buff = Memory(BUFFER_SIZE, 9)  # each row stores state (3) + action (1) + reward (1) + next state (3) + done (1)
    step = 0
    reward_result = []

    for i in range(MAX_EPISODES):

        s_t = env.reset()
        s_t = np.reshape(s_t, (1, 3))[0]
        total_reward = 0.
        for j in range(MAX_EP_STEPS):
            loss = 0
            if RENDER_ENV:
                env.render()
            # deterministic policy action plus exploration noise (ou appears to be an
            # Ornstein-Uhlenbeck noise process)
            a_t = actor.predict(s_t, ACTION_BOUND, target=False)
            action = a_t + ou.sample(a_t[0])
            s_t_1, r_t, done, info = env.step(action)
            buff.store(s_t, a_t[0], r_t, np.reshape(s_t_1, (1, 3))[0], [done])
            if buff.t_memory > MINIBATCH_SIZE:
                # sample a minibatch; each flat row packs (state, action, reward, next state, done)
                batch = buff.sample(MINIBATCH_SIZE)
                states_t = batch[:, 0:3]
                actions = batch[:, 3]
                rewards = batch[:, 4]
                b_s_ = batch[:, 5:8]
                dones = batch[:, -1]
                y = np.zeros((len(batch), 1))
                # target actor and target critic give the bootstrapped Q value for the next states
                a_tgt = actor.predict(b_s_, ACTION_BOUND, target=True)
                Q_tgt = critic.predict(b_s_, a_tgt, target=True)
                # use a separate index so the episode counter i is not shadowed
                for k in range(len(batch)):
                    if dones[k]:
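                        # snippet truncated in the source; assuming GAMMA is the discount
                        # factor defined with the other hyperparameters, the standard DDPG
                        # target would be:
                        y[k] = rewards[k]
                    else:
                        y[k] = rewards[k] + GAMMA * Q_tgt[k]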
Example #2
    for times in range(max_time):

        step = 0

        x = np.linspace(1, MAX_EPISODES, MAX_EPISODES)

        for ii in range(MAX_EPISODES):
            s_t = env.reset()
            total_reward = 0.
            count = 0
            for j in range(MAX_EP_STEPS):
                loss = 0
                loss2 = 0
                if RENDER_ENV:
                    env.render()
                # policy action plus exploration noise that decays over episodes (ii) and steps (j)
                a_t = actor.predict(np.reshape(s_t, (1, 3)), ACTION_BOUND, target=False) + 1. / (1. + ii + j)
                s_t_1, r_t, done, info = env.step(a_t[0])
                buff.add(s_t, a_t[0], r_t, s_t_1, done)
                if buff.count() > MINIBATCH_SIZE:
                    batch = buff.getBatch(MINIBATCH_SIZE)
                    states_t = np.asarray([e[0] for e in batch])
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([e[2] for e in batch])
                    states_t_1 = np.asarray([e[3] for e in batch])
                    dones = np.asarray([e[4] for e in batch])
                    y = np.zeros((len(batch), action_dim))
                    a_tgt = actor.predict(states_t_1, ACTION_BOUND, target=True)
                    Q_tgt = critic.predict(states_t_1, a_tgt, target=True)

                    for i in range(len(batch)):
                        if dones[i]:
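                            # snippet truncated in the source; assuming GAMMA is the
                            # discount factor, the usual DDPG target would be:
                            y[i] = rewards[i]
                        else:
                            y[i] = rewards[i] + GAMMA * Q_tgt[i]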