Example #1
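
# DQN training loop on the bit-flip environment with a dense shaped reward
# (negative distance to the target bit string). The N_* constants, SDIM, ADIM,
# N_ACTION and the DQN, ReplayMemory and BitFlipEnv classes are defined
# elsewhere in the source module.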
import numpy as np


def train_job(prci, ret):
    save_reward = np.zeros((N_TIMES, N_EP))

    dqn = DQN(SDIM, N_ACTION, ADIM)
    mem = ReplayMemory(SDIM, ADIM, N_MEM)

    env = BitFlipEnv(N_BITS)

    for t in range(N_TIMES):
        dqn.reset()
        mem.reset()
        for ep in range(N_EP):
            current, target = env.reset()
            state = np.hstack([current, target])
            ep_reward = 0
            for step in range(N_STEP):
                action = dqn.choose_action(state)
                state_next, reward, done, _ = env.step(action)

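                # The environment returns (current, target); concatenate them into
                # a single observation vector.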
                current, target = state_next
                state_next = np.hstack([current, target])

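                # Shaped reward: negative distance between the current and target
                # bit strings (overrides the reward returned by the environment).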
                reward = -abs(current - target).sum()
                ep_reward += reward

                mem.store_transition(state, state_next, action, reward)

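                # Start gradient updates only after the warm-up episodes.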
                if ep >= N_WARMUP_EP:
                    dqn.learn(mem, N_BATCH)

                state = state_next

                if done:
                    break

            print("P: {p} T: {t}/{tt} E: {ep} S: {step} R: {reward}".format(
                p=prci, t=t, tt=N_TIMES, ep=ep, step=step, reward=ep_reward))
            save_reward[t, ep] = int(done) / (step + 1)
    ret[prci] = save_reward

Example #2

import numpy as np
from collections import namedtuple

N_MEM = 3000000
N_BATCH = 32

SDIM = N_BITS * 2
ADIM = 1
N_ACTION = N_BITS

save_reward = np.zeros((N_TIMES, N_EP))

EpMemory = namedtuple('EpMemory', ['state', 'state_next', 'action', 'reward'])

dqn = DQN(SDIM, N_ACTION, ADIM)
mem = ReplayMemory(SDIM, ADIM, N_MEM)

env = BitFlipEnv(N_BITS)

for t in range(N_TIMES):
    dqn.reset()
    mem.reset()
    for ep in range(N_EP):
        current, target = env.reset()
        state = np.hstack([current, target])
        ep_mem = []
        for step in range(N_STEP):
            action = dqn.choose_action(state)
            state_next, reward, done, _ = env.step(action)

            current, target = state_next
            state_next = np.hstack([current, target])
Example #3
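
# DQN training loop with hindsight goal relabelling (as in Hindsight Experience
# Replay): each episode is buffered, stored with its real goal, and, if the goal
# was not reached, stored again with the final achieved state as a substitute
# goal. Constants and the DQN, ReplayMemory and BitFlipEnv classes are defined
# elsewhere in the source module.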
import numpy as np
from collections import namedtuple


def train_job(prci, ret):
    save_reward = np.zeros((N_TIMES, N_EP))

    EpMemory = namedtuple('EpMemory',
                          ['state', 'state_next', 'action', 'reward'])

    dqn = DQN(SDIM, N_ACTION, ADIM)
    mem = ReplayMemory(SDIM, ADIM, N_MEM)

    env = BitFlipEnv(N_BITS)

    for t in range(N_TIMES):
        dqn.reset()
        mem.reset()
        for ep in range(N_EP):
            current, target = env.reset()
            state = np.hstack([current, target])
            ep_mem = []
            for step in range(N_STEP):
                action = dqn.choose_action(state)
                state_next, reward, done, _ = env.step(action)

                current, target = state_next
                state_next = np.hstack([current, target])

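                # Buffer the transition; it is written to the replay memory only
                # after the episode finishes.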
                ep_mem.append(
                    EpMemory(state=state,
                             state_next=state_next,
                             action=action,
                             reward=reward))

                if ep >= N_WARMUP_EP:
                    dqn.learn(mem, N_BATCH)

                state = state_next

                if done:
                    break

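            # Replay the finished episode into memory with its original goal and
            # accumulate the episode reward.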
            ep_reward = 0
            for e in ep_mem:
                mem.store_transition(e.state, e.state_next, e.action, e.reward)
                ep_reward += e.reward
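
            # Hindsight relabelling: if the real goal was not reached, replay the
            # episode with the achieved final state substituted as the goal.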
            if not done:
                # Use the bit pattern actually reached at the end of the episode
                # as the substitute goal.
                fake_target = ep_mem[-1].state_next[:N_BITS]
                for i, e in enumerate(ep_mem):
                    state = e.state.copy()
                    state_next = e.state_next.copy()
                    state[N_BITS:] = fake_target
                    state_next[N_BITS:] = fake_target
                    reward = e.reward
                    if i == step:
                        # The last transition reaches the substitute goal, so it
                        # receives the success bonus.
                        reward += 1
                    mem.store_transition(state, state_next, e.action, reward)

            print("P: {p} T: {t}/{tt} E: {ep} S: {step} R: {reward}".format(
                p=prci,
                t=t,
                tt=N_TIMES,
                ep=ep,
                step=step,
                reward=ep_reward / (step + 1)))
            save_reward[t, ep] = int(done) / (step + 1)
    ret[prci] = save_reward