def train_job(prci, ret):
    save_reward = np.zeros((N_TIMES, N_EP))
    dqn = DQN(SDIM, N_ACTION, ADIM)
    mem = ReplayMemory(SDIM, ADIM, N_MEM)
    env = BitFlipEnv(N_BITS)
    for t in range(N_TIMES):
        dqn.reset()
        mem.reset()
        for ep in range(N_EP):
            current, target = env.reset()
            state = np.hstack([current, target])
            ep_reward = 0
            for step in range(N_STEP):
                action = dqn.choose_action(state)
                state_next, reward, done, _ = env.step(action)
                current, target = state_next
                state_next = np.hstack([current, target])
                # Replace the environment reward with a dense shaped reward:
                # the negative Hamming distance between current and goal bits.
                reward = -abs(current - target).sum()
                ep_reward += reward
                mem.store_transition(state, state_next, action, reward)
                if ep >= N_WARMUP_EP:
                    dqn.learn(mem, N_BATCH)
                state = state_next
                if done:
                    break
            print("P: {p} T: {t}/{tt} E: {ep} S: {step} R: {reward}".format(
                p=prci, t=t, tt=N_TIMES, ep=ep, step=step, reward=ep_reward))
            # 1 / (steps taken) if the goal was reached, 0 otherwise.
            save_reward[t, ep] = int(done) / (step + 1)
    ret[prci] = save_reward
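In this baseline the environment's own reward is discarded and replaced by a dense shaped reward: the negative Hamming distance between the current bits and the goal bits. A small worked example of that expression:

import numpy as np

current = np.array([1, 0, 1, 1])
target = np.array([1, 1, 0, 1])
reward = -abs(current - target).sum()  # two bits differ, so reward == -2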
N_MEM = 3000000      # replay buffer capacity
N_BATCH = 32         # minibatch size for each learning step
SDIM = N_BITS * 2    # state = current bits concatenated with the goal bits
ADIM = 1             # a single discrete action index
N_ACTION = N_BITS    # one action per bit that can be flipped
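Both training loops assume a BitFlipEnv whose reset() returns a (current, target) pair of bit vectors and whose step(action) flips one bit, then returns the new (current, target) pair, a reward, a done flag, and an info value. A minimal sketch of that interface, with the sparse 0/1 reward as an assumption (the class actually used in the experiments may differ), could look like this:

import numpy as np

class BitFlipEnv:
    """Minimal sketch of the assumed interface: each action flips one bit."""

    def __init__(self, n_bits):
        self.n_bits = n_bits

    def reset(self):
        # Draw a random start state and a random goal.
        self.current = np.random.randint(0, 2, self.n_bits)
        self.target = np.random.randint(0, 2, self.n_bits)
        return self.current.copy(), self.target.copy()

    def step(self, action):
        # Flip the selected bit; the episode succeeds when all bits match the goal.
        self.current[int(action)] ^= 1
        done = bool((self.current == self.target).all())
        reward = 1.0 if done else 0.0  # assumed sparse reward
        return (self.current.copy(), self.target.copy()), reward, done, {}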
def train_job(prci, ret):
    save_reward = np.zeros((N_TIMES, N_EP))
    EpMemory = namedtuple('EpMemory', ['state', 'state_next', 'action', 'reward'])
    dqn = DQN(SDIM, N_ACTION, ADIM)
    mem = ReplayMemory(SDIM, ADIM, N_MEM)
    env = BitFlipEnv(N_BITS)
    for t in range(N_TIMES):
        dqn.reset()
        mem.reset()
        for ep in range(N_EP):
            current, target = env.reset()
            state = np.hstack([current, target])
            ep_mem = []
            for step in range(N_STEP):
                action = dqn.choose_action(state)
                state_next, reward, done, _ = env.step(action)
                current, target = state_next
                state_next = np.hstack([current, target])
                # Buffer the transition; it is written to the replay memory
                # only after the episode ends.
                ep_mem.append(EpMemory(state=state, state_next=state_next,
                                       action=action, reward=reward))
                if ep >= N_WARMUP_EP:
                    dqn.learn(mem, N_BATCH)
                state = state_next
                if done:
                    break
            # Store the episode as experienced, with the real goal.
            ep_reward = 0
            for e in ep_mem:
                mem.store_transition(e.state, e.state_next, e.action, e.reward)
                ep_reward += e.reward
            # Hindsight replay: if the goal was missed, relabel the episode
            # with the final achieved bits as a fake goal, so the last step
            # becomes a success.
            if not done:
                fake_target = ep_mem[-1].state_next[:N_BITS]
                for i in range(len(ep_mem)):
                    e = ep_mem[i]
                    state = e.state.copy()
                    state_next = e.state_next.copy()
                    state[N_BITS:] = fake_target
                    state_next[N_BITS:] = fake_target
                    reward = e.reward
                    if i == step:
                        # The final transition reaches the fake goal.
                        reward += 1
                    mem.store_transition(state, state_next, e.action, reward)
            print("P: {p} T: {t}/{tt} E: {ep} S: {step} R: {reward}".format(
                p=prci, t=t, tt=N_TIMES, ep=ep, step=step,
                reward=ep_reward / (step + 1)))
            # 1 / (steps taken) if the real goal was reached, 0 otherwise.
            save_reward[t, ep] = int(done) / (step + 1)
    ret[prci] = save_reward
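The prci / ret arguments suggest each train_job runs as a separate worker process, with ret being a shared mapping keyed by the process index that collects each worker's (N_TIMES, N_EP) result array. A minimal launch sketch under that assumption (the worker count N_PROC is hypothetical):

import multiprocessing as mp

if __name__ == '__main__':
    N_PROC = 4  # hypothetical number of parallel training jobs
    with mp.Manager() as manager:
        ret = manager.dict()  # shared dict: process index -> reward matrix
        jobs = [mp.Process(target=train_job, args=(i, ret)) for i in range(N_PROC)]
        for job in jobs:
            job.start()
        for job in jobs:
            job.join()
        all_rewards = dict(ret)  # copy results out before the manager shuts down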