def train_job(prci, ret):
    # Baseline DQN trainer for the bit-flip task: the environment reward is
    # overwritten with a dense shaped reward (negative Hamming distance to the goal).
    save_reward = np.zeros((N_TIMES, N_EP))
    dqn = DQN(SDIM, N_ACTION, ADIM)
    mem = ReplayMemory(SDIM, ADIM, N_MEM)
    env = BitFlipEnv(N_BITS)
    for t in range(N_TIMES):
        dqn.reset()
        mem.reset()
        for ep in range(N_EP):
            current, target = env.reset()
            state = np.hstack([current, target])
            ep_reward = 0
            for step in range(N_STEP):
                action = dqn.choose_action(state)
                state_next, reward, done, _ = env.step(action)
                current, target = state_next
                state_next = np.hstack([current, target])
                # Dense shaped reward: negative number of mismatched bits.
                reward = -abs(current - target).sum()
                ep_reward += reward
                mem.store_transition(state, state_next, action, reward)
                if ep >= N_WARMUP_EP:
                    dqn.learn(mem, N_BATCH)
                state = state_next
                if done:
                    break
            print("P: {p} T: {t}/{tt} E: {ep} S: {step} R: {reward}".format(
                p=prci, t=t, tt=N_TIMES, ep=ep, step=step, reward=ep_reward))
            # Success indicator scaled by episode length: 1/steps if solved, else 0.
            save_reward[t, ep] = int(done) / (step + 1)
    ret[prci] = save_reward
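# train_job assumes a BitFlipEnv whose reset() returns the current bit vector
# together with a goal bit vector, and whose step(action) flips one bit. The
# environment class itself is not shown in this excerpt; the following is a
# minimal sketch of such an interface (sparse reward, done when the goal is
# matched), not the repository's actual implementation.
import numpy as np

class BitFlipEnv:
    def __init__(self, n_bits):
        self.n_bits = n_bits

    def reset(self):
        # Random start state and random goal, returned as separate arrays.
        self.current = np.random.randint(0, 2, self.n_bits)
        self.target = np.random.randint(0, 2, self.n_bits)
        return self.current.copy(), self.target.copy()

    def step(self, action):
        # Flip the selected bit; the episode succeeds when all bits match the goal.
        self.current[int(action)] ^= 1
        done = bool((self.current == self.target).all())
        reward = 1.0 if done else 0.0  # sparse reward; train_job above replaces it with a dense one
        return (self.current.copy(), self.target.copy()), reward, done, {}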
# Hyperparameters for the bit-flip experiments (N_BITS and N_TIMES are
# defined elsewhere in the full script).
N_EP = 10000
N_STEP = N_BITS * 4
N_WARMUP_EP = 5
N_MEM = 3000000
N_BATCH = 32
SDIM = N_BITS * 2          # state = current bits concatenated with goal bits
ADIM = 1
N_ACTION = N_BITS          # one action per bit that can be flipped

save_reward = np.zeros((N_TIMES, N_EP))
EpMemory = namedtuple('EpMemory', ['state', 'state_next', 'action', 'reward'])
dqn = DQN(SDIM, N_ACTION, ADIM)
mem = ReplayMemory(SDIM, ADIM, N_MEM)
env = BitFlipEnv(N_BITS)
for t in range(N_TIMES):
    dqn.reset()
    mem.reset()
    for ep in range(N_EP):
        current, target = env.reset()
        state = np.hstack([current, target])
        ep_mem = []
        for step in range(N_STEP):
            action = dqn.choose_action(state)
            state_next, reward, done, _ = env.step(action)
import datetime
import os

N_MEM = 30000
N_BATCH = 32
s_dim = 4    # CartPole observation: cart position, cart velocity, pole angle, pole angular velocity
a_dim = 1
a_val = 2    # two discrete actions: push left / push right

# Per-run log directory named after this script and the current timestamp.
now = datetime.datetime.now()
LOG_PATH = 'logs/R_' + os.path.basename(__file__) + now.strftime('_%Y_%m_%d_%H_%M_%S')
os.mkdir(LOG_PATH)
LOG_FILE = LOG_PATH + '/train_.log'
log_fp = open(LOG_FILE, 'w')
print(LOG_FILE)

dqn = DQN(s_dim, a_val, a_dim)
mem = ReplayMemory(s_dim, a_dim, N_MEM)

total_step = 0
train_step = 0
# N_EP and env (a Gym CartPole instance) are assumed to be defined earlier in
# the full script; a sketch of the env setup follows this excerpt.
for ep in range(N_EP):
    ep_reward = 0
    step = 0
    state = env.reset()
    while True:
        # env.render()
        action = dqn.choose_action(state)
        state_next, reward, done, _ = env.step(action)
        xpos, xvel, theta, thetavel = state_next
        # Shaped reward term: largest when the cart stays near the track centre.
        r1 = (env.x_threshold - abs(xpos)) / env.x_threshold - 0.5
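# env is used above but never created in this excerpt. A typical setup for this
# kind of CartPole script (an assumption, not shown in the source) uses the
# classic Gym environment, unwrapped so that attributes such as x_threshold and
# theta_threshold_radians are read from the raw environment and episodes are not
# cut off by the TimeLimit wrapper:
import gym

env = gym.make('CartPole-v0')
env = env.unwrapped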
def train_job(prci, ret):
    # HER-style DQN trainer for the bit-flip task: each episode is buffered,
    # stored with its real goal, and, on failure, stored again with the final
    # achieved state substituted as the goal.
    save_reward = np.zeros((N_TIMES, N_EP))
    EpMemory = namedtuple('EpMemory', ['state', 'state_next', 'action', 'reward'])
    dqn = DQN(SDIM, N_ACTION, ADIM)
    mem = ReplayMemory(SDIM, ADIM, N_MEM)
    env = BitFlipEnv(N_BITS)
    for t in range(N_TIMES):
        dqn.reset()
        mem.reset()
        for ep in range(N_EP):
            current, target = env.reset()
            state = np.hstack([current, target])
            ep_mem = []
            for step in range(N_STEP):
                action = dqn.choose_action(state)
                state_next, reward, done, _ = env.step(action)
                current, target = state_next
                state_next = np.hstack([current, target])
                ep_mem.append(EpMemory(state=state, state_next=state_next,
                                       action=action, reward=reward))
                if ep >= N_WARMUP_EP:
                    dqn.learn(mem, N_BATCH)
                state = state_next
                if done:
                    break
            # Store the episode as it was actually experienced.
            ep_reward = 0
            for e in ep_mem:
                mem.store_transition(e.state, e.state_next, e.action, e.reward)
                ep_reward += e.reward
            # Hindsight replay: if the goal was not reached, relabel the episode
            # with the final achieved bits as the goal.
            if not done:
                fake_target = ep_mem[-1].state_next[:N_BITS]
                for i in range(len(ep_mem)):
                    e = ep_mem[i]
                    state = e.state.copy()
                    state_next = e.state_next.copy()
                    state[N_BITS:] = fake_target
                    state_next[N_BITS:] = fake_target
                    reward = e.reward
                    if i == step:
                        reward += 1  # the relabelled goal is reached at the last transition
                    mem.store_transition(state, state_next, e.action, reward)
            print("P: {p} T: {t}/{tt} E: {ep} S: {step} R: {reward}".format(
                p=prci, t=t, tt=N_TIMES, ep=ep, step=step,
                reward=ep_reward / (step + 1)))
            save_reward[t, ep] = int(done) / (step + 1)
    ret[prci] = save_reward
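# Both train_job variants take a process index `prci` and write their result
# array into `ret[prci]`, which suggests they are meant to run as parallel
# workers. The launcher itself is not shown in the source; this is a minimal
# sketch assuming Python's multiprocessing with a Manager dict (N_PROC is a
# hypothetical name introduced here).
import multiprocessing as mp

if __name__ == '__main__':
    N_PROC = 4
    with mp.Manager() as manager:
        ret = manager.dict()
        workers = [mp.Process(target=train_job, args=(i, ret)) for i in range(N_PROC)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
        # Collect the per-process (N_TIMES, N_EP) reward arrays.
        results = {i: ret[i] for i in range(N_PROC)}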
board = candy_crush_board.CandyCrushBoard(config_file='../config/train/config1.txt')

def get_state(board):
    # Convert the numpy board into a float tensor, add a batch dimension, and
    # move it to the target device.
    raw_state = board.get_numpy_board()
    raw_state = np.ascontiguousarray(raw_state, dtype=np.float32)
    raw_state = torch.from_numpy(raw_state)
    return raw_state.unsqueeze(0).to(device)

init_screen = get_state(board)
_, _, screen_height, screen_width = init_screen.shape

actions = board.get_actions()
n_actions = len(actions)

# target_net starts as a frozen copy of policy_net (eval mode, no updates here).
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0

def select_action(state):
    global steps_done
    sample = random.random()
    # Exploration threshold decays exponentially with the number of steps taken.
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
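# ReplayMemory is constructed above with only a capacity, unlike the
# (state_dim, action_dim, size) buffer used by the bit-flip trainers. Its
# definition is not shown in the source; this is a minimal sketch of a typical
# transition buffer of that shape (uniform random sampling of namedtuple
# transitions), not the repository's actual class.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)  # oldest transitions are dropped first

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)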