Example #1
def train_job(prci, ret):
    save_reward = np.zeros((N_TIMES, N_EP))

    dqn = DQN(SDIM, N_ACTION, ADIM)
    mem = ReplayMemory(SDIM, ADIM, N_MEM)

    env = BitFlipEnv(N_BITS)

    for t in range(N_TIMES):
        dqn.reset()
        mem.reset()
        for ep in range(N_EP):
            current, target = env.reset()
            state = np.hstack([current, target])
            ep_reward = 0
            for step in range(N_STEP):
                action = dqn.choose_action(state)
                state_next, reward, done, _ = env.step(action)

                current, target = state_next
                state_next = np.hstack([current, target])

                # Dense reward: negative Hamming distance between current and target bits.
                reward = -abs(current - target).sum()
                ep_reward += reward

                mem.store_transition(state, state_next, action, reward)

                if ep >= N_WARMUP_EP:
                    dqn.learn(mem, N_BATCH)

                state = state_next

                if done:
                    break

            print("P: {p} T: {t}/{tt} E: {ep} S: {step} R: {reward}".format(
                p=prci, t=t, tt=N_TIMES, ep=ep, step=step, reward=ep_reward))
            save_reward[t, ep] = int(done) / (step + 1)
    ret[prci] = save_reward
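Example #1 trains a DQN on the bit-flip task with a dense reward, but the BitFlipEnv class it relies on is not part of the listing. Below is a minimal sketch of the interface the code appears to assume: reset() returns a (current, target) pair of bit arrays, and step(action) flips one bit and reports whether the target was reached. The class body is a hypothetical illustration, not the project's actual implementation.

import numpy as np

# Hypothetical sketch of the environment interface assumed by Example #1;
# the real BitFlipEnv used by the listing may differ.
class BitFlipEnv:
    def __init__(self, n_bits):
        self.n_bits = n_bits

    def reset(self):
        # Random start and goal bit strings.
        self.current = np.random.randint(0, 2, self.n_bits)
        self.target = np.random.randint(0, 2, self.n_bits)
        return self.current.copy(), self.target.copy()

    def step(self, action):
        # Flip the chosen bit; the episode ends when current matches target.
        self.current[int(action)] ^= 1
        done = bool((self.current == self.target).all())
        reward = 0.0 if done else -1.0  # sparse reward; Example #1 overrides it anyway
        return (self.current.copy(), self.target.copy()), reward, done, {}
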
Example #2
N_EP = 10000
N_STEP = N_BITS * 4
N_WARMUP_EP = 5

N_MEM = 3000000
N_BATCH = 32

SDIM = N_BITS * 2
ADIM = 1
N_ACTION = N_BITS

save_reward = np.zeros((N_TIMES, N_EP))

EpMemory = namedtuple('EpMemory', ['state', 'state_next', 'action', 'reward'])

dqn = DQN(SDIM, N_ACTION, ADIM)
mem = ReplayMemory(SDIM, ADIM, N_MEM)

env = BitFlipEnv(N_BITS)

for t in range(N_TIMES):
    dqn.reset()
    mem.reset()
    for ep in range(N_EP):
        current, target = env.reset()
        state = np.hstack([current, target])
        ep_mem = []
        for step in range(N_STEP):
            action = dqn.choose_action(state)
            state_next, reward, done, _ = env.step(action)
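Examples #1, #2, and #4 all write transitions into a ReplayMemory(SDIM, ADIM, N_MEM) object through store_transition(state, state_next, action, reward) and clear it with reset(), but the class itself is not shown. The sketch below is a hypothetical flat ring buffer that matches only those calls; the internal layout and the sample() method assumed to back dqn.learn are guesses, not the project's code.

import numpy as np

# Hypothetical ring-buffer replay memory matching the calls made in these
# examples; the actual ReplayMemory class may be implemented differently.
class ReplayMemory:
    def __init__(self, s_dim, a_dim, capacity):
        self.capacity = capacity
        self.buf = np.zeros((capacity, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.count = 0

    def reset(self):
        self.count = 0

    def store_transition(self, state, state_next, action, reward):
        # Overwrite the oldest slot once the buffer is full.
        idx = self.count % self.capacity
        self.buf[idx] = np.hstack([state, state_next, np.ravel(action), [reward]])
        self.count += 1

    def sample(self, batch_size):
        # Uniform random minibatch over the filled part of the buffer.
        high = min(self.count, self.capacity)
        idx = np.random.randint(0, high, batch_size)
        return self.buf[idx]
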
Example #3
N_MEM = 30000
N_BATCH = 32

s_dim = 4
a_dim = 1
a_val = 2

now = datetime.datetime.now()
LOG_PATH = 'logs/R_' + os.path.basename(__file__) + now.strftime(
    '_%Y_%m_%d_%H_%M_%S')
os.mkdir(LOG_PATH)
LOG_FILE = LOG_PATH + '/train_.log'
log_fp = open(LOG_FILE, 'w')
print(LOG_FILE)

dqn = DQN(s_dim, a_val, a_dim)
mem = ReplayMemory(s_dim, a_dim, N_MEM)

total_step = 0
train_step = 0
for ep in range(N_EP):
    ep_reward = 0
    step = 0
    state = env.reset()
    while True:
        # env.render()
        action = dqn.choose_action(state)
        state_next, reward, done, _ = env.step(action)

        xpos, xvel, theta, thetavel = state_next
        r1 = (env.x_threshold - abs(xpos)) / env.x_threshold - 0.5
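Example #3's listing is cut off right after the first reward-shaping term r1, which rewards keeping the cart near the centre of the track. In CartPole loops of this shape the shaping usually continues with a matching pole-angle term before the transition is stored; the continuation below is an assumption based on that common pattern and on gym's standard theta_threshold_radians attribute, not the original code.

        # Hypothetical continuation of the loop body (an assumption, not the original
        # listing): a pole-angle term mirrors the cart-position term r1, and the
        # shaped transition is then stored and learned from.
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2

        mem.store_transition(state, state_next, action, reward)
        dqn.learn(mem, N_BATCH)  # typically gated on a warm-up condition in practice

        state = state_next
        ep_reward += reward
        step += 1
        total_step += 1
        if done:
            break
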
Example #4
def train_job(prci, ret):
    save_reward = np.zeros((N_TIMES, N_EP))

    EpMemory = namedtuple('EpMemory',
                          ['state', 'state_next', 'action', 'reward'])

    dqn = DQN(SDIM, N_ACTION, ADIM)
    mem = ReplayMemory(SDIM, ADIM, N_MEM)

    env = BitFlipEnv(N_BITS)

    for t in range(N_TIMES):
        dqn.reset()
        mem.reset()
        for ep in range(N_EP):
            current, target = env.reset()
            state = np.hstack([current, target])
            ep_mem = []
            for step in range(N_STEP):
                action = dqn.choose_action(state)
                state_next, reward, done, _ = env.step(action)

                current, target = state_next
                state_next = np.hstack([current, target])

                ep_mem.append(
                    EpMemory(state=state,
                             state_next=state_next,
                             action=action,
                             reward=reward))

                if ep >= N_WARMUP_EP:
                    dqn.learn(mem, N_BATCH)

                state = state_next

                if done:
                    break

            ep_reward = 0
            for e in ep_mem:
                mem.store_transition(e.state, e.state_next, e.action, e.reward)
                ep_reward += e.reward
            if not done:
                # Hindsight relabeling: replay the failed episode with the finally
                # reached bit string substituted as the goal; the last transition
                # then receives a +1 success bonus below.
                fake_target = ep_mem[-1].state_next[:N_BITS]
                for i in range(len(ep_mem)):
                    e = ep_mem[i]
                    state = e.state.copy()
                    state_next = e.state_next.copy()
                    state[N_BITS:] = fake_target
                    state_next[N_BITS:] = fake_target
                    reward = e.reward
                    if (i == step):
                        reward += 1
                    mem.store_transition(state, state_next, e.action, reward)

            print("P: {p} T: {t}/{tt} E: {ep} S: {step} R: {reward}".format(
                p=prci,
                t=t,
                tt=N_TIMES,
                ep=ep,
                step=step,
                reward=ep_reward / (step + 1)))
            save_reward[t, ep] = int(done) / (step + 1)
    ret[prci] = save_reward
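Example #4 differs from Example #1 in how failed episodes are stored: the whole episode is buffered in ep_mem, written to memory once against the real target, and, if the target was never reached, written a second time with the bit string actually reached at the end substituted as the goal plus a +1 bonus on the final step (a hindsight-style relabeling). The standalone snippet below replays that relabeling on a toy two-step episode; the 4-bit size and the concrete numbers are made up for the illustration.

import numpy as np
from collections import namedtuple

# Toy data made up for illustration only; state = [current bits | goal bits].
N_BITS = 4
EpMemory = namedtuple('EpMemory', ['state', 'state_next', 'action', 'reward'])
ep_mem = [
    EpMemory(state=np.array([0, 0, 0, 0, 1, 1, 1, 1]),
             state_next=np.array([1, 0, 0, 0, 1, 1, 1, 1]),
             action=0, reward=-1.0),
    EpMemory(state=np.array([1, 0, 0, 0, 1, 1, 1, 1]),
             state_next=np.array([1, 1, 0, 0, 1, 1, 1, 1]),
             action=1, reward=-1.0),
]

# Hindsight relabel: pretend the finally reached bits were the goal all along.
fake_target = ep_mem[-1].state_next[:N_BITS]
relabeled = []
for i, e in enumerate(ep_mem):
    state = e.state.copy()
    state_next = e.state_next.copy()
    state[N_BITS:] = fake_target
    state_next[N_BITS:] = fake_target
    reward = e.reward + (1 if i == len(ep_mem) - 1 else 0)
    relabeled.append(EpMemory(state, state_next, e.action, reward))

print(relabeled[-1].reward)  # -1.0 plus the success bonus -> 0.0
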
Example #5
board = candy_crush_board.CandyCrushBoard(config_file='../config/train/config1.txt')

def get_state(board):
  raw_state = board.get_numpy_board()
  raw_state = np.ascontiguousarray(raw_state, dtype=np.float32)
  raw_state = torch.from_numpy(raw_state)
  return raw_state.unsqueeze(0).to(device)

init_screen = get_state(board)
_, _, screen_height, screen_width = init_screen.shape

actions = board.get_actions()
n_actions = len(actions)

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)


steps_done = 0

def select_action(state):
  global steps_done
  sample = random.random()
  # Anneal epsilon exponentially from EPS_START toward EPS_END over EPS_DECAY steps.
  eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
  steps_done += 1
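The listing for Example #5 stops inside select_action. A sketch of how an epsilon-greedy selection of this shape is commonly finished is shown below; the body is an assumption modeled on the usual PyTorch DQN pattern (exploit policy_net when the sample exceeds eps_threshold, otherwise pick a random action index), not the original code.

  # Hypothetical continuation of select_action (an assumption, not the original listing):
  if sample > eps_threshold:
    with torch.no_grad():
      # Exploit: action with the highest predicted Q-value for this state.
      return policy_net(state).max(1)[1].view(1, 1)
  else:
    # Explore: uniformly random action index.
    return torch.tensor([[random.randrange(n_actions)]],
                        device=device, dtype=torch.long)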