def train(opt):
    """Train a DeepQNetwork agent to play Flappy Bird with deep Q-learning.

    Args:
        opt: parsed options; must provide image_size, log_path, saved_path,
            num_iters, batch_size, replay_memory_size, gamma,
            initial_epsilon and final_epsilon.

    Side effects: wipes and re-creates opt.log_path for TensorBoard logs,
    and saves the full model to opt.saved_path (periodically and at the end).
    """
    # Fixed seed for reproducibility on either device.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    game_state = FlappyBird()
    image, reward, terminal = game_state.next_frame(0)
    # Crop below base_y (ground area) before resizing to a square frame.
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    # Initial state: first frame stacked 4 times -> shape (1, 4, H, W).
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    replay_memory = []
    iteration = 0  # renamed from `iter`, which shadowed the builtin
    while iteration < opt.num_iters:
        prediction = model(state)[0]
        # Epsilon-greedy: epsilon anneals linearly from initial_epsilon
        # down to final_epsilon over num_iters.
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iteration) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
        else:
            action = torch.argmax(prediction)
        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        # Slide the 4-frame window: drop the oldest frame, append the newest.
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch)
        state_batch = torch.cat(tuple(state for state in state_batch))
        # One-hot encode actions so Q(s, a) falls out of a dot product.
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1]
                      for action in action_batch], dtype=np.float32))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state for state in next_state_batch))
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model(next_state_batch)
        # Bellman targets: r for terminal transitions, else
        # r + gamma * max_a' Q(s', a'). BUG FIX: the target must be
        # detached (the original left `y_batch.detach()` commented out),
        # otherwise the loss back-propagates through the target pass too.
        y_batch = torch.cat(
            tuple(reward if terminal else
                  reward + opt.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(
                      reward_batch, terminal_batch,
                      next_prediction_batch))).detach()
        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()
        state = next_state
        iteration += 1
        # BUG FIX: counter already incremented above, so log `iteration`
        # directly (the original printed iteration + 1 after incrementing,
        # skipping 1 and overshooting num_iters on the last step).
        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
            .format(iteration, opt.num_iters, action, loss, epsilon, reward,
                    torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iteration)
        writer.add_scalar('Train/Epsilon', epsilon, iteration)
        writer.add_scalar('Train/Reward', reward, iteration)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iteration)
        # Periodic checkpoint, label matches the number of completed updates.
        if iteration % 1000000 == 0:
            torch.save(model,
                       "{}/flappy_bird_{}".format(opt.saved_path, iteration))
    torch.save(model, "{}/flappy_bird".format(opt.saved_path))
def training(arguments):
    """Train a DeepQNetwork agent on the Flappyplayer environment.

    Args:
        arguments: parsed options; must provide image_size, log_path,
            saved_path, iters, batch_size, replay_mem, gamma,
            initial_epsilon and final_epsilon.

    Side effects: wipes and re-creates arguments.log_path for TensorBoard
    logs, and saves the full model to arguments.saved_path.
    """
    # Fixed seed for reproducibility on either device.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    if os.path.isdir(arguments.log_path):
        shutil.rmtree(arguments.log_path)
    os.makedirs(arguments.log_path)
    writer = SummaryWriter(arguments.log_path)
    optimiser = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    gameState = Flappyplayer()
    image, reward, terminal = gameState.next_frame(0)
    # Crop below base_y (ground area) before resizing to a square frame.
    image = pre_processing(image[:gameState.SCREENW, :int(gameState.base_y)],
                           arguments.image_size, arguments.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    # Initial state: first frame stacked 4 times -> shape (1, 4, H, W).
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    replay_mem = []
    iteration = 0  # renamed from `iter`, which shadowed the builtin
    while iteration < arguments.iters:
        prediction = model(state)[0]
        # Epsilon-greedy: epsilon anneals linearly from initial_epsilon
        # down to final_epsilon over the full run.
        epsilon = arguments.final_epsilon + (
            (arguments.iters - iteration) *
            (arguments.initial_epsilon - arguments.final_epsilon) /
            arguments.iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            # BUG FIX: was hard-coded `action = 1` (always flap), which is
            # not a random action at all. Sample uniformly over both actions.
            action = randint(0, 1)
            print(action)
        else:
            # BUG FIX: was hard-coded `action = 0`, which ignored the
            # network entirely — the learned policy was never used. Pick
            # the greedy action from the Q-value prediction instead.
            action = torch.argmax(prediction)
        nextImage, reward, terminal = gameState.next_frame(action)
        nextImage = pre_processing(
            nextImage[:gameState.SCREENW, :int(gameState.base_y)],
            arguments.image_size, arguments.image_size)
        nextImage = torch.from_numpy(nextImage)
        if torch.cuda.is_available():
            nextImage = nextImage.cuda()
        # Slide the 4-frame window: drop the oldest frame, append the newest.
        nextState = torch.cat((state[0, 1:, :, :], nextImage))[None, :, :, :]
        replay_mem.append([state, action, reward, nextState, terminal])
        if len(replay_mem) > arguments.replay_mem:
            del replay_mem[0]
        batch = sample(replay_mem, min(len(replay_mem), arguments.batch_size))
        stateBatch, actionBatch, rewardBatch, nextStateBatch, terminalBatch = zip(*batch)
        stateBatch = torch.cat(tuple(state for state in stateBatch))
        # One-hot encode actions so Q(s, a) falls out of a dot product.
        actionBatch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1]
                      for action in actionBatch], dtype=np.float32))
        rewardBatch = torch.from_numpy(
            np.array(rewardBatch, dtype=np.float32)[:, None])
        nextStateBatch = torch.cat(tuple(state for state in nextStateBatch))
        if torch.cuda.is_available():
            stateBatch = stateBatch.cuda()
            actionBatch = actionBatch.cuda()
            rewardBatch = rewardBatch.cuda()
            nextStateBatch = nextStateBatch.cuda()
        currentPredBatch = model(stateBatch)
        nextPredBatch = model(nextStateBatch)
        # Bellman targets: r for terminal transitions, else
        # r + gamma * max_a' Q(s', a'). BUG FIX: detach the target so the
        # loss does not back-propagate through the target forward pass.
        yBatch = torch.cat(
            tuple(reward if terminal else
                  reward + arguments.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(
                      rewardBatch, terminalBatch, nextPredBatch))).detach()
        qValue = torch.sum(currentPredBatch * actionBatch, dim=1)
        optimiser.zero_grad()
        loss = criterion(qValue, yBatch)
        loss.backward()
        optimiser.step()
        state = nextState
        iteration += 1
        # BUG FIX: counter already incremented above, so log `iteration`
        # directly (the original printed iteration + 1 after incrementing).
        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon: {}, Reward: {}, Q-Value: {}"
            .format(iteration, arguments.iters, action, loss, epsilon, reward,
                    torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iteration)
        writer.add_scalar('Train/Epsilon', epsilon, iteration)
        writer.add_scalar('Train/Reward', reward, iteration)
        writer.add_scalar('Train/Q-Value', torch.max(prediction), iteration)
        # Periodic checkpoint, label matches the number of completed updates.
        if iteration % 1000000 == 0:
            torch.save(model,
                       "{}/flappy_bird_{}".format(arguments.saved_path,
                                                  iteration))
    torch.save(model, "{}/flappy_bird".format(arguments.saved_path))
def train(opt):
    """Train a DeepQNetwork agent to play Tetris with deep Q-learning.

    Unlike a per-frame DQN, this loop evaluates Q-values over all candidate
    placements returned by ``env.get_next_states()`` and performs one
    gradient update only when an episode ends (``done``) and the replay
    buffer holds at least replay_memory_size / 10 transitions.

    Args:
        opt: parsed options; must provide width, height, block_size, lr,
            log_path, saved_path, num_epochs, num_decay_epochs, batch_size,
            replay_memory_size, gamma, initial_epsilon, final_epsilon and
            save_interval.

    Side effects: wipes and re-creates opt.log_path for TensorBoard logs,
    renders the game window (``render=True``), and saves the full model to
    opt.saved_path.
    """
    # Fixed seed for reproducibility on either device.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()
    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()
    # Bounded replay buffer: old transitions fall off automatically.
    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        # All reachable (action -> resulting board features) pairs.
        next_steps = env.get_next_states()
        # Exploration or exploitation: epsilon decays linearly over
        # num_decay_epochs, then stays at final_epsilon (max(..., 0)).
        epsilon = opt.final_epsilon + (
            max(opt.num_decay_epochs - epoch, 0) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        # Score every candidate placement without tracking gradients;
        # eval()/train() toggling keeps any norm/dropout layers consistent.
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()
        next_state = next_states[index, :]
        action = next_actions[index]
        reward, done = env.step(action, render=True)
        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            # Episode over: record final stats for logging, then reset.
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            # Mid-episode: keep playing; training happens only on `done`.
            state = next_state
            continue
        # Warm-up: skip training until the buffer is 10% full.
        if len(replay_memory) < opt.replay_memory_size / 10:
            continue
        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(
            tuple(state for state in next_state_batch))
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        q_values = model(state_batch)
        # Target values are computed under no_grad, so gradients cannot
        # leak through the bootstrap term.
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()
        # Bellman targets: r for terminal transitions, else r + gamma * Q(s').
        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      reward_batch, done_batch,
                      next_prediction_batch)))[:, None]
        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()
        print(
            "Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}"
            .format(epoch, opt.num_epochs, action, final_score,
                    final_tetrominoes, final_cleared_lines))
        # Logged against epoch - 1 so the first training epoch lands at 0.
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)
        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))
    torch.save(model, "{}/tetris".format(opt.saved_path))