def test(model):
    game_state = GameState()

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    image_data, reward, terminal = game_state.frame_step(action)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    while True:
        # get output from the neural network
        output = model(state)[0]

        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action = action.cuda()

        # get action
        action_index = torch.argmax(output)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action_index = action_index.cuda()
        action[action_index] = 1

        # get next state
        image_data_1, reward, terminal = game_state.frame_step(action)
        image_data_1 = resize_and_bgr2gray(image_data_1)
        image_data_1 = image_to_tensor(image_data_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)

        # set state to be state_1
        state = state_1

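# resize_and_bgr2gray() and image_to_tensor() are used throughout this listing but are
# not defined in it. The sketch below shows what they are assumed to do (crop, resize
# to 84x84, grayscale, binarize, and convert to a CHW float32 tensor); the exact crop
# region and threshold are assumptions, not the original code.
import cv2
import numpy as np
import torch


def resize_and_bgr2gray(image):
    # crop away the ground, resize to 84x84 and convert to grayscale
    image = image[0:288, 0:404]
    image_data = cv2.cvtColor(cv2.resize(image, (84, 84)), cv2.COLOR_BGR2GRAY)
    # binarize so the bird and pipes stand out from the background
    image_data[image_data > 0] = 255
    return np.reshape(image_data, (84, 84, 1))


def image_to_tensor(image):
    # HWC uint8 array -> CHW float32 torch tensor
    image_tensor = image.transpose(2, 0, 1).astype(np.float32)
    image_tensor = torch.from_numpy(image_tensor)
    if torch.cuda.is_available():
        image_tensor = image_tensor.cuda()
    return image_tensor
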
def test(model):
    game_state = GameState()

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    img_data, reward, done = game_state.frame_step(action)
    img_data = preprocess(img_data)
    img_data = convert_img_to_tensor(img_data)
    state = torch.cat((img_data, img_data, img_data, img_data)).unsqueeze(0)

    while True:
        # get Q-values from the neural network and pick the greedy action
        output = model(state)[0]
        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        action_index = torch.argmax(output)
        action[action_index] = 1

        # get next state (note: the new frame, not the old one, must be converted)
        next_img_data, reward, done = game_state.frame_step(action)
        next_img_data = preprocess(next_img_data)
        next_img_data = convert_img_to_tensor(next_img_data)
        next_state = torch.cat((state.squeeze(0)[1:, :, :], next_img_data)).unsqueeze(0)

        state = next_state

def test(model):
    game_state = GameState()

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    image_data, reward, terminal = game_state.frame_step(action)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    # evaluate for 10 episodes and track reward statistics
    episode_count = 0
    total_reward = 0
    max_reward = 0
    cur_reward = 0
    rewards = []

    while True:
        if episode_count >= 10:
            break

        # get output from the neural network
        output = model(state)[0]

        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action = action.cuda()

        # get action
        action_index = torch.argmax(output)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action_index = action_index.cuda()
        action[action_index] = 1

        # get next state
        image_data_1, reward, terminal = game_state.frame_step(action)
        if terminal:
            episode_count += 1
            if cur_reward > max_reward:
                max_reward = cur_reward
            rewards.append(cur_reward)
            cur_reward = 0
        if reward > 0.1:
            total_reward += reward
            cur_reward += reward
        image_data_1 = resize_and_bgr2gray(image_data_1)
        image_data_1 = image_to_tensor(image_data_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)

        # set state to be state_1
        state = state_1

    # average reward over the 10 evaluation episodes
    print('reward:', total_reward / 10.0)
    print('max reward:', max_reward)
    print('standard deviation:', np.std(rewards, axis=0))

def test(model):
    game_state = GameState()

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    image_data, reward, terminal = game_state.frame_step(action)
    image_origin = resize_and_bgr(image_data)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    while True:
        # get Q-values and the convolutional feature maps from the neural network
        output, conv_output = model(state)

        # average the feature maps into a single activation map and normalize it
        heatmap = np.mean(np.mean(conv_output.detach().numpy(), axis=0), axis=0)
        heatmap = np.maximum(heatmap, 0)
        heatmap /= np.max(heatmap)
        heatmap = cv2.resize(heatmap, (image_origin.shape[1], image_origin.shape[0]))

        # rotate/flip so the heatmap lines up with the rendered frame, then overlay it
        heatmap = np.fliplr(rotate(heatmap, 90 * 3))
        image_origin = np.fliplr(rotate(image_origin, 90 * 3))
        plt.imshow(image_origin)
        plt.imshow(heatmap, cmap=plt.cm.jet, alpha=0.5, interpolation='nearest', vmin=0, vmax=1)
        plt.show()

        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action = action.cuda()

        # get action
        action_index = torch.argmax(output)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action_index = action_index.cuda()
        action[action_index] = 1

        # get next state
        image_data_1, reward, terminal = game_state.frame_step(action)
        image_origin = resize_and_bgr(image_data_1)
        image_data_1 = resize_and_bgr2gray(image_data_1)
        image_data_1 = image_to_tensor(image_data_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)

        # set state to be state_1
        state = state_1

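# The heatmap visualization above assumes that the model's forward pass returns both
# the Q-values and an intermediate convolutional activation map, unlike the other
# variants in this listing. The model class itself is not shown; the sketch below is
# one way such a forward() could look (layer names and sizes are assumptions, not the
# original architecture).
import torch.nn as nn
import torch.nn.functional as F


class NeuralNetworkWithFeatures(nn.Module):
    def __init__(self, number_of_actions=2):
        super().__init__()
        self.number_of_actions = number_of_actions
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(3136, 512)
        self.fc2 = nn.Linear(512, number_of_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        conv_output = F.relu(self.conv3(x))  # feature maps reused for the heatmap
        x = F.relu(self.fc1(conv_output.view(conv_output.size(0), -1)))
        q_values = self.fc2(x)
        return q_values, conv_output
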
def train(model, start):
    # define Adam optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-6)

    # initialize mean squared error loss
    criterion = nn.MSELoss()

    # instantiate game
    game_state = GameState()

    # initialize replay memory
    replay_memory = []

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    image_data, reward, terminal = game_state.frame_step(action)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)

    # initialize epsilon value
    epsilon = model.initial_epsilon
    iteration = 0
    epsilon_decrements = np.linspace(model.initial_epsilon, model.final_epsilon,
                                     model.number_of_iterations)

    # main training loop
    while iteration < model.number_of_iterations:
        # get output from the neural network
        output = model(state)[0]

        # initialize action
        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action = action.cuda()

        # epsilon greedy exploration
        random_action = random.random() <= epsilon
        if random_action:
            print("Performed random action!")
        action_index = [torch.randint(model.number_of_actions, torch.Size([]), dtype=torch.int)
                        if random_action
                        else torch.argmax(output)][0]
        if torch.cuda.is_available():  # put on GPU if CUDA is available
            action_index = action_index.cuda()
        action[action_index] = 1

        # get next state and reward
        image_data_1, reward, terminal = game_state.frame_step(action)
        image_data_1 = resize_and_bgr2gray(image_data_1)
        image_data_1 = image_to_tensor(image_data_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)
        action = action.unsqueeze(0)
        reward = torch.from_numpy(np.array([reward], dtype=np.float32)).unsqueeze(0)

        # save transition to replay memory
        replay_memory.append((state, action, reward, state_1, terminal))

        # if replay memory is full, remove the oldest transition
        if len(replay_memory) > model.replay_memory_size:
            replay_memory.pop(0)

        # epsilon annealing
        epsilon = epsilon_decrements[iteration]

        # sample random minibatch
        minibatch = random.sample(replay_memory, min(len(replay_memory), model.minibatch_size))

        # unpack minibatch
        state_batch = torch.cat(tuple(d[0] for d in minibatch))
        action_batch = torch.cat(tuple(d[1] for d in minibatch))
        reward_batch = torch.cat(tuple(d[2] for d in minibatch))
        state_1_batch = torch.cat(tuple(d[3] for d in minibatch))

        if torch.cuda.is_available():  # put on GPU if CUDA is available
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            state_1_batch = state_1_batch.cuda()

        # get output for the next state
        output_1_batch = model(state_1_batch)

        # set y_j to r_j for terminal state, otherwise to r_j + gamma*max(Q)
        y_batch = torch.cat(tuple(reward_batch[i] if minibatch[i][4]
                                  else reward_batch[i] + model.gamma * torch.max(output_1_batch[i])
                                  for i in range(len(minibatch))))

        # extract Q-value
        q_value = torch.sum(model(state_batch) * action_batch, dim=1)

        # PyTorch accumulates gradients by default, so they need to be reset in each pass
        optimizer.zero_grad()

        # returns a new Tensor, detached from the current graph, the result will never require gradient
        y_batch = y_batch.detach()

        # calculate loss
        loss = criterion(q_value, y_batch)

        # do backward pass
        loss.backward()
        optimizer.step()

        # set state to be state_1
        state = state_1
        iteration += 1

        if iteration % 25000 == 0:
            torch.save(model, "pretrained_model/current_model_" + str(iteration) + ".pth")

        print("iteration:", iteration, "elapsed time:", time.time() - start,
              "epsilon:", epsilon, "action:", action_index.cpu().detach().numpy(),
              "reward:", reward.numpy()[0][0],
              "Q max:", np.max(output.cpu().detach().numpy()))

def train():
    env = GameState()

    # num_inputs = env.observation_space.shape[0]
    num_inputs = 3136
    num_actions = 2
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = DRQN(num_inputs, num_actions)
    target_net = DRQN(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    if torch.cuda.is_available():  # put on GPU if CUDA is available
        online_net = online_net.cuda()
        target_net = target_net.cuda()
    online_net.train()
    target_net.train()

    memory = Memory(replay_memory_capacity)
    epsilon = 1.0
    loss = 0
    iteration = 0

    while iteration < 2000000:
        done = False

        # start each episode with the "do nothing" action
        action = torch.zeros([2], dtype=torch.float32)
        action[0] = 1
        image_data, reward, done = env.frame_step(action)
        image_data = resize_and_bgr2gray(image_data)
        image_data = image_to_tensor(image_data)
        state = image_data
        state = torch.Tensor(state)
        if torch.cuda.is_available():
            state = state.cuda()

        hidden = None

        while not done:
            # epsilon-greedy action from the target network, carrying the recurrent hidden state
            action, hidden, action_index = get_action(state, target_net, epsilon, env, hidden)

            image_data, reward, done = env.frame_step(action)
            image_data = resize_and_bgr2gray(image_data)
            image_data = image_to_tensor(image_data)
            next_state = image_data
            next_state = torch.Tensor(next_state)
            if torch.cuda.is_available():
                next_state = next_state.cuda()

            mask = 0 if done else 1
            reward = reward if not done else -1
            memory.push(state, next_state, action_index, reward, mask)

            state = next_state

            if iteration > initial_exploration and len(memory) > batch_size:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = DRQN.train_model(online_net, target_net, optimizer, batch)

                if iteration % update_target == 0:
                    print('iteration: {}, update model'.format(iteration))
                    update_target_model(online_net, target_net)

            iteration += 1
            if iteration % 25000 == 0:
                torch.save(online_net, "pretrained_model/current_model_" + str(iteration) + ".pth")
                print('iteration: {}'.format(iteration))

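# update_target_model() is called above but not defined in this listing. In the usual
# hard-update scheme it simply copies the online network's weights into the target
# network; a minimal sketch under that assumption:
def update_target_model(online_net, target_net):
    # hard update: overwrite the target network's parameters with the online network's
    target_net.load_state_dict(online_net.state_dict())
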
def test():
    cuda_is_available = torch.cuda.is_available()

    env = GameState()

    # num_inputs = env.observation_space.shape[0]
    num_inputs = 3136
    num_actions = 2
    print('state size:', num_inputs)
    print('action size:', num_actions)

    model = torch.load(
        'pretrained_model/current_model_2000000.pth',
        map_location='cpu' if not cuda_is_available else None
    ).eval()

    if torch.cuda.is_available():  # put on GPU if CUDA is available
        model = model.cuda()

    # initial action is do nothing
    action = torch.zeros([2], dtype=torch.float32)
    action[0] = 1
    image_data, reward, done = env.frame_step(action)
    image_data = resize_and_bgr2gray(image_data)
    image_data = image_to_tensor(image_data)
    state = image_data
    state = torch.Tensor(state)
    if torch.cuda.is_available():
        state = state.cuda()

    hidden = None

    # evaluate for 10 episodes and track reward statistics
    episode_count = 0
    total_reward = 0
    max_reward = 0
    cur_reward = 0
    rewards = []

    while True:
        if episode_count >= 10:
            break

        # act greedily (epsilon = 0)
        action, hidden, action_index = get_action(state, model, 0, env, hidden)

        image_data, reward, done = env.frame_step(action)
        image_data = resize_and_bgr2gray(image_data)
        image_data = image_to_tensor(image_data)
        next_state = image_data
        next_state = torch.Tensor(next_state)
        if torch.cuda.is_available():
            next_state = next_state.cuda()

        state = next_state

        if done:
            episode_count += 1
            if cur_reward > max_reward:
                max_reward = cur_reward
            rewards.append(cur_reward)
            cur_reward = 0
        if reward > 0.1:
            total_reward += reward
            cur_reward += reward

    # average reward over the 10 evaluation episodes
    print('reward:', total_reward / 10.0)
    print('max reward:', max_reward)
    print('standard deviation:', np.std(rewards, axis=0))

def train(model):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # instantiate game
    game_state = GameState()
    agentBird = AgentBird(ACTION_NUM, model)

    # initial action is do nothing
    # [1, 0] represents "Do nothing"
    # [0, 1] represents "Fly up"
    action = torch.zeros([ACTION_NUM], dtype=torch.float32)
    action[0] = 1
    state_image, reward, terminal, state_score = game_state.frame_step(action)
    image = preprocess_image(state_image)
    state = torch.cat((image, image, image, image)).unsqueeze(0)

    run = 1
    i = 0
    while True:  # for i in range(NUM_ITERATIONS):
        # Select and perform an action
        action_idx, q_values = agentBird.predict_action(state)
        action = torch.zeros([ACTION_NUM], dtype=torch.float32)
        action[action_idx] = 1

        # get next state and reward
        state_image_1, reward, terminal, state_score = game_state.frame_step(action)
        image_1 = preprocess_image(state_image_1)
        state_1 = torch.cat((state.squeeze(0)[1:, :, :], image_1)).unsqueeze(0)

        # keep the raw reward for logging and wrap it in a 1-element tensor for the agent
        score = reward
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        action = action.unsqueeze(0)
        action = action.to(device)

        # Store the transition in memory
        agentBird.add_memory(state, action, reward, state_1, terminal)
        agentBird.update_exploration_rate(i)

        # Perform one step of the optimization (on the target network)
        loss = agentBird.experience_replay()

        print("iteration " + str(i) +
              ", exploration: " + str(agentBird.exploration_rate) +
              ", Q max:" + str(np.max(q_values.cpu().detach().numpy())) +
              ", action:" + str(action_idx) +
              ", reward:" + str(score))
        writer.add_scalar('Q_value', np.max(q_values.cpu().detach().numpy()), i)
        writer.add_scalar('loss', float(loss), i)

        # Move to the next state
        state = state_1
        i += 1

        if terminal:
            run += 1
            print("episode " + str(run) + ", Score: " + str(state_score))
            writer.add_scalar('Score', state_score, run)

        if i % 50000 == 0:
            date_today = date.today()
            curr_time = datetime.now()
            formatted_time = curr_time.strftime('%H%M%S')
            save_model = agentBird.return_model()
            torch.save(save_model,
                       "pretrained_model/easy_model_" + str(i) + "_" + str(date_today) +
                       "_" + str(formatted_time) + ".pth")

def train(model, start):
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()

    # game environment and replay memory
    game_state = GameState()
    replay_memory = []

    # initial action is do nothing
    action = torch.zeros([model.number_of_actions], dtype=torch.float32)
    action[0] = 1
    img_data, reward, done = game_state.frame_step(action)
    img_data = preprocess(img_data)
    img_data = convert_img_to_tensor(img_data)
    state = torch.cat((img_data, img_data, img_data, img_data)).unsqueeze(0)

    # linearly anneal epsilon from epsilon1 to epsilon2
    epsilon = model.epsilon1
    iteration = 0
    epsilon_decrements = np.linspace(model.epsilon1, model.epsilon2, model.number_of_iterations)

    while iteration < model.number_of_iterations:
        output = model(state)[0]

        action = torch.zeros([model.number_of_actions], dtype=torch.float32)

        # epsilon greedy exploration
        random_action = random.random() <= epsilon
        if random_action:
            print("Performed random action!")
        action_index = [torch.randint(model.number_of_actions, torch.Size([]), dtype=torch.int)
                        if random_action
                        else torch.argmax(output)][0]
        action[action_index] = 1

        # get next state and reward
        next_img_data, reward, done = game_state.frame_step(action)
        next_img_data = preprocess(next_img_data)
        next_img_data = convert_img_to_tensor(next_img_data)
        next_state = torch.cat((state.squeeze(0)[1:, :, :], next_img_data)).unsqueeze(0)
        action = action.unsqueeze(0)
        reward = torch.from_numpy(np.array([reward], dtype=np.float32)).unsqueeze(0)

        # save transition to replay memory, dropping the oldest when full
        replay_memory.append((state, action, reward, next_state, done))
        if len(replay_memory) > model.replay_memory_size:
            replay_memory.pop(0)

        # epsilon annealing
        epsilon = epsilon_decrements[iteration]

        # sample and unpack a random minibatch
        batch = random.sample(replay_memory, min(len(replay_memory), model.minibatch_size))
        state_memory = torch.cat(tuple(d[0] for d in batch))
        action_memory = torch.cat(tuple(d[1] for d in batch))
        reward_memory = torch.cat(tuple(d[2] for d in batch))
        next_state_memory = torch.cat(tuple(d[3] for d in batch))

        # target: r for terminal transitions, otherwise r + gamma * max(Q(s', a'))
        output_memory = model(next_state_memory)
        y_memory = torch.cat(tuple(reward_memory[i] if batch[i][4]
                                   else reward_memory[i] + model.gamma * torch.max(output_memory[i])
                                   for i in range(len(batch))))

        # Q-value of the chosen action
        q_value = torch.sum(model(state_memory) * action_memory, dim=1)

        optimizer.zero_grad()
        y_memory = y_memory.detach()
        loss = criterion(q_value, y_memory)
        loss.backward()
        optimizer.step()

        state = next_state
        iteration += 1

        print("iteration:", iteration, "elapsed time:", time.time() - start,
              "epsilon:", epsilon, "action:", action_index.cpu().detach().numpy(),
              "reward:", reward.numpy()[0][0],
              "Q max:", np.max(output.cpu().detach().numpy()))