class Agent_DQN():
    # This class relies on module-level names defined elsewhere in the original
    # project: the DQN network, the shared replay `memory` deque, `storeEpsilon`,
    # and the hyperparameters USE_CUDA, LOAD, batch_size, EPSILON, EPS_START,
    # EPS_END, EPS_DECAY, GAMMA, ALPHA, TARGET_UPDATE and StartLearning.
    def __init__(self, env, args):
        # Parameters for Q-learning
        super(Agent_DQN, self).__init__()
        self.env = env
        state = env.reset()
        state = state.transpose(2, 0, 1)
        self.policy_net = DQN(state.shape, self.env.action_space.n)  # behavior Q
        self.target_net = DQN(state.shape, self.env.action_space.n)  # target Q
        self.target_net.load_state_dict(self.policy_net.state_dict())  # initial sync

        if USE_CUDA:
            print("Using CUDA . . . ")
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()
        print('hyperparameters and network initialized')

        if args.test_dqn or LOAD:
            print('loading trained model')
            checkpoint = torch.load('trainData')
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def init_game_setting(self):
        print('loading trained model')
        checkpoint = torch.load('trainData')
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])

    def push(self, state, action, reward, next_state, done):
        # Store a transition in the shared replay memory.
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        memory.append((state, action, reward, next_state, done))

    def replay_buffer(self):
        # Sample a random minibatch of transitions.
        state, action, reward, next_state, done = zip(
            *random.sample(memory, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def __len__(self):
        return len(memory)

    def make_action(self, observation, test=True):
        observation = observation.transpose(2, 0, 1)
        # Act greedily during testing, epsilon-greedily during training.
        if np.random.random() > EPSILON or test:
            with torch.no_grad():
                observation = Variable(torch.FloatTensor(
                    np.float32(observation)).unsqueeze(0))
                q_value = self.policy_net(observation)
            action = int(q_value.max(1)[1].item())
        else:
            action = random.randrange(self.env.action_space.n)
        return action

    def optimize_model(self):
        states, actions, rewards, next_states, dones = self.replay_buffer()

        states_v = Variable(torch.FloatTensor(np.float32(states)))
        actions_v = Variable(torch.LongTensor(actions))
        rewards_v = Variable(torch.FloatTensor(rewards))
        done = Variable(torch.FloatTensor(dones))

        # Q(s, a) for the actions actually taken
        state_action_values = self.policy_net(states_v).gather(
            1, actions_v.unsqueeze(1)).squeeze(1)
        # max_a' Q_target(s', a'), computed without tracking gradients
        with torch.no_grad():
            next_states_v = Variable(torch.FloatTensor(np.float32(next_states)))
            next_state_values = self.target_net(next_states_v).max(1)[0]
        expected_q_value = rewards_v + next_state_values * GAMMA * (1 - done)

        loss = (state_action_values -
                Variable(expected_q_value.data)).pow(2).mean()
        return loss

    def train(self):
        global EPSILON  # decayed here so make_action sees the updated value
        optimizer = optim.Adam(self.policy_net.parameters(), lr=ALPHA)

        # Fill the memory with experiences
        print('Gathering experiences ...')
        meanScore = 0
        AvgRewards = []
        AllScores = []
        step = 1
        iEpisode = 0

        while meanScore < 50:
            state = self.env.reset()
            done = False
            EpisodeScore = 0
            tBegin = time.time()

            while not done:
                action = self.make_action(state, test=False)
                nextState, reward, done, _ = self.env.step(action)
                self.push(state.transpose(2, 0, 1), action, reward,
                          nextState.transpose(2, 0, 1), done)
                state = nextState

                if len(memory) > StartLearning:
                    loss = self.optimize_model()
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                else:
                    iEpisode = 0
                    continue

                # Update exploration factor
                EPSILON = EPS_END + (EPS_START - EPS_END) * math.exp(
                    -1. * step / EPS_DECAY)
                storeEpsilon.append(EPSILON)
                step += 1
                EpisodeScore += reward

                if step % TARGET_UPDATE == 0:
                    print('Updating Target Network . . .')
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

            iEpisode += 1
            AllScores.append(EpisodeScore)
            meanScore = np.mean(AllScores[-100:])
            AvgRewards.append(meanScore)

            if len(memory) > StartLearning:
                print('Episode: ', iEpisode, ' score:', EpisodeScore,
                      ' Avg Score:', meanScore, ' epsilon: ', EPSILON,
                      ' t: ', time.time() - tBegin, ' loss:', loss.item())
            else:
                print('Gathering Data . . .')

            if iEpisode % 500 == 0:
                torch.save({
                    'epoch': iEpisode,
                    'model_state_dict': self.policy_net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss,
                    'AvgRewards': AvgRewards
                }, 'trainData')
                if os.path.exists('Rewards.csv'):
                    os.remove('Rewards.csv')
                with open('Rewards.csv', mode='w') as dataFile:
                    rewardwriter = csv.writer(dataFile, delimiter=',',
                                              quotechar='"',
                                              quoting=csv.QUOTE_MINIMAL)
                    rewardwriter.writerow(AvgRewards)

        print('======== Complete ========')
        torch.save({
            'epoch': iEpisode,
            'model_state_dict': self.policy_net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            'AvgRewards': AvgRewards
        }, 'trainData')
        with open('Rewards.csv', mode='w') as dataFile:
            rewardwriter = csv.writer(dataFile, delimiter=',', quotechar='"',
                                      quoting=csv.QUOTE_MINIMAL)
            rewardwriter.writerow(AvgRewards)
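# All four agents in this file reference a `DQN` network module that is not
# defined here. The block below is only a minimal sketch of what such a module
# could look like (the familiar Nature-DQN convolutional architecture), written
# against the DQN(input_shape, n_actions) signature used by the first and third
# agents; the layer sizes are illustrative assumptions, not the original code.
# (The second and fourth agents construct it as DQN() with no arguments, so
# their version presumably hard-codes the input shape and action count.)
import torch
import torch.nn as nn


class DQN(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()
        # input_shape is (channels, height, width), e.g. (4, 84, 84)
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )
        conv_out_size = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_out(self, shape):
        # run a dummy forward pass to infer the flattened conv output size
        with torch.no_grad():
            o = self.conv(torch.zeros(1, *shape))
        return int(o.view(1, -1).size(1))

    def forward(self, x):
        conv_out = self.conv(x).view(x.size(0), -1)
        return self.fc(conv_out)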
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: building your model
        """
        super(Agent_DQN, self).__init__(env)
        self.env = env
        self.args = args
        self.episode = 0
        self.n_actions = self.env.action_space.n

        # decaying epsilon-greedy exploration schedule
        self.epsilon_start = 1.0
        self.epsilon_final = 0.025
        self.epsilon_decay = 3000
        self.epsilon_by_frame = lambda frame_idx: self.epsilon_final + (
            self.epsilon_start - self.epsilon_final) * math.exp(
                -1. * frame_idx / self.epsilon_decay)
        self.epsilon = 0

        # behavior network, target network and loss
        self.eval_net = DQN()
        self.target_net = DQN()
        self.target_net.load_state_dict(self.eval_net.state_dict())
        self.criterion = nn.MSELoss()
        # self._model = Net(self.env.observation_space.shape, self.env.action_space.n)

        self._use_cuda = torch.cuda.is_available()
        self.optim = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=self.args.learning_rate)
        if self._use_cuda:
            self.eval_net = self.eval_net.cuda()
            self.target_net = self.target_net.cuda()
            self.criterion = self.criterion.cuda()

        # self.replaybuffer = ReplayBuffer(args.buffer_size)
        self.buffer = deque(maxlen=10000)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            self.eval_net.load_state_dict(torch.load(args.model_dqn))
            self.target_net.load_state_dict(self.eval_net.state_dict())
            if self._use_cuda:
                self.eval_net = self.eval_net.cuda()
                self.target_net = self.target_net.cuda()
        ##################
        # YOUR CODE HERE #
        ##################

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game.
        Put anything you want to initialize if necessary.
        """
        ##################
        # YOUR CODE HERE #
        ##################
        pass

    def push(self, state, action, reward, next_state, done):
        # Store a transition; the deque drops the oldest entry when full.
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def replay_buffer(self, batch_size):
        # Sample a random minibatch from the buffer.
        state, action, reward, next_state, done = zip(
            *random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def train(self):
        """
        Implement your training algorithm here
        """
        ##################
        # YOUR CODE HERE #
        ##################
        print('begin train...')
        # if self.args.log_file is not None:
        #     fp_log = open(self.args.log_file, 'w', buffering=1)
        fout = open('dqn_score.log', 'w')
        if not os.path.exists('model'):
            os.makedirs('model')

        losses = []
        all_rewards = []
        episode_reward = 0
        best_mean_reward = 0
        state = self.env.reset()

        for i_step in range(self.args.max_steps):
            self.epsilon = self.epsilon_by_frame(i_step)
            action = self.make_action(state)
            next_state, reward, done, _ = self.env.step(action)
            self.push(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward

            if done:
                state = self.env.reset()
                all_rewards.append(episode_reward)
                self.episode += 1
                print('{},{}'.format(self.episode, episode_reward))
                fout.write('Episode{},episode_reward{}\n'.format(
                    self.episode, episode_reward))
                episode_reward = 0

            # start learning once the replay buffer has filled up
            if len(self.buffer) >= self.args.buffer_size:
                if i_step % self.args.eval_net_update_step == 0:
                    loss = self.optimize_model()
                    losses.append(loss.item())
                if i_step % self.args.target_net_update_step == 0:
                    self.target_net.load_state_dict(self.eval_net.state_dict())
                if i_step % self.args.save_freq == 0:
                    mean_reward = sum(all_rewards[-100:]) / 100
                    if best_mean_reward < mean_reward:
                        print('save best model with mean reward = %f' %
                              mean_reward)
                        best_mean_reward = mean_reward
                        torch.save(self.eval_net.state_dict(),
                                   self.args.model_dqn)
        fout.close()

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array, stack of the 4 last preprocessed frames,
            shape: (84, 84, 4)
        Return:
            action: int, the predicted action from the trained model
        """
        ##################
        # YOUR CODE HERE #
        ##################
        observation = torch.FloatTensor(
            observation.reshape((1, 84, 84, 4))).transpose(1, 3).transpose(2, 3)
        if self._use_cuda:
            observation = observation.cuda()
        Q_value = self.eval_net(observation).data.cpu().numpy()
        if random.random() > self.epsilon:
            action = int(np.argmax(Q_value))
        else:
            action = self.env.get_random_action()
        return action

    def optimize_model(self):
        state, action, reward, next_state, done = self.replay_buffer(
            self.args.batch_size)

        state = torch.FloatTensor(np.float32(state)).permute(0, 3, 1, 2)
        next_state = torch.FloatTensor(np.float32(next_state)).permute(
            0, 3, 1, 2)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(done)

        if self._use_cuda:
            state = state.cuda()
            next_state = next_state.cuda()
            action = action.cuda()
            reward = reward.cuda()
            done = done.cuda()

        # Q(s, a) for the actions actually taken
        q_values = self.eval_net(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        # TD target from the (detached) target network
        next_q_values = self.target_net(next_state).detach()
        next_q_value = next_q_values.max(1)[0]
        expected_q_value = reward + self.args.gamma * next_q_value * (1 - done)

        loss = self.criterion(q_value, expected_q_value.detach())
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        return loss
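# The second agent reads its hyperparameters from an `args` object whose fields
# are only ever used, never defined, in this file. A hypothetical argparse setup
# providing every attribute the class touches might look like the sketch below;
# the default values are illustrative assumptions, not taken from the source.
import argparse


def build_dqn_args():
    parser = argparse.ArgumentParser(description='DQN agent (hypothetical defaults)')
    parser.add_argument('--test_dqn', action='store_true')
    parser.add_argument('--learning_rate', type=float, default=1.5e-4)
    parser.add_argument('--buffer_size', type=int, default=10000)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--max_steps', type=int, default=5000000)
    parser.add_argument('--eval_net_update_step', type=int, default=4)
    parser.add_argument('--target_net_update_step', type=int, default=1000)
    parser.add_argument('--save_freq', type=int, default=10000)
    parser.add_argument('--model_dqn', type=str, default='model/dqn.pt')
    return parser.parse_args()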
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for the neural network
            initialize Q net and target Q net
            parameters for the replay buffer
            parameters for Q-learning; decaying epsilon-greedy
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # initializations for replay memory
        self.env = env
        self.buffer = collections.deque(
            maxlen=REPLAY_SIZE)  # replay memory buffer

        # initializations of the agent
        self._reset()
        self.last_action = 0
        self.net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)
        self.target_net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)

        LOAD_MODEL = True
        if args.test_dqn:
            # you can load your model here
            print('preparing to load trained model')
            ###########################
            LOAD_MODEL = True
        if LOAD_MODEL:
            self.net.load_state_dict(
                torch.load(MODEL, map_location=lambda storage, loc: storage))
            print('loaded trained model')
            self.target_net.load_state_dict(self.net.state_dict())

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game.
        Put anything you want to initialize if necessary. If no parameters need
        to be initialized, you can leave it blank.
        """
        ###########################
        ###########################
        pass

    def push(self, experience):
        """
        You can add additional arguments as you need.
        Push new data to the buffer; the deque removes the oldest entry when
        the buffer is full.
        """
        ###########################
        self.buffer.append(experience)
        ###########################

    def replay_buffer(self, batch_size):
        """
        You can add additional arguments as you need.
        Select a batch from the buffer: sample batch_size transitions from the
        collected experience.
        """
        ###########################
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(
            *[self.buffer[idx] for idx in indices])
        ###########################
        # The states here are already transposed (depth x height x width)
        # because that is how they were stored in the experience buffer.
        return np.array(states, dtype=np.float32), np.array(actions), np.array(
            rewards, dtype=np.float32), np.array(dones, dtype=bool), np.array(
                next_states)

    def _reset(self):
        self.state = self.env.reset()
        self.total_reward = 0.0

    def make_action(self, observation, test=True):
        """
        Greedy action selection; this is used exclusively for testing.
        """
        # wrapping the observation in a list adds a leading batch dimension of 1
        state_a_test = np.array([observation.transpose(2, 0, 1)], copy=False)
        state_v_test = torch.FloatTensor(state_a_test).to(DEVICE)
        # feeding the observation to the network
        Q_values_v_test = self.net(state_v_test)
        # picking the action with the highest Q-value
        _, action_v_test = torch.max(Q_values_v_test, dim=1)
        # converting the tensor to an int
        action_test = int(action_v_test.item())
        ###########################
        return action_test

    def make_action_train(self, net, epsilon=0.0, device=DEVICE):
        """
        Select an action with the epsilon-greedy method, for training.
        """
        if np.random.random() < self.epsilon:
            action = random.randrange(self.env.action_space.n)
        else:
            # wrapping the state in a list adds a leading batch dimension of 1,
            # and the FloatTensor conversion produces the input the net expects
            state_a = np.array([self.state.transpose(2, 0, 1)], copy=False)
            state_v = Variable(torch.FloatTensor(state_a).to(device))
            Q_values_v = self.net(state_v)
            # picking the best action and converting the tensor to an int
            _, action_v = torch.max(Q_values_v, dim=1)
            action = int(action_v.item())
        ###########################
        return action

    def take_a_step(self, net, epsilon=0.0, device=DEVICE):
        """
        Execute the action and take a step in the environment, add the state,
        action and reward to the experience replay, and return the total reward
        once an episode ends.
        """
        done_reward = None
        action_for_exp = self.make_action_train(self.net, self.epsilon, DEVICE)
        new_state, reward, is_done, _ = self.env.step(action_for_exp)
        # here total_reward is the reward accumulated over the episode
        self.total_reward += reward
        # The state coming back from the environment has shape
        # width x height x depth, but whatever goes into the experience buffer
        # is stored transposed as depth x height x width, because that is the
        # layout a PyTorch input should have.
        exp = Experience(self.state.transpose(2, 0, 1), action_for_exp, reward,
                         is_done, new_state.transpose(2, 0, 1))
        # adding the experience to our replay memory
        self.push(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

    def loss_function(self, batch, net, target_net, optimizer, device=DEVICE):
        states, actions, rewards, dones, next_states = batch

        states_v = Variable(torch.FloatTensor(states).to(device))
        next_states_v = Variable(torch.FloatTensor(next_states).to(device))
        actions_v = Variable(torch.LongTensor(actions).to(device))
        rewards_v = Variable(torch.FloatTensor(rewards).to(device))
        done = Variable(torch.FloatTensor(dones).to(device))

        # Q(s, a) for the actions taken
        state_action_values = self.net(states_v).gather(
            1, actions_v.long().unsqueeze(-1)).squeeze(-1)
        # max_a' Q_target(s', a'), detached so no gradient flows into the target
        next_state_values = self.target_net(next_states_v).max(1)[0].detach()
        expected_state_action_values = rewards_v + next_state_values * GAMMA * (
            1 - done)

        loss = (state_action_values -
                expected_state_action_values.detach()).pow(2).mean()
        # we don't want to accumulate gradients,
        # hence it is important to zero them at every iteration
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        device = torch.device(DEVICE)
        # defining the optimizer for the neural network
        optimizer = optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE)
        # empty list of total rewards
        total_rewards = []
        best_mean_reward = None
        # initializations for time and speed calculation
        frame_idx = 0
        timestep_frame = 0
        timestep = time.time()

        while True:
            frame_idx += 1
            self.epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * math.exp(
                -1. * frame_idx / EPSILON_DECAY)

            reward = self.take_a_step(self.net, self.epsilon, device=device)
            if reward is not None:
                # appending the finished episode's reward to total_rewards
                total_rewards.append(reward)
                speed = (frame_idx - timestep_frame) / (time.time() - timestep)
                timestep_frame = frame_idx
                timestep = time.time()
                # mean of the most recent 100 episode rewards
                mean_reward = np.mean(total_rewards[-100:])
                print(
                    "{} frames: done {} games, mean reward {}, epsilon {}, speed {} frames/s"
                    .format(frame_idx, len(total_rewards),
                            round(mean_reward, 3), round(self.epsilon, 2),
                            round(speed, 2)))

                if best_mean_reward is None or best_mean_reward < mean_reward or len(
                        total_rewards) % 25 == 0:
                    if best_mean_reward is not None:
                        print("New best mean reward {} -> {}, model saved".
                              format(round(best_mean_reward, 3),
                                     round(mean_reward, 3)))
                    # keep track of the best mean reward seen so far
                    best_mean_reward = mean_reward

            if frame_idx % SAVE_INTERVAL == 0:
                torch.save(self.net.state_dict(),
                           'breakoutNoFrameSkip-4v1' + '.dat')

            # checking the replay memory before learning starts
            if len(self.buffer) < LEARNING_STARTS:
                continue

            # check if we need to update our target network
            if frame_idx % TARGET_UPDATE_INTERVAL == 0:
                self.target_net.load_state_dict(self.net.state_dict())

            # sampling a batch from the buffer
            batch = self.replay_buffer(BATCH_SIZE)
            # calculate the loss and backpropagate
            loss_t = self.loss_function(batch, self.net, self.target_net,
                                        optimizer, device)

            # printing the loss and saving rewards every 100 episodes
            if len(total_rewards) % 100 == 0:
                print("loss at episode " + str(len(total_rewards)) + " is " +
                      str(float(loss_t.item())))
                with open('rewards_collection-100mean.csv',
                          mode='w') as dataFile:
                    writer = csv.writer(dataFile, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(total_rewards)

        self.env.close()
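# The third agent relies on module-level constants (REPLAY_SIZE, DEVICE, MODEL,
# GAMMA, LEARNING_RATE, EPSILON_START, EPSILON_END, EPSILON_DECAY,
# SAVE_INTERVAL, LEARNING_STARTS, TARGET_UPDATE_INTERVAL, BATCH_SIZE) and on an
# `Experience` tuple type, none of which are defined in this file. A plausible
# set of definitions is sketched below; the numeric values are assumptions
# chosen to be typical for Breakout-scale DQN training, not the authors'
# actual settings.
import collections
import torch

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL = 'breakoutNoFrameSkip-4v1.dat'   # checkpoint path loaded in __init__
REPLAY_SIZE = 100000
LEARNING_STARTS = 10000
BATCH_SIZE = 32
GAMMA = 0.99
LEARNING_RATE = 2.5e-4
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 100000
TARGET_UPDATE_INTERVAL = 1000
SAVE_INTERVAL = 10000

# Field order matches how the agent builds and unpacks a sampled batch:
# (state, action, reward, done, next_state)
Experience = collections.namedtuple(
    'Experience', ['state', 'action', 'reward', 'done', 'next_state'])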
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for the neural network
            initialize Q net and target Q net
            parameters for the replay buffer
            parameters for Q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #

        # Gym parameters
        self.num_actions = env.action_space.n

        # parameters for the replay buffer
        self.buffer_max_len = 20000
        self.buffer = deque(maxlen=self.buffer_max_len)
        self.episode_reward_list = []
        self.moving_reward_avg = []

        # parameters for the neural network
        self.batch_size = 32
        self.gamma = 0.999
        self.eps_threshold = 0
        self.eps_start = 1
        self.eps_end = 0.025
        self.max_episode_decay = 10000
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Training
        self.steps_done = 0
        self.num_episode = 20000
        self.target_update = 5000
        self.learning_rate = 1.5e-4

        # Neural Network
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            self.policy_net = torch.load('policy_net.hb5')
            self.policy_net.eval()
        ###########################
        # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game.
        Put anything you want to initialize if necessary. If no parameters need
        to be initialized, you can leave it blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array, stack of the 4 last preprocessed frames,
            shape: (84, 84, 4)
        Return:
            action: int, the predicted action from the trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        with torch.no_grad():
            sample = random.random()
            # Check if this is the best way to decay epsilon
            observation = torch.tensor(observation,
                                       dtype=torch.float,
                                       device=self.device).permute(
                                           2, 0, 1).unsqueeze(0)
            if test:
                # greedy action during testing
                return self.policy_net(observation).max(1)[1].item()
            if sample > self.eps_threshold:
                # exploit: greedy action
                return self.policy_net(observation).max(1)[1].item()
            else:
                # explore: random action
                return self.env.action_space.sample()
        ###########################

    def push(self, state, reward, action, next_state, done):
        """
        You can add additional arguments as you need.
        Push new data to the buffer and remove the old one if the buffer is full.

        Hints:
        -----
        you can consider a deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.buffer.append((state, reward, action, next_state, done))
        ###########################

    def replay_buffer(self, batch_size):
        """
        You can add additional arguments as you need.
        Select a batch from the buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        batch = random.sample(self.buffer, batch_size)
        states = []
        rewards = []
        actions = []
        next_states = []
        dones = []
        for state, reward, action, next_state, done in batch:
            states.append(state)
            rewards.append(reward)
            actions.append(action)
            next_states.append(next_state)
            dones.append(done)
        ###########################
        return states, rewards, actions, next_states, dones

    def update(self):
        # wait until enough transitions have been collected
        if self.steps_done < 5000:
            return
        states, rewards, actions, next_states, dones = self.replay_buffer(
            self.batch_size)
        loss = self.compute_loss(states, rewards, actions, next_states, dones)
        self.optimizer.zero_grad()
        loss.backward()
        # clip gradients in place to stabilize training
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def compute_loss(self, states, rewards, actions, next_states, dones):
        # mask of transitions whose next state is not terminal
        non_final_mask = torch.tensor([not done for done in dones],
                                      dtype=torch.bool,
                                      device=self.device)

        states = torch.tensor(states, dtype=torch.float).permute(
            0, 3, 1, 2).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float).to(self.device)
        actions = torch.tensor(actions, dtype=torch.long).to(self.device)
        next_states = torch.tensor(next_states, dtype=torch.float).permute(
            0, 3, 1, 2).to(self.device)

        Q_current = self.policy_net(states).gather(1, actions.unsqueeze(1))
        Q_current = Q_current.squeeze(1)

        # target values are computed without tracking gradients
        with torch.no_grad():
            next_state_values = torch.zeros(self.batch_size,
                                            device=self.device)
            next_state_values[non_final_mask] = self.target_net(
                next_states[non_final_mask]).max(1)[0]
        expected_state_action_values = (next_state_values * self.gamma) + rewards

        loss = F.smooth_l1_loss(Q_current, expected_state_action_values)
        return loss

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        for episode in range(self.num_episode):
            # Check this please
            observation = self.env.reset() / 255

            # linearly decay epsilon from eps_start down to eps_end
            self.eps_threshold = max(
                1 + (((self.eps_end - self.eps_start) /
                      self.max_episode_decay) * episode), self.eps_end)

            episode_steps = 0
            done = False
            episode_reward = 0

            while not done:
                action = self.make_action(observation, test=False)
                new_observation, reward, done, _ = self.env.step(action)
                new_observation = new_observation / 255
                episode_reward += reward
                self.steps_done += 1
                episode_steps += 1

                self.push(observation, reward, action, new_observation, done)
                # Updating the network
                self.update()
                observation = new_observation

                if self.steps_done % self.target_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

            self.episode_reward_list.append(episode_reward)
            if episode % 100 == 0:
                print('episode: {} reward: {} episode length: {}'.format(
                    episode, episode_reward, episode_steps))
                torch.save(self.policy_net.state_dict(), 'test_model.pt')
        ###########################
        print("Done")