def train_model(self, sars, done):
    (state, action, reward, next_state) = sars
    state = u.t_float32(state)
    action = u.t_float32(action)
    reward = u.t_float32(reward)
    next_state = u.t_float32(next_state)

    value = self.critic(state)
    next_value = self.critic(next_state)

    # One-step TD target and advantage; bootstrap from the next state only if the
    # episode has not terminated.
    if done:
        advantage = reward - value
        target = reward
    else:
        advantage = (reward + self.discount_factor * next_value) - value
        target = reward + self.discount_factor * next_value

    # Actor update: policy-gradient loss weighted by the advantage.
    self.actor_optimizer.zero_grad()
    probs = self.actor(state)
    actor_loss = -Categorical(probs).log_prob(action) * advantage
    actor_loss.backward()
    self.actor_optimizer.step()

    # Critic update: mean squared error against the (detached) TD target.
    self.critic_optimizer.zero_grad()
    critic_loss = torch.mean((target.detach() - self.critic(state)) ** 2)
    critic_loss.backward()
    self.critic_optimizer.step()

    if done:
        TrainerMetadata().log(critic_loss, 'critic_loss')
        TrainerMetadata().log(actor_loss, 'actor_loss')
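The `u.t_float32` and `u.t_uint8` helpers used throughout these snippets are not shown in this section; the sketch below is an assumed minimal implementation for illustration only (the module-level `device` and the exact handling of NumPy inputs are guesses, not the project's actual code):

import numpy as np
import torch

# Hypothetical sketch of the conversion helpers (u.t_float32 / u.t_uint8) assumed above.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def t_float32(value):
    # Convert a scalar, list, NumPy array, or tensor to a float32 tensor on `device`.
    if isinstance(value, torch.Tensor):
        return value.to(device=device, dtype=torch.float32)
    return torch.tensor(np.asarray(value), dtype=torch.float32, device=device)


def t_uint8(value):
    # Convert a boolean-like value (e.g. `done`) to a uint8 tensor on `device`.
    if isinstance(value, torch.Tensor):
        return value.to(device=device, dtype=torch.uint8)
    return torch.tensor(np.asarray(value), dtype=torch.uint8, device=device)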
def append_sample(self, sars, done):
    state, action, reward, next_state = sars
    state, action, reward = u.t_float32(state), u.t_float32(action), u.t_float32(reward)
    # FIXME: this could also be t_uint8, but the GAE part then throws an error
    # about multiplying a float value by a Byte tensor
    done = u.t_float32(done)
    transition = self.transition_structure(state, action, reward, done)
    self.memory.append(transition)
def append_sample(self, sars, done):
    state, action, reward, next_state = sars
    self.memory.push(
        u.t_float32(state),
        u.t_float32(action),
        u.t_float32(reward),
        u.t_float32(next_state),
        u.t_uint8(done)
    )
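The replay `memory` object is only used through `push`, `sample`, and `len` in these snippets; the following is an illustrative sketch of such a buffer, assuming a namedtuple transition whose fields match how the batch is unpacked in the DQN `train_model` further down (the class name, field names, and capacity handling are assumptions):

import random
from collections import namedtuple, deque

# Illustrative sketch only: a replay buffer exposing the push/sample interface used above.
# The field names mirror how the batch is unpacked later (state, action, reward,
# next_state, done), but the real transition_structure may differ.
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))


class ReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        # Store one transition, discarding the oldest when capacity is exceeded.
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # Uniformly sample a minibatch of stored transitions.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)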
def critic_update(self, state, target):
    self.critic_optimizer.zero_grad()
    target = u.t_float32(target)
    loss = torch.mean((target - self.critic(state)) ** 2)
    loss.backward()
    self.critic_optimizer.step()
    return loss
def intrinsic_motivation_impl(self, i_episode, step, current_sars, current_done):
    # Random motivation
    current_state, current_action, current_reward, current_next_state = current_sars
    intrinsic_reward = torch.rand_like(u.t_float32(current_reward))
    intrinsic_reward = (self.a * intrinsic_reward) + self.b
    return intrinsic_reward
def intrinsic_motivation_impl(self, i_episode, step, current_sars, current_done):
    # Predictive novelty motivation (NM)
    current_state, current_action, current_reward, current_next_state = current_sars
    state_prediction_error = self._train_model(
        u.t_float32(current_state),
        u.t_float32(current_action),
        u.t_float32(current_next_state))
    intrinsic_reward = self.intrinsic_scale_1 * state_prediction_error

    # TODO: clip to roughly the environment's usual reward scale (~1)?
    # intrinsic_reward = torch.clamp(intrinsic_reward, min=-2, max=2)
    # TrainerMetadata().log('intrinsic_reward', torch.mean(intrinsic_reward))

    # TODO: only start returning the intrinsic reward after the expert network
    # has been trained for a while?
    # if self.delayed_start and (TrainerMetadata().global_step < i_episode + self.intrinsic_reward_start):
    #     return 0

    return intrinsic_reward
def intrinsic_motivation_impl(self, i_episode, step, current_sars, current_done):
    # Learning progress motivation (LPM)
    current_state, current_action, current_reward, current_next_state = current_sars
    exemplar = ExemplarStructure(u.t_float32(current_state),
                                 u.t_float32(current_action),
                                 u.t_float32(current_next_state))
    self.region_manager.add(exemplar)
    region = self.region_manager.find_region(exemplar)

    # Learning progress: how much the region's prediction error has decreased.
    past_error = region.get_past_error_mean()
    current_error = region.get_current_error_mean()
    intrinsic_reward = past_error - current_error

    # TODO: clip to roughly the environment's usual reward scale (~1)?
    # intrinsic_reward_batch = torch.clamp(intrinsic_reward_batch, min=-2, max=2)
    # self.viz.draw_line(y=torch.mean(intrinsic_reward_batch), interval=1000, name="intrinsic_reward_batch")

    return intrinsic_reward
def intrinsic_motivation_impl(self, i_episode, step, current_sars, current_done):
    # Predictive surprise motivation (SM)
    current_state, current_action, current_reward, current_next_state = current_sars
    state_prediction_error, meta_prediction_error = self._train_model(
        u.t_float32(current_state),
        u.t_float32(current_action),
        u.t_float32(current_next_state))
    intrinsic_reward = self.intrinsic_scale_1 * (state_prediction_error / meta_prediction_error)

    # TODO: clip to roughly the environment's usual reward scale (~1)?
    # intrinsic_reward = torch.clamp(intrinsic_reward, min=-2, max=2)

    # TODO: only start returning the intrinsic reward after the expert network
    # has been trained for a while?
    # if self.delayed_start and (TrainerMetadata().global_step < i_episode + self.intrinsic_reward_start):
    #     return 0

    # TODO: tanh was added here on a whim
    # intrinsic_reward = torch.tanh(intrinsic_reward).item()

    return intrinsic_reward
def intrinsic_motivation_impl(self, i_episode, step, current_sars, current_done):
    # Predictive familiarity motivation (FM)
    current_state, current_action, current_reward, current_next_state = current_sars
    exemplar = ExemplarStructure(
        u.t_float32(current_state),
        u.t_float32(current_action),
        u.t_float32(current_next_state)
    )
    self.region_manager.add(exemplar)
    region = self.region_manager.find_region(exemplar)

    # Familiarity: the lower the region's current prediction error, the larger the reward.
    current_error = region.get_current_error_mean()
    intrinsic_reward = self.intrinsic_scale_1 / current_error
    intrinsic_reward = u.t_float32(intrinsic_reward)
    # TrainerMetadata().log(value=intrinsic_reward, indicator='intrinsic_reward',
    #                       variable='raw', interval=1, show_only_last=False, compute_maxmin=False)
    intrinsic_reward = torch.clamp(intrinsic_reward, min=-2, max=2)
    # TrainerMetadata().log(value=intrinsic_reward, indicator='intrinsic_reward',
    #                       variable='clamp', interval=1, show_only_last=False, compute_maxmin=False)
    intrinsic_reward = intrinsic_reward.item()
    return intrinsic_reward
def train_model(self, state, action, reward, next_state, done):
    state = u.t_from_np_to_float32(state)
    action = u.t_float32(action)
    next_state = u.t_from_np_to_float32(next_state)

    value = self.critic(state)
    next_value = self.critic(next_state)

    if done:
        advantage = reward - value
        target = reward
    else:
        advantage = (reward + self.discount_factor * next_value) - value
        target = reward + self.discount_factor * next_value

    actor_loss = self.actor_update(state, action, advantage)
    critic_loss = self.critic_update(state, target)
    return actor_loss, critic_loss
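`critic_update` is shown above, but the matching `actor_update` called here is not; the sketch below mirrors the inline actor step of the first `train_model` snippet and assumes a discrete action space with a `Categorical` policy (the method body is an assumption, not the original code):

from torch.distributions import Categorical

# Illustrative sketch only: an actor_update shaped like critic_update above,
# reproducing the policy-gradient step used in the first train_model snippet.
def actor_update(self, state, action, advantage):
    self.actor_optimizer.zero_grad()
    probs = self.actor(state)
    # Push up the log-probability of the taken action, weighted by the advantage.
    loss = -Categorical(probs).log_prob(action) * advantage
    loss.backward()
    self.actor_optimizer.step()
    return loss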
def train_model(self):
    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay

    # Sample a minibatch of transitions from the replay memory,
    # then regroup them by field (i.e., transpose the batch).
    transitions = self.memory.sample(self.batch_size)
    batch = self.transition_structure(*zip(*transitions))

    target = []
    target_val = []

    # Run each stored state through the policy network to get the Q-value of every
    # action, then keep the Q-value of the action that was actually taken.
    for i in range(self.batch_size):
        state = batch.state[i]
        action = batch.action[i]
        target.append(self.policy_model(state).squeeze()[action])

    # Estimate the target network's Q-values for the next states.
    for i in range(self.batch_size):
        next_state = batch.next_state[i]
        target_val.append(self.target_model(next_state))

    # Add the discounted Q-value estimate to the reward only when the episode did not end.
    for i in range(self.batch_size):
        done = batch.done[i]
        reward = batch.reward[i]
        if done:
            target_val[i] = u.t_float32(reward).squeeze()
        else:
            target_val[i] = reward + self.discount_factor * torch.max(
                target_val[i]).to(device)

    # MSE between the policy network's predictions and the target network's estimates.
    self.policy_optimizer.zero_grad()
    loss = nn.MSELoss().to(device)
    loss = loss(torch.stack(target), torch.stack(target_val))
    loss.backward()
    self.policy_optimizer.step()

    # Returned for plotting.
    return loss
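For reference, the per-sample loops above compute the standard Q-learning target r + discount * max_a' Q_target(s', a'); the same computation could also be done in one batched pass, roughly as sketched below. This is not the original code: it assumes each stored state is a 1-D float tensor and that actions, rewards, and done flags can be converted to flat tensors, and it omits device handling for brevity.

import torch


def dqn_targets(policy_model, target_model, batch, discount_factor):
    # Assumes batch.state / batch.next_state are sequences of 1-D float tensors and
    # batch.action / batch.reward / batch.done are sequences of scalars or 0-D tensors.
    states = torch.stack(batch.state)                                     # (B, state_size)
    next_states = torch.stack(batch.next_state)                           # (B, state_size)
    actions = torch.tensor([int(a) for a in batch.action]).unsqueeze(1)   # (B, 1)
    rewards = torch.tensor([float(r) for r in batch.reward])              # (B,)
    dones = torch.tensor([float(d) for d in batch.done])                  # (B,)

    # Q-values of the actions that were actually taken.
    q_taken = policy_model(states).gather(1, actions).squeeze(1)          # (B,)

    # Bootstrapped targets from the target network; no gradient flows through them.
    with torch.no_grad():
        next_q_max = target_model(next_states).max(dim=1)[0]              # (B,)
    targets = rewards + discount_factor * next_q_max * (1.0 - dones)

    return q_taken, targets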
    console_log_order=[
        'Epoch',
        'Score',
        'Time',
    ])

if IS_LOAD:
    TrainerMetadata().load()

# Run for up to the maximum number of episodes.
for i_episode in range(TrainerMetadata().current_epoch, EPISODES):
    TrainerMetadata().start_episode()
    agent.algorithm_rl.reset()
    state = env.reset()
    score = u.t_float32(0)

    # Run each episode for at most the environment's defined maximum number of steps,
    # but end early if the environment reports the terminal state (done) before that.
    for t in range(env.spec.max_episode_steps):
        TrainerMetadata().start_step()
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        sars = (state, action, reward, next_state)
        agent.algorithm_rl.append_sample(sars, done)
        if len(agent.algorithm_rl.memory) >= agent.algorithm_rl.train_start:
            agent.train_model(i_episode, t, sars, done)
def append_sample(self, state, action, reward, next_state, done):
    self.memory.push(u.t_float32(state), action, reward,
                     u.t_float32(next_state), u.t_uint8(done))
checkpoint_inst = Checkpoint(VERSION, IS_SAVE, SAVE_INTERVAL)

"""
State space: 4 values, each with range -∞ < s < ∞
Action space: 1 discrete value, 0 or 1
"""
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)

# Run for up to the maximum number of episodes.
for episode in range(metadata.current_epoch, EPISODES):
    start_time = time.time()
    state = env.reset()
    score = last_loss = u.t_float32(0)
    seongkwageup = 0

    # Run each episode for at most the environment's defined maximum number of steps,
    # but end early if the environment reports the terminal state (done) before that.
    for t in range(env.spec.max_episode_steps):
        action = agent.get_action(state)
        next_state, reward, seokyong_reward, done, _ = god_seokyong_reward_method_step(
            action, t)
        seongkwageup += seokyong_reward
        # next_state, reward, seokyong_reward, done, _ = env.step(action)

        reward = reward if not done or score == 499 + seongkwageup else -400
        # reward = reward if not done or score == 499 else -100

        agent.append_sample(state, action, reward, next_state, done)