def learn(self, num_episodes=300, batch_size=None, print_progess_frequency=10, min_replay_samples=50, repeat_train=16, imshow=True):
    # Main learning method of the agent.
    if batch_size is not None:
        self.batch_size = batch_size
    # Reset steps_done at the start of learning so the random-action ratio decays gradually from the beginning.
    self.steps_done = 0
    train_cnt = 0
    success_cnt = 0
    keep_success_cnt = 0
    start_train_episode = 0
    start_train = False
    # Collect initial experience.
    while not start_train:
        # Reset the environment.
        self.env.reset()
        # Clear accumulated rewards.
        total_rewards = 0
        state = self.get_observation()
        for t in count():
            # Choose an action based on the current state.
            action = self.select_action(state, model_only=False)
            # Apply the action to get the reward and the done flag (the environment has now advanced to the next time step).
            reward, done = self.get_rewards(action)
            # Accumulate rewards.
            total_rewards += reward
            # Force termination once the task is complete (based on 300 steps).
            complete = (not done and t + 1 >= 300)
            if imshow:
                # Refresh the rendered screen.
                self.env.render()
            # Get the observation for the next time step.
            next_state = None if done and not complete else self.get_observation()
            # Store the (state, action, next_state, reward) tuple in replay memory.
            # Remove the comment marks here if you want to reduce the proportion of stored "good cases".
            self.memory.push(state, action, next_state, reward)
            if len(self.memory) % 100 == 0:
                print("Replay Samples:{0}".format(len(self.memory)))
            if len(self.memory) == min_replay_samples:
                print('Start Train!!', flush=True)
                # Training starts only once the replay memory holds enough samples.
                start_train = (len(self.memory) >= min_replay_samples)
                break
            # Move to the next state.
            state = copy.deepcopy(next_state)
            if done or complete:
                break
    # Enter training mode.
    self.training_context['steps'] = 0
    self.steps_done = 0
    for i_episode in range(num_episodes):
        for i in range(repeat_train):
            # Obtain a training batch via experience replay.
            self.output_fn = self.experience_replay
            # Train the model.
            self.train_model(None, None, current_epoch=i_episode, current_batch=i,
                             total_epoch=num_episodes, total_batch=repeat_train,
                             is_collect_data=True if t >= 0 else False,
                             is_print_batch_progress=False, is_print_epoch_progress=False,
                             log_gradients=False, log_weights=False, accumulate_grads=False)
        # Periodically update the target_net weights.
        if i_episode % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict(), strict=True)
            self.save_model(save_path=self.training_context['save_path'])
        # Reset the environment.
        self.env.reset()
        # Clear accumulated rewards.
        total_rewards = 0
        state = self.get_observation()
        tmp_memory = []
        for t in count():
            # Choose an action based on the current state (the optimizer steps happen in train_model above).
            action = self.select_action(state, model_only=True)
            # Apply the action to get the reward and the done flag (the environment has now advanced to the next time step).
            reward, done = self.get_rewards(action)
            # Accumulate rewards.
            total_rewards += reward
            # Force termination once the task is complete (based on 300 steps).
            complete = (not done and t + 1 >= 300)
            if imshow:
                # Refresh the rendered screen.
                self.env.render()
            # Get the observation for the next time step.
            next_state = None if done else self.get_observation()
            # Store the (state, action, next_state, reward) tuple temporarily.
            tmp_memory.append((state, action, next_state, reward))
            # Move to the next state.
            state = next_state
            if done or complete:
                if t >= 200:
                    success_cnt += 1
                else:
                    success_cnt = 0
                # Check whether 300 points are reached in consecutive episodes; if so, stop updating the model.
                if t + 1 >= 300:
                    keep_success_cnt += 1
                else:
                    keep_success_cnt = 0
                if keep_success_cnt >= 2:
                    self.training_context['stop_update'] = 1
                else:
                    self.training_context['stop_update'] = 0
                # Record accumulated rewards.
                self.epoch_metric_history.collect('total_rewards', i_episode, float(total_rewards))
                self.epoch_metric_history.collect('original_rewards', i_episode, float(t))
                # Record the task-completion ratio (based on 200 steps).
                self.epoch_metric_history.collect('task_complete', i_episode, 1.0 if t + 1 >= 200 else 0.0)
                # Periodically print learning progress.
                if i_episode > 0 and i_episode % print_progess_frequency == 0:
                    self.print_epoch_progress(print_progess_frequency)
                # Periodically plot the loss and metric trends over time.
                if i_episode > 0 and i_episode % (5 * print_progess_frequency) == 0:
                    print('negative_reward_ratio:', less(self.training_context['train_data']['reward_batch'], 0).mean().item())
                    print('predict_rewards:', self.training_context['train_data']['predict_rewards'].copy()[:5, 0])
                    print('target_rewards:', self.training_context['train_data']['target_rewards'].copy()[:5, 0])
                    print('reward_batch:', self.training_context['train_data']['reward_batch'].copy()[:5])
                    loss_metric_curve(self.epoch_loss_history, self.epoch_metric_history,
                                      legend=['dqn'], calculate_base='epoch', imshow=imshow)
                if success_cnt == 50:
                    self.save_model(save_path=self.training_context['save_path'])
                    print('50 episodes success, training finish! ')
                    return True
                break
        # print([item[3] for item in tmp_memory])
        sample_idx = []
        indexs = list(range(len(tmp_memory)))
        if len(tmp_memory) > 10:
            # Keep only the last 3 transitions before failure plus a random sample of about sqrt(len(tmp_memory)) others.
            sample_idx.extend(indexs[-1 * min(3, len(tmp_memory)):])
            sample_idx.extend(random_choice(indexs[:-3], int(sqrt(len(tmp_memory)))))
            sample_idx = list(set(sample_idx))
        for k in range(len(tmp_memory)):
            state, action, next_state, reward = tmp_memory[k]
            if k in sample_idx or (k + 3 < len(tmp_memory) and tmp_memory[k + 1][3] < 1) or reward < 1:
                self.memory.push(state, action, next_state, reward)
    print('Complete')
    self.env.render()
    self.env.close()
    plt.ioff()
    plt.show()
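# Both `learn` variants in this listing push transitions into `self.memory` through
# `push(state, action, next_state, reward)` and later draw batches from it during
# experience replay, but the replay-buffer class itself is not shown here. The sketch
# below is only an assumed, minimal implementation of that interface (`push`, `sample`,
# `__len__`, `capacity`), modeled on the common deque-based DQN replay buffer; the
# names `Transition` and `ReplayMemory` are illustrative, not taken from the original code.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Fixed-size buffer that stores transitions and samples them uniformly at random."""

    def __init__(self, capacity=10000):
        self.capacity = capacity
        self._buffer = deque(maxlen=capacity)

    def push(self, state, action, next_state, reward):
        # Append one transition; the oldest transition is dropped once capacity is reached.
        self._buffer.append(Transition(state, action, next_state, reward))

    def sample(self, batch_size):
        # Uniform random sample used to assemble a training batch.
        return random.sample(list(self._buffer), batch_size)

    def __len__(self):
        return len(self._buffer)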
def learn(self, num_episodes=300, batch_size=None, print_progess_frequency=10, imshow=True):
    """The main method for the agent to learn.

    Returns:
        object:
    """
    if batch_size is not None:
        self.batch_size = batch_size
    self.steps_done = 0
    for i_episode in range(num_episodes):
        # Reset the environment.
        self.env.reset()
        # Clear accumulated rewards.
        total_rewards = 0
        state = self.get_observation()
        # Training starts only once the replay memory holds more samples than one batch.
        start_train = (len(self.memory) > self.batch_size)
        for t in count():
            # Choose an action based on the current state.
            action = self.select_action(state)
            # Apply the action to get the reward and the done flag (the environment has now advanced to the next time step).
            reward, done = self.get_rewards(action)
            # Accumulate rewards.
            total_rewards += reward
            # Force termination once the task is complete (based on 300 steps).
            complete = (not done and t + 1 >= 300)
            if imshow:
                # Refresh the rendered screen.
                self.env.render()
            # Get the observation for the next time step.
            next_state = self.get_observation()
            # Store the (state, action, next_state, reward) tuple in memory, down-sampling "good cases" so they
            # do not dominate the buffer: failures are always kept, the first 20 episodes keep everything, and in
            # later episodes successful steps are kept with 10% probability (t < 100) or 20% probability (t >= 100).
            if reward < 1 or (reward == 1 and i_episode < 20) or (
                    reward == 1 and i_episode >= 20 and t < 100 and random.random() < 0.1) or (
                    reward == 1 and i_episode >= 20 and t >= 100 and random.random() < 0.2):
                self.memory.push(state, action, next_state, reward)
            # Move to the next state.
            state = deepcopy(next_state)

            if start_train:
                # Get a batch of training data via experience replay.
                trainData = self.experience_replay(self.batch_size)
                # Switch the model to training mode.
                self.policy_net.train()
                self.train_model(trainData, None,
                                 current_epoch=i_episode,
                                 current_batch=t,
                                 total_epoch=num_episodes,
                                 total_batch=t + 1 if done or complete else t + 2,
                                 is_collect_data=True if done or complete else False,
                                 is_print_batch_progress=False,
                                 is_print_epoch_progress=False,
                                 log_gradients=False,
                                 log_weights=False,
                                 accumulate_grads=False)

            if done or complete:
                if start_train:
                    # self.epoch_metric_history.collect('episode_durations', i_episode, float(t))
                    # Record accumulated rewards.
                    self.epoch_metric_history.collect('total_rewards', i_episode, float(total_rewards))
                    # Record the task-completion ratio (based on 200 steps).
                    self.epoch_metric_history.collect('task_complete', i_episode, 1.0 if t + 1 >= 200 else 0.0)
                    # Periodically print learning progress.
                    if i_episode % print_progess_frequency == 0:
                        self.print_epoch_progress(print_progess_frequency)
                    # Periodically plot the loss and metric trends over time.
                    if i_episode > 0 and (i_episode + 1) % (5 * print_progess_frequency) == 0:
                        print('epsilon:', self.epsilon)
                        print('predict_rewards:', self.training_context['train_data']['predict_rewards'][:5])
                        print('target_rewards:', self.training_context['train_data']['target_rewards'][:5])
                        print('reward_batch:', self.training_context['train_data']['reward_batch'][:5])
                        loss_metric_curve(self.epoch_loss_history, self.epoch_metric_history,
                                          legend=['dqn'], calculate_base='epoch', imshow=imshow)
                break

        # Periodically update the target_net weights.
        if start_train and i_episode % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict(), strict=True)
            self.save_model(save_path=self.training_context['save_path'])
    print('Complete')
    self.env.render()
    self.env.close()
    plt.ioff()
    plt.show()
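# Both `learn` methods above reset `self.steps_done` so that the share of random actions
# decays as training proceeds, but `select_action` itself is not part of this listing.
# The helper below is only an assumed sketch of that epsilon-greedy behaviour with
# exponential decay; the function name and the eps_start/eps_end/eps_decay defaults are
# illustrative, not taken from the original code.
import math
import random

import torch


def select_epsilon_greedy_action(policy_net, state, action_count, steps_done,
                                 eps_start=0.9, eps_end=0.05, eps_decay=1000.0,
                                 model_only=False):
    # The exploration rate decays exponentially with the number of decisions taken so far.
    epsilon = eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)
    if model_only or random.random() > epsilon:
        # Exploit: pick the action with the highest predicted Q value.
        with torch.no_grad():
            return policy_net(state.unsqueeze(0)).argmax(dim=1).item()
    # Explore: pick a uniformly random action.
    return random.randrange(action_count)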
def play(self, num_episodes, batch_size=1, min_replay_samples=1, print_progess_frequency=5, training=True,
         train_timing='on_episode_start', train_every_nstep=1, repeat_train=1, need_render=True):
    if train_timing not in ['on_episode_start', 'on_step_end', 'on_step_start']:
        raise ValueError('Only on_episode_start, on_step_start and on_step_end are valid train_timing options')
    if training:
        self._model.train()
    else:
        self._model.eval()

    if self.use_experience_replay:
        self.collect_samples(min_replay_samples=min_replay_samples)
    else:
        self.collect_samples(min_replay_samples=1, need_render=True if self.replay_unit == 'episode' else False)
    print('start train....')

    self.state_pool = []
    self.reward_pool = []
    self.action_pool = []
    self.total_reward = 0
    self.t = 0
    self.i_episode = 0
    if hasattr(self.env, 'recording_enabled'):
        self.env.recording_enabled = True

    for i_episode in range(num_episodes):
        self.i_episode = i_episode
        # Train at episode start.
        if training and train_timing == 'on_episode_start' and i_episode % train_every_nstep == 0:
            self.training_model(i_episode, 0, num_episodes=num_episodes, repeat_train=repeat_train,
                                train_timing=train_timing, batch_size=batch_size)
        self.env.reset()
        self.total_rewards = 0
        state = self.get_observation()
        for t in count():
            self.t = t
            # Train on_step_start
            # if training and train_timing == 'on_step_start' and t % train_every_nstep == 0:
            #     self.training_model(i_episode, t, num_episodes=num_episodes, repeat_train=repeat_train, batch_size=batch_size)
            action = self.select_action(state, model_only=True)
            observation, reward, done, info = self.get_rewards(action)
            self.total_rewards += reward
            next_state = self.get_observation() if not done else None
            if need_render:
                self.env.render()

            if self.replay_unit == 'step':
                if self.push_into_memory_criteria(state, action, next_state, reward) or done:
                    self.memory.push(state, action, next_state, reward)
            elif self.replay_unit == 'episode':
                self.state_pool.append(state)
                self.action_pool.append(action)
                self.reward_pool.append(reward)
                if done:
                    if self.push_into_memory_criteria(self.state_pool, self.action_pool, None, self.reward_pool):
                        self.memory.push(self.state_pool, self.action_pool, None, self.reward_pool)
                    self.state_pool = []
                    self.action_pool = []
                    self.reward_pool = []

            complete = self.episode_complete_criteria()

            # Train on_step_end.
            if training and train_timing == 'on_step_end' and t % train_every_nstep == 0:
                self.training_model(i_episode, t, num_episodes=num_episodes, done=done or complete,
                                    repeat_train=repeat_train, train_timing=train_timing, batch_size=batch_size,
                                    accumulate_grads=False)

            state = next_state
            if done or complete:
                self.epoch_metric_history.collect('rewards', i_episode, float(self.total_rewards))
                self.epoch_metric_history.collect('t', i_episode, float(t + 1))
                if self.use_experience_replay:
                    self.epoch_metric_history.collect('replay_buffer_utility', i_episode,
                                                      float(len(self.memory)) / self.memory.capacity)
                if print_progess_frequency == 1 or (i_episode > 0 and (i_episode + 1) % print_progess_frequency == 0):
                    self.print_epoch_progress(print_progess_frequency)
                    # n1 = self.action_logs['model'][0]
                    # n2 = self.action_logs['model'][1]
                    # n3 = self.action_logs['random'][0]
                    # n4 = self.action_logs['random'][1]
                    # print('model: 0:{0} 1:{1}  random: 0:{2} 1:{3}  random: {4}'.format(
                    #     float(n1) / (n1 + n2), float(n2) / (n1 + n2),
                    #     float(n3) / builtins.max(n3 + n4, 1), float(n4) / builtins.max(n3 + n4, 1),
                    #     float(n3 + n4) / builtins.max(n1 + n2 + n3 + n4, 1)))
                    #
                    # self.action_logs = OrderedDict()
                    # self.action_logs['model'] = OrderedDict()
                    # self.action_logs['random'] = OrderedDict()
                    # self.action_logs['model'][0] = 0
                    # self.action_logs['model'][1] = 0
                    # self.action_logs['random'][0] = 0
                    # self.action_logs['random'][1] = 0

                # Periodically plot the loss and metric trends over time.
                if i_episode > 0 and (i_episode + 1) % (5 * print_progess_frequency) == 0:
                    loss_metric_curve(self.epoch_loss_history, self.epoch_metric_history,
                                      metrics_names=list(self.epoch_metric_history.keys()),
                                      calculate_base='epoch', imshow=True)
                if self.task_complete_criteria():
                    self.save_model(save_path=self.training_context['save_path'])
                    print('episode {0} meet task complete criteria, training finish! '.format(i_episode))
                    return True
                break
    print('Complete')
    self.env.render()
    self.env.close()
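# A hedged usage sketch for the `play` loop above: `build_agent` is a hypothetical
# factory standing in for however the policy object is actually constructed, and the
# 'CartPole-v1' environment and the keyword values are illustrative assumptions, not
# part of the original code.
import gym

env = gym.make('CartPole-v1')
agent = build_agent(env)  # hypothetical: replace with your own agent construction

# Warm up the replay buffer first, then train at the end of every step for 300 episodes.
agent.play(num_episodes=300,
           batch_size=32,
           min_replay_samples=1000,
           print_progess_frequency=5,
           training=True,
           train_timing='on_step_end',
           train_every_nstep=1,
           repeat_train=1,
           need_render=False)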