def onpolicy_trainer( policy: BasePolicy, train_collector: Collector, test_collector: Collector, max_epoch: int, step_per_epoch: int, repeat_per_collect: int, episode_per_test: int, batch_size: int, step_per_collect: Optional[int] = None, episode_per_collect: Optional[int] = None, train_fn: Optional[Callable[[int, int], None]] = None, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None, logger: BaseLogger = LazyLogger(), verbose: bool = True, test_in_train: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for on-policy trainer procedure. The "step" in trainer means an environment step (a.k.a. transition). :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param Collector train_collector: the collector used for training. :param Collector test_collector: the collector used for testing. :param int max_epoch: the maximum number of epochs for training. The training process might be finished before reaching ``max_epoch`` if ``stop_fn`` is set. :param int step_per_epoch: the number of transitions collected per epoch. :param int repeat_per_collect: the number of repeat time for policy learning, for example, set it to 2 means the policy needs to learn each given batch data twice. :param int episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param int step_per_collect: the number of transitions the collector would collect before the network update, i.e., trainer will collect "step_per_collect" transitions and do some policy network update repeatly in each epoch. :param int episode_per_collect: the number of episodes the collector would collect before the network update, i.e., trainer will collect "episode_per_collect" episodes and do some policy network update repeatly in each epoch. :param function train_fn: a hook called at the beginning of training in each epoch. It can be used to perform custom additional operations, with the signature ``f( num_epoch: int, step_idx: int) -> None``. :param function test_fn: a hook called at the beginning of testing in each epoch. It can be used to perform custom additional operations, with the signature ``f( num_epoch: int, step_idx: int) -> None``. :param function save_fn: a hook called when the undiscounted average mean reward in evaluation phase gets better, with the signature ``f(policy: BasePolicy) -> None``. :param function stop_fn: a function with signature ``f(mean_rewards: float) -> bool``, receives the average undiscounted returns of the testing result, returns a boolean which indicates whether reaching the goal. :param function reward_metric: a function with signature ``f(rewards: np.ndarray with shape (num_episode, agent_num)) -> np.ndarray with shape (num_episode,)``, used in multi-agent RL. We need to return a single scalar for each episode's result to monitor training in the multi-agent RL setting. This function specifies what is the desired metric, e.g., the reward of agent 1 or the average reward over all agents. :param BaseLogger logger: A logger that logs statistics during training/testing/updating. Default to a logger that doesn't log anything. :param bool verbose: whether to print the information. Default to True. :param bool test_in_train: whether to test in the training phase. Default to True. 
:return: See :func:`~tianshou.trainer.gather_info`. .. note:: Only either one of step_per_collect and episode_per_collect can be specified. """ env_step, gradient_step = 0, 0 last_rew, last_len = 0.0, 0 stat: Dict[str, MovAvg] = defaultdict(MovAvg) start_time = time.time() train_collector.reset_stat() test_collector.reset_stat() test_in_train = test_in_train and train_collector.policy == policy test_result = test_episode(policy, test_collector, test_fn, 0, episode_per_test, logger, env_step, reward_metric) best_epoch = 0 best_reward, best_reward_std = test_result["rew"], test_result["rew_std"] for epoch in range(1, 1 + max_epoch): # train policy.train() with tqdm.tqdm(total=step_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config) as t: while t.n < t.total: if train_fn: train_fn(epoch, env_step) result = train_collector.collect(n_step=step_per_collect, n_episode=episode_per_collect) if reward_metric: result["rews"] = reward_metric(result["rews"]) env_step += int(result["n/st"]) t.update(result["n/st"]) logger.log_train_data(result, env_step) last_rew = result['rew'] if 'rew' in result else last_rew last_len = result['len'] if 'len' in result else last_len data = { "env_step": str(env_step), "rew": f"{last_rew:.2f}", "len": str(int(last_len)), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), } if test_in_train and stop_fn and stop_fn(result["rew"]): test_result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, logger, env_step) if stop_fn(test_result["rew"]): if save_fn: save_fn(policy) t.set_postfix(**data) return gather_info(start_time, train_collector, test_collector, test_result["rew"], test_result["rew_std"]) else: policy.train() losses = policy.update(0, train_collector.buffer, batch_size=batch_size, repeat=repeat_per_collect) train_collector.reset_buffer() step = max( [1] + [len(v) for v in losses.values() if isinstance(v, list)]) gradient_step += step for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) if t.n <= t.total: t.update() # test test_result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, logger, env_step) rew, rew_std = test_result["rew"], test_result["rew_std"] if best_epoch == -1 or best_reward < rew: best_reward, best_reward_std = rew, rew_std best_epoch = epoch if save_fn: save_fn(policy) if verbose: print( f"Epoch #{epoch}: test_reward: {rew:.6f} ± {rew_std:.6f}, best_rew" f"ard: {best_reward:.6f} ± {best_reward_std:.6f} in #{best_epoch}" ) if stop_fn and stop_fn(best_reward): break return gather_info(start_time, train_collector, test_collector, best_reward, best_reward_std)
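# --- Illustrative call sketch (editor's addition, not part of the original trainer code).
# It shows how the keyword arguments documented above fit together, assuming ``policy``,
# ``train_collector`` and ``test_collector`` have already been built elsewhere; all argument
# values below are placeholders. Per the docstring note, only one of step_per_collect /
# episode_per_collect may be given.
def _example_onpolicy_usage(policy, train_collector, test_collector):
    """Hypothetical helper showing a typical onpolicy_trainer invocation."""
    result = onpolicy_trainer(
        policy, train_collector, test_collector,
        max_epoch=10,                 # may stop earlier if stop_fn fires
        step_per_epoch=50000,         # environment transitions per epoch
        repeat_per_collect=2,         # learn each collected batch twice
        episode_per_test=10,
        batch_size=64,
        step_per_collect=2000,        # mutually exclusive with episode_per_collect
        stop_fn=lambda mean_rewards: mean_rewards >= 195.0,
    )
    return result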
def onpolicy_trainer( policy: BasePolicy, train_collector: Collector, test_collector: Collector, max_epoch: int, step_per_epoch: int, collect_per_step: int, repeat_per_collect: int, episode_per_test: Union[int, List[int]], batch_size: int, train_fn: Optional[Callable[[int, int], None]] = None, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, writer: Optional[SummaryWriter] = None, log_interval: int = 1, verbose: bool = True, test_in_train: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for on-policy trainer procedure. The "step" in trainer means a policy network update. :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param train_collector: the collector used for training. :type train_collector: :class:`~tianshou.data.Collector` :param test_collector: the collector used for testing. :type test_collector: :class:`~tianshou.data.Collector` :param int max_epoch: the maximum number of epochs for training. The training process might be finished before reaching ``max_epoch``. :param int step_per_epoch: the number of policy network update steps in one epoch. :param int collect_per_step: the number of episodes the collector would collect before the network update. In other words, collect some episodes and do one policy network update. :param int repeat_per_collect: the number of times the policy learns from each collected batch; for example, setting it to 2 means the policy learns each given batch of data twice. :param episode_per_test: the number of episodes for one policy evaluation. :type episode_per_test: int or list of ints :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param function train_fn: a function that receives the current epoch number and step index, and performs some operations at the beginning of training in this epoch. :param function test_fn: a function that receives the current epoch number and step index, and performs some operations at the beginning of testing in this epoch. :param function save_fn: a function for saving the policy when the undiscounted average mean reward in the evaluation phase gets better. :param function stop_fn: a function that receives the average undiscounted returns of the testing result and returns a boolean indicating whether the goal has been reached. :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard SummaryWriter. :param int log_interval: the log interval of the writer. :param bool verbose: whether to print the information. :param bool test_in_train: whether to test in the training phase. :return: See :func:`~tianshou.trainer.gather_info`.
""" env_step, gradient_step = 0, 0 best_epoch, best_reward, best_reward_std = -1, -1.0, 0.0 stat: Dict[str, MovAvg] = {} start_time = time.time() train_collector.reset_stat() test_collector.reset_stat() test_in_train = test_in_train and train_collector.policy == policy for epoch in range(1, 1 + max_epoch): # train policy.train() with tqdm.tqdm( total=step_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config ) as t: while t.n < t.total: if train_fn: train_fn(epoch, env_step) result = train_collector.collect(n_episode=collect_per_step) env_step += int(result["n/st"]) data = { "env_step": str(env_step), "rew": f"{result['rew']:.2f}", "len": str(int(result["len"])), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), "v/ep": f"{result['v/ep']:.2f}", "v/st": f"{result['v/st']:.2f}", } if writer and env_step % log_interval == 0: for k in result.keys(): writer.add_scalar( "train/" + k, result[k], global_step=env_step) if test_in_train and stop_fn and stop_fn(result["rew"]): test_result = test_episode( policy, test_collector, test_fn, epoch, episode_per_test, writer, env_step) if stop_fn(test_result["rew"]): if save_fn: save_fn(policy) for k in result.keys(): data[k] = f"{result[k]:.2f}" t.set_postfix(**data) return gather_info( start_time, train_collector, test_collector, test_result["rew"], test_result["rew_std"]) else: policy.train() losses = policy.update( 0, train_collector.buffer, batch_size=batch_size, repeat=repeat_per_collect) train_collector.reset_buffer() step = max([1] + [ len(v) for v in losses.values() if isinstance(v, list)]) gradient_step += step for k in losses.keys(): if stat.get(k) is None: stat[k] = MovAvg() stat[k].add(losses[k]) data[k] = f"{stat[k].get():.6f}" if writer and gradient_step % log_interval == 0: writer.add_scalar( k, stat[k].get(), global_step=gradient_step) t.update(step) t.set_postfix(**data) if t.n <= t.total: t.update() # test result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, writer, env_step) if best_epoch == -1 or best_reward < result["rew"]: best_reward, best_reward_std = result["rew"], result["rew_std"] best_epoch = epoch if save_fn: save_fn(policy) if verbose: print(f"Epoch #{epoch}: test_reward: {result['rew']:.6f} ± " f"{result['rew_std']:.6f}, best_reward: {best_reward:.6f} ± " f"{best_reward_std:.6f} in #{best_epoch}") if stop_fn and stop_fn(best_reward): break return gather_info(start_time, train_collector, test_collector, best_reward, best_reward_std)
def offline_trainer( policy: BasePolicy, buffer: ReplayBuffer, test_collector: Collector, max_epoch: int, update_per_epoch: int, episode_per_test: int, batch_size: int, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None, logger: BaseLogger = LazyLogger(), verbose: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for offline trainer procedure. The "step" in offline trainer means a gradient step. :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param Collector test_collector: the collector used for testing. :param int max_epoch: the maximum number of epochs for training. The training process might be finished before reaching ``max_epoch`` if ``stop_fn`` is set. :param int update_per_epoch: the number of policy network updates, so-called gradient steps, per epoch. :param episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param function test_fn: a hook called at the beginning of testing in each epoch. It can be used to perform custom additional operations, with the signature ``f( num_epoch: int, step_idx: int) -> None``. :param function save_fn: a hook called when the undiscounted average mean reward in evaluation phase gets better, with the signature ``f(policy: BasePolicy) -> None``. :param function stop_fn: a function with signature ``f(mean_rewards: float) -> bool``, receives the average undiscounted returns of the testing result, returns a boolean which indicates whether reaching the goal. :param function reward_metric: a function with signature ``f(rewards: np.ndarray with shape (num_episode, agent_num)) -> np.ndarray with shape (num_episode,)``, used in multi-agent RL. We need to return a single scalar for each episode's result to monitor training in the multi-agent RL setting. This function specifies what is the desired metric, e.g., the reward of agent 1 or the average reward over all agents. :param BaseLogger logger: A logger that logs statistics during updating/testing. Default to a logger that doesn't log anything. :param bool verbose: whether to print the information. Default to True. :return: See :func:`~tianshou.trainer.gather_info`. 
""" gradient_step = 0 stat: Dict[str, MovAvg] = defaultdict(MovAvg) start_time = time.time() test_collector.reset_stat() test_result = test_episode(policy, test_collector, test_fn, 0, episode_per_test, logger, gradient_step, reward_metric) best_epoch = 0 best_reward, best_reward_std = test_result["rew"], test_result["rew_std"] for epoch in range(1, 1 + max_epoch): policy.train() with tqdm.trange(update_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config) as t: for i in t: gradient_step += 1 losses = policy.update(batch_size, buffer) data = {"gradient_step": str(gradient_step)} for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) # test test_result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, logger, gradient_step, reward_metric) rew, rew_std = test_result["rew"], test_result["rew_std"] if best_epoch == -1 or best_reward < rew: best_reward, best_reward_std = rew, rew_std best_epoch = epoch if save_fn: save_fn(policy) if verbose: print( f"Epoch #{epoch}: test_reward: {rew:.6f} ± {rew_std:.6f}, best_rew" f"ard: {best_reward:.6f} ± {best_reward_std:.6f} in #{best_epoch}" ) if stop_fn and stop_fn(best_reward): break return gather_info(start_time, None, test_collector, best_reward, best_reward_std)
def offpolicy_trainer( policy: BasePolicy, train_collector, max_epoch: int, step_per_epoch: int, collect_per_step: int, batch_size: int, update_per_step: int = 1, train_fn: Optional[Callable[[int], None]] = None, writer: Optional[SummaryWriter] = None, log_interval: int = 1000, ) -> int: """A wrapper for off-policy trainer procedure. The ``step`` in trainer means a policy network update. :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param train_collector: the collector used for training. :type train_collector: :class:`~tianshou.data.Collector` :param test_collector: the collector used for testing. :type test_collector: :class:`~tianshou.data.Collector` :param int max_epoch: the maximum of epochs for training. The training process might be finished before reaching the ``max_epoch``. :param int step_per_epoch: the number of step for updating policy network in one epoch. :param int collect_per_step: the number of frames the collector would collect before the network update. In other words, collect some frames and do some policy network update. :param episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param int update_per_step: the number of times the policy network would be updated after frames are collected, for example, set it to 256 means it updates policy 256 times once after ``collect_per_step`` frames are collected. :param function train_fn: a function receives the current number of epoch index and performs some operations at the beginning of training in this epoch. :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard SummaryWriter. :param int log_interval: the log interval of the writer. :return: See :func:`~tianshou.trainer.gather_info`. """ global_step = 0 best_epoch, best_reward = -1, -1. stat = {} start_time = time.time() for epoch in range(1, 1 + max_epoch): # train policy.train() if train_fn: train_fn(epoch) with tqdm.tqdm(total=step_per_epoch, desc=f'Epoch #{epoch}', **tqdm_config) as t: results = collections.deque(maxlen=100) while t.n < t.total: assert train_collector.policy == policy result = train_collector.collect(n_step=collect_per_step) results.extend([result]) data = {} for i in range(update_per_step * min( min(100, result['n/st']) // collect_per_step, t.total - t.n)): losses = policy.update(batch_size, train_collector.buffer) global_step += collect_per_step for k in result.keys(): data[k] = f'{result[k]:.2f}' if writer and global_step % log_interval == 0: writer.add_scalar('train/' + k, np.mean([r[k] for r in results]), global_step=global_step) for k in losses.keys(): if stat.get(k) is None: stat[k] = MovAvg() stat[k].add(losses[k]) data[k] = f'{stat[k].get():.6f}' if writer and global_step % log_interval == 0: writer.add_scalar(k, stat[k].get(), global_step=global_step) data['exp_noise'] = policy._noise._sigma t.update(1) t.set_postfix(**data) if t.n <= t.total: t.update() return global_step
def Myonpolicy_trainer(
    policy: BasePolicy,
    train_collector: MyCollector,
    test_collector: MyCollector,
    max_epoch: int,
    step_per_epoch: int,
    collect_per_step: int,
    repeat_per_collect: int,
    episode_per_test: Union[int, List[int]],  # how many episodes to run per evaluation
    batch_size: int,
    train_fn: Optional[Callable[[int, int], None]] = None,
    test_fn: Optional[Callable[[int, Optional[int]], None]] = None,
    stop_fn: Optional[Callable[[float], bool]] = None,
    save_fn: Optional[Callable[[BasePolicy], None]] = None,
    writer: Optional[SummaryWriter] = None,
    log_interval: int = 1,
    verbose: bool = True,
    test_in_train: bool = True,
    test_probs: bool = False,
) -> Dict[str, Union[float, str]]:
    """A wrapper for on-policy trainer procedure.

    The "step" in trainer means a policy network update.

    :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class.
    :param train_collector: the collector used for training.
    :type train_collector: :class:`~tianshou.data.Collector`
    :param test_collector: the collector used for testing.
    :type test_collector: :class:`~tianshou.data.Collector`
    :param int max_epoch: the maximum number of epochs for training. The training
        process might be finished before reaching ``max_epoch``.
    :param int step_per_epoch: the maximum number of policy network update steps
        in one epoch.
    :param int collect_per_step: the number of episodes the collector would collect
        before the network update, i.e., how much data is gathered for one update
        step. In other words, collect some episodes and do one policy network update.
    :param int repeat_per_collect: the number of times the policy learns from each
        collected batch; for example, setting it to 2 means the policy learns each
        given batch of data twice.
    :param episode_per_test: the number of episodes for one policy evaluation.
    :type episode_per_test: int or list of ints
    :param int batch_size: the batch size of sample data, which is going to feed in
        the policy network.
    :param function train_fn: a function that receives the current epoch number and
        step index, and performs some operations at the beginning of training in
        this epoch.
    :param function test_fn: a function that receives the current epoch number and
        step index, and performs some operations at the beginning of testing in
        this epoch.
    :param function save_fn: a function for saving the policy when the undiscounted
        average mean reward in the evaluation phase gets better.
    :param function stop_fn: a function that receives the average undiscounted
        returns of the testing result and returns a boolean indicating whether the
        goal has been reached.
    :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard SummaryWriter.
    :param int log_interval: the log interval of the writer.
    :param bool verbose: whether to print the information.
    :param bool test_in_train: whether to test in the training phase.
    :param bool test_probs: whether to additionally evaluate the test set with a
        second threshold (multiple precision levels).
    :return: See :func:`~tianshou.trainer.gather_info`.
    """
    env_step, gradient_step = 0, 0
    best_epoch, best_reward, best_reward_std = -1, -1.0, 0.0
    best_rate = 0.
    best_mate_num = 0.
    best_avg_len = 0.
    stat: Dict[str, MovAvg] = {}
    start_time = time.time()
    train_collector.reset_stat()
    test_collector.reset_stat()
    test_in_train = test_in_train and train_collector.policy == policy
    for epoch in range(1, 1 + max_epoch):
        # train
        policy.train()
        with tqdm.tqdm(total=step_per_epoch, desc=f"Epoch #{epoch}",
                       **tqdm_config) as t:
            while t.n < t.total:
                if train_fn:
                    train_fn(epoch, env_step)
                # collect() returns a result dict containing the data gathered for one update
                result = train_collector.collect(n_episode=collect_per_step)
                env_step += int(result["n/st"])
                data = {
                    "env_step": str(env_step),
                    "rew": f"{result['rew']:.2f}",
                    "len": str(int(result["len"])),
                    "n/ep": str(int(result["n/ep"])),
                    "n/st": str(int(result["n/st"])),
                    "v/ep": f"{result['v/ep']:.2f}",
                    "v/st": f"{result['v/st']:.2f}",
                    "rate": f"{result['hit_rate']:.2f}",
                }
                if writer and env_step % log_interval == 0:
                    for k in result.keys():
                        if "class" not in k:
                            writer.add_scalar("train/" + k, result[k],
                                              global_step=env_step)
                if test_in_train and stop_fn and stop_fn(result["rew"]):
                    test_result = test_episode(policy, test_collector, test_fn,
                                               epoch, episode_per_test, writer,
                                               env_step)
                    if stop_fn(test_result["rew"]):
                        if save_fn:
                            save_fn(policy)
                        for k in result.keys():
                            data[k] = f"{result[k]:.2f}"
                        t.set_postfix(**data)
                        return gather_info(start_time, train_collector,
                                           test_collector, test_result["rew"],
                                           test_result["rew_std"])
                    else:
                        policy.train()
                # the training data is whatever is currently in the collector's buffer
                losses = policy.update(0, train_collector.buffer,
                                       batch_size=batch_size,
                                       repeat=repeat_per_collect)
                train_collector.reset_buffer()
                # step = length of the longest loss list, i.e. the number of
                # gradient steps performed in this update
                step = max([1] + [len(v) for v in losses.values()
                                  if isinstance(v, list)])
                gradient_step += step
                for k in losses.keys():
                    if stat.get(k) is None:
                        stat[k] = MovAvg()
                    stat[k].add(losses[k])
                    data[k] = f"{stat[k].get():.6f}"
                    if writer and gradient_step % log_interval == 0:
                        writer.add_scalar(k, stat[k].get(),
                                          global_step=gradient_step)
                t.update(step)
                t.set_postfix(**data)
            if t.n <= t.total:
                t.update()
        # test
        # use separate timers here so that the outer start_time (used by
        # gather_info) is not overwritten
        test_start_time = time.time()
        # does this guarantee that every item in the test set is visited, and
        # visited exactly once?
        result = test_episode(policy, test_collector, test_fn, epoch,
                              episode_per_test, writer, env_step)
        test_end_time = time.time()
        print("total_time: ", (test_end_time - test_start_time) / 60,
              time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime()))
        if test_probs:  # evaluate with a second threshold
            # policy.actor.threshold = 0.9
            test_probs_result = test_episode(policy, test_collector, test_fn,
                                             epoch, episode_per_test, writer,
                                             env_step, name='test_prob1/')
            print(result['hit_rate'], test_probs_result['hit_rate'])
            best_rate = max(best_rate, test_probs_result['hit_rate'])
            # policy.actor.threshold = 0.95
        test_hit_rate = result['hit_rate']
        best_flag = 0
        if best_epoch == -1 or best_rate < test_hit_rate:
            best_reward, best_reward_std = result["rew"], result["rew_std"]
            best_rate = test_hit_rate
            best_epoch = epoch
            best_mate_num = result['mate_num']
            best_flag = 1
            best_avg_len = result['len']
        if best_rate == test_hit_rate and result['mate_num'] > best_mate_num:
            best_mate_num = result['mate_num']
            best_avg_len = result['len']
            best_flag = 1
        if save_fn and best_flag == 1:
            print("saving new best policy")
            save_fn(policy)
        if verbose:
            print(f"Epoch #{epoch}: test_reward: {result['rew']:.6f} ± "
                  f"{result['rew_std']:.6f}, best_reward: {best_reward:.6f} ± "
                  f"{best_reward_std:.6f}"
                  f" hit_rate: {test_hit_rate:.3f}"
                  f" mate_num: {result['mate_num']}"
                  f" avg_len: {result['len']}")
            print(f" best_rate: {best_rate:.3f}"
                  f" best_mate_num: {best_mate_num:.3f}"
                  f" best_len: {best_avg_len:.3f}"
                  f" in #{best_epoch}")
        # if result['class_rate'] is not None:  # print the per-class hit rate
        #     ans = {}
        #     right_class_num = total_class_num()
        #     for ke, val in result['class_rate'].items():
        #         if ke in right_class_num.keys():
        #             ans[ke] = val / right_class_num[ke]
        #     print(ans)
        #     print(result['class_rate'])
        if stop_fn and stop_fn(best_reward):
            break
    return gather_info(start_time, train_collector, test_collector, best_reward,
                       best_reward_std, best_rate, best_mate_num, best_avg_len)
def offpolicy_trainer(
    policy: BasePolicy,
    train_collector,
    max_epoch: int,
    step_per_epoch: int,
    collect_per_step: int,
    batch_size: int,
    update_per_step: int = 1,
    train_fn: Optional[Callable[[int], None]] = None,
    writer: Optional[SummaryWriter] = None,
    log_interval: int = 100,
) -> int:
    """A wrapper for off-policy trainer procedure.

    The ``step`` in trainer means a policy network update.

    :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class.
    :param train_collector: the collector used for training.
    :type train_collector: :class:`~tianshou.data.Collector`
    :param int max_epoch: the maximum number of epochs for training. The training
        process might be finished before reaching ``max_epoch``.
    :param int step_per_epoch: the number of policy network update steps in one
        epoch.
    :param int collect_per_step: the number of frames the collector would collect
        before the network update. In other words, collect some frames and do some
        policy network updates.
    :param int batch_size: the batch size of sample data, which is going to feed in
        the policy network.
    :param int update_per_step: the number of times the policy network is updated
        after frames are collected; for example, setting it to 256 means the policy
        is updated 256 times after every ``collect_per_step`` collected frames.
    :param function train_fn: a function that receives the current epoch number and
        performs some operations at the beginning of training in this epoch.
    :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard SummaryWriter.
    :param int log_interval: the log interval of the writer.
    :return: the final value of the internal ``global_step`` counter.
    """
    global_step = 0
    update_step = 0
    best_epoch, best_reward = -1, -1.
    stat = {}
    start_time = time.time()
    results = collections.deque(maxlen=300)
    # per-world episode statistics; WORLD_NUM is expected to be defined at module level
    world_results = [collections.deque(maxlen=10) for _ in range(WORLD_NUM)]
    world_count = [1] * WORLD_NUM
    world_pcount = [1] * WORLD_NUM
    for epoch in range(1, 1 + max_epoch):
        # train
        policy.train()
        if train_fn:
            train_fn(epoch)
        with tqdm.tqdm(total=step_per_epoch, desc=f'Epoch #{epoch}',
                       **tqdm_config) as t:
            while t.n < t.total:
                assert train_collector.policy == policy
                result = train_collector.collect(n_step=collect_per_step)
                world_pcount = world_count.copy()
                for i, w in enumerate(result["world"]):
                    world_results[w].append({
                        "ep_rew": result["ep_rew"][i],
                        "ep_len": result["ep_len"][i],
                        "success": result["success"][i],
                        "global_step": global_step,
                    })
                    world_count[w] += 1
                for w in range(WORLD_NUM):
                    # log per-world averages every 10 finished episodes in that world
                    if world_count[w] // 10 > world_pcount[w] // 10:
                        for k in world_results[w][0].keys():
                            writer.add_scalar(
                                'world_%d/' % w + k,
                                np.mean([r[k] for r in world_results[w]]),
                                global_step=world_count[w])
                n_ep = len(result["success"])
                result = [{
                    "ep_rew": result["ep_rew"][i],
                    "ep_len": result["ep_len"][i],
                    "success": result["success"][i],
                } for i in range(n_ep)]
                results.extend(result)
                data = {"n_ep": n_ep}
                n_step = sum([r["ep_len"] for r in result])
                global_step += n_step
                # bound the number of updates derived from this collect
                n_step = np.clip(n_step, 10, 5000)
                for i in range(update_per_step * min(n_step // collect_per_step,
                                                     t.total - t.n)):
                    losses = policy.update(batch_size, train_collector.buffer)
                    update_step += 1
                    if len(result) > 0:
                        for k in result[0].keys():
                            data[k] = f"{np.mean([r[k] for r in result]):.2f}"
                            if writer and update_step % log_interval == 0:
                                writer.add_scalar(
                                    'train/' + k,
                                    np.mean([r[k] for r in results]),
                                    global_step=global_step)
                    for k in losses.keys():
                        if stat.get(k) is None:
                            stat[k] = MovAvg()
                        stat[k].add(losses[k])
                        data[k] = f'{stat[k].get():.6f}'
                        if writer and update_step % log_interval == 0:
                            writer.add_scalar(k, stat[k].get(),
                                              global_step=update_step)
                    try:
                        data['exp_noise'] = policy._noise._sigma
                    except AttributeError:
                        # some policies expose a scalar noise value instead
                        data['exp_noise'] = policy._noise
                    t.update(1)
                    t.set_postfix(**data)
            if t.n <= t.total:
                t.update()
    return global_step
def offpolicy_trainer( policy: BasePolicy, train_collector: Collector, test_collector: Collector, max_epoch: int, step_per_epoch: int, collect_per_step: int, episode_per_test: Union[int, List[int]], batch_size: int, update_per_step: int = 1, train_fn: Optional[Callable[[int], None]] = None, test_fn: Optional[Callable[[int], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, writer: Optional[SummaryWriter] = None, log_interval: int = 1, verbose: bool = True, test_in_train: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for off-policy trainer procedure. The ``step`` in trainer means a policy network update. :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param train_collector: the collector used for training. :type train_collector: :class:`~tianshou.data.Collector` :param test_collector: the collector used for testing. :type test_collector: :class:`~tianshou.data.Collector` :param int max_epoch: the maximum of epochs for training. The training process might be finished before reaching the ``max_epoch``. :param int step_per_epoch: the number of step for updating policy network in one epoch. :param int collect_per_step: the number of frames the collector would collect before the network update. In other words, collect some frames and do some policy network update. :param episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param int update_per_step: the number of times the policy network would be updated after frames are collected, for example, set it to 256 means it updates policy 256 times once after ``collect_per_step`` frames are collected. :param function train_fn: a function receives the current number of epoch index and performs some operations at the beginning of training in this epoch. :param function test_fn: a function receives the current number of epoch index and performs some operations at the beginning of testing in this epoch. :param function save_fn: a function for saving policy when the undiscounted average mean reward in evaluation phase gets better. :param function stop_fn: a function receives the average undiscounted returns of the testing result, return a boolean which indicates whether reaching the goal. :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard SummaryWriter. :param int log_interval: the log interval of the writer. :param bool verbose: whether to print the information. :param bool test_in_train: whether to test in the training phase. :return: See :func:`~tianshou.trainer.gather_info`. """ global_step = 0 best_epoch, best_reward = -1, -1. 
stat = {} start_time = time.time() test_in_train = test_in_train and train_collector.policy == policy for epoch in range(1, 1 + max_epoch): # train policy.train() if train_fn: train_fn(epoch) with tqdm.tqdm(total=step_per_epoch, desc=f'Epoch #{epoch}', **tqdm_config) as t: while t.n < t.total: result = train_collector.collect(n_step=collect_per_step) data = {} if test_in_train and stop_fn and stop_fn(result['rew']): test_result = test_episode( policy, test_collector, test_fn, epoch, episode_per_test, writer, global_step) if stop_fn and stop_fn(test_result['rew']): if save_fn: save_fn(policy) for k in result.keys(): data[k] = f'{result[k]:.2f}' t.set_postfix(**data) return gather_info( start_time, train_collector, test_collector, test_result['rew']) else: policy.train() if train_fn: train_fn(epoch) for i in range(update_per_step * min( result['n/st'] // collect_per_step, t.total - t.n)): global_step += collect_per_step losses = policy.update(batch_size, train_collector.buffer) for k in result.keys(): data[k] = f'{result[k]:.2f}' if writer and global_step % log_interval == 0: writer.add_scalar('train/' + k, result[k], global_step=global_step) for k in losses.keys(): if stat.get(k) is None: stat[k] = MovAvg() stat[k].add(losses[k]) data[k] = f'{stat[k].get():.6f}' if writer and global_step % log_interval == 0: writer.add_scalar( k, stat[k].get(), global_step=global_step) t.update(1) t.set_postfix(**data) if t.n <= t.total: t.update() # test result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, writer, global_step) if best_epoch == -1 or best_reward < result['rew']: best_reward = result['rew'] best_epoch = epoch if save_fn: save_fn(policy) if verbose: print(f'Epoch #{epoch}: test_reward: {result["rew"]:.6f}, ' f'best_reward: {best_reward:.6f} in #{best_epoch}') if stop_fn and stop_fn(best_reward): break return gather_info( start_time, train_collector, test_collector, best_reward)
def onpolicy_trainer( policy: BasePolicy, train_collector: Collector, test_collector: Collector, max_epoch: int, frame_per_epoch: int, collect_per_step: int, repeat_per_collect: int, episode_per_test: Union[int, Sequence[int]], batch_size: int, train_fn: Optional[Callable[[int, int], None]] = None, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, writer: Optional[SummaryWriter] = None, log_interval: int = 1, verbose: bool = True, test_in_train: bool = True, **kwargs) -> Dict[str, Union[float, str]]: """Slightly modified Tianshou `onpolicy_trainer` original method to enable to define the maximum number of training steps instead of number of episodes, for consistency with other learning frameworks. """ global_step = 0 best_epoch, best_reward = -1, -1.0 stat: Dict[str, MovAvg] = {} start_time = time.time() train_collector.reset_stat() test_collector.reset_stat() test_in_train = test_in_train and train_collector.policy == policy for epoch in range(1, 1 + max_epoch): # train policy.train() with tqdm.tqdm( total=frame_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config ) as t: while t.n < t.total: if train_fn: train_fn(epoch, global_step) result = train_collector.collect(n_step=collect_per_step) data = {} if test_in_train and stop_fn and stop_fn(result["rew"]): test_result = test_episode( policy, test_collector, test_fn, epoch, episode_per_test, writer, global_step) if stop_fn(test_result["rew"]): if save_fn: save_fn(policy) for k in result.keys(): data[k] = f"{result[k]:.2f}" t.set_postfix(**data) return gather_info( start_time, train_collector, test_collector, test_result["rew"]) else: policy.train() losses = policy.update( 0, train_collector.buffer, batch_size=batch_size, repeat=repeat_per_collect) train_collector.reset_buffer() step = 1 for v in losses.values(): if isinstance(v, (list, tuple)): step = max(step, len(v)) global_step += step * collect_per_step for k in result.keys(): data[k] = f"{result[k]:.2f}" if writer and global_step % log_interval == 0: writer.add_scalar( "train/" + k, result[k], global_step=global_step) for k in losses.keys(): if stat.get(k) is None: stat[k] = MovAvg() stat[k].add(losses[k]) data[k] = f"{stat[k].get():.6f}" if writer and global_step % log_interval == 0: writer.add_scalar( k, stat[k].get(), global_step=global_step) t.update(collect_per_step) t.set_postfix(**data) if t.n <= t.total: t.update() # test result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, writer, global_step) if best_epoch == -1 or best_reward < result["rew"]: best_reward = result["rew"] best_epoch = epoch if save_fn: save_fn(policy) if verbose: print(f"Epoch #{epoch}: test_reward: {result['rew']:.6f}, " f"best_reward: {best_reward:.6f} in #{best_epoch}") if stop_fn and stop_fn(best_reward): break return gather_info( start_time, train_collector, test_collector, best_reward)
def offpolicy_trainer( policy: BasePolicy, train_collector: Collector, test_collector: Collector, max_epoch: int, step_per_epoch: int, step_per_collect: int, episode_per_test: int, batch_size: int, update_per_step: Union[int, float] = 1, train_fn: Optional[Callable[[int, int], None]] = None, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, save_checkpoint_fn: Optional[Callable[[int, int, int], None]] = None, resume_from_log: bool = False, reward_metric: Optional[Callable[[np.ndarray], np.ndarray]] = None, logger: BaseLogger = LazyLogger(), verbose: bool = True, test_in_train: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for off-policy trainer procedure. The "step" in trainer means an environment step (a.k.a. transition). :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param Collector train_collector: the collector used for training. :param Collector test_collector: the collector used for testing. :param int max_epoch: the maximum number of epochs for training. The training process might be finished before reaching ``max_epoch`` if ``stop_fn`` is set. :param int step_per_epoch: the number of transitions collected per epoch. :param int step_per_collect: the number of transitions the collector would collect before the network update, i.e., trainer will collect "step_per_collect" transitions and do some policy network update repeatly in each epoch. :param episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param int/float update_per_step: the number of times the policy network would be updated per transition after (step_per_collect) transitions are collected, e.g., if update_per_step set to 0.3, and step_per_collect is 256, policy will be updated round(256 * 0.3 = 76.8) = 77 times after 256 transitions are collected by the collector. Default to 1. :param function train_fn: a hook called at the beginning of training in each epoch. It can be used to perform custom additional operations, with the signature ``f( num_epoch: int, step_idx: int) -> None``. :param function test_fn: a hook called at the beginning of testing in each epoch. It can be used to perform custom additional operations, with the signature ``f( num_epoch: int, step_idx: int) -> None``. :param function save_fn: a hook called when the undiscounted average mean reward in evaluation phase gets better, with the signature ``f(policy: BasePolicy) -> None``. :param function save_checkpoint_fn: a function to save training process, with the signature ``f(epoch: int, env_step: int, gradient_step: int) -> None``; you can save whatever you want. :param bool resume_from_log: resume env_step/gradient_step and other metadata from existing tensorboard log. Default to False. :param function stop_fn: a function with signature ``f(mean_rewards: float) -> bool``, receives the average undiscounted returns of the testing result, returns a boolean which indicates whether reaching the goal. :param function reward_metric: a function with signature ``f(rewards: np.ndarray with shape (num_episode, agent_num)) -> np.ndarray with shape (num_episode,)``, used in multi-agent RL. We need to return a single scalar for each episode's result to monitor training in the multi-agent RL setting. 
This function specifies what is the desired metric, e.g., the reward of agent 1 or the average reward over all agents. :param BaseLogger logger: A logger that logs statistics during training/testing/updating. Default to a logger that doesn't log anything. :param bool verbose: whether to print the information. Default to True. :param bool test_in_train: whether to test in the training phase. Default to True. :return: See :func:`~tianshou.trainer.gather_info`. """ if save_fn: warnings.warn("Please consider using save_checkpoint_fn instead of save_fn.") start_epoch, env_step, gradient_step = 0, 0, 0 if resume_from_log: start_epoch, env_step, gradient_step = logger.restore_data() last_rew, last_len = 0.0, 0 stat: Dict[str, MovAvg] = defaultdict(MovAvg) start_time = time.time() train_collector.reset_stat() test_collector.reset_stat() test_in_train = test_in_train and train_collector.policy == policy test_result = test_episode(policy, test_collector, test_fn, start_epoch, episode_per_test, logger, env_step, reward_metric) best_epoch = start_epoch best_reward, best_reward_std = test_result["rew"], test_result["rew_std"] for epoch in range(1 + start_epoch, 1 + max_epoch): # train policy.train() with tqdm.tqdm( total=step_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config ) as t: while t.n < t.total: if train_fn: train_fn(epoch, env_step) result = train_collector.collect(n_step=step_per_collect) if result["n/ep"] > 0 and reward_metric: result["rews"] = reward_metric(result["rews"]) env_step += int(result["n/st"]) t.update(result["n/st"]) logger.log_train_data(result, env_step) last_rew = result['rew'] if 'rew' in result else last_rew last_len = result['len'] if 'len' in result else last_len data = { "env_step": str(env_step), "rew": f"{last_rew:.2f}", "len": str(int(last_len)), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), } if result["n/ep"] > 0: if test_in_train and stop_fn and stop_fn(result["rew"]): test_result = test_episode( policy, test_collector, test_fn, epoch, episode_per_test, logger, env_step) if stop_fn(test_result["rew"]): if save_fn: save_fn(policy) logger.save_data( epoch, env_step, gradient_step, save_checkpoint_fn) t.set_postfix(**data) return gather_info( start_time, train_collector, test_collector, test_result["rew"], test_result["rew_std"]) else: policy.train() for i in range(round(update_per_step * result["n/st"])): gradient_step += 1 losses = policy.update(batch_size, train_collector.buffer) for k in losses.keys(): stat[k].add(losses[k]) losses[k] = stat[k].get() data[k] = f"{losses[k]:.3f}" logger.log_update_data(losses, gradient_step) t.set_postfix(**data) if t.n <= t.total: t.update() # test test_result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, logger, env_step, reward_metric) rew, rew_std = test_result["rew"], test_result["rew_std"] if best_epoch < 0 or best_reward < rew: best_epoch, best_reward, best_reward_std = epoch, rew, rew_std if save_fn: save_fn(policy) logger.save_data(epoch, env_step, gradient_step, save_checkpoint_fn) if verbose: print(f"Epoch #{epoch}: test_reward: {rew:.6f} ± {rew_std:.6f}, best_rew" f"ard: {best_reward:.6f} ± {best_reward_std:.6f} in #{best_epoch}") if stop_fn and stop_fn(best_reward): break return gather_info(start_time, train_collector, test_collector, best_reward, best_reward_std)
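# --- reward_metric sketch (editor's addition): the multi-agent reward reduction described in
# the docstring above. Which agent (or aggregate) to monitor is an assumption; the only fixed
# contract is that an array of shape (num_episode, agent_num) is reduced to shape (num_episode,).
def _example_reward_metric(rewards: np.ndarray) -> np.ndarray:
    """Hypothetical metric: monitor the average reward over all agents per episode."""
    return rewards.mean(axis=1)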
def onpolicy_trainer( policy: BasePolicy, train_collector: Collector, max_epoch: int, step_per_epoch: int, collect_per_step: int, repeat_per_collect: int, batch_size: int, train_fn: Optional[Callable[[int], None]] = None, log_interval: int = 1, verbose: bool = True, test_in_train: bool = True, writer: Optional[SummaryWriter] = None, ): """A wrapper for on-policy trainer procedure. The ``step`` in trainer means a policy network update. :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param train_collector: the collector used for training. :type train_collector: :class:`~tianshou.data.Collector` :param test_collector: the collector used for testing. :type test_collector: :class:`~tianshou.data.Collector` :param int max_epoch: the maximum of epochs for training. The training process might be finished before reaching the ``max_epoch``. :param int step_per_epoch: the number of step for updating policy network in one epoch. :param int collect_per_step: the number of episodes the collector would collect before the network update. In other words, collect some episodes and do one policy network update. :param int repeat_per_collect: the number of repeat time for policy learning, for example, set it to 2 means the policy needs to learn each given batch data twice. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param function train_fn: a function receives the current number of epoch index and performs some operations at the beginning of training in this epoch. :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard SummaryWriter. :param int log_interval: the log interval of the writer. :param bool verbose: whether to print the information. :param bool test_in_train: whether to test in the training phase. :return: See :func:`~tianshou.trainer.gather_info`. """ global_step = 0 best_epoch, best_reward = -1, -1 stat = {} start_time = time.time() test_in_train = test_in_train and train_collector.policy == policy for epoch in range(1, 1 + max_epoch): # train policy.train() if train_fn: train_fn(epoch) with tqdm.tqdm(total=step_per_epoch, desc=f'Epoch #{epoch}', **tqdm_config) as t: while t.n < t.total: result = train_collector.collect(n_episode=collect_per_step) data = {} losses = policy.update(0, train_collector.buffer, batch_size, repeat_per_collect) train_collector.reset_buffer() step = 1 for k in losses.keys(): if isinstance(losses[k], list): step = max(step, len(losses[k])) global_step += step for k in result.keys(): data[k] = f'{result[k]:.2f}' if writer and global_step % log_interval == 0: writer.add_scalar(k, result[k], global_step=global_step) for k in losses.keys(): if stat.get(k) is None: stat[k] = MovAvg() stat[k].add(losses[k]) data[k] = f'{stat[k].get():.6f}' if writer and global_step % log_interval == 0: writer.add_scalar(k, stat[k].get(), global_step=global_step) t.update(step) t.set_postfix(**data) if t.n <= t.total: t.update() return global_step
def offline_trainer( policy: BasePolicy, buffer: ReplayBuffer, test_collector: Collector, max_epoch: int, step_per_epoch: int, episode_per_test: Union[int, List[int]], batch_size: int, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, writer: Optional[SummaryWriter] = None, log_interval: int = 1, verbose: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for offline trainer procedure. The "step" in trainer means a policy network update. :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param test_collector: the collector used for testing. :type test_collector: :class:`~tianshou.data.Collector` :param int max_epoch: the maximum number of epochs for training. The training process might be finished before reaching the ``max_epoch``. :param int step_per_epoch: the number of policy network updates, so-called gradient steps, per epoch. :param episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param function test_fn: a hook called at the beginning of testing in each epoch. It can be used to perform custom additional operations, with the signature ``f(num_epoch: int, step_idx: int) -> None``. :param function save_fn: a hook called when the undiscounted average mean reward in evaluation phase gets better, with the signature ``f(policy: BasePolicy) -> None``. :param function stop_fn: a function with signature ``f(mean_rewards: float) -> bool``, receives the average undiscounted returns of the testing result, returns a boolean which indicates whether reaching the goal. :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard SummaryWriter; if None is given, it will not write logs to TensorBoard. :param int log_interval: the log interval of the writer. :param bool verbose: whether to print the information. :return: See :func:`~tianshou.trainer.gather_info`. """ gradient_step = 0 best_epoch, best_reward, best_reward_std = -1, -1.0, 0.0 stat: Dict[str, MovAvg] = defaultdict(MovAvg) start_time = time.time() test_collector.reset_stat() for epoch in range(1, 1 + max_epoch): policy.train() with tqdm.trange(step_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config) as t: for i in t: gradient_step += 1 losses = policy.update(batch_size, buffer) data = {"gradient_step": str(gradient_step)} for k in losses.keys(): stat[k].add(losses[k]) data[k] = f"{stat[k].get():.6f}" if writer and gradient_step % log_interval == 0: writer.add_scalar("train/" + k, stat[k].get(), global_step=gradient_step) t.set_postfix(**data) # test result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, writer, gradient_step) if best_epoch == -1 or best_reward < result["rew"]: best_reward, best_reward_std = result["rew"], result["rew_std"] best_epoch = epoch if save_fn: save_fn(policy) if verbose: print(f"Epoch #{epoch}: test_reward: {result['rew']:.6f} ± " f"{result['rew_std']:.6f}, best_reward: {best_reward:.6f} ± " f"{best_reward_std:.6f} in #{best_epoch}") if stop_fn and stop_fn(best_reward): break return gather_info(start_time, None, test_collector, best_reward, best_reward_std)
def offpolicy_trainer( policy: BasePolicy, train_collector: Collector, test_collector: Collector, max_epoch: int, step_per_epoch: int, collect_per_step: int, episode_per_test: Union[int, List[int]], batch_size: int, update_per_step: int = 1, train_fn: Optional[Callable[[int, int], None]] = None, test_fn: Optional[Callable[[int, Optional[int]], None]] = None, stop_fn: Optional[Callable[[float], bool]] = None, save_fn: Optional[Callable[[BasePolicy], None]] = None, writer: Optional[SummaryWriter] = None, log_interval: int = 1, verbose: bool = True, test_in_train: bool = True, ) -> Dict[str, Union[float, str]]: """A wrapper for off-policy trainer procedure. The "step" in trainer means a policy network update. :param policy: an instance of the :class:`~tianshou.policy.BasePolicy` class. :param train_collector: the collector used for training. :type train_collector: :class:`~tianshou.data.Collector` :param test_collector: the collector used for testing. :type test_collector: :class:`~tianshou.data.Collector` :param int max_epoch: the maximum number of epochs for training. The training process might be finished before reaching the ``max_epoch``. :param int step_per_epoch: the number of policy network updates, so-called gradient steps, per epoch. :param int collect_per_step: the number of frames the collector would collect before the network update. In other words, collect some frames and do some policy network update. :param episode_per_test: the number of episodes for one policy evaluation. :param int batch_size: the batch size of sample data, which is going to feed in the policy network. :param int update_per_step: the number of times the policy network would be updated after frames are collected, for example, set it to 256 means it updates policy 256 times once after ``collect_per_step`` frames are collected. :param function train_fn: a hook called at the beginning of training in each epoch. It can be used to perform custom additional operations, with the signature ``f(num_epoch: int, step_idx: int) -> None``. :param function test_fn: a hook called at the beginning of testing in each epoch. It can be used to perform custom additional operations, with the signature ``f(num_epoch: int, step_idx: int) -> None``. :param function save_fn: a hook called when the undiscounted average mean reward in evaluation phase gets better, with the signature ``f(policy: BasePolicy) -> None``. :param function stop_fn: a function with signature ``f(mean_rewards: float) -> bool``, receives the average undiscounted returns of the testing result, returns a boolean which indicates whether reaching the goal. :param torch.utils.tensorboard.SummaryWriter writer: a TensorBoard SummaryWriter; if None is given, it will not write logs to TensorBoard. :param int log_interval: the log interval of the writer. :param bool verbose: whether to print the information. :param bool test_in_train: whether to test in the training phase. :return: See :func:`~tianshou.trainer.gather_info`. 
""" env_step, gradient_step = 0, 0 best_epoch, best_reward, best_reward_std = -1, -1.0, 0.0 stat: Dict[str, MovAvg] = defaultdict(MovAvg) start_time = time.time() train_collector.reset_stat() test_collector.reset_stat() test_in_train = test_in_train and train_collector.policy == policy for epoch in range(1, 1 + max_epoch): # train policy.train() with tqdm.tqdm(total=step_per_epoch, desc=f"Epoch #{epoch}", **tqdm_config) as t: while t.n < t.total: if train_fn: train_fn(epoch, env_step) result = train_collector.collect(n_step=collect_per_step) env_step += int(result["n/st"]) data = { "env_step": str(env_step), "rew": f"{result['rew']:.2f}", "len": str(int(result["len"])), "n/ep": str(int(result["n/ep"])), "n/st": str(int(result["n/st"])), "v/ep": f"{result['v/ep']:.2f}", "v/st": f"{result['v/st']:.2f}", } if writer and env_step % log_interval == 0: for k in result.keys(): writer.add_scalar("train/" + k, result[k], global_step=env_step) if test_in_train and stop_fn and stop_fn(result["rew"]): test_result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, writer, env_step) if stop_fn(test_result["rew"]): if save_fn: save_fn(policy) for k in result.keys(): data[k] = f"{result[k]:.2f}" t.set_postfix(**data) return gather_info(start_time, train_collector, test_collector, test_result["rew"], test_result["rew_std"]) else: policy.train() for i in range(update_per_step * min( result["n/st"] // collect_per_step, t.total - t.n)): gradient_step += 1 losses = policy.update(batch_size, train_collector.buffer) for k in losses.keys(): stat[k].add(losses[k]) data[k] = f"{stat[k].get():.6f}" if writer and gradient_step % log_interval == 0: writer.add_scalar(k, stat[k].get(), global_step=gradient_step) t.update(1) t.set_postfix(**data) if t.n <= t.total: t.update() # test result = test_episode(policy, test_collector, test_fn, epoch, episode_per_test, writer, env_step) if best_epoch == -1 or best_reward < result["rew"]: best_reward, best_reward_std = result["rew"], result["rew_std"] best_epoch = epoch if save_fn: save_fn(policy) if verbose: print(f"Epoch #{epoch}: test_reward: {result['rew']:.6f} ± " f"{result['rew_std']:.6f}, best_reward: {best_reward:.6f} ± " f"{best_reward_std:.6f} in #{best_epoch}") if stop_fn and stop_fn(best_reward): break return gather_info(start_time, train_collector, test_collector, best_reward, best_reward_std)