def test_pg(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, args.action_shape, device=args.device, softmax=True) net = net.to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) dist = torch.distributions.Categorical policy = PGPolicy(net, optim, dist, args.gamma) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log writer = SummaryWriter(args.logdir + '/' + 'pg') def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
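# The scripts in this collection all rely on an import header and a get_args()
# helper that are not reproduced here.  The sketch below is only an assumption
# of what they might look like for the policy-gradient script above (tianshou
# 0.2.x-era API); module paths and every default value are illustrative, not
# taken from the source.
import argparse
import os
import pprint

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, ReplayBuffer
from tianshou.env import VectorEnv
from tianshou.policy import PGPolicy
from tianshou.trainer import onpolicy_trainer
# Net is assumed to be a small MLP helper (e.g. a local net.py or
# tianshou.utils.net.common.Net, depending on the tianshou version).


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--buffer-size', type=int, default=20000)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.9)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--step-per-epoch', type=int, default=1000)
    parser.add_argument('--collect-per-step', type=int, default=10)
    parser.add_argument('--repeat-per-collect', type=int, default=1)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--layer-num', type=int, default=2)
    parser.add_argument('--training-num', type=int, default=8)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument(
        '--device', type=str,
        default='cuda' if torch.cuda.is_available() else 'cpu')
    return parser.parse_args()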
def test_ddpg(args=get_args()): env = gym.make(args.task) if args.task == 'Pendulum-v0': env.spec.reward_threshold = -250 args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model actor = Actor( args.layer_num, args.state_shape, args.action_shape, args.max_action, args.device ).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) critic = Critic( args.layer_num, args.state_shape, args.action_shape, args.device ).to(args.device) critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr) policy = DDPGPolicy( actor, actor_optim, critic, critic_optim, args.tau, args.gamma, args.exploration_noise, [env.action_space.low[0], env.action_space.high[0]], reward_normalization=True, ignore_done=True) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log writer = SummaryWriter(args.logdir + '/' + 'ddpg') def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_a2c(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) optim = torch.optim.Adam(list( actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = A2CPolicy( actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda, vf_coef=args.vf_coef, ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'a2c') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_dqn(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, args.action_shape, args.device) net = net.to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy( net, optim, args.gamma, args.n_step, use_target_network=args.target_update_freq > 0, target_update_freq=args.target_update_freq) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size) # log writer = SummaryWriter(args.logdir + '/' + 'dqn') def stop_fn(x): return x >= env.spec.reward_threshold def train_fn(x): policy.set_eps(args.eps_train) def test_fn(x): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, writer=writer, task=args.task) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_dqn(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, args.action_shape, args.device) net = net.to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy( net, optim, args.gamma, args.n_step, use_target_network=args.target_update_freq > 0, target_update_freq=args.target_update_freq) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size) # log log_path = os.path.join(args.logdir, args.task, 'dqn') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold def train_fn(x): policy.set_eps(args.eps_train) def test_fn(x): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() pprint.pprint(result)
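# The DQN variant above ends after printing the result.  If you also want to
# reload the checkpoint written by save_fn and watch the agent, the pattern
# used by the sibling scripts applies; a minimal sketch, assuming args,
# log_path and policy are still in scope:
policy.load_state_dict(torch.load(os.path.join(log_path, 'policy.pth')))
policy.set_eps(args.eps_test)
env = gym.make(args.task)
collector = Collector(policy, env)
result = collector.collect(n_episode=1, render=args.render)
print(f'Final reward: {result["rew"]}, length: {result["len"]}')
collector.close()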
def train(hyper: dict):
    env_id = 'CartPole-v1'
    env = gym.make(env_id)
    hyper['state_dim'] = 4
    hyper['action_dim'] = 2
    train_envs = VectorEnv(
        [lambda: gym.make(env_id) for _ in range(hyper['training_num'])])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(env_id) for _ in range(hyper['test_num'])])
    if hyper['seed']:
        np.random.seed(hyper['random_seed'])
        torch.manual_seed(hyper['random_seed'])
        train_envs.seed(hyper['random_seed'])
        test_envs.seed(hyper['random_seed'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = Net(hyper['layer_num'], hyper['state_dim'], device=device)
    actor = Actor(net, hyper['action_dim']).to(device)
    critic = Critic(net).to(device)
    optim = torch.optim.Adam(
        list(actor.parameters()) + list(critic.parameters()),
        lr=hyper['learning_rate'])
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, hyper['gamma'],
        vf_coef=hyper['vf_coef'], ent_coef=hyper['ent_coef'],
        max_grad_norm=hyper['max_grad_norm'])
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(hyper['capacity']))
    test_collector = Collector(policy, test_envs)
    writer = SummaryWriter('./a2c')

    def stop_fn(x):
        if env.spec.reward_threshold:
            return x >= env.spec.reward_threshold
        else:
            return False

    result = onpolicy_trainer(
        policy, train_collector, test_collector, hyper['epoch'],
        hyper['step_per_epoch'], hyper['collect_per_step'],
        hyper['repeat_per_collect'], hyper['test_num'], hyper['batch_size'],
        stop_fn=stop_fn, writer=writer, task=env_id)
    train_collector.close()
    test_collector.close()
    pprint.pprint(result)
    # test: watch the trained policy
    env = gym.make(env_id)
    collector = Collector(policy, env)
    result = collector.collect(n_episode=1, render=hyper['render'])
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()
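# train() above reads its whole configuration from a plain dict.  The source
# does not show that dict, so the following is only an illustrative sketch of
# every key the function accesses; the values are placeholders, not the
# author's settings.
hyper = {
    'seed': True, 'random_seed': 0,
    'training_num': 8, 'test_num': 100,
    'layer_num': 1, 'learning_rate': 1e-3,
    'gamma': 0.9, 'vf_coef': 0.5, 'ent_coef': 0.01, 'max_grad_norm': 0.5,
    'capacity': 20000, 'epoch': 10, 'step_per_epoch': 1000,
    'collect_per_step': 10, 'repeat_per_collect': 1,
    'batch_size': 64, 'render': 0.0,
}
train(hyper)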
def test_ppo(args=get_args()):
    torch.set_num_threads(1)  # we only need one thread for NN
    env = gym_make()
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.action_range = [env.action_space.low[0], env.action_space.high[0]]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym_make()
    train_envs = VectorEnv(
        [lambda: gym_make() for _ in range(args.training_num)])
    # test_envs = gym_make()
    test_envs = VectorEnv([lambda: gym_make() for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    policy = init_policy(args, env)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ppo')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, save_fn=save_fn, writer=writer)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym_make()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_ppo(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) optim = torch.optim.Adam(list( actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy( actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=None, gae_lambda=args.gae_lambda, reward_normalization=args.rew_norm, dual_clip=args.dual_clip, value_clip=args.value_clip) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs)
def test_drqn(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Recurrent(args.layer_num, args.state_shape, args.action_shape, args.device) net = net.to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy(net, optim, args.gamma, args.n_step, use_target_network=args.target_update_freq > 0, target_update_freq=args.target_update_freq) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size, stack_num=args.stack_num, ignore_obs_next=True)) # the stack_num is for RNN training: sample framestack obs test_collector = Collector(policy, test_envs) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size) # log log_path = os.path.join(args.logdir, args.task, 'drqn') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold def train_fn(x): policy.set_eps(args.eps_train) def test_fn(x): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def train_agent( args: argparse.Namespace = get_args(), agent_learn: Optional[BasePolicy] = None, agent_opponent: Optional[BasePolicy] = None, optim: Optional[torch.optim.Optimizer] = None, ) -> Tuple[dict, BasePolicy]: def env_func(): return TicTacToeEnv(args.board_size, args.win_size) train_envs = VectorEnv([env_func for _ in range(args.training_num)]) test_envs = VectorEnv([env_func for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) policy, optim = get_agents(args, agent_learn=agent_learn, agent_opponent=agent_opponent, optim=optim) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size) # log if not hasattr(args, 'writer'): log_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn') writer = SummaryWriter(log_path) args.writer = writer else: writer = args.writer def save_fn(policy): if hasattr(args, 'model_save_path'): model_save_path = args.model_save_path else: model_save_path = os.path.join(args.logdir, 'tic_tac_toe', 'dqn', 'policy.pth') torch.save(policy.policies[args.agent_id - 1].state_dict(), model_save_path) def stop_fn(x): return x >= args.win_rate def train_fn(x): policy.policies[args.agent_id - 1].set_eps(args.eps_train) def test_fn(x): policy.policies[args.agent_id - 1].set_eps(args.eps_test) # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, writer=writer, test_in_train=False) train_collector.close() test_collector.close() return result, policy.policies[args.agent_id - 1]
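# A minimal calling sketch for train_agent() above.  It assumes the module's
# own get_args() and the usual imports are in scope; the output file name is
# illustrative, not from the source.
if __name__ == '__main__':
    args = get_args()
    result, agent = train_agent(args)
    pprint.pprint(result)
    # keep the learned agent's weights (path is a placeholder)
    torch.save(agent.state_dict(), 'tic_tac_toe_dqn_final.pth')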
def test_td3(args=get_args()): reg() env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape, args.max_action, args.device).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net = Net(args.layer_num, args.state_shape, args.action_shape, concat=True, device=args.device) critic1 = Critic(net, args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) critic2 = Critic(net, args.device).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) policy = TD3Policy(actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise), args.policy_noise, args.update_actor_freq, args.noise_clip, [env.action_space.low[0], env.action_space.high[0]], reward_normalization=True, ignore_done=True) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # train_collector.collect(n_step=args.buffer_size) # log writer = SummaryWriter(args.logdir + '/' + 'td3') def stop_fn(x): if env.spec.reward_threshold: return x >= env.spec.reward_threshold else: return False # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_step=1000, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
    reward_normalization = False,
    max_grad_norm = 0.5
)

### Optimizer parameters
lr = 1.0e-3
buffer_size = trainer_config["collect_per_step"]

# Instantiate the gym environment
train_envs = VectorEnv([lambda: env_creator() for _ in range(N_THREADS)])
test_envs = VectorEnv([lambda: env_creator() for _ in range(N_THREADS)])

# Set the seed
np.random.seed(SEED)
torch.manual_seed(SEED)
train_envs.seed(SEED)
test_envs.seed(SEED)

# Create the models
### Define the models
class Net(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        n_input = np.prod(obs_space.shape)
        self.model = nn.Sequential(*[
            nn.Linear(n_input, 64), nn.Tanh(),
            nn.Linear(64, 64), nn.Tanh()
        ])

    def forward(self, obs, state=None, info={}):
        # The snippet is cut off here; the body below is an assumed
        # completion following the usual tianshou feature-net pattern:
        # flatten the observation, run it through the MLP, and return
        # (features, state).
        obs = torch.as_tensor(obs, dtype=torch.float32)
        return self.model(obs.flatten(1)), state
class View(object): def __init__(self, args, mask=None, name='full'): env = gym.make(args.task) if args.task == 'Pendulum-v0': env.spec.reward_threshold = -250 self.state_shape = env.observation_space.shape or env.observation_space.n self.action_shape = env.action_space.shape or env.action_space.n self.max_action = env.action_space.high[0] self.stop_fn = lambda x: x >= env.spec.reward_threshold # env self.train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) self.test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # mask state_dim = int(np.prod(self.state_shape)) self._view_mask = torch.ones(state_dim) if mask == 'even': for i in range(0, state_dim, 2): self._view_mask[i] = 0 elif mask == "odd": for i in range(1, state_dim, 2): self._view_mask[i] = 0 elif type(mask) == int: self._view_mask[mask] = 0 # policy self.actor = ActorProbWithView( args.layer_num, self.state_shape, self.action_shape, self.max_action, self._view_mask, args.device ).to(args.device) self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=args.actor_lr) self.critic1 = CriticWithView( args.layer_num, self.state_shape, self._view_mask, self.action_shape, args.device ).to(args.device) self.critic1_optim = torch.optim.Adam(self.critic1.parameters(), lr=args.critic_lr) self.critic2 = CriticWithView( args.layer_num, self.state_shape, self._view_mask, self.action_shape, args.device ).to(args.device) self.critic2_optim = torch.optim.Adam(self.critic2.parameters(), lr=args.critic_lr) self.policy = SACPolicy( self.actor, self.actor_optim, self.critic1, self.critic1_optim, self.critic2, self.critic2_optim, args.tau, args.gamma, args.alpha, [env.action_space.low[0], env.action_space.high[0]], reward_normalization=True, ignore_done=True) # collector self.train_collector = Collector(self.policy, self.train_envs, ReplayBuffer(args.buffer_size)) self.test_collector = Collector(self.policy, self.test_envs) # log self.writer = SummaryWriter(f"{args.logdir}/{args.task}/sac/{args.note}/{name}") def seed(self, _seed): self.train_envs.seed(_seed) self.test_envs.seed(_seed) def close(self): self.train_collector.close() self.test_collector.close() def train(self): self.actor.train() self.critic1.train() self.critic2.train() def learn_from_demos(self, batch, demo, peer=0): acts = self.policy(batch).act demo = demo.act.detach() loss = F.mse_loss(acts, demo) if peer != 0: peer_demo = demo[torch.randperm(len(demo))] loss -= peer * F.mse_loss(acts, peer_demo) self.policy.actor_optim.zero_grad() loss.backward() self.policy.actor_optim.step()
def test_sac(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.alpha,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # Load expert model.
    assert args.load is not None, 'args.load should not be None'
    expert = deepcopy(policy)
    expert.load_state_dict(
        torch.load(f'{args.logdir}/{args.task}/sac/{args.load}/policy.pth'))
    expert.eval()
    # collector
    expert_collector = Collector(expert, train_envs,
                                 ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    writer = SummaryWriter(f'{args.logdir}/{args.task}/imitation/{args.note}')

    def stop_fn(x):
        return x >= (args.reward_threshold or env.spec.reward_threshold)

    def learner(pol, batch, batch_size, repeat, peer=0.):
        losses, peer_terms, ent_losses = [], [], []
        for _ in range(repeat):
            for b in batch.split(batch_size):
                acts = pol(b).act
                demo = torch.tensor(b.act, dtype=torch.float)
                loss = F.mse_loss(acts, demo)
                if peer != 0:
                    peer_demo = demo[torch.randperm(len(demo))]
                    peer_term = peer * F.mse_loss(acts, peer_demo)
                    loss -= peer_term
                    peer_terms.append(peer_term.detach().cpu().numpy())
                pol.actor_optim.zero_grad()
                loss.backward()
                pol.actor_optim.step()
                losses.append(loss.detach().cpu().numpy())
        return {
            'loss': losses,
            'loss/ent': ent_losses,
            'loss/peer': peer_terms if peer else None,
            'peer': peer,
        }

    # trainer
    result = imitation_trainer(
        policy, learner, expert_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, 1, args.test_num,
        args.batch_size, stop_fn=stop_fn, writer=writer, task=args.task,
        peer=args.peer, peer_decay_steps=args.peer_decay_steps)
    assert stop_fn(result['best_reward'])
    expert_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_sac(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model actor = ActorProb(args.layer_num, args.state_shape, args.action_shape, args.max_action, args.device).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) critic1 = Critic(args.layer_num, args.state_shape, args.action_shape, args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) critic2 = Critic(args.layer_num, args.state_shape, args.action_shape, args.device).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) if args.auto_alpha: target_entropy = -np.prod(env.action_space.shape) log_alpha = torch.zeros(1, requires_grad=True, device=args.device) alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr) alpha = (target_entropy, log_alpha, alpha_optim) else: alpha = args.alpha policy = SACPolicy(actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, args.tau, args.gamma, alpha, [env.action_space.low[0], env.action_space.high[0]], reward_normalization=args.rew_norm, ignore_done=True, exploration_noise=OUNoise(0.0, args.noise_std)) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # train_collector.collect(n_step=args.buffer_size) # log log_path = os.path.join(args.logdir, args.task, 'sac') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_td3(args=get_args()): # initialize environment env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model actor = Actor(args.layer_num, args.state_shape, args.action_shape, args.max_action, args.device, hidden_layer_size=args.hidden_size).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) critic1 = Critic(args.layer_num, args.state_shape, args.action_shape, args.device, hidden_layer_size=args.hidden_size).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) critic2 = Critic(args.layer_num, args.state_shape, args.action_shape, args.device, hidden_layer_size=args.hidden_size).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) policy = TD3Policy( actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, args.tau, args.gamma, GaussianNoise(sigma=args.exploration_noise), args.policy_noise, args.update_actor_freq, args.noise_clip, action_range=[env.action_space.low[0], env.action_space.high[0]], reward_normalization=args.rew_norm, ignore_done=False) # collector if args.training_num == 0: max_episode_steps = train_envs._max_episode_steps else: max_episode_steps = train_envs.envs[0]._max_episode_steps train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size, max_ep_len=max_episode_steps)) test_collector = Collector(policy, test_envs, mode='test') # log log_path = os.path.join(args.logdir, args.task, 'td3', str(args.seed)) writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) env.spec.reward_threshold = 100000 def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = offpolicy_exact_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_dqn(args=get_args()): # env task_env = EnvRegister(args.task) env = gym.make(task_env) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n rospy.loginfo(args.state_shape) rospy.loginfo(args.action_shape) train_envs = VectorEnv( [lambda: gym.make(task_env) for _ in range(args.training_num)]) test_envs = VectorEnv( [lambda: gym.make(task_env) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, args.action_shape, args.device) net = net.to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy(net, optim, args.gamma, args.n_step, use_target_network=args.target_update_freq > 0, target_update_freq=args.target_update_freq) # collector rospy.loginfo("init collector") train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.batch_size) # log writer = SummaryWriter(args.logdir + '/' + 'dqn') rew_record = [] def stop_fn(x): # if x >= 10000: for s in x: if s.get('reach_goal') == True: rew_record.extend(s) rospy.loginfo("reach goal times = " + str(len(rew_record))) if (len(rew_record) > 1000): return True else: return False # else: # rew_record.clear() # return False def train_fn(x): policy.set_eps(args.eps_train, args.eps_decay, args.eps_min) def test_fn(x): policy.set_eps(args.eps_test, args.eps_decay, args.eps_min) # trainer rospy.loginfo("start training") result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, writer=writer) assert stop_fn(result['best_reward']) pprint.pprint(result) train_collector.close() test_collector.close() # save network torch.save(net, 'ginger_dqn_pathplanning.pkl') rospy.loginfo("training finish, testing...") # Let's watch its performance! env_test = gym.make(task_env) net_test = torch.load('ginger_dqn_pathplanning.pkl') policy_test = DQNPolicy(net_test, optim, args.gamma, args.n_step, use_target_network=args.target_update_freq > 0, target_update_freq=args.target_update_freq) collector = Collector(policy_test, env_test) result = collector.collect(n_episode=1, render=args.render) rospy.loginfo(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
if args.task == 'Pendulum-v0': env.spec.reward_threshold = -250 args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape, args.max_action, args.device).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net = Net(args.layer_num, args.state_shape, args.action_shape, concat=True, device=args.device) critic1 = Critic(net, args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) critic2 = Critic(net, args.device).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
def train(args=get_args()): torch.set_num_threads(1) # we just need only one thread for NN env = gym_make() args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.action_range = [env.action_space.low, env.action_space.high] train_envs = VectorEnv( [lambda: gym_make() for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv([lambda: gym_make() for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model actor = Actor(args.layer, args.state_shape, args.action_shape, args.action_range, args.device).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) critic = Critic(args.layer, args.state_shape, args.action_shape, args.device).to(args.device) critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr) policy = DDPGPolicy(actor, actor_optim, critic, critic_optim, args.tau, args.gamma, args.exploration_noise, args.action_range, reward_normalization=args.rew_norm, ignore_done=True) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'ddpg') writer = SummaryWriter(log_path) # if a model exist, continue to train it model_path = os.path.join(log_path, 'policy.pth') if os.path.exists(model_path): policy.load_state_dict(torch.load(model_path)) def save_fn(policy): torch.save(policy.state_dict(), model_path) def stop_fn(x): return x >= 100 # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, save_fn=save_fn, writer=writer) train_collector.close() test_collector.close() if __name__ == '__main__': # Let's watch its performance! env = gym_make() collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_sac(args=get_args()): # initialize environment env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model actor = ActorProb(args.layer_num, args.state_shape, args.action_shape, args.max_action, args.device, hidden_layer_size=args.hidden_size).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) critic1 = Critic(args.layer_num, args.state_shape, args.action_shape, args.device, hidden_layer_size=args.hidden_size).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) critic2 = Critic(args.layer_num, args.state_shape, args.action_shape, args.device, hidden_layer_size=args.hidden_size).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) # energy-based discriminator disc = Critic( args.layer_num, np.prod(args.state_shape) + np.prod(args.action_shape), 0, args.device, hidden_layer_size=args.hidden_size, output_dim=np.prod(args.state_shape) + 1, ).to(args.device) disc_optim = torch.optim.Adam(disc.parameters(), lr=args.critic_lr) # tunable temperature beta = torch.ones(1, requires_grad=True, device=args.device) beta_optim = torch.optim.Adam([beta], lr=args.critic_lr) if args.auto_alpha: target_entropy = -np.prod(env.action_space.shape) log_alpha = torch.zeros(1, requires_grad=True, device=args.device) alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr) alpha = (target_entropy, log_alpha, alpha_optim) else: alpha = args.alpha rng = np.random.RandomState(seed=args.seed) policy = SACMUTRIRB2BPolicy( actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, args.tau, args.gamma, alpha, [env.action_space.low[0], env.action_space.high[0]], reward_normalization=False, ignore_done=False, norm_diff=False, use_diff=False, process_tri=(lambda x, beta: process_tri(x, rng=rng, beta=beta) ), # continuous transition construction beta=(beta, beta_optim), # the tunable temperature discriminator=(disc, disc_optim), # the energy-based discriminator tor_diff=args.tor_diff # the tolerance of distance ) # collector if args.training_num == 0: max_episode_steps = train_envs._max_episode_steps else: max_episode_steps = train_envs.envs[0]._max_episode_steps train_collector = Collector( policy, train_envs, ReplayBufferTriple(args.buffer_size, max_ep_len=max_episode_steps)) test_collector = Collector(policy, test_envs, mode='test') # log log_path = os.path.join(args.logdir, args.task, 'sac_ct', str(args.seed)) writer = SummaryWriter(log_path) def save_fn(policy, name='policy.pth'): torch.save(policy.state_dict(), os.path.join(log_path, name)) env.spec.reward_threshold = 100000 def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = offpolicy_exact_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer, epochs_to_save=[1, 50, 100, 150, 200]) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! 
env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def _test_ppo(args=get_args()): # just a demo, I have not made it work :( env = gym.make(args.task) if args.task == 'Pendulum-v0': env.spec.reward_threshold = -250 args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model actor = ActorProb(args.layer_num, args.state_shape, args.action_shape, args.max_action, args.device).to(args.device) critic = Critic(args.layer_num, args.state_shape, device=args.device).to(args.device) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Normal policy = PPOPolicy( actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=[env.action_space.low[0], env.action_space.high[0]]) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) train_collector.collect(n_step=args.step_per_epoch) # log writer = SummaryWriter(args.logdir + '/' + 'ppo') def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_ppo(args=get_args()): torch.set_num_threads(1) # we just need only one thread for NN env = gym.make(args.task) if args.task == 'Pendulum-v0': env.spec.reward_threshold = -250 args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n args.max_action = env.action_space.high[0] # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model actor = ActorProb(args.layer_num, args.state_shape, args.action_shape, args.max_action, args.device).to(args.device) critic = Critic(args.layer_num, args.state_shape, device=args.device).to(args.device) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Normal policy = PPOPolicy( actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, reward_normalization=args.rew_norm, dual_clip=args.dual_clip, value_clip=args.value_clip, # action_range=[env.action_space.low[0], env.action_space.high[0]],) # if clip the action, ppo would not converge :) gae_lambda=args.gae_lambda) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'ppo') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
class Collector(object):
    """The :class:`~tianshou.data.Collector` enables the policy to interact
    with different types of environments conveniently.

    :param policy: an instance of the :class:`~tianshou.policy.BasePolicy`
        class.
    :param env: an environment or an instance of the
        :class:`~tianshou.env.BaseVectorEnv` class.
    :param buffer: an instance of the :class:`~tianshou.data.ReplayBuffer`
        class. If set to ``None``, no transitions will be stored.
    :param bool episodic: whether to cache transitions per environment and
        write only finished episodes into the main buffer, defaults to
        ``False``.
    :param int stat_size: the window size of the moving average used to
        record collection speed, defaults to 5.

    Example:
    ::

        policy = PGPolicy(...)  # or other policies if you wish
        env = gym.make('CartPole-v0')
        replay_buffer = ReplayBuffer(size=10000)
        # here we set up a collector with a single environment
        collector = Collector(policy, env, buffer=replay_buffer)

        # the collector supports vectorized environments as well
        envs = VectorEnv([lambda: gym.make('CartPole-v0') for _ in range(3)])
        collector = Collector(policy, envs, buffer=replay_buffer)

        # collect at least 3 episodes
        collector.collect(n_episode=3)
        # collect at least 2 steps
        collector.collect(n_step=2)
        # collect episodes with visual rendering (the render argument is the
        # sleep time between rendering consecutive frames)
        collector.collect(n_episode=1, render=0.03)

        # sample data with a given batch size:
        batch_data = collector.sample(batch_size=64)
        # policy.learn(batch_data)  # btw, vanilla policy gradient only
        # supports on-policy training, so here we pick all data in the buffer
        batch_data = collector.sample(batch_size=0)
        policy.learn(batch_data)
        # on-policy algorithms use the collected data only once, so here we
        # clear the buffer
        collector.reset_buffer()

    When ``episodic`` is enabled, per-environment cache buffers are used so
    that only finished episodes are written into the main buffer. A call to
    :meth:`collect` may therefore return slightly more data than requested.

    .. note::

        Please make sure the given environment has a time limitation.
""" def __init__(self, policy, env, buffer=None, episodic=False, stat_size=5, **kwargs): super().__init__() if not isinstance(env, BaseVectorEnv): self.env = VectorEnv([env]) else: self.env = env self._collect_step = 0 self._collect_episode = 0 self._collect_time = 0 self.buffer = buffer self.policy = policy self.process_fn = policy.process_fn self._episodic = episodic if self._episodic and buffer is not None: self._cached_buf = [ ReplayBuffer(buffer._maxsize // self.env.env_num) for _ in range(self.env.env_num) ] self.stat_size = stat_size self._step_speed = collections.deque([], self.stat_size) self._episode_speed = collections.deque([], self.stat_size) self._episode_length = collections.deque([], self.stat_size) self._episode_reward = collections.deque([], self.stat_size) self.reset() def reset(self): """Reset all related variables in the collector.""" self.reset_env() self.reset_buffer() # state over batch is either a list, an np.ndarray, or a torch.Tensor self._step_speed.clear() self._episode_speed.clear() self._episode_length.clear() self._episode_reward.clear() self._collect_step = 0 self._collect_episode = 0 self._collect_time = 0 def reset_buffer(self): """Reset the main data buffer.""" if self._episodic: [b.reset() for b in self._cached_buf] if self.buffer is not None: self.buffer.reset() def get_env_num(self): """Return the number of environments the collector has.""" return self.env.env_num def reset_env(self): """Reset all of the environment(s)' states and reset all of the cache buffers (if need). """ self._obs = self.env.reset() self._act = self._rew = self._done = None self._hidden_next = self._hidden = np.zeros( (self.get_env_num(), HIDDEN_SIZE)) self.reward = np.zeros(self.env.env_num) self.length = np.zeros(self.env.env_num) def seed(self, seed=None): """Reset all the seed(s) of the given environment(s).""" return self.env.seed(seed) def render(self, **kwargs): """Render all the environment(s).""" return self.env.render(**kwargs) def close(self): """Close the environment(s).""" self.env.close() def _to_numpy(self, x): """Return an object without torch.Tensor.""" if isinstance(x, torch.Tensor): return x.cpu().numpy() elif isinstance(x, dict): for k in x: if isinstance(x[k], torch.Tensor): x[k] = x[k].cpu().numpy() return x elif isinstance(x, Batch): x.to_numpy() return x return x def collect(self, n_step=0, n_episode=0, sampling=False, render=None): """Collect a specified number of step or episode. :param int n_step: how many steps you want to collect. :param n_episode: how many episodes you want to collect (in each environment). :type n_episode: int or list :param float render: the sleep time between rendering consecutive frames, defaults to ``None`` (no rendering). .. note:: One and only one collection number specification is permitted, either ``n_step`` or ``n_episode``. :return: A dict including the following keys * ``n/ep`` the collected number of episodes. * ``n/st`` the collected number of steps. * ``v/st`` the speed of steps per second. * ``v/ep`` the speed of episode per second. * ``rew`` the mean reward over collected episodes. * ``len`` the mean length over collected episodes. """ warning_count = 0 start_time = time.time() assert not ( n_step and n_episode ), "One and only one collection number specification is permitted!" cur_step = 0 cur_episode = np.zeros(self.env.env_num) while True: if warning_count >= 100000: warnings.warn( 'There are already many steps in an episode. 
' 'You should add a time limitation to your environment!', Warning) batch_data = Batch(obs=self._obs, act=self._act, rew=self._rew, done=self._done) if sampling == True: self._act = self.env.sample() else: with torch.no_grad(): result = self.policy(batch_data, self._hidden) if hasattr(result, 'hidden') and result.hidden is not None: self._hidden_next = result.hidden if isinstance(result.act, torch.Tensor): self._act = self._to_numpy(result.act) elif not isinstance(self._act, np.ndarray): self._act = np.array(result.act) else: self._act = result.act obs_next, self._rew, self._done, _ = self.env.step(self._act) if render is not None: self.env.render() if render > 0: time.sleep(render) self.length += 1 self.reward += self._rew for i in range(self.env.env_num): warning_count += 1 collection = Experience(self._hidden[i], self._obs[i], self._act[i], self._rew[i], obs_next[i], self._done[i]) if not self._episodic: cur_step += 1 if self.buffer is not None: self.buffer.add(collection) else: self._cached_buf[i].add(collection) if self._done[i]: if self._episodic: cur_step += len(self._cached_buf[i]) if self.buffer is not None: self.buffer.extend(self._cached_buf[i]) cur_episode[i] += 1 self._episode_reward.append(self.reward[i]) self._episode_length.append(self.length[i]) self.reward[i], self.length[i] = 0, 0 if sum(self._done): ids = np.where(self._done)[0] obs_next = self.env.reset(ids) self._hidden_next[self._done] = 0. self._obs = obs_next self._hidden = self._hidden_next if n_episode and np.sum(cur_episode) >= n_episode: break if n_step != 0 and cur_step >= n_step: break cur_episode = sum(cur_episode) duration = time.time() - start_time self._step_speed.append(cur_step / duration) self._episode_speed.append(cur_episode / duration) self._collect_step += cur_step self._collect_episode += cur_episode self._collect_time += duration return { 'n/ep': cur_episode, 'n/st': cur_step, 'n/buffer': len(self.buffer) if self.buffer else 0, 'v/st': np.nanmean(self._step_speed), 'v/ep': np.nanmean(self._episode_speed) if self._collect_episode else 0, 'ep/reward': np.nanmean(self._episode_reward) if self._collect_episode else 0, 'ep/len': np.nanmean(self._episode_length) if self._collect_episode else 0, } def sample(self, batch_size): """Sample a data batch from the internal replay buffer. It will call :meth:`~tianshou.policy.BasePolicy.process_fn` before returning the final batch data. :param int batch_size: ``0`` means it will extract all the data from the buffer, otherwise it will extract the data with the given batch_size. """ batch_data, indice = self.buffer.sample(batch_size) batch_data = self.process_fn(batch_data, self.buffer, indice) return batch_data
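# This Collector variant uses two names that are never defined in the snippet:
# HIDDEN_SIZE (the width of the per-environment hidden state it tracks) and
# Experience (the record pushed into the replay buffer).  Below is a minimal
# sketch of what they could look like, inferred only from the call
# Experience(hidden, obs, act, rew, obs_next, done) inside collect(); the
# field names and the value 128 are assumptions, not taken from the source.
from collections import namedtuple

HIDDEN_SIZE = 128  # placeholder width for the recurrent hidden state
Experience = namedtuple(
    'Experience', ['hidden', 'obs', 'act', 'rew', 'obs_next', 'done'])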
def test_ppo(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy(actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=None, gae_lambda=args.gae_lambda, reward_normalization=args.rew_norm, dual_clip=args.dual_clip, value_clip=args.value_clip) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'ppo') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() pprint.pprint(result)
def run_pg(args=get_args()):
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, args.action_shape,
              device=args.device, softmax=True)
    net = net.to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PGPolicy(net, optim, dist, args.gamma,
                      reward_normalization=args.rew_norm)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    if not os.path.isdir(os.path.join(args.logdir)):
        os.mkdir(os.path.join(args.logdir))
    if not os.path.isdir(os.path.join(args.logdir, args.task)):
        os.mkdir(os.path.join(args.logdir, args.task))
    if not os.path.isdir(os.path.join(args.logdir, args.task, 'pg')):
        os.mkdir(os.path.join(args.logdir, args.task, 'pg'))
    log_path = os.path.join(args.logdir, args.task, 'pg')

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn)
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
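# Side note on the log-directory setup in run_pg above: the chain of
# os.path.isdir / os.mkdir checks can be collapsed into a single makedirs
# call, which is essentially what training_ddpg further below does. The
# helper name here is hypothetical:
def _ensure_log_dir(logdir, task, algo='pg'):
    log_path = os.path.join(logdir, task, algo)
    os.makedirs(log_path, exist_ok=True)  # creates any missing parent dirs
    return log_path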
def test_sac(args=get_args()):
    torch.set_num_threads(1)  # we only need one thread for the NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.alpha,
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
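# The commented-out collect(n_step=args.buffer_size) call in test_sac above is
# an optional warm-up step: it fills the replay buffer with transitions before
# offpolicy_trainer starts, so the first gradient updates are not computed on
# a nearly empty buffer. A minimal sketch (the helper name is hypothetical;
# the stats keys come from the collect() return dict shown earlier):
def _warm_up_buffer(train_collector, n_step):
    stats = train_collector.collect(n_step=n_step)
    print(f"warm-up collected {stats['n/st']} steps "
          f"across {stats['n/ep']} episodes")
    return stats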
def training_ddpg(args=get_args()):
    env = EnvTwoUsers(args.step_per_epoch)
    args.state_shape = env.observation_space.shape
    args.action_shape = env.action_space.shape
    args.max_action = env.action_space.high[0]
    train_envs = VectorEnv([
        lambda: EnvTwoUsers(args.step_per_epoch)
        for _ in range(args.training_num)
    ])
    test_envs = VectorEnv([
        lambda: EnvTwoUsers(args.step_per_epoch)
        for _ in range(args.test_num)
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device,
              hidden_layer_size=args.unit_num)
    actor = Actor(net, args.action_shape, args.max_action, args.device,
                  hidden_layer_size=args.unit_num).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.layer_num, args.state_shape, args.action_shape,
              concat=True, device=args.device,
              hidden_layer_size=args.unit_num)
    critic = Critic(net, args.device,
                    hidden_layer_size=args.unit_num).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    # orthogonal initialization
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma,
        OUNoise(sigma=args.exploration_noise),
        # GaussianNoise(sigma=args.exploration_noise),
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    # writer = SummaryWriter(log_path)
    writer = None
    # policy.load_state_dict(torch.load(os.path.join(log_path, 'policy.pth')))
    # print('reload model!')

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # a threshold this large is never reached, so training always runs for
        # the full number of epochs
        return x >= 1e16

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    train_collector.close()
    test_collector.close()
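# training_ddpg above saves its best policy to log_path/policy.pth via save_fn
# and contains commented-out lines for reloading it. A hedged evaluation sketch
# that mirrors the "watch its performance" blocks used by the other scripts;
# the helper name and episode count are assumptions, not part of the original:
def _evaluate_saved_ddpg(policy, args, n_episode=1):
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    policy.load_state_dict(torch.load(os.path.join(log_path, 'policy.pth')))
    policy.eval()
    env = EnvTwoUsers(args.step_per_epoch)
    collector = Collector(policy, env)
    result = collector.collect(n_episode=n_episode)
    print(f'Final reward: {result["rew"]}, length: {result["len"]}')
    collector.close()
    return result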