def make_dqn(args): """Make a DQN policy :return: policy""" net = DQN(*args.state_shape, args.action_shape, args.device).to(args.device) policy = DQNPolicy(net, None, args.gamma, args.n_step, target_update_freq=args.target_update_freq) policy.set_eps(0) return policy
def test_dqn(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # Q_param = V_param = {"hidden_sizes": [128]} # model net = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device, # dueling=(Q_param, V_param), ).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy(net, optim, args.gamma, args.n_step, target_update_freq=args.target_update_freq) # buffer if args.prioritized_replay: buf = PrioritizedVectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs), alpha=args.alpha, beta=args.beta) else: buf = VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs)) # collector train_collector = Collector(policy, train_envs, buf, exploration_noise=True) test_collector = Collector(policy, test_envs, exploration_noise=True) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size * args.training_num) # log log_path = os.path.join(args.logdir, args.task, 'dqn') writer = SummaryWriter(log_path) logger = BasicLogger(writer) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold def train_fn(epoch, env_step): # eps annnealing, just a demo if env_step <= 10000: policy.set_eps(args.eps_train) elif env_step <= 50000: eps = args.eps_train - (env_step - 10000) / \ 40000 * (0.9 * args.eps_train) policy.set_eps(eps) else: policy.set_eps(0.1 * args.eps_train) def test_fn(epoch, env_step): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, update_per_step=args.update_per_step, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, logger=logger) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() policy.set_eps(args.eps_test) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}") # save buffer in pickle format, for imitation learning unittest buf = VectorReplayBuffer(args.buffer_size, buffer_num=len(test_envs)) collector = Collector(policy, test_envs, buf) collector.collect(n_step=args.buffer_size) pickle.dump(buf, open(args.save_buffer_name, "wb"))
def test_dqn(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, args.action_shape, args.device, # dueling=(1, 1) ).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy( net, optim, args.gamma, args.n_step, target_update_freq=args.target_update_freq) # buffer if args.prioritized_replay > 0: buf = PrioritizedReplayBuffer( args.buffer_size, alpha=args.alpha, beta=args.beta) else: buf = ReplayBuffer(args.buffer_size) # collector train_collector = Collector(policy, train_envs, buf) test_collector = Collector(policy, test_envs) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size) # log log_path = os.path.join(args.logdir, args.task, 'dqn') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold def train_fn(epoch, env_step): # eps annnealing, just a demo if env_step <= 10000: policy.set_eps(args.eps_train) elif env_step <= 50000: eps = args.eps_train - (env_step - 10000) / \ 40000 * (0.9 * args.eps_train) policy.set_eps(eps) else: policy.set_eps(0.1 * args.eps_train) def test_fn(epoch, env_step): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() policy.set_eps(args.eps_test) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def test_dqn(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model Q_param = {"hidden_sizes": args.dueling_q_hidden_sizes} V_param = {"hidden_sizes": args.dueling_v_hidden_sizes} net = Net(args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device, dueling_param=(Q_param, V_param)).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy(net, optim, args.gamma, args.n_step, target_update_freq=args.target_update_freq) # collector train_collector = Collector(policy, train_envs, VectorReplayBuffer(args.buffer_size, len(train_envs)), exploration_noise=True) test_collector = Collector(policy, test_envs, exploration_noise=True) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size * args.training_num) # log log_path = os.path.join(args.logdir, args.task, 'dqn') writer = SummaryWriter(log_path) logger = TensorboardLogger(writer) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold def train_fn(epoch, env_step): # exp decay eps = max(args.eps_train * (1 - 5e-6)**env_step, args.eps_test) policy.set_eps(eps) def test_fn(epoch, env_step): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, update_per_step=args.update_per_step, stop_fn=stop_fn, train_fn=train_fn, test_fn=test_fn, save_fn=save_fn, logger=logger) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! policy.eval() policy.set_eps(args.eps_test) test_envs.seed(args.seed) test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def test_dqn(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, args.action_shape, args.device, dueling=(2, 2)).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy( net, optim, args.gamma, args.n_step, target_update_freq=args.target_update_freq) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size) # log log_path = os.path.join(args.logdir, args.task, 'dqn') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold def train_fn(epoch, env_step): # exp decay eps = max(args.eps_train * (1 - 5e-6) ** env_step, args.eps_test) policy.set_eps(eps) def test_fn(epoch, env_step): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_fn=save_fn, writer=writer, test_in_train=False) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! policy.eval() policy.set_eps(args.eps_test) test_envs.seed(args.seed) test_collector.reset() result = test_collector.collect(n_episode=[1] * args.test_num, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}')
net = Net(training_config['layer_num'], state_shape, action_shape, config['device']).to(config['device']) optim = torch.optim.Adam(net.parameters(), lr=training_config['learning_rate']) policy = DQNPolicy(net, optim, training_config['gamma'], training_config['n_step'], target_update_freq=training_config['target_update_freq']) if training_config['prioritized_replay']: buf = PrioritizedReplayBuffer(training_config['buffer_size'], alpha=training_config['alpha'], beta=training_config['beta']) else: buf = ReplayBuffer(training_config['buffer_size']) policy.set_eps(1) train_collector = Collector(policy, train_envs, buf) train_collector.collect(n_step=1) train_fn = lambda e: [ policy.set_eps( max( 0.05, 1 - e / training_config['epoch'] / training_config[ 'exploration_ratio'])), torch.save(policy.state_dict(), os.path.join(save_path, 'policy_%d.pth' % (e))) ] result = offpolicy_trainer(policy, train_collector, training_config['epoch'],
def test_dqn_icm(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n if args.reward_threshold is None: default_reward_threshold = {"CartPole-v0": 195} args.reward_threshold = default_reward_threshold.get( args.task, env.spec.reward_threshold) # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # Q_param = V_param = {"hidden_sizes": [128]} # model net = Net( args.state_shape, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device, # dueling=(Q_param, V_param), ).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) policy = DQNPolicy( net, optim, args.gamma, args.n_step, target_update_freq=args.target_update_freq, ) feature_dim = args.hidden_sizes[-1] feature_net = MLP(np.prod(args.state_shape), output_dim=feature_dim, hidden_sizes=args.hidden_sizes[:-1], device=args.device) action_dim = np.prod(args.action_shape) icm_net = IntrinsicCuriosityModule(feature_net, feature_dim, action_dim, hidden_sizes=args.hidden_sizes[-1:], device=args.device).to(args.device) icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.lr) policy = ICMPolicy(policy, icm_net, icm_optim, args.lr_scale, args.reward_scale, args.forward_loss_weight) # buffer if args.prioritized_replay: buf = PrioritizedVectorReplayBuffer( args.buffer_size, buffer_num=len(train_envs), alpha=args.alpha, beta=args.beta, ) else: buf = VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs)) # collector train_collector = Collector(policy, train_envs, buf, exploration_noise=True) test_collector = Collector(policy, test_envs, exploration_noise=True) # policy.set_eps(1) train_collector.collect(n_step=args.batch_size * args.training_num) # log log_path = os.path.join(args.logdir, args.task, 'dqn_icm') writer = SummaryWriter(log_path) logger = TensorboardLogger(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= args.reward_threshold def train_fn(epoch, env_step): # eps annnealing, just a demo if env_step <= 10000: policy.set_eps(args.eps_train) elif env_step <= 50000: eps = args.eps_train - (env_step - 10000) / \ 40000 * (0.9 * args.eps_train) policy.set_eps(eps) else: policy.set_eps(0.1 * args.eps_train) def test_fn(epoch, env_step): policy.set_eps(args.eps_test) # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, update_per_step=args.update_per_step, train_fn=train_fn, test_fn=test_fn, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger, ) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() policy.set_eps(args.eps_test) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}")