def testing_ddpg(args=get_args()):
    env = EnvThreeUsers(args.step_per_epoch)
    args.state_shape = env.observation_space.shape
    args.action_shape = env.action_space.shape
    args.max_action = env.action_space.high[0]
    # model
    net = Net(args.layer_num, args.state_shape, 0,
              device=args.device, hidden_layer_size=args.unit_num)
    actor = Actor(net, args.action_shape, args.max_action,
                  args.device, hidden_layer_size=args.unit_num).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.layer_num, args.state_shape, args.action_shape,
              concat=True, device=args.device,
              hidden_layer_size=args.unit_num)
    critic = Critic(net, args.device, args.unit_num).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma,
        OUNoise(sigma=args.exploration_noise),
        # GaussianNoise(sigma=args.exploration_noise),
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # restore the trained model from the training log directory
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    policy.load_state_dict(torch.load(os.path.join(log_path, 'policy.pth')))
    print('\nreload model!')
    # evaluate on a fresh environment
    env = EnvThreeUsers(args.step_per_epoch)
    collector = Collector(policy, env)
    ep = 10000
    result = collector.collect(n_episode=ep, render=args.render)
    # report per-episode averages of the collected statistics
    print('\nty1_succ_1: {:.6f}, q_len_1: {:.6f}, '
          '\nty1_succ_2: {:.2f}, q_len_2: {:.2f}, '
          '\nty1_succ_3: {:.2f}, q_len_3: {:.2f}, '
          '\nee_1: {:.2f}, ee_2: {:.2f}, ee_3: {:.2f}, '
          '\navg_rate: {:.2f}, '
          '\navg_power: {:.2f}\n'.format(
              result["ty1s_1"][0] / ep, result["ql_1"][0] / ep,
              result["ty1s_2"][0] / ep, result["ql_2"][0] / ep,
              result["ty1s_3"][0] / ep, result["ql_3"][0] / ep,
              result["ee_1"][0] / ep, result["ee_2"][0] / ep,
              result["ee_3"][0] / ep,
              result["avg_r"] / ep, result["avg_p"] / ep))
    print('larger than Qmax: users1: {}, users2: {}, users3: {}.'.format(
        str(env.large_than_Q_1), str(env.large_than_Q_2),
        str(env.large_than_Q_3)))
    collector.close()
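
# The commented-out GaussianNoise line above shows an alternative exploration
# strategy. A minimal sketch of choosing the noise type at run time; the
# 'noise_type' value is hypothetical and not part of the original get_args().
def build_noise(noise_type, sigma):
    """Return the exploration noise object passed to DDPGPolicy."""
    from tianshou.exploration import GaussianNoise, OUNoise
    if noise_type == 'gaussian':
        return GaussianNoise(sigma=sigma)
    # 'ou' (default) reproduces the behaviour of the functions in this file
    return OUNoise(sigma=sigma)
# usage (hypothetical flag):
# noise = build_noise(getattr(args, 'noise_type', 'ou'), args.exploration_noise)
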
def training_ddpg(args=get_args()):
    env = EnvTwoUsers(args.step_per_epoch)
    args.state_shape = env.observation_space.shape
    args.action_shape = env.action_space.shape
    args.max_action = env.action_space.high[0]
    # vectorized training / testing environments
    train_envs = VectorEnv([
        lambda: EnvTwoUsers(args.step_per_epoch)
        for _ in range(args.training_num)
    ])
    test_envs = VectorEnv([
        lambda: EnvTwoUsers(args.step_per_epoch)
        for _ in range(args.test_num)
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape,
              device=args.device, hidden_layer_size=args.unit_num)
    actor = Actor(net, args.action_shape, args.max_action,
                  args.device, hidden_layer_size=args.unit_num).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    net = Net(args.layer_num, args.state_shape, args.action_shape,
              concat=True, device=args.device,
              hidden_layer_size=args.unit_num)
    critic = Critic(net, args.device,
                    hidden_layer_size=args.unit_num).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    # orthogonal initialization
    for m in list(actor.modules()) + list(critic.modules()):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma,
        OUNoise(sigma=args.exploration_noise),
        # GaussianNoise(sigma=args.exploration_noise),
        [env.action_space.low[0], env.action_space.high[0]],
        reward_normalization=True, ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    # writer = SummaryWriter(log_path)
    writer = None
    # policy.load_state_dict(torch.load(os.path.join(log_path, 'policy.pth')))
    # print('reload model!')

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # threshold is effectively unreachable, so training never stops early
        return x >= 1e16

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.test_num,
        args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    train_collector.close()
    test_collector.close()
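
# This section does not show the module's entry point; a minimal sketch of one,
# assuming the file is run directly and that a hypothetical 'watch' flag
# (not present in the original get_args) switches to evaluation mode.
if __name__ == '__main__':
    args = get_args()
    if getattr(args, 'watch', False):
        # evaluate a previously saved policy
        testing_ddpg(args)
    else:
        # train and save the policy to logdir/task/ddpg/policy.pth
        training_ddpg(args)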